def parse_content(self, response):
    """Parse one result page of closed deals and yield a LianjiaItem per entry.

    Entries without a detail link are skipped.  Optional fields fall back to
    "无" (elevator, build year) when the page omits them; ownership age,
    subway line and school tags are classified by substring match.
    """
    html = etree.HTML(response.text)
    for deal in html.xpath("//ul[@class='listContent']//li"):
        item = LianjiaItem()
        # Map the region code carried in the request meta to its display name.
        item['region'] = self.regions.get(response.meta['region'])

        links = deal.xpath('./a/@href')
        if not len(links):
            continue
        item['href'] = links[0]

        # Title: "<community> <layout> <area>" separated by whitespace.
        title_texts = deal.xpath('.//div[@class="title"]/a/text()')
        if len(title_texts):
            parts = title_texts[0].split()
            item['name'], item['style'], item['area'] = parts[0], parts[1], parts[2]

        # House info: "orientation | decoration [| elevator]".
        info_texts = deal.xpath('.//div[@class="houseInfo"]/text()')
        if len(info_texts):
            parts = info_texts[0].split('|')
            item['orientation'] = parts[0]
            item['decoration'] = parts[1]
            item['elevator'] = parts[2] if len(parts) == 3 else "无"

        # Position info: floor plus an optional build year.
        pos_texts = deal.xpath('.//div[@class="positionInfo"]/text()')
        if len(pos_texts):
            parts = pos_texts[0].split()
            item['floor'] = parts[0]
            item['build_year'] = parts[1] if len(parts) == 2 else '无'

        # Simple single-value fields: take the first text node when present.
        for field, path in (
                ('sign_time', './/div[@class="dealDate"]/text()'),
                ('total_price', './/div[@class="totalPrice"]/span/text()'),
                ('unit_price', './/div[@class="unitPrice"]/span/text()')):
            texts = deal.xpath(path)
            if len(texts):
                item[field] = texts[0]

        # Deal tags: route each tag to its field by substring.
        for tag in deal.xpath('.//span[@class="dealHouseTxt"]/span/text()'):
            if "房屋满" in tag:
                item['fangchan_class'] = tag
            elif "号线" in tag:
                item['subway'] = tag
            elif "学" in tag:
                item['school'] = tag

        yield item
def parse(self, response):
    """Parse a second-hand-listing page: yield one item per listing, then
    follow district links and pagination links.

    Fixes vs. the original:
    - a single ``LianjiaItem`` was created once and mutated/yielded for every
      listing, so queued items shared state; a fresh item is built per listing;
    - the loop-invariant "currently selected district" lookup ran twice per
      listing and is now hoisted out of the loop;
    - regex patterns are raw strings.
    """
    soup = BeautifulSoup(response.text, 'html.parser')
    # The selected district tab applies to every listing on this page.
    selected_area = soup.find('a', class_='selected', title=re.compile(r'在售'))

    for data in soup.find_all('div', class_="info clear"):
        item = LianjiaItem()
        item['room_url'] = data.find(
            'a', href=re.compile(r'https://sz.lianjia.com/ershoufang/'))['href']
        item['room_id'] = re.search(r'\d+', item['room_url']).group()
        # Skip listings whose id was already seen during this crawl.
        if item['room_id'] in self.id:
            continue
        self.id.add(item['room_id'])
        if selected_area:
            item['area'] = selected_area.get_text()
        item['room_name'] = data.find('div', class_='title').get_text()
        house_info = data.find('div', class_="houseInfo")
        item['community'] = house_info.find('a').get_text()
        item['introduction'] = house_info.get_text()
        item['space'] = re.search(r'\d{2,}\.*\d*', item['introduction']).group()
        # Keep only the floor number and (optional) four-digit build year.
        position = data.find('div', class_="positionInfo").get_text()
        item['floor'] = re.search(r'\d{1,2}', position).group()
        if re.search(r'\d{4}', position):
            item['build_time'] = re.search(r'\d{4}', position).group()
        item['positionInfo'] = data.find(
            'div', class_="positionInfo").find('a').get_text()
        # Followers, viewing count and publish date from the follow line.
        followinfo = data.find('div', class_="followInfo").get_text()
        item['people_focus'] = re.search(
            r'\d*', re.search(r'\d*人关注', followinfo).group()).group()
        item['look'] = re.search(
            r'\d*', re.search(r'\d*次带看', followinfo).group()).group()
        item['publish_time'] = re.search(
            r'\w*发布', followinfo).group().replace('发布', '')
        item['tag'] = data.find('div', class_='tag').get_text()
        item['price'] = data.find(
            'div', class_="totalPrice").find('span').get_text()
        item['unitprice'] = re.search(
            r'\d+', data.find('div', class_="unitPrice").get_text()).group()
        yield item

    # First layer: follow each district's listing index page.
    first_url = 'https://sz.lianjia.com'
    area_urls = soup.find_all('a',
                              href=re.compile(r'^/ershoufang/\w+/$'),
                              title=re.compile(r'在售二手房'))
    for area_url in area_urls:
        yield scrapy.Request(url=first_url + area_url['href'],
                             callback=self.parse)

    # Pagination links are only derived from first-layer (district) pages.
    if re.search(r'/ershoufang/\w+/$', response.url):
        for url in soup.find_all(
                'a', href=re.compile(r'^/ershoufang/\w+/p\d/$')):
            count_match = re.search(r'\(\d+', url.get_text())
            if not count_match:
                continue
            number = int(count_match.group().replace('(', ''))
            # 30 listings per page; the site caps browsing at 100 pages.
            n = number // 30 if number % 30 == 0 else number // 30 + 1
            last_page = n if n <= 100 else 100
            one_url = first_url + url['href']
            # When a path segment also starts with '/p' (e.g. /pingshan/),
            # rewriting '/p' would corrupt it, so the first rewrite is undone.
            has_p_segment = re.search(r'/p[a-z]', one_url)
            for i in range(1, last_page + 1):
                full_url = one_url.replace('/p', '/pg' + str(i) + 'p')
                if has_p_segment:
                    full_url = full_url.replace('/pg' + str(i) + 'p', '/p', 1)
                yield scrapy.Request(url=full_url, callback=self.parse)
def parse_detail(self, response):
    """Parse one Shunyi (Beijing) house detail page into a LianjiaItem.

    Title/price fields are relayed from the listing page through
    ``response.meta``; everything else is read from the detail page, with the
    string 'None' as the fallback when a field is absent.

    Fix vs. the original: regex patterns are raw strings — ``'\\d'`` inside a
    plain string literal is an invalid escape (SyntaxWarning on modern Python).
    """
    item_dict = response.meta
    item = LianjiaItem()
    # Fixed provenance fields for this spider.
    item['province'] = '北京'
    item['city'] = '北京市'
    item['areas'] = '顺义区'
    item['coordinate_type'] = '百度'
    # NOTE(review): crawl date is hard-coded; consider datetime.date.today().
    item['crawl_time'] = '2019-5-30'
    # Carried over from the listing page via meta.
    item['title'] = item_dict['title']
    item['prices'] = item_dict['prices']
    item['price'] = item_dict['price']
    hou = response.xpath(
        '//h3[@class="similar_data"]/div/div[2]/p[2]/text()'
    ).extract_first()  # layout, e.g. 3室2厅
    area = response.xpath(
        '//h3[@class="similar_data"]/div/div[3]/p[2]/text()'
    ).extract_first()
    address = response.xpath(
        '//ul[@class="house_description big lightblack lazyload_ulog"]/li[12]/a/text()'
    ).extract_first()
    listing_time = response.xpath(
        '//ul[@class="house_description big lightblack lazyload_ulog"]/li[2]/text()'
    ).extract_first()
    orient = response.xpath(
        '//ul[@class="house_description big lightblack lazyload_ulog"]/li[3]/text()'
    ).extract_first()
    flo = response.xpath(
        '//ul[@class="house_description big lightblack lazyload_ulog"]/li[4]/text()'
    ).extract_first()  # floor, e.g. 底层/4
    fitment = response.xpath(
        '//ul[@class="house_description big lightblack lazyload_ulog"]/li[7]/text()'
    ).extract_first()
    elevator = response.xpath(
        '//ul[@class="house_description big lightblack lazyload_ulog"]/li[6]/text()'
    ).extract_first()
    build_type = response.xpath(
        '//ul[@class="house_description big lightblack lazyload_ulog"]/li[5]/text()'
    ).extract_first()
    build_time = response.xpath(
        '//ul[@class="house_description big lightblack lazyload_ulog"]/li[8]/text()'
    ).extract_first()
    quanshu = response.xpath(
        '//ul[@class="house_description big lightblack lazyload_ulog"]/li[10]/text()'
    ).extract_first()
    # Map link carrying a "pos=<lng>,<lat>" query parameter.
    coo = response.xpath(
        '//div[@class="sub_mod_box location"]/div/a/@href').extract_first()
    item['hou'] = hou if hou else 'None'
    item['area'] = area if area else 'None'
    item['address'] = address if address else 'None'
    item['listing_time'] = listing_time if listing_time else 'None'
    item['orient'] = orient if orient else 'None'
    item['flo'] = flo if flo else 'None'
    item['fitment'] = fitment if fitment else 'None'
    item['elevator'] = elevator if elevator else 'None'
    item['build_type'] = build_type if build_type else 'None'
    item['build_time'] = build_time if build_time else 'None'
    item['quanshu'] = quanshu if quanshu else 'None'
    if coo:
        # NOTE(review): the unescaped dots are kept from the original so
        # coordinates without a decimal part still match — confirm intent.
        coordinate = re.search(r'pos=(\d+.\d+,\d+.\d+)', coo)
        item['coordinate'] = coordinate.group(1) if coordinate else 'None'
    else:
        item['coordinate'] = ''
    # NOTE(review): time.sleep blocks Scrapy's reactor; prefer the
    # DOWNLOAD_DELAY setting.  Kept to preserve the original throttling.
    time.sleep(0.05)
    yield item
def parse_body_bj(self, response):
    """Parse a Beijing listing page (lxml) and yield one priced item per entry.

    Fixes vs. the original:
    - the total price was converted with ``int(price_t)`` on the raw node text
      instead of the digits extracted for exactly that purpose, crashing on
      any non-integer text such as "520.5"; it now uses the extracted digits;
    - ``building_type`` was only assigned inside the ``try`` block, so the
      except path read a stale value from a previous loop iteration (or
      raised NameError on the first); it is now initialised before the block;
    - bare ``except`` narrowed to ``IndexError`` (the only expected failure:
      missing xpath results);
    - print statements use call syntax, valid on both Python 2 and 3.
    """
    print(response.url)
    city_name = self.city_name
    tree = etree.HTML(response.body)
    nodes = tree.xpath('//ul[@class="listContent"]/li')
    print("len : %d" % len(nodes))
    for node in nodes:
        items = LianjiaItem()
        items['name'] = node.xpath('.//div[@class="title"]/a/text()')[0]
        try:
            position = node.xpath('.//div[@class="positionInfo"]/a/text()')
            address = position[0] + position[1]
        except IndexError:
            address = 'NA'
        print(address)
        items['location'] = address
        items['city_name'] = city_name
        building_type = "NA"
        try:
            text_content = node.xpath(
                './/div[@class="positionInfo"]/text()')
            # Beijing pages carry an extra community segment; the build date
            # sits in the second '/'-separated field of the fourth text node.
            detail = text_content[3].split('/')
            building_date = detail[1].strip()
        except IndexError:
            building_date = '未知年建成'
        items['building_date'] = building_date
        items['building_type'] = building_type
        price_t = node.xpath('.//div[@class="totalPrice"]/span/text()')[0]
        digits = re.findall(r'\d+', price_t)
        if len(digits) != 0:
            # Bug fix: convert the extracted digit run, not the raw text.
            price = int(digits[0])
        else:
            price = '均价未知'
        print(price)
        price_detail = {
            'price': price,
            'origin': 'LJ',
            'crawl_date': self.crawl_date
        }
        # Monthly price history: {month: [snapshot, ...]}.
        items['price'] = {self.price_month: [price_detail]}
        yield items
class ZufangSpider(scrapy.Spider):
    """Spider for Shenzhen rental listings: district -> bizcircle -> pages.

    Fixes vs. the original ``parse_content``: the class-level shared ``Item``
    was filled via ``eval(field)`` and yielded once per field — emitting many
    duplicate, partially-filled copies of the same mutable object.  A fresh
    item is now built per listing from an explicit value mapping and yielded
    once.  ``parse_page`` also guards against a missing listing counter.
    """
    name = 'zufang'
    allowed_domains = ['sz.lianjia.com']
    start_url = 'https://sz.lianjia.com/zufang/'
    page_titles = 30  # listings per result page, used to derive page count
    # Retained for backward compatibility with any external references;
    # no longer used as the shared scratch item.
    Item = LianjiaItem()

    def start_requests(self):
        """Kick off the crawl from the rental landing page."""
        yield scrapy.Request(url=self.start_url, callback=self.parse_district)

    def parse_district(self, response):
        """Follow each Shenzhen district filter link."""
        for district_node in response.xpath('//li[contains(@data-id, "2300")]'):
            url = district_node.xpath('./a/@href').get()
            district = district_node.xpath('./a/text()').get()
            if url:
                yield scrapy.Request(url=response.urljoin(url),
                                     callback=self.parse_bizcircle,
                                     meta={'district': district})

    def parse_bizcircle(self, response):
        """Follow each bizcircle link inside a district."""
        district = response.meta['district']
        for biz in response.xpath('//li[@class="filter__item--level3 "]'):
            url = biz.xpath('./a/@href').get()
            bizcircle = biz.xpath('./a/text()').get()
            if url:
                yield scrapy.Request(url=response.urljoin(url),
                                     callback=self.parse_page,
                                     meta={
                                         'bizcircle': bizcircle,
                                         'district': district
                                     })

    def parse_page(self, response):
        """Derive the page count from the listing total and request each page."""
        title_num = response.xpath(
            '//span[@class="content__title--hl"]/text()').get()
        if title_num is None:
            # Robustness: some bizcircle pages carry no total counter.
            return
        max_page = int(int(title_num) / self.page_titles + 1)
        for page in range(1, max_page + 1):
            yield scrapy.Request(url=response.url + 'pg{}'.format(page),
                                 callback=self.parse_content,
                                 meta={
                                     'bizcircle': response.meta['bizcircle'],
                                     'district': response.meta['district']
                                 })

    def parse_content(self, response):
        """Parse one page of rental listings, yielding one item per listing."""
        district = response.meta['district']
        bizcircle = response.meta['bizcircle']
        for content in response.xpath('//div[@class="content__list--item"]'):
            url = response.urljoin(
                content.xpath(
                    './/p[@class="content__list--item--title twoline"]/a/@href'
                ).get())
            title = content.xpath(
                './/p[@class="content__list--item--title twoline"]/a/text()'
            ).get().strip()
            area = content.xpath(
                './/p[@class="content__list--item--des"]//text()').re_first(
                    r'(\d+㎡)')
            price = content.xpath(
                './/span[@class="content__list--item-price"]/em/text()').get(
                ).strip()
            apartment = content.xpath(
                './/p[@class="content__list--item--des"]//text()').re_first(
                    r'(\d室\d厅\d卫)')
            company = content.xpath(
                './/p[@class="content__list--item--brand oneline"]//text()'
            ).get()
            if company:
                company = company.strip()
            # Explicit field mapping replaces the original eval() lookup.
            values = {
                'url': url,
                'title': title,
                'district': district,
                'bizcircle': bizcircle,
                'area': area,
                'price': price,
                'apartment': apartment,
                'company': company,
            }
            item = LianjiaItem()
            for field in item.fields:
                if field in values:
                    item[field] = values[field]
                else:
                    print('Field is not Defined: ' + field)
            yield item
def disposeData(self, response):
    """Parse a listing detail page into a LianjiaItem and yield it.

    The "basic attributes" <ul> appears in several known layouts (12, 9, 3 or
    15 <li> rows); each layout maps row positions to different item fields,
    and the fields a layout does not provide are blanked so the item schema
    stays uniform.  Layouts of any other length leave the basic fields unset,
    exactly as the original branch chain did.  The four near-identical
    copy-paste branches are replaced by one table-driven pass.
    """
    item = LianjiaItem()

    def li_text(index):
        # First text node of the index-th <li> under a div.content <ul>.
        return response.xpath(
            "//div[@class='content']/ul/li[%d]/text()" % index).extract()[0]

    def li_span_text(index):
        # Second <span> of the index-th <li> — the transaction-attribute rows.
        return response.xpath(
            "//div[@class='content']/ul/li[%d]/span[2]/text()" %
            index).extract()[0]

    # Row order for each known "basic attributes" layout, keyed by row count.
    common = [
        "house_type", "floor", "area", "house_structure", "inside_space",
        "building_type", "direct", "building_structure", "decorate_situation",
        "elevator_proportion", "equipped_escalators", "property_term"
    ]
    layouts = {
        12: common,
        15: common + ["water_type", "electricity_type", "gas_price"],
        9: [
            "house_type", "floor", "area", "inside_space", "direct",
            "building_structure", "decorate_situation", "villa_type",
            "property_term"
        ],
        3: ["floor", "area", "direct"],
    }
    # Fields blanked when a layout does not provide them (water/electricity/
    # gas are only ever set by the 15-row layout, matching the original).
    basic_fields = common + ["villa_type"]

    rows = response.xpath(
        "//div[@class='base']/div[@class='content']/ul/li").extract()
    layout = layouts.get(len(rows))
    if layout is not None:
        for position, field in enumerate(layout, start=1):
            item[field] = li_text(position)
        for field in basic_fields:
            if field not in layout:
                item[field] = ""

    # Transaction attributes: fixed row positions in the transaction <ul>.
    trade_fields = ("time_tone", "trading_ownership", "last_transaction",
                    "house_usage", "house_term", "property_owner",
                    "mortgage_info", "house_certificate")
    for position, field in enumerate(trade_fields, start=1):
        value = li_span_text(position)
        if field == "mortgage_info":
            # Mortgage info is wrapped in whitespace/newlines on the page.
            value = value.replace(' ', '').replace('\n', '')
        item[field] = value

    # Price / location attributes.
    item["total_price"] = response.xpath(
        "//span[@class='total']/text()").extract()[0]
    item["unit_price"] = response.xpath(
        "//span[@class='unitPriceValue']/text()").extract()[0]
    item["housing_name"] = response.xpath(
        "//div[@class='communityName']/a[1]/text()").extract()[0]
    item["county"] = response.xpath(
        "//div[@class='areaName']/span[2]/a[1]/text()").extract()[0]
    item["street"] = response.xpath(
        "//div[@class='areaName']/span[2]/a[2]/text()").extract()[0]
    item["built_year"] = response.xpath(
        "//div[@class='area']/div[2]/text()").extract()[0]
    print("爬取成功")
    yield item
def parse_content(self, response):
    '''
    Parse a single result page of closed deals.

    Not every listing carries the full set of fields defined on the item
    (subway line, nearby school, ownership age, elevator, build year, ...),
    so each optional field is extracted only when present on the page;
    elevator and build year fall back to "无" when missing.  The finished
    item is yielded to the item pipeline for further processing.
    '''
    selector = etree.HTML(response.text)
    cj_list = selector.xpath("//ul[@class='listContent']/li")
    for cj in cj_list:
        item = LianjiaItem()
        # Map the region code from the request meta to its display name.
        item['region'] = self.regions.get(response.meta['region'])
        href = cj.xpath('./a/@href')
        if not len(href):
            # Entries without a detail link carry no usable data.
            continue
        item['href'] = href[0]
        # Title text: "<community> <layout> <area>" separated by whitespace.
        # NOTE(review): content[1]/content[2] raise IndexError if the title
        # has fewer than three tokens — presumably it always has three.
        content = cj.xpath('.//div[@class="title"]/a/text()')
        if len(content):
            content = content[0].split()
            item['name'] = content[0]
            item['style'] = content[1]
            item['area'] = content[2]
        # House info: "orientation | decoration [| elevator]".
        content = cj.xpath('.//div[@class="houseInfo"]/text()')
        if len(content):
            content = content[0].split('|')
            item['orientation'] = content[0]
            item['decoration'] = content[1]
            if len(content) == 3:
                item['elevator'] = content[2]
            else:
                item['elevator'] = '无'
        # Position info: floor plus an optional build year.
        content = cj.xpath('.//div[@class="positionInfo"]/text()')
        if len(content):
            content = content[0].split()
            item['floor'] = content[0]
            if len(content) == 2:
                item['build_year'] = content[1]
            else:
                item['build_year'] = '无'
        content = cj.xpath('.//div[@class="dealDate"]/text()')
        if len(content):
            item['sign_time'] = content[0]
        content = cj.xpath('.//div[@class="totalPrice"]/span/text()')
        if len(content):
            item['total_price'] = content[0]
        content = cj.xpath('.//div[@class="unitPrice"]/span/text()')
        if len(content):
            item['unit_price'] = content[0]
        # Deal tags: classify each by substring (str.find returns -1 when
        # the substring is absent, a non-negative index when found).
        content = cj.xpath('.//span[@class="dealHouseTxt"]/span/text()')
        if len(content):
            for i in content:
                if i.find("房屋满") != -1:
                    item['fangchan_class'] = i
                elif i.find("号线") != -1:
                    item['subway'] = i
                elif i.find("学") != -1:
                    item['school'] = i
        yield item
def parse_item(self, response):
    """Scrape one house detail page into a LianjiaItem and yield it.

    NOTE(review): several fields use ``''.join(...).split()[0]`` or
    ``.extract()[0]``, which raise IndexError when the section is absent from
    the page — presumably every detail page carries these sections; confirm
    before reusing this parser on other page variants.
    """
    item = LianjiaItem()
    # Photos
    house_photo = response.xpath(
        '//ul[@class="smallpic"]/li/@data-src').extract()
    # Price: spans 2-3 of the content header, joined as "total/unit"
    mon = response.xpath('//div[@class="content"]/div/span').xpath(
        'string(.)').extract()[2:4]
    money = '/'.join(mon)
    # Price per square metre
    one_area = \
        response.xpath('//div[@class="content"]/div/div[@class="text"]/div[1]/span').xpath('string(.)').extract()[0]
    # Rooms
    rom = response.xpath('//div[@class="houseInfo"]/div[@class="room"]/div'
                         ).xpath('string(.)').extract()
    room = ','.join(rom)
    # Type
    tp = response.xpath('//div[@class="houseInfo"]/div[@class="type"]/div'
                        ).xpath('string(.)').extract()
    typ = ','.join(tp)
    # Area
    ara = response.xpath('//div[@class="houseInfo"]/div[@class="area"]/div'
                         ).xpath('string(.)').extract()
    area = ','.join(ara)
    # Introduction block
    # Community name
    info_position = response.xpath(
        '//div[@class="aroundInfo"]/div[1]').xpath('string(.)').extract()
    # Location
    region = response.xpath('//div[@class="aroundInfo"]/div[2]').xpath(
        'string(.)').extract()
    info_region = ''.join(region).split()
    # Viewing times
    info_look_time = response.xpath(
        '//div[@class="aroundInfo"]/div[3]').xpath('string(.)').extract()
    # Lianjia listing number
    info_number = response.xpath(
        '//div[@class="aroundInfo"]/div[4]//span/text()').extract()
    # Basic attributes
    basic_attributes = response.xpath(
        '//div[@class="base"]/div[@class="content"]/ul/li').xpath(
            'string(.)').extract()
    # Transaction attributes
    tran_attribute = response.xpath(
        '//div[@class="transaction"]/div[@class="content"]/ul/li').xpath(
            'string(.)').extract()
    transaction_attribute = ''.join(tran_attribute).split()
    # Disclaimer text
    careful = response.xpath(
        '//div[@class="introContent"]/div[@class="disclaimer"]/text()'
    ).extract()
    # Features --> listing tags
    hous_label = response.xpath(
        '//div[@class="box-l"]/div[2]/div/div[1]/div[2]/a/text()').extract(
        )
    housing_label = ''.join(hous_label).split()[0]
    # Features --> surrounding facilities
    peri_matching = response.xpath(
        '//div[@class="box-l"]/div[2]/div/div[2]/div[2]/text()').extract()
    peripheral_matching = ''.join(peri_matching).split()[0]
    # Features --> community introduction
    com_introduction = response.xpath(
        '//div[@class="box-l"]/div[2]/div/div[3]/div[2]/text()').extract()
    community_introduction = ''.join(com_introduction).split()[0]
    # Features --> decoration description
    dec_description = response.xpath(
        '//div[@class="box-l"]/div[2]/div/div[4]/div[2]/text()').extract()
    decoration_description = ''.join(dec_description).split()[0]
    # Features --> key selling points
    cor_sell_point = response.xpath(
        '//div[@class="box-l"]/div[2]/div/div[5]/div[2]/text()').extract()
    core_selling_point = ''.join(cor_sell_point).split()[0]
    # Notes / disclaimer paragraph
    matters_needing_attention = \
        response.xpath('//div[@class="box-l"]/div[2]/div/div[@class="disclaimer"]/text()').extract()[0]
    # Agent photo
    contacts_photo = response.xpath(
        '//div[@class="component-agent-es-pc-6"]/a/img/@src').extract()[0]
    # Agent name
    contacts_name = response.xpath(
        '//div[@class="component-agent-es-pc-6"]/div/div/a/text()'
    ).extract()[0]
    # Agent rating
    contacts_sco = response.xpath(
        '//div[@class="component-agent-es-pc-6"]/div/div[2]/span/text()'
    ).extract()
    contacts_score = ''.join(contacts_sco)
    # Agent phone — parts joined with '转' ("ext."), main number + extension
    contacts_tel = response.xpath(
        '//div[@class="component-agent-es-pc-6"]/div/div[3]/text()'
    ).extract()
    contacts_telephone = '转'.join(contacts_tel)
    contacts_company = response.xpath(
        '//div[@class="component-agent-es-pc-6"]/div/div[@class="brokerName"]/div/span/text()'
    ).extract()[0]
    item['house_photo'] = house_photo
    item['money'] = money
    item['one_area'] = one_area
    item['room'] = room
    item['area'] = area
    item['typ'] = typ
    item['careful'] = careful
    item['info_position'] = info_position
    item['info_region'] = info_region
    item['info_look_time'] = info_look_time
    item['info_number'] = info_number
    item['basic_attributes'] = basic_attributes
    item['transaction_attribute'] = transaction_attribute
    item['housing_label'] = housing_label
    item['peripheral_matching'] = peripheral_matching
    item['community_introduction'] = community_introduction
    item['decoration_description'] = decoration_description
    item['core_selling_point'] = core_selling_point
    item['matters_needing_attention'] = matters_needing_attention
    item['contacts_photo'] = contacts_photo
    item['contacts_name'] = contacts_name
    item['contacts_score'] = contacts_score
    item['contacts_telephone'] = contacts_telephone
    item['contacts_company'] = contacts_company
    yield item
def parse_detail(self, response):
    """Extract one second-hand-house detail page into a LianjiaItem.

    Bug fix vs. the original: every field was written as
    ``xpath(A).extract_first().foo() if len(response.xpath(B)) > 0 else None``,
    evaluating each xpath twice — and for ``house_price`` the guard xpath
    ('div/p[2]') differed from the extraction xpath ('div[1]/p[2]'), so a
    page matching the guard but not the extraction crashed with
    ``AttributeError: 'NoneType' object has no attribute 'split'``.  Each
    xpath is now evaluated once and guarded on its own result.
    """
    desc = '//ul[@class="house_description big lightblack"]/li[%d]%s/text()'

    def desc_text(index, sub=''):
        # First text node of row *index* in the description list, or None.
        return response.xpath(desc % (index, sub)).extract_first()

    house_detail_url = response.url

    title = response.xpath(
        '//h3[@class="house_desc lazyload_ulog"]/text()').extract_first()
    house_title = title.replace("\n", "").strip() if title else None

    price = response.xpath(
        '//h3[@class="similar_data"]/div[1]/p[2]/span[1]/text()'
    ).extract_first()
    house_price = price.split("万")[0] if price else None

    single_price = desc_text(1)
    house_single_price = single_price.split(
        "元/平")[0] if single_price else None

    house_room_type = response.xpath(
        '//h3[@class="similar_data"]/div[2]/p[2]/text()').extract_first()

    house_date = desc_text(2)      # listing date
    house_heading = desc_text(3)   # orientation
    house_floor = desc_text(4)
    house_building_type = desc_text(5)
    house_elevator = desc_text(6)
    house_decoration = desc_text(7)
    house_year = desc_text(8)
    house_use = desc_text(9)
    house_ownership = desc_text(10)
    house_commiunity = desc_text(12, '/a')

    area = response.xpath(
        '//h3[@class="similar_data"]/div[3]/p[2]/text()').extract_first()
    house_area = area.split("m²")[0] if area else None

    intro_parts = response.xpath(
        '//div[@class="mod_cont fiveline house_intro_mod_cont"]/text()'
    ).extract()
    house_introduction = ''.join(intro_parts).replace("\n", "").replace(
        " ", "") if intro_parts else None

    adders_parts = response.xpath(
        '//div[@class="sub_mod_box location"]/div/a/div/div[@class="marker_desc"]/p/text()'
    ).extract()
    house_adders = ''.join(adders_parts).replace("\n", "").replace(
        " ", "") if adders_parts else None

    item = LianjiaItem()
    item["house_title"] = house_title
    item["house_price"] = house_price
    item["house_single_price"] = house_single_price
    item["house_room_type"] = house_room_type
    item["house_heading"] = house_heading
    item["house_floor"] = house_floor
    item["house_building_type"] = house_building_type
    item["house_elevator"] = house_elevator
    item["house_decoration"] = house_decoration
    item["house_year"] = house_year
    item["house_use"] = house_use
    item["house_ownership"] = house_ownership
    item["house_commiunity"] = house_commiunity
    item["house_detail_url"] = house_detail_url
    item["house_area"] = house_area
    item["house_introduction"] = house_introduction
    item["house_adders"] = house_adders
    item["house_date"] = house_date
    yield item