Example #1
0
 def parse(self, response):
     """Yield one ZufangItem per rental listing found on the page.

     Titles, lease types, addresses and prices are collected as four
     parallel lists and paired positionally with ``zip``; when the lists
     differ in length, the surplus entries of the longer ones are dropped.

     Args:
         response: the downloaded listing page (must support ``.xpath``).

     Yields:
         ZufangItem with ``title``, ``lease_type``, ``address`` and
         ``price`` set.
     """
     # Common prefix of every per-listing query on this page.
     row = ".//div[@class='f-list-item ershoufang-list']/dl"

     title_list = response.xpath(row + "/dd[1]/a/text()").extract()
     lease_type_list = response.xpath(
         row + "/dd[2]/span[@class='first js-huxing']/text()").extract()
     # string(.) flattens the nested markup of each address span;
     # split() + join() then removes every whitespace character.
     address_list = [
         ''.join(span.xpath('string(.)').extract()[0].split())
         for span in response.xpath(row + "/dd[3]/span")
     ]
     price_list = response.xpath(
         row + "/dd[5]/div[1]/span[1]/text()").extract()

     for title, lease_type, address, price in zip(
             title_list, lease_type_list, address_list, price_list):
         # A fresh item per listing: the previous code mutated and
         # re-yielded one shared instance, so every consumer that kept a
         # reference ended up seeing only the last listing's values.
         zf = ZufangItem()
         zf['title'] = title
         zf['lease_type'] = lease_type
         zf['address'] = address
         zf['price'] = price
         yield zf
Example #2
0
    def parse(self, response):
        """Yield a ZufangItem for each ``<li logr=...>`` in ``ul.listUl``.

        Each field holds the first matching text node, or ``None`` when the
        listing lacks that node. Previously a single missing node raised
        IndexError and aborted the remaining listings on the page.

        Args:
            response: the downloaded listing page (must support ``.xpath``).

        Yields:
            ZufangItem with ``title``, ``info``, ``address``, ``from_`` and
            ``price`` set.
        """
        # Only <li> elements carrying a "logr" attribute are used; the
        # original author noted that li[@class="apartments"] matched nothing.
        for node in response.xpath('//ul[@class="listUl"]/li[@logr]'):
            item1 = ZufangItem()
            item1['title'] = node.xpath('.//h2/a/text()').extract_first()
            item1['info'] = node.xpath(
                './/p[contains(@class, "room")]/text()').extract_first()
            item1['address'] = node.xpath(
                './/p[contains(@class, "add")]/text()').extract_first()
            # The source is taken from whichever of the two alternative
            # nodes (div.jjr or p.geren) appears first in the listing.
            item1['from_'] = node.xpath(
                './/div[@class="jjr"]/text()|.//p[@class="geren"]/text()'
            ).extract_first()
            item1['price'] = node.xpath(
                './/div[@class="money"]/b/text()').extract_first()
            yield item1
Example #3
0
    def parse(self, response):
        """Yield one ZufangItem per listing, then follow the next page.

        ``title`` and ``price`` are stored as the full list of matched text
        nodes. ``size`` and ``place`` are taken from the second text node of
        their element and are left unset when that node is absent.

        Args:
            response: the downloaded listing page (must support ``.xpath``
                and ``.urljoin``).

        Yields:
            ZufangItem objects, followed by a Request for the next page when
            a "next" link (``a.aNxt``) is present.
        """
        # The class attribute is matched exactly, including its two trailing
        # spaces -- check the page source if this stops matching.
        for listing in response.xpath('//div[@class="zu-itemmod  "]'):
            # A fresh item per listing; with one shared item a listing
            # without a size silently inherited the previous listing's size.
            item = ZufangItem()
            size = listing.xpath(
                './div[@class="zu-info"]/p[1]/text()').extract()
            place = listing.xpath(
                './div[@class="zu-info"]/address/text()').extract()

            item['title'] = listing.xpath(
                './div[@class="zu-info"]/h3/a/text()').extract()
            if len(size) > 1:
                item['size'] = size[1]
            item['price'] = listing.xpath(
                './div[@class="zu-side"]/p/strong/text()').extract()
            if len(place) > 1:
                # Strip the double-space padding and the
                # non-breaking-space/newline run from the address text.
                item['place'] = place[1].replace('  ', '').replace(
                    '\xa0\xa0\n', '')
            yield item

        next_page_url = response.xpath(
            '//a[@class="aNxt"]/@href').extract_first()
        if next_page_url:
            # urljoin leaves absolute URLs untouched and resolves relative
            # ones against the current page.
            yield Request(response.urljoin(next_page_url),
                          callback=self.parse)
Example #4
0
    def parse_item(self, response):
        """Schedule a detail-page request for every listing with a link.

        For each ``<li>`` under ``div.content`` the listing title is stored
        in a new ZufangItem, which is handed to ``self.parse_next`` through
        ``meta['item']``.

        Args:
            response: the downloaded listing page (must support ``.xpath``).

        Yields:
            scrapy.Request for each listing's detail page.
        """
        for home in response.xpath("//div[@class='content']//li"):
            href = home.xpath(
                ".//div[@class='des']/h2/a/@href").extract_first()
            if href is None:
                # <li> without a detail link: previously 'http:' + None
                # raised TypeError and aborted the whole page.
                continue
            item = ZufangItem()
            item['title'] = home.xpath(".//h2/a/text()").extract_first()
            # The prefix is prepended unconditionally, so hrefs are
            # presumably scheme-relative ("//host/path") -- TODO confirm.
            yield scrapy.Request(url='http:' + href,
                                 callback=self.parse_next,
                                 meta={'item': item})
Example #5
0
 def parse(self, response):
     """Request every city page listed in the ``div.city-list`` index.

     Each ``<dl>`` is one region: its ``<dt>`` text is the region name and
     each ``<dd>/<a>`` is a city link. A ZufangItem carrying ``region`` and
     ``city`` travels with the request under ``meta['meta_1']`` and is
     handled by ``self.parse_city``.
     """
     region_blocks = response.xpath('//div[@class="city-list"]/dl')
     for region_block in region_blocks:
         city_links = region_block.xpath("./dd/a")
         for link in city_links:
             entry = ZufangItem()
             region_names = region_block.xpath("./dt/text()").extract()
             entry['region'] = region_names[0]
             city_names = link.xpath('./text()').extract()
             entry['city'] = city_names[0]
             hrefs = link.xpath('./@href').extract()
             yield scrapy.Request(
                 hrefs[0],
                 meta={"meta_1": entry},
                 callback=self.parse_city,
             )
Example #6
0
 def parse_topic(self, response):
     """Yield a single ZufangItem describing the topic page in ``response``.

     ``url`` comes from the response, ``group_type`` is the constant
     ``'douban'``, and the remaining fields are delegated to the matching
     ``self.get_<field>(response)`` helper, called in the order listed.
     """
     topic = ZufangItem()
     topic['url'] = response.url
     topic['group_type'] = 'douban'
     for field in ('title', 'author', 'description', 'create_time'):
         extractor = getattr(self, 'get_' + field)
         topic[field] = extractor(response)
     yield topic
Example #7
0
    def parse(self, response):
        """Yield one ZufangItem per listing, then follow the next page.

        Listings whose address has fewer than two linked parts, or contains
        a part that is a bare space, are skipped. Single-valued fields hold
        the first matching node or ``None`` when it is missing.

        Args:
            response: the downloaded listing page (must support ``.xpath``
                and ``.urljoin``).

        Yields:
            ZufangItem with ``title``, ``price``, ``address``, ``pattern``,
            ``type`` and ``img`` set, followed by a scrapy.Request for the
            next page when a "next" link exists.
        """
        for info in response.xpath(
                '//div[@class="f-list-item ershoufang-list"]'):
            # Address parts; validated before indexing. Previously
            # address[0] was read first and raised IndexError on listings
            # that were meant to be skipped.
            address = info.xpath('./dl/dd[3]/span/a/text()').extract()
            if len(address) < 2 or ' ' in address:
                continue

            # A fresh item per listing: one shared instance was previously
            # mutated and re-yielded for every listing.
            item = ZufangItem()

            # Listing title
            item['title'] = info.xpath('./dl/dd[1]/a/text()').extract_first()

            # Price
            item['price'] = info.xpath(
                './dl/dd[5]/div[1]/span[1]/text()').extract_first()

            # Address: district (with the '区' suffix appended) followed by
            # the next parts.
            district = address[0] + '区'
            if len(address) > 2:
                item['address'] = district + '-'.join(address[1:3])
            else:
                # With only two links, the remainder comes from the third
                # bare text node of the span.
                address_else = info.xpath(
                    'normalize-space(./dl/dd[3]/span/text()[3])').extract()
                item['address'] = district + address[1] + '-'.join(
                    address_else)

            # House description: every span after the first one
            description = info.xpath(
                './dl/dd[2]/span[position()>1]/text()').extract()
            item['pattern'] = ','.join(description)

            # Rental type: the first span
            item['type'] = info.xpath(
                './dl/dd[2]/span[1]/text()').extract_first()

            # Image URL: use data-original when present, otherwise fall
            # back to the plain src attribute.
            img_url = info.xpath('./dl/dt/div/a/img/@data-original').extract()
            if img_url:
                item['img'] = "".join(img_url)
            else:
                item['img'] = info.xpath(
                    './dl/dt/div/a/img/@src').extract_first()
            yield item

        # Pagination
        next_page = response.xpath(
            ".//div[@class='pageBox']/ul/li/a[@class='next']/@href"
        ).extract_first()
        if next_page:
            # Crawl every following page with this same callback
            yield scrapy.Request(response.urljoin(next_page), self.parse)
Example #8
0
 def parse(self, response):
     """Yield one ZufangItem (title + price) per listing on the page.

     Titles and prices are collected as two parallel lists and paired
     positionally with ``zip``; surplus entries of the longer list are
     dropped.

     Args:
         response: the downloaded listing page (must support ``.xpath``).

     Yields:
         ZufangItem with ``title`` and ``money`` set.
     """
     # The class value is matched exactly, including its trailing space.
     title_list = response.xpath(
         ".//div[@class='f-list-item ']/dl/dd[1]/a/text()").extract()
     money_list = response.xpath(
         ".//div[@class='f-list-item ']/dl/dd[5]/div[1]/span[1]/text()"
     ).extract()
     for title, money in zip(title_list, money_list):
         # A fresh item per listing: the previous code mutated and
         # re-yielded one shared instance, so every held reference ended
         # up showing only the last listing.
         zf = ZufangItem()
         zf['title'] = title
         zf['money'] = money
         yield zf
Example #9
0
    def parse_item(self, response):
        """Build and return one ZufangItem from a room detail page.

        Several fields are picked by position from the label texts of the
        first and second ``room-detail-box`` elements, so the callback
        depends on the page keeping that exact label order.

        Args:
            response: the downloaded detail page (must support ``.xpath``
                and ``.url``).

        Returns:
            The populated ZufangItem. ``roomie`` is only set when
            ``self.if_roomie(response)`` is truthy.

        Raises:
            IndexError: if a positionally indexed node is missing (e.g.
                fewer than seven labels in the second detail box).
        """
        item = ZufangItem()

        # Each detail-box query is evaluated once and indexed below; the
        # same XPath used to be re-run against the document for every field.
        box1_labels = response.xpath(
            '//*[contains(@class,"room-detail-box")] [1]//label[1]/text()')
        box2_labels = response.xpath(
            '//*[contains(@class,"room-detail-box")] [2]//label[1]/text()')

        item['name'] = response.xpath(
            '//*[contains(@class,"room-name")]/h1/text()').extract()
        item['url'] = response.url
        item['url_id'] = self.get_md5(response.url)
        item['ifhezu'] = response.xpath(
            '//*[contains(@class,"methodroom-rent")]/text()').extract()
        # remove_kongge presumably strips whitespace ("kongge" = spaces);
        # it is defined outside this block -- TODO confirm.
        item['zhuangtai'] = self.remove_kongge(
            response.xpath(
                '//*[contains(@class,"room-title")]//text()').extract())
        item['money'] = response.xpath(
            '//*[contains(@class,"room-price-num")]/text()')[0].extract()
        item['one_money'] = self.remove_kongge(
            response.xpath('//*[contains(@class,"room-price-sale")]//text()')
            [0].extract())

        # First detail box: size, number, type (in label order).
        item['size'] = box1_labels[0].extract()
        item['number'] = box1_labels[1].extract()
        item['type'] = self.remove_kongge(box1_labels[2].extract())

        # Second detail box: floor is label 0, orientation label 1,
        # subway label 6.
        item['orientation'] = box2_labels[1].extract()
        item['floor'] = box2_labels[0].extract()
        item['location'] = response.xpath(
            '//*[contains(@class,"detail-roombox")]/@title').extract()
        item['subway'] = box2_labels[6].extract()

        item['deploy'] = self.remove_kongge(
            response.xpath(
                '//*[contains(@class,"room-info-list")]/table/tr[2]//text()'
            ).extract())
        if self.if_roomie(response):
            item['roomie'] = self.get_roomie(response)
        return item
Example #10
0
 def parse_content(self, response):
     """Yield one ZufangItem built from a rental detail page.

     ``address`` and ``region`` keep the full list of matched link texts;
     every other field is the first matching text node.

     Args:
         response: the downloaded detail page (must support ``.xpath``).

     Yields:
         ZufangItem with ``name``, ``address``, ``price``, ``typehouse``,
         ``region``, ``contacts`` and ``phone`` set.

     Raises:
         IndexError: if a first-match field has no matching node.
     """
     item = ZufangItem()
     item['name'] = response.xpath(
         '//div[@class="wrapper"]/div/h3/text()').extract()[0]
     item['address'] = response.xpath(
         '//div[@class="box"]//dl[5]/dd/a/text()').extract()
     # Normalised from '///div...': a triple slash is not valid XPath 1.0
     # and presumably only worked through parser leniency.
     item['price'] = response.xpath(
         '//div[@class="box"]//dd/strong/span/text()').extract()[0]
     item['typehouse'] = response.xpath(
         '//div[@class="box"]//dl[4]/dd/text()').extract()[0]
     item['region'] = response.xpath(
         '//div[@class="box"]//dl[6]/dd/a/text()').extract()
     item['contacts'] = response.xpath(
         '//div[@class="rbox"]/div/div/h2/text()').extract()[0]
     item['phone'] = response.xpath(
         '//div[@class="rbox"]/div/p/text()').extract()[0]
     yield item
Example #11
0
    def parse_chuzu(self, response):
        """Yield one ZufangItem per rental listing, then follow the next page.

        Fields without a matching node are set to ``None``, except
        ``location``, which becomes an empty string when no link matches.

        Args:
            response: the downloaded listing page (must support ``.xpath``).

        Yields:
            ZufangItem with ``href``, ``describe``, ``room``, ``location``
            and ``price`` set, followed by a Request for the next page when
            one is linked.
        """
        headers = self.headers.copy()
        next_page = response.xpath(
            '//*[@id="bottom_ad_li"]/div[2]/a[@class="next"]/@href'
        ).extract_first()

        # The first ten <li> entries are skipped (presumably promoted or
        # non-listing rows -- TODO confirm against the page).
        messages = response.xpath(
            '/html/body/div[3]/div[1]/div[5]/div[2]/ul/li')[10:]
        for message in messages:
            # A fresh item per listing: one shared instance was previously
            # mutated and re-yielded for every row.
            item = ZufangItem()
            item['href'] = message.xpath(
                './div[2]/h2/a/@href').extract_first()

            # extract_first() returns None rather than raising, so the only
            # failure the old bare ``except`` guarded against was
            # None.strip(); handle that case explicitly.
            describe = message.xpath('./div[2]/h2/a/text()').extract_first()
            item['describe'] = (describe.strip()
                                if describe is not None else None)

            # Raw string: '\s' in a normal literal is an invalid escape.
            item['room'] = message.xpath('./div[2]/p[1]/text()').re_first(
                r'(.*?)\s')

            location = message.xpath('./div[2]/p[2]/a').re('>(.*?)</a>')
            item['location'] = ' '.join(location)

            item['price'] = message.xpath(
                './div[3]/div[2]/b/text()').extract_first()
            yield item

        if next_page:
            # NOTE(review): time.sleep() blocks the whole process for 30 s;
            # if this runs under Scrapy, the DOWNLOAD_DELAY setting is the
            # non-blocking way to throttle. Kept to preserve crawl pacing.
            time.sleep(30)
            yield Request(next_page,
                          headers=headers,
                          callback=self.parse_chuzu)
Example #12
0
 def parse_content(self, response):
     """Yield a ZufangItem populated from a house detail page.

     ``address`` and ``typehouse`` keep the whole list of matched text
     nodes; all other fields take the first match and therefore raise
     IndexError when nothing matches.
     """

     def texts(query):
         # All text results for one XPath query, as a list of strings.
         return response.xpath(query).extract()

     house = ZufangItem()
     house['name'] = texts('//div[@class="house-title"]/h1/text()')[0]
     house['address'] = texts(
         '//div[@class="house-desc-item fl c_333"]/ul/li[3]/span/text()')
     house['price'] = texts(
         '//div[@class="house-desc-item fl c_333"]/div/span/b/text()')[0]
     house['typehouse'] = texts(
         '//div[@class="house-desc-item fl c_333"]/ul/li[1]/span/text()')
     house['region'] = texts(
         '//div[@class="house-desc-item fl c_333"]/ul/li[4]/span/a/text()')[0]
     house['contacts'] = texts(
         '//div[@class="house-agent-info fr"]/p/a/text()')[0]
     house['phone'] = texts(
         '//div[@class="house-chat-phone"]/span/text()')[0]
     yield house
Example #13
0
 def parse(self, response):
     """Parse the listing page with ``etree`` and yield an item per ``<li>``.

     The page text is re-parsed with ``etree.HTML`` and the listing rows
     are located through an absolute path from ``/html/body``. Each field
     stores the raw result of the row-level ``xpath`` call (not a single
     string). A start message, the parsed tree and every row's fields are
     printed.
     """
     tree = etree.HTML(response.text)
     print('开始爬虫')
     print(tree)
     rows = tree.xpath(
         '/ html / body / div[5] / div / div[5] / div[2] / ul / li')
     for row in rows:
         # Evaluated in the order name, price, community, decorate.
         fields = {
             'name': row.xpath('./ div[2] / h2 / a/text()'),
             'price': row.xpath('./ div[3] / div[2] / b/text()'),
             'community': row.xpath('./ div[2] / p[2] / text()'),
             'decorate': row.xpath('./ div[2] / p[1]/text()'),
         }
         print(fields['name'], fields['price'], fields['community'],
               fields['decorate'])
         yield ZufangItem(**fields)
Example #14
0
 def detail_page(self, response):
     """Yield one ZufangItem built from a rental detail page.

     Values are extracted positionally from the ``<p>`` children of
     ``div.zf-room``. Every field declared on ZufangItem that this
     callback produces a value for is populated; declared fields it does
     not produce are left unset.

     Args:
         response: the downloaded detail page (must support ``.xpath``
             and ``.url``).

     Yields:
         The populated ZufangItem.

     Raises:
         IndexError: if a required node (anything except the thumbnail
             image) is missing from the page.
     """

     def first_text(query):
         # First extracted result for an XPath query; IndexError if none.
         return response.xpath(query).extract()[0]

     # [:-2] drops the last two characters of the breadcrumb text
     # (presumably a fixed suffix after the city name -- TODO confirm).
     city = first_text('//div[@class="fl l-txt"]/a[2]/text()')[:-2]
     address = ''.join(response.xpath(
         '//div[@class="zf-room"]/p[7]/a/text()').extract())
     name = first_text('//div[@class="zf-room"]/p[6]/a/text()')
     price_temp = response.xpath(
         '//div[@class="price "]//span[1]/text()').extract()
     price = ''.join(price_temp)
     price_num = price_temp[0]
     area = first_text('//div[@class="zf-room"]/p[1]/text()')
     # [:-2] drops the last two characters (presumably the unit suffix).
     area_num = area[:-2]
     house_type = first_text(
         '//div[@class="zf-room"]/p[2]/text()').replace(' ', '')
     floor = first_text('//div[@class="zf-room"]/p[3]/text()')
     direction = first_text('//div[@class="zf-room"]/p[4]/text()')
     date_temp = first_text('//div[@class="zf-room"]/p[8]/text()')
     insert_time = self.detail_date(date_temp)
     try:
         img = first_text('//div[@class="thumbnail"]/ul/li[1]/@data-src')
     except IndexError:
         # No thumbnail on the page: keep the 'none' placeholder.
         img = 'none'

     # Explicit field -> value mapping. This replaces eval(field) over the
     # local variables, which executed field names as code and forced a
     # local named ``type`` that shadowed the builtin.
     values = {
         'city': city,
         'address': address,
         'name': name,
         'price': price,
         'price_num': price_num,
         'area': area,
         'area_num': area_num,
         'type': house_type,
         'floor': floor,
         'direction': direction,
         'insert_time': insert_time,
         'detail_page': response.url,
         'img': img,
         'source': 'lianjia',
     }
     item = ZufangItem()
     for field in item.fields:
         if field in values:
             item[field] = values[field]
     yield item