Example #1
    def parse(self, response):  # parse a listing page

        item = AnjukeItem()
        Houses = response.css("#list-content > div.zu-itemmod")
        for eachHouse in Houses:

            title = eachHouse.css("div.zu-info > h3 > a::text").extract()
            address = eachHouse.css("div.zu-info > address > a::text").extract(
            ) + eachHouse.css("div.zu-info > address::text").extract()  # concatenate the address pieces
            detail = eachHouse.css(
                "div.zu-info > p.details-item.tag::text").extract()
            price = eachHouse.css("div.zu-side > p > strong::text").extract()
            address = "".join(address)  #将list中字符串提取出来

            yield {
                'title': title,
                'address': address.replace(' ', '').replace('\n', ''),
                'detail': detail,
                'price': price,
            }
        nextLink = response.css(
            "div.page-content > div.multi-page > a.aNxt::attr(href)"
        )  # look for the next-page URL
        if len(nextLink) != 0:
            nextLink = nextLink.extract()
            nextLink = "".join(nextLink)  #将list中的字符串提取出来
            yield Request(nextLink, callback=self.parse, dont_filter=True)
        else:
            return
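
All of the snippets on this page populate an AnjukeItem defined in the project's items.py, which is not shown. A minimal sketch of what such a class could look like, with field names mirroring the dict yielded in Example #1 (an inferred reconstruction, not the original definition):

    import scrapy

    class AnjukeItem(scrapy.Item):
        # Fields mirror the dict yielded in Example #1; a real project declares
        # one scrapy.Field() per attribute assigned on the item.
        title = scrapy.Field()
        address = scrapy.Field()
        detail = scrapy.Field()
        price = scrapy.Field()
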
Example #2
 def parse_newHouse(self, response):
     sel = Selector(response)
     href = response.meta.get('href')
     # house information
     new_houses = sel.xpath('//*[@id="container"]/div[2]/div[1]/div[4]/div')
     for new_house in new_houses:
         item = AnjukeItem()  # fresh item per listing instead of mutating one shared instance
         if new_house.xpath('./div/a[1]/h3/span/text()'):
             item['id'] = new_house.xpath(
                 './@data-link').extract()[0][-11:-5]
             item['img'] = new_house.xpath('./a/img/@src').extract()[0]
             item['title'] = new_house.xpath(
                 './div/a[1]/h3/span/text()').extract()[0]
             item['address'] = new_house.xpath(
                 './div/a[2]/span/text()').extract()[0].replace(
                     '[', '').replace(']', '').replace('\xa0', ' ').strip()
             if new_house.xpath('./div/a[3]/span/text()'):
                 item['house_type'] = ','.join(
                     new_house.xpath('./div/a[3]/span/text()').extract())
             else:
                 item['house_type'] = new_house.xpath(
                     './div/a[3]/text()').extract()[0]
             item['status_icon'] = ','.join(
                 new_house.xpath('./div/a[4]/div/i/text()').extract())
             item['tag'] = ','.join(
                 new_house.xpath('./div/a[4]/div/span/text()').extract())
             item['name'] = response.meta.get('name')
             item['area_name'] = response.meta.get('area_name')
             item['type_name'] = response.meta.get('type')
             if new_house.xpath('./a[2]/p[1]/span/text()'):
                 item['price'] = new_house.xpath('./a[2]/p/text()').extract(
                 )[0] + new_house.xpath('./a[2]/p[1]/span/text()').extract(
                 )[0] + new_house.xpath('./a[2]/p[1]/text()').extract()[1]
                 if new_house.xpath('./a[2]/p[2]/text()'):
                     item['tel'] = new_house.xpath(
                         './a[2]/p[2]/text()').extract()[0]
             else:
                 item['price'] = new_house.xpath(
                     './a[2]/p/text()').extract()[0]
                 item['tel'] = new_house.xpath(
                     './a[2]/p/text()').extract()[-1]
             yield item
     if sel.xpath(
             '//*[@id="container"]/div[2]/div[1]/div[@class="list-page"]/div/a/text()'
     ):
         if sel.xpath(
                 '//*[@id="container"]/div[2]/div[1]/div[@class="list-page"]/div/a/text()'
         ).extract()[-1] == '下一页':
             yield Request(
                 href[:-3] + 'p' + str(int(response.meta.get('page')) + 1) +
                 '_' + href[-3:],
                 callback=self.parse_newHouse,
                 meta={
                     'href': href,
                     'name': response.meta.get('name'),
                     'area_name': response.meta.get('area_name'),
                     'type': response.meta.get('type'),
                     'page': str(int(response.meta.get('page')) + 1)
                 })
Example #3
 def parse_house_info(self, response):
     '''This function parses a sample response. Some contracts are mingled
         with this docstring.
         @url https://bj.zu.anjuke.com/?kw=%E8%A7%92%E9%97%A8&cw=%E8%A7%92%E9%97%A8
         @returns requests 1 100
         @scrapes title price trail
     '''
     # gather all the house info
     item_list = response.xpath('//div[contains(@class,"zu-item")]')
     next_page = response.xpath(
         '/html/body/div[5]/div[3]/div[3]/div/i[@class="curr"]//following-sibling::a[not(contains(text(),"下一页"))]//@href'
     ).extract()
     for i in item_list:
         item = AnjukeItem()
         item['title'] = i.xpath(
             'div[@class="zu-info"]//a[1]//@title')[0].extract().strip()
         item['house_detail_url'] = i.xpath(
             'div[@class="zu-info"]//a[1]/@href')[0].extract().strip()
         base = i.xpath(
             'div[@class="zu-info"]//p/text()[1]')[0].extract().strip()
         square = i.xpath(
             'div[@class="zu-info"]//p/text()[2]')[0].extract().strip()
         floor = i.xpath(
             'div[@class="zu-info"]//p/text()[3]')[0].extract().strip()
         contract = i.xpath(
             'div[@class="zu-info"]//p/text()[4]')[0].extract().strip()
         direction = i.xpath(
             'div[@class="zu-info"]//p[2]/span[2]')[0].extract().strip()
         try:
             trail = i.xpath('div[@class="zu-info"]//p[2]/span[3]'
                             )[0].extract().strip()
          except IndexError:
             trail = "None"
         price = i.xpath('div[@class="zu-side"]//p/strong//text()'
                         )[0].extract().strip()
         current_page = response.xpath(
             '/html/body/div[5]/div[3]/div[3]/div/i[@class="curr"]//text()'
         ).extract()
         item['base'] = base
         item['square'] = square
         item['floor'] = floor
         item['contract'] = contract
         item['direction'] = direction
         item['trail'] = trail
         item['price'] = price
         item['current_page'] = current_page
         yield scrapy.Request(url=item['house_detail_url'],
                              dont_filter=True,
                              meta={'item': item},
                              callback=self.parse_item)
     # crawl the next page
     for page in next_page:
         if page not in self.visited_set:
             self.visited_set.add(page)
             yield scrapy.Request(
                 url=page,
                 callback=self.parse_house_info,
                 dont_filter=True,
             )
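
The @url, @returns and @scrapes lines in the docstring above are Scrapy contracts: running the built-in `scrapy check <spider_name>` command fetches the @url page, asserts the callback yields between 1 and 100 requests, and checks that the scraped items populate the title, price and trail fields.
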
Example #4
 def parse2(self, response):
     selector = Selector(response)
     item = AnjukeItem()
     try:
         item['name'] = selector.xpath(
             "//div[@class='basic-info']/h1/text()").extract()[0]
     except Exception as e:
         item['name'] = ""
         print(e)
Example #5
 def parse_detail(self, response):
     house_info = response.xpath('//*[@class="houseInfo-wrap"]')
     if house_info:
         l = ItemLoader(AnjukeItem(), house_info)
         l.add_xpath('mode', '//div/div[2]/dl[1]/dd/text()')
         l.add_xpath('area', '//div/div[2]/dl[2]/dd/text()')
         l.add_xpath('floor', '//div/div[2]/dl[4]/dd/text()')
         l.add_xpath('age', '//div/div[1]/dl[3]/dd/text()')
         l.add_xpath('price', '//div/div[3]/dl[2]/dd/text()')
         l.add_xpath('location', '//div/div[1]/dl[1]/dd/a/text()')
         l.add_xpath('district', '//div/div[1]/dl[2]/dd/p/a[1]/text()')
         yield l.load_item()
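
Each add_xpath call above stores a list of raw strings on the loaded item. With recent Scrapy versions (which ship the itemloaders package), output processors declared on the fields let the same loader emit cleaned scalars instead; a sketch of an assumed items.py setup, not the original project's code:

    import scrapy
    from itemloaders.processors import MapCompose, TakeFirst

    class AnjukeItem(scrapy.Item):
        # TakeFirst() makes load_item() store a single stripped string
        # instead of a list for each field.
        mode = scrapy.Field(input_processor=MapCompose(str.strip),
                            output_processor=TakeFirst())
        area = scrapy.Field(input_processor=MapCompose(str.strip),
                            output_processor=TakeFirst())
        # ... floor, age, price, location and district declared the same way
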
Example #6
    def parse_dir_contents(self, response):
        item = AnjukeItem()

        str_nam = response.xpath('//div[@class="firstline clearfix"]//a/text()'
                                 ).extract_first().strip()
        str_nam = str(str_nam)
        str_nam = str_nam.split("的")[0]
        #str_nam = str_nam.replace('\\r','').replace('\\n','').replace('\\t','')
        #str_nam = str_nam.strip()
        item['name'] = str_nam
        str_phn = response.xpath('//title/text()').extract()
        str_phn = str(str_phn)
        item['phone'] = re.findall(r"1\d{10}", str_phn)
        yield item
Example #7
 def info(self, response):
     item = AnjukeItem()
     selector = scrapy.Selector(response)
     community = selector.xpath(
         '//*[@id="content"]/div[2]/div/div/h3/text()').extract()[0]
     average_price = selector.xpath(
         '//*[@id="content"]/div[2]/div/div/p/span/em/text()').extract()[0]
     item['community'] = community
     item['average_price'] = average_price
     print(item['community'], item['average_price'])
     print(type(item['community']))
     print(item)
     print(type(item))
     yield item
Example #8
 def parse_item(self, response):
     itemloader = AnjukeItemLoader(item=AnjukeItem(), response=response)
     itemloader.add_xpath('title', '//div[@class="lp-tit"]/h1/text()')
     itemloader.add_xpath('price', '//dd[contains(@class, "price")]/p/em')
     itemloader.add_xpath('around_price', '//dd[@class="around-price"]/span/text()')
     itemloader.add_xpath('house_type', '//dd[@class="ajust"]/div[@class="house-item"]/a/text()')
     itemloader.add_xpath('address', '//span[@class="lpAddr-text"]/text()')
     itemloader.add_xpath('phone', '//div[contains(@class, "tel-box")]/p/strong/text()')
     itemloader.add_xpath('opentime', '//p[contains(@class, "info-new")]', re=r'<label>最新开盘</label>\s+(.*)<a.*')
     itemloader.add_xpath('completetime', '//p[contains(@class, "info-new")]', re=r'<label>交房时间</label>\s+(.*)</p>.*')
     itemloader.add_value('url', response.url)
     # pattern = re.compile(r'.*m.anjuke.com.*')
     # if not len(pattern.findall(response.url)) > 0:
     yield itemloader.load_item()
Example #9
    def parse(self, response):
        divs = response.xpath(
            '''//li[@class="list-item"]''')  # use XPath to pull the listing blocks out of the response
        for div in divs:
            item = AnjukeItem()  # instantiate the item for this listing
            address = div.xpath(
                './/span[@class="comm-address"]/@title').extract_first(
                )  # community name and address come joined in one string, so split them below
            address1 = address[address.index("\xa0\xa0") +
                               2:]  # address part, delimited by "\xa0\xa0", e.g. 宝山-大华-真北路4333弄
            address2 = address1[address1.index("-") +
                                1:]  # first split on "-", e.g. 大华-真北路4333弄
            address3 = address2[address2.index("-") +
                                1:]  # second split on "-", e.g. 真北路4333弄

            name1 = address[:address.index("\xa0\xa0")]  # community name
            type_1 = None  # default so the item assignment below never hits an undefined name
            try:
                type_1 = div.xpath('.//div[@class="details-item"]/span/text()'
                                   ).extract_first()  # unit layout, e.g. two bedrooms and one living room
            except Exception:
                pass

            # item['tags'] = div.xpath('.//span[@class="item-tags tag-metro"]/text()').extract()  # tags the site assigns to the listing

            price = div.xpath('.//span[@class="price-det"]/strong/text()'
                              ).extract_first()  # price
            price1 = price + '万'  # append the unit (万 = 10,000 CNY)
            area1 = ''  # default in case the extraction below fails
            try:
                area = div.xpath(
                    './/div[@class="details-item"]/span/text()').extract()[1:2]
                area1 = ''.join(area)  # collapse the one-element list into a string
            except Exception:
                pass

            item['address'] = address3
            item['name'] = name1
            item['type_'] = type_1
            item['price'] = price1
            item['area'] = area1

            yield item

        next_ = response.xpath(
            '//div[@class="multi-page"]/a[@class="aNxt"]/@href').extract_first(
            )  # link to the next page
        print('-------next----------')
        print(next_)
        if next_:
            yield response.follow(url=next_,
                                  callback=self.parse)  # queue the next page for crawling
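
The index/slice arithmetic above does the job, but str.split with maxsplit expresses the same decomposition more directly; a small equivalent sketch (sample value taken from the comments, with a made-up community name):

    # "\xa0\xa0" separates the community name from the district-block-street part
    address = "某小区\xa0\xa0宝山-大华-真北路4333弄"
    name1, rest = address.split("\xa0\xa0", 1)  # name1 -> 某小区
    address3 = rest.split("-", 2)[-1]           # address3 -> 真北路4333弄
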
Example #10
    def parse_item(self, response):
        for each in response.xpath('//div[@class="zu-info"]'):

            item = AnjukeItem()
            titleold = each.xpath('h3').xpath("string(.)").extract()[0]
            title = self.zhuanma(titleold)
            link = each.xpath('h3/a/@href').extract()[0]
            sizeold = each.xpath(
                'p[@class="details-item tag"]/b/text()').extract()[0]
            size = self.zhuanma(sizeold)
            #content.replace(u'\xa0', u'')

            item['title'] = title.strip()
            item['link'] = link.strip()
            item['size'] = size.strip()

            yield item
Example #11
 def detail_parse(self, response):
     content = response.body.decode('utf-8')  # decode to str so the regexes below can match
     area_url = response.meta['area_url']
     page = response.meta['page']
     city = response.meta['city']
     area = response.meta['area']
     if 'antispam' in response.url:
         url = area_url + 'p{}/'.format(page)
         yield scrapy.Request(url,
                              headers=header,
                              callback=self.detail_parse,
                              meta={
                                  'city': city,
                                  'area': area,
                                  'page': page,
                                  'area_url': area_url
                              },
                              dont_filter=True)
     else:
         house_info = re.findall(r'<div class="zu-info">([\s\S]*?)</div>',
                                 content)
         for info in house_info[:-1]:
             items = AnjukeItem()  # fresh item per listing instead of mutating one shared instance
             pattern = re.search(r'title="([\s\S]*?)"[\s\S]*?href="(.*?)"',
                                 info)
             title = pattern.group(1)
             link = pattern.group(2)
             items['city'] = city
             items['area'] = area
             items['title'] = title
             items['link'] = link
             yield items
         if 'aNxt' in content:
             page += 1
             url = area_url + 'p{}/'.format(page)
             yield scrapy.Request(url,
                                  headers=header,
                                  callback=self.detail_parse,
                                  meta={
                                      'city': city,
                                      'area': area,
                                      'page': page,
                                      'area_url': area_url
                                  },
                                  dont_filter=True)
Example #12
    def parse(self, response):

        sel = Selector(response)
        item = AnjukeItem()
        item = self._item_init(item)
        try:
            fang_info = {'title': '', 'info': '', 'desc': '', 'pic_tab': ''}
            url = item['url'] = response.url
            fang_id = item['fang_id'] = (re.search(r'\d+_\d+', url)).group(0)
            item['body'] = (response.body).decode('gbk').encode('utf8')

            try:
                fang_info['title'] = sel.xpath(
                    '//div[@class="mainBoxL"]/div[@class="title"]').extract(
                    )[0]
            except Exception as e:
                print(Exception, ":", e)
            try:
                fang_info['info'] = sel.xpath(
                    '//div[@class="houseInfor clearfix"]/div[@class="inforTxt"]'
                ).extract()[0]
            except Exception as e:
                print(Exception, ":", e)

            try:
                fang_info['desc'] = sel.xpath(
                    '//div[@id="hsPro-pos"]/div[@class="describe mt10"]'
                ).extract()[0]
            except Exception as e:
                print(Exception, ":", e)

            try:
                fang_info['pic_tab'] = sel.xpath(
                    '//div[@id="hsPic-pos"]').extract()[0]
            except Exception as e:
                print(Exception, ":", e)

            m = hashlib.md5()
            m.update(str(fang_info).encode('utf-8'))  # md5 needs bytes in Python 3
            follow_value = m.hexdigest()
            yield item

        except Exception as e:
            print(Exception, ":", e)
Example #13
    def parse_item(self, response):

        item_loader = AnjukeItemLoader(item=AnjukeItem(), response=response)

        item_loader.add_xpath("title", "//h3[@class='long-title']/text()")
        item_loader.add_xpath("size", "//span[@class='info-tag'][2]/em/text()")
        item_loader.add_xpath("total_price",
                              "//span[@class='light info-tag']/em/text()")
        item_loader.add_xpath(
            "locate", "//div[@class='houseInfo-content']/p/a[1]/text()")
        meter_price = response.xpath("//div[@class='houseInfo-content']"
                                     )[2].xpath("text()").extract_first("")
        item_loader.add_value("meter_price", meter_price)
        crawl_time = datetime.now().strftime("%Y-%m-%d %H:%M")

        item_loader.add_value("crawl_time", crawl_time)
        anjuke_item = item_loader.load_item()

        yield anjuke_item
Example #14
    def parse(self, response):

        selector = Selector(response)

        infos = selector.xpath('//div[@class="zu-itemmod  "]')

        for info in infos:
            item = AnjukeItem()  # fresh item per listing instead of reusing one shared instance
            url = info.xpath('a/@href').extract()
            item['url'] = url

            price = info.xpath('div[2]/p/strong/text()').extract()
            item['price'] = price

            roomType = info.xpath('div[1]/p[1]/text()[1]').extract()
            item['roomType'] = roomType

            rentType = info.xpath('div[1]/p[1]/text()[2]').extract()
            item['rentType'] = rentType

            decoration = info.xpath('div[1]/p[1]/text()[3]').extract()
            item['decoration'] = decoration

            floor = info.xpath('div[1]/p[1]/text()[4]').extract()
            item['floor'] = floor

            area = info.xpath('div[1]/address/a/text()').extract()
            if len(area):
                item['area'] = area[0].strip()

            address = info.xpath('div[1]/address/text()').extract()
            if len(address) > 1:
                item['address'] = address[1].strip()

            title = info.xpath('div[1]/h3/a/text()').extract()
            item['title'] = title

            yield item

        for i in range(2, 14):
            nexturl = 'http://sh.zu.anjuke.com/fangyuan/huacao/fx3-p%s/' % i

            yield Request(nexturl, callback=self.parse)
Example #15
    def parse(self, response):
        selector = Selector(response)
        HouseData = selector.xpath('//*[@id="list-content"]/div')  # div[1] and div[2] must be discarded
        for eachhouse in HouseData[3:]:
            item = AnjukeItem()  # one item holding all the fields, created fresh per listing
            house_type = eachhouse.xpath('div[1]/p[1]/text()[1]').extract()
            rent_type = eachhouse.xpath('div[1]/p[1]/text()[2]').extract()
            renovation = eachhouse.xpath('div[1]/p[1]/text()[3]').extract()
            address = eachhouse.xpath('div[1]/address/text()').extract()
            owner = eachhouse.xpath('div[1]/p[2]/span/text()').extract()
            price = eachhouse.xpath('div[2]/p/strong/text()').extract()  # careful: div[2]/..., not /div[2]/... (that absolute path was a trap)

            if house_type:
                item['house_type'] = house_type
            else:
                item['house_type'] = None
            if rent_type:
                item['rent_type'] = rent_type
            else:
                item['rent_type'] = None
            if renovation:
                item['renovation'] = renovation
            else:
                item['renovation'] = None
            if address:
                item['address'] = address
            else:
                item['address'] = None
            if owner:
                item['owner'] = owner
            else:
                item['owner'] = None
            if price:
                item['price'] = price
            else:
                item['price'] = None
            yield item

        nextpage = selector.xpath('//div[@class="multi-page"]/a/@href').extract()[-1]  # take the last href; the link order is not reliable
        print(nextpage)
        if nextpage:
            yield Request(nextpage,callback=self.parse)
Example #16
    def parse_detail(self, response):

        print("Downloading detail page", response.url)
        html = response.body.decode("utf-8", 'ignore')
        info = response.meta['info1']

        res = re.search(r'.*area.*?(\[.*?\]).*', html).group(1)
        if res:
            res_list = eval(res)
            history = []
            for item in res_list:
                for _, v in item.items():
                    history.append(v)
            info['history'] = '|'.join(history)

        from items import AnjukeItem

        item = AnjukeItem()

        for k in info.keys():
            item[k] = info[k]
        yield item
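
eval() on text captured from a page will execute whatever Python the regex happens to match. ast.literal_eval is a safer drop-in here: it parses the same list/dict literals but refuses anything else (a sketch of the substitution):

    import ast

    # raises ValueError/SyntaxError instead of executing arbitrary code
    res_list = ast.literal_eval(res)
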
Example #17
    def parse(self, response):

        sel = Selector(response)
        item = AnjukeItem()
        item = self._item_init(item)

        try:
            house_info = sel.xpath(
                '//h4[@class="block-title houseInfo-title"]/span/text()'
            ).extract()[0]
            item['anjuke_id'] = (re.search(r"\d{9,}", house_info)).group(0)
            item['deploy_time'] = (re.search(r"\d{4}年\d{2}月\d{2}日",
                                             house_info)).group(0)
        except Exception as e:
            print(Exception, ":", e)
        try:
            item['Cur_url'] = response.url
        except Exception as e:
            print(Exception, ":", e)
        try:
            item['City'] = (sel.xpath(
                '//*[@id="content"]/div[1]/a[2]/text()').extract()[0]).replace(
                    '二手房', '')
        except Exception as e:
            print(Exception, ":", e)
        try:
            item['District'] = (sel.xpath(
                '//*[@id="content"]/div[1]/a[3]/text()').extract()[0]).replace(
                    '二手房', '')
        except Exception as e:
            print(Exception, ":", e)
        try:
            item['Block'] = (sel.xpath(
                '//*[@id="content"]/div[1]/a[4]/text()').extract()[0]).replace(
                    '二手房', '')
        except Exception as e:
            print(Exception, ":", e)
        try:
            item['Estate'] = sel.xpath(
                '//*[@id="content"]/div[1]/a[4]/text()').extract()[0]
        except Exception as e:
            print(Exception, ":", e)
        try:
            item['Title'] = sel.xpath(
                '//*[@id="content"]/div[@class="wrapper"]/h3[@class="long-title"]/text()'
            ).extract()[0]
        except Exception as e:
            print(Exception, ":", e)
        try:
            item['Price'] = (re.compile(r'<[^>]+>', re.S)).sub(
                '',
                sel.xpath('//*[@id="content"]/div[2]/div[1]/div[1]/span[1]').
                extract()[0])
        except Exception as e:
            print(Exception, ":", e)
        try:
            item['Layout'] = (sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '房型:').extract()[0]).replace('\n', '').replace('\t', '')
        except Exception as e:
            print(Exception, ":", e)
        try:
            item['Decoration'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '装修程度:').extract()[0]
        except Exception as e:
            print(Exception, ":", e)
        try:
            item['Location'] = (re.compile(r'<[^>]+>', re.S)).sub(
                '',
                sel.xpath(
                    '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/p'
                    % '位置:').extract()[0]).replace('\n', '').replace('\t', '')
        except Exception as e:
            print(Exception, ":", e)
        try:
            item['Area'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '面积:').extract()[0]
        except Exception as e:
            print(Exception, ":", e)
        try:
            item['Unit_Price'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '房屋单价:').extract()[0]
        except Exception as e:
            print(Exception, ":", e)
        try:
            item['Years'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '年代:').extract()[0]
        except Exception as e:
            print(Exception, ":", e)
        try:
            item['Orientation'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '朝向:').extract()[0]
        except Exception as e:
            print(Exception, ":", e)
        try:
            item['Downpayment'] = (sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '参考首付:').extract()[0]).replace('\n', '').replace('\t', '')
        except Exception as e:
            print(Exception, ":", e)
        try:
            item['Type'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '类型:').extract()[0]
        except Exception as e:
            print(Exception, ":", e)
        try:
            item['Floor'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '楼层:').extract()[0]
        except Exception as e:
            print(Exception, ":", e)
        try:
            item['Monthly_Payments'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/span/text()'
                % '参考月供:').extract()[0]
        except Exception as e:
            print(Exception, ":", e)
        try:
            item['Desc'] = (re.compile(r'<[^>]+>', re.S)).sub(
                '',
                sel.xpath(
                    '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[3]/div/div'
                ).extract()[0])
        except Exception as e:
            print(Exception, ":", e)
        try:
            item['Agent'] = sel.xpath(
                '//p[@class="broker-name"]/a/text()').extract()[0]
        except Exception as e:
            print(Exception, ":", e)
        try:
            item['Agent_Phone'] = (sel.xpath(
                '//p[@class="broker-mobile"]/text()').extract()[0]).replace(
                    ' ', '')
        except Exception as e:
            print(Exception, ":", e)
        try:
            item['Agent_Company'] = sel.xpath(
                '//div[@class="broker-company"]/a[1]/text()').extract()[0]
        except Exception as e:
            print(Exception, ":", e)
        yield item
Example #18
    def parse(self, response):

        print("Parsing page %s >>> " % self.page_index)
        crt_url = response.url
        print("Current url: {url}".format(url=crt_url))

        pnum = extract_page_index(crt_url)
        print(pnum)
        cache_crt_page_index(pnum)

        print('------------------------------------------------')
        info_list = response.xpath("//*[@id='houselist-mod-new']/li")
        for info in info_list:
            item = AnjukeItem()  # fresh item per listing instead of reusing one shared instance
            # title
            title = info.xpath("./div[2]/div[1]/a/text()").extract_first()
            # "安选" verified-listing info
            guarantee_info = info.xpath(
                "./div[2]/div[1]/em/@title").extract_first()
            # link
            link = info.xpath("./div[2]/div[1]/a/@href").extract_first()
            # house id
            house_id = extract_house_id(link)
            # unit layout
            house_type = info.xpath(
                "./div[2]/div[2]/span[1]/text()").extract_first()
            # floor area
            area = info.xpath("./div[2]/div[2]/span[2]/text()").extract_first()
            # floor info
            floor_info = info.xpath(
                "./div[2]/div[2]/span[3]/text()").extract_first()
            # construction year
            build_time_info = info.xpath(
                "./div[2]/div[2]/span[4]/text()").extract_first()
            # broker name
            broker_name = info.xpath(
                "./div[2]/div[2]/span[5]/text()").extract_first()
            # address
            address = info.xpath("./div[2]/div[3]/span/text()").extract_first()
            # tags
            tags = []
            for tag in info.xpath("./div[2]/div[4]"):
                tag_str = tag.xpath("./span/text()").extract()
                tags.extend(tag_str)
            # total price
            price = info.xpath(
                "./div[3]/span[1]/strong/text()").extract_first()
            # price per square metre
            unit_price = info.xpath("./div[3]/span[2]/text()").extract_first()

            # copy the values onto the item ------------
            item['house_id'] = house_id
            item['title'] = title.strip() if title else ''
            item['guarantee_info'] = guarantee_info if guarantee_info else ''
            item['link'] = link if link else ''
            item['house_type'] = house_type if house_type else ''
            item['area'] = area if area else ''
            item['floor_info'] = floor_info if floor_info else ''
            item['build_time_info'] = build_time_info if build_time_info else ''
            item['broker_name'] = broker_name if broker_name else ''
            item['address'] = address.strip() if address else ''
            item['tags'] = tags if tags else []
            item['price'] = price if price else ''
            item['unit_price'] = unit_price if unit_price else ''
            yield item

        # next page URL
        next_page_url = response.xpath(
            "//*[@id='content']/div[4]/div[7]/a[@class='aNxt']/@href"
        ).extract_first()
        print(next_page_url)
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))

        self.page_index += 1
Example #19
    def parse_item(self, response):
        meta = response.meta
        if 'callback' not in response.url and response.status == 200:
            li = response.xpath('//ul[@id="houselist-mod-new"]/li')
            if li:
                for i in li:
                    city_name = meta.get('city_name')

                    area = meta.get('area')
                    location = meta.get('location')
                    item = AnjukeItem()
                    item['area'] = area
                    item['location'] = location
                    item['city_name'] = city_name

                    title = i.xpath(
                        './div[@class="house-details"]/div/a/@title').extract(
                        )
                    base_url = i.xpath(
                        './div[@class="house-details"]/div/a/@href').extract()
                    if base_url:
                        item['base_url'] = ''.join(base_url)
                    else:
                        item['base_url'] = ''
                    if title:
                        item['title'] = ''.join(title)
                    else:
                        item['title'] = ''
                    addr = i.xpath(
                        './div[@class="house-details"]/div[@class="details-item"]/span[@class="comm-address"]/@title'
                    ).extract()
                    if addr:
                        item['addr'] = ''.join(addr)
                    else:
                        item['addr'] = ''
                    sum_price = i.xpath(
                        './div[@class="pro-price"]/span[@class="price-det"]/strong/text()'
                    ).extract()
                    if sum_price:
                        item['sum_price'] = ''.join(sum_price)
                    else:
                        item['sum_price'] = ''
                    unit_price = i.xpath(
                        './div[@class="pro-price"]/span[@class="unit-price"]/text()'
                    ).extract()
                    if unit_price:
                        item['unit_price'] = ''.join(unit_price)
                    else:
                        item['unit_price'] = ''
                    item['url'] = response.url
                    item['dt'] = dt
                    yield item

            next_url = response.xpath(
                '//div[@class="multi-page"]/a[@class="aNxt"]/@href').extract()
            if next_url:
                city_name = meta.get('city_name')
                area = meta.get('area')
                location = meta.get('location')
                url = ''.join(next_url)
                yield scrapy.Request(url,
                                     meta={
                                         'url': url,
                                         'city_name': city_name,
                                         'area': area,
                                         'location': location
                                     },
                                     callback=self.parse_item,
                                     dont_filter=True)
        else:
            url = meta.get('url')
            city_name = meta.get('city_name')
            area = meta.get('area')
            location = meta.get('location')
            if url:
                yield scrapy.Request(url,
                                     meta={
                                         'url': url,
                                         'city_name': city_name,
                                         'area': area,
                                         'location': location
                                     },
                                     callback=self.parse_item,
                                     dont_filter=True)
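
The else branch at the end, which re-yields the URL from meta with dont_filter=True, is a hand-rolled retry for anti-spam pages. On Scrapy 2.5+ the same idea can lean on the built-in retry helper instead; a sketch under that assumption, not the original code:

    from scrapy.downloadermiddlewares.retry import get_retry_request

    # inside the callback, once an anti-spam interstitial is detected:
    retry_request = get_retry_request(response.request,
                                      spider=self,
                                      reason='antispam page')
    if retry_request is not None:  # None once max retries are exhausted
        yield retry_request
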
Example #20
    def parse(self, response):

        item = AnjukeItem()
        item = self._item_init(item)
        sel = Selector(response)
        item['batch_id'] = batch_id = response.meta['batch_id']
        item['submit_time'] = submit_time = response.meta['submit_time']
        item['schedule_time'] = schedule_time = str(
            time.strftime("%Y-%m-%d %H:%M:%S",
                          time.localtime(response.meta['schedule_time'])))
        item['received_time'] = received_time = str(
            time.strftime("%Y-%m-%d %H:%M:%S",
                          time.localtime(response.meta['received_time'])))
        item['page_index'] = page_index = response.meta['page_index']
        server_time = time.mktime(
            time.strptime(response.headers['Date'].decode(),
                          "%a, %d %b %Y %H:%M:%S %Z")) + 8 * 3600
        item['server_time'] = str(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(server_time)))
        try:
            if sel.xpath('//div[@class="list sorry_word"]') != []:
                if 'retry_count' in response.meta:
                    retry_count = int(response.meta['retry_count'])
                else:
                    retry_count = 0
                if retry_count <= 2:
                    print("retry......")
                    yield Request(url=response.url,
                                  method='GET',
                                  callback=self.parse,
                                  meta={
                                      'submit_time': submit_time,
                                      'schedule_time': schedule_time,
                                      'received_time': received_time,
                                      'retry_count': retry_count + 1,
                                      'page_index': page_index,
                                      'batch_id': batch_id
                                  })
                else:
                    return
            if len(sel.xpath(
                    '//div[@class="houseList"]/dl[@class="list rel"]')) > 30:
                dl_list = sel.xpath(
                    '//div[@class="houseList"]/dl[@class="list rel"]')
                for dl_index in range(1, len(dl_list)):
                    try:
                        item['fang_id'] = fang_id = (re.search(
                            r'\d_\d+', (dl_list[dl_index].xpath(
                                './dd[@class="info rel floatr"]/p[@class="title"]/a/@href'
                            ).extract()[0]))).group(0)
                        item['rank'] = rank = 30 * (page_index - 1) + dl_index
                        item['update_tag'] = update_tag = dl_list[
                            dl_index].xpath(
                                './dd[@class="info rel floatr"]/p[@class="gray6 mt10"]/span[@class="ml10 gray9"]/text()'
                            ).extract()[0]
                        if re.match(r'\d+秒前更新', update_tag):
                            deviation = int(update_tag.replace('秒前更新', ''))
                            item['update_time'] = update_time = str(
                                time.strftime(
                                    "%Y-%m-%d %H:%M:%S",
                                    time.localtime(server_time - deviation)))
                        if re.match(r'\d+分钟前更新', update_tag):
                            deviation = int(
                                update_tag.replace('分钟前更新', '')) * 60
                            item['update_time'] = update_time = str(
                                time.strftime(
                                    "%Y-%m-%d %H:%M:%S",
                                    time.localtime(server_time - deviation)))
                        if re.match(r'\d+小时前更新', update_tag):
                            deviation = int(
                                update_tag.replace('小时前更新', '')) * 3600
                            item['update_time'] = update_time = str(
                                time.strftime(
                                    "%Y-%m-%d %H:%M:%S",
                                    time.localtime(server_time - deviation)))
                        if re.match(r'\d+天前更新', update_tag):
                            deviation = int(
                                update_tag.replace('天前更新', '')) * 3600 * 24
                            item['update_time'] = update_time = str(
                                time.strftime(
                                    "%Y-%m-%d %H:%M:%S",
                                    time.localtime(server_time - deviation)))
                    except Exception as e:
                        print(Exception, ":", e)
                    yield item
            else:
                dl_list = sel.xpath(
                    '//div[@class="houseList"]/dl[@class="list rel"]')
                for dl_index in range(0, len(dl_list)):
                    try:
                        item['fang_id'] = fang_id = (re.search(
                            r'\d_\d+', (dl_list[dl_index].xpath(
                                './dd[@class="info rel floatr"]/p[@class="title"]/a/@href'
                            ).extract()[0]))).group(0)
                        item['rank'] = rank = 30 * (page_index -
                                                    1) + dl_index + 1
                        item['update_tag'] = update_tag = dl_list[
                            dl_index].xpath(
                                './dd[@class="info rel floatr"]/p[@class="gray6 mt10"]/span[@class="ml10 gray9"]/text()'
                            ).extract()[0]
                        if re.match(r'\d+秒前更新', update_tag):
                            deviation = int(update_tag.replace('秒前更新', ''))
                            item['update_time'] = update_time = str(
                                time.strftime(
                                    "%Y-%m-%d %H:%M:%S",
                                    time.localtime(server_time - deviation)))
                        if re.match(r'\d+分钟前更新', update_tag):
                            deviation = int(
                                update_tag.replace('分钟前更新', '')) * 60
                            item['update_time'] = update_time = str(
                                time.strftime(
                                    "%Y-%m-%d %H:%M:%S",
                                    time.localtime(server_time - deviation)))
                        if re.match(r'\d+小时前更新', update_tag):
                            deviation = int(
                                update_tag.replace('小时前更新', '')) * 3600
                            item['update_time'] = update_time = str(
                                time.strftime(
                                    "%Y-%m-%d %H:%M:%S",
                                    time.localtime(server_time - deviation)))
                        if re.match(r'\d+天前更新', update_tag):
                            deviation = int(
                                update_tag.replace('天前更新', '')) * 3600 * 24
                            item['update_time'] = update_time = str(
                                time.strftime(
                                    "%Y-%m-%d %H:%M:%S",
                                    time.localtime(server_time - deviation)))
                    except Exception as e:
                        print(Exception, ":", e)
                    yield item
        except Exception as e:
            print(Exception, ":", e)
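
The four nearly identical re.match branches (seconds, minutes, hours, days) repeated in both halves of Example #20 can be collapsed into one table-driven helper; a Python 3 sketch, with the unit strings copied from the code above:

    import re
    import time

    _UNIT_SECONDS = {'秒': 1, '分钟': 60, '小时': 3600, '天': 86400}

    def update_time_from_tag(update_tag, server_time):
        """Turn a tag like '5分钟前更新' into a 'YYYY-mm-dd HH:MM:SS' string."""
        m = re.match(r'(\d+)(秒|分钟|小时|天)前更新', update_tag)
        if m is None:
            return None
        deviation = int(m.group(1)) * _UNIT_SECONDS[m.group(2)]
        return time.strftime("%Y-%m-%d %H:%M:%S",
                             time.localtime(server_time - deviation))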