Ejemplo n.º 1
0
    def parse_list(self, response):
        """Parse a hotel listing page and queue the next listing page.

        The listing data is read from JSON embedded in the page source after
        the ``"hotelList"`` key.  One ``HotelItem`` is yielded per entry of
        its ``records`` list; afterwards a ``Request`` for the first link
        matched by ``.page .next`` is yielded with this method as callback.

        Args:
            response: The downloaded listing page.

        Yields:
            ``HotelItem`` objects, then at most one follow-up ``Request``.
        """
        self.logger.debug('parse_list response.url:' + response.url)

        m = re.findall('{"hotelList":(.*),"sortInfo"', response.text)
        self.logger.debug('parse_list m: {}'.format(m))

        records = []
        if m:
            # The captured fragment is completed with '}' before decoding.
            records = json.loads(m[0] + '}').get('records') or []
        else:
            # Without the embedded payload there is nothing to extract, but
            # pagination below is still attempted.
            self.logger.warning(
                'parse_list: no hotelList payload in %s', response.url)

        for record in records:
            # A fresh item per record: reusing one instance would make every
            # yielded item the same mutable object.
            item = HotelItem()
            item['title'] = record.get('shopName')
            item['url'] = 'http://www.dianping.com' + record.get('shopUrl')
            item['is_bookable'] = record.get('isBookable')
            item['location'] = record.get('regionName')
            item['walk_distance'] = record.get('distanceText')
            item['price'] = record.get('price')
            star = record.get('star')
            # A record without a rating keeps None instead of failing on
            # ``None / 10``.
            item['star'] = star / 10 if star is not None else None
            item['review_num'] = record.get('reviewCount')
            item['number'] = record.get('id')
            item['pic_array'] = str(record.get('picArray')).replace(
                '\'', '').strip('[').strip(']')
            getlocation(item)
            yield item

        links = LinkExtractor(
            restrict_css='.page .next').extract_links(response)
        if links:
            next_url = links[0].url
            self.logger.debug('parse_list next_url:{}'.format(next_url))
            yield Request(next_url, callback=self.parse_list)
Ejemplo n.º 2
0
    def parse_list(self, response):
        """Parse a shop listing page into ``FilmItem`` objects.

        Every ``<li>`` under ``#shop-all-list>ul`` is treated as one shop.

        Args:
            response: The downloaded listing page.

        Yields:
            One ``FilmItem`` per listed shop.
        """
        shops = response.css('#shop-all-list>ul>li')
        self.logger.debug('parse_list li:{}  response.url: {}'.format(
            shops.css('.txt>.tit h4::text').extract(), response.url))
        for shop in shops:
            # A fresh item per shop: reusing one instance would let optional
            # fields (branch, review_num, mean_price) of an earlier shop leak
            # into later ones, and every yielded item would be the same
            # mutable object.
            item = FilmItem()
            item['title'] = shop.css(
                '.txt>.tit h4::text').extract_first().strip()
            item['url'] = shop.css('.txt>.tit>a::attr(href)').extract_first()
            if shop.css('.shop-branch::text'):
                item['branch'] = shop.css(
                    '.shop-branch::attr(href)').extract_first()
            item['img'] = shop.css('img::attr(data-src)').extract_first()
            item['star'] = float(
                shop.css('.sml-rank-stars::attr(class)').re_first(
                    r'[1-9]\d*|0')) / 10
            # Test and read the very same selector so a passing check can
            # never be followed by int(None).
            review_num = shop.css('.review-num>b::text').extract_first()
            if review_num is not None:
                self.logger.debug('review-num : {}'.format(review_num))
                item['review_num'] = int(review_num)
            mean_price = shop.css('.mean-price b::text').extract_first()
            if mean_price is not None:
                item['mean_price'] = mean_price.strip('¥')
            tags = shop.css('.tag-addr span::text').extract()
            self.logger.debug('type location 1: {}'.format(tags))
            # Fall back to '' when a span is missing instead of raising
            # IndexError.
            item['type'] = tags[0].strip() if tags else ''
            item['location'] = tags[1].strip() if len(tags) > 1 else ''
            item['address'] = shop.css('.addr::text').extract_first().strip()
            getlocation(item)
            # The last path segment of the shop URL is stored as its number.
            item['number'] = item['url'].split('/')[-1]
            yield item
Ejemplo n.º 3
0
    def parse_list(self, response):
        """Parse a shop listing page into ``TourItem`` objects and paginate.

        Every ``<li>`` under ``#shop-all-list>ul`` is treated as one shop.
        After the items, a ``Request`` for the first link matched by
        ``div.page > a.next`` is yielded with this method as callback.

        Args:
            response: The downloaded listing page.

        Yields:
            ``TourItem`` objects, then at most one follow-up ``Request``.
        """
        self.logger.debug('parse_list response.url:' + response.url)

        shops = response.css('#shop-all-list>ul>li')
        self.logger.debug('parse_list li:{}  response.url: {}'.format(
            shops.css('.txt>.tit h4::text').extract(), response.url))
        for shop in shops:
            # A fresh item per shop: reusing one instance would let optional
            # fields (branch, review_num, mean_price, scores) of an earlier
            # shop leak into later ones, and every yielded item would be the
            # same mutable object.
            item = TourItem()
            item['title'] = shop.css(
                '.txt>.tit h4::text').extract_first().strip()
            item['url'] = shop.css('.txt>.tit>a::attr(href)').extract_first()
            if shop.css('.shop-branch::text'):
                item['branch'] = shop.css(
                    '.shop-branch::attr(href)').extract_first()
            item['img'] = shop.css('img::attr(data-src)').extract_first()
            item['star'] = float(
                shop.css('.sml-rank-stars::attr(class)').re_first(
                    r'[1-9]\d*|0')) / 10
            # Test and read the very same selector so a passing check can
            # never be followed by int(None).
            review_num = shop.css('.review-num>b::text').extract_first()
            if review_num is not None:
                self.logger.debug('review-num : {}'.format(review_num))
                item['review_num'] = int(review_num)
            mean_price = shop.css('.mean-price b::text').extract_first()
            if mean_price is not None:
                item['mean_price'] = mean_price.strip('¥')
            scores = shop.css('.comment-list b::text').extract()
            self.logger.debug(
                '1111111 score environment service: {}'.format(scores))
            # All three values are read positionally, so require all three
            # rather than just a non-empty list.
            if len(scores) >= 3:
                item['score'] = float(scores[0])
                item['environment'] = float(scores[1])
                item['service'] = float(scores[2])
            tags = shop.css('.tag-addr span::text').extract()
            self.logger.debug('type location 1: {}'.format(tags))
            # Fall back to '' when a span is missing instead of raising
            # IndexError.
            item['type'] = tags[0].strip() if tags else ''
            item['location'] = tags[1].strip() if len(tags) > 1 else ''
            item['address'] = shop.css('.addr::text').extract_first().strip()
            getlocation(item)
            # The last path segment of the shop URL is stored as its number.
            item['number'] = item['url'].split('/')[-1]
            yield item

        links = LinkExtractor(
            restrict_css='div.page > a.next').extract_links(response)
        if links:
            next_url = links[0].url
            self.logger.debug('next_url:' + next_url)
            yield Request(next_url, callback=self.parse_list)
Ejemplo n.º 4
0
    def parse_list(self, response):
        """Parse a shop listing page into ``WeddingItem`` objects.

        Every ``<li>`` directly under ``.shop-list`` is treated as one shop.

        Args:
            response: The downloaded listing page.

        Yields:
            One ``WeddingItem`` per listed shop.
        """
        shops = response.css('.shop-list>li')
        self.logger.debug('parse_list li:{}  response.url: {}'.format(
            shops.css('.shopname::text').extract(), response.url))
        for shop in shops:
            # A fresh item per shop: reusing one instance would make every
            # yielded item the same mutable object.
            item = WeddingItem()
            item['title'] = shop.css('.shopname::text').extract_first()
            item['url'] = 'http:' + shop.css(
                '.shopname::attr(href)').extract_first()
            # Prefer the lazy-load attribute and fall back to ``src``; when
            # neither exists the field is left unset instead of failing on
            # ``'http:' + None``.
            img = (shop.css('a img::attr(data-lazyload)').extract_first()
                   or shop.css('a img::attr(src)').extract_first())
            if img:
                item['img'] = 'http:' + img
            item['star'] = float(
                shop.css('.item-rank-rst::attr(class)').re_first(
                    r'[1-9]\d*|0')) / 10
            item['review_num'] = shop.css(
                'p.remark > span:nth-child(2) > a::text'
            ).re_first(r'[1-9]\d*|0')
            item['mean_price'] = shop.css('.price::text').extract_first()
            item['product_photos'] = shop.css(
                'p.remark > span:nth-child(3) > a::text').extract_first()
            area = shop.css('.area-list::text').extract_first()
            # Collapse all whitespace runs in the area text to single spaces.
            item['location'] = ' '.join(area.split()) if area else ''
            getlocation(item)
            # The last path segment of the shop URL is stored as its number.
            item['number'] = item['url'].split('/')[-1]
            yield item
Ejemplo n.º 5
0
    def parse_list(self, response):
        """Parse a shop listing page into ``FoodItem`` objects.

        Every ``<li>`` under ``#shop-all-list>ul`` is treated as one shop.

        Args:
            response: The downloaded listing page.

        Yields:
            One ``FoodItem`` per listed shop.
        """
        shops = response.css('#shop-all-list>ul>li')
        self.logger.debug('parse_list li:{}  response.url: {}'.format(
            shops.css('.txt>.tit h4::text').extract(), response.url))
        for shop in shops:
            # A fresh item per shop: reusing one instance would let optional
            # fields (branch, review_num, mean_price, scores, recommend) of
            # an earlier shop leak into later ones, and every yielded item
            # would be the same mutable object.
            item = FoodItem()
            item['title'] = shop.css(
                '.txt>.tit h4::text').extract_first().strip()
            item['url'] = shop.css('.txt>.tit>a::attr(href)').extract_first()
            if shop.css('.shop-branch::text'):
                item['branch'] = shop.css(
                    '.shop-branch::attr(href)').extract_first()
            item['img'] = shop.css('img::attr(data-src)').extract_first()
            item['star'] = float(
                shop.css('.sml-rank-stars::attr(class)').re_first(
                    r'[1-9]\d*|0')) / 10
            # Test and read the very same selector so a passing check can
            # never be followed by int(None).
            review_num = shop.css('.review-num>b::text').extract_first()
            if review_num is not None:
                self.logger.debug('review-num : {}'.format(review_num))
                item['review_num'] = int(review_num)
            mean_price = shop.css('.mean-price b::text').extract_first()
            if mean_price is not None:
                item['mean_price'] = mean_price.strip('¥')
            scores = shop.css('.comment-list b::text').extract()
            self.logger.debug(
                '1111111 taste environment service: {}'.format(scores))
            # All three values are read positionally, so require all three
            # rather than just a non-empty list.
            if len(scores) >= 3:
                item['taste'] = float(scores[0])
                item['environment'] = float(scores[1])
                item['service'] = float(scores[2])
            if shop.css('div.txt > div.recommend'):
                recommended = shop.css(
                    'div.txt > div.recommend a::text').extract()
                self.logger.debug(
                    "i.css('div.txt > div.recommend a') : {}".format(
                        recommended))
                item['recommend'] = ' '.join(recommended)
            tags = shop.css('.tag-addr span::text').extract()
            self.logger.debug('type location 1: {}'.format(tags))
            # Fall back to '' when a span is missing instead of raising
            # IndexError.
            item['type'] = tags[0].strip() if tags else ''
            item['location'] = tags[1].strip() if len(tags) > 1 else ''
            item['address'] = shop.css('.addr::text').extract_first().strip()
            getlocation(item)
            # The last path segment of the shop URL is stored as its number.
            item['number'] = item['url'].split('/')[-1]
            yield item
Ejemplo n.º 6
0
    def parse_list(self, response):
        """Parse a shop listing page into ``HomeItem`` objects and paginate.

        Every ``.shop-list-item`` directly under ``.shop-list`` is treated as
        one shop.  After the items, a ``Request`` for the first link matched
        by ``div.pages a.nextPage`` is yielded with this method as callback.

        Args:
            response: The downloaded listing page.

        Yields:
            ``HomeItem`` objects, then at most one follow-up ``Request``.
        """
        self.logger.debug('parse_list response.url:' + response.url)

        for shop in response.css('.shop-list>.shop-list-item'):
            # A fresh item per shop: reusing one instance would let fields of
            # an earlier shop survive whenever a later shop does not set
            # them, and every yielded item would be the same mutable object.
            item = HomeItem()
            item['title'] = shop.css(
                '.shop-title>h3>a::text').extract_first().strip()
            item['url'] = 'https:' + shop.css(
                '.shop-title>h3>a::attr(href)').extract_first()
            # Prefer the lazy-load attribute and fall back to ``src``; when
            # neither exists the field is left unset instead of failing on
            # ``'http:' + None``.
            img = (shop.css(
                '.shop-images img::attr(data-src)').extract_first()
                or shop.css('.shop-images img::attr(src)').extract_first())
            if img:
                item['img'] = 'http:' + img
            item['star'] = float(
                shop.css('.item-rank-rst::attr(class)').re_first(
                    r'[1-9]\d*|0')) / 10
            review_num = shop.css('.user-comment>a::text').re_first(
                r'[1-9]\d*|0')
            # Shops without a comment count leave the field unset instead of
            # failing on int(None).
            if review_num is not None:
                item['review_num'] = int(review_num)
            td = shop.css('.shop-info-text-i>span::text').extract()
            self.logger.debug('td : {}'.format(td))
            if len(td) == 4:
                item['type'] = ' '.join((td[0], td[1]))
                item['district'] = td[2]
                item['location'] = td[3]
            elif len(td) == 3:
                item['type'] = td[0]
                item['district'] = td[1]
                item['location'] = td[2]
            elif len(td) == 2:
                item['type'] = td[0]
                item['district'] = td[1]
                item['location'] = ''
            else:
                # Unrecognised layout: record empty values so the item still
                # carries all three fields.
                self.logger.debug('td in else: {}'.format(td))
                item['type'] = ''
                item['district'] = ''
                item['location'] = ''
            getlocation(item)
            # The last path segment of the shop URL is stored as its number.
            item['number'] = item['url'].split('/')[-1]
            yield item

        links = LinkExtractor(
            restrict_css='div.pages a.nextPage').extract_links(response)
        if links:
            next_url = links[0].url
            self.logger.debug('parse_list next_url:{}'.format(next_url))
            yield Request(next_url, callback=self.parse_list)
Ejemplo n.º 7
0
    def parse_list_decoration(self, response):
        """Parse a decoration-shop listing page into ``HomeItem`` objects.

        Every ``.shop-list-item`` directly under ``.shop-list`` is treated as
        one shop.

        Args:
            response: The downloaded listing page.

        Yields:
            One ``HomeItem`` per listed shop.
        """
        for shop in response.css('.shop-list>.shop-list-item'):
            # A fresh item per shop: reusing one instance would let optional
            # fields (design, designer) of an earlier shop leak into later
            # ones, and every yielded item would be the same mutable object.
            item = HomeItem()
            item['title'] = shop.css(
                '.shop-title>h3>a::text').extract_first().strip()
            item['url'] = 'https:' + shop.css(
                '.shop-title>h3>a::attr(href)').extract_first()
            # Prefer the lazy-load attribute and fall back to ``src``; when
            # neither exists the field is left unset instead of failing on
            # ``'http:' + None``.
            img = (shop.css(
                '.shop-images img::attr(data-src)').extract_first()
                or shop.css('.shop-images img::attr(src)').extract_first())
            if img:
                item['img'] = 'http:' + img
            item['star'] = float(
                shop.css('.item-rank-rst::attr(class)').re_first(
                    r'[1-9]\d*|0')) / 10
            item['review_num'] = shop.css(
                '.shop-info-text-i>a::text').re_first(r'[1-9]\d*|0')
            item['contract_price'] = shop.css(
                'div.row.shop-info-text-i > span:nth-child(3)::text'
            ).extract_first()
            ml26_count = len(shop.css('.ml-26'))
            if ml26_count > 1:
                types = shop.css(
                    'div.row.shop-info-text-i > span:nth-child(4) a::text'
                ).extract()
                print(
                    "types{} len(types){} len(i.css('.ml-26').extract()) : {}".
                    format(types, len(types), ml26_count))
                item['type'] = ' '.join(types)
                print("item['type'] : {}".format(item['type']))
            else:
                item['type'] = '装修设计'
            # Read the spans as a list: with a single span, separate
            # :first-child / :last-child lookups would return the same text
            # for both district and location.
            loc = shop.css('.shop-location>span::text').extract()
            item['district'] = loc[0] if loc else ''
            item['location'] = loc[1] if len(loc) == 2 else ''
            if shop.css('.shop-team'):
                item['design'] = shop.css(
                    '.shop-team i:first-child::text').extract_first()
                item['designer'] = shop.css(
                    '.shop-team i:last-child::text').extract_first()
            getlocation(item)
            # The last path segment of the shop URL is stored as its number.
            item['number'] = item['url'].split('/')[-1]
            yield item
Ejemplo n.º 8
0
    def parse_list(self, response):
        """Parse a shop listing page into ``BabyItem`` objects and paginate.

        Every ``<li>`` directly under ``.shop-list`` is treated as one shop.
        After the items, a ``Request`` for the first link matched by
        ``div.Pages > a.NextPage`` is yielded with this method as callback.

        Args:
            response: The downloaded listing page.

        Yields:
            ``BabyItem`` objects, then at most one follow-up ``Request``.
        """
        self.logger.debug('parse_list response.url:' + response.url)

        shops = response.css('.shop-list>li')
        self.logger.debug('parse_list li:{}  response.url: {}'.format(
            shops.css('.shopname::text').extract(), response.url))
        for shop in shops:
            # A fresh item per shop: reusing one instance would let optional
            # fields (review_num, product_photos) of an earlier shop leak
            # into later ones, and every yielded item would be the same
            # mutable object.
            item = BabyItem()
            item['title'] = shop.css('.shopname::text').extract_first()
            item['url'] = 'http:' + shop.css(
                '.shopname::attr(href)').extract_first()
            # Prefer the lazy-load attribute and fall back to ``src``; when
            # neither exists the field is left unset instead of failing on
            # ``'http:' + None``.
            img = (shop.css('a img::attr(data-lazyload)').extract_first()
                   or shop.css('a img::attr(src)').extract_first())
            if img:
                item['img'] = 'http:' + img
            item['star'] = float(
                shop.css('.item-rank-rst::attr(class)').re_first(
                    r'[1-9]\d*|0')) / 10
            review_num = shop.css('.comment-count a::text').re_first(
                r'[1-9]\d*|0')
            if review_num:
                item['review_num'] = int(review_num)
            item['mean_price'] = shop.css('.price::text').extract_first()
            photos = shop.css('.product-count a::text').extract_first()
            if photos:
                item['product_photos'] = photos.strip('"')
            keys = shop.css('.key-list::text').extract_first()
            # Collapse all whitespace runs in the text to single spaces.
            item['location'] = ' '.join(keys.split()) if keys else ''
            getlocation(item)
            # The last path segment of the shop URL is stored as its number.
            item['number'] = item['url'].split('/')[-1]
            yield item

        links = LinkExtractor(
            restrict_css='div.Pages > a.NextPage').extract_links(response)
        if links:
            next_url = links[0].url
            self.logger.debug('next_url:' + next_url)
            yield Request(next_url, callback=self.parse_list)
Ejemplo n.º 9
0
    def parse_list(self, response):
        """Parse a hotel listing page into ``HotelItem`` objects.

        The listing data is read from JSON embedded in the page source after
        the ``"hotelList"`` key; one item is yielded per entry of its
        ``records`` list.

        Args:
            response: The downloaded listing page.

        Yields:
            One ``HotelItem`` per record.
        """
        m = re.findall('{"hotelList":(.*),"sortInfo"', response.text)
        self.logger.debug('parse_list m: {}'.format(m))
        if not m:
            # Without the embedded payload there is nothing to extract;
            # indexing m[0] would raise IndexError.
            self.logger.warning(
                'parse_list: no hotelList payload in %s', response.url)
            return

        # The captured fragment is completed with '}' before decoding.
        result = json.loads(m[0] + '}')

        for record in result.get('records') or []:
            # A fresh item per record: reusing one instance would make every
            # yielded item the same mutable object.
            item = HotelItem()
            item['title'] = record.get('shopName')
            item['url'] = 'http://www.dianping.com' + record.get('shopUrl')
            item['is_bookable'] = record.get('isBookable')
            item['location'] = record.get('regionName')
            item['walk_distance'] = record.get('distanceText')
            item['price'] = record.get('price')
            star = record.get('star')
            # A record without a rating keeps None instead of failing on
            # ``None / 10``.
            item['star'] = star / 10 if star is not None else None
            item['review_num'] = record.get('reviewCount')
            item['number'] = record.get('id')
            item['pic_array'] = record.get('picArray')
            getlocation(item)
            yield item
Ejemplo n.º 10
0
    def parse_list(self, response):
        """Parse a shop listing page into ``HomeItem`` objects.

        Every ``.shop-list-item`` directly under ``.shop-list`` is treated as
        one shop.

        Args:
            response: The downloaded listing page.

        Yields:
            One ``HomeItem`` per listed shop.
        """
        for shop in response.css('.shop-list>.shop-list-item'):
            # A fresh item per shop: reusing one instance would let fields of
            # an earlier shop survive whenever a later shop does not set
            # them, and every yielded item would be the same mutable object.
            item = HomeItem()
            item['title'] = shop.css(
                '.shop-title>h3>a::text').extract_first().strip()
            item['url'] = 'https:' + shop.css(
                '.shop-title>h3>a::attr(href)').extract_first()
            # Prefer the lazy-load attribute and fall back to ``src``; when
            # neither exists the field is left unset instead of failing on
            # ``'http:' + None``.
            img = (shop.css(
                '.shop-images img::attr(data-src)').extract_first()
                or shop.css('.shop-images img::attr(src)').extract_first())
            if img:
                item['img'] = 'http:' + img
            item['star'] = float(
                shop.css('.item-rank-rst::attr(class)').re_first(
                    r'[1-9]\d*|0')) / 10
            review_num = shop.css('.shop-info-text-i>a::text').re_first(
                r'[1-9]\d*|0')
            # Shops without a review count leave the field unset instead of
            # failing on int(None).
            if review_num is not None:
                item['review_num'] = int(review_num)
            td = shop.css('.shop-info-text-i>span::text').extract()
            print('td : {}'.format(td))
            if len(td) == 4:
                item['type'] = ' '.join((td[0], td[1]))
                item['district'] = td[2]
                item['location'] = td[3]
            elif len(td) == 3:
                item['type'] = td[0]
                item['district'] = td[1]
                item['location'] = td[2]
            elif len(td) == 2:
                item['type'] = td[0]
                item['district'] = td[1]
                item['location'] = ''
            else:
                # Unrecognised layout: record empty values so the item still
                # carries all three fields.
                print('td in else: {}'.format(td))
                item['type'] = ''
                item['district'] = ''
                item['location'] = ''
            getlocation(item)
            # The last path segment of the shop URL is stored as its number.
            item['number'] = item['url'].split('/')[-1]
            yield item
Ejemplo n.º 11
0
    def parse_list_decoration(self, response):
        """Parse a decoration-shop listing page and paginate.

        Every ``.shop-list-item`` directly under ``.shop-list`` is treated as
        one shop and yielded as a ``HomeItem``.  Afterwards a ``Request`` for
        the first link matched by ``div.pages a.nextPage`` is yielded with
        this method as callback.

        Args:
            response: The downloaded listing page.

        Yields:
            ``HomeItem`` objects, then at most one follow-up ``Request``.
        """
        self.logger.debug('parse_list_decoration response.url:' + response.url)

        shops = response.css('.shop-list>.shop-list-item')
        self.logger.debug(
            'parse_list_decoration li:{}  response.url: {}'.format(
                shops.css('.shop-title>h3>a::text').extract(), response.url))
        for shop in shops:
            # A fresh item per shop: reusing one instance would let optional
            # fields (design, designer) of an earlier shop leak into later
            # ones, and every yielded item would be the same mutable object.
            item = HomeItem()
            item['title'] = shop.css(
                '.shop-title>h3>a::text').extract_first().strip()
            item['url'] = 'https:' + shop.css(
                '.shop-title>h3>a::attr(href)').extract_first()
            # Prefer the lazy-load attribute and fall back to ``src``; when
            # neither exists the field is left unset instead of failing on
            # ``'http:' + None``.
            img = (shop.css(
                '.shop-images img::attr(data-src)').extract_first()
                or shop.css('.shop-images img::attr(src)').extract_first())
            if img:
                item['img'] = 'http:' + img
            item['star'] = float(
                shop.css('.item-rank-rst::attr(class)').re_first(
                    r'[1-9]\d*|0')) / 10
            item['review_num'] = shop.css(
                '.shop-info-text-i>a::text').re_first(r'[1-9]\d*|0')
            item['contract_price'] = shop.css(
                'div.row.shop-info-text-i > span:nth-child(3)::text'
            ).extract_first()
            ml26_count = len(shop.css('.ml-26'))
            if ml26_count > 1:
                types = shop.css(
                    'div.row.shop-info-text-i > span:nth-child(4) a::text'
                ).extract()
                self.logger.debug(
                    "types{} len(types){} len(i.css('.ml-26').extract()) : {}".
                    format(types, len(types), ml26_count))
                item['type'] = ' '.join(types)
                self.logger.debug("item['type'] : {}".format(item['type']))
            else:
                item['type'] = '装修设计'
            loc = shop.css('.shop-location>span::text').extract()
            # Exactly two spans are read as district + location; otherwise
            # only the first (if any) is used, without raising IndexError on
            # an empty list.
            item['district'] = loc[0] if loc else ''
            item['location'] = loc[1] if len(loc) == 2 else ''
            if shop.css('.shop-team'):
                item['design'] = shop.css(
                    '.shop-team i:first-child::text').extract_first()
                item['designer'] = shop.css(
                    '.shop-team i:last-child::text').extract_first()
            getlocation(item)
            # The last path segment of the shop URL is stored as its number.
            item['number'] = item['url'].split('/')[-1]
            yield item

        links = LinkExtractor(
            restrict_css='div.pages a.nextPage').extract_links(response)
        if links:
            next_url = links[0].url
            self.logger.debug(
                'parse_list_decoration next_url:{}'.format(next_url))
            yield Request(next_url, callback=self.parse_list_decoration)