def parse_list(self, response):
    """Parse a hotel listing page and follow its "next page" link.

    Hotels are read from the JSON embedded in the page source (the value of
    the ``"hotelList"`` key, up to ``"sortInfo"``), not from the rendered
    markup. A page without usable JSON yields no hotels, but its pagination
    link is still followed.

    Args:
        response: The downloaded listing page.

    Yields:
        One ``HotelItem`` per entry of the embedded ``records`` list, then a
        ``Request`` for the next listing page (handled by this same
        callback) when a ``.page .next`` link is present.
    """
    self.logger.debug('parse_list response.url:' + response.url)

    blobs = re.findall('{"hotelList":(.*),"sortInfo"', response.text)
    self.logger.debug('parse_list m: {}'.format(blobs))

    records = []
    if blobs:
        try:
            # The capture ends right before "sortInfo", so the object's
            # closing brace has to be re-added before decoding.
            result = json.loads(blobs[0] + '}')
        except ValueError:
            self.logger.warning(
                'parse_list: invalid hotelList JSON on {}'.format(
                    response.url))
        else:
            records = result.get('records') or []
    else:
        self.logger.warning(
            'parse_list: no hotelList JSON on {}'.format(response.url))

    for record in records:
        # One item per record: a shared instance would be mutated again
        # after it has already been yielded.
        item = HotelItem()
        item['title'] = record.get('shopName')

        shop_url = record.get('shopUrl')
        item['url'] = (('http://www.dianping.com' + shop_url)
                       if shop_url is not None else None)

        item['is_bookable'] = record.get('isBookable')
        item['location'] = record.get('regionName')
        item['walk_distance'] = record.get('distanceText')
        item['price'] = record.get('price')

        # The raw value is divided by 10; a missing rating is kept as None.
        star = record.get('star')
        item['star'] = star / 10 if star is not None else None

        item['review_num'] = record.get('reviewCount')
        item['number'] = record.get('id')
        # Flatten the list's text form ("['a', 'b']") into "a, b".
        item['pic_array'] = str(record.get('picArray')).replace(
            '\'', '').strip('[').strip(']')
        getlocation(item)
        yield item

    next_links = LinkExtractor(
        restrict_css='.page .next').extract_links(response)
    if next_links:
        next_url = next_links[0].url
        self.logger.debug('parse_list next_url:{}'.format(next_url))
        yield Request(next_url, callback=self.parse_list)
def parse_list(self, response):
    """Parse a shop listing page into ``FilmItem`` objects.

    Each ``<li>`` under ``#shop-all-list>ul`` becomes one item. Pieces that
    are missing from a shop's markup are stored as ``''``/``None`` (or left
    unset for the optional fields) instead of aborting the whole page.

    Args:
        response: The downloaded listing page.

    Yields:
        One ``FilmItem`` per listed shop.
    """
    shops = response.css('#shop-all-list>ul>li')
    self.logger.debug('parse_list li:{} response.url: {}'.format(
        shops.css('.txt>.tit h4::text').extract(), response.url))

    for shop in shops:
        # One item per shop, so optional fields (branch, review_num,
        # mean_price) never carry over from the previous shop.
        item = FilmItem()
        item['title'] = shop.css(
            '.txt>.tit h4::text').extract_first(default='').strip()
        url = shop.css('.txt>.tit>a::attr(href)').extract_first()
        item['url'] = url

        if shop.css('.shop-branch::text'):
            item['branch'] = shop.css(
                '.shop-branch::attr(href)').extract_first()
        item['img'] = shop.css('img::attr(data-src)').extract_first()

        # The first integer in the class attribute is divided by 10.
        star_level = shop.css(
            '.sml-rank-stars::attr(class)').re_first(r'[1-9]\d*|0')
        item['star'] = (float(star_level) / 10
                        if star_level is not None else None)

        review_num = shop.css('.review-num>b::text').extract_first()
        if review_num is not None:
            self.logger.debug('review-num : {}'.format(review_num))
            item['review_num'] = int(review_num)

        mean_price = shop.css('.mean-price b::text').extract_first()
        if mean_price is not None:
            item['mean_price'] = mean_price.strip('¥')

        # First span is stored as the type, second as the location.
        tags = shop.css('.tag-addr span::text').extract()
        self.logger.debug('type location 1: {}'.format(tags))
        item['type'] = tags[0].strip() if tags else ''
        item['location'] = tags[1].strip() if len(tags) > 1 else ''
        item['address'] = shop.css(
            '.addr::text').extract_first(default='').strip()

        getlocation(item)
        # The shop number is the last path segment of the shop URL.
        item['number'] = url.split('/')[-1] if url is not None else None
        yield item
def parse_list(self, response):
    """Parse a shop listing page into ``TourItem`` objects and paginate.

    Each ``<li>`` under ``#shop-all-list>ul`` becomes one item. Pieces that
    are missing from a shop's markup are stored as ``''``/``None`` (or left
    unset for the optional fields) instead of aborting the page, so the
    "next page" link is still followed.

    Args:
        response: The downloaded listing page.

    Yields:
        One ``TourItem`` per listed shop, then a ``Request`` for the next
        listing page (handled by this same callback) when a
        ``div.page > a.next`` link is present.
    """
    self.logger.debug('parse_list response.url:' + response.url)
    shops = response.css('#shop-all-list>ul>li')
    self.logger.debug('parse_list li:{} response.url: {}'.format(
        shops.css('.txt>.tit h4::text').extract(), response.url))

    for shop in shops:
        # One item per shop, so optional fields (branch, review_num,
        # mean_price, scores) never carry over from the previous shop.
        item = TourItem()
        item['title'] = shop.css(
            '.txt>.tit h4::text').extract_first(default='').strip()
        url = shop.css('.txt>.tit>a::attr(href)').extract_first()
        item['url'] = url

        if shop.css('.shop-branch::text'):
            item['branch'] = shop.css(
                '.shop-branch::attr(href)').extract_first()
        item['img'] = shop.css('img::attr(data-src)').extract_first()

        # The first integer in the class attribute is divided by 10.
        star_level = shop.css(
            '.sml-rank-stars::attr(class)').re_first(r'[1-9]\d*|0')
        item['star'] = (float(star_level) / 10
                        if star_level is not None else None)

        review_num = shop.css('.review-num>b::text').extract_first()
        if review_num is not None:
            self.logger.debug('review-num : {}'.format(review_num))
            item['review_num'] = int(review_num)

        mean_price = shop.css('.mean-price b::text').extract_first()
        if mean_price is not None:
            item['mean_price'] = mean_price.strip('¥')

        # The <b> values are taken positionally; zip() simply stops early
        # when fewer than three are present.
        scores = shop.css('.comment-list b::text').extract()
        for field, value in zip(('score', 'environment', 'service'),
                                scores):
            item[field] = float(value)

        # First span is stored as the type, second as the location.
        tags = shop.css('.tag-addr span::text').extract()
        self.logger.debug('type location 1: {}'.format(tags))
        item['type'] = tags[0].strip() if tags else ''
        item['location'] = tags[1].strip() if len(tags) > 1 else ''
        item['address'] = shop.css(
            '.addr::text').extract_first(default='').strip()

        getlocation(item)
        # The shop number is the last path segment of the shop URL.
        item['number'] = url.split('/')[-1] if url is not None else None
        yield item

    next_links = LinkExtractor(
        restrict_css='div.page > a.next').extract_links(response)
    if next_links:
        next_url = next_links[0].url
        self.logger.debug('next_url:' + next_url)
        yield Request(next_url, callback=self.parse_list)
def parse_list(self, response):
    """Parse a shop listing page into ``WeddingItem`` objects.

    Each ``<li>`` under ``.shop-list`` becomes one item. A missing link,
    image or rating is stored as ``None`` instead of aborting the page.

    Args:
        response: The downloaded listing page.

    Yields:
        One ``WeddingItem`` per listed shop.
    """
    shops = response.css('.shop-list>li')
    self.logger.debug('parse_list li:{} response.url: {}'.format(
        shops.css('.shopname::text').extract(), response.url))

    for shop in shops:
        # One item per shop: a shared instance would be mutated again after
        # it has already been yielded.
        item = WeddingItem()
        item['title'] = shop.css('.shopname::text').extract_first()

        # 'http:' is prepended to the raw attribute values (presumably
        # they are scheme-relative, i.e. start with '//').
        href = shop.css('.shopname::attr(href)').extract_first()
        url = ('http:' + href) if href is not None else None
        item['url'] = url

        # Prefer the lazy-load source, fall back to the plain src.
        img = (shop.css('a img::attr(data-lazyload)').extract_first()
               or shop.css('a img::attr(src)').extract_first())
        item['img'] = ('http:' + img) if img is not None else None

        # The first integer in the class attribute is divided by 10.
        star_level = shop.css(
            '.item-rank-rst::attr(class)').re_first(r'[1-9]\d*|0')
        item['star'] = (float(star_level) / 10
                        if star_level is not None else None)

        # Kept as the matched digit string (not converted to int).
        item['review_num'] = shop.css(
            'p.remark > span:nth-child(2) > a::text'
        ).re_first(r'[1-9]\d*|0')
        item['mean_price'] = shop.css('.price::text').extract_first()
        item['product_photos'] = shop.css(
            'p.remark > span:nth-child(3) > a::text').extract_first()

        # Collapse runs of whitespace; '' when there is no area text.
        area = shop.css('.area-list::text').extract_first()
        item['location'] = ' '.join(area.split()) if area else ''

        getlocation(item)
        # The shop number is the last path segment of the shop URL.
        item['number'] = url.split('/')[-1] if url is not None else None
        yield item
def parse_list(self, response):
    """Parse a shop listing page into ``FoodItem`` objects.

    Each ``<li>`` under ``#shop-all-list>ul`` becomes one item. Pieces that
    are missing from a shop's markup are stored as ``''``/``None`` (or left
    unset for the optional fields) instead of aborting the whole page.

    Args:
        response: The downloaded listing page.

    Yields:
        One ``FoodItem`` per listed shop.
    """
    shops = response.css('#shop-all-list>ul>li')
    self.logger.debug('parse_list li:{} response.url: {}'.format(
        shops.css('.txt>.tit h4::text').extract(), response.url))

    for shop in shops:
        # One item per shop, so optional fields (branch, review_num,
        # mean_price, scores, recommend) never carry over from the
        # previous shop.
        item = FoodItem()
        item['title'] = shop.css(
            '.txt>.tit h4::text').extract_first(default='').strip()
        url = shop.css('.txt>.tit>a::attr(href)').extract_first()
        item['url'] = url

        if shop.css('.shop-branch::text'):
            item['branch'] = shop.css(
                '.shop-branch::attr(href)').extract_first()
        item['img'] = shop.css('img::attr(data-src)').extract_first()

        # The first integer in the class attribute is divided by 10.
        star_level = shop.css(
            '.sml-rank-stars::attr(class)').re_first(r'[1-9]\d*|0')
        item['star'] = (float(star_level) / 10
                        if star_level is not None else None)

        review_num = shop.css('.review-num>b::text').extract_first()
        if review_num is not None:
            self.logger.debug('review-num : {}'.format(review_num))
            item['review_num'] = int(review_num)

        mean_price = shop.css('.mean-price b::text').extract_first()
        if mean_price is not None:
            item['mean_price'] = mean_price.strip('¥')

        # The <b> values are taken positionally; zip() simply stops early
        # when fewer than three are present.
        scores = shop.css('.comment-list b::text').extract()
        for field, value in zip(('taste', 'environment', 'service'),
                                scores):
            item[field] = float(value)

        if shop.css('div.txt > div.recommend'):
            dishes = shop.css('div.txt > div.recommend a::text').extract()
            self.logger.debug(
                "i.css('div.txt > div.recommend a') : {}".format(dishes))
            item['recommend'] = ' '.join(dishes)

        # First span is stored as the type, second as the location.
        tags = shop.css('.tag-addr span::text').extract()
        self.logger.debug('type location 1: {}'.format(tags))
        item['type'] = tags[0].strip() if tags else ''
        item['location'] = tags[1].strip() if len(tags) > 1 else ''
        item['address'] = shop.css(
            '.addr::text').extract_first(default='').strip()

        getlocation(item)
        # The shop number is the last path segment of the shop URL.
        item['number'] = url.split('/')[-1] if url is not None else None
        yield item
def parse_list(self, response):
    """Parse a shop listing page into ``HomeItem`` objects and paginate.

    Each ``.shop-list-item`` under ``.shop-list`` becomes one item. A
    missing link, image, rating or review count is stored as ``None``
    instead of aborting the page, so the "next page" link is still
    followed.

    Args:
        response: The downloaded listing page.

    Yields:
        One ``HomeItem`` per listed shop, then a ``Request`` for the next
        listing page (handled by this same callback) when a
        ``div.pages a.nextPage`` link is present.
    """
    self.logger.debug('parse_list response.url:' + response.url)

    for shop in response.css('.shop-list>.shop-list-item'):
        # One item per shop: a shared instance would be mutated again after
        # it has already been yielded.
        item = HomeItem()
        item['title'] = shop.css(
            '.shop-title>h3>a::text').extract_first(default='').strip()

        # A scheme is prepended to the raw attribute values (presumably
        # they are scheme-relative, i.e. start with '//').
        href = shop.css('.shop-title>h3>a::attr(href)').extract_first()
        url = ('https:' + href) if href is not None else None
        item['url'] = url

        # Prefer the lazy-load source, fall back to the plain src.
        img = (shop.css('.shop-images img::attr(data-src)').extract_first()
               or shop.css('.shop-images img::attr(src)').extract_first())
        item['img'] = ('http:' + img) if img is not None else None

        # The first integer in the class attribute is divided by 10.
        star_level = shop.css(
            '.item-rank-rst::attr(class)').re_first(r'[1-9]\d*|0')
        item['star'] = (float(star_level) / 10
                        if star_level is not None else None)

        review_num = shop.css(
            '.user-comment>a::text').re_first(r'[1-9]\d*|0')
        item['review_num'] = (int(review_num)
                              if review_num is not None else None)

        # The info spans hold type (one or two spans), district and an
        # optional location; the span count decides the layout.
        td = shop.css('.shop-info-text-i>span::text').extract()
        self.logger.debug('td : {}'.format(td))
        if len(td) == 4:
            item['type'] = ' '.join((td[0], td[1]))
            item['district'] = td[2]
            item['location'] = td[3]
        elif len(td) == 3:
            item['type'] = td[0]
            item['district'] = td[1]
            item['location'] = td[2]
        elif len(td) == 2:
            item['type'] = td[0]
            item['district'] = td[1]
            item['location'] = ''
        else:
            # Unrecognised layout: store blanks so the item never shows
            # another shop's type/district/location.
            self.logger.warning('td in else: {}'.format(td))
            item['type'] = ''
            item['district'] = ''
            item['location'] = ''

        getlocation(item)
        # The shop number is the last path segment of the shop URL.
        item['number'] = url.split('/')[-1] if url is not None else None
        yield item

    next_links = LinkExtractor(
        restrict_css='div.pages a.nextPage').extract_links(response)
    if next_links:
        next_url = next_links[0].url
        self.logger.debug('parse_list next_url:{}'.format(next_url))
        yield Request(next_url, callback=self.parse_list)
def parse_list_decoration(self, response):
    """Parse a decoration-shop listing page into ``HomeItem`` objects.

    Each ``.shop-list-item`` under ``.shop-list`` becomes one item. A
    missing link, image or rating is stored as ``None`` instead of aborting
    the page.

    Args:
        response: The downloaded listing page.

    Yields:
        One ``HomeItem`` per listed shop.
    """
    for shop in response.css('.shop-list>.shop-list-item'):
        # One item per shop, so the optional design/designer fields never
        # carry over from the previous shop.
        item = HomeItem()
        item['title'] = shop.css(
            '.shop-title>h3>a::text').extract_first(default='').strip()

        # A scheme is prepended to the raw attribute values (presumably
        # they are scheme-relative, i.e. start with '//').
        href = shop.css('.shop-title>h3>a::attr(href)').extract_first()
        url = ('https:' + href) if href is not None else None
        item['url'] = url

        # Prefer the lazy-load source, fall back to the plain src.
        img = (shop.css('.shop-images img::attr(data-src)').extract_first()
               or shop.css('.shop-images img::attr(src)').extract_first())
        item['img'] = ('http:' + img) if img is not None else None

        # The first integer in the class attribute is divided by 10.
        star_level = shop.css(
            '.item-rank-rst::attr(class)').re_first(r'[1-9]\d*|0')
        item['star'] = (float(star_level) / 10
                        if star_level is not None else None)

        # Kept as the matched digit string (not converted to int).
        item['review_num'] = shop.css(
            '.shop-info-text-i>a::text').re_first(r'[1-9]\d*|0')
        item['contract_price'] = shop.css(
            'div.row.shop-info-text-i > span:nth-child(3)::text'
        ).extract_first()

        # More than one '.ml-26' node means the row carries explicit type
        # links; otherwise the fixed default type is used.
        ml26_count = len(shop.css('.ml-26'))
        if ml26_count > 1:
            types = shop.css(
                'div.row.shop-info-text-i > span:nth-child(4) a::text'
            ).extract()
            self.logger.debug(
                "types{} len(types){} len(i.css('.ml-26').extract()) : {}".
                format(types, len(types), ml26_count))
            item['type'] = ' '.join(types)
            self.logger.debug("item['type'] : {}".format(item['type']))
        else:
            item['type'] = '装修设计'

        item['district'] = shop.css(
            '.shop-location>span:first-child::text').extract_first()
        item['location'] = shop.css(
            '.shop-location>span:last-child::text').extract_first()

        if shop.css('.shop-team'):
            item['design'] = shop.css(
                '.shop-team i:first-child::text').extract_first()
            item['designer'] = shop.css(
                '.shop-team i:last-child::text').extract_first()

        getlocation(item)
        # The shop number is the last path segment of the shop URL.
        item['number'] = url.split('/')[-1] if url is not None else None
        yield item
def parse_list(self, response):
    """Parse a shop listing page into ``BabyItem`` objects and paginate.

    Each ``<li>`` under ``.shop-list`` becomes one item. A missing link,
    image or rating is stored as ``None`` instead of aborting the page, so
    the "next page" link is still followed.

    Args:
        response: The downloaded listing page.

    Yields:
        One ``BabyItem`` per listed shop, then a ``Request`` for the next
        listing page (handled by this same callback) when a
        ``div.Pages > a.NextPage`` link is present.
    """
    self.logger.debug('parse_list response.url:' + response.url)
    shops = response.css('.shop-list>li')
    self.logger.debug('parse_list li:{} response.url: {}'.format(
        shops.css('.shopname::text').extract(), response.url))

    for shop in shops:
        # One item per shop, so optional fields (review_num,
        # product_photos) never carry over from the previous shop.
        item = BabyItem()
        item['title'] = shop.css('.shopname::text').extract_first()

        # 'http:' is prepended to the raw attribute values (presumably
        # they are scheme-relative, i.e. start with '//').
        href = shop.css('.shopname::attr(href)').extract_first()
        url = ('http:' + href) if href is not None else None
        item['url'] = url

        # Prefer the lazy-load source, fall back to the plain src.
        img = (shop.css('a img::attr(data-lazyload)').extract_first()
               or shop.css('a img::attr(src)').extract_first())
        item['img'] = ('http:' + img) if img is not None else None

        # The first integer in the class attribute is divided by 10.
        star_level = shop.css(
            '.item-rank-rst::attr(class)').re_first(r'[1-9]\d*|0')
        item['star'] = (float(star_level) / 10
                        if star_level is not None else None)

        review_num = shop.css(
            '.comment-count a::text').re_first(r'[1-9]\d*|0')
        if review_num:
            item['review_num'] = int(review_num)

        item['mean_price'] = shop.css('.price::text').extract_first()

        photos = shop.css('.product-count a::text').extract_first()
        if photos:
            item['product_photos'] = photos.strip('"')

        # Collapse runs of whitespace; '' when there is no key-list text.
        keys = shop.css('.key-list::text').extract_first()
        item['location'] = ' '.join(keys.split()) if keys else ''

        getlocation(item)
        # The shop number is the last path segment of the shop URL.
        item['number'] = url.split('/')[-1] if url is not None else None
        yield item

    next_links = LinkExtractor(
        restrict_css='div.Pages > a.NextPage').extract_links(response)
    if next_links:
        next_url = next_links[0].url
        self.logger.debug('next_url:' + next_url)
        yield Request(next_url, callback=self.parse_list)
def parse_list(self, response):
    """Parse a hotel listing page into ``HotelItem`` objects.

    Hotels are read from the JSON embedded in the page source (the value of
    the ``"hotelList"`` key, up to ``"sortInfo"``), not from the rendered
    markup. A page without usable JSON yields nothing.

    Args:
        response: The downloaded listing page.

    Yields:
        One ``HotelItem`` per entry of the embedded ``records`` list.
    """
    blobs = re.findall('{"hotelList":(.*),"sortInfo"', response.text)
    self.logger.debug('parse_list m: {}'.format(blobs))
    if not blobs:
        self.logger.warning(
            'parse_list: no hotelList JSON on {}'.format(response.url))
        return

    try:
        # The capture ends right before "sortInfo", so the object's closing
        # brace has to be re-added before decoding.
        result = json.loads(blobs[0] + '}')
    except ValueError:
        self.logger.warning(
            'parse_list: invalid hotelList JSON on {}'.format(response.url))
        return

    for record in result.get('records') or []:
        # One item per record: a shared instance would be mutated again
        # after it has already been yielded.
        item = HotelItem()
        item['title'] = record.get('shopName')

        shop_url = record.get('shopUrl')
        item['url'] = (('http://www.dianping.com' + shop_url)
                       if shop_url is not None else None)

        item['is_bookable'] = record.get('isBookable')
        item['location'] = record.get('regionName')
        item['walk_distance'] = record.get('distanceText')
        item['price'] = record.get('price')

        # The raw value is divided by 10; a missing rating is kept as None.
        star = record.get('star')
        item['star'] = star / 10 if star is not None else None

        item['review_num'] = record.get('reviewCount')
        item['number'] = record.get('id')
        item['pic_array'] = record.get('picArray')
        getlocation(item)
        yield item
def parse_list(self, response):
    """Parse a shop listing page into ``HomeItem`` objects.

    Each ``.shop-list-item`` under ``.shop-list`` becomes one item. A
    missing link, image, rating or review count is stored as ``None``
    instead of aborting the page.

    Args:
        response: The downloaded listing page.

    Yields:
        One ``HomeItem`` per listed shop.
    """
    for shop in response.css('.shop-list>.shop-list-item'):
        # One item per shop: a shared instance would be mutated again after
        # it has already been yielded.
        item = HomeItem()
        item['title'] = shop.css(
            '.shop-title>h3>a::text').extract_first(default='').strip()

        # A scheme is prepended to the raw attribute values (presumably
        # they are scheme-relative, i.e. start with '//').
        href = shop.css('.shop-title>h3>a::attr(href)').extract_first()
        url = ('https:' + href) if href is not None else None
        item['url'] = url

        # Prefer the lazy-load source, fall back to the plain src.
        img = (shop.css('.shop-images img::attr(data-src)').extract_first()
               or shop.css('.shop-images img::attr(src)').extract_first())
        item['img'] = ('http:' + img) if img is not None else None

        # The first integer in the class attribute is divided by 10.
        star_level = shop.css(
            '.item-rank-rst::attr(class)').re_first(r'[1-9]\d*|0')
        item['star'] = (float(star_level) / 10
                        if star_level is not None else None)

        review_num = shop.css(
            '.shop-info-text-i>a::text').re_first(r'[1-9]\d*|0')
        item['review_num'] = (int(review_num)
                              if review_num is not None else None)

        # The info spans hold type (one or two spans), district and an
        # optional location; the span count decides the layout.
        td = shop.css('.shop-info-text-i>span::text').extract()
        self.logger.debug('td : {}'.format(td))
        if len(td) == 4:
            item['type'] = ' '.join((td[0], td[1]))
            item['district'] = td[2]
            item['location'] = td[3]
        elif len(td) == 3:
            item['type'] = td[0]
            item['district'] = td[1]
            item['location'] = td[2]
        elif len(td) == 2:
            item['type'] = td[0]
            item['district'] = td[1]
            item['location'] = ''
        else:
            # Unrecognised layout: store blanks so the item never shows
            # another shop's type/district/location.
            self.logger.warning('td in else: {}'.format(td))
            item['type'] = ''
            item['district'] = ''
            item['location'] = ''

        getlocation(item)
        # The shop number is the last path segment of the shop URL.
        item['number'] = url.split('/')[-1] if url is not None else None
        yield item
def parse_list_decoration(self, response):
    """Parse a decoration-shop listing page into ``HomeItem`` objects.

    Each ``.shop-list-item`` under ``.shop-list`` becomes one item. Pieces
    missing from a shop's markup are stored as ``''``/``None`` instead of
    aborting the page, so the "next page" link is still followed.

    Args:
        response: The downloaded listing page.

    Yields:
        One ``HomeItem`` per listed shop, then a ``Request`` for the next
        listing page (handled by this same callback) when a
        ``div.pages a.nextPage`` link is present.
    """
    self.logger.debug('parse_list_decoration response.url:' + response.url)
    shops = response.css('.shop-list>.shop-list-item')
    self.logger.debug(
        'parse_list_decoration li:{} response.url: {}'.format(
            shops.css('.shop-title>h3>a::text').extract(), response.url))

    for shop in shops:
        # One item per shop, so the optional design/designer fields never
        # carry over from the previous shop.
        item = HomeItem()
        item['title'] = shop.css(
            '.shop-title>h3>a::text').extract_first(default='').strip()

        # A scheme is prepended to the raw attribute values (presumably
        # they are scheme-relative, i.e. start with '//').
        href = shop.css('.shop-title>h3>a::attr(href)').extract_first()
        url = ('https:' + href) if href is not None else None
        item['url'] = url

        # Prefer the lazy-load source, fall back to the plain src.
        img = (shop.css('.shop-images img::attr(data-src)').extract_first()
               or shop.css('.shop-images img::attr(src)').extract_first())
        item['img'] = ('http:' + img) if img is not None else None

        # The first integer in the class attribute is divided by 10.
        star_level = shop.css(
            '.item-rank-rst::attr(class)').re_first(r'[1-9]\d*|0')
        item['star'] = (float(star_level) / 10
                        if star_level is not None else None)

        # Kept as the matched digit string (not converted to int).
        item['review_num'] = shop.css(
            '.shop-info-text-i>a::text').re_first(r'[1-9]\d*|0')
        item['contract_price'] = shop.css(
            'div.row.shop-info-text-i > span:nth-child(3)::text'
        ).extract_first()

        # More than one '.ml-26' node means the row carries explicit type
        # links; otherwise the fixed default type is used.
        ml26_count = len(shop.css('.ml-26'))
        if ml26_count > 1:
            types = shop.css(
                'div.row.shop-info-text-i > span:nth-child(4) a::text'
            ).extract()
            self.logger.debug(
                "types{} len(types){} len(i.css('.ml-26').extract()) : {}".
                format(types, len(types), ml26_count))
            item['type'] = ' '.join(types)
            self.logger.debug("item['type'] : {}".format(item['type']))
        else:
            item['type'] = '装修设计'

        # District is the first span; a location is only stored when there
        # are exactly two spans.
        loc = shop.css('.shop-location>span::text').extract()
        item['district'] = loc[0] if loc else ''
        item['location'] = loc[1] if len(loc) == 2 else ''

        if shop.css('.shop-team'):
            item['design'] = shop.css(
                '.shop-team i:first-child::text').extract_first()
            item['designer'] = shop.css(
                '.shop-team i:last-child::text').extract_first()

        getlocation(item)
        # The shop number is the last path segment of the shop URL.
        item['number'] = url.split('/')[-1] if url is not None else None
        yield item

    next_links = LinkExtractor(
        restrict_css='div.pages a.nextPage').extract_links(response)
    if next_links:
        next_url = next_links[0].url
        self.logger.debug(
            'parse_list_decoration next_url:{}'.format(next_url))
        yield Request(next_url, callback=self.parse_list_decoration)