def parse_list(self, response):
    """Parse a Lianjia new-home (sz.fang.lianjia.com) list page.

    Yields one LianjiaNewItem per listing card. Fields are scraped from
    the ``ul.resblock-list-wrapper`` cards; ``number`` is the project id
    taken from the detail-page URL.
    """
    self.logger.debug('parse_list response.url:' + response.url)
    cards = response.css('ul.resblock-list-wrapper > li[data-project-name]')
    for card in cards:
        # Build a fresh item per listing. Reusing one instance across
        # yields lets later iterations overwrite items still queued in
        # Scrapy's asynchronous pipeline.
        item = LianjiaNewItem()
        item['title'] = card.css('.resblock-name a::text').extract_first()
        item['url'] = ('https://sz.fang.lianjia.com' +
                       card.css('.resblock-name a::attr(href)').extract_first())
        item['type'] = card.css('.resblock-type::text').extract_first()
        item['status'] = card.css('.sale-status::text').extract_first()
        item['total_price'] = card.css('.second::text').extract_first()
        item['unit_price'] = card.css('.number::text').extract_first()
        # Lazy-loaded image: real URL lives in data-original, not src.
        item['img'] = card.css('.lj-lazy::attr(data-original)').extract_first()
        item['area'] = card.css('.resblock-area span::text').extract_first()
        item['location'] = card.css(
            'div.resblock-location > span:nth-child(1)::text').extract_first()
        item['community'] = card.css(
            'div.resblock-location > span:nth-child(3)::text').extract_first()
        item['address'] = card.css(
            'div.resblock-location > a::text').extract_first()
        getlocation(item)
        # URL shape: .../loupan/<slug>_<id>/ -> id is after the last '_'.
        item['number'] = item['url'].split('/')[-2].split('_')[-1]
        yield item
def parse_list(self, response):
    """Parse a 58.com (sz.58.com/shangpu) shop-rental list page.

    Yields one TongchengShoprentalItem per listing and follows the
    "next page" link extracted from ``div.pager > a.next``.
    """
    self.logger.debug('parse_list response.url:' + response.url)
    rows = response.css('.house-list-wrap>li')
    for row in rows:
        # One fresh item per row: a single shared instance would be
        # mutated by later iterations while earlier yields are still
        # travelling through the pipeline.
        item = TongchengShoprentalItem()
        item['title'] = row.css('.title_des::text').extract_first().strip()
        # The listing id is the 4th '_'-separated field of the logr attr.
        item['number'] = row.xpath('@logr').extract_first().split('_')[3]
        item['url'] = 'http://sz.58.com/shangpu/' + item['number'] + 'x.shtml'
        item['month_price'] = row.css('p.sum > b::text').extract_first()
        item['day_price'] = row.css('.unit span::text').extract_first()
        item['img'] = row.css('img::attr(data-src)').extract_first()
        # Second info line holds either [area, type, status] or
        # [area, status] depending on the listing.
        con = row.css('div.list-info > p:nth-child(2) > span::text').extract()
        if len(con) == 3:
            item['area'] = con[0].strip()
            item['type'] = con[1].strip()
            item['status'] = con[2].strip()
        elif len(con) == 2:
            item['area'] = con[0].strip()
            item['status'] = con[1].strip()
        loc = row.css(
            'div.list-info > p:nth-child(3) > span:nth-child(1)::text'
        ).extract_first().split('-')
        item['district'] = loc[0].strip()
        if len(loc) > 1:
            item['location'] = loc[1].strip()
        else:
            # Fix: the original cleared 'district' here (which was just
            # assigned above) and left 'location' unset; the missing
            # piece is the location, so blank that instead.
            item['location'] = ''
        if row.css('div.list-info > p:nth-child(3) > span:nth-child(2)::text'):
            item['address'] = row.css(
                'div.list-info > p:nth-child(3) > span:nth-child(2)::text'
            ).extract_first().replace('-', '')
        else:
            item['address'] = ''
        item['tags'] = ' '.join(
            row.css('div.list-info > p.tag-wrap > span::text').extract())
        getlocation(item)
        yield item
    # Pagination: follow the "next" anchor if present.
    le = LinkExtractor(restrict_css='div.pager > a.next')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        self.logger.debug('next_url:' + next_url)
        yield Request(next_url, callback=self.parse_list)
def parse_list(self, response):
    """Parse an Anjuke shop-rental list page.

    Yields one AnjukeShoprentalItem per ``#list-content`` entry and
    follows the ``a.aNxt`` pagination link.
    """
    self.logger.debug('parse_list response.url:' + response.url)
    entries = response.css('#list-content>.list-item')
    for entry in entries:
        # Fresh item per entry; reusing one instance lets conditional
        # fields (district/location/address) leak between listings and
        # corrupts items already yielded to the async pipeline.
        item = AnjukeShoprentalItem()
        item['title'] = entry.css('.item-title::text').extract_first().strip()
        item['url'] = entry.xpath('@link').extract_first()
        # Price is split across two nodes: the number (em) and the unit.
        item['price'] = (entry.css('em::text').extract_first() +
                         entry.css('.price-a::text').extract()[1].strip())
        item['img'] = entry.css('img::attr(src)').extract_first()
        item['area'] = int(
            entry.css('dl > dd:nth-child(2) > span:nth-child(1)::text')
            .re_first(r'[1-9]\d*|0'))
        item['floor'] = entry.css(
            'dl > dd:nth-child(2) > span:nth-child(3)::text').extract_first()
        item['type'] = entry.css(
            'dl > dd:nth-child(2) > span:nth-child(5)::text').extract_first()
        item['community'] = entry.css('dd.address > a::text').extract_first()
        if entry.css('dd.address > span::text').extract_first():
            # Format looks like "[district-location] address".
            comm_address = entry.css(
                'dd.address > span::text').extract_first().split()
            self.logger.debug('parse_list comm_address:' + str(comm_address))
            total_adress = comm_address[0].strip('[').split('-')
            item['district'] = total_adress[0]
            if len(total_adress) > 1:
                item['location'] = total_adress[1]
            else:
                item['location'] = ''
            if len(comm_address) > 1:
                item['address'] = comm_address[1].strip(']')
            else:
                item['address'] = ''
        else:
            item['district'] = ''
            item['location'] = ''
            item['address'] = ''
        getlocation(item)
        # Listing id is the second-to-last path segment of the URL.
        item['number'] = item['url'].split('?', 1)[0].split('/')[-2]
        yield item
    # Pagination.
    le = LinkExtractor(restrict_css='div.multi-page > a.aNxt')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        self.logger.debug('next_url:' + next_url)
        yield Request(next_url, callback=self.parse_list)
def parse_list(self, response):
    """Parse a Qfang second-hand list page (shenzhen.qfang.com).

    Yields one QfangTwoItem per ``#cycleListings`` row and follows the
    ``.turnpage_next`` pagination link.
    """
    self.logger.debug('parse_list response.url:' + response.url)
    rows = response.css('#cycleListings>ul>li')
    for row in rows:
        # New item per row — sharing one instance across yields corrupts
        # items still queued in the pipeline.
        item = QfangTwoItem()
        item['title'] = row.css('.house-title a::text').extract_first().strip()
        item['url'] = ('https://shenzhen.qfang.com' +
                       row.css('.house-title a::attr(href)').extract_first())
        item['total_price'] = int(row.css('.sale-price::text').extract_first())
        item['unit_price'] = int(
            row.css('.show-price p::text').re_first(r'[1-9]\d*'))
        item['img'] = row.css('img::attr(data-original)').extract_first().strip()
        item['layout'] = row.css(
            'p.house-about.clearfix > span:nth-child(2)::text').extract_first()
        # Area may be a decimal ("85.5") or a plain integer.
        item['area'] = float(
            row.css('p.house-about > span:nth-child(4)::text').re_first(
                r'[1-9]\d*\.\d*|0\.\d*[1-9]\d*|[1-9]\d*|0'))
        item['decoration'] = row.css(
            'p.house-about.clearfix > span:nth-child(6)::text').extract_first()
        item['floor'] = row.css(
            'p.house-about.clearfix > span:nth-child(8)::text'
        ).extract_first().strip()
        item['orientation'] = row.css(
            'p.house-about.clearfix > span:nth-child(10)::text'
        ).extract_first().strip()
        item['build_year'] = int(
            row.css('p.house-about.clearfix > span:nth-child(12)::text')
            .re_first(r'[1-9]\d*'))
        item['district'] = row.css(
            'span.whole-line > a:nth-child(1)::text').extract_first()
        item['location'] = row.css(
            'span.whole-line > a:nth-child(2)::text').extract_first()
        item['community'] = row.css(
            'span.whole-line > a:nth-child(3)::text').extract_first()
        getlocation(item)
        item['number'] = item['url'].split('/')[-1].split('?')[0]
        yield item
    # Pagination.
    le = LinkExtractor(restrict_css='.turnpage_next')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        self.logger.debug('next_url:' + next_url)
        yield Request(next_url, callback=self.parse_list)
def parse_list(self, response):
    """Parse an Anjuke second-hand list page.

    Yields one AnjukeTwoItem per ``.houselist-mod-new`` row and follows
    the ``a.aNxt`` pagination link.
    """
    self.logger.debug('parse_list response.url:' + response.url)
    rows = response.css('.houselist-mod-new>li')
    for row in rows:
        # Fresh item per row. With the old shared instance, rows lacking
        # '.comm-address' silently inherited the previous row's
        # community/district/location/address values.
        item = AnjukeTwoItem()
        item['title'] = row.css('.house-title a::text').extract_first().strip()
        item['url'] = row.css(
            '.house-title a::attr(href)').extract_first().split('?', 1)[0]
        item['total_price'] = float(row.css('strong::text').extract_first())
        item['unit_price'] = int(
            row.css('.unit-price::text').re_first(r'[1-9]\d*|0'))
        item['img'] = row.css('img::attr(src)').extract_first()
        item['layout'] = row.css(
            'div.house-details > div:nth-child(2) > span:nth-child(1)::text'
        ).extract_first()
        item['area'] = row.css(
            'div.house-details > div:nth-child(2) > span:nth-child(3)::text'
        ).extract_first()
        item['floor'] = row.css(
            'div.house-details > div:nth-child(2) > span:nth-child(5)::text'
        ).extract_first()
        item['build_year'] = row.css(
            'div.house-details > div:nth-child(2) > span:nth-child(7)::text'
        ).extract_first()
        if row.css('.comm-address::text').extract_first():
            # Format: "<community> <district>-<location>-<address>".
            comm_address = row.css(
                '.comm-address::text').extract_first().strip().split()
            self.logger.debug('comm_address :' + str(comm_address))
            item['community'] = comm_address[0]
            total_adress = comm_address[1].split('-')
            item['district'] = total_adress[0]
            item['location'] = total_adress[1]
            item['address'] = total_adress[2]
        else:
            # Explicit blanks so missing rows don't yield partial items.
            item['community'] = ''
            item['district'] = ''
            item['location'] = ''
            item['address'] = ''
        getlocation(item)
        item['number'] = item['url'].split('/')[-1].split('?')[0]
        yield item
    # Pagination.
    le = LinkExtractor(restrict_css='div.multi-page > a.aNxt')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        self.logger.debug('next_url:' + next_url)
        yield Request(next_url, callback=self.parse_list)
def parse_list(self, response):
    """Parse a Fang.com (esf.sz.fang.com) second-hand list page.

    Yields one FangtianxiaTwoItem per ``dl[id]`` row; the next page is
    located via a regex over the raw HTML because the pager link has no
    stable CSS hook.
    """
    self.logger.debug('parse_list response.url:' + response.url)
    rows = response.css('.shop_list.shop_list_4>dl[id]')
    for row in rows:
        # Fresh item per row to avoid mutating already-yielded items.
        item = FangtianxiaTwoItem()
        item['title'] = row.css('.tit_shop::text').extract_first().strip()
        item['url'] = ('http://esf.sz.fang.com' +
                       row.css('h4 a::attr(href)').extract_first())
        item['total_price'] = float(row.css('.red>b::text').extract_first())
        item['unit_price'] = int(
            row.css('.price_right > span:nth-child(2)::text').re_first(
                r'[1-9]\d*|0'))
        # Lazy-loaded thumbnails keep the real URL in src2.
        if row.css('.floatl img[src2]'):
            item['img'] = row.css('.floatl img::attr(src2)').extract_first()
        else:
            item['img'] = row.css('.floatl img::attr(src)').extract_first()
        # Description cells: layout | area | floor [| orientation] | year | ...
        desc = row.css('p.tel_shop::text').extract()
        item['layout'] = desc[0].strip()
        item['area'] = re.search(r'[1-9]\d*', desc[1].strip())[0]
        item['floor'] = desc[2].strip()
        if len(desc) == 6:
            item['orientation'] = desc[3].strip()
            item['build_year'] = desc[4].strip()
        elif len(desc) > 3:
            item['orientation'] = ''
            item['build_year'] = desc[3].strip()
        item['community'] = row.css('.add_shop a::text').extract_first().strip()
        addr = row.css('.add_shop span::text').extract_first().split('-')
        self.logger.debug('addr :' + str(addr))
        item['location'] = addr[0].strip()
        item['address'] = addr[1].strip()
        item['distance'] = row.css('.bg_none.icon_dt::text').extract_first()
        getlocation(item)
        item['number'] = item['url'].split('/')[-1].split('.')[0]
        yield item
    # Pager anchor text is the Chinese for "next page".
    next_page = Selector(response).re(r'<a href="(\S*)">下一页</a>')
    if next_page:
        next_url = 'http://sz.esf.fang.com' + next_page[0]
        self.logger.debug('next_url:' + next_url)
        yield Request(url=next_url, callback=self.parse_list)
def parse_list(self, response):
    """Parse a Fang.com (sz.shop.fang.com) shop-sale list page.

    Yields one FangtianxiaShopsaleItem per ``dl[id]`` row and follows
    the ``#PageControl1_hlk_next`` pagination link.
    """
    self.logger.debug('parse_list response.url:' + response.url)
    rows = response.css('.shop_list>dl[id]')
    for row in rows:
        # New item per row; a shared instance would be clobbered while
        # earlier yields are still in the pipeline.
        item = FangtianxiaShopsaleItem()
        item['title'] = row.css('.tit_shop::text').extract_first().strip()
        item['url'] = ('http://sz.shop.fang.com' +
                       row.css('h4 a::attr(href)').extract_first())
        item['total_price'] = int(row.css('.red>b::text').extract_first())
        item['unit_price'] = float(
            row.css('dd.price_right > span:nth-child(2) > i::text')
            .extract_first())
        # Lazy-loaded thumbnails keep the real URL in src2.
        if row.css('.floatl img[src2]'):
            item['img'] = row.css('.floatl img::attr(src2)').extract_first()
        else:
            item['img'] = row.css('.floatl img::attr(src)').extract_first()
        item['area'] = int(row.css('span.color3 > b::text').extract_first())
        if row.css('.add_shop a::text').extract_first():
            item['community'] = row.css(
                '.add_shop a::text').extract_first().strip()
        else:
            item['community'] = row.css(
                '.add_shop::text').extract_first().strip()
        # Strip the trailing "商铺" ("shop") suffix from community names.
        if '商铺' in item['community']:
            item['community'] = item['community'].replace('商铺', '')
        item['address'] = row.css(
            'p.add_shop > span::text').extract_first().strip()
        # desc lines look like "类型:xxx" / "楼层:yyy" — keep the value part.
        desc = row.css('.tel_shop::text').extract()
        item['type'] = desc[0].split(':')[1].strip()
        item['floor'] = desc[1].split(':')[1].strip()
        getlocation(item)
        item['number'] = item['url'].split('/')[-1].split('.')[0]
        yield item
    # Pagination.
    le = LinkExtractor(restrict_css='#PageControl1_hlk_next')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        self.logger.debug('next_url:' + next_url)
        yield Request(next_url, callback=self.parse_list)
def parse_list(self, response):
    """Parse an Anjuke new-home list page.

    Yields one AnjukeNewItem per ``.key-list .item-mod`` card and
    follows the ``a.next-page.next-link`` pagination link.
    """
    self.logger.debug('parse_list response.url:' + response.url)
    cards = response.css('.key-list .item-mod')
    for card in cards:
        # Fresh item per card. With the old shared instance, the
        # conditionally-set fields (no_price/price/comment) leaked from
        # one card into the next.
        item = AnjukeNewItem()
        item['title'] = card.css('.items-name::text').extract_first()
        item['url'] = card.css('.lp-name::attr(href)').extract_first()
        if card.css('.price-txt::text'):
            item['no_price'] = card.css('.price-txt::text').extract_first()
        # Price text is split around a <span> holding the number.
        if card.css('.price::text'):
            p = card.css('.price::text').extract()
            q = card.css('.price > span::text').extract()
            item['price'] = p[0].strip() + q[0] + p[1].strip()
        if card.css('.around-price::text'):
            p = card.css('.around-price::text').extract()
            q = card.css('.around-price > span::text').extract()
            item['price'] = p[0].strip() + q[0] + p[1].strip()
        item['phone'] = card.css('p.tel::text').extract_first()
        if card.css('.list-dp::text'):
            item['comment'] = int(
                card.css('.list-dp::text').re_first(r'[1-9]\d*|0'))
        item['img'] = card.css('img::attr(src)').extract_first()
        # Last huxing span is the area; the rest describe the layouts.
        item['layout'] = '/'.join(
            card.css('a.huxing > span::text').extract()[0:-1])
        item['area'] = card.css('a.huxing > span::text').extract()[-1]
        comm_address = card.css(
            '.list-map::text').extract_first().strip().split()
        item['district'] = comm_address[1]
        item['location'] = comm_address[2]
        item['address'] = comm_address[-1]
        item['status'] = card.css('i.status-icon.forsale::text').extract_first()
        item['type'] = card.css('i.status-icon.wuyetp::text').extract_first()
        getlocation(item)
        item['number'] = item['url'].split('/')[-1].split('.')[0]
        yield item
    # Pagination.
    le = LinkExtractor(restrict_css='a.next-page.next-link')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        self.logger.debug('next_url:' + next_url)
        yield Request(next_url, callback=self.parse_list)
def parse_list(self, response):
    """Parse a Lianjia second-hand list page.

    Yields one LianjiaTwoItem per ``.sellListContent`` row. No
    pagination here — Lianjia's pager is handled elsewhere.
    """
    self.logger.debug('parse_list response.url:' + response.url)
    rows = response.css('.sellListContent>.clear.LOGCLICKDATA')
    for row in rows:
        # One item per row; sharing an instance across yields lets later
        # rows overwrite items still queued in the pipeline.
        item = LianjiaTwoItem()
        item['title'] = row.css('.title a::text').extract_first().strip()
        item['url'] = row.css('.title a::attr(href)').extract_first()
        item['total_price'] = float(
            row.css('.totalPrice span::text').extract_first())
        item['unit_price'] = int(
            row.css('.unitPrice span::text').re_first(r'[1-9]\d*|0'))
        item['img'] = row.css('.lj-lazy::attr(data-original)').extract_first()
        item['community'] = row.css('.address a::text').extract_first()
        # houseInfo: community | layout | area | orientation | decoration
        # [| elevator] — the elevator cell is sometimes absent.
        desc = row.css('.houseInfo::text').extract_first().split('|')
        if len(desc) == 6:
            item['layout'] = desc[1].strip()
            item['area'] = re.findall(r'[1-9]\d*|0', desc[2].strip())[0]
            item['orientation'] = desc[3].strip()
            item['decoration'] = desc[4].strip()
            item['elevator'] = desc[5].strip()
        elif len(desc) == 5:
            item['layout'] = desc[1].strip()
            item['area'] = re.findall(r'[1-9]\d*|0', desc[2].strip())[0]
            item['orientation'] = desc[3].strip()
            item['decoration'] = desc[4].strip()
            item['elevator'] = ''
        item['floor'] = row.css(
            '.positionInfo::text').extract_first().split('-')[0].strip()
        item['location'] = row.css('.positionInfo a::text').extract_first()
        # followInfo: focus count / watch count / publish date.
        num = row.css('.followInfo::text').extract_first().split('/')
        self.logger.debug('num:{}'.format(num))
        if num:
            item['focus_num'] = num[0].strip()
            item['watch_num'] = num[1].strip()
            item['pubdate'] = num[2].strip()
        getlocation(item)
        item['number'] = item['url'].split('/')[-1].split('.')[0]
        yield item
def parse_list(self, response):
    """Parse a 58.com second-hand list page.

    Yields one TongchengTwoItem per ``.house-list-wrap`` row and follows
    the ``div.pager > a.next`` pagination link.
    """
    self.logger.debug('parse_list response.url:' + response.url)
    rows = response.css('.house-list-wrap>li')
    for row in rows:
        # Fresh item per row. With a shared instance, the conditional
        # 'location' assignment leaked stale values between rows and
        # already-yielded items could be mutated.
        item = TongchengTwoItem()
        item['title'] = row.css('.title a::text').extract_first().strip()
        item['url'] = row.css('.title a::attr(href)').extract_first()
        item['total_price'] = float(row.css('.sum b::text').extract_first())
        item['unit_price'] = int(row.css('.unit::text').re_first(r'[1-9]\d*|0'))
        item['time'] = row.css('.time::text').extract_first()
        item['img'] = row.css('img::attr(data-src)').extract_first()
        item['layout'] = row.css(
            'div.list-info > p:nth-child(2) > span:nth-child(1)::text'
        ).extract_first().strip()
        area_text = row.css(
            'div.list-info > p:nth-child(2) > span:nth-child(2)::text')
        self.logger.debug('parse_list area:' + area_text.extract_first())
        # Area may be decimal or integer.
        item['area'] = float(area_text.re_first(
            r'[1-9]\d*\.\d*|0\.\d*[1-9]\d*|[1-9]\d*|0'))
        item['orientation'] = row.css(
            'div.list-info > p:nth-child(2) > span:nth-child(3)::text'
        ).extract_first()
        item['floor'] = row.css(
            'div.list-info > p:nth-child(2) > span:nth-child(4)::text'
        ).extract_first()
        item['community'] = row.css(
            'div.list-info > p:nth-child(3) > span:nth-child(1) > '
            'a:nth-child(1)::text').extract_first()
        item['district'] = row.css(
            'div.list-info > p:nth-child(3) > span:nth-child(1) > '
            'a:nth-child(2)::text').extract_first()
        loc_sel = row.css(
            'div.list-info > p:nth-child(3) > span:nth-child(1) > '
            'a:nth-child(3)::text')
        if loc_sel:
            item['location'] = loc_sel.extract_first()
        else:
            item['location'] = ''
        getlocation(item)
        item['number'] = item['url'].split('/')[-1].split('.')[0]
        yield item
    # Pagination.
    le = LinkExtractor(restrict_css='div.pager > a.next')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        self.logger.debug('next_url:' + next_url)
        yield Request(next_url, callback=self.parse_list)
def parse_list(self, response):
    """Parse a Qfang new-home list page (shenzhen.qfang.com).

    Yields one QfangNewItem per ``#newhouse-list`` row and follows the
    ``.turnpage_next`` pagination link.
    """
    self.logger.debug('parse_list response.url:' + response.url)
    rows = response.css('#newhouse-list > .clearfix')
    for row in rows:
        # Fresh item per row. With the old shared instance the
        # conditionally-set fields (total_price, type) carried stale
        # values from the previous listing.
        item = QfangNewItem()
        item['title'] = row.css('.house-title a::text').extract_first().strip()
        item['alias'] = row.css(
            'div.house-title.clearfix > span::text').extract_first()
        item['url'] = ('https://shenzhen.qfang.com' +
                       row.css('.house-title a::attr(href)').extract_first())
        item['status'] = row.css('.state-label::text').extract_first()
        item['unit_price'] = row.css('.sale-price::text').extract_first()
        if row.css('.show-price p::text'):
            item['total_price'] = row.css('.show-price p::text').extract_first()
        item['img'] = row.css('img::attr(src)').extract_first().strip()
        # natures spans: "district location" [, type], decoration.
        desc = row.css('div.natures > span::text').extract()
        if len(desc) == 3:
            item['district'] = desc[0].split()[0]
            item['location'] = desc[0].split()[1]
            item['type'] = ' '.join(desc[1].strip().split())
            item['decoration'] = desc[2].strip()
        elif len(desc) == 2:
            item['district'] = desc[0].split()[0]
            item['location'] = desc[0].split()[1]
            item['decoration'] = desc[1].strip()
        item['layout'] = ' '.join(
            row.css('div.new-house-dsp > p:nth-child(1) > span::text').extract())
        item['area'] = row.css(
            'div.new-house-dsp > p:nth-child(2) > span::text').extract_first()
        item['time'] = row.css(
            'div.new-house-dsp > p:nth-child(3) > span::text'
        ).extract_first().strip()
        item['address'] = row.css(
            'div.new-house-dsp > p:nth-child(4) > span::text'
        ).extract_first().strip()
        # Phone numbers may be split around an <em> ("400-xxx <em>转</em> yyy").
        if row.css('p.new-house-phone > em'):
            phone_list = row.css('p.new-house-phone::text').extract()
            phone_text = row.css('p.new-house-phone > em::text').extract_first()
            item['phone'] = (phone_list[0].strip() + phone_text +
                             phone_list[1].strip())
        else:
            item['phone'] = row.css('p.new-house-phone::text').extract_first()
        getlocation(item)
        item['number'] = item['url'].split('/')[-1].split('?')[0]
        yield item
    # Pagination.
    le = LinkExtractor(restrict_css='.turnpage_next')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        self.logger.debug('next_url:' + next_url)
        yield Request(next_url, callback=self.parse_list)