def _get_shop_info(self, loader, response):
    """Fill shop ("market") fields on *loader* from a JD product page.

    Reads the shop id / vendor id out of inline page JavaScript, resolves
    the shop URL from the contact box, and — for non-mall shops — derives
    the shop code from the URL's subdomain.
    """
    # shop info
    loader.add_value('market_id', response.text, re=r"shopId\s*:\s*'(\d+)',")
    loader.add_value('seller_id', response.text, re=r"venderId\s*:\s*(\d+),")
    loader.add_value('market_type', 'shop')
    loader.add_value(
        'market_url',
        urljoin(
            response.meta['original_url'],
            str(
                loader.get_css(
                    'div.contact div.J-hove-wrap div.item div.name a::attr(href)',
                    TakeFirst()))))
    # mall.jd.com shops carry no per-shop subdomain; only extract
    # 'market_code' (the <code>.jd.com subdomain) for other shops.
    # NOTE(review): get_collected_values(...)[0] raises IndexError if the
    # market_url CSS above matched nothing — confirm selector coverage.
    if not re.search(r'^https?://mall.jd.com',
                     loader.get_collected_values('market_url')[0]):
        loader.add_value('market_code',
                         loader.get_collected_values('market_url'),
                         re=r'https?://(\w+)\.jd\.com')
    loader.add_css(
        'market_name',
        'div.contact div.J-hove-wrap div.item div.name a::attr(title)')
    loader.add_value('market_status', '1')
def parse(response, rules, spider):
    """Apply each extraction rule to *response* and enqueue unseen URLs.

    Each rule is a dict with 'method' (selector method name, e.g. xpath),
    'rule' (the selector expression) and 'extract' (extraction method name).
    New URLs go to the FRONT of the spider queue.
    """
    # breadth-first or depth-first
    sel = Selector(response=response)
    base_url = response.url
    for rule in rules:
        extracted = getattr(
            getattr(sel, rule['method'])(rule['rule']), rule['extract'])()
        for link in extracted:
            # de-dup on the RAW extracted href; the joined form is what
            # gets recorded in url_set afterwards
            if link in spider.url_set:
                continue
            info = {
                'url': urljoin(base_url, link),
                'method': 'get',
                'cookie': '',
                'session': ''
            }
            if spider.orm:
                # persist the URL and remember its DB id
                info['id'] = UrlMiddleWare.save(pid=spider.project_id,
                                                url=info['url'])
            spider.urls_queue.insert(0, info)
            spider.url_set.add(info['url'])
def _get_img_info(self, loader, response):
    """Collect small/middle/big image URLs, sizes and purposes from a vvic
    product thumbnail list into *loader*.

    For each thumbnail exactly three entries are appended to every img_*
    field, in small/middle/big order, so the parallel lists stay aligned.
    """
    # image info
    for s in response.css('ul#thumblist > li'):
        small_url = urljoin(
            response.meta['original_url'],
            s.css('div > a > img::attr(src)').extract_first())
        mid_url = urljoin(
            response.meta['original_url'],
            s.css('div > a > img::attr(mid)').extract_first())
        big_url = urljoin(
            response.meta['original_url'],
            s.css('div > a > img::attr(big)').extract_first())
        loader.add_value('img_url', small_url)
        loader.add_value('img_url', mid_url)
        loader.add_value('img_url', big_url)
        # for images pipeline
        # loader.add_value('image_urls', big_url)
        # Size appears in two URL styles, e.g.:
        # https://img.alicdn.com/bao/uploaded/i4/3164746790/TB2faCjrrGYBuNjy0FoXXciBFXa_!!3164746790.jpg_60x60.jpg
        # https://img1.vvic.com/upload/1522943484301_494356.jpg?x-oss-process=image/resize,mfit,h_500,w_500
        loader.add_value('img_size', small_url, re=r'_(\d+[xX]\d+)\.')
        loader.add_value('img_size', mid_url, re=r'_(\d+[xX]\d+)\.')
        # fallback: oss-style "h_<H>,w_<W>" query parameters
        small_url_match = re.findall(r'h_(\d+),w_(\d+)', small_url)
        if small_url_match:
            loader.add_value('img_size', 'x'.join(small_url_match[0]))
        mid_url_match = re.findall(r'h_(\d+),w_(\d+)', mid_url)
        if mid_url_match:
            loader.add_value('img_size', 'x'.join(mid_url_match[0]))
        # big image has no size marker — pad with an empty entry
        loader.add_value('img_size', '')
        # loader.add_value('status', '0')
        # loader.add_value('status', '0')
        # loader.add_value('status', '0')
        # mark whichever variant equals the product index image
        if small_url in loader.get_output_value('product_img_index_url'):
            loader.add_value('img_purpose', 'index')
        else:
            loader.add_value('img_purpose', '')
        if mid_url in loader.get_output_value('product_img_index_url'):
            loader.add_value('img_purpose', 'index')
        else:
            loader.add_value('img_purpose', '')
        if big_url in loader.get_output_value('product_img_index_url'):
            loader.add_value('img_purpose', 'index')
        else:
            loader.add_value('img_purpose', '')
        loader.add_value('img_description', 'small')
        loader.add_value('img_description', 'middle')
        loader.add_value('img_description', 'big')
def parse(self, response):
    """Follow every district link on the city page into ``self.town``."""
    names = response.xpath(
        '//div[@class="qxName"]/a[not(@class="org bold") and not(contains(text(),"周边"))]/text()'
    ).extract()
    links = response.xpath(
        '//div[@class="qxName"]/a[not(@class="org bold") and not(contains(text(),"周边"))]/@href'
    ).extract()
    # zip keeps names/links paired (and truncates to the shorter list)
    for _district, link in zip(names, links):
        print("开始区=========")
        print(link)
        yield scrapy.Request(url=response.urljoin(link), callback=self.town)
        # polite random delay between requests (blocks the reactor)
        time.sleep(random.randint(1, 3))
def parse_item(self, response):
    """Build a LinkGraphItem linking this page to its valid outgoing URLs.

    URLs are normalized by resolving against the page, stripping the
    fragment, URL-decoding and removing any trailing slash.
    """
    item = LinkGraphItem()
    item['predecessor_url'] = unquote_plus(
        urldefrag(response.url).url.rstrip('/'))
    successors = []
    for anchor in response.css('a::attr(href)'):
        candidate = urljoin(response.url, anchor.get())
        if self.__is_valid_page_path(candidate):
            successors.append(
                unquote_plus(urldefrag(candidate).url.rstrip('/')))
    item['successor_urls'] = successors
    return item
def _parse_market_from_shop_table(self, response, selector_list, market_id):
    """Build a MarketPage item of type 'shop' for every entry in
    *selector_list* (one <a> per shop, data-* attributes carry the fields).

    :param response: the page response (parent market page)
    :param selector_list: selectors, each wrapping one shop's <a> element
    :param market_id: id of the parent (floor) market
    :returns: the loaded MarketPage item (one loader accumulates all shops)
    """
    original_url = response.meta['original_url']
    loader = ItemLoader(item=MarketPage(), response=response)
    loader.add_value('platform_code', response.meta['platform_code'])
    for s in selector_list:
        loader.add_value('market_id', s.xpath('a/@data-id').extract_first())
        loader.add_value('parent_market_id', market_id)
        loader.add_value('market_type', 'shop')
        loader.add_value('market_status', '1')
        loader.add_value(
            'market_url',
            urljoin(original_url, s.xpath('a/@href').extract_first()))
        loader.add_value('market_name',
                         s.xpath('a/@data-title').extract_first())
        # contact fields are comma/space separated lists in the attribute
        loader.add_value(
            'market_contact_wangwang',
            self._convert_str_to_list(
                s.xpath('a/@data-ww').extract_first()))
        loader.add_value(
            'market_contact_qq',
            self._convert_str_to_list(
                s.xpath('a/@data-qq').extract_first()))
        loader.add_value(
            'market_contact_phone',
            self._convert_str_to_list(
                s.xpath('a/@data-tel').extract_first()))
        loader.add_value(
            'market_contact_weixin',
            self._convert_str_to_list(
                s.xpath('a/@data-wachat').extract_first()))
        # NOTE(review): every extract_first() below is assumed non-None;
        # a missing data-* attribute would raise TypeError on '+'.
        loader.add_value('market_addr',
                         (s.xpath('a/@data-source').extract_first() + ' ' +
                          s.xpath('a/@data-market').extract_first() + ' ' +
                          s.xpath('a/@data-floor').extract_first() + '楼 ' +
                          s.xpath('a/@data-position').extract_first()))
        # normalize 'YYYY-MM-DD' opening date to full timestamp
        loader.add_value(
            'market_start_time',
            datetime.datetime.strftime(
                datetime.datetime.strptime(
                    s.xpath('a/@data-sbd').extract_first(), '%Y-%m-%d'),
                '%Y-%m-%d %H:%M:%S'))
        loader.add_value('market_exist_time',
                         s.xpath('a/@data-years').extract_first())
        loader.add_value(
            'created_time',
            datetime.datetime.strftime(datetime.datetime.today(),
                                       '%Y-%m-%d %H:%M:%S'))
    return loader.load_item()
def town(self, response):
    """Follow every sub-district ("town") link into ``self.town_data``."""
    labels = response.xpath(
        '//p[@id="shangQuancontain"]/a[not(@class="org bold")]/text()'
    ).extract()
    hrefs = response.xpath(
        '//p[@id="shangQuancontain"]/a[not(@class="org bold")]/@href'
    ).extract()
    for _label, href in zip(labels, hrefs):
        print("开始镇=========")
        print(href)
        yield scrapy.Request(url=response.urljoin(href),
                             callback=self.town_data)
        # polite random delay between requests (blocks the reactor)
        time.sleep(random.randint(1, 3))
def town(self, response):
    """Follow every plate ("town") filter link into ``self.town_data``."""
    labels = response.xpath(
        '//div[@class="option-list sub-option-list gio_plate"]/a[@class!="on"]/text()'
    ).extract()
    hrefs = response.xpath(
        '//div[@class="option-list sub-option-list gio_plate"]/a[@class!="on"]/@href'
    ).extract()
    for _label, href in zip(labels, hrefs):
        print("开始镇=========")
        print(href)
        yield scrapy.Request(url=response.urljoin(href),
                             callback=self.town_data)
        # polite random delay between requests (blocks the reactor)
        time.sleep(random.randint(1, 3))
def _get_product_info(self, loader, response):
    """Fill product fields on *loader* from a JD product page, then fetch
    the live price/stock state from the c0.3.cn stock API.

    NOTE(review): the price fetch uses a blocking requests.get inside a
    Scrapy callback — it stalls the reactor for the duration of the call.
    """
    # product info
    loader.add_value('product_url', self.url.strip())
    self.logger.info('==url: %s' % self.url)
    # product url '^https?://([A-Za-z0-9\-]+\.)+vvic\.com/item/(\d+)$')
    loader.add_value('product_id', self.url.strip(), re=r'/(\d+)\.html?')
    loader.add_css('product_name', 'div.product-intro div.sku-name::text',
                   MapCompose(lambda v: v.strip()))
    # category id: prefer the one passed via meta, else parse it out of
    # the inline "cat: [..]" JavaScript (IndexError if absent — assumed
    # always present on product pages)
    loader.add_value(
        'product_category_id',
        response.meta.get(
            'category_id',
            re.findall(r'cat\s*:\s*\[([\d,]+)],', response.text)[0]))
    loader.add_value(
        'product_img_index_url',
        urljoin(response.meta['original_url'],
                str(loader.get_css('img#spec-img::attr(data-origin)')[0])))
    # get product details
    for s in response.css('div.p-parameter > ul.parameter2 > li'):
        loader.add_value('product_detail', s.css('::text').extract(),
                         lambda v: v[-1].strip().replace('\xa0', ' '))
    # Get price and stock state
    product_id = loader.get_collected_values('product_id')[0]
    vender_id = re.findall(r'venderId\s*:\s*(\d+),', response.text)[0]
    cat = re.findall(r'cat\s*:\s*\[([\d,]+)],', response.text)[0]
    price_url = (
        'https://c0.3.cn/stock?skuId={0}&area=1_72_4137_0&venderId={1}&'
        'cat={2}&buyNum=1&choseSuitSkuIds=&extraParam='
        '{{%22originid%22:%221%22}}&ch=1&fqsp=0&pdpin=&detailedAdd=null'
        '&callback=jQuery172871')
    price_url = price_url.format(product_id, vender_id, cat)
    # self.logger.info('**price_url: %s' % price_url)
    result = requests.get(price_url)
    result.encoding = 'GBK'
    if result.status_code == 200:
        # response is JSONP: strip the "jQuery172871(...)" wrapper
        json_str = re.findall(r'jQuery172871\((.+)\)', result.text)[0]
        stock_dict = json.loads(json_str)
        loader.add_value('product_sale_price',
                         stock_dict['stock']['jdPrice']['p'])
        loader.add_value('product_price_unit', 'CNY')
        # skuState == 1 means the SKU is on sale
        loader.add_value(
            'product_status',
            'onshelf' if stock_dict['stock']['skuState'] == 1 else 'offshelf')
    else:
        self.logger.error('Cannot get price of product %s' % product_id)
def parse(self, response):
    """Yield a PaperClawerItem for each listing whose title contains
    're-id' (case-insensitive)."""
    dt_nodes = response.xpath('//*[@id="dlpage"]/dl/dt')
    dd_nodes = response.xpath('//*[@id="dlpage"]/dl/dd')
    for dt_node, dd_node in zip(dt_nodes, dd_nodes):
        # last text node of the title div is the title proper
        title = dd_node.xpath(
            './/div[@class="list-title mathjax"]/text()').extract()[-1]
        if "re-id" not in title.lower():
            continue
        pdf_href = dt_node.xpath(
            './span/a[@title="Download PDF"]/@href').extract_first()
        item = PaperClawerItem()
        item["url"] = urljoin(response.url, pdf_href)
        item["authors"] = dd_node.xpath(
            './/div[@class="list-authors"]/a/text()').extract()
        item["title"] = title.strip()
        yield item
def parse_from_market_with_product_page_pagination(self, response):
    """
    parse market with product page to get all product info in this market
    """
    base_url = response.meta['original_url']
    for entry in response.css('div.goods-list.shop-list ul li'):
        href = entry.css(
            'div.item div.desc div.title a::attr(href)').extract_first()
        product_url = urljoin(base_url, href)
        req = scrapy.Request(url=product_url,
                             callback=self.parse_product_from_product_page)
        req.meta['original_url'] = product_url
        req.meta['category_id'] = response.meta['category_id']
        req.meta['platform_code'] = self.platform_code
        yield req
def _extract_product_from_searched_result_part_page1(self, response):
    """Extract product requests from the lazy-loaded second half of a JD
    search result page, then request the next search page.

    Stops (returns) once self.item_count reaches self.item_num.
    """
    # self.logger.info('**keyword: %s' % self.keyword)
    # self.logger.info('html:\n%s' % response.text)
    for record in response.css('li[data-sku].gl-item'):
        # self.logger.info('record: %s' % record)
        if self.item_num <= self.item_count:
            self.logger.info('item_num: %s, item_count: %s' %
                             (self.item_num, self.item_count))
            self.logger.info('last item_count = %s' % self.item_count)
            # raise scrapy.exceptions.CloseSpider()
            return
        item_url = record.css(
            'div.gl-i-wrap > div.p-img a::attr(href)').extract_first()
        # self.logger.info('item_url: %s' % item_url)
        if not item_url:
            self.logger.error('The item has no URL: %s' % record.extract())
            continue
        item_url = urljoin(self.original_url, item_url)
        request = scrapy.Request(
            url=item_url, callback=self.parse_product_from_product_page)
        request.meta['original_url'] = item_url
        request.meta['platform_code'] = self.platform_code
        self.item_count += 1
        yield request
    self.logger.info('item_count = %s' % self.item_count)
    # advance two page numbers: JD splits each visual page into two
    # requests, so the next full page is page_num + 2
    url = ('https://search.jd.com/Search?keyword=%s&enc=utf-8&qrst=1'
           '&rt=1&stop=1&vt=2&psort=0&stock=1&page=%s' %
           (quote(self.keyword), self.page_num + 2))
    request = scrapy.Request(
        url=url,
        callback=self.parse_searched_product_from_search_result_page)
    request.meta['item_num'] = self.item_num
    request.meta['page_num'] = self.page_num + 2
    request.meta['keyword'] = self.keyword
    request.meta['original_url'] = url
    request.meta['platform_code'] = self.platform_code
    yield request
def parse(self, response):
    """Scrape one forum listing page into HackforumsItem objects and
    follow the next-page link.

    Fixes over the original:
    * the loop variables shadowed the selector lists they iterated;
    * the post time was written into item['Title'], clobbering the title —
      it now goes to its own 'Time' field (NOTE(review): 'Time' must be
      declared on HackforumsItem);
    * response.xpath() was called with no selector (TypeError at runtime);
    * missing next-page link no longer yields a request for ''.
    """
    titles = response.xpath('//*[@class=" subject_new"]/a/text()')
    times = response.xpath('//*[@class="lastpost smalltext"]/text()')
    lastposts = response.xpath('//*[@class="lastpost smalltext"]/a/text()')
    authors = response.xpath('//*[@class="author smalltext"]/a/text()')
    links = response.xpath('//*[@class=" subject_new"]/a/@href')
    for title, when, author, lastpost, link in zip(
            titles, times, authors, lastposts, links):
        item = HackforumsItem()
        # keep only plain ASCII letters/digits/space/'?' in the title
        item['Title'] = re.sub(r'[^a-z A-Z0-9?]', '',
                               title.extract().strip())
        item['Time'] = when.extract().strip()
        item['Lastpost'] = lastpost.extract().strip()
        item['Author'] = author.extract().strip()
        item['Link'] = link.extract().strip()
        yield item
    # MyBB marks the next-page anchor with class "pagination_next"
    next_page_url = response.xpath(
        '//a[@class="pagination_next"]/@href').extract_first()
    if next_page_url:
        yield scrapy.Request(response.urljoin(next_page_url),
                             callback=self.parse)
def _get_category_info_from_category_str(self, cate_str):
    """
    The Category string is separated by '|': "jr.jd.com/|金融首页||0"

    Field 0 is either a jd.com/jd.hk (sub)domain path or a numeric
    "cat" id chain like "6144-6167"; field 1 is the category name.
    Populates self.loader with category_url/id/status/level/is_leaf/name.
    """
    cate_list = cate_str.split('|')
    cate_url = cate_list[0]
    if cate_url:
        # domain-style entry, e.g. "jr.jd.com/" or "//mall.jd.hk"
        match = re.match(r'^/?/?([.\w+]+)\.jd\.(com|hk)/?',
                         cate_url.strip())
        # numeric id chain style: "6144-6167|翡翠玉石||0"
        match1 = re.match(r'^(\d+-\d+-?\d*)', cate_url.strip())
        if match:
            # prepend '//' for bare domains so urljoin treats them as
            # network-path references, not relative paths
            self.loader.add_value(
                'category_url',
                urljoin(self.original_url,
                        ('//' + cate_list[0] if re.match(
                            r'[-\w]+\.[-\w]+', cate_list[0]) else
                         cate_list[0])))
        elif match1:
            self.loader.add_value(
                'category_url',
                'http://list.jd.com/list.html?cat=' + cate_url)
        else:
            self.logger.error('Unexpected URL: %s' % cate_url)
        self._get_category_id_from_url(cate_url)
        self.loader.add_value('category_status', '1')
        self.loader.add_value('category_level', '1')
        self.loader.add_value('category_is_leaf', '0')
        self.loader.add_value('category_name', cate_list[1])
    else:
        self.logger.info('Category url is empty. The category string: %s' %
                         cate_str)
def parse_searched_product_from_search_result_page(self, response):
    """Parse a JD search result page: queue product-page requests (up to
    self.item_num total) and request the lazy-loaded second half of the
    page via s_new.php.

    :param response: search result page; meta must carry page_num,
        keyword, platform_code and original_url
    """
    self.page_num = response.meta['page_num']
    self.keyword = response.meta['keyword']
    self.platform_code = response.meta['platform_code']
    self.original_url = response.meta['original_url']
    self.sku_list = []
    self.product_request_list = []
    # already scraped enough items — stop before touching this page
    if self.item_num <= self.item_count:
        self.logger.info('0 - last item_count = %s' % self.item_count)
        # raise scrapy.exceptions.CloseSpider()
        return
    # flag=False means: keep collecting SKUs (needed for the s_new.php
    # request below) but stop queueing further product requests
    flag = True
    for record in response.css('li[data-sku].gl-item'):
        if (self.item_num <= self.item_count
                and not self.product_request_list):
            self.logger.info('1 - last item_count = %s' % self.item_count)
            # raise scrapy.exceptions.CloseSpider()
            return
        if self.item_num <= self.item_count:
            self.logger.info('2 - last item_count = %s' % self.item_count)
            # raise scrapy.exceptions.CloseSpider()
            # return
            flag = False
        item_url = record.css(
            'div.gl-i-wrap > div.p-img a::attr(href)').extract_first()
        item_sku = record.css('::attr(data-sku)').extract_first()
        self.sku_list.append(item_sku)
        if not item_url:
            self.logger.error('The item has no URL: %s' % record.extract())
            continue
        if flag:
            self.product_request_list.append(item_url)
    for item_url in self.product_request_list:
        item_url = urljoin(self.original_url, item_url)
        request = scrapy.Request(
            url=item_url, callback=self.parse_product_from_product_page)
        request.meta['original_url'] = item_url
        request.meta['platform_code'] = self.platform_code
        self.item_count += 1
        yield request
    # Get next 30 products in the same webpage
    self.logger.info('item_count: %s' % self.item_count)
    url = ('https://search.jd.com/s_new.php?keyword=%s&enc=utf-8&qrst=1'
           '&rt=1&stop=1&vt=2&stock=1&page=%s&s=30&scrolling=y&tpl=1_M'
           '&show_items=%s') % (quote(
               self.keyword), self.page_num + 1, ','.join(self.sku_list))
    # JD's ajax endpoint requires a referer of the originating page
    headers = {'referer': response.url}
    request = scrapy.Request(
        url=url,
        headers=headers,
        dont_filter=True,
        callback=self._extract_product_from_searched_result_part_page1)
    yield request
def _get_img_info(self, loader, response):
    """Collect small/middle/big image URLs, sizes and purposes for a JD
    product into *loader*.

    Image paths come from the inline "imageList: [...]" JavaScript; each
    is joined onto three base URLs (small/mid/big CDN prefixes).  Three
    entries per image are appended to every img_* field, in
    small/middle/big order, so the parallel lists stay aligned.
    """
    # image info
    # self.logger.info('**img: %s' %
    #                  loader.get_css('img#spec-img::attr(data-origin)',
    #                                 re=r'(//.+)jfs/'))
    mid_img_url_base = urljoin(
        response.meta['original_url'],
        loader.get_css('img#spec-img::attr(data-origin)',
                       re=r'(//.+)jfs/')[0])
    small_img_url_base = urljoin(
        response.meta['original_url'],
        loader.get_css('li.img-hover > img::attr(src)',
                       re=r'(//.+)jfs/')[0])
    # big images live under the /n0/ path of the same CDN host
    big_img_url_base = urljoin(
        response.meta['original_url'],
        loader.get_value(mid_img_url_base, lambda v: v[0] + '//n0/',
                         re=r'(//[\w.-]+?)/'))
    img_url_list_match = re.findall(r'imageList\s*:\s*(\[.+?]),',
                                    response.text, re.S)
    if img_url_list_match:
        img_url_list = json.loads(img_url_list_match[0])
    else:
        img_url_list = []
    for img_url in img_url_list:
        small_url = small_img_url_base + img_url
        mid_url = mid_img_url_base + img_url
        big_url = big_img_url_base + img_url
        loader.add_value('img_url', small_url)
        loader.add_value('img_url', mid_url)
        loader.add_value('img_url', big_url)
        # for images pipeline
        # loader.add_value('image_urls', big_url)
        # Size is encoded as "s<W>x<H>_" in the path, e.g.:
        # //img14.360buyimg.com/n5/s54x54_jfs/t18286/85/1937544663/60335/fe70148f/5addc747N13eb0a41.jpg
        # //img14.360buyimg.com/n1/s450x450_jfs/t18403/63/1935249568/49837/59c8d6c5/5addc772N4761924e.jpg
        if re.search(r's(\d+[xX]\d+)_', small_url):
            loader.add_value('img_size', small_url, re=r's(\d+[xX]\d+)_')
        else:
            loader.add_value('img_size', '')
        if re.search(r's(\d+[xX]\d+)_', mid_url):
            loader.add_value('img_size', mid_url, re=r's(\d+[xX]\d+)_')
        else:
            loader.add_value('img_size', '')
        # big image has no size marker — pad with an empty entry
        loader.add_value('img_size', '')
        # loader.add_value('status', '0')
        # loader.add_value('status', '0')
        # loader.add_value('status', '0')
        # mark whichever variant equals the product index image
        if small_url in loader.get_output_value('product_img_index_url'):
            loader.add_value('img_purpose', 'index')
        else:
            loader.add_value('img_purpose', '')
        if mid_url in loader.get_output_value('product_img_index_url'):
            loader.add_value('img_purpose', 'index')
        else:
            loader.add_value('img_purpose', '')
        if big_url in loader.get_output_value('product_img_index_url'):
            loader.add_value('img_purpose', 'index')
        else:
            loader.add_value('img_purpose', '')
        loader.add_value('img_description', 'small')
        loader.add_value('img_description', 'middle')
        loader.add_value('img_description', 'big')
def _get_product_info(self, loader, response):
    """Fill product fields on *loader* from a vvic product page: ids,
    name, prices, source (Taobao) info, index image, art no, upload /
    unshelf times, shelf status and the attribute table.
    """
    # product info
    loader.add_value('product_url', self.url.strip())
    # product url '^https?://([A-Za-z0-9\-]+\.)+vvic\.com/item/(\d+)$')
    loader.add_value('product_id', self.url.strip(),
                     Compose(lambda v: v[-1]), re='/item/(\w+)')
    loader.add_css('product_name',
                   'div.product-detail div.d-name strong::text')
    loader.add_css(
        'product_sale_price',
        'div.v-price.d-p div.p-value span.fl strong.d-sale::text')
    loader.add_value('product_price_unit', 'CNY')
    # original (source) listing the vvic item was imported from
    loader.add_css('product_source_url',
                   'div.product-detail div.d-name a::attr(href)')
    loader.add_value('product_source_id', response.text,
                     re="_TID = '(\d+)';")
    loader.add_css('product_source_price',
                   'div.v-price > div.p-value > span.d-sale::text')
    loader.add_value('product_source_price_unit', 'CNY')
    loader.add_value(
        'product_img_index_url',
        urljoin(
            response.meta['original_url'],
            str(
                loader.get_value(response.text, TakeFirst(),
                                 re="_INDEXIMGURL = '(.*)';"))))
    # summary table rows: art number ('货号') and upload time ('上新时间')
    for s in response.css('div.product-detail dl.summary dd'):
        # debugger.print(s)
        if (s.css('div.name::text').extract_first()
                and s.css('div.name::text').extract_first().strip()
                == '货号'):
            loader.add_value(
                'product_art_no',
                s.css('div.value.ff-arial::text').extract_first(),
                lambda v: v.strip())
        if (s.css('div.name::text').extract_first()
                and s.css('div.name::text').extract_first().strip()
                == '上新时间'):
            # pad 'YYYY-MM-DD HH:MM' to a full timestamp with ':00'
            loader.add_value(
                'product_upload_time',
                s.css('div.value.ff-arial::text').extract_first(),
                lambda v: v[0].strip(),
                lambda v: v if len(v) >= 19 else v + ':00',
                re='(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2})')
    # presence of the sold-info timestamp means the item was taken down
    unshelf_time = loader.get_css('div.sold-info::text',
                                  re='(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2})')
    if unshelf_time:
        loader.add_value('product_unshelf_time', unshelf_time,
                         lambda v: v[0].strip(),
                         lambda v: v if len(v) >= 19 else v + ':00')
        loader.add_value('product_status', 'unshelf')
    else:
        loader.add_value('product_status', 'onshelf')
    # loader.add_xpath('product_unshelf_time',
    #                  '//div[@class="sold-info"]/text()',
    #                  re='(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2})')
    # get product details
    for s in response.css('div.d-attr.clearfix > ul > li'):
        loader.add_value('product_detail', s.css('::text').extract(),
                         lambda v: v[-1].strip().replace('\xa0', ' '))
def town_data(self, response):
    """Scrape one fang.com community-listing page into `ajk` items and
    follow the next-page link.

    Builds parallel lists (name/price/year/addr/district/lat/lng/date),
    one entry per listing that has a name, then zips them into items.
    """
    ershou = ajk()
    data_area = response.xpath(
        '//div[@class="houseList"]/div[@class="list rel"]')
    names = []
    prices = []
    bdyears = []
    bdaddrs = []
    bddists = []
    lats = []
    lngs = []
    cdates = []
    for data in data_area:
        print("开始小区=========")
        name = data.xpath(
            'dl[@class="plotListwrap clearfix"]/dd/p/a[@class="plotTit"]/text()'
        ).extract()
        price = data.xpath(
            'div[@class="listRiconwrap"]/p[@class="priceAverage"]/span[1]/text()'
        ).extract()
        bdaddr = data.xpath(
            'dl[@class="plotListwrap clearfix"]/dd/p[2]/text()').extract()
        bddistrict = data.xpath(
            'dl[@class="plotListwrap clearfix"]/dd/p[2]/a[1]/text()'
        ).extract()
        bdyear = data.xpath(
            'dl[@class="plotListwrap clearfix"]/dd/ul[@class="sellOrRenthy clearfix"]/li[3]/text()'
        ).extract()
        if name:
            # fang.com listings carry no coordinates — padded with blanks
            names.append(name[0].strip().rstrip().lstrip())
            lats.append(" ")
            lngs.append(" ")
            # NOTE(review): bddistrict may be an empty list here, which
            # would raise IndexError; `bdaddr` is extracted but unused.
            bddists.append(bddistrict[0])
            bdaddrs.append(" ")
            if bdyear:
                bdyears.append(bdyear[0])
            else:
                bdyears.append('9999')  # sentinel for "year unknown"
            if price:
                # first numeric token of the price text
                tmp = re.findall("[-+]?\d+[\.]?\d*",
                                 price[0].strip().rstrip().lstrip())
                if tmp:
                    prices.append(tmp[0])
                elif not tmp:
                    prices.append('暂无均价')
            elif not price:
                prices.append('暂无均价')
            dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            cdates.append(dt)
    for block_name, block_price, block_bdyear, block_bdaddr, block_bddist, block_lat, block_lng, block_date in zip(
            names, prices, bdyears, bdaddrs, bddists, lats, lngs, cdates):
        # NOTE(review): the single `ershou` item is mutated and re-yielded
        # for every row — downstream must copy it before buffering.
        ershou['house_name'] = block_name
        ershou['house_price'] = block_price
        ershou['house_bdyear'] = block_bdyear
        ershou['house_bdaddr'] = block_bdaddr
        ershou['house_bddist'] = block_bddist
        ershou['house_lat'] = block_lat
        ershou['house_lng'] = block_lng
        ershou['craw_date'] = block_date
        ershou['source_web'] = "fang"
        yield ershou
    next_link = response.xpath(
        '//div[@class="fanye gray6"]/a[contains(text(), "下一页")]/@href'
    ).extract()
    if next_link:
        url = next_link[0]
        print('next page =============' + url)
        time.sleep(random.randint(1, 3))
        yield scrapy.Request(url=response.urljoin(url),
                             callback=self.town_data)
def _get_shop_info(self, loader, response):
    """Fill shop ("market") fields on *loader* from a vvic product page.

    Reads the shop id from inline JavaScript, the shop URL/name from the
    header, and then walks the attr/value table of the shop-info box
    (rank, wangwang, item count, phone, weixin, QQ, origin, address).
    Exits the process if the attr/value table is missing or misaligned.
    """
    # shop info
    loader.add_value('market_id', response.text, re="_SHOPID = '(\d+)';")
    loader.add_value('market_type', 'shop')
    loader.add_value(
        'market_url',
        urljoin(
            response.meta['original_url'],
            str(
                loader.get_css(
                    'div.stall-head.fl div[class*=stall-head-name] a::attr(href)',
                    TakeFirst()))))
    shop_loader = loader.nested_css('div.shop-info div.shop-content')
    shop_loader.add_css('market_name', 'h2.shop-name span::text',
                        lambda v: v[0].strip())
    # shop_table = shop_loader.nested_css('ul.mt10')
    attr_list = response.css(
        'div.shop-info div.shop-content ul.mt10 div.attr::text').extract()
    value_list = response.css(
        'div.shop-info div.shop-content ul.mt10 div.text')
    if len(attr_list) != len(value_list):
        self.logger.error('shop table name and value don\'t match')
        sys.exit(1)
    if not attr_list:
        self.logger.error('shop table name list is empty')
        sys.exit(1)
    # attr cells read "排行:" etc. — keep only the label before ':'
    attr_list = [e.split(':')[0] for e in attr_list]
    if '排行' in attr_list:  # rank
        loader.add_value(
            'market_rank',
            value_list[attr_list.index('排行')].xpath(
                'a/em/text()').extract())
    if '旺旺' in attr_list:  # wangwang contact
        loader.add_value(
            'market_contact_wangwang',
            value_list[attr_list.index(
                '旺旺')].xpath('span[@class="fl"]/text()').extract())
    if '商品' in attr_list:  # item count
        # the page structure is changed: the count is rendered as digit
        # sprites, one per <ol>, with the digit encoded in the class name
        num_class_list = value_list[attr_list.index('商品')].xpath(
            './ol/@class').extract()
        num = ''.join(
            re.findall(r'v-num num(\d+)', n)[0] for n in num_class_list
            if re.findall(r'v-num num(\d+)', n))
        loader.add_value('market_item_num', num)
    if '电话' in attr_list:  # phone numbers, one per <p>
        phone_class_list = value_list[attr_list.index('电话')].xpath('p')
        phones = [
            ''.join(p.xpath('span[@class]/text()').extract())
            for p in phone_class_list
        ]
        loader.add_value('market_contact_phone', phones)
    if '微信' in attr_list:  # weixin
        loader.add_value(
            'market_contact_weixin',
            ''.join(value_list[attr_list.index('微信')].xpath(
                'span[@class and not(@style)]/text()').extract()))
    if 'QQ' in attr_list:
        # loader.add_value(
        #     'market_contact_qq',
        #     value_list[attr_list.index('QQ')].xpath('text()').extract())
        loader.add_value(
            'market_contact_qq',
            ''.join(value_list[attr_list.index(
                'QQ')].xpath('span[@class]/text()').extract()))
    if '产地' in attr_list:  # place of origin
        loader.add_value(
            'market_addr',
            value_list[attr_list.index('产地')].xpath('text()').extract())
    if '地址' in attr_list:  # street address, appended to origin
        loader.replace_value(
            'market_addr',
            loader.get_collected_values('market_addr')[0] + ' ' +
            value_list[attr_list.index('地址')].xpath(
                'text()').extract_first().strip())
def town_data(self, response):
    """Scrape one lianjia community-listing page into `ajk` items and
    follow the next-page link.

    Builds parallel lists (name/price/year/addr/lat/lng/date), one entry
    per listing that has a name, then zips them into items.
    """
    ershou = ajk()
    data_area = response.xpath(
        '//div[@class="list-wrap"]/ul[@class="house-lst"]/li')
    names = []
    prices = []
    bdyears = []
    bdaddrs = []
    lats = []
    lngs = []
    cdates = []
    for data in data_area:
        name = data.xpath('div[@class="info-panel"]/h2/a/text()').extract()
        price = data.xpath(
            'div[@class="info-panel"]/div[@class="col-3"]/div[@class="price"]/span[@class="num"]/text()'
        ).extract()
        bdaddr = data.xpath(
            'div[@class="info-panel"]/div[@class="col-1"]/div[@class="where"]/a[@class="actshowMap_list"]/@xiaoqu'
        ).extract()
        bddistrict = data.xpath(
            'div[@class="info-panel"]/div[@class="col-1"]/div[@class="where"]/a[@class="actshowMap_list"]/@districtname'
        ).extract()
        bdyear = data.xpath(
            'div[@class="info-panel"]/div[@class="col-1"]/div[@class="other"]/div[@class="con"]/text()'
        ).extract()
        if name:
            names.append(name[0].strip().rstrip().lstrip())
            # NOTE(review): bdaddr[0] raises IndexError when the map link
            # is absent — assumed always present on listing rows.
            # @xiaoqu appears to hold "lng,lat"-style coordinates.
            if bdaddr[0]:
                tmp = re.findall("[-+]?\d+[\.]?\d*", bdaddr[0])
                if tmp:
                    lngs.append(tmp[0])
                    lats.append(tmp[1])
                else:
                    lats.append(" ")
                    lngs.append(" ")
            else:
                lats.append(" ")
                lngs.append(" ")
            bdaddrs.append(bddistrict[0])
            # build year sits in the 4th text chunk when present
            if (len(bdyear) >= 4):
                if bdyear[3]:
                    tmp = bdyear[3].strip().rstrip().lstrip()
                    if tmp:
                        bdyears.append(tmp)
                    elif not tmp:
                        bdyears.append('9999')  # sentinel: year unknown
                elif not bdyear[3]:
                    bdyears.append('9999')
            else:
                bdyears.append('9999')
            if price:
                tmp = re.findall("[-+]?\d+[\.]?\d*",
                                 price[0].strip().rstrip().lstrip())
                if tmp:
                    prices.append(tmp[0])
                elif not tmp:
                    prices.append('暂无均价')
            elif not price:
                prices.append('暂无均价')
            dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            cdates.append(dt)
    for block_name, block_price, block_bdyear, block_bdaddr, block_lat, block_lng, block_date in zip(
            names, prices, bdyears, bdaddrs, lats, lngs, cdates):
        # NOTE(review): the single `ershou` item is mutated and re-yielded
        # per row; district field reuses the address value here.
        ershou['house_name'] = block_name
        ershou['house_price'] = block_price
        ershou['house_bdyear'] = block_bdyear
        ershou['house_bdaddr'] = block_bdaddr
        ershou['house_bddist'] = block_bdaddr
        ershou['house_lat'] = block_lat
        ershou['house_lng'] = block_lng
        ershou['craw_date'] = block_date
        ershou['source_web'] = "lianjia"
        yield ershou
    next_link = response.xpath(
        '//div[@class="page-box house-lst-page-box"]/a[contains(text(), "下一页")]/@href'
    ).extract()
    if next_link:
        url = next_link[0]
        print('next page =============' + url)
        time.sleep(random.randint(1, 3))
        yield scrapy.Request(url=response.urljoin(url),
                             callback=self.town_data)
def parse_from_index_page(self, response):
    """
    Parse website index page to get 'city' market info.

    Index url: http://www.vvic.com

    Yields an IndexPage item for all top ('city') markets, a request for
    the site contact page, and one request per city market index page.

    BUG FIX: the per-city logger.info call passed positional arguments
    with no %-placeholders in the format string, which triggers a
    logging formatting error at runtime.
    """
    data_type = response.meta['data_type']
    if data_type in ['market', 'product']:
        original_url = response.meta['original_url']
        # get the top market info
        loader = ItemLoader(item=IndexPage(), response=response)
        top_market_num = 0
        # platform info
        loader.add_value('platform_code', response.meta['platform_code'])
        # market list is embedded as a JavaScript string literal
        global_markets_str = re.findall(
            r"_gobal_marketes\s*=\s*'\[(.*)\]';", response.text)[0]
        global_markets = re.findall(
            r"panggeFlag=(\d+),\s*code=(\w+),\s*rankFlag=(\d+),"
            "\s*modelFlag=\d+,\s*name=(\w+),\s*showHot=\d+,\s*id=(\d+),",
            global_markets_str)
        next_url_list = []
        next_code_list = []
        next_addr_list = []
        next_market_id_list = []
        for market in global_markets:
            # tuple layout: (panggeFlag, code, rankFlag, name, id)
            loader.add_value('market_id', market[-1])
            loader.add_value('market_name', market[-2])
            loader.add_value('market_code', market[1])
            loader.add_value('market_type', 'city')
            loader.add_value('parent_market_id', '0')
            loader.add_value(
                'market_url',
                urljoin(response.meta['original_url'], market[1]))
            # all city markets are in Guangdong except the Hangzhou
            # menswear market
            if market[-2] == '杭州男装':
                loader.add_value('market_addr', '浙江省 ' + market[-2])
            else:
                loader.add_value('market_addr', '广东 ' + market[-2])
            loader.add_value('market_status', '1')
            next_url_list.append(
                urljoin(response.meta['original_url'], market[1]))
            next_market_id_list.append(market[-1])
            next_code_list.append(market[1])
            if market[-2] == '杭州男装':
                next_addr_list.append('浙江省 ' + market[-2])
            else:
                next_addr_list.append('广东 ' + market[-2])
            top_market_num += 1
        loader.add_value(
            'created_time',
            datetime.datetime.strftime(datetime.datetime.today(),
                                       '%Y-%m-%d %H:%M:%S'))
        yield loader.load_item()
        # get the basic info about the website
        request = scrapy.Request(url=urljoin(response.request.url,
                                             '/contact.html'),
                                 callback=self.parse_from_contact_page)
        request.meta['original_url'] = original_url
        request.meta['platform_code'] = self.platform_code
        request.meta['market_num'] = top_market_num
        yield request
        # get city market info
        for (url, mid, code, addr) in zip(next_url_list,
                                          next_market_id_list,
                                          next_code_list, next_addr_list):
            # lazy %-formatting with explicit placeholders (see docstring)
            self.logger.info('url: %s, mid: %s, code: %s, addr: %s',
                             url, mid, code, addr)
            request = scrapy.Request(
                url=url, callback=self.parse_from_city_index_page)
            request.meta['original_url'] = url
            request.meta['platform_code'] = self.platform_code
            request.meta['market_id'] = mid
            request.meta['market_code'] = code
            request.meta['market_addr'] = addr
            request.meta['data_type'] = data_type
            yield request
def parse_from_search_page(self, response):
    """
    parse search index page to get category info

    entrance url: http://www.vvic.com/gz/list/index.html

    data_type == 'category': yields one request per top category plus a
    SearchPage item collecting all of them.
    data_type == 'product': yields one request to the vvic search ajax
    endpoint for the given keyword/city/page.

    BUG FIX: the ajax query string contained the mojibake '¤tPage'
    (the HTML entity '&curren;' swallowed '&curr') — restored the
    intended '&currentPage' parameter.
    """
    data_type = response.meta['data_type']
    if data_type == 'category':
        original_url = response.meta['original_url']
        loader = ItemLoader(item=SearchPage(), response=response)
        # platform info
        loader.add_value('platform_code', response.meta['platform_code'])
        # get top category code: each filter anchor carries its id in
        # the data-val attribute
        for s in response.css("div.search-condition div.screen "
                              "div.nav-category.nav-pid "
                              "div.nc-value div.types a[href='#']"):
            pid = s.xpath('@data-val').extract_first()
            url = urljoin(original_url, '?pid=%s' % pid)
            loader.add_value('category_id', pid)
            loader.add_value('parent_category_id', '0')
            loader.add_value('category_name',
                             s.xpath('text()').extract_first().strip())
            loader.add_value('category_url', url)
            loader.add_value('category_status', '1')
            loader.add_value('category_level', '1')
            loader.add_value('category_is_leaf', '0')
            request = scrapy.Request(
                url=url,
                callback=self.parse_from_search_page_with_top_category)
            request.meta['original_url'] = url
            request.meta['category_id'] = pid
            request.meta['platform_code'] = self.platform_code
            request.meta['data_type'] = data_type
            yield request
        loader.add_value(
            'created_time',
            datetime.datetime.strftime(datetime.datetime.today(),
                                       '%Y-%m-%d %H:%M:%S'))
        yield loader.load_item()
    if data_type == 'product':
        keyword = response.meta['keyword']
        city = response.meta['city']
        # default to the first result page when none was requested
        page_num = (response.meta.get('page_num') if response.meta.get(
            'page_num', None) else '1')
        self.logger.info('keyword = %s, city = %s, page_num = %s' %
                         (keyword, city, page_num))
        url = ('https://www.vvic.com/apic/search/asy?'
               'merge=1&q=%s&searchCity=%s&currentPage=%s' %
               (quote(keyword), city, page_num))
        request = scrapy.Request(
            url=url,
            callback=self.parse_searched_product_from_search_ajax_page)
        request.meta['item_num'] = response.meta['item_num']
        request.meta['page_num'] = page_num
        request.meta['keyword'] = response.meta['keyword']
        request.meta['city'] = response.meta['city']
        request.meta['original_url'] = url
        request.meta['platform_code'] = self.platform_code
        yield request
def parse_from_city_index_page(self, response):
    """
    Parse a city market index page to get 'mall' market info (the
    '市场' tab in the web page). City market index e.g.:
    http://www.vvic.com/gz

    Yields one MarketPage item with all malls, one item updating the
    city market's mall count, and one Request per mall to
    parse_from_mall_index_page.

    Expected response.meta keys: 'data_type', 'original_url',
    'platform_code', 'market_id', 'market_code', 'market_addr'.
    """
    data_type = response.meta['data_type']
    if data_type not in ('market', 'product'):
        return

    original_url = response.meta['original_url']
    parent_id = response.meta['market_id']
    parent_addr = response.meta['market_addr']

    loader = ItemLoader(item=MarketPage(), response=response)
    loader.add_value('platform_code', response.meta['platform_code'])

    # one entry per mall: (url, market_id, market_code, addr, name)
    children = []
    for ele in response.css('div.index_markets div.index_markets_list a'):
        href = ele.xpath('@href').extract_first()
        name = ele.xpath('./text()').extract_first()
        # '解放南鞋城' (market_code 'jfn') is special: its mall links
        # look like http://www.vvic.com/jfn/markets.html#floor200, so
        # the sub-market id is the fragment, not a path segment.
        if response.meta['market_code'] == 'jfn':
            mid = parent_id + '-' + href.split('#')[-1]
            code = 'jfn'
        else:
            mid = href.split('/')[-1]
            code = None
        url = urljoin(original_url, href)
        addr = parent_addr + ' ' + name
        loader.add_value('market_id', mid)
        loader.add_value('parent_market_id', parent_id)
        loader.add_value('market_name', name)
        loader.add_value('market_type', 'mall')
        loader.add_value('market_url', url)
        loader.add_value('market_addr', addr)
        loader.add_value('market_status', '1')
        children.append((url, mid, code, addr, name))

    loader.add_value(
        'created_time',
        datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    yield loader.load_item()

    # update number of mall markets for the city market itself
    loader = ItemLoader(item=MarketPage(), response=response)
    loader.add_value('platform_code', response.meta['platform_code'])
    loader.add_value('market_id', parent_id)
    loader.add_value('market_url', original_url)
    loader.add_value('market_item_num', len(children))
    yield loader.load_item()

    # follow each mall; collect the jfn sub-markets so their shared
    # page is downloaded only once (e.g. both
    # http://www.vvic.com/jfn/markets.html#floor201 and #floor202
    # point at the same document)
    jfn_url = ''
    jfn_market_id_list = []
    for url, mid, code, addr, name in children:
        if code == 'jfn':
            jfn_url = url
            jfn_market_id_list.append(mid)
            continue
        request = scrapy.Request(
            url=url, callback=self.parse_from_mall_index_page)
        request.meta['original_url'] = url
        request.meta['platform_code'] = self.platform_code
        request.meta['market_id'] = mid
        request.meta['market_code'] = code
        request.meta['market_addr'] = addr
        request.meta['market_name'] = name
        request.meta['data_type'] = data_type
        yield request

    # BUG FIX: the original built this request from the stale loop
    # variable `url` (NameError when the mall list is empty, wrong URL
    # otherwise) and yielded it even when no jfn market was seen.
    if jfn_url:
        jfn_index_url = jfn_url.split('#')[0]
        request = scrapy.Request(
            url=jfn_index_url, callback=self.parse_from_mall_index_page)
        request.meta['original_url'] = jfn_index_url
        request.meta['platform_code'] = self.platform_code
        request.meta['market_id_list'] = jfn_market_id_list
        request.meta['market_code'] = 'jfn'
        request.meta['data_type'] = data_type
        yield request
def parse_from_mall_index_page(self, response):
    """
    Parse a mall market index page to get 'floor' market info and
    'shop' market info. Mall market index e.g.:
    http://www.vvic.com/xt/shops/400

    One MarketPage item accumulates values for every floor and is
    yielded once via the for/else below; shop tables are delegated to
    self._parse_market_from_shop_table, and in 'product' mode one
    Request per shop is yielded.

    Expected response.meta keys: 'original_url', 'platform_code',
    'market_code', 'data_type', plus 'market_id_list' for the special
    'jfn' mall or 'market_id'/'market_name'/'market_addr' otherwise.
    """
    original_url = response.meta['original_url']
    floor_market_item_num = 0  # running total of shop counts over floors
    shop_market_item_num = 0   # shop count of the last matched table
    # get info of market floor — values are appended per floor and the
    # combined item is emitted once after the loop
    loader = ItemLoader(item=MarketPage(), response=response)
    loader.add_value('platform_code', response.meta['platform_code'])
    selector_list = response.css(
        'div.w.w-shops div.mk-shops dl.stall-table')  # floor info tables
    for ele in selector_list:
        data_id = ele.xpath('@data-id').extract_first()
        # The '解放南鞋城' mall ('jfn') is special: each of its floors
        # was already registered as its own mall upstream, so shop info
        # is extracted directly and no 'floor' items are produced.
        if response.meta['market_code'] == 'jfn':
            # map bare floor id -> full market id ('<mall>-<floor>');
            # NOTE(review): rebuilt on every iteration — could be
            # hoisted out of the loop.
            data_id_dict = {
                e.split('-')[-1]: e
                for e in response.meta['market_id_list']
            }
            if data_id in data_id_dict:
                # update number of market shops for this jfn sub-market
                shop_market_item_num = ele.css(
                    '[data-id="%s"] dt span.count::text' %
                    data_id).re(r'(\d+)')[0]
                loader.add_value('market_id', data_id_dict[data_id])
                loader.add_value('market_url',
                                 urljoin(original_url, '#%s' % data_id))
                loader.add_value('market_item_num', shop_market_item_num)
                # get market shop info
                data_index = ele.xpath('./@data-index').extract_first()
                # NOTE(review): rebinds the loop variable name; outer
                # iteration is unaffected because the for statement
                # already holds the original iterator.
                selector_list = ele.css('dd ul.floor-item-%s li.last' %
                                        data_index)
                yield self._parse_market_from_shop_table(
                    response, selector_list, data_id_dict[data_id])
                # follow every shop when crawling products
                if response.meta['data_type'] == 'product':
                    for s in selector_list:
                        url = urljoin(
                            original_url,
                            s.xpath('a/@href').extract_first())
                        request = scrapy.Request(
                            url=url,
                            callback=(self.parse_from_market_shop_page))
                        request.meta['original_url'] = url
                        request.meta['data_type'] = response.meta[
                            'data_type']
                        request.meta['platform_code'] = self.platform_code
                        yield request
            continue
        if data_id == '-1':  # skip the "recommended stalls" pseudo-floor
            continue
        # A mall whose only table is the '全部' ("All") pseudo-floor has
        # no real floors: take its count as both floor and shop totals,
        # emit the shops, and stop (the break skips the for/else).
        if len(selector_list) == 1 and ele.xpath(
                'dt/h2/text()').extract_first().strip() == '全部':
            shop_market_item_num = ele.css(
                '[data-id="%s"] dt span.count::text' %
                data_id).re(r'(\d+)')[0]
            # NOTE(review): assigns a *string* to the int counter; the
            # str() call at the bottom keeps the emitted item uniform.
            floor_market_item_num = shop_market_item_num
            # get market shop info
            data_index = ele.xpath('./@data-index').extract_first()
            selector_list = ele.css('dd ul.floor-item-%s li.last' %
                                    data_index)
            yield self._parse_market_from_shop_table(
                response, selector_list, response.meta['market_id'])
            break
        # ordinary floor: append one row of values to the shared loader
        loader.add_value('market_id',
                         response.meta['market_id'] + '-' + data_id)
        loader.add_value('parent_market_id', response.meta['market_id'])
        loader.add_value(
            'market_name', response.meta['market_name'] + ' ' +
            ele.xpath('dt/h2/text()').extract_first().strip())
        loader.add_value('market_type', 'floor')
        loader.add_value('market_url',
                         urljoin(original_url, '#%s' % data_id))
        loader.add_value(
            'market_addr', response.meta['market_addr'] + ' ' +
            ele.xpath('dt/h2/text()').extract_first().strip())
        loader.add_value('market_status', '1')
        loader.add_value('market_item_num',
                         ele.css('dt span.count::text').extract_first(),
                         re='(\d+)')
        floor_market_item_num += int(
            ele.css('dt span.count::text').re(r'(\d+)')[0])
        # get market shop info
        data_index = ele.xpath('./@data-index').extract_first()
        selector_list = ele.css('dd ul.floor-item-%s li.last' %
                                data_index)
        yield self._parse_market_from_shop_table(
            response, selector_list,
            response.meta['market_id'] + '-' + data_id)
        # follow every shop on this floor when crawling products
        if response.meta['data_type'] == 'product':
            for s in selector_list:
                url = urljoin(original_url,
                              s.xpath('a/@href').extract_first())
                request = scrapy.Request(
                    url=url, callback=self.parse_from_market_shop_page)
                request.meta['original_url'] = url
                request.meta['data_type'] = response.meta['data_type']
                request.meta['platform_code'] = self.platform_code
                yield request
    else:
        # for/else: runs only when the loop was NOT terminated by the
        # '全部' break above — emit the accumulated floor item once.
        loader.add_value(
            'created_time',
            datetime.datetime.strftime(datetime.datetime.today(),
                                       '%Y-%m-%d %H:%M:%S'))
        yield loader.load_item()
    # update number of market floors for the mall itself;
    # '解放南鞋城' ('jfn') already emitted its counts above
    if response.meta['market_code'] != 'jfn':
        loader = ItemLoader(item=MarketPage(), response=response)
        loader.add_value('platform_code', response.meta['platform_code'])
        loader.add_value('market_id', response.meta['market_id'])
        loader.add_value('market_url', original_url)
        loader.add_value('market_item_num', str(floor_market_item_num))
        loader.add_value(
            'created_time',
            datetime.datetime.strftime(datetime.datetime.today(),
                                       '%Y-%m-%d %H:%M:%S'))
        # NOTE(review): 'market_item_num' is added a second time here
        # (shop count after floor count); this only makes sense if the
        # item pipeline merges repeated values — verify against the
        # MarketPage output processor.
        loader.add_value('market_item_num', shop_market_item_num)
        yield loader.load_item()