def parse_good_list(self, response):
    """Parse the goods list page under a third-level category."""
    for g_item in response.css('div#plist li.gl-item div.j-sku-item'):
        item = {
            GIF.SKUID: g_item.xpath('./@data-sku').extract_first(),
            GIF.NAME: g_item.css('div.p-name a em::text').extract_first(),
            GIF.URL: g_item.css('div.p-name a::attr(href)').extract_first(),
            GIF.CATEGORY: response.meta[GIF.CATEGORY]
        }
        item[GIF.URL] = urlparse.urljoin(response.url, item[GIF.URL])
        req = Request(item[GIF.URL], callback=self.parse_good_brand)
        req.meta['item'] = item
        yield req

    # Follow the next page of this category's listing.
    next_page_url = response.css(
        '#J_bottomPage span.p-num a.pn-next::attr(href)').extract_first()
    if next_page_url:
        req = Request(url=urlparse.urljoin(response.url, next_page_url),
                      callback=self.parse_good_list,
                      dont_filter=True)
        req.meta[GIF.CATEGORY] = response.meta[GIF.CATEGORY]
        yield req
def start_requests(self):
    # Generate the start URLs for the spider.
    urls = "http://www.qiushibaike.com/text/"
    yield Request(url=urls, callback=self.parse)
    for i in range(1, 6):
        url = "http://www.qiushibaike.com/text/page/" + str(i) + "/?s=4964137"
        yield Request(url=url, callback=self.parse)
def parse_good_list(self, response):
    """Parse the goods list page under a third-level category."""
    for g_item in response.css('div#plist li.gl-item div.j-sku-item'):
        item = {
            GIF.SKUID: g_item.xpath('./@data-sku').extract_first(),
            GIF.NAME: ''.join(g_item.css('div.p-name a em::text').extract()),
            GIF.URL: g_item.css('div.p-name a::attr(href)').extract_first(),
            GIF.CATEGORY: response.meta[GIF.CATEGORY]
        }
        # Skip SKUs that are not sold directly by JD (not self-operated).
        if not self.is_sku_self_supported(item[GIF.SKUID]):
            continue
        item[GIF.URL] = urlparse.urljoin(response.url, item[GIF.URL])
        req = Request(item[GIF.URL],
                      callback=self.parse_good_brand,
                      errback=self.errback_http)
        req.meta['item'] = item
        # req.meta['dont_redirect'] = True
        yield req

    # Follow the next page of this category's listing.
    next_page_url = response.css(
        '#J_bottomPage span.p-num a.pn-next::attr(href)').extract_first()
    if next_page_url:
        req = Request(url=urlparse.urljoin(response.url, next_page_url),
                      callback=self.parse_good_list,
                      errback=self.errback_http,
                      dont_filter=True)
        req.meta[GIF.CATEGORY] = response.meta[GIF.CATEGORY]
        yield req
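# `is_sku_self_supported` and `errback_http` are referenced above but not defined
# in this snippet. The sketches below are illustrative assumptions, not the original
# implementations: the SKU check uses the common (unverified) heuristic that JD
# self-operated SKU ids are comparatively short numeric strings, and the errback
# simply logs the twisted Failure that Scrapy passes to request errbacks.
def is_sku_self_supported(self, sku_id):
    # Assumed heuristic: short, purely numeric ids are treated as self-operated.
    return sku_id is not None and sku_id.isdigit() and len(sku_id) <= 9

def errback_http(self, failure):
    # Log failed requests so broken category/product pages show up in the logs.
    logging.error(repr(failure))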
def parse_good_price(self, response):
    """Parse the price of each good."""
    try:
        pt_index = response.meta['pt_index']
        if response.meta['is_jd_api']:
            price = self.jdPriceTool.get_price_from_response(response)
        else:
            price = self.priceToolList[pt_index].get_price_from_response(response)
        item = response.meta['item']
        item[GIF.PRICE] = price
        item[GIF.UPDATE_TIME] = datetime.utcnow()
        item[GIF.SOURCE] = self.source
        good_item = GItem(item)
        yield good_item
    except Exception as e:
        # Expected response format: [{"id":"J_4426168","p":"23.90","m":"32.01","op":"32.00"}]
        logging.error(u"Failed to parse price, url: " + response.url)
        logging.error(e.message)
        logging.error(traceback.format_exc())
        if response.meta['is_jd_api']:
            raise CloseSpider(u'Failed to parse price, response body: ' + response.body)
        else:
            # Fall back to querying the JD price API.
            item = response.meta['item']
            req = Request(url=self.jdPriceTool.get_price_url(item[GIF.SKUID]),
                          callback=self.parse_good_price)
            req.meta['item'] = item
            req.meta['is_jd_api'] = True
            yield req
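# `jdPriceTool` is used above but not shown. A minimal sketch, assuming the JD price
# API returns JSON shaped like the comment in the except block
# ([{"id":"J_4426168","p":"23.90","m":"32.01","op":"32.00"}], where "p" is the
# selling price). The URL template is a hypothetical placeholder, not necessarily
# the endpoint the original code used.
import json

class JdPriceTool(object):
    PRICE_URL = 'https://p.3.cn/prices/mgets?skuIds=J_{0}'  # assumed endpoint

    def get_price_url(self, sku_id):
        return self.PRICE_URL.format(sku_id)

    def get_price_from_response(self, response):
        # The first element of the returned list carries the price for the SKU.
        data = json.loads(response.body)
        return float(data[0]['p'])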
def parse_nearbyHotel(self, response):
    # Retry on blocked or failed responses.
    if response.status in (403, 500, 503, 10060):
        yield Request(response.url,
                      callback=self.parse_nearbyHotel,
                      dont_filter=True)
        return
    item = UpItem()
    item['coll'] = 'nearbyhotels'
    item['resID'] = int(re.search(r'd\d+', response.url).group(0)[1:])
    hotels = response.xpath(
        '//a[@class="property_title prominent "]/text()').extract()
    hrefs = response.xpath(
        '//a[@class="property_title prominent "]/@href').extract()
    ids = []
    for href in hrefs:
        ids.append(int(re.search(r'd\d+', href).group(0)[1:]))
    diss = response.xpath('//b/text()').extract()
    hotel_list = []
    length = len(ids)
    for i in range(0, length):
        hotelitem = NearbyItem()
        hotelitem['ID'] = ids[i]
        hotelitem['name'] = hotels[i]
        hotelitem['dis'] = float(diss[i][0:-2])
        hotel_list.append(hotelitem)
    item['list'] = hotel_list
    print(item)
    yield item
    url = response.xpath(
        '//a[@class="nav next taLnk ui_button primary"]/@href').extract()[0]
    next_url = self.base + url
    yield Request(next_url, self.parse_nearbyHotel)
def parse(self, response):
    company_selector = Selector(response)
    company_iterator = company_selector.xpath(r'//ul[@class="list-search"]/li')
    for eachcompany in company_iterator:
        companyitem = QiyeqianzhanItem()
        compony_name_1 = eachcompany.xpath(r'div[@class="tit"]/a/text()[1]').extract()
        compony_name_2 = eachcompany.xpath(r'div[@class="tit"]/a/em/text()').extract()
        compony_name_3 = eachcompany.xpath(r'div[@class="tit"]/a/text()[2]').extract()
        compony_url = eachcompany.xpath(r'div[@class="tit"]/a/@href').extract()
        if compony_name_1:
            if compony_name_2:
                if compony_name_3:
                    companyitem['compony_name'] = (compony_name_1[0].strip()
                                                   + compony_name_2[0].strip()
                                                   + compony_name_3[0].strip())
                else:
                    companyitem['compony_name'] = (compony_name_1[0].strip()
                                                   + compony_name_2[0].strip())
            else:
                companyitem['compony_name'] = compony_name_1[0].strip()
        else:
            if compony_name_2:
                if compony_name_3:
                    companyitem['compony_name'] = (compony_name_2[0].strip()
                                                   + compony_name_3[0].strip())
                else:
                    companyitem['compony_name'] = compony_name_2[0].strip()
        if compony_url:
            companyitem['compony_url'] = self.parent_url + compony_url[0].strip()
        companyitem["id"] = str(self.i)
        self.i += 1
        # yield companyitem  # test the detail-page URL
        print(self.parent_url + compony_url[0].strip())
        time.sleep(2)
        yield Request(self.parent_url + compony_url[0].strip(),
                      meta={"item": companyitem},
                      callback=self.parse_article_content,
                      cookies=self.cookies2)
    # nextlink = response.xpath(r'//div[@class="page-list"]/a[contains(text(),"下一页")]/@href').extract()
    nextlink = response.xpath(
        r'//div[@class="page-list"]/a[@class="next"]/@href').extract()
    if nextlink:
        Nextlink = nextlink[0].strip()
        request = Request(self.parent_url + Nextlink,
                          callback=self.parse,
                          cookies=self.cookies)
        time.sleep(2)
        yield request
    else:
        print('No next-page link found')
def start_requests(self):
    for url in self.list_of_start_urls:
        if not url.startswith('http'):
            url = 'http://www.' + url
        request = Request(url)
        request.meta['orig_domain'] = urlparse(url).netloc
        yield request
def start_requests(self):
    try:
        req = Request(
            'http://www.lfp.fr/competitionPluginCalendrierResultat/changeCalendrierHomeJournee?c=ligue1&js=%s&id=0'
            % self.journee,
            dont_filter=True)
    except AttributeError:
        req = Request('http://www.lfp.fr/ligue1/calendrier_resultat',
                      dont_filter=True)
    return [req]
def start_requests(self):
    tags = BOOK_CATEGORY.keys()
    for tag in tags:
        url = 'https://book.douban.com/tag/' + tag
        request = Request(url=url,
                          callback=self.parse,
                          cookies={'bid': random.choice(self.bids)})
        request.meta['real_tag'] = tag
        yield request
def start_requests(self):
    url = "https://www.apartments.com/"
    input_file_dir = getattr(self, 'input', None)
    with open(input_file_dir) as data_file:
        data = json.load(data_file)
    if data:
        for query in data['queries']:
            area_url = "".join([url, query['area']])
            yield Request(area_url, self.parse)
    else:
        yield Request(url, self.parse)
def parse(self, response):
    self.check_code(response)
    movies_url = response.xpath('//a[@class="nbg"]/@href').extract()
    for movie_url in movies_url:
        logging.info("movie_url: %s", movie_url)
        yield Request(movie_url, callback=self.parse_subject)
    next_url = response.xpath('//span[@class="next"]/a/@href').extract()
    if next_url:
        logging.info("tag: %s", next_url[0])
        yield Request(next_url[0])
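# `check_code` is called above but not defined in this snippet. A minimal sketch,
# assuming it guards against Douban's anti-crawler verification page: when the
# response looks like a captcha/verification redirect, stop the crawl instead of
# silently yielding nothing. The checked URL fragment is an assumption for
# illustration only.
from scrapy.exceptions import CloseSpider

def check_code(self, response):
    if 'sec.douban.com' in response.url:
        raise CloseSpider('Blocked by anti-crawler check: %s' % response.url)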
def parse_pagination(self, response):
    product_urls = response.css('.product-image a ::attr(href)').extract()
    for product_url in product_urls:
        yield Request(url=product_url,
                      meta=response.meta.copy(),
                      callback=self.parse_product)
    raw_url = response.css(
        '.infinite-scroll-placeholder ::attr(data-grid-url)').extract_first()
    if raw_url:
        yield Request(url=HTMLParser().unescape(raw_url),
                      callback=self.parse_pagination)
def parse(self, response):
    root = Selector(response)
    links = root.xpath('//a[@title]/@href').getall()
    for link in links:
        yield Request(urljoin(response.url, link),
                      callback=self.parse_laptop_page)
    next_page_url = root.xpath(
        '//li[@class="listing__pagination-nav"][last()]/a/@href').get()
    if next_page_url:
        yield Request(urljoin(response.url, next_page_url), callback=self.parse)
def parse_pagination(self, response):
    trail = self.add_trail(response)
    product_urls = response.css('a.product-link ::attr(href)').extract()
    for product_url in product_urls:
        yield Request(url=response.urljoin(product_url),
                      meta={"trail": trail},
                      callback=self.parse_product)
    next_page = response.css('a#loadmore ::attr(href)').extract_first()
    if next_page:
        yield Request(url=response.urljoin(next_page),
                      meta={"trail": trail},
                      callback=self.parse_pagination)
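# `add_trail` is referenced above but not shown. A plausible sketch, assuming the
# usual "trail" pattern of accumulating the URLs a request has passed through so
# the product parser can record where an item was found; the exact shape of the
# trail entries is an assumption.
def add_trail(self, response):
    trail = response.meta.get('trail', [])
    return trail + [response.url]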
def start_requests(self):
    connection = pymongo.MongoClient(settings['MONGODB_ADDR'])
    db = connection[settings['MONGODB_DB']]
    posts = db['posts']
    logger.info('Querying mongo to get participants')
    participants = posts.aggregate(pipeline=[
        {'$project': {'users_participated': 1, 'blog_id': 1}},
        {'$unwind': '$users_participated'},
        {
            '$group': {
                '_id': {
                    'username': '******',
                    'blog_id': '$blog_id'
                },
                'count': {'$sum': 1}
            },
        },
        {'$sort': {'count': -1}},
    ])
    participants = list(participants)
    logger.info('Total {} participants'.format(len(participants)))
    for entry in participants:
        _id = entry['_id']
        blog_url = BLOG_URLS[_id['blog_id']]
        yield Request(
            urlparse.urljoin(blog_url,
                             '/users/{}/favorites/'.format(_id['username'])),
            self.parse_favorites,
            priority=1,
        )
        yield Request(
            urlparse.urljoin(blog_url, '/users/{}/'.format(_id['username'])),
            self.parse_user,
        )
def parse_res(self, response):
    resID = int(re.search(r'd\d+', response.url).group(0)[1:])
    # Nearby hotels
    url = response.url
    # print(response.url)
    hotelurl = response.url
    hotelurl = hotelurl.replace('Restaurant_Review', 'HotelsNear')
    hotelurl = hotelurl.replace('-Reviews-', '-')
    # print(url)
    yield Request(hotelurl,
                  callback=self.parse_nearbyHotel,
                  meta={'resID': resID})
    # Nearby restaurants
    resurl = url
    resurl = resurl.replace('Restaurant_Review', 'RestaurantsNear')
    resurl = resurl.replace('-Reviews-', '-')
    for i in range(0, 7):
        if i == 0:
            yield Request(resurl,
                          callback=self.parse_nearbyRes,
                          meta={'resID': resID})
        else:
            index = re.search(r'd\d+', resurl).span()[-1]
            next_url = resurl[0:index] + "-oa%s" % (i * 30) + resurl[index:]
            print(next_url)
            yield Request(next_url,
                          callback=self.parse_nearbyRes,
                          meta={'resID': resID})
    # Nearby attractions
    spoturl = url
    spoturl = spoturl.replace('Restaurant_Review', 'AttractionsNear')
    spoturl = spoturl.replace('-Reviews-', '-')
    for i in range(0, 7):
        if i == 0:
            yield Request(spoturl,
                          callback=self.parse_nearbySpot,
                          meta={'resID': resID})
        else:
            index = re.search(r'd\d+', spoturl).span()[-1]
            next_url = spoturl[0:index] + "-oa%s" % (i * 30) + spoturl[index:]
            print(next_url)
            yield Request(next_url,
                          callback=self.parse_nearbySpot,
                          meta={'resID': resID})
def parse_category(self, response):
    prod_links = response.xpath(
        '//a[@class="product-block__image"]/@href').extract()
    for link in prod_links:
        yield Request(response.urljoin(link),
                      self.parse_item,
                      meta={'category': response.meta['category']})
    # Handle pagination.
    next_links = response.xpath('//a[@rel="next"]/@href').extract()
    if next_links:
        yield Request(response.urljoin(next_links[0]),
                      self.parse_category,
                      meta={'category': response.meta['category']})
def parse(self, response):
    if self.i == 0:
        citys = response.xpath('//div[@class="geo_name"]/a/@href').extract()
    else:
        citys = response.xpath('//ul[@class="geoList"]/li/a/@href').extract()
    for city in citys:
        url = urllib.parse.urljoin(self.base, city)
        yield Request(url, callback=self.parse_city)
    self.i += 20
    if self.i <= 720:
        next_url = self.base + ("/Restaurants-g294211-oa%s-China.html" % self.i)
        yield Request(next_url, callback=self.parse)
def parse(self, response):
    ress = response.xpath('//a[@class="property_title"]/@href').extract()
    for res in ress:
        url = self.base + res
        # print(url)
        yield Request(url, callback=self.parse_rev)
    # Page through the full restaurant list.
    hrefs = response.xpath(
        '//a[@class="nav next rndBtn ui_button primary taLnk"]/@href').extract()
    if len(hrefs) > 0:
        href = hrefs[0]
        next_url = self.base + href
        next_url = next_url.replace(' ', '')  # str.replace returns a new string
        yield Request(next_url, callback=self.parse)
def parse(self, response):
    ip_list = response.xpath('//*[@id="ip_list"]/tr')
    for ip in ip_list[1:]:
        item = CollectipsItem()
        try:
            item['IP'] = ip.xpath('td[2]/text()')[0].extract()
            item['PORT'] = ip.xpath('td[3]/text()')[0].extract()
            item['POSITION'] = ip.xpath('td[4]/a/text()')[0].extract()
            item['TYPE'] = ip.xpath('td[6]/text()')[0].extract()
            item['SPEED'] = ip.xpath('td[7]/div/@title').re(
                r'\d{0,}\.\d{0,}')[0]
            item['LAST_CHECK_TIME'] = ip.xpath('td[10]/text()')[0].extract()
            yield item
        except Exception:
            # Skip rows that do not have the expected columns.
            pass
    # Get the next page link.
    next_page_nums = response.xpath('//*[@class="next_page"]/@href')
    if next_page_nums:
        next_page = self.link_url + next_page_nums[0].extract()
        print(next_page)
        yield Request(url=next_page, callback=self.parse)
    else:
        print("Crawling finished")
def start_requests(self):
    self.cp = MyConfigParser()
    self.cp.read("conf/conf.ini")
    self.usr = self.cp['user_info']['user']
    self.pwd = self.cp['user_info']['pwd']
    self.usr_name = self.cp['user_info']['usrname']
    self.usr_IDcard = self.cp['user_info']['usrIDcard']
    self.usr_phnum = self.cp['user_info']['usrphnum']
    self.seat_type = self.cp['user_info']['seat_type']
    self.usr_type = self.cp['user_info']['usr_type']
    self.usr_type_code = self.cp['user_type_code'][self.usr_type]
    self.from_station = self.cp['station_info']['from_station']
    self.from_station_code = stations.get(self.from_station)
    self.to_station = self.cp['station_info']['to_station']
    self.to_station_code = stations.get(self.to_station)
    self.earliest = self.cp['station_info']['earliest']
    self.lastest = self.cp['station_info']['lastest']
    self.date = self.cp['station_info']['date']
    print('start_requests')
    yield Request("http://www.12306.cn/mormhweb/",
                  meta={'cookiejar': self.cookiejar},
                  headers=self.header)
def start_requests(self):
    # Parse the weibo homepage.
    home_url = "http://weibo.cn/u/%s" % self.uid
    yield Request(url=home_url,
                  cookies=self.cookie,
                  callback=self._parse_homepage,
                  errback=self.parse_error)
def parse_item(self, response):
    mzitu = response.meta['mzitu']
    mzitu["url"] = response.url
    mzitu["tags"] = response.xpath(
        '//div[@class="main-tags"]/a/text()').extract()
    mzitu["title"] = response.xpath(
        '//h2[@class="main-title"]/text()').extract_first()
    mzitu["classify"] = response.xpath(
        '//div[@class="main-meta"]/span[1]/a/text()').extract_first()
    mzitu["publish_date"] = response.xpath(
        '//div[@class="main-meta"]/span[2]/text()').re(u'发布于 (.*)')[0]
    mzitu["visitors"] = int(
        response.xpath('//div[@class="main-meta"]/span[3]/text()').re(
            u'(\\d+[,\\d+]*)')[0].replace(",", ""))
    mzitu["pics"] = []
    img_urls = response.xpath(
        '//div[@class="main-image"]/descendant::img/@src').extract()
    for img_url in img_urls:
        mzitu["pics"].append(
            re.match("http://[^/]+/(.*)", img_url).group(1))
    # yield mzitu
    next_page = response.xpath(
        '//div[@class="pagenavi"]/a[last()]/@href').extract_first()
    non_re_next_page = re.match(r"http://www.mzitu.com/\d+$", next_page)
    if next_page == response.url or non_re_next_page:
        yield mzitu
    else:
        yield Request(url=urljoin(response.url, next_page),
                      meta={'mzitu': mzitu},
                      dont_filter=True,
                      callback=self.parse_next_page)
def start_requests(self):
    logging.debug("###### Meizitu spider is starting..... %s" % self)
    return [
        Request(url="http://www.meizitu.com/tag/nvshen_460_1.html",
                callback=self.parse,
                headers=self.user_header)
    ]
def parse(self, response):
    # print(response.body)
    for picdiv in response.css('div[class="pic"]'):
        image_urls = picdiv.css(
            'a[target="_blank"] img::attr(src)').extract_first()
        image_split = image_urls.split("/")
        image_name = image_split[-3] + image_split[-2] + image_split[-1]
        yield SaveGirlImageItem({
            # Name of this photo set.
            'name': MeiziTuSpider.__remove_html_tags(
                picdiv.css('a[target="_blank"] img::attr(alt)').extract()[0]),
            # Link to this photo set.
            'url': picdiv.css(
                'a[target="_blank"] img::attr(src)').extract_first(),
            'image_urls': [
                picdiv.css('a[target="_blank"] img::attr(src)').extract_first()
            ],
            'images': image_name
        })
    next_page = response.xpath(
        u'//div[@class="navigation"]//li/a[contains(.,"下一页")]/@href'
    ).extract_first()
    if next_page is not None:
        requesturl = "http://www.meizitu.com" + next_page
        yield Request(requesturl,
                      callback=self.parse,
                      headers=self.user_header)
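# `__remove_html_tags` is used above but not shown here. A minimal sketch, assuming
# it only strips markup from the image alt text; the regex-based approach is an
# illustrative assumption, not the original implementation.
@staticmethod
def __remove_html_tags(text):
    # Drop anything that looks like an HTML tag and trim surrounding whitespace.
    return re.sub(r'<[^>]+>', '', text).strip()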
def start_requests(self):
    reported_date = self.reported_date
    _, month, year = self.reported_date.split('.')
    for start_url in self.start_urls:
        yield Request(url=start_url.format(year, month, reported_date),
                      callback=self.parse)
def make_request_from_data_str(self, data_str):
    try:
        return Request(url=self.datastr_to_url(data_str),
                       meta={'id': int(data_str)},
                       dont_filter=False)
    except Exception as e:
        print(e)
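# `datastr_to_url` is referenced above but not defined in this snippet. A sketch of
# one plausible implementation, assuming `data_str` is a numeric id interpolated
# into a detail-page URL template; `self.url_template` is a hypothetical attribute
# used only for illustration.
def datastr_to_url(self, data_str):
    # e.g. self.url_template = 'https://example.com/item/{}'
    return self.url_template.format(data_str.strip())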
def parse_listings(self, response, **kwargs):
    if not isinstance(response, HtmlResponse):
        response = HtmlResponse(response.url,
                                body=response.body,
                                request=response.request)
    raw_movies = json.loads(response.text)['results'][0]
    if not raw_movies:
        return
    for page in range(1, raw_movies['nbPages']):
        listings_formdata = self.listings_formdata.copy()
        listings_formdata["page"] = str(page)
        form_data = {
            "requests": [{
                "indexName": "films",
                "params": urllib.parse.urlencode(listings_formdata)
            }]
        }
        yield Request(
            self.listings_api_url,
            method='POST',
            callback=self.parse_listings,
            body=json.dumps(form_data),
        )
    yield from self._requests_to_follow(response)
def parse(self, response):
    """Parse JD's top-level category page."""
    selectors = response.css('.category-items .col .category-item')
    logging.info(u'------------Number of first-level categories found on the '
                 u'homepage: {0}------------'.format(len(selectors)))
    url_count = 0
    for main_cat_sel in selectors:
        # First-level category name.
        first_cat = main_cat_sel.css(
            '.mt .item-title span::text').extract_first()
        if first_cat not in self.included_cat_list:
            continue
        logging.info(first_cat)
        # Find the second-level categories and, under each, the third-level
        # categories together with their goods-list pages.
        for items_sel in main_cat_sel.css('.mc div.items dl.clearfix'):
            # Second-level category name.
            second_cat = items_sel.css('dt a::text').extract_first()
            # Third-level category name and the link to its goods list.
            for item_sel in items_sel.css('dd a'):
                url_count += 1
                third_cat = item_sel.xpath('./text()').extract_first()
                url = item_sel.xpath('./@href').extract_first()
                req = Request(url=urlparse.urljoin(response.url, url),
                              callback=self.parse_good_list,
                              errback=self.errback_http,
                              dont_filter=True)
                req.meta[GIF.CATEGORY] = [first_cat, second_cat, third_cat]
                yield req
    logging.info(
        u'------------Number of third-level category links found: '
        u'{0}------------'.format(url_count))
def parse(self, response):
    for country, currency in zip(self.country.split(','),
                                 self.currency.split(',')):
        yield Request(url="{}country={}&currency={}".format(
            self.country_code_api, country, currency),
            dont_filter=True,
            callback=self.parse_start_page)
def parse_info(self, response):
    src = response.url
    title = response.css(".photoDetails h1").xpath(
        ".//text()").extract()[0].strip()
    link = response.css("#video-container iframe").xpath(
        "./attribute::src").extract()[0]
    cover = response.css("meta[property=og\\3a image]").xpath(
        "./attribute::content").extract()[0].strip()
    pdate = response.css(".post-info .post-date").extract()[0].strip()
    # print(response.css("meta[property=og\\3a description]").xpath("./attribute::content").extract()[0].strip())
    item = FuviItem()
    item["title"] = title
    item["sapo"] = ""
    item["cover"] = cover
    item["link"] = link
    item["src"] = src
    item["site"] = self.site
    item["catId"] = 1
    request = Request(link, callback=self.parse_item)
    request.meta["item"] = item
    yield request