def parse_artwork(self, response):
    """Extracts information from an artwork detail page."""
    # create a url version free of search query noise
    url_bits = urlparse.urlparse(response.url)
    url_bits = url_bits._replace(query='')
    clean_url = urlparse.urlunparse(url_bits)

    loader = ItemLoader(item=ArtworkItem(), response=response)
    loader.add_value('museum_code', self.name)
    loader.add_value('url', clean_url)
    loader.add_xpath('artist_name', '//div[@id="tombstone"]/p[1]/a/text()[1]')
    artist_url = response.xpath('//div[@id="tombstone"]/p[1]/a/@href')
    artist_url = urlparse.urljoin(response.url, artist_url.extract()[0])
    loader.add_value('artist_url', artist_url)
    loader.add_css('title', '#tombstone span:nth-of-type(1)::text')
    loader.add_xpath('thumbnail', '//div[@id="artwork-image"]/a/img/@src')
    loader.add_xpath('on_display', ON_DISPLAY_SELECTOR)
    item = loader.load_item()
    self.logger.info('Scraped ' + item['title'][0])
    yield item
def parse_item(self, response):
    sel = Selector(response)
    il = ItemLoader(item=Product(), response=response)
    cat = il.get_xpath('//div[contains(@id, "ctl00_pnlBreadCrumbs")]/a[last()]/text()')
    availability = il.get_xpath('//a[contains(@id,"hplddToCart") or contains(@class,"addToCart")]/text()')
    price = il.get_css('span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblRegprice > font::text')
    sale = il.get_css('span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblSaleprice > font::text')
    # If the XPath doesn't return a category, the product belongs to the Bundle category
    if not cat:
        il.add_value("category", "Bundle")
    else:
        il.add_value("category", cat)
    il.add_css("title", "span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblProdTitle::text")
    il.add_value("url", response.url)
    # If the product can be added to the cart it is available online; otherwise it is not
    if "ADD TO CART" in availability:
        il.add_value("availability", "Product is available online")
    else:
        il.add_value("availability", "Product is not available online")
    # If there's a sale price but no regular price, use the sale price as the
    # regular price, matching how the website displays it
    if not price:
        il.add_value("regPrice", sale)
        il.add_value("salePrice", None)
    else:
        il.add_value("regPrice", price)
        il.add_value("salePrice", sale)
    return il.load_item()
def parse_titles(self, response):
    # for post_title in response.css('div.entries > ul > li a::text').extract():
    #     yield {'title': post_title}
    l = ItemLoader(item=Product(), response=response)
    l.add_css('name', '#content > h1::text')
    l.add_css('consist', 'div.entries > ul > li a::text')
    return l.load_item()
def parse_item(self, response):
    l = ItemLoader(item=ZtArticleItem(), response=response)
    l.add_value('classId', '10')
    l.add_value('cataName', u'私募资讯')
    l.add_value('url', response.urljoin(response.url))
    l.add_css('title', '#artibodyTitle::text')
    l.add_css('seo_keywords', 'meta[name*=keywords]::attr(content)')
    l.add_css('seo_description', 'meta[name*=description]::attr(content)')
    # On Sina the publish time and source share one DOM node, and the source
    # markup varies, so handle it crudely
    source = response.css('.time-source span a::text').extract()
    if len(source) == 0:
        tmp = response.css('.time-source::text').extract()[0]
        tmp = tmp.replace(' ', '').replace('\n', '').replace('\t', '')
        l.add_value('author', tmp[16:])
        l.add_value('source', tmp[16:])
        l.add_value('publishTime', tmp[:16])
    else:
        l.add_value('author', source[0])
        l.add_value('source', source[0])
        l.add_value('publishTime', response.css('.time-source::text').extract()[0].replace(' ', '').replace('\n', '').replace('\t', ''))
    l.add_css('keywords', '.article-keywords a::text')
    # Sina exposes no view count, so use a random three-digit number
    l.add_value('views', randint(100, 999))
    l.add_css('image_urls', '#artibody img::attr(src)')
    # content = ''.join(response.xpath('//div[@id="artibody"]/*').extract())
    content = response.css('#artibody').extract()[0]
    l.add_value('content', content)
    yield l.load_item()
def parse_item(self, response):
    loader = ItemLoader(GaokaopaiZhiyeItem(), response)
    loader.add_value('url', response.url)
    loader.add_value('code', response.url, re=ur'-([^-]+)\.html')
    loader.add_css('name', u'.modTitle>h1::text')

    def parse_category():
        for e in response.css(u'.catType>a'):
            yield {
                'url': e.css('::attr(href)').extract_first(),
                'code': e.css('::attr(href)').re_first(ur'-([^-]+)\.html'),
                'name': e.css('::text').extract_first(),
            }

    loader.add_value('category', list(parse_category()))
    loader.add_css('detail', u'.zhiyeShow')
    item = loader.load_item()
    return FormRequest(
        url='http://www.gaokaopai.com/ajax-career-getRelateMajor.html',
        formdata={'code': item['code'][0]},
        meta={'item': item},
        dont_filter=True,
        callback=self.parse_majors
    )
def parse(self, response):
    l = ItemLoader(item=Product(), response=response)
    l.add_xpath('name', '//div[@class="product_name"]')
    l.add_xpath('name', '//div[@class="product_title"]')
    l.add_xpath('price', '//p[@id="price"]')
    l.add_css('stock', 'p#stock')  # the selector had a stray ']' that broke it
    l.add_value('last_updated', 'today')
    return l.load_item()
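# The two add_xpath('name', ...) calls above accumulate values into the same
# field; what ends up in the item depends on the field's processors. A minimal
# sketch of a Product item that would pair with this loader -- the processor
# choices here are assumptions for illustration, not taken from the snippet:
import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst

class Product(scrapy.Item):
    name = scrapy.Field(
        input_processor=MapCompose(str.strip),  # clean each extracted fragment
        output_processor=TakeFirst(),           # keep the first non-empty match
    )
    price = scrapy.Field(output_processor=TakeFirst())
    stock = scrapy.Field(output_processor=TakeFirst())
    last_updated = scrapy.Field(output_processor=TakeFirst())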
def parse_item(self, response):
    loader = ItemLoader(EolZhiyeItem(), response)
    loader.add_value('url', response.url)
    loader.add_value('code', response.url, re=r'/(\w+)\.shtml')
    loader.add_css('name', 'h1#pagetitle::text')
    loader.add_xpath('category', u'//div[@id="precontent"]/p[contains(., "行业")]/a/text()')
    loader.add_xpath('category2', u'//div[@id="precontent"]/p[contains(., "职业")]/a/text()')
    loader.add_xpath('detail', u'//div[@id="precontent"]/following-sibling::node()[not(self::table)]', Join('\n'))
    yield loader.load_item()
def parse(self, response):
    wines = response.css('table.table > tr')
    for wine in wines:
        loader = ItemLoader(item=WineDeciderItem(), selector=wine)
        loader.add_css('maturity', css='td.cbResultSetTableCell > img::attr(src)')
        loader.add_css('winery_name', css='td.cbResultSetTableCell > h3::text')
        loader.add_css('wine_name', css='td.cbResultSetTableCell::text')
        loader.add_css('vintage', css='td:nth-child(3) h3::text')
        loader.add_css('mark', css='td:nth-child(5) ::text')
        yield loader.load_item()
def parse(self, response):
    loader = ItemLoader(item=RecipeItem(), selector=response)
    loader.add_css('recipe_name', css='div.container h1.main-title span ::text')
    loader.add_css('recipe_image', css='div.container div.product-area div.image ::attr(src)')
    ingredient_loader = loader.nested_css('div.ingredient li')
    ingredient_loader.add_css('recipe_ingredients', css='::text')
    prepa_loader = loader.nested_css('div.preparation li')
    prepa_loader.add_css('recipe_prepa', css='div > p ::text')
    yield loader.load_item()
def startProductLoader(self, response):
    productLoader = ItemLoader(item=Product(), response=response)
    productLoader.add_css('name', 'h3.product_name a')
    productLoader.add_css('category', 'div.product_category a')
    productLoader.add_css('image_urls', 'img.image_main::attr(src)')
    productLoader.add_css('price', 'span.price')
    productLoader.add_css('slug', 'h3.product_name a::attr(href)')
    productLoader.add_value('label', 'primary')
    productLoader.add_value('description', '')
    return productLoader.load_item()
def parse(self, response):
    for result in response.css("div.search-results-listing"):
        l = ItemLoader(DownloaddmppdfsItem(), result)
        l.add_css('name', 'a.result-title > strong::text')
        # this was add_xpath with a CSS selector; add_css is what was meant
        l.add_css('file_urls', 'a.result-title::attr(href)')
        # XPath positions are 1-based, and the paths must be relative ('.//')
        # to stay scoped to the current result
        l.add_xpath('summary', './/span[@class="result-text"][1]')
        l.add_xpath('date_delivered', './/span[@class="result-text"][2]')
        l.add_xpath('parties', './/span[@class="result-text"][3]')
        l.add_xpath('tenement', './/span[@class="result-text"][4]')  # you can also use literal values
        yield l.load_item()
def parse_item(self, response):
    i = ItemLoader(item=Annonce(), response=response)
    i.add_value('url', response.url)
    i.add_css('titre', 'header h1::text')
    i.add_css('prix', '.item_price .value::text')
    i.add_css('date', 'section.properties p.line::text')
    i.add_css('description', '.properties_description p.value::text')
    i.add_css(
        'tag',
        '.line h2:not(.item_price) span::text, .line h2:not(.item_price) span a::text'
    )
    return i.load_item()
def parse_item(self, response):
    loader = ItemLoader(item=Chapter(), response=response)
    loader.default_output_processor = TakeFirst()
    loader.add_css('content', '#content')
    loader.add_css('title', '.bookname h1::text')
    loader.add_xpath('book_name', '//div[@class="con_top"]/a[3]/text()')
    loader.add_value('url', response.url)
    loader.add_value('chapter_id', response.url)
    loader.add_value('crawl_time', datetime.datetime.now())
    loader.add_value('number', response.meta['number'])
    yield loader.load_item()
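# 'chapter_id' above is loaded straight from response.url, which suggests the
# Chapter item pulls the numeric id out with an input processor. A minimal
# sketch under that assumption (the URL pattern here is hypothetical):
import re
import scrapy
from scrapy.loader.processors import MapCompose

def extract_chapter_id(url):
    # e.g. '.../book/12/34567.html' -> '34567'
    match = re.search(r'/(\d+)\.html', url)
    return match.group(1) if match else url

class Chapter(scrapy.Item):
    content = scrapy.Field()
    title = scrapy.Field()
    book_name = scrapy.Field()
    url = scrapy.Field()
    chapter_id = scrapy.Field(input_processor=MapCompose(extract_chapter_id))
    crawl_time = scrapy.Field()
    number = scrapy.Field()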
def parse(self, response):
    l = ItemLoader(item=NewsParagraph(), response=response)
    l.add_xpath('title', '/html[1]/body[1]/div[2]/div[1]/div[3]/div[1]/div[2]/h3[1]/text()')
    l.add_xpath('date', '/html[1]/body[1]/div[2]/div[1]/div[3]/div[1]/div[2]/h3[1]/div[1]/text()')
    l.add_css('paragraph_text', 'div.clear > p::text')
    l.add_value('last_updated', datetime.today())
    rv = l.load_item()
    return rv
def parse(self, response):
    l = ItemLoader(item=AmazonItem(), response=response)
    # extracting info
    l.add_css('product_name', '.a-color-base.a-text-normal::text')
    l.add_css('author', '.a-size-base.a-link-normal:nth-child(2)::text')
    l.add_css('price', '.a-offscreen')
    l.add_css("image_link", '.s-image::attr(src)')
    l.add_css('stars', ".a-icon-alt::text")
    yield l.load_item()

    next_page = response.css('li.a-last ::attr(href)').extract_first()
    # guard against the last page, which has no "next" link
    if next_page:
        next_page = response.urljoin(next_page)
        yield scrapy.Request(url=next_page, callback=self.parse)
def parseEpisode(self, response):
    print("parsing " + self.nprPodcastName + " episode")
    subsections = response.css('h3.rundown-segment__title a::attr(href)')
    for url in subsections.extract():
        # pass the response via the keyword; the second positional argument
        # of ItemLoader is the selector, not the response
        l = ItemLoader(PodcastTranscriptsItem(), response=response)
        l.add_css('episode_title', 'div.title-description h1')
        request = scrapy.Request(url,
                                 callback=self.parseSubsection,
                                 meta={'itemLoader': l},
                                 dont_filter=True,
                                 priority=75)
        yield request
def parse_ads(self, response: HtmlResponse):
    loader = ItemLoader(item=AvitoparserItem(), response=response)
    loader.add_css('title', 'div.AdvertCard_advertTitle__1S1Ak::text')
    loader.add_xpath('photos', '//figure[@class="PhotoGallery_photo__36e_r"]//source/@srcset')
    loader.add_xpath('price', '//div[@class="AdvertCard_price__3dDCr AdvertCard_topAdvertHeaderCommon__2zUjb rouble"]/text()')
    yield loader.load_item()
    # title = response.css('h1.title-info-title span.title-info-title-text::text').extract_first()
    # photos = response.xpath('//div[contains(@class, "gallery-img-wrapper")]//div[contains(@class, "gallery-img-frame")]/@data-url').extract()
    # print(title, photos)
    # yield AvitoparserItem(title=title, photos=photos)
def parse_item(self, response):
    houses = response.xpath('//div[@class="f-main-list"]//div[@class="f-list-item ershoufang-list"]')
    for house in houses:
        l = ItemLoader(item=GanjiItem(), selector=house)
        l.add_css('title', 'dd.dd-item.title a::text')
        l.add_css('size', 'dd.dd-item.size span::text')
        l.add_css('address', 'dd.dd-item.address span ::text')
        l.add_css('feature', 'dd.dd-item.feature span::text')
        l.add_css('info', 'dd.dd-item.info div ::text')
        yield l.load_item()
def parse_ads(self, response: HtmlResponse):
    loader = ItemLoader(item=AvitoparserItem(), response=response)
    loader.add_xpath('photos', '//div[contains(@class, "gallery-img-wrapper")]//div[contains(@class, "gallery-img-frame")]/@data-url')
    loader.add_css('name', 'h1.title-info-title span.title-info-title-text::text')
    loader.add_xpath('price', '//span[@class="js-item-price"]/@content')
    loader.add_xpath('currency', '//span[@class="price-value-prices-list-item-currency_sign"]/@content')
    loader.add_xpath('car_brand', '//li[contains(@class, "item-params-list-item")][1]/text()')
    loader.add_xpath('car_model', '//li[contains(@class, "item-params-list-item")][2]/text()')
    loader.add_xpath('modification', '//li[contains(@class, "item-params-list-item")][4]/text()')
    loader.add_xpath('year', '//li[contains(@class, "item-params-list-item")][5]/text()')
    loader.add_xpath('mileage', '//li[contains(@class, "item-params-list-item")][6]/text()')
    loader.add_xpath('num_doors', '//li[contains(@class, "item-params-list-item")][11]/text()')
    loader.add_xpath('engine_type', '//li[contains(@class, "item-params-list-item")][12]/text()')
    loader.add_xpath('transmission', '//li[contains(@class, "item-params-list-item")][13]/text()')
    loader.add_xpath('drive', '//li[contains(@class, "item-params-list-item")][14]/text()')
    loader.add_xpath('rudder', '//li[contains(@class, "item-params-list-item")][15]/text()')
    loader.add_xpath('color', '//li[contains(@class, "item-params-list-item")][16]/text()')
    loader.add_xpath('place_inspection', '//li[contains(@class, "item-params-list-item")][18]/text()')
    loader.add_value('link', response.url)
    yield loader.load_item()
def parse_detail(self, response):
    article_item = JobBoleArticleItem()
    # extract the article's individual fields
    front_image_url = response.meta.get('front_image_url', '')
    title = response.css('.entry-header h1::text').extract()[0]
    create_date = response.css('.entry-meta-hide-on-mobile::text').extract()[0].strip()[0:10]
    praise_nums = int(response.css('.vote-post-up h10::text').extract()[0])
    fav_num = response.css('.btn-bluet-bigger.href-style.bookmark-btn.register-user-only::text').extract()[0]
    match_re = re.match('.*?(\d+).*?', fav_num)
    if match_re:
        fav_num = int(match_re.group(1))
    else:
        fav_num = 0
    com_num = response.css('.btn-bluet-bigger.href-style.hide-on-480 ::text').extract()[0]
    match_re = re.match('.*?(\d+).*?', com_num)
    if match_re:
        com_num = int(match_re.group(1))
    else:
        com_num = 0
    content = response.css('.entry').extract()[0]
    tag_list = response.css('.entry-meta-hide-on-mobile a::text').extract()
    tags = ','.join(tag_list)

    article_item['url_object_id'] = get_md5(response.url)
    article_item['title'] = title
    article_item['url'] = response.url
    try:
        create_date = datetime.datetime.strptime(create_date, '%Y/%m/%d').date()
    except Exception:
        create_date = datetime.datetime.now().date()
    article_item['create_date'] = create_date
    article_item['front_image_url'] = {front_image_url}
    article_item['praise_nums'] = praise_nums
    article_item['comment_nums'] = com_num
    article_item['fav_nums'] = fav_num
    article_item['tags'] = tags
    article_item['content'] = content

    # loading the item via an ItemLoader instead (demonstration only; this
    # loader's result is never yielded)
    item_loader = ItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css('title', '.entry-header h1::text')
    item_loader.add_value('url', response.url)

    yield article_item
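# get_md5 above comes from a helper module that isn't shown; a minimal sketch
# of the usual implementation, assuming it hashes the URL into a stable id:
import hashlib

def get_md5(url):
    if isinstance(url, str):
        # hashlib needs bytes, so encode text URLs first
        url = url.encode('utf-8')
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()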
def parsepage(self, res):
    """This function parses a movie page.

    @url https://yts.mx/browse-movies
    @returns items 1
    @scrapes movie_title release_year genre imdb_rating
    @scrapes rating_count criticts audience magnet_link
    """
    try:
        cr = res.css("div.rating-row > span::text").re(r"\d+%")[0]
        au = res.css("div.rating-row > span::text").re(r"\d+%")[1]
    except IndexError:
        cr = None
        au = None
    # Create the loader using the response
    movie_loader = ItemLoader(item=YtsItem(), response=res)
    movie_loader.default_output_processor = TakeFirst()
    # Load fields using CSS and XPath expressions
    movie_loader.add_css(
        "movie_title",
        "div.row > div#movie-info.col-xs-10.col-sm-14.col-md-7.col-lg-8.col-lg-offset-1 > div.hidden-xs > h1::text",
        MapCompose(str.strip, str.title),
    )
    movie_loader.add_css(
        "release_year",
        "div.row > div#movie-info.col-xs-10.col-sm-14.col-md-7.col-lg-8.col-lg-offset-1 > div.hidden-xs > h2::text",
    )
    movie_loader.add_xpath(
        "genre",
        "/html/body/div[4]/div[3]/div[1]/div[4]/div[1]/h2[2]/text()",
    )
    movie_loader.add_css("imdb_rating", 'div.rating-row > span[itemprop="ratingValue"]::text')
    movie_loader.add_css("rating_count", 'div.rating-row > span[itemprop="ratingCount"]::text')
    movie_loader.add_value("criticts", cr)
    movie_loader.add_value("audience", au)
    movie_loader.add_css(
        "magnet_link",
        "div.modal-torrent a.magnet-download.download-torrent.magnet::attr(href)",
    )
    return movie_loader.load_item()
def parse(self, response):
    self.log(f"I just visited {response.url}")
    for article in response.css("article.product_pod"):
        # item loader initialization
        item_loader = ItemLoader(item=BookDataItemLoaderItem(), selector=article)
        BookDataItemLoaderSpider.count += 1
        item_loader.add_value('item_number', BookDataItemLoaderSpider.count)
        item_loader.add_css('title', "h3 > a::attr(title)")
        item_loader.add_css('price', "p.price_color::text")
        item_loader.add_css('stars', "article > p::attr(class)")
        item_loader.add_css('thumbnail_path', "div > a > img::attr(src)")
        item_loader.add_css('detailed_book_url', "div > a::attr(href)")
        # crawl detailed book page
        detailed_book_url = article.css("div > a::attr(href)").get()
        if detailed_book_url:
            # parse the detailed book page with an appropriate parse method
            yield response.follow(
                url=detailed_book_url,
                callback=self.parse_detailed_book_url,
                # send some meta data to the parse_detailed_book_url method
                meta={'item': item_loader.load_item()},
                dont_filter=True)
        else:
            yield item_loader.load_item()
    # move to following pages
    next_page_url = response.css("li.next > a::attr(href)").get()
    if next_page_url:
        yield response.follow(url=next_page_url, callback=self.parse)
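# parse_detailed_book_url is referenced above but not included here; a minimal
# sketch of what it might do, assuming the detail page contributes a
# description (the selector matches books.toscrape.com's layout, and
# 'description' is a hypothetical field on the item):
def parse_detailed_book_url(self, response):
    item = response.meta['item']
    item['description'] = response.css('#product_description ~ p::text').get()
    yield item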
def parse(self, response):
    # self.logger.info('hello this is my first spider')
    self.logger.info('Parse function called on {}'.format(response.url))
    quotes = response.css('div.quote')
    for quote in quotes:
        loader = ItemLoader(item=QuoteItem(), selector=quote)
        loader.add_css('quote_content', '.text::text')
        loader.add_css('tags', '.tag::text')
        quote_item = loader.load_item()
        author_url = quote.css('.author + a::attr(href)').get()
        yield response.follow(author_url, self.parse_author, meta={'quote_item': quote_item})
    for a in response.css('li.next a'):
        yield response.follow(a, self.parse)
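# parse_author is not defined in this snippet; a sketch under the assumption
# that it merges author details from quotes.toscrape.com's author pages into
# the quote item ('author_name' and 'author_birthday' are illustrative fields):
def parse_author(self, response):
    quote_item = response.meta['quote_item']
    quote_item['author_name'] = response.css('.author-title::text').get()
    quote_item['author_birthday'] = response.css('.author-born-date::text').get()
    yield quote_item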
def parse_item(self, response):
    sel = response.css("div.path")
    loader = ItemLoader(item=SeriesItem(), selector=sel)
    loader.add_css("series_id", "a:last-child::attr(href)")
    loader.add_css("series_name", "a:last-child::text")
    series = loader.load_item()
    print(series)
    # upcoming & currently on sale
    for sel in response.css("div.interval01-list-cars-infor"):
        loader = ItemLoader(item=ModelItem(), selector=sel)
        loader.add_css("model_id", "a::attr(href)")
        loader.add_css("model_name", "a::text")
        loader.add_value("series_id", series['series_id'])
        loader.add_value("series_name", series['series_name'])
        yield loader.load_item()
    # discontinued models
    url = "http://www.autohome.com.cn/ashx/series_allspec.ashx"
    years = response.css(".dropdown-content a::attr(data)")
    for year in years.extract():
        qs = {
            "y": year,
            "s": series["series_id"]
        }
        yield Request(url + "?" + urlencode(qs), self.stop_sale)
def parse(self, response, **kwargs):
    loader = ItemLoader(item=ReservaItem(), selector=response)
    loader.add_value('url_item', response.url)
    loader.add_css('name', '.item-page > table:nth-child(3) > tbody:nth-child(1) > tr:nth-child(1) > td:nth-child(2) > p:nth-child(2)')
    loader.add_css('biome', '.item-page > table:nth-child(3) > tbody:nth-child(1) > tr:nth-child(1) > td:nth-child(2) > p:nth-child(3)::text')
    loader.add_css('size_area', '.item-page > table:nth-child(3) > tbody:nth-child(1) > tr:nth-child(1) > td:nth-child(2) > p:nth-child(4)::text')
    loader.add_css('unity_created_at', '.item-page > table:nth-child(3) > tbody:nth-child(1) > tr:nth-child(1) > td:nth-child(2) > p:nth-child(5)::text')
    detail = response.xpath('/html/body/div[2]/main/div/div/div/section/div/div[1]/table/tbody/tr[1]/td[2]/p[5]/text()').extract()
    loader.add_value('regional_administration', detail[0])
    loader.add_value('address', detail[1])
    loader.add_value('phones', detail[2])
    yield loader.load_item()
def load_lesson(self, selector):
    lesson_loader = ItemLoader(items.Lesson(), selector)
    name = selector.xpath('.//span[@itemprop="name"]/text()').extract_first()
    url = selector.xpath('.//link[@itemprop="contentUrl"]/@href').extract_first()
    extension = url.split('.')[-1]
    filename = f'{name}.{extension}'
    lesson_loader.add_value('name', name)
    lesson_loader.add_value('file_urls', url)
    lesson_loader.add_value('filename', filename)
    lesson_loader.add_css('duration', 'em.lessons-list__duration::text')
    return lesson_loader.load_item()
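# 'file_urls' plus the custom 'filename' field suggests this project overrides
# FilesPipeline to name downloads after the lesson. A sketch under that
# assumption (the item-aware file_path signature requires Scrapy >= 2.4):
from scrapy.pipelines.files import FilesPipeline

class LessonFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # store each download as '<name>.<extension>' instead of a URL hash
        return item['filename']

# settings.py would then enable it along with a storage path, e.g.:
# ITEM_PIPELINES = {'myproject.pipelines.LessonFilesPipeline': 1}
# FILES_STORE = 'downloads'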
def parse_question(self, response):
    # handle the question page
    match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if match_obj:
        question_id = int(match_obj.group(2))
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        # title
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        # question body
        item_loader.add_css("content", ".QuestionHeader-detail")
        # question url
        item_loader.add_value("url", response.url)
        # question id
        item_loader.add_value("zhihu_id", question_id)
        # answer count
        item_loader.add_xpath("answer_num", "//h4[@class='List-headerText']/span//text()")
        # comment count
        item_loader.add_css("comment_num", ".QuestionHeader-Comment button::text")
        # follower and view counts
        item_loader.add_xpath("watch_user_num", "//strong[@class='NumberBoard-itemValue']/text()")
        # topics
        item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
        question_item = item_loader.load_item()
        yield scrapy.Request(self.start_answer_url.format(str(question_id), 20, 0), callback=self.parse_answer)
        yield question_item
def parse_product(self, response):
    meta = response.meta
    product_loader = ItemLoader(ProductItem(), selector=response)
    url = response.url
    article_id = self.extract_id_from_url(url)
    if not article_id:
        return
    product_loader.add_value('article_id', article_id)
    meta.update({'product_id': article_id})
    meta.update({'product_url': url})
    product_loader.add_value('url', url)
    product_loader.add_value('list_page', meta.get('list_page', ''))
    product_loader.add_css('images', '.lh li img::attr(src)')
    product_loader.add_css('title', '.sku-name::text')
    product_loader.add_css('currency', '.p-price span::text')
    product_loader.add_css('description', '.parameter2.p-parameter-list *::text')
    vendor_id = self.get_vendor_id(response)
    product_loader.add_value('vendor_id', vendor_id)
    product = product_loader.load_item()
    yield product
    yield self.get_stock_and_price(product)
    yield response.follow(
        self.review_url.format(article_id, '0', self.comments_request_length),
        callback=self.parse_review, meta=meta)
def parse(self, response):
    # page = response.url.split("/")[-2]
    # filename = 'quotes-%s.html' % page
    # with open(filename, 'wb') as f:
    #     f.write(response.body)
    # self.log('Saved file %s' % filename)
    for producteRAW in response.css('div.products article.product-miniature'):
        loader = ItemLoader(item=Producte(), selector=producteRAW)
        loader.add_css('nom', 'h2.product-title > a::text')
        loader.add_css('url', 'h2.product-title > a::attr(href)')
        loader.add_css('preu', 'div.product-price-and-shipping span.price::text')
        loader.add_css('preu_original', 'div.product-price-and-shipping span.regular-price::text')
        agotado = producteRAW.css('ul.product-flags li.agotado').get()
        if agotado is not None:
            loader.add_value('stock', 'Agotado')
        else:
            loader.add_value('stock', 'Disponible')
        producte = loader.load_item()
        producte['botiga'] = 'Jugamosotra'
        yield producte
    # FOLLOWING PAGES
    for next_page in response.css('ul.page-list a[rel=next]::attr(href)'):
        # print("next!")
        yield response.follow(next_page, self.parse)
def parse_questions(self, response):
    q_item = ItemLoader(item=ZhihuQuestionItem(), response=response)
    q_id = response.meta['question_id']
    q_item.add_value('q_id', q_id)
    q_item.add_value('q_url', response.url)
    q_item.add_css('q_title', '.QuestionHeader-tags+h1::text')
    q_item.add_css('q_content', '.QuestionRichText.QuestionRichText--collapsed span::text')
    q_item.add_css('q_topic', '.Tag.QuestionTopic .Popover div::text')
    q_item.add_css('q_answers_num', '.List-headerText span::text')
    q_item.add_xpath('q_follower', '//div[@class="NumberBoard QuestionFollowStatus-counts NumberBoard--divider"]/button//strong/text()')
    q_item.add_xpath('q_watcher', '//div[@class="NumberBoard QuestionFollowStatus-counts NumberBoard--divider"]/div//strong/text()')
    q_item.add_value('crawl_time', datetime.datetime.now())
    question_item = q_item.load_item()
    answers_url = self.temp_answers_url[0].format(q_id, 15, 0)
    yield Request(url=answers_url, callback=self.parse_answers, headers=self.headers, meta={'q_id': q_id})
    yield question_item
def parse(self, response):
    productos = response.css('div.item__info')
    for producto in productos:
        # titulo = producto.css('a.name::text')
        # url = producto.xpath('//div[contains(@class,"detail")]/a[contains(@class,"image")]/img[contains(@id,"gImg")]/@src')
        producto_loader = ItemLoader(item=ProductoMercadoLibre(), selector=producto)
        producto_loader.default_output_processor = TakeFirst()
        producto_loader.add_css('titulo', 'div>h2.item__title>a.item__info-title>span.main-title::text')
        producto_loader.add_css('precio', 'div.price__container>div.item__price')
        producto_loader.add_css('vendidos', 'div.item__stack_column>div.item__stack_column__info>div.stack_column_item>div.item__status>div.item__condition::text')
        producto_loader.add_css('lugar', 'div.item__stack_column>div.item__stack_column__info>div.stack_column_item>div.item__status>div.item__condition::text')
        yield producto_loader.load_item()
def parse(self, response):
    for news in response.css('.cg-research-article-link'):
        # the loader works off the per-article selector, so also passing a
        # response is redundant (it is ignored when a selector is given)
        loader = ItemLoader(item=TestprojItem(), selector=news)
        loader.add_css('_title', '.cg-research-article-title::text')
        loader.add_css('_abstract', '.cg-research-article-excerpt::text')
        # No views tracked
        # loader.add_css('_views', '.body li:nth-child(1)::text')
        loader.add_css('_author', '.name::text')
        loader.add_css('_image', '.cg-research-article-image::attr(style)')
        loader.add_css('_date', '.dotlist span::text')
        loader.add_value('_source', response.url)
        yield loader.load_item()
def parse_detail(self, response):
    # TODO: Test interactively
    # from scrapy.shell import inspect_response
    # inspect_response(response, self)
    loader = ItemLoader(item=FreeproxyItem(), response=response, url=self.start_urls[0])
    # for element in response.css("table tr")[1:]:
    loader.add_xpath("area", "//tr/td[1]/img/@alt", MapCompose(lambda x: x.lower()))
    loader.add_css("ip", "table > tr > td:nth-of-type(2)::text")
    loader.add_css("port", "table > tr > td:nth-of-type(3)::text")
    loader.add_css("ssl", "table > tr > td:nth-of-type(6)::text")
    loader.add_css("security", "table > tr > td:nth-of-type(5)::text", MapCompose(self.__fix_security))
    items = loader.load_item()
    yield items

    next_href = response.xpath("//div/a[@class='next_page']/@href").extract_first()
    # guard against the last page, which has no next link
    if next_href:
        yield scrapy.Request(self.start_urls[0] + next_href, callback=self.parse_detail)
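# __fix_security is a private helper on the spider that isn't shown; a
# plausible sketch, assuming it only normalizes the anonymity label in that
# column (the exact cleanup rule is a guess):
def __fix_security(self, value):
    # collapse whitespace and lowercase, e.g. ' High  Anonymous ' -> 'high anonymous'
    return ' '.join(value.split()).lower()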
def parse_item(self, response):
    l = ItemLoader(item=response.meta['item'], response=response)
    l.add_value('classId', '18')
    l.add_value('cataName', u'私募股权资讯')
    l.add_value('url', response.urljoin(response.url))
    l.add_css('title', 'h1::text')
    keywords = response.css('meta[name*=eywords]::attr(content)').extract()[0]
    keywordsList = keywords.split(',')
    while '' in keywordsList:
        keywordsList.remove('')
    l.add_value('keywords', keywordsList)
    l.add_value('seo_keywords', keywords)
    description = response.css('.news-show .subject::text').extract()
    l.add_value('description', description)
    l.add_value('seo_description', description)
    l.add_value('publishTime', response.css('.date::text').extract()[0])
    tmp = response.css('.news-show .box-l::text').extract()[0].split(u'\u3000')
    while '' in tmp:
        tmp.remove('')
    l.add_value('source', tmp[0].replace(' ', ''))
    l.add_value('author', tmp[1].replace(' ', '') if tmp[1] != u'\u3000' else '')
    # pedaily's view count is loaded via AJAX, so it is not worth scraping
    # views = response.css('#HitsText::text').extract()[0].replace(u'阅读:', '')
    # l.add_value('views', views)
    # list.append() returns None, so collect the URLs first and then append
    image_urls = response.css('#news-content img::attr(src)').extract()
    image_urls.append(response.meta['item']['image_urls'])
    l.add_value('image_urls', image_urls)
    content = response.css('#news-content').extract()[0]
    # ad removal (disabled)
    # content = content.replace(response.css('.visible-lg-block.visible-md-block').extract()[0], '')
    # replace anchor tags with their text
    atags = response.css('#news-content a').extract()
    atexts = response.css('#news-content a::text').extract()
    if len(atags) == len(atexts):
        for index, atag in enumerate(atags):
            content = content.replace(atag, atexts[index])
    l.add_value('content', content)
    yield l.load_item()
def parse_user(self, response, user_avatar_url=''):
    item = ItemLoader(item=User(), response=response)
    item.add_css(FULL_NAME, '.name-bio-message h3')
    item.add_css(USERNAME, '.profileusername')
    item.add_css(FOLLOWER, '.user-followers span')
    item.add_css(FOLLOWING, '.user-following span')
    user = dict(item.load_item())
    try:
        user[USER_ID] = int(response.url.split('/')[-1])
        user[FOLLOWER] = int(user[FOLLOWER])
        user[FOLLOWING] = int(user[FOLLOWING])
        user[TYPE] = USER
        user[USER_AVATAR_URL] = user_avatar_url
        yield user
        # yield Request(
        #     url=OWN_RECIPE_URL.format(user[USER_ID], 1),
        #     callback=self.parse_first_own_recipe
        # )
        # Followers
        if user[FOLLOWER]:
            for i in range(1, 1 + user[FOLLOWER]):
                yield Request(url=FOLLOW_URL.format(user[USER_ID], 'followers', i),
                              callback=self.parse_follow,
                              cb_kwargs=dict(user=user, f_type=FOLLOWER))
        # Followings
        if user[FOLLOWING]:
            for i in range(1, 1 + user[FOLLOWING]):
                yield Request(url=FOLLOW_URL.format(user[USER_ID], 'follows', i),
                              callback=self.parse_follow,
                              cb_kwargs=dict(user=user, f_type=FOLLOWING))
    except Exception as e:
        yield {TYPE: ERROR, URL: response.url, ERROR: str(e)}
def parse_item(self, response): """ This function parses a property page. :param response: :return: item """ # Create the loader using response l = ItemLoader(item=AllitebooksItem(), response=response) # Load primary fields using css expressions l.add_css('title', '.single-title::text', MapCompose(str.strip)) l.add_css('cover', '.entry-body-thumbnail>a>img::attr(src)') book_details = response.css('.book-detail>dl>dd::text').extract() author_list = response.css( '.book-detail>dl>dd:nth-child(2)>a::text').extract() category_list = response.css( '.book-detail>dl>dd:nth-child(16)>a::text').extract() author = ','.join(author_list) category = ','.join(category_list) book_details = book_details[len(author_list):(-len(category_list))] l.add_value('author', author, MapCompose(str.strip)) l.add_value('category', category, MapCompose(str.strip)) item_name = "isbn year pages language file_size file_format".split() for index, value in enumerate(item_name): l.add_value(value, book_details[index], MapCompose(str.strip)) l.add_css('description', '.entry-content') l.add_css('download', 'span.download-links>a::attr(href)', MapCompose(str.strip), TakeFirst()) # Housekeeping fields l.add_value('url', response.url) l.add_value('spider', self.name) l.add_value('date', datetime.datetime.now()) yield l.load_item()
def vacansy_parse(self, response):
    # name = response.css('div._3mfro.CuJz5.PlM3e._2JVkc._3LJqf::text').extract_first()
    # url_vacancy = response.url
    # min_salary = response.css('span._3mfro._2Wp8I.ZON4b.PlM3e._2JVkc::text').extract()
    # sourse = 'superjob.ru'
    # yield JobparserItem(name=name, url_vacancy=url_vacancy, min_salary=min_salary,
    #                     max_salary=None, source=sourse)
    loader = ItemLoader(item=JobparserItem(), response=response)
    # chained classes need dots, not spaces, so they match a single element
    loader.add_css('name', 'div._3mfro.CuJz5.PlM3e._2JVkc._3LJqf::text')
    loader.add_value('url_vacancy', response.url)
    loader.add_css('min_salary', 'span._3mfro._2Wp8I.ZON4b.PlM3e._2JVkc::text')
    loader.add_value('max_salary', None)
    loader.add_value('source', 'superjob.ru')
    # the loaded item was never emitted in the original; yield it
    yield loader.load_item()
def parse_question(self, response): #处理question页面, 从页面中提取出具体的question item if "QuestionHeader-title" in response.text: #处理新版本 match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url) if match_obj: question_id = int(match_obj.group(2)) item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response) item_loader.add_css("title", "h1.QuestionHeader-title::text") item_loader.add_css("content", ".QuestionHeader-detail") item_loader.add_value("url", response.url) item_loader.add_value("zhihu_id", question_id) item_loader.add_css("answer_num", ".List-headerText span::text") item_loader.add_xpath( "comments_num", "//*[@class='QuestionHeader-Comment']/button/text()") item_loader.add_xpath( "watch_user_num", "//*[@class='QuestionHeader-follow-status']/div/div/button/div/strong/text()" ) item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text") question_item = item_loader.load_item() yield scrapy.Request(self.start_answer_url.format(question_id, 3, 0), callback=self.parse_answer) yield question_item
def parse_item(self, response):
    loader = ItemLoader(GaokaopaiZhuanyeItem(), response)
    loader.add_value('url', response.url)
    loader.add_css('name', u'.majorTitle>h1::text')
    loader.add_xpath('code', u'//div[@class="majorBase"]/h3[starts-with(., "专业代码:")]/text()', re=ur':(.+)')
    loader.add_xpath('degree', u'//div[@class="majorBase"]/h3[starts-with(., "授予学位:")]/text()', re=ur':(.+)')
    loader.add_xpath('period', u'//div[@class="majorBase"]/h3[starts-with(., "修学年限:")]/text()', re=ur':(.+)')
    loader.add_xpath('courses', u'//div[@class="course"]/h3[.="开设课程:"]/following-sibling::p/text()')

    def parse_related():
        for e in response.xpath(u'//div[@class="course"]/h3[.="相近专业:"]/following-sibling::a'):
            yield {
                'url': e.css('::attr(href)').extract_first(),
                'code': e.css('::attr(href)').re_first(ur'-([^-]+)\.html'),
                'name': e.css('::text').extract_first(),
            }

    loader.add_value('related', list(parse_related()))

    def parse_category():
        category = []
        for i in [u"学历类别", u"学科门类", u"专业类别"]:
            x = u'//h3[.="{}"]/following-sibling::ul[1]/li[@class="current"]/a'.format(i)
            e = response.xpath(x)
            category.append({
                'url': e.css('::attr(href)').extract_first(),
                'code': e.css('::attr(href)').re_first(ur'/zhuanye([-0-9]*)\.html').strip('-'),
                'name': e.css('::text').extract_first(),
            })
        return category

    loader.add_value('category', parse_category())
    loader.add_css('detail', u'.majorCon')
    item = loader.load_item()
    return Request(
        url='http://www.gaokaopai.com/zhuanye-jiuye-{}.html'.format(item['code'][0]),
        meta={'item': item},
        callback=self.parse_jiuye
    )
def parse_question(self, response):
    question_id = response.meta.get('question_id', '')
    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css('title', '.QuestionHeader h1.QuestionHeader-title::text')
    item_loader.add_css('topics', '.QuestionTopic .Popover div::text')
    item_loader.add_css('content', '.QuestionHeader-detail span::text')
    item_loader.add_value('url', response.url)
    item_loader.add_value('zhihu_id', question_id)
    item_loader.add_css('answer_num', '.List-headerText span::text')
    item_loader.add_css('comments_num', '.QuestionHeader-Comment button::text')
    item_loader.add_css('watch_user_num', '.NumberBoard-itemValue::text')
    question_item = item_loader.load_item()
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers, callback=self.parse_answer)
    yield question_item
def parse(self, response):
    for outer in response.css('#comapreTable tr:not(:first-child)'):
        if outer.css('td[align="center"]'):
            ccode = outer.css('td[align="center"]>a::attr(id)').extract_first()
            cname = outer.css('td[align="center"]>a::text').extract_first()
        for inner in outer.xpath('td[div[@align="left"]/a]'):
            loader = ItemLoader(item=EolZhuanyeItem(), selector=inner)
            loader.add_value('ccode', ccode)
            loader.add_value('cname', cname)
            loader.add_css('url', 'a::attr(href)', lambda urls: urljoin(self.start_urls[0], urls[0]))
            loader.add_xpath('code', 'following-sibling::td[1]/text()', MapCompose(unicode.strip))
            loader.add_css('name', 'a::text', MapCompose(unicode.strip))
            item = loader.load_item()
            yield Request(url=item['url'][0], meta={'item': item}, callback=self.parse_item)
def parse_item(self, response):
    loader = ItemLoader(item=ShopItem(), response=response)
    loader.add_css('id', '#breadcrumbs .last-child::text')
    loader.add_value('url', response.url)
    loader.add_css('brand', '.productTitle>.brand_name>a::text')
    loader.add_css('name', '.productTitle>h1::text')
    loader.add_css('desc', 'p[itemprop="description"]')
    loader.add_css('cate', '#breadcrumbs span[itemprop="title"]::text', Join())
    loader.add_value('site', 'forzieri')
    loader.add_value('lang', self.lang)
    loader.add_value('time', time.time())
    url = urlparse.urljoin(
        self.start_urls[0],
        response.css('#scheda_tecnica_tab_trigger::attr(href)').extract_first(),
    )
    return Request(url, meta={'loader': loader}, callback=self.parse_item_ajax)
def parse(self, response):
    l = ItemLoader(item=UrbanusItem(), response=response)
    l.add_xpath('title', "//section[1]/div/div[2]/div[1]/h1")
    l.add_xpath('address', "//section[2]/div/div/p")
    l.add_xpath('photos', "//div[contains(@class, 'slide_gale')]/span/img/@src")
    l.add_value('url', response.url)
    properties = extract_properties(response)
    l.add_value('bedrooms', properties.bedrooms)
    l.add_value('bathrooms', properties.bathrooms)
    l.add_value('area_total', properties.area_total)
    l.add_value('area_constructed', properties.area_constructed)
    l.add_value('garage', properties.garage)
    l.add_css('price', "span.inmueble_price")
    l.add_css('description', "div.show_detail")
    return l.load_item()
def parse_item(self, response):
    loader = ItemLoader(item=ShopItem(), response=response)
    loader.add_value('id', response.url, re=r'/([0-9]+).htm\?')
    loader.add_value('url', response.url)
    loader.add_css('brand', '#product-information h1 .row[itemprop="brand"]::text', TakeFirst(), unicode.strip)
    loader.add_css('name', '#product-information h1 .row[itemprop="name"]::text', TakeFirst(), unicode.strip)
    loader.add_css('desc', 'div[itemprop="description"]', TakeFirst(), remove_tags, unicode.strip)
    loader.add_css('cate', '.breadcrumb-list .breadcrumb span::text', Join())
    loader.add_value('site', 'shopbop')
    loader.add_value('lang', self.lang)
    loader.add_value('time', time.time())
    return loader.load_item()
def parse_titles(self, response):
    loader = ItemLoader(item=BlogCategory(), response=response)
    loader.add_value('hub', response.meta['hname'])
    loader.add_css('title', 'div.company_post h1 span::text')
    loader.add_css('date', 'div.published::text')
    loader.add_css('article', 'div.content::text')
    yield loader.load_item()
def parse_detail(self, response):
    il = ItemLoader(NewsItem(), response=response)
    il.add_css("title", "%s::text" % self.title)
    il.add_css("date", "%s::text" % self.date)
    il.add_css("auth", "%s::text" % self.auth)
    il.add_css("content", "%s > p::text" % self.content)
    il.add_value("cate", response.meta["cate"])
    return il.load_item()
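# self.title, self.date, self.auth and self.content are CSS fragments defined
# per spider, so this one parse_detail can serve several news sites. A sketch
# of how a concrete spider might configure it -- the selector values here are
# illustrative assumptions:
import scrapy

class ExampleNewsSpider(scrapy.Spider):
    name = 'example_news'
    # per-site selector fragments consumed by parse_detail above
    title = 'h1.article-title'
    date = 'span.publish-date'
    auth = 'span.author'
    content = 'div.article-body'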
def parse_item(self, response):
    l = ItemLoader(item=ZtArticleItem(), response=response)
    l.add_value('classId', '51')
    l.add_value('cataName', u'公募要闻')
    l.add_value('url', response.urljoin(response.url))
    l.add_css('title', 'h1.title::text')
    # On this site the SEO keywords double as the tags and the description is
    # the abstract; the description starts with a full-width space (sloppy markup)
    l.add_css('keywords', 'meta[name*=keywords]::attr(content)')
    l.add_css('seo_keywords', 'meta[name*=keywords]::attr(content)')
    # strip the leading full-width space
    description = response.css('meta[name*=description]::attr(content)').extract()[0].replace(u'\u3000', '')
    l.add_value('description', description)
    l.add_value('seo_description', description)
    l.add_css('publishTime', '.timer::text')
    source = response.css('.source a::text').extract()
    # capture the source when one is present
    if len(source) == 0:
        tmp = response.css('.source::text').extract()[0]
        tmp = tmp.replace(u'来源:', '')
        l.add_value('source', tmp)
        # l.add_value('author', tmp)
    else:
        l.add_value('source', source[0])
    # no view count available, so use a random three-digit number
    l.add_value('views', randint(100, 999))
    l.add_css('image_urls', '#qmt_content_div p img::attr(src)')
    content = response.css('#qmt_content_div').extract()[0]
    # strip the ad block
    content = content.replace(response.css('.visible-lg-block.visible-md-block').extract()[0], '')
    l.add_value('content', content)
    yield l.load_item()
def parse_item(self, response):
    loader = ItemLoader(ChsiDaxueItem(), response)
    loader.add_value('id', response.url, re=ur'schId-(\w+)\.dhtml')
    loader.add_value('url', response.url)
    loader.add_css('logo', u'.r_c_sch_logo>img::attr(src)', MapCompose(lambda url: urljoin('http://gaokao.chsi.com.cn/', url)))
    loader.add_css('name', u'.topImg::text')
    loader.add_css('badges', u'.r_c_sch_attr .r_c_sch_icon::attr(title)')
    data_clean = MapCompose(lambda x: re.sub(r'\s+', ' ', x), unicode.strip)
    loader.add_xpath('type', u'//span[@class="f_bold" and .="院校类型:"]/following-sibling::text()', data_clean)
    loader.add_xpath('membership', u'//span[@class="f_bold" and .="院校隶属:"]/following-sibling::text()', data_clean)
    loader.add_xpath('province', u'//span[@class="f_bold" and span]/following-sibling::text()', data_clean)
    loader.add_xpath('address', u'//span[@class="f_bold" and .="通讯地址:"]/following-sibling::text()', data_clean)
    loader.add_xpath('phone', u'//span[@class="f_bold" and .="联系电话:"]/following-sibling::text()', data_clean)
    loader.add_xpath('website', u'//span[@class="f_bold" and .="学校网址:"]/following-sibling::a/@href', data_clean)
    loader.add_xpath('backdoor', u'//span[@class="f_bold" and .="特殊招生:"]/following-sibling::text()', data_clean)

    def parse_votes():
        xpath = u'//td[@class="tdMydT" and .="{}"]/following-sibling::td/div[@class="rank"]/@rank'
        get_vote = lambda what: float(response.xpath(xpath.format(what)).extract_first() or 0)
        return {
            'overall': get_vote(u'综合满意度'),
            'environment': get_vote(u'校园环境满意度'),
            'life': get_vote(u'生活满意度'),
        }

    loader.add_value('votes', parse_votes())

    def parse_trending():
        css = u'{}>table tr:not(:first-child)'

        def get_trending(what):
            majors = []
            for e in response.css(css.format(what)):
                majors.append({
                    'id': e.css(u'.tdZytjTDiv>a::attr(href)').re_first(r'specId=(\w+)'),
                    'name': e.css(u'.tdZytjTDiv::attr(title)').extract_first(),
                    'vote': float(e.css(u'.avg_rank::text').extract_first()),
                    'count': int(e.css(u'.c_f00::text, .red::text').extract_first()),
                })
            return majors

        return {
            'count': get_trending(u'#topNoofPTable'),
            'index': get_trending(u'#topIndexTable'),
            'like': get_trending(u'.r_r_box_zymyd'),
        }

    loader.add_value('trending', parse_trending())
    item = loader.load_item()
    for link in LinkExtractor(restrict_xpaths=u'//ul[@id="topNav"]//a[.="学校简介"]').extract_links(response):
        yield Request(link.url, meta={'item': item}, callback=self.parse_jianjie)
def parse_item(self, response):
    l = ItemLoader(item=ZtArticleItem(), response=response)
    l.add_value('classId', '57')
    l.add_value('cataName', u'市场动态')
    l.add_value('url', response.urljoin(response.url))
    l.add_css('title', '#artibodyTitle h1::text')
    keywords = response.css('meta[name*=eywords]::attr(content)').extract()[0]
    keywordsList = keywords.split(' ')
    while '' in keywordsList:
        keywordsList.remove('')
    l.add_value('keywords', keywordsList)
    l.add_value('seo_keywords', keywords)
    description = ''.join(response.css('#artibody p::text').extract())
    if len(description) > 200:
        description = description[:200]
    l.add_value('description', description)
    l.add_value('seo_description', description)
    l.add_value('publishTime', response.css('#pubtime_baidu::text').extract()[0])
    l.add_css('source', '#source_baidu a::text')
    # no view count available, so use a random three-digit number
    l.add_value('views', randint(100, 999))
    l.add_css('image_urls', '#artibody img::attr(src)')
    content = response.css('#artibody').extract()[0]
    # ad removal (disabled)
    # content = content.replace(response.css('.visible-lg-block.visible-md-block').extract()[0], '')
    # replace anchor tags with their text
    atags = response.css('#artibody a').extract()
    atexts = response.css('#artibody a::text').extract()
    if len(atags) == len(atexts):
        for index, atag in enumerate(atags):
            content = content.replace(atag, atexts[index])
    l.add_value('content', content)
    yield l.load_item()
def parse_item(self, response):
    l = ItemLoader(item=ZtArticleItem(), response=response)
    l.add_value('classId', '11')
    l.add_value('cataName', u'私募要闻')
    l.add_value('url', response.urljoin(response.url))
    l.add_css('title', '.hd h1::text')
    keywords = response.css('meta[name*=eywords]::attr(content)').extract()[0]
    keywordsList = keywords.split(',')
    while '' in keywordsList:
        keywordsList.remove('')
    l.add_value('keywords', keywordsList)
    l.add_value('seo_keywords', keywords)
    description = response.css('meta[name*=escription]::attr(content)').extract()[0]
    l.add_value('description', description)
    l.add_value('seo_description', description)
    l.add_value('publishTime', response.css('.info::text').extract()[0])
    l.add_css('source', '.info .where::text')
    # no view count available, so use a random three-digit number
    l.add_value('views', randint(100, 999))
    l.add_css('image_urls', '#qmt_content_div p img::attr(src)')
    content = response.css('#Cnt-Main').extract()[0]
    # ad removal (disabled)
    # content = content.replace(response.css('.visible-lg-block.visible-md-block').extract()[0], '')
    # replace anchor tags with their text
    atags = response.css('#Cnt-Main a').extract()
    atexts = response.css('#Cnt-Main a::text').extract()
    if len(atags) == len(atexts):
        for index, atag in enumerate(atags):
            content = content.replace(atag, atexts[index])
    content = content.replace(u'(专栏)', '')
    l.add_value('content', content)
    yield l.load_item()
def parse_article(self, response):
    # Initialize some I/O processors
    join_all = Join('')
    take_first = TakeFirst()
    identity = Identity()
    prepend_url = PrependResponseUrl(response.url)
    strip_all, strip_one = StripAll(), StripOne()
    add_space_after_punct = AddSpaceAfterPunct()

    # Load PersonItem
    person_loader = ItemLoader(item=PersonItem(), response=response)
    person_loader.default_output_processor = take_first
    person_loader.add_css('name', 'h3.p-name::text', strip_all)
    person_loader.add_value('article_url', response.url)
    person_loader.add_css('pub_date', 'time.dt-published::attr(datetime)')
    person_loader.add_css('title', 'p.summary.p-summary::text', strip_all)
    person_loader.add_css('img_src', 'img.portrait::attr(src)', prepend_url)
    person_loader.add_xpath('bio', '//div[@class="e-content"]/p[count(preceding-sibling::h4)=1]/descendant-or-self::*/text()', join_all, add_space_after_punct)
    person_loader.add_xpath('hardware', '//div[@class="e-content"]/p[count(preceding-sibling::h4)=2]/descendant-or-self::*/text()', join_all, add_space_after_punct)
    person_loader.add_xpath('software', '//div[@class="e-content"]/p[count(preceding-sibling::h4)=3]/descendant-or-self::*/text()', join_all, add_space_after_punct)
    person_loader.add_xpath('dream', '//div[@class="e-content"]/p[count(preceding-sibling::h4)=4]/descendant-or-self::*/text()', join_all, add_space_after_punct)
    person_item = person_loader.load_item()
    # @gbrener 8/16/2015: The following line causes a NotImplementedError
    # object.__setattr__(person_item, 'export_empty_fields', True)
    person_item.fill_empty_fields()

    # Load a list of ToolItems
    tool_items = []
    for tool_selector in response.css('div.e-content p a'):
        tool_loader = ItemLoader(item=ToolItem(), selector=tool_selector, response=response)
        tool_loader.default_output_processor = take_first
        tool_loader.add_xpath('tool_name', './descendant-or-self::*/text()', join_all, strip_one)
        tool_loader.add_xpath('tool_url', './@href')
        tool_item = tool_loader.load_item()
        # @gbrener 8/16/2015: The following line causes a NotImplementedError
        # object.__setattr__(tool_item, 'export_empty_fields', True)
        tool_item.fill_empty_fields()
        tool_items.append(tool_item)

    yield dict(person=person_item, tools=tool_items)
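# StripAll, StripOne, PrependResponseUrl and AddSpaceAfterPunct are custom
# processors imported from elsewhere in this project; plausible sketches of
# their shape, assuming the standard processor convention (the first processor
# in a chain receives the list of extracted values, later ones receive the
# previous processor's output):
import re
from urllib.parse import urljoin

class StripAll:
    def __call__(self, values):
        # runs on the raw extracted list
        return [v.strip() for v in values]

class StripOne:
    def __call__(self, value):
        # runs after Join, i.e. on a single string
        return value.strip()

class PrependResponseUrl:
    def __init__(self, base_url):
        self.base_url = base_url

    def __call__(self, values):
        # resolve relative URLs against the page URL
        return [urljoin(self.base_url, v) for v in values]

class AddSpaceAfterPunct:
    def __call__(self, text):
        # insert a space after punctuation jammed against the next word
        return re.sub(r'([.,;:!?])(?=\S)', r'\1 ', text)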
def parse_item(self, response):
    il = ItemLoader(item=ImageItem(), response=response)
    il.add_css('image_urls', 'img::attr(src)')
    return il.load_item()
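# An 'image_urls' field only has an effect once the images pipeline is enabled;
# a minimal sketch of the item and settings this spider presumably pairs with
# (field names follow ImagesPipeline's defaults; note that relative img src
# values would need response.urljoin before loading):
import scrapy

class ImageItem(scrapy.Item):
    image_urls = scrapy.Field()  # input: list of image URLs to download
    images = scrapy.Field()      # filled in by ImagesPipeline after download

# settings.py:
# ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
# IMAGES_STORE = 'images'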
def parse_item(self, response):
    l = ItemLoader(item=response.meta['item'], response=response)
    l.add_value('classId', '49')
    l.add_value('cataName', u'国内经济')
    l.add_value('url', response.urljoin(response.url))
    l.add_css('title', '.newsContent h1::text')
    l.add_css('seo_title', '.newsContent h1::text')
    l.add_css('seo_keywords', 'meta[name*=keywords]::attr(content)')
    l.add_css('seo_description', 'meta[name*=description]::attr(content)')
    # Eastmoney renders its source as an image, so hard-code it
    l.add_value('source', u'东方财富网')
    # l.add_value('author', u'东方财富网')
    # no view count available, so use a random three-digit number
    l.add_value('views', randint(100, 999))
    l.add_css('keywords', 'meta[name*=keywords]::attr(content)')
    l.add_css('description', '.c_review::text')
    # 2015年12月9日 11:09 -> 2015-12-09 11:09
    publishTime = response.css('.Info span:first-child::text').extract()[0]
    publishTime = time.strftime('%Y-%m-%d %H:%M', time.strptime(re.sub('[^0-9]', '', publishTime), '%Y%m%d%H%M'))
    l.add_value('publishTime', publishTime)
    content = response.css('#ContentBody').extract()[0]
    # remove the abstract from the body
    summary = response.css('.c_review').extract()
    if len(summary):
        content = content.replace(summary[0], '')
    # remove the ad block
    ad = response.css('.reading').extract()
    if len(ad) > 0:
        content = content.replace(ad[0], '')
    # replace anchor tags with their text
    atags = response.css('#ContentBody a')
    ataghtml = response.css('#ContentBody a').extract()
    for index, atag in enumerate(atags):
        atext = atag.css('::text').extract()
        if len(atext):
            content = content.replace(ataghtml[index], atext[0])
        else:
            content = content.replace(ataghtml[index], atag.css(':first-child').extract()[0])
    l.add_value('content', content)
    yield l.load_item()
def parse_content_page(self, response):
    # Detect if this is a redirection page
    m = redirect_re.search(response.body)
    if m:
        import requests
        new_url = m.group(1)
        new_content = requests.get(new_url).content
        response = scrapy.http.HtmlResponse(new_url, body=new_content)
    # Start scraping
    il = ItemLoader(item=LuliItem(), response=response)
    il.add_css('content', 'div#articleNew > p::text')
    il.add_css('content', 'div[itemprop="articleBody"] > p')
    il.add_css('date', 'div#articleDate::text')
    il.add_css('date', 'header > time[datetime]::attr(datetime)')
    il.add_css('title', 'div#articleNew > h1::text')
    il.add_css('title', 'h1[itemprop="headline"]::text')
    il.add_value('url', response.url)
    item = il.load_item()
    yield item
def parse_movie(self, response):
    self.logger.info('Parse movie\'s url %s.', response.url)
    if response.status == 403:
        raise DropItem('Function parse_movie 403 page.')
    l = ItemLoader(item=MovieItem(), response=response)
    l.add_value('id', response.url, re=r'/.*?/(\d+)/')
    l.add_xpath('name', '//span[@property="v:itemreviewed"]/text()')
    l.add_xpath('poster', u'//img[@title="点击看更多海报" and @rel="v:image"]/@src')
    l.add_xpath(
        'alternate_name',
        u'//div[@id="info"]/span[@class="pl"][contains(./text(), "又名:")]/following::text()[1]',
        MapCompose(lambda s: s.split('/'), unicode.strip)
    )
    l.add_css('year', '.year::text', re=r'\((\d+)\)')
    l.add_css('rating', '.rating_num::text')
    l.add_xpath('rating_per', '//span[@class="rating_per"]/text()')
    l.add_xpath('rating_betterthan', '//div[@class="rating_betterthan"]/a/text()')
    l.add_xpath('rating_betterthan_href', '//div[@class="rating_betterthan"]/a/@href')
    l.add_xpath('director', '//a[@rel="v:directedBy"]/text()')
    l.add_xpath('director_id', '//a[@rel="v:directedBy"]/@href', re=r'/.*?/(\d+)/')
    l.add_xpath('script_editor', '(//div[@id="info"]//span[@class="attrs"]/a)[2]/text()')
    l.add_xpath('script_editor_id', '(//div[@id="info"]//span[@class="attrs"]/a)[2]/@href', re=r'/.*?/(\d+)/')
    l.add_xpath('genre', '//span[@property="v:genre"]/text()')
    l.add_xpath('tags', '//div[@class="tags-body"]/a/text()')
    l.add_xpath(
        'summary', '//span[@property="v:summary"]/text()',
        MapCompose(unicode.strip), Join('<br>')
    )
    l.add_xpath('runtime', '//span[@property="v:runtime"]/text()')
    l.add_xpath('starring', '//a[@rel="v:starring"]/text()')
    l.add_xpath('starring_id', '//a[@rel="v:starring"]/@href', re=r'/.*?/(\d+)/')
    l.add_xpath('initialReleaseDate', '//span[@property="v:initialReleaseDate"]/text()')
    l.add_xpath(
        'region',
        u'//div[@id="info"]/span[@class="pl"][contains(./text(), "制片国家/地区:")]/following::text()[1]',
        MapCompose(unicode.strip)
    )
    l.add_xpath(
        'language',
        u'//div[@id="info"]/span[@class="pl"][contains(./text(), "语言:")]/following::text()[1]',
        MapCompose(unicode.strip)
    )
    l.add_xpath('imdb', u'//div[@id="info"]/span[@class="pl"][contains(./text(), "IMDb链接:")]/following::a[1]/text()')
    l.add_xpath('imdb_href', u'//div[@id="info"]/span[@class="pl"][contains(./text(), "IMDb链接:")]/following::a[1]/@href')
    l.add_xpath('recommendations_id', '//div[@class="recommendations-bd"]/dl/dd/a/@href', re=r'/.*?/(\d+)/')
    l.add_xpath('recommendations', '//div[@class="recommendations-bd"]/dl/dd/a/text()')
    # these two take XPath expressions, so add_xpath (not add_value) is needed
    l.add_xpath('collections_number', '//div[@class="subject-others-interests-ft"]/a[1]/text()', re=r'(\d+)')
    l.add_xpath('wishes_number', '//div[@class="subject-others-interests-ft"]/a[2]/text()', re=r'(\d+)')
    l.add_value('last_update_time', str(datetime.utcnow()))
    # download poster image file
    l.add_xpath('image_urls', u'//img[@title="点击看更多海报" and @rel="v:image"]/@src')
    yield l.load_item()

    comments_url = response.xpath(r'//div[@id="comments-section"]/div[@class="mod-hd"]/h2//a/@href').extract_first()
    if comments_url:
        yield Request(url=comments_url, callback=self.parse_comment)
def parse_question(self, response): #处理question页面, 从页面中提取出具体的question item if "QuestionHeader-title" in response.text: #处理新版本 match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url) if match_obj: question_id = int(match_obj.group(2)) item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response) item_loader.add_css("title", "h1.QuestionHeader-title::text") item_loader.add_css("content", ".QuestionHeader-detail") item_loader.add_value("url", response.url) item_loader.add_value("zhihu_id", question_id) item_loader.add_css("answer_num", ".List-headerText span::text") item_loader.add_css("comments_num", ".QuestionHeader-actions button::text") item_loader.add_css("watch_user_num", ".NumberBoard-value::text") item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text") question_item = item_loader.load_item() else: #处理老版本页面的item提取 match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url) if match_obj: question_id = int(match_obj.group(2)) item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response) # item_loader.add_css("title", ".zh-question-title h2 a::text") item_loader.add_xpath("title", "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()") item_loader.add_css("content", "#zh-question-detail") item_loader.add_value("url", response.url) item_loader.add_value("zhihu_id", question_id) item_loader.add_css("answer_num", "#zh-question-answer-num::text") item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text") # item_loader.add_css("watch_user_num", "#zh-question-side-header-wrap::text") item_loader.add_xpath("watch_user_num", "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()") item_loader.add_css("topics", ".zm-tag-editor-labels a::text") question_item = item_loader.load_item() yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers, callback=self.parse_answer) yield question_item
def parse_titles(self, response):
    l = ItemLoader(item=Posts(), response=response)
    l.add_css('content_title', 'h1.pagetitle::text', self.content_title_parser)
    l.add_css('post_title', 'div.entries > ul > li a::text')
    return l.load_item()
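# content_title_parser is passed as a processor above but isn't defined here;
# a plausible sketch, assuming it cleans the raw extracted values (a bare
# callable like this receives the whole list of matches):
def content_title_parser(self, values):
    return [v.strip() for v in values if v.strip()]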
def parse_item(self, response):
    item_loader = ItemLoader(item=HouseRentingLianjiaItem(), response=response)
    item_loader.add_css(field_name='title', css='div.title *::text')
    item_loader.add_value(field_name='source', value=self.name)
    item_loader.add_css(field_name='author', css='div.brokerName > a.name::text')
    item_loader.add_css(field_name='image_urls', css='div.thumbnail > ul > li > img::attr(src)')
    item_loader.add_css(field_name='author_link', css='div.brokerName > a.name::attr(href)')
    item_loader.add_css(field_name='content', css='div.introduction *::text', re=r'\s*(.*)\s*')
    item_loader.add_value(field_name='source_url', value=response.url)
    item_loader.add_css(field_name='publish_time', css='div.zf-room > p::text')
    item_loader.add_css(field_name='price', css='div.price > span.total::text')
    item_loader.add_css(field_name='detail', css='div.zf-room *::text')
    yield item_loader.load_item()
def parse_item(self, response):
    selector = Selector(response=response)
    # note: the result of this css() call is discarded, so it does not narrow
    # the loader's scope
    selector.css('div.main-wrap')
    item_loader = ItemLoader(item=HouseRenting58Item(), selector=selector, response=response)
    item_loader.add_css(field_name='title', css='div.house-title > h1::text')
    item_loader.add_value(field_name='source', value=self.name)
    item_loader.add_css(field_name='author', css='div.house-basic-info div.house-agent-info p.agent-name > a::text')
    item_loader.add_css(field_name='image_urls', css='div.basic-pic-list > ul > li > img::attr(data-src)', re=r'(.*)\?.*')
    item_loader.add_css(field_name='author_link', css='div.house-basic-info div.house-agent-info p.agent-name > a::attr(href)')
    item_loader.add_css(field_name='content', css='ul.introduce-item *::text')
    item_loader.add_value(field_name='source_url', value=response.url)
    item_loader.add_css(field_name='publish_time', css='p.house-update-info::text')
    item_loader.add_css(field_name='price', css='div.house-pay-way *::text')
    item_loader.add_css(field_name='detail', css='div.house-desc-item > ul > li > span::text')
    yield item_loader.load_item()