def parse_link_page(response):
    for post in response.xpath('//div[@data-type="link"]'):
        l = ItemLoader(RedditPostItem(), selector=post)
        post_root_xpath = './div[contains(@class, "entry")]'
        title = post_root_xpath + '/p[@class="title"]'
        tagline = post_root_xpath + '/p[@class="tagline"]'
        buttons = post_root_xpath + '/ul'
        l.add_xpath('title', title + '/a/text()')
        l.add_xpath('link', title + '/a/@href')
        l.add_xpath('poster', tagline + '/a[contains(@class, "author")]/text()')
        l.add_xpath('score', './div[contains(@class, "midcol")]/div[@class="score unvoted"]/text()')
        l.add_xpath('number_of_comments', buttons + '//a[contains(@class, "comments")]/text()')
        l.add_xpath('comments_link', buttons + '//a[contains(@class, "comments")]/@href')
        l.add_xpath('subreddit', './@data-subreddit')
        l.add_xpath('post_timestamp', tagline + '/time/@datetime')
        l.add_value('scrape_timestamp', datetime.datetime.now())
        item = l.load_item()
        # If the post has comments, follow the comments link and scrape them;
        # otherwise the item is complete and can be yielded directly. (The
        # original yielded l.load_item() a second time unconditionally, which
        # emitted a duplicate item without the comments.)
        item["comments"] = []
        if item["number_of_comments"] > 0:
            yield scrapy.Request(item["comments_link"] + "?limit=500",
                                 callback=parse_comments,
                                 meta={'item': item})
        else:
            yield item
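# A minimal sketch of the RedditPostItem assumed by parse_link_page above.
# Only the field names come from the loader calls; the processors (and the
# int coercion that makes the `number_of_comments > 0` check work) are
# illustrative assumptions, not the original definition.
import scrapy
from itemloaders.processors import MapCompose, TakeFirst  # scrapy.loader.processors in older Scrapy

class RedditPostItem(scrapy.Item):
    title = scrapy.Field(output_processor=TakeFirst())
    link = scrapy.Field(output_processor=TakeFirst())
    poster = scrapy.Field(output_processor=TakeFirst())
    score = scrapy.Field(output_processor=TakeFirst())
    number_of_comments = scrapy.Field(
        # "1234 comments" -> 1234; "comment" -> 0
        input_processor=MapCompose(lambda s: int(''.join(filter(str.isdigit, s)) or 0)),
        output_processor=TakeFirst())
    comments_link = scrapy.Field(output_processor=TakeFirst())
    subreddit = scrapy.Field(output_processor=TakeFirst())
    post_timestamp = scrapy.Field(output_processor=TakeFirst())
    scrape_timestamp = scrapy.Field(output_processor=TakeFirst())
    comments = scrapy.Field()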
def parse_item(self, response):
    sel = response.css("div.path")
    loader = ItemLoader(item=SeriesItem(), selector=sel)
    loader.add_css("series_id", "a:last-child::attr(href)")
    loader.add_css("series_name", "a:last-child::text")
    series = loader.load_item()
    print(series)
    # Models about to go on sale & currently on sale
    for sel in response.css("div.interval01-list-cars-infor"):
        loader = ItemLoader(item=ModelItem(), selector=sel)
        loader.add_css("model_id", "a::attr(href)")
        loader.add_css("model_name", "a::text")
        loader.add_value("series_id", series['series_id'])
        loader.add_value("series_name", series['series_name'])
        yield loader.load_item()
    # Discontinued models
    url = "http://www.autohome.com.cn/ashx/series_allspec.ashx"
    years = response.css(".dropdown-content a::attr(data)")
    for year in years.extract():
        qs = {"y": year, "s": series["series_id"]}
        yield Request(url + "?" + urlencode(qs), self.stop_sale)
def parse_info(self, response):
    loaderJob = ItemLoader(item=JobInfoItem(), response=response)
    loaderCom = ItemLoader(item=ComInfoItem(), response=response)

    loaderJob.add_value('url', value=response.url)
    loaderJob.add_xpath('job_name', '//div[@class="inner-left fl"][1]/h1/text()', TakeFirstL())
    loaderJob.add_xpath('job_company', '//div[@class="inner-left fl"][1]/h2/a/text()', TakeFirstL())
    loaderJob.add_xpath('job_benefits', '//div[@class="inner-left fl"][1]/div/span/text()', JoinL('|'))
    divs = '//ul[@class="terminal-ul clearfix"]/li'
    loaderJob.add_xpath('job_salary', divs, TakeFirstL(), re=u'(?<=职位月薪:</span><strong>).*(?=</strong></li>)')
    loaderJob.add_xpath('job_location', divs, RemoveTagsL(), TakeFirstL(), re=u'(?<=工作地点:</span><strong>).*(?=</strong></li>)')
    loaderJob.add_xpath('job_update', divs, RemoveTagsL(), TakeFirstL(), re=u'(?<=发布日期:</span><strong>).*(?=</strong></li>)')
    loaderJob.add_xpath('job_nature', divs, TakeFirstL(), re=u'(?<=工作性质:</span><strong>).*(?=</strong></li>)')
    loaderJob.add_xpath('job_experience', divs, TakeFirstL(), re=u'(?<=工作经验:</span><strong>).*(?=</strong></li>)')
    loaderJob.add_xpath('job_miniEdu', divs, TakeFirstL(), re=u'(?<=最低学历:</span><strong>).*(?=</strong></li>)')
    loaderJob.add_xpath('job_recruNums', divs, TakeFirstL(), re=u'(?<=招聘人数:</span><strong>).*(?=</strong></li>)')
    loaderJob.add_xpath('job_category', divs, RemoveTagsL(), TakeFirstL(), re=u'(?<=职位类别:</span><strong>).*(?=</strong></li>)')
    loaderJob.add_xpath('job_desc', '//div[@class="tab-inner-cont"][1]', ExtractTextL(), StripBlankL(), JoinL('|'))
    loaderJob.add_xpath('job_desc_resp', '//div[@class="tab-inner-cont"][1]', ExtractTextL(), TakeFirstL(), re=u'(?<=岗位职责|工作职责).*?(?=任职资格|岗位要求)')
    loaderJob.add_xpath('job_desc_req', '//div[@class="tab-inner-cont"][1]', ExtractTextL(), TakeFirstL(), re=u'(?<=任职资格|岗位要求).*?(?=。)')
    loaderJob.add_xpath('job_desc_loc', '//div[@class="tab-inner-cont"][1]/h2/text()', TakeFirstL())

    loaderCom.add_xpath('url', '//div[@class="company-box"]/p[@class="company-name-t"]/a/@href', TakeFirstL())
    loaderCom.add_xpath('com_name', '//div[@class="company-box"]/p[@class="company-name-t"]/a/text()', TakeFirstL())
    divs = '//div[@class="company-box"]/ul/li'
    loaderCom.add_xpath('com_size', divs, ExtractTextL(), TakeFirstL(), re=u'(?<=公司规模[:,:]).*')
    loaderCom.add_xpath('com_nature', divs, ExtractTextL(), TakeFirstL(), re=u'(?<=公司性质[:,:]).*')
    loaderCom.add_xpath('com_industry', divs, ExtractTextL(), TakeFirstL(), re=u'(?<=公司行业[:,:]).*')
    loaderCom.add_xpath('com_intro', '//div[@class="tab-inner-cont"][2]', ExtractTextL(), StripBlankL(), JoinL('|'))
    loaderCom.add_xpath('com_link', divs, ExtractTextL(), TakeFirstL(), re=u'(?<=公司主页[:,:]).*')
    loaderCom.add_xpath('com_address', divs, RemoveTagsL(), TakeFirstL(), re=u'(?<=公司地址[:,:])[\s\S]*(?=</strong>)')

    return loaderJob.load_item(), loaderCom.load_item()
def parse(self, response):
    sites = response.xpath('//table/tbody/tr')
    for site in sites:
        url = urljoin(response.url, site.xpath("td[2]/a/@href").extract_first())
        urlLast = urljoin(response.url, site.xpath("td[3]/a/@href").extract_first())
        item = DeathItem()
        loader = ItemLoader(item, selector=site)
        loader.add_xpath('Mid', 'td[1]/text()')
        loader.add_xpath('firstName', 'td[5]/text()')
        loader.add_xpath('lastName', 'td[4]/text()')
        loader.add_xpath('Date', 'td[8]/text()')
        loader.add_xpath('Race', 'td[9]/text()')
        loader.add_xpath('County', 'td[10]/text()')
        loader.add_xpath('Age', 'td[7]/text()')
        loader.add_value('OILink', url)
        loader.add_value('OLastStatement', urlLast)
        if url.endswith(("jpg", "no_info_available.html")):
            loader.add_value('Description', u'')
            loader.add_value('Education', u'')
            if urlLast.endswith("no_last_statement.html"):
                loader.add_value('Message', u'')
                yield loader.load_item()
            else:
                request = scrapy.Request(urlLast,
                                         meta={"item": loader.load_item()},
                                         callback=self.parse_details2)
                yield request
        else:
            request = scrapy.Request(url,
                                     meta={"item": loader.load_item(), "urlLast": urlLast},
                                     callback=self.parse_details)
            yield request
def parse_colleagues(self, response, author_id):
    self.logger.info('Parsing colleagues for author %s.' % author_id)
    # get all authors listed
    num_authors = 0
    for div in response.xpath('//*[@class="gsc_1usr gs_scl"]'):
        num_authors += 1
        name_xp = './*[@class="gsc_1usr_name"]/text()'
        # The profile link must be selected relative to this div ('.//...');
        # the original absolute XPath ('//*[@id="gsc_ccl"]/div[1]/...') matched
        # the first author on the page on every iteration. parse_qs also
        # returns a list per key, so take the single 'user' value.
        href = div.xpath('.//h3/a/@href').extract_first()
        id_val = urlparse.parse_qs(urlparse.urlparse(href).query)['user'][0]
        cited_by_xp = './*[@class="gsc_1_usr_cby"]/text()'
        fos_xp = './/a[@class="gsc_co_int"]/@href'  # --> ["foo", "bar", ...]

        # load general author item for colleague
        co_auth = ItemLoader(item=AuthorItem(), response=response, selector=div)
        co_auth.add_value('id', id_val)
        co_auth.add_xpath('name', name_xp)
        co_auth.add_xpath('cited', cited_by_xp)
        co_auth.add_xpath('fos', fos_xp)
        yield co_auth.load_item()

        # load co-authorship relation
        relation = [author_id, id_val]
        relation.sort()
        co_rel = ItemLoader(item=CoAuthorItem(), response=response)
        co_rel.add_value('author1', relation[0])
        co_rel.add_value('author2', relation[1])
        yield co_rel.load_item()

    self.logger.info('Found %d colleagues for author %s.' % (num_authors, author_id))
    next_url = self.choose_next()
    if next_url:
        yield Request(url=next_url)
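# The fix above illustrates a general selector pitfall worth isolating: inside
# a per-row loop, an XPath that starts with '//' searches the whole document,
# not the row. A self-contained demonstration with throwaway HTML:
from scrapy.selector import Selector

html = '<div class="r"><a href="a"></a></div><div class="r"><a href="b"></a></div>'
for row in Selector(text=html).xpath('//div[@class="r"]'):
    print(row.xpath('//a/@href').get())   # prints 'a' twice: absolute, whole document
    print(row.xpath('.//a/@href').get())  # prints 'a' then 'b': relative to the row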
def parse(self, response):
    l = ItemLoader(item=NytimesItem(), response=response)
    l.add_xpath('topnews', '//*[contains(@id,"topnews-100")]/h2/a/text()')
    l.add_xpath('sectionnews', '//h3[contains(@class,"story-heading")]/text()')
    # print(type(l.load_item()))
    x = l.load_item()
    # print(len(x['date']), len(x['topnews']), len(x['sectionnews']))
    nytdict = dict()
    today = datetime.date.today()
    topnewslist = []
    sectionnewslist = []
    nytdict['date'] = str(today)
    for t in x['topnews']:
        topnewslist.append(str(t.encode('ascii', 'ignore')))
    nytdict['topnews'] = topnewslist
    for t in x['sectionnews']:
        sectionnewslist.append(str(t.encode('ascii', 'ignore')).strip())
    nytdict['sectionnews'] = sectionnewslist
    # Write today's headlines to a dated JSON file, e.g. 2016-01-31.json.
    # (The original left the file handle open and kept an unused 'datelist'.)
    with open('{}.json'.format(today), 'w') as f:
        json.dump(nytdict, f)
    return l.load_item()
def parse_item(self, response):
    if "Digi-Key Part Number" not in response.body:
        return
    i = DetailsItem()
    # (the original re-created DetailsItem after this check, silently
    # discarding the hitlist assignment)
    if response.meta.get("callback_result_queue"):
        i['hitlist'] = response.meta.get('callback_result_queue')
    i['site_name'] = self.site_name
    i['site_url'] = self.base_url
    loader = ItemLoader(i, response=response)
    loader.add_xpath("site_part_id", "//meta[@itemprop='productID']/@content", re="sku.(.*)")
    loader.add_xpath("manuf_part_id", "//meta[@itemprop='name']/@content")
    loader.add_xpath("manuf_name", "//span[@itemprop='name']/text()")
    loader.add_xpath("description", "//td[@itemprop='description']/text()")
    loader.add_xpath("datasheet_link", "//a[@class='lnkDatasheet']/@href")
    loader.add_xpath("image_url", "//a[@class='lnkProductPhoto']/@href")
    loader.add_value("page_url", response.url)
    loader.add_xpath("part_detail", "//td[@class='attributes-table-main']")
    loader.add_xpath("packaging", "//th[contains(text(),'Packaging')]/following-sibling::td/text()")
    loader.add_xpath("package", "//th[contains(text(),'Standard Package')]/following-sibling::td/text()")
    loader.add_value("package", PACKAGE_DEFAULT)
    loader.add_value("packaging", PACKAGING_DEFUALT)  # sic: constant name as defined elsewhere
    loader.add_xpath("type", "//th[text()='Accessory Type']/following-sibling::td/text()")
    loader.add_value("version", VERSION_DEFAULT)
    loader.add_value("date_created", self.timestamp())
    i = loader.load_item()
    # Ensure the list fields exist before appending; the loader only sets
    # fields it was given values for.
    i.setdefault('price_data', [])
    i.setdefault('inventory_data', [])
    prices = response.xpath("//table[@id='pricing']/tr[td and not(contains(.//text(),'Call'))]")
    for price in prices:
        td = price.xpath("td")
        if len(td) == 3:
            pi = PriceItem()
            pi['site_name'] = self.site_name
            pi['site_part_id'] = i['site_part_id']
            pi['date_created'] = self.timestamp()
            pi['price_type'] = i['packaging']
            pi['quantity'] = td[0].xpath("text()").extract()[0].replace(",", "")
            pi['price'] = td[1].xpath("text()").extract()[0].replace(",", "")
            i['price_data'].append(pi)
    avail = AvailabilityItem()
    avail['site_name'] = self.site_name
    avail['site_part_id'] = i['site_part_id']
    avail['date_created'] = self.timestamp()
    loader = ItemLoader(avail, response=response)
    loader.add_xpath("stock", "//td[@id='quantityavailable']", re='":\s([\d|\,]*)')
    loader.add_value("factory_leadtime", FACTORY_LEAD_TIME_DEFAULT)
    loader.add_value("factory_lead_uom", FACTORY_LEAD_UOM_DEFAULT)
    avail = loader.load_item()
    i['inventory_data'].append(avail)
    yield i
def parse_details(self, response):
    item = response.meta["item"]
    urlLast = response.meta["urlLast"]
    loader = ItemLoader(item, response=response)
    loader.add_xpath("Description", "//*[@id='body']/p[3]/text()")
    loader.add_xpath("Education", "//td[. = 'Education Level (Highest Grade Completed)']/following-sibling::td[1]/text()")
    if urlLast.endswith("no_last_statement.html"):
        loader.add_value('Message', u'')
        return loader.load_item()
    else:
        request = scrapy.Request(urlLast,
                                 meta={"item": loader.load_item()},
                                 callback=self.parse_details2)
        return request
def parse_item(self, response): """ This function parses a property page. @url http://web:9312/properties/property_000000.html @returns items 1 @scrapes title price description address image_urls @scrapes url project spider server date """ # Create the loader using the response l = ItemLoader(item=PropertiesItem(), response=response) # Load fields using XPath expressions l.add_xpath("title", '//*[@itemprop="name"][1]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "price", './/*[@itemprop="price"][1]/text()', MapCompose(lambda i: i.replace(",", ""), float), re="[,.0-9]+" ) l.add_xpath("description", '//*[@itemprop="description"][1]/text()', MapCompose(unicode.strip), Join()) l.add_xpath("address", '//*[@itemtype="http://schema.org/Place"][1]/text()', MapCompose(unicode.strip)) l.add_xpath( "image_urls", '//*[@itemprop="image"][1]/@src', MapCompose(lambda i: urlparse.urljoin(response.url, i)) ) # Housekeeping fields l.add_value("url", response.url) l.add_value("project", self.settings.get("BOT_NAME")) l.add_value("spider", self.name) l.add_value("server", socket.gethostname()) l.add_value("date", datetime.datetime.now()) return l.load_item()
def parse_item(self, response):
    logging.info(u"start crawl ---> " + response.url)
    item = ItemLoader(item=NewsItem(), response=response)
    images = []
    sel = Selector(response)
    item.add_xpath('keywords', "//head/meta[@name='keywords']/@content")
    if sel.xpath('//div[@class="neirong-shouquan"]'):
        return
    item.add_xpath('title', '//div[@class="article-wrap"]/h1/text()')
    item.add_xpath('author', '//span[@class="author-name"]/text()')
    item.add_value('source', u'虎嗅网')
    item.add_value('original_link', response.url)
    item.add_value('category', CATEGORY.TECHNOLOGY)
    article_time = sel.xpath('//span[@class="article-time"]/text()').extract()
    date_time = compare_time(article_time, "%Y-%m-%d %H:%M")
    if not date_time:
        return
    # The original added the raw article_time here; the parsed and validated
    # date_time is what the item needs.
    item.add_value('date_time', date_time)
    image_url = sel.xpath('//div[@class="article-img-box"]/img/@src').extract()[0]
    images.append(image_url)
    elements = sel.xpath('//div[@id="article_content"]/p').extract()
    # Note: this reassignment replaces the header image collected above with
    # the images found in the article body.
    images, content = translate_content(elements)
    if images:
        item.add_value('image_url', hashlib.sha1(images[0]).hexdigest() + ".jpg")
        item.add_value('image_urls', images)
    item.add_value('content', content)
    logging.info(u"finished crawl ---> " + response.url)
    yield item.load_item()
def parse_item(self, response):
    sel = Selector(response)
    il = ItemLoader(item=Product(), response=response)
    cat = il.get_xpath('//div[contains(@id, "ctl00_pnlBreadCrumbs")]/a[last()]/text()')
    availability = il.get_xpath('//a[contains(@id,"hplddToCart") or contains(@class,"addToCart")]/text()')
    price = il.get_css('span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblRegprice > font::text')
    sale = il.get_css('span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblSaleprice > font::text')
    # If the XPath doesn't return a category, the product belongs to the
    # Bundle category.
    if not cat:
        il.add_value("category", "Bundle")
    else:
        il.add_value("category", cat)
    il.add_css("title", "span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblProdTitle::text")
    il.add_value("url", response.url)
    # If a product can be added to the cart it is available online;
    # otherwise it is not.
    if "ADD TO CART" in availability:
        il.add_value("availability", "Product is available online")
    else:
        il.add_value("availability", "Product is not available online")
    # If there's a sale price but no regular price, use the sale price as the
    # regular price, as shown on the website.
    if not price:
        il.add_value("regPrice", sale)
        il.add_value("salePrice", None)
    else:
        il.add_value("regPrice", price)
        il.add_value("salePrice", sale)
    return il.load_item()
def parse_news(self, response):
    if response.status == 200:
        to_group_id = response.meta['group_id']
        url = response.url
        ac = response.xpath('//div[@id="article-main"]')
        title = ac.xpath('.//h1[@class="article-title"]/text()')
        title = title.extract_first(default='')
        info = ac.xpath('.//div[@class="articleInfo"]')
        source = info.xpath('.//span[@class="src"]/text()')
        source = source.extract_first(default='')
        source = source.strip('\n\t ')
        ctime = info.xpath('.//span[@class="time"]/text()')
        ctime = ctime.extract_first(default='')
        contents = ac.xpath('.//p/text()').extract()
        contents = map(lambda x: re.sub(r'<.*?>', '', x), contents)
        text = '\n'.join(contents)
        labels = ac.xpath('.//a[@class="label-link"]/text()').extract()
        il = ItemLoader(item=ArticleItem())
        il.add_value('to_group_id', to_group_id)
        il.add_value('url', url)
        il.add_value('title', title)
        il.add_value('source', source)
        il.add_value('ctime', ctime)
        il.add_value('text', text)
        il.add_value('labels', labels)
        yield il.load_item()
    else:
        return
def parse_movie(self, response):
    loader = ItemLoader(item=DoubanItem(), response=response)
    for attr, xpath in self.settings.getdict('INFO_XPATH').items():
        loader.add_xpath(attr, xpath)
    s = response.xpath('//div[@id="info"]').extract_first()
    for attr, regex in self.settings.getdict('RE').items():
        loader.add_value(attr, re.findall(regex, s))
    loader.add_value('rate', self.parse_rate(response))
    loader.add_value('url', response.url)
    if self.settings.getbool('ALLOW_COVER'):
        image_urls = self._get_urls(
            self.image_base_url,
            urljoin,
            response.xpath('//div[@id="mainpic"]/a/img/@src').extract(),
            lambda s: s.split('/')[-1],
        )
        loader.add_value('image_urls', image_urls)
    return loader.load_item()
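# parse_movie above is driven entirely by two settings dicts that live in the
# project's settings.py. Their real contents are not shown here; a
# hypothetical shape for a Douban movie page, purely for illustration:
INFO_XPATH = {
    'title': '//h1/span[@property="v:itemreviewed"]/text()',
    'year': '//h1/span[@class="year"]/text()',
}
RE = {  # applied with re.findall() to the raw HTML of //div[@id="info"]
    'director': u'导演.*?>([^<]+)</a>',
    'runtime': u'片长:</span> ([^<]+)<',
}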
def parse_item(self, response):
    l = ItemLoader(item=PageItem(), response=response)
    l.add_value('title', response.request.cookies['title'])
    l.add_value('name', self.name)
    l.add_value('url', response.url)
    l.add_xpath('image_urls', '//td[@valign="top"]/img/@src')
    return l.load_item()
def parse_item(self, response):
    l = ItemLoader(item=MeizituItem(), response=response)
    l.add_xpath('name', '//h2/a/text()')
    l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p")
    l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
    l.add_value('url', response.url)
    return l.load_item()
def parse_item(self, response):
    l = ItemLoader(item=PageItem(), response=response)
    l.add_value('title', response.request.cookies['title'])
    l.add_value('url', response.url)
    l.add_value('name', self.name)
    l.add_xpath('image_urls', '//div[@class="l_effect_img_mid"]/a/img/@src')
    return l.load_item()
def parse_content_page(self, response):
    # Detect if this is a redirection page
    m = redirect_re.search(response.body)
    if m:
        import requests
        new_url = m.group(1)
        new_content = requests.get(new_url).content
        response = scrapy.http.HtmlResponse(new_url, body=new_content)
    # Start scraping
    il = ItemLoader(item=LuliItem(), response=response)
    il.add_css('content', 'div#articleNew > p::text')
    il.add_css('content', 'div[itemprop="articleBody"] > p')
    il.add_css('date', 'div#articleDate::text')
    il.add_css('date', 'header > time[datetime]::attr(datetime)')
    il.add_css('title', 'div#articleNew > h1::text')
    il.add_css('title', 'h1[itemprop="headline"]::text')
    il.add_value('url', response.url)
    item = il.load_item()
    yield item
def parse(self, response):
    # l = ItemLoader(item=ItjuziItem(), response=response)
    jsonresponse = json.loads(response.body_as_unicode())
    for entry in jsonresponse['data']['list']:
        l = ItemLoader(item=LianjiaErshouItem(), response=response)
        l.add_value('house_code', entry['house_code'])
        l.add_value('price_total', entry['price_total'])
        l.add_value('ctime', entry['ctime'])
        l.add_value('title', entry['title'])
        l.add_value('frame_hall_num', entry['frame_hall_num'])
        l.add_value('tags', entry['tags'])
        l.add_value('house_area', entry['house_area'])
        l.add_value('community_id', entry['community_id'])
        l.add_value('community_name', entry['community_name'])
        l.add_value('is_two_five', entry['is_two_five'])
        l.add_value('frame_bedroom_num', entry['frame_bedroom_num'])
        print(l)
        yield l.load_item()
def parse_page(self, response):
    # Scrape the images on this page.
    # print u'~~~~', unicode(response.body, "gbk").encode("utf8")
    # print(self.config["xpathImagesPath"])
    # print(response.xpath(self.config["xpathImagesPath"]))
    l = ItemLoader(item=PageItem(), response=response)
    l.add_value('title', response.request.cookies['title'])
    l.add_value('name', self.config["id"])
    l.add_value('url', response.url)
    if "imageUrlReplacement" in self.config:
        l.add_value('replace', self.config["imageUrlReplacement"])
    if "xpathImagesPath" in self.config:
        l.add_xpath('image_urls', self.config["xpathImagesPath"])
    if "xpathFilesPath" in self.config:
        l.add_xpath('file_urls', self.config["xpathFilesPath"])
    yield l.load_item()
    # TODO: get the next page's URL and recursively call parse_page.
    if "xpathNextImageUrl" in self.config:
        nextUrls = response.xpath(self.config["xpathNextImageUrl"])
        if len(nextUrls) > 0:
            nextPage = nextUrls.extract()[0]
            if not nextPage.startswith("http"):
                if nextPage.startswith("/"):
                    nextPage = response.url[0:response.url.index("/", 10) + 1] + nextPage
                else:
                    nextPage = response.url[0:response.url.rfind("/") + 1] + nextPage
            request = scrapy.Request(nextPage,
                                     callback=self.parse_page,
                                     cookies={'title': response.request.cookies['title']})
            yield request
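# parse_page above consults a per-site config dict loaded elsewhere in the
# spider. A hypothetical example of the keys it reads (names from the code,
# values invented for illustration):
config = {
    "id": "example-gallery",
    "xpathImagesPath": '//div[@class="content"]//img/@src',
    "xpathFilesPath": '//a[contains(@href, ".zip")]/@href',
    "xpathNextImageUrl": '//a[@class="next"]/@href',
    "imageUrlReplacement": "small=>large",
}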
def parse(self, response): """ This function parses the categories and its subcategories on a gscholar web page. @url https://scholar.google.com/citations?view_op=top_venues&hl=de&vq=bus @returns items 1 1 @returns requests 0 0 @scrapes name subs """ # We need the div that is 'selected' i.e. contains gs_sel as a css class title_xp = '//*[@id="gs_m_broad"]/div[contains(@class,\'gs_sel\')]/a/span/text()' item = ItemLoader(item=CategoryItem(), response=response) title = response.xpath(title_xp).extract_first() item.add_value('name', title) subs = [] for sub in response.xpath('//*[@id="gs_m_rbs"]/ul/li/a'): s = {'name' : sub.xpath('text()').extract_first()} rel_url = sub.xpath('@href').extract_first() s['vq'] = parse_qs(urlparse(rel_url).query)[u'vq'][0] subs.append(s) req = Request(urljoin(response.url,rel_url), callback=self.parse_item) req.meta['parent'] = title yield req item.add_value('subs', subs) yield item.load_item()
def parse_item(self, response): """ This function parses a property page. @url http://localhost:9312/properties/property_000000.html @returns items 1 @scrapes title price description address image_urls @scrapes url project spider server date """ l = ItemLoader(item=PropertiesItem(), response=response) l.add_xpath('title', '//*[@itemprop="name"][1]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath('price', '//*[@itemprop="price"][1]/text()', MapCompose(lambda i: i.replace(',', ''), float), re='[,.0-9]+') l.add_xpath('description', '//*[@itemprop="description"][1]/text()', MapCompose(unicode.strip), Join()) l.add_xpath('address', '//*[@itemtype="http://schema.org/Place"][1]/text()', MapCompose(unicode.strip)) l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src', MapCompose(lambda i: urlparse.urljoin(response.url, i))) # Housekeeping fields l.add_value('url', response.url) l.add_value('project', self.settings.get('BOT_NAME')) l.add_value('spider', self.name) l.add_value('server', socket.gethostname()) l.add_value('date', datetime.datetime.now()) return l.load_item()
def parse_item(self, response):
    # FIXME: fix array issue
    i = ItemLoader(item=SalefinderItem(), response=response)
    title = r'//div[@id="product-details-container"]//h1/text()'
    price = r'//div[@id="product-details-container"]//span[@class="price"]/text()'
    per = r'//div[@id="product-details-container"]//span[@class="price"]/text()'
    image_url = r'//a[@id="product-image-container"]//img/@src'
    i.add_xpath('title', title, MapCompose(unicode.lower))
    i.add_xpath('price', price, re=r'[,.0-9]+')
    i.add_xpath('per', per, re=r'pk|each|kg')
    i.add_xpath('image_url', image_url)
    i.add_value('url', response.url)
    i.add_value('date', date.today().isoformat())
    product_buy = response.xpath("//div[@class='product-container']//div[@id='product-buy']")
    product_buy_text = product_buy.extract_first().lower()
    # Detect the vendor from a product-buy div
    if 'coles' in product_buy_text:
        i.add_value('vendor', 'coles')
    elif 'woolworths' in product_buy_text:
        i.add_value('vendor', 'woolworths')
    else:
        i.add_value('vendor', 'unknown')
    return i.load_item()
def parse_titles(self, response):
    loader = ItemLoader(item=BlogCategory(), response=response)
    loader.add_value('hub', response.meta['hname'])
    loader.add_css('title', 'div.company_post h1 span::text')
    loader.add_css('date', 'div.published::text')
    loader.add_css('article', 'div.content::text')
    yield loader.load_item()
def parse_artwork(self, response):
    """Extracts information from an artwork detail page"""
    # create a url version free of search query noise
    url_bits = urlparse.urlparse(response.url)
    url_bits = url_bits._replace(query='')
    clean_url = urlparse.urlunparse(url_bits)
    loader = ItemLoader(item=ArtworkItem(), response=response)
    loader.add_value('museum_code', self.name)
    loader.add_value('url', clean_url)
    loader.add_xpath('artist_name', '//div[@id="tombstone"]/p[1]/a/text()[1]')
    artist_url = response.xpath('//div[@id="tombstone"]/p[1]/a/@href')
    artist_url = urlparse.urljoin(response.url, artist_url.extract()[0])
    loader.add_value('artist_url', artist_url)
    loader.add_css('title', '#tombstone span:nth-of-type(1)::text')
    loader.add_xpath('thumbnail', '//div[@id="artwork-image"]/a/img/@src')
    loader.add_xpath('on_display', ON_DISPLAY_SELECTOR)
    item = loader.load_item()
    self.logger.info('Scraped ' + item['title'][0])
    yield item
def parse(self, response):
    l = ItemLoader(item=RentalItem(), response=response)
    l.add_xpath('price', '//*[(@id = "main-info")]//*[contains(concat( " ", @class, " " ), concat( " ", "txt-big", " " )) and contains(concat( " ", @class, " " ), concat( " ", "txt-bold", " " ))]/text()')
    l.add_xpath('adress', '//*[(@id = "addressPromo")]//*[contains(concat( " ", @class, " " ), concat( " ", "txt-bold", " " ))]/text()')
    l.add_value('url', response.url)
    return l.load_item()
def parse_image(self, response):
    logger.info("正在收集页面数据: %s ..." % response.url)
    loader = ItemLoader(item=MeiTuItem(), response=response)
    loader.add_xpath('publisher', "//div[@class='width']/div[@class='c_l']/p[1]/text()")
    loader.add_xpath('publisher', "//div[@class='width']/div[@class='c_l']/p[1]/a[@class='tags']/text()")
    loader.add_xpath('model_name', "//div[@class='width']/div[@class='c_l']/p[5]/text()")
    loader.add_xpath('model_name', "//div[@class='width']/div[@class='c_l']/p[5]/a[@class='tags']/text()")
    loader.add_xpath('publishtime', "//div[@class='width']/div[@class='c_l']/p[6]/text()")
    loader.add_xpath('magazine_no', "//div[@class='width']/div[@class='c_l']/p[2]/text()")
    loader.add_xpath('pic_qty', "//div[@class='width']/div[@class='c_l']/p[3]/text()")
    loader.add_xpath('pixel', "//div[@class='width']/div[@class='c_l']/p[4]/text()")
    # The original XPath here was malformed ("//p[@class='buchongshuoming'/text()]")
    # and always raised the ValueError its try/except guarded against; this is
    # the intended expression.
    loader.add_xpath('desc', "//p[@class='buchongshuoming']/text()")
    loader.add_xpath('tag', "//div[@class='fenxiang_l']/a[@class='tags']/text()")
    loader.add_xpath('sort', "//div[@class='weizhi']/span/a[2]/text()")
    loader.add_xpath('image_url', "//div[@class='content']/center/img[@class='content_img']/@src")
    loader.add_value("page_url", response.url)
    yield loader.load_item()
def parse_item(self, response):
    loader = ItemLoader(GaokaopaiZhiyeItem(), response)
    loader.add_value('url', response.url)
    loader.add_value('code', response.url, re=ur'-([^-]+)\.html')
    loader.add_css('name', u'.modTitle>h1::text')

    def parse_category():
        for e in response.css(u'.catType>a'):
            yield {
                'url': e.css('::attr(href)').extract_first(),
                'code': e.css('::attr(href)').re_first(ur'-([^-]+)\.html'),
                'name': e.css('::text').extract_first(),
            }

    loader.add_value('category', list(parse_category()))
    loader.add_css('detail', u'.zhiyeShow')
    item = loader.load_item()
    return FormRequest(
        url='http://www.gaokaopai.com/ajax-career-getRelateMajor.html',
        formdata={'code': item['code'][0]},
        meta={'item': item},
        dont_filter=True,
        callback=self.parse_majors,
    )
def parse_item(self, response):
    logging.info(u"start crawl ---> " + response.url)
    item = ItemLoader(item=NewsItem(), response=response)
    sel = Selector(response)
    content = sel.xpath('//div[@id="Cnt-Main-Article-QQ"]/p')
    article_time = content.xpath('//span[@class="pubTime"]/text()').extract()
    date_time = compare_time(article_time, u"%Y年%m月%d日%H:%M")
    if not date_time:
        return
    item.add_xpath('keywords', "//head/meta[@name='keywords']/@content")
    item.add_value('date_time', date_time)
    item.add_xpath('title', '//div[@class="hd"]/h1/text()')
    item.add_xpath('reading_number', '//em[@id="top_count"]/text()')
    item.add_xpath('author', '//span[@class="auth"]/text()')
    item.add_value('original_link', response.url)
    elements = sel.xpath('//div[@id="Cnt-Main-Article-QQ"]/p').extract()
    images, content = translate_content(elements)
    if images:
        item.add_value('image_url', hashlib.sha1(images[0]).hexdigest() + ".jpg")
    item.add_value('content', content)
    item.add_value('image_urls', images)
    item.add_value('source', u'腾讯科技')
    item.add_value('category', CATEGORY.TECHNOLOGY)
    logging.info(u"finished crawl ---> " + response.url)
    yield item.load_item()
def parse_item(self, response):
    sel = Selector(response)
    # department information, kept for reference:
    # li = response.xpath("//li[@class='menulevel-0']|//li[@class='menulevel-0 menulevel-0-extra']")
    # dName = li.xpath("a/span/text()").extract()
    # print dName

    # category name
    cName = response.xpath("//div/a[2]/text()").extract()[0]
    print("'Category name' " + cName)

    # product information
    il = ItemLoader(item=Product(), response=response)
    il.add_xpath("title", "//div[contains(@class,'productdetail-container')]//span[contains(@id,'ProdTitle')]/..//text()")
    # il.add_xpath("title", "//div[@class='catalogueTitle']/*/text()")
    # @class='catalogueTitle' id='subcatemenu-container'
    il.add_value("url", response.url)
    il.add_xpath("current_price", "//div[contains(@class,'pricing') or contains(@class,'price')]//span[contains(@id,'Saleprice') or contains(@class,'salePrice')]//text()")
    il.add_xpath("regular_price", "//div[contains(@class,'pricing') or contains(@class,'price')]//span[contains(@id,'Regprice') or contains(@class,'regPrice')]//text()")
    limited = sel.xpath("//div[contains(@id,'FinalClearance')]").extract()
    if len(limited) > 0:
        il.add_value("availability", "Limited Quantities")
    else:
        il.add_value("availability", "Available")
    return il.load_item()
def _parse(self, response):
    l = ItemLoader(item=BookmarksItem(), response=response)
    l.add_xpath(u"name", u"/html/head/title")
    # The original XPath had a stray trailing quote ("//a/@href'").
    l.add_xpath(u"anchors", u"//a/@href")
    l.add_xpath(u"description", u"/html/body/text()")
    # you can also use literal values; the original passed the datetime class
    # itself rather than a timestamp.
    l.add_value(u"last_updated", datetime.datetime.now())
    return l.load_item()
def parse_article(self, response):
    article = ItemLoader(item=NewsCrawlerItem(), response=response)
    article.add_value('country', 'uk')
    article.add_value('language', 'english')
    article.nested_css('div.content__article-body').add_xpath('body', './p//text()')
    article.nested_css('meta[property="og:title"]').add_xpath('headline', './@content')
    # Function to parse published time to ISO 8601
    time_in = Compose(
        Join(),
        lambda v: '' if (ciso8601.parse_datetime(v) is None)
        else ciso8601.parse_datetime(v).isoformat(sep='T'))
    article.nested_css('meta[property="article:published_time"]').add_xpath(
        'published_time', './@content', time_in)
    article.add_xpath('category', '//head/meta[@property="article:section"]/@content')
    article.add_xpath('keywords', '//head/meta[@name="keywords"]/@content')
    article.add_value('url', response.url)
    article.add_value('encoding', response.encoding)
    return article.load_item()
def parse(self, response):
    self.driver.get(response.url)
    iframe = self.driver.find_element_by_css_selector('iframe')
    # switch_to_frame is deprecated in current Selenium; switch_to.frame is
    # the supported spelling.
    self.driver.switch_to.frame(iframe)
    for i in range(len(self.data)):
        if self.data.iloc[i, 0] in self.alr_crawled:
            continue
        file_name = self.data.iloc[i, 0] + ".mp3"
        text = self.data.iloc[i, 1]
        # Input text
        element = self.driver.find_element_by_xpath("/html/body/form/textarea")
        element.clear()
        element.send_keys(text)
        # Adjust speed
        speed_element = self.driver.find_element_by_xpath("/html/body/form/input[4]")
        self.driver.execute_script("arguments[0].value = '0.85';", speed_element)
        # Submit
        self.driver.find_element_by_xpath("/html/body/form/input[5]").click()
        time.sleep(2)
        # Get link to download file
        download_element = self.driver.find_element_by_xpath("/html/body/audio/source")
        loader = ItemLoader(item=WavItem(), selector=download_element)
        relative_url = download_element.get_attribute("src")
        absolute_url = response.urljoin(relative_url)
        loader.add_value("file_urls", absolute_url)
        loader.add_value("file_name", file_name)
        yield loader.load_item()
def qidian_parse(self, response):
    list_selector = response.xpath("//div[@class='book-mid-info']")
    for one_selector in list_selector:
        novel = ItemLoader(item=QidianHotItem(), selector=one_selector)
        novel.add_xpath("name", "h4/a/text()")
        novel.add_xpath("author", "p[1]/a[1]/text()")
        novel.add_xpath("type", "p[1]/a[2]/text()")
        novel.add_css("form", ".author span::text")
        # Equivalent manual extraction, kept for reference:
        # name = one_selector.xpath("h4/a/text()").extract()[0]
        # author = one_selector.xpath("p[1]/a[1]/text()").extract()[0]
        # type = one_selector.xpath("p[1]/a[2]/text()").extract()[0]
        # form = one_selector.xpath('p[1]/span/text()').extract()[0]
        # item = QidianHotItem()
        # item["name"] = name
        # item["author"] = author
        # item["type"] = type
        # item["form"] = form
        yield novel.load_item()
    self.current_page += 1
    if self.current_page <= 5:
        next_url = "https://www.qidian.com/rank/hotsales?style=1&page=%d" % (self.current_page)
        yield Request(next_url, callback=self.qidian_parse)
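# A sketch of the QidianHotItem assumed above. Declaring TakeFirst output
# processors on the item keeps each loaded field a scalar instead of a
# one-element list; the processors are an assumption, only the field names
# come from the loader calls.
import scrapy
from itemloaders.processors import TakeFirst  # scrapy.loader.processors in older Scrapy

class QidianHotItem(scrapy.Item):
    name = scrapy.Field(output_processor=TakeFirst())
    author = scrapy.Field(output_processor=TakeFirst())
    type = scrapy.Field(output_processor=TakeFirst())
    form = scrapy.Field(output_processor=TakeFirst())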
def parse(self, response):
    '''
    This function extracts all the reviews and pagination links from the
    reviews of the above amazon link
    '''
    all_reviews_div = response.xpath('//div[@data-hook="review"]')
    for review in range(0, len(all_reviews_div)):
        i = ItemLoader(AmazonreviewsItem(), all_reviews_div[review])
        # required items
        i.add_xpath('review_txt', './/*[@data-hook="review-body"]//text()', Join())
        i.add_xpath('rating', './/*[@data-hook="review-star-rating"]//text()', re='^[0-9]')
        # extra info fields
        i.add_value('url', response.url)
        i.add_value('project', self.settings.get('BOT_NAME'))
        i.add_value('spider', self.name)
        i.add_value('server', socket.gethostname())
        # using MapCompose for preprocessing items: converting the datetime
        # object to a string
        i.add_value('date', datetime.date.today(),
                    MapCompose(lambda x: x.strftime('%Y/%m/%d')))
        yield i.load_item()
    # identify the next button for looping review urls to extract more urls
    next_link = response.xpath('//*[@data-hook="pagination-bar"]//li[@class ="a-last"]//a/@href').extract_first()
    if next_link:
        next_link = response.urljoin(next_link)
        # next_link = urljoin('https://www.amazon.in', next_link)
        yield scrapy.Request(url=next_link, callback=self.parse, dont_filter=True)
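# The MapCompose passed to add_value above runs immediately, when the value
# is added. Its behavior is easy to verify standalone (a minimal check,
# separate from the spider):
import datetime
from itemloaders.processors import MapCompose  # scrapy.loader.processors in older Scrapy

fmt = MapCompose(lambda x: x.strftime('%Y/%m/%d'))
assert fmt([datetime.date(2020, 1, 31)]) == ['2020/01/31']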
def parse_question(self, response): """ 获取 详细的 item :param response: :return: """ question_id = response.meta.get('question_id') item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response) item_loader.add_css('title', '.QuestionHeader-title::text') item_loader.add_css('content', '.QuestionHeader-detail') item_loader.add_value('url', response.url) item_loader.add_value('question_id', question_id) item_loader.add_css('answer_nums', '.List-headerText span::text') item_loader.add_css('comment_nums', '.QuestionHeader-Comment button::text') item_loader.add_css('watch_user_nums', '.NumberBoard-itemValue::text') item_loader.add_css('topics', '.QuestionHeader-topics .Popover div::text') question_item = item_loader.load_item() yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers, callback=self.parse_answer) yield question_item
def parse_item(self, response):
    item_loader = ItemLoader(item=HouseRentingLianjiaItem(), response=response)
    item_loader.add_css(field_name='title', css='div.title *::text')
    item_loader.add_value(field_name='source', value=self.name)
    item_loader.add_css(field_name='author', css='div.brokerName > a.name::text')
    item_loader.add_css(field_name='image_urls', css='div.thumbnail > ul > li > img::attr(src)')
    item_loader.add_css(field_name='author_link', css='div.brokerName > a.name::attr(href)')
    item_loader.add_css(field_name='content', css='div.introduction *::text', re=r'\s*(.*)\s*')
    item_loader.add_value(field_name='source_url', value=response.url)
    item_loader.add_css(field_name='publish_time', css='div.zf-room > p::text')
    item_loader.add_css(field_name='price', css='div.price > span.total::text')
    item_loader.add_css(field_name='detail', css='div.zf-room *::text')
    yield item_loader.load_item()
def ads_parse(self, response):
    item = ItemLoader(AvitoRealEstateItem(), response)
    item.add_value('url', response.url)
    item.add_css('title', 'div.title-info-main h1.title-info-title span::text')
    item.add_xpath('photos', "//div[contains(@class, 'gallery-img-frame')]/@data-url")
    item.add_xpath('dpublic', "//div[@class='title-info-metadata-item-redesign']/text()")
    item.add_xpath('floor', "//span[text() = 'Этаж: ']/parent::li/text()[2]")
    item.add_xpath('floorcnt', "//span[text() = 'Этажей в доме: ']/parent::li/text()[2]")
    item.add_xpath('housetype', "//span[text() = 'Тип дома: ']/parent::li/text()[2]")
    item.add_xpath('roomcnt', "//span[text() = 'Количество комнат: ']/parent::li/text()[2]")
    item.add_xpath('square', "//span[text() = 'Общая площадь: ']/parent::li/text()[2]")
    item.add_xpath('kitchensquare', "//span[text() = 'Площадь кухни: ']/parent::li/text()[2]")
    item.add_xpath('buildyear', "//span[text() = 'Год постройки: ']/parent::li/text()[2]")
    item.add_xpath('authorname', '//div[@class="seller-info-name js-seller-info-name"]/a[1]/text()')
    item.add_xpath('authorurl', '//div[@class="seller-info-name js-seller-info-name"]/a[1]/@href')
    yield item.load_item()
def parse_torrent_page(self, response):
    torrent_item = response.meta['torrent_item']
    loader = ItemLoader(item=torrent_item, response=response)
    # Extract each value once and reuse it; the original repeated the same
    # CSS queries for the loader and for the torrents_list dict.
    title = str(response.css("h1 .breaker-breaker::text").get())
    description = response.css("#descript::text").get()
    link = "https://archive.org{}".format(
        response.css(".item-download-options div.format-group:nth-last-child(2) a::attr(href)").get())
    date = str(datetime.datetime.strptime(response.css("time::text").get(),
                                          "%B %d, %Y")).split(sep=" ")[0]
    if not link.endswith('.torrent'):
        return
    loader.add_value('title', title)
    loader.add_value('description', description)
    loader.add_value('link', link)
    loader.add_value('date', date)
    self.torrents_list.append({
        'title': title,
        'description': description,
        'link': link,
        'date': date,
    })
    yield loader.load_item()
def _parse_item(self, resp):
    il = ItemLoader(item=_ISArticleItem(), response=resp)
    il.add_value('url', resp.url)
    il.add_value('time', int(time.time()))
    il.add_xpath('title', '//article//h1//text()')
    il.add_xpath('ingress', '//section//article//p[contains(@class, "ingress")]//text()')
    pgraphs_xpath = '//article//p[contains(@class, "body")]'
    content = [
        ''.join(Selector(text=pgraph).xpath('//text()').getall())
        for pgraph in resp.xpath(pgraphs_xpath).getall()
    ]
    il.add_value('content', content)
    il.add_xpath('published', '//article//div[contains(@class, "timestamp")]//text()')
    il.add_xpath('author', '//article//div[contains(@itemprop, "author")]//text()')
    il.add_xpath('images', '//section//article//div[contains(@class, "clearing-container")]')
    return il.load_item()
def parse(self, response):
    games = response.xpath("//div[@id='search_resultsRows']/a")
    for game in games:
        loader = ItemLoader(item=SteamItem(), selector=game, response=response)
        loader.add_xpath('game_url', ".//@href")
        loader.add_xpath('img_url', ".//div[@class='col search_capsule']/img/@src")
        loader.add_xpath('game_name', ".//span[@class='title']/text()")
        # Must be relative ('.//') like the other fields; the original
        # absolute XPath picked up the release date of the first result for
        # every game.
        loader.add_xpath('release_date', ".//div[@class='col search_released responsive_secondrow']/text()")
        loader.add_xpath('platform', ".//span[contains(@class,'platform_img') or @class='vr_supported']/@class")
        loader.add_xpath('reviews_summary', ".//span[contains(@class, 'search_review_summary')]/@data-tooltip-html")
        loader.add_xpath('discount_rate', ".//div[@class='col search_discount responsive_secondrow']/span/text()")
        loader.add_xpath('original_price', ".//div[contains(@class, 'search_price_discount_combined')]")
        loader.add_xpath('discounted_price', ".//@href")
        yield loader.load_item()
    next_page = response.xpath("//a[@class='pagebtn' and text()='>']/@href").get()
    if next_page:
        yield scrapy.Request(url=next_page, callback=self.parse)
def parse_question(self, response):
    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css('title', '.QuestionHeader h1.QuestionHeader-title::text')
    item_loader.add_css('content', '.QuestionHeader-detail span')
    item_loader.add_value('url', response.url)
    match_obj = re.match(r'(.*www.zhihu.com/question/(\d+))', response.url)
    if match_obj:
        question_id = int(match_obj.group(2))
        item_loader.add_value('zhihu_id', question_id)
    item_loader.add_css('answer_num', 'h4.List-headerText span::text')
    item_loader.add_css('comments_num', '.QuestionHeader-Comment button::text')
    item_loader.add_css('watch_user_num', '.NumberBoard-value::text')
    item_loader.add_css('topics', '.QuestionHeader-topics .Popover div::text')
    question_item = item_loader.load_item()
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers,
                         callback=self.parse_answer,
                         dont_filter=True)
    yield question_item
def second_food_parse(self, response):
    item = response.meta['item']
    print('>>>: %s 连接成功' % response.status)
    print('------------------------------------第一层数据爬取已完成-----------------------------------')
    print('----------------------------------开始进行第二层的商品数据爬取-----------------------------------')
    print('>>> 爬取的商家名:', item['business_name'])
    print('>>> 爬取的商家URL: ', response.url)
    count = 0
    json_file = json.loads(response.text.encode('utf-8'))
    for index in json_file:
        for food in index.get('foods'):
            business_food = ItemLoader(item=MyElmBusinessFoodItem(), response=response)
            business_food.add_value('business_food', item['business_name'])
            business_food.add_value('food_name', food.get('name'))
            business_food.add_value('food_rating', food.get('rating'))
            business_food.add_value('food_month_sale', food.get('month_sales'))
            business_food.add_value('food_recent_rating', food.get('specfoods')[0].get('recent_rating'))
            business_food.add_value('food_price', food.get('specfoods')[0].get('price'))
            business_food.add_value('food_original_price', food.get('specfoods')[0].get('original_price'))
            yield business_food.load_item()
            count += 1
    print('----------成功抓取 %d 商品----------' % count)
    print('----------------------------------#######################-----------------------------------\n')
def parse_abs_page(self, response):
    """
    From arXiv abstract page, fetches:
    - submission date and time
    - all categories including cross-references
    """
    new = ItemLoader(item=ArxivItem(), response=response, parent=response.meta['item'])
    # all arXiv categories
    other_cat_full_cont = response.css('td[class*=subjects]').extract()[0].split('</span>;')
    if len(other_cat_full_cont) > 1:
        other_cats = other_cat_full_cont[1]
        other_cats_list = [x.strip('()') for x in re.findall(r'\(.*?\)', other_cats)]
    else:
        other_cats_list = []
    main_cat = re.findall(r'\(.*?\)',
                          response.css('div.metatable span::text').extract()[0])[0].strip('()')
    all_cats = [main_cat] + other_cats_list
    new.add_value('all_cat', all_cats)
    # submission date
    new.add_value('date', response.css('div.submission-history::text').extract()[-2])
    yield new.load_item()
def parse_item(self, response):
    l = ItemLoader(item=SpiderItem(), response=response)
    try:
        for attr in ['title', 'date', 'content']:
            function = getattr(self, 'get' + attr, None)
            if function:
                l.add_value(attr, function(response))
            else:
                self.logger.error('no method for %s' % attr)
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
    finally:
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
    return l.load_item()
def parse_post(self, response):
    new = ItemLoader(item=FbcrawlItem(), response=response, parent=response.meta['item'])
    new.context['lang'] = self.lang
    new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
    new.add_xpath('shared_from', '//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()')
    # new.add_xpath('date', '//div/div/abbr/text()')
    new.add_xpath('text', '//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
    # check reactions for old posts
    check_reactions = response.xpath("//a[contains(@href,'reaction/profile')]/div/div/text()").get()
    if not check_reactions:
        yield new.load_item()
    else:
        new.add_xpath('reactions', "//a[contains(@href,'reaction/profile')]/div/div/text()")
        reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
        reactions = response.urljoin(reactions[0].extract())
        yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item': new})
def parse_item(self, response):
    l = ItemLoader(item=SpiderItem(), response=response)
    try:
        l.add_value('title', response.xpath('//span[contains(@id, "zt")]/text()').extract_first() or '')
        l.add_value('title', response.xpath('//span[@id="show_bt"]/descendant-or-self::text()').extract_first() or '')
        l.add_value('title', response.xpath('//span[@class="show_bt"]/descendant-or-self::text()').extract_first() or '')
        date = response.xpath('//span[contains(@id, "sj")]/text()').re_first(r'\d+-\d+-\d+')
        if date is None:
            date = response.xpath('//div[contains(@class, "show_date")]/text()').re_first(r'\d+-\d+-\d+')
        date = (date if date else '1970-01-01') + ' 00:00:00'
        l.add_value('date', date)
        l.add_value('source', self.website)
        l.add_value('content', ''.join(response.xpath('//div[@id="Main1_txt"]/descendant-or-self::text()').extract()))
        l.add_value('content', ''.join(response.xpath('//span[@id="Main1_txt"]/descendant-or-self::text()').extract()))
        l.add_value('content', ''.join(response.xpath('//span[@id="txt"]/descendant-or-self::text()').extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
    finally:
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
    return l.load_item()
def parse_taobao_item(self, response):
    item_loader = ItemLoader(item=TaobaoGoodsDetailItem(), response=response)
    item_loader.add_value('goods_id', response.meta['goods_id'])
    item_loader.add_xpath('store_name', '//*[@id="J_ShopInfo"]/div/div[1]/div[1]/dl/dd/strong/a/text()')
    item_loader.add_xpath('original_price', '//*[@id="J_StrPrice"]/em[2]/text()')
    item_loader.add_xpath('actual_price', '//*[@id="J_PromoPriceNum"]/text()')
    item_loader.add_xpath('sales_volume', '//*[@id="J_Counter"]/div/div[2]/a/@title')
    item_loader.add_xpath('postage', '//*[@id="J_WlServiceTitle"]/text()')
    # The original called add_xpath('attribute', '') here; an empty XPath is
    # invalid and raises ValueError, so the call is left out until a real
    # expression is supplied.
    item_loader.add_xpath('details', '//*[@id="attributes"]/ul/li/text()')
    item_loader.add_xpath('score', '//*[@id="J_ShopInfo"]/div/div[2]/div/dl[1]/dd/a/text()')
    item_loader.add_xpath('score', '//*[@id="J_ShopInfo"]/div/div[2]/div/dl[2]/dd/a/text()')
    item_loader.add_xpath('score', '//*[@id="J_ShopInfo"]/div/div[2]/div/dl[3]/dd/a/text()')
    goodsDetailItem = item_loader.load_item()
    yield goodsDetailItem
def parse_page(self, response):
    # extract the json response from the tag:
    # <script id="__NEXT_DATA__" type="application/json">
    jsonresponse = json.loads(response.css('#__NEXT_DATA__::text').extract()[0])
    # access the historical data within the JSON object
    nestedJson = jsonresponse['props']['initialState']['cryptocurrency']['ohlcvHistorical']
    # retrieve the id of the crypto (a key value); renamed from 'id' to avoid
    # shadowing the builtin
    coin_id = [str(k) for k in nestedJson.keys()][0]
    # get the name of the respective crypto
    name = nestedJson[coin_id]['name']
    # save the ticker symbol
    ticker = nestedJson[coin_id]['symbol']
    # access the historical data: e.g. Open, Close, High, Low, etc.
    data = nestedJson[coin_id]['quotes']
    for d in data:
        loader = ItemLoader(item=CryptoItem())
        loader.default_input_processor = MapCompose(str)
        # (the original misspelled this attribute as "default_ouput_processor",
        # which silently had no effect)
        loader.default_output_processor = Join('')
        for (field, path) in self.jmes_paths.items():
            loader.add_value(field, SelectJmes(path)(d))
        loader.add_value("Name", name)
        loader.add_value("Ticker", ticker)
        yield loader.load_item()
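# The loop above maps item fields to JMESPath expressions via self.jmes_paths,
# which is defined elsewhere in the spider. A hypothetical mapping for
# CoinMarketCap-style OHLCV quote records, matching the field loop:
jmes_paths = {
    'Date': 'timeOpen',
    'Open': 'quote.open',
    'High': 'quote.high',
    'Low': 'quote.low',
    'Close': 'quote.close',
    'Volume': 'quote.volume',
}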
def parse_page(self, response, title, description, pubDate, author):
    x = response.xpath('//meta[contains(@property, "og:type")]//@content').extract_first()
    # The original tested `x in 'website'`, a substring check the wrong way
    # round; equality is what was intended.
    if x == 'website':
        file_id = hashlib.md5(title.encode('utf-8')).hexdigest()
        l = ItemLoader(item=NewsItem(), response=response)
        l.add_xpath('headline', '//meta[contains(@property, "og:title")]//@content')
        l.add_value('file_id', file_id)
        l.add_value('title', title)
        l.add_value('link', response.url)
        l.add_value('description', description)
        # l.add_value('author', '')
        l.add_xpath('author', '//*//a[contains(@href,"author")]//text()')
        l.add_xpath('content', '//div[contains(@class, "article-body")]//p//text()')
        l.add_value('pubDate', pubDate)
        l.add_value('source', 'thetimes')
        yield l.load_item()
    # (the original ended with a bare `next`, which does nothing; pages of
    # other og:types are simply skipped)
def parse_busi_art(self, res):
    tag = res.meta['tag']
    url = res.url
    main = res.css('.container.js-social-anchor-start')
    ci = ItemLoader(item=CNN(), selector=main)
    ci.add_value('tag', tag)
    ci.add_value('crawled_at', self.crawled_at)
    ci.add_value('url', url)
    ci.add_css('title', 'h1.article-title.speakable::text')
    ci.add_xpath('timestamp', './/span[@class="cnnDateStamp"]/text()')
    img_ = main.xpath('.//div[@id="storytext"]//img/@src').extract()
    ci.add_value('image_urls', img_)
    ci.add_css('summary', 'h2.speakable::text')
    ci.add_xpath('text', './/p/text()')
    ci.add_value('source', self.source)
    return ci.load_item()
def parse_product(self, response):
    # Item fields, for reference:
    # name = scrapy.Field(output_processor=TakeFirst())
    # category = scrapy.Field()
    # description = scrapy.Field()
    # specification = scrapy.Field()
    # price = scrapy.Field(output_processor=TakeFirst())
    # images = scrapy.Field()
    # image_urls = scrapy.Field()
    l = ItemLoader(item=ToystoresItem(), response=response)
    l.add_xpath('name', "normalize-space(//h1[@class='ui header pdp']/div[1]/text())")
    l.add_xpath('category', "normalize-space(//div[@class='ui breadcrumb mobile hidden'])")
    l.add_xpath('description', "normalize-space(//div[@id='productDescription'])")
    l.add_xpath('specification', "//div[@id='productSpecifications']//table//tr")
    l.add_xpath('price', "concat(normalize-space(//div[@class='ui price']/text()),normalize-space(//div[@class='ui price']/span/text()))")
    urls = response.xpath("//div[contains(@class,'pdp') and contains(@class,'thumb')]//img/@src").extract()
    urls = ['https://www.intertoys.nl' + url.replace('thumb', 'full') for url in urls]
    l.add_value('image_urls', urls)
    # referer = response.request.headers.get('Referer', '')
    # referer = referer.split('/')
    # l.add_value('category', type(referer))  # [referer[-3], referer[-2]]
    return l.load_item()
def parse_article(self, response):
    item = ItemLoader(Article())
    item.default_output_processor = TakeFirst()
    title = "".join(response.xpath('//h1//text()').getall())
    if title:
        title = title.strip()
    date = response.xpath('//span[@class="datum"]/text()').get()
    if date:
        date = date.strip()
    else:
        return
    content = response.xpath('//article//text()').getall()
    content = [text for text in content if text.strip()]
    content = "\n".join(content).strip()
    item.add_value('title', title)
    item.add_value('date', date)
    item.add_value('link', response.url)
    item.add_value('content', content)
    return item.load_item()
def parse(self, response):
    productos = response.css('div.product-tile-inner')
    for producto in productos:
        print(producto)
        detalles = producto.css('div.detail')
        if detalles:
            producto_loader = ItemLoader(
                item=ProductoFybeca(),  # the Item class to populate
                selector=producto       # default selector for this product
            )
            # keep only the first value instead of the list
            producto_loader.default_output_processor = TakeFirst()
            # item field name, then the CSS/XPath used to fill it
            producto_loader.add_css('titulo', 'a.name::text')
            producto_loader.add_xpath(
                'imagen',
                'div[contains(@class,"detail")]/a[contains(@class,"image")]/img[contains(@id,"gImg")]/@src'
            )
            yield producto_loader.load_item()
def parse(self, response):
    json_resp = json.loads(response.body)
    houses = json_resp.get('cat1').get('searchResults').get('listResults')
    for house in houses:
        loader = ItemLoader(item=ZillowItem())
        loader.add_value('id', house.get('id'))
        loader.add_value('image_urls', house.get('imgSrc'))
        loader.add_value('detail_url', house.get('detailUrl'))
        loader.add_value('status_type', house.get('statusType'))
        loader.add_value('status_text', house.get('statusText'))
        loader.add_value('price', house.get('price'))
        loader.add_value('address', house.get('address'))
        loader.add_value('beds', house.get('beds'))
        loader.add_value('baths', house.get('baths'))
        loader.add_value('area_sqft', house.get('area'))
        loader.add_value('latitude', house.get('latLong').get('latitude'))
        loader.add_value('longitude', house.get('latLong').get('longitude'))
        loader.add_value('broker_name', house.get('brokerName'))
        yield loader.load_item()
    current_page = response.meta['currentPage']
    total_pages = json_resp.get('cat1').get('searchList').get('totalPages')
    # Stop at the last page; the original `<=` requested one page past the end.
    if current_page < total_pages:
        nxt_pg = current_page + 1
        yield scrapy.Request(
            url=parse_new_url(URL, pg_num=nxt_pg),
            callback=self.parse,
            cookies=cookie_parser(),
            meta={'currentPage': nxt_pg}
        )
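# parse_new_url and cookie_parser above are project helpers that are not part
# of this snippet. Illustrative stand-ins might look like this (the page
# parameter name and cookie keys are guesses):
def parse_new_url(base_url, pg_num):
    # Append the requested results-page number to the search URL.
    return '{}&currentPage={}'.format(base_url, pg_num)

def cookie_parser():
    # Return the session cookies captured from a logged-in browser.
    return {'zguid': '...', 'JSESSIONID': '...'}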
def parse_question(self, response):
    # Process the question page and extract the concrete question item from it.
    zhihu_id = response.meta.get("zhihu_id", "")
    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css("title", "h1.QuestionHeader-title::text")
    item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    item_loader.add_css("content", ".QuestionHeader-detail")
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", zhihu_id)
    item_loader.add_css("answer_num", ".List-headerText span::text")
    item_loader.add_css("comments_num", ".QuestionHeaderActions button::text")
    item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
    question_item = item_loader.load_item()
    yield scrapy.Request(self.start_answer_urls.format(zhihu_id, 20, 0),
                         headers=self.header,
                         callback=self.parse_answer)
    yield question_item
def parse_item(self, response):
    loader = ItemLoader(item=SephoraScrapyProjectItem(), response=response)
    loader.default_output_processor = TakeFirst()
    loader.add_xpath('brand_name',
                     "//h1[@data-comp='DisplayName Flex Box']"
                     "//span[@class='css-euydo4']//text()")
    loader.add_xpath('item_name',
                     "//h1[@data-comp='DisplayName Flex Box']"
                     "//span[@class='css-0']//text()")
    loader.add_xpath('price', "//div[@data-comp='Price Box']//text()")
    loader.add_xpath('category', "//a[@class='css-1ylrown ']//text()")
    loader.add_xpath('subcategory', "//a[@class='css-1ylrown ']//text()")
    loader.add_xpath('subsubcategory', "//h1[@class='css-bnsadm ']//text()")
    details_xpath = self.get_detail_and_ingredient_xpath(response, 'Details')
    if details_xpath:
        loader.add_xpath('details', details_xpath)
    ingredient_xpath = self.get_detail_and_ingredient_xpath(response, 'Ingredients')
    if ingredient_xpath:
        loader.add_xpath('ingredients', ingredient_xpath)
    image_url = self.get_image_url(response)
    loader.add_value('image_url', image_url)
    yield loader.load_item()
def parse_item(self, response): """Parse a page with an apartment. @url http://www.merkur-berlin.de/?page_id=39&showExpose=1&exposeID=926C081BECA043C9BE7756469D94722F @returns items 1 1 @scrapes url title address rooms size warm_rent description location """ self.shutdown_on_error() item = ItemLoader(ApartmentItem(), response=response) item.add_value('url', response.url) item.add_xpath('title', '//h4[@class="entry-title"]/text()') item.add_xpath('address', '//address/text()') for field, info in dict(rooms='Rooms', size='AreaLiving', warm_rent='PriceWarmmiete', cold_rent='Price').items(): item.add_xpath(field, '//div[@class="infotables"]//tr[@id="infotable_{info}"]/td[@class=' '"infotable_value"]/text()'.format(info=info)) for field, h2 in dict(description='Objekt', equipment='Ausstattung', location='Lage', other='Mehr Angebote').items(): item.add_xpath(field, '//div[@class="infoblock"]/h2[starts-with(normalize-space(.),' ' "{h2}")]/following-sibling::p/text()'.format(h2=h2)) return item.load_item()
def parse(self, response):
    sel = Selector(response)
    noticias = sel.xpath('//div[@class="view-content"]/div[@class="posts"]')
    for i, elem in enumerate(noticias):
        item = ItemLoader(Noticia(), elem)  # load my item
        # Fill the item via XPath expressions
        item.add_xpath('titular', './/h2/a/text()')
        item.add_xpath('descripcion', './/p/text()')
        item.add_value('id', i)
        yield item.load_item()  # return the filled item

    # METHOD #2: USING BEAUTIFUL SOUP
    # soup = BeautifulSoup(response.body)
    # contenedor_noticias = soup.find_all(class_='view-content')
    # id = 0
    # for contenedor in contenedor_noticias:
    #     noticias = contenedor.find_all(class_='posts', recursive=False)
    #     for noticia in noticias:
    #         item = ItemLoader(Noticia(), response.body)
    #         titular = noticia.find('h2').text.replace('\n', '').replace('\r', '')
    #         descripcion = noticia.find('p')
    #         if descripcion:
    #             item.add_value('descripcion', descripcion.text.replace('\n', '').replace('\r', ''))
    #         else:
    #             item.add_value('descripcion', 'N/A')
    #         item.add_value('titular', titular)
    #         item.add_value('id', id)
    #         id += 1
    #         yield item.load_item()

    # EXECUTION:
    # scrapy runspider 4_eluniverso.py -o resultados.csv -t csv
def parse_article(self, response):
    item = ItemLoader(Article())
    item.default_output_processor = TakeFirst()
    title = response.xpath('//h1[@class="page-title"]/text()').get()
    if title:
        title = title.strip()
    date = response.xpath('//span[@class="date"]/text()').get()
    if date:
        date = datetime.strptime(date.strip(), '%d/%m/%Y')
        date = date.strftime('%Y/%m/%d')
    content = response.xpath('//span[@class="description"]//text()').getall()
    content = [text for text in content if text.strip()]
    content = "\n".join(content).strip()
    item.add_value('title', title)
    item.add_value('date', date)
    item.add_value('link', response.url)
    item.add_value('content', content)
    return item.load_item()
def collectDetailInfo(self, response: scrapy.http.Response):
    loader = ItemLoader(item=BuddhacrawlerItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('hostUrl', self.host)
    loader.add_value('city', self.city)
    loader.add_xpath('articleTitle', "//div[@class='newsBox']/div[@class='bt']/h2[@class='bt1']/text()")
    loader.add_xpath('articleTag', "//div[@class='newslink']/a/text()")
    loader.add_xpath('articleText', "//div[@class='newsBox']/div[@class='newsCon']")
    loader.add_xpath('publishTime', "//div[@class='newsBox']/div[@class='bt']/div[@class='bt2']/text()")
    loader.add_value('coverPictureUrl', '')
    loader.add_xpath('articlePictureUrls', "//div[@class='newsBox']/div[@class='newsCon']/div[@class='pgc-img']/img/@src")
    loader.add_value('articleVideoUrls', [])
    loader.add_value('createTime', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    return loader.load_item()