def parse_item(self, response):
    request_again = self.error_handler(response)
    if request_again:
        return request_again
    il = ItemLoader(item=UKBusinessItem(), response=response)
    # From the OG section at the top
    il.add_xpath('name', '//meta[@property="og:title"]/@content')
    il.add_xpath('url', '//meta[@property="og:url"]/@content')
    il.add_xpath('latitude', '//meta[@property="og:latitude"]/@content')
    il.add_xpath('longitude', '//meta[@property="og:longitude"]/@content')
    il.add_xpath('address', '//meta[@property="og:street-address"]/@content')
    il.add_xpath('locality', '//meta[@property="og:locality"]/@content')
    il.add_xpath('postal_code', '//meta[@property="og:postal-code"]/@content')
    il.add_xpath('country', '//meta[@property="og:country-name"]/@content')
    # XPaths below are from the display
    il.add_xpath('name', '//span[@class="busname"]/text()')  # No OG for this
    il.add_xpath('phone_number', '//span[@class="bustel"]/text()')
    il.add_xpath('website', '//a[@id="linkWebsite"]/@href')
    il.add_xpath('address', '//span[@data-yext="address.address"]/text()')
    il.add_xpath('locality', '//span[@itemprop="addressLocality"]/text()')
    il.add_xpath('postal_code', '//span[@itemprop="postalCode"]/text()')
    # Unicoded so it can share an input processor with the rest
    il.add_value('url', unicode(response.url))
    return il.load_item()
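# -- Note: parse_item above calls self.error_handler(response), which is not
# included in this corpus. A minimal sketch of what such a handler might look
# like, assuming it re-queues soft failures; the status checks below are an
# assumption, not the original implementation:
def error_handler(self, response):
    # Hypothetical: retry pages that returned a server error or an empty body.
    if response.status in (500, 502, 503) or not response.body:
        return response.request.replace(dont_filter=True)
    return None  # no retry needed; parse_item continues normally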
def parse_item(self, response):
    l = ItemLoader(item=MeizituItem(), response=response)
    l.add_xpath('name', '//h2/a/text()')
    l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p")
    l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
    l.add_value('url', response.url)
    return l.load_item()
def parse(self, response):
    l = ItemLoader(item=JianshuArticleItem(), response=response)
    l.add_xpath('content', '//div[@class="article"]/div[@class="show-content"]/p/text()')
    l.add_value('url', response.url)
    return l.load_item()
def parse_item(self, response): sel = Selector(response) # collect xpaths of each player (row in table) rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr') # loop through players for row in rows: loader = ItemLoader(SkatRTSItem(), selector=row) loader.default_input_processor = MapCompose() loader.default_output_processor = Join() # get unique NHL ID number from player's page URL num = row.xpath("td[2]/a/@href").extract() sNum = num[0][-7:] loader.add_value("nhl_num", sNum) # add season data loader.add_value("season", str(self.year)) # collect stats loader.add_xpath("hits", ".//td[6]/text()") loader.add_xpath("blocked_shots", ".//td[7]/text()") loader.add_xpath("missed_shots", ".//td[8]/text()") loader.add_xpath("giveaways", ".//td[9]/text()") loader.add_xpath("takeaways", ".//td[10]/text()") loader.add_xpath("faceoff_wins", ".//td[11]/text()") loader.add_xpath("faceoff_losses", ".//td[12]/text()") # feed item to pipeline yield loader.load_item()
def parse_item(self, response):
    # relies on: from functools import partial; from urlparse import urljoin;
    # from w3lib.html import remove_tags
    loader = ItemLoader(ChinazItem(), response)
    loader.add_value("url", response.url)
    loader.add_xpath("name", u'//span[@id="spanwillchuanwebName"]/following-sibling::text()')
    loader.add_xpath("domain", u'//a[@id="linkUrl"]/text()')
    loader.add_xpath("homepage", u'//a[@id="linkUrl"]/@href')
    loader.add_xpath("founded", u'//span[.="建站时间:"]/following-sibling::text()')
    loader.add_xpath("company", u'//span[.="网站所属:"]/following-sibling::text()')
    loader.add_xpath("location", u'//span[.="所属地区:"]/following-sibling::a//text()')
    loader.add_xpath("founder", u'//span[.="创始人/团队:"]/following-sibling::text()')
    loader.add_xpath("categories", u'//span[.="网站类型:"]/following-sibling::a//text()')
    loader.add_xpath("rating", u'//td[b="用户评分:"]/following-sibling::td/img/@src', re=r"star_(\d)")
    loader.add_xpath("keywords", u'//td[starts-with(b, "关 键 词")]/following-sibling::td/a/text()')
    loader.add_xpath("brief", u'//td[b="网站简介:"]/following-sibling::td/text()')
    loader.add_xpath("alexa_rank", u'//span[.="Alexa排名:"]/following-sibling::text()')
    loader.add_xpath("baidu_weight", u'//td[.="百度权重:"]/following-sibling::td/img/@alt')
    loader.add_xpath("google_pagerank", u'//td[.="PR值:"]/following-sibling::td/img/@alt')
    loader.add_xpath("chinaz_rank", u'//td[@class="scored"]/span/text()')
    loader.add_xpath("backlink_num", u'//span[.="网站反链数: "]/following-sibling::text()')
    loader.add_xpath("keyword_num", u'//a[@id="tdgjcs"]/text()')
    loader.add_xpath("domain_birth", u'//span[.="域名年限:"]/following-sibling::text()', re=r":([0-9-]+)\)")
    loader.add_xpath("baidu_idx_num", u'//span[.="百度收录:"]/following-sibling::text()')
    loader.add_xpath("google_idx_num", u'//span[.="谷歌收录:"]/following-sibling::text()')
    loader.add_xpath("sogou_idx_num", u'//span[.="搜狗收录:"]/following-sibling::text()')
    loader.add_xpath(
        "introduction",
        u'//div[h3="公司简介"]/following-sibling::div[1]',
        MapCompose(Compose(remove_tags, unicode.strip)),
    )
    loader.add_xpath("snapshot", u"//figure/img/@src",
                     MapCompose(partial(urljoin, "http://top.chinaz.com/")))
    return loader.load_item()
def parse_item(self, response): sel = Selector(response) # collect xpaths of each player (row in table) rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr') for row in rows: loader = ItemLoader(GoalSTItem(), selector=row) loader.default_input_processor = MapCompose() loader.default_output_processor = Join() # get unique NHL ID number from player's page URL num = row.xpath('td[2]/a/@href').extract() sNum = num[0][-7:] loader.add_value('nhl_num', sNum) # add season data loader.add_value('season', str(self.year)) # collect additional stats loader.add_xpath('es_shots_against', './/td[6]/text()') loader.add_xpath('es_goals_against', './/td[7]/text()') loader.add_xpath('es_saves', './/td[8]/text()') loader.add_xpath('es_save_pct', './/td[9]/text()') loader.add_xpath('pp_shots_against', './/td[10]/text()') loader.add_xpath('pp_goals_against', './/td[11]/text()') loader.add_xpath('pp_saves', './/td[12]/text()') loader.add_xpath('pp_save_pct', './/td[13]/text()') loader.add_xpath('sh_shots_against', './/td[14]/text()') loader.add_xpath('sh_goals_against', './/td[15]/text()') loader.add_xpath('sh_saves', './/td[16]/text()') loader.add_xpath('sh_save_pct', './/td[17]/text()') # feed item to pipeline yield loader.load_item()
def parse_by_product(self, response):
    """
    For the 'Bundles' category, grab the product details
    for the first product listed.
    """
    self.selector = Selector(response)
    self.results = self.selector.xpath('//*[@id="ctl00_tdMainPanel"]')
    loader = ItemLoader(item=VisionsProduct(), selector=self.results[0])
    self.field_xpaths = {
        'product': ('div[contains(@class, "catalogueTitle")]'
                    '/h3/text()'),
        'price': ('div[@id="ctl00_ContentPlaceHolder1_pnl'
                  'Bundle"]/div[@id="divProductDetails"]/div'
                  '[contains(@class, "priceAddToCart")]/div[1]/span'
                  '[contains(@id, "SalePrice")]/text()')
    }
    # Extract and load product details
    loader.add_xpath('product', self.field_xpaths['product'])
    loader.add_xpath('price', self.field_xpaths['price'],
                     re=r'\$[\d]*[,]*[\d]*\.[\d]*')
    loader.add_value('availability', 'Not Limited/Clearance Item')
    # Because it's an individual product page, manually set the category
    self.category = '/'.join(['Home', response.url.split('/')[4]])
    loader.add_value('category', self.category)
    yield loader.load_item()
def parse_item(self, response): sel = Selector(response) # collect xpaths of each player (row in table) rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr') # loop through players for row in rows: loader = ItemLoader(SkatSOItem(), selector=row) loader.default_input_processor = MapCompose() loader.default_output_processor = Join() # get unique NHL ID number from player's page URL num = row.xpath("td[2]/a/@href").extract() sNum = num[0][-7:] loader.add_value("nhl_num", sNum) # add season data loader.add_value("season", str(self.year)) # collect stats loader.add_xpath("so_shots", ".//td[13]/text()") loader.add_xpath("so_goals", ".//td[14]/text()") loader.add_xpath("so_pct", ".//td[15]/text()") loader.add_xpath("game_deciding_goals", ".//td[16]/text()") # feed item to pipeline yield loader.load_item()
def get_new(self, response): sel = Selector(response) il = ItemLoader(item=New()) il.add_value('tema', ['Marketing y Publicidad']) il.add_value('titulo', sel.xpath('//h1[@class="glr-post-title glr-mb-10"]/text()').extract()) il.add_value('texto', sel.xpath('//div[@class="glr-post-entry"]').extract()) il.add_value('fecha', sel.xpath('//span[@class="glr-left glr-post-date"]/text()').extract()) il.add_value('keywords', sel.xpath('//div[@class="post-tags"]//a/text()').extract()) item = il.load_item() if 'keywords' in item: pass else: item['keywords'] = ['Marketing y Publicidad'] if 'fecha' in item: item['fecha'] = self.parse_date(item['fecha']) else: item['fecha'] = '10/05/2015' if 'titulo' in item: if 'texto' in item: yield item ''' item = New() item['tema'] = 'Marketing y Publicidad' item['titulo'] = self.parse_html(sel.xpath('//h1[@class="glr-post-title glr-mb-10"]/text()').extract()[0].strip()) item['texto'] = self.parse_html(sel.xpath('//div[@class="glr-post-entry"]').extract()[0].strip()) item['fecha'] = self.parse_date(sel.xpath('//span[@class="glr-left glr-post-date"]/text()').extract()[0].strip()) ''' #yield item '''res = []
def get_new(self, response):
    sel = Selector(response)
    il = ItemLoader(item=New())
    il.add_value('tema', ['Marketing y Publicidad'])
    il.add_value('titulo', sel.xpath('//h1/text()').extract())
    il.add_value('texto', sel.xpath('//div[contains(@class,"post-detalle")]').extract())
    il.add_value('fecha', sel.xpath('//p[@itemprop="datePublished"]/text()').extract())
    il.add_value('keywords', sel.xpath('//div[contains(@class,"tags")]/a/text()').extract())
    item = il.load_item()
    if 'titulo' not in item:
        # the original printed item['titulo'] here, which raises KeyError
        # when the field is missing; log the page instead
        self.log('missing titulo/texto in %s' % response.url)
    if 'keywords' not in item:
        item['keywords'] = ['Marketing y Publicidad']
    if 'fecha' in item:
        item['fecha'] = self.parse_date(item['fecha'])
    else:
        item['fecha'] = '10/05/2015'
    if 'titulo' in item and 'texto' in item:
        yield item
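# -- Note: both get_new variants above call self.parse_date, which is not
# shown in this corpus. A minimal sketch, assuming the dates arrive as
# 'DD/MM/YYYY' strings (the format and the list handling are guesses):
def parse_date(self, value):
    from datetime import datetime
    raw = value[0] if isinstance(value, list) else value
    try:
        return datetime.strptime(raw.strip(), '%d/%m/%Y').strftime('%d/%m/%Y')
    except ValueError:
        return raw.strip()  # keep the original text when the format differs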
def parse_item(self, response):
    # parse pages like http://www.meizitu.com/a/5336.html and pull the image URLs
    l = ItemLoader(item=MeizituItem(), response=response)
    l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
    l.add_value('url', response.url)
    return l.load_item()
def process_row(self, row, task):
    stats = self.crawler.stats
    l = ItemLoader(WV_DrillingPermit())
    l.add_value(None, row)
    item = l.load_item()
    if item['API'] and item['permit_activity_type'] and item['permit_activity_date']:
        existing_item = self.db.loadItem(item, {
            'API': item['API'],
            'permit_activity_type': item['permit_activity_type'],
            'permit_activity_date': item['permit_activity_date'],
        })
        if existing_item:
            stats.inc_value('_existing_count', spider=self)
        else:
            stats.inc_value('_new_count', spider=self)
            yield item
            dt = datetime.strptime(item['permit_activity_date'], '%Y-%m-%d %H:%M:%S')
            # if item['permit_activity_type'] in ('Permit Issued', 'Permit Commenced', 'Permit Completed'):
            if (item['permit_activity_type'] in ('Permit Issued', 'Permits Issued')
                    and datetime.now() - dt < timedelta(days=365)):
                for item in self.create_feed_entry(item, task):
                    yield item
def parsePage(self, response):
    rentHouse = ItemLoader(item=RentItem(), response=response)
    rentHouse.add_value('id', self.name + '-' + response.url.split('/')[-1].split('.')[0])
    rentHouse.add_value('link', response.url)
    rentHouse.add_xpath('title', "//dl[@class = 'title']/dt/p/text()")
    return rentHouse.load_item()
def parse_review(self, response):
    sel = Selector(response)
    if not self._is_right_category(sel):
        self.log('Skip URL: %s' % response.url, level=log.INFO)
        return
    self.log('Parse URL: %s' % response.url, level=log.INFO)
    loader = ItemLoader(item=YelpReview(), selector=sel)
    loader.add_value('crawl_date', '%s' % datetime.utcnow())
    loader.add_value('page_url', response.url)
    # Loop over all the fields we need to extract.
    for field, selector in self._item_selectors.iteritems():
        loader.add_xpath(field, selector)
    master_review = loader.load_item()
    review_selectors = sel.xpath('//div[contains(@class, "review")][@itemprop="review"]')
    for rev_sel in review_selectors:
        review_loader = ItemLoader(item=master_review.copy(), selector=rev_sel)
        for field, selector in self._review_selectors.iteritems():
            review_loader.add_xpath(field, selector)
        yield review_loader.load_item()
    return
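# -- Note: parse_review above reads two selector maps, _item_selectors and
# _review_selectors, that live on the spider class but are not part of this
# corpus. Their expected shape, with purely illustrative xpaths (not the
# original's):
_item_selectors = {
    'business_name': '//h1[@itemprop="name"]/text()',
}
_review_selectors = {
    'rating': './/meta[@itemprop="ratingValue"]/@content',
    'review_text': './/p[@itemprop="description"]//text()',
}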
def parse_item(self, response): sel = Selector(response) # collect xpaths of each player (row in table) rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr') # prepare to adjust for shootout stats if necessary shootout = 0 if self.year > 2005: shootout = 1 # loop through players for row in rows: loader = ItemLoader(SkatEngItem(), selector=row) loader.default_input_processor = MapCompose() loader.default_output_processor = Join() # get unique NHL ID number from player's page URL num = row.xpath("td[2]/a/@href").extract() sNum = num[0][-7:] loader.add_value("nhl_num", sNum) # add season data loader.add_value("season", str(self.year)) # collect stats if shootout: loader.add_xpath("en_goals", ".//td[20]/text()") loader.add_xpath("ps_goals", ".//td[21]/text()") else: loader.add_xpath("en_goals", ".//td[21]/text()") loader.add_xpath("ps_goals", ".//td[22]/text()") # feed item to pipeline yield loader.load_item()
def parse(self, response):
    sel = Selector(response)
    # collect xpaths of each team (row in table)
    rows = sel.xpath('/html//div[@class="contentBlock"]/table/tbody/tr')
    # loop through teams
    for row in rows:
        loader = ItemLoader(StandingsItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get team identifier
        team = row.xpath('td[2]/a[1]/@rel').extract()
        loader.add_value('team', team)
        # collect several other data points
        loader.add_xpath('division', './/td[3]/text()')
        loader.add_xpath('games_played', './/td[4]/text()')
        loader.add_xpath('wins', './/td[5]/text()')
        loader.add_xpath('losses', './/td[6]/text()')
        loader.add_xpath('ot_losses', './/td[7]/text()')
        loader.add_xpath('points', './/td[8]/text()')
        loader.add_xpath('row', './/td[9]/text()')
        # feed item to pipeline
        yield loader.load_item()
def parse_product(self, response):
    p = ItemLoader(item=Product(), response=response)
    p.add_css('nome', 'h1.livedata::text')
    p.add_value('url', response.url)
    p.add_css('descricaoLonga', '.desc-info')
    p.add_css('image', 'div.container-product-image a.image-link > img',
              re='src=[\"|\'](?P<src>[^\"\']+)[\"|\']')
    p.add_css('categorias', 'span[itemprop=title]::text')
    yield p.load_item()
    # run in mongo:
    # db.produto.remove({'categorias.0': {$exists: false}})
    # db.produto.remove({'categorias.0': {$nin: [' Games', ' Livros', ' DVDs e Blu-ray']}})
    # delete duplicated products:
    # var duplicates = [];
    # db.produto_novo.aggregate([
    #     {"$group": {"_id": "$nome", "count": {"$sum": 1}, "dups": {"$addToSet": "$_id"}}},
    #     {"$match": {"_id": {"$ne": null}, "count": {"$gt": 1}}}
    # ], {allowDiskUse: true}, {cursor: {}}
    # ).result.forEach(function(doc) {
    #     doc.dups.shift();
    #     doc.dups.forEach(function(dupId) {
    #         duplicates.push(dupId);
    #     })
    # })
    # printjson(duplicates);
    # db.produto_novo.remove({_id: {$in: duplicates}})
def parse_detail(self, response):
    url = response.url
    item = ItemLoader(item=MeizituItem(), response=response)
    item.add_xpath("title", "//h2/a/text()")
    item.add_xpath("image_urls", '//div[@id="picture"]//img/@src')
    item.add_value('url', url)
    return item.load_item()
def get_product_details(self, response):
    crumbs = self.get_breadcrumbs(response)
    loader = ItemLoader(item=VisionsProduct())
    loader.add_value('breadcrumbs', crumbs)
    loader.add_value('url', response.url)
    if isinstance(crumbs, basestring):
        loader.add_value('category', crumbs)
    # Ensure we aren't wasting time extracting from an empty page
    if extract_helper(response, self.EMPTY_PAGE_CHECK):
        for d in self.PRODUCT_DETAILS:
            if '_' not in d.name:  # Don't load price
                loader.add_value(d.name, 'N/A')
    else:
        productDetails = detailsRunner(self.PRODUCT_DETAILS, response=response)
        if not productDetails['price']:
            productDetails['price'] = productDetails['price_gif']
        productDetails.pop('price_gif')
        # Fix truncated image urls
        if productDetails['image']:
            productDetails['image'] = add_schema(response.url, productDetails['image'])
        for d in productDetails:
            loader.add_value(d, productDetails[d])
    yield loader.load_item()
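# -- Note: get_product_details above leans on helpers (get_breadcrumbs,
# extract_helper, detailsRunner, add_schema) defined elsewhere. add_schema is
# used to repair truncated image URLs; a minimal sketch under the assumption
# that it simply resolves the URL against the current page:
def add_schema(page_url, image_url):
    from urlparse import urljoin  # Python 2, matching the rest of the corpus
    # Hypothetical: handles scheme-relative ('//cdn...') and relative paths alike.
    return urljoin(page_url, image_url)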
def parse_course_item(self, response):
    url_obj = urlparse(response.url)
    l = ItemLoader(item=CourseItem(), response=response)
    l.default_input_processor = MapCompose(unicode.strip)
    l.default_output_processor = TakeFirst()
    l.add_xpath('code', "/html/head/meta[@name='DC.Subject.ProgramCode']/@content")
    l.add_xpath('name', "/html/head/meta[@name='DC.Subject.Description.Short']/@content")
    l.add_xpath('career', "/html/head/meta[@name='DC.Subject.Level']/@content")
    # per-field input processors set as attributes on the loader instance
    l.year_in = Identity()
    l.add_value('year', ppath.basename(ppath.dirname(url_obj.path)))
    l.add_value('src_url', unicode(response.url))
    l.add_xpath('uoc', "/html/head/meta[@name='DC.Subject.UOC']/@content")
    l.gened_in = MapCompose(unicode.strip, lambda s: s == 'Y')
    l.add_xpath('gened', "/html/head/meta[@name='DC.Subject.GenED']/@content")
    l.add_xpath('faculty', "/html/head/meta[@name='DC.Subject.Faculty']/@content")
    l.add_xpath('school', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[strong[text()[contains(.,'School')]]]/a/text()"))
    l.add_xpath('campus', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[strong[text()[contains(.,'Campus')]]]/text()"))
    l.add_xpath('prereqs_str', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[text()[contains(.,'Prerequisite:')]]/text()"),
        re=r'Prerequisite:\s(.+)')
    l.add_xpath('eftsl', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[strong[text()[contains(.,'EFTSL')]]]/text()"))
    l.add_xpath('description_markup', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/h2[text()='Description']/following-sibling::div"))
    course_item = l.load_item()
    yield course_item
    yield Request(
        url=response.xpath((
            "//div[@class='column content-col']/div[@class='internalContentWrapper']"
            "/div[@class='summary']//a[text()[contains(.,'Timetable')]]/@href")).extract()[0],
        callback=self.parse_class_item,
        meta=dict(course_identifier={k: course_item.get(k, None)
                                     for k in ('code', 'career', 'year')}))
def parse_content(self, response):
    bbs_item_loader = ItemLoader(item=TutorialItem(), response=response)
    url = response.url
    html = response.body
    bbs_item_loader.add_value('url', url)
    bbs_item_loader.add_value('html', html)
    return bbs_item_loader.load_item()
def parse(self, response):
    item_list = []
    for a in response.css(".menu_box .menu_main h2"):
        l = ItemLoader(item=CategoryItem(), response=response)
        # l.add_css('category', ".menu_box .menu_main h2")
        l.add_value("category", a.extract(), self.get_text)
        item_list.append(l.load_item())
    return item_list
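# -- Note: the category loader above passes self.get_text as an ad-hoc input
# processor for the extracted <h2> markup. The helper is not shown here; a
# plausible sketch, assuming it just strips the tags (w3lib's remove_tags is
# my choice, not necessarily the original's):
def get_text(self, value):
    from w3lib.html import remove_tags
    return remove_tags(value).strip()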
def parse_item(self, response):
    l = ItemLoader(item=MeizituItem(), response=response)
    # l.add_xpath('name', '//div[@class="postContent"]/div[@id="picture"]/p/a/text()')
    # l.add_xpath('tags', '//div[@class="postContent"]')
    l.add_xpath('img_url', '//div[@class="text"]/p/br/img/@src', Identity())
    l.add_value('url', response.url)
    return l.load_item()
def parse_item(self, response):
    l = ItemLoader(item=CrawlpictureItem(), response=response)
    l.add_xpath('name', '//h2/a/text()')
    l.add_css('tags', 'div.metaRight p::text')
    # l.add_xpath('image_urls', '//div[@id="picture"]/p/img/@src' or '//img[@class="scrollLoading"]/@src', Identity())
    l.add_css('image_urls', 'div.postContent img::attr(src)', Identity())
    l.add_value('url', response.url)
    return l.load_item()
def parse_content(self, response): bbsItem_loader = ItemLoader(item=BbsItem(), response=response) url = str(response.url) bbsItem_loader.add_value("url", url) bbsItem_loader.add_xpath("forum", self._x_query["forum"]) bbsItem_loader.add_xpath("poster", self._x_query["poster"]) bbsItem_loader.add_xpath("content", self._x_query["page_content"]) return bbsItem_loader.load_item()
def parse_item(self, response): l = ItemLoader(item=CoserItem(), response=response) l.add_xpath('name', "//h1[@class='js-post-title']/text()") l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()") urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src') urls = [url.replace('/w650', '') for url in urls] l.add_value('image_urls', urls) l.add_value('url', response.url) return l.load_item()
def parse(self, response):
    sel = Selector(response)
    articulos = sel.xpath('/html/body/div[2]/div/div/div/div[1]/div[3]/div')
    for i, elem in enumerate(articulos):
        item = ItemLoader(Articulos(), elem)
        item.add_xpath('title', './/h3/text()')
        item.add_value('id', i)
        yield item.load_item()
def parse_content(self, response):
    bbsItem_loader = ItemLoader(item=ScrapyspiderItem(), response=response)
    url = str(response.url)
    bbsItem_loader.add_value('url', url)
    bbsItem_loader.add_xpath('forum', self._x_query['forum'])
    bbsItem_loader.add_xpath('poster', self._x_query['poster'])
    bbsItem_loader.add_xpath('content', self._x_query['page_content'])
    return bbsItem_loader.load_item()
def test_load_item_using_default_loader(self):
    i = TestItem()
    i['summary'] = u'lala'
    il = ItemLoader(item=i)
    il.add_value('name', u'marta')
    item = il.load_item()
    assert item is i
    self.assertEqual(item['summary'], u'lala')
    self.assertEqual(item['name'], [u'marta'])
def parse_item(self, response): self.logger.info("parse_item url %s" % response.url) l = ItemLoader(item=ImgDownloadItem(), response=response) l.add_xpath('name', '//h1[@class="article-title"]/a/text()') # l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p") l.add_xpath('image_urls', "//article[@class='article-content']/p/img/@src", Identity()) l.add_value('url', response.url) return l.load_item()
def parse(self, response):
    l = ItemLoader(item=CoserItem(), response=response)
    l.add_xpath('name', "//h1[@class='js-post-title']/text()")
    l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
    urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
    urls = [url.replace('/w650', '') for url in urls]
    l.add_value('image_urls', urls)
    l.add_value('url', response.url)
    return l.load_item()
def test_load_item_using_default_loader(self): i = TestItem() i["summary"] = u"lala" il = ItemLoader(item=i) il.add_value("name", u"marta") item = il.load_item() assert item is i self.assertEqual(item["summary"], u"lala") self.assertEqual(item["name"], [u"marta"])
def parse_content(self, response):
    bbsItem_loader = ItemLoader(item=BbsItem(), response=response)
    url = str(response.url)
    bbsItem_loader.add_value('url', url)
    bbsItem_loader.add_xpath('forum', self._x_query['forum'])
    bbsItem_loader.add_xpath('poster', self._x_query['poster'])
    bbsItem_loader.add_xpath('content', self._x_query['page_content'])
    return bbsItem_loader.load_item()
def parse2(self, response):
    data = json.loads(response.body_as_unicode())
    for entry in data['list']:
        loader = ItemLoader(item=XqtestItem())
        loader.add_value('title', entry['data'])
        org = loader.load_item()
        yield org
def parse_product(self, response):
    p = ItemLoader(item=Product(), response=response)
    p.add_css('nome', 'h1 > span[itemprop=name]::text')
    p.add_value('url', response.url)
    p.add_css('descricaoLongaHtml', '.infoProdBox')
    p.add_css('descricaoLonga', '.infoProdBox')
    # p.add_css('detalhes', '.ficha-tecnica table tr th::text, .ficha-tecnica table tr td::text')
    p.add_css('image', 'ul.a-carousel-list > li > img', re='src=[\"|\'](?P<src>[^\"\']+)[\"|\']')
    p.add_css('categorias', 'div[class=breadcrumb-box] span[itemprop=name]::text')
    yield p.load_item()
def parse(self, response):
    for e in response.xpath('//table[@id="tbl_proxy_list"]//tr[count(td)=6]'):
        l = ItemLoader(ProxyHunterItem(), selector=e)
        l.add_value('prot', 'http')
        l.add_xpath('ip', 'td[1]', TakeFirst(), remove_tags, unicode.strip)
        l.add_xpath('port', 'td[2]', TakeFirst(), remove_tags, unicode.strip)
        yield l.load_item()
def parse_item(self, response): self.logger.info("parse_item url %s" % response.url) l = ItemLoader(item=ImgDownloadItem(), response=response) l.add_xpath('name', '//h1[@class="c333 subTitle"]/text()') l.add_xpath('desc', '//div[@class="txtmod"]/p/text()') l.add_value('url', response.url) l.add_xpath('image_urls', "//p[@class='tc mb10']/img/@src", Identity()) return l.load_item()
def parse_item2(self, response):
    l = ItemLoader(item=DmozItem(), response=response)
    l.add_xpath('type', '//div[@class="location ask_main_location"]/span[@class="fl"]/a[last()]/text()')
    l.add_xpath('type', '//div[@class="question"]/h2/text()')
    l.add_xpath('answer', '//div[@class="anwser"]/h2/text()')
    l.add_value('answer', '牛逼')
    yield l.load_item()
def parse(self, response):
    sel = Selector(response)
    preguntas = sel.xpath('//div[@id="question-mini-list"]/div')  # a list of question nodes
    for i, elem in enumerate(preguntas):
        item = ItemLoader(Pregunta(), elem)  # elem scopes the loader's xpaths
        item.add_xpath('pregunta', './/h3/a/text()')
        item.add_value('id', i)
        yield item.load_item()
def parsePost(self, response):
    l = ItemLoader(item=Post(), response=response)
    d = pyq(response.body)  # pyq: presumably pyquery's PyQuery, used for the content blob
    l.add_value('url', response.url)
    l.add_css('title', 'h1.entry-title::text')
    l.add_css('date', 'span.entry-date::text')
    l.add_css('author', 'span.author.vcard > a::text')
    l.add_value('content', d('div.entry-content').text())
    return l.load_item()
def parse_item(self, response):
    parser = ItemLoader(item=HistDataItem())
    parser.add_value('url', response.url)
    fields = ['tk', 'date', 'datemonth', 'platform', 'timeframe', 'fxpair']
    for field in fields:
        parser.add_value(field, getValue('#' + field, response))
    item = parser.load_item()
    formdata = dict(zip(fields, [item['tk'], item['date'], item['datemonth'],
                                 item['platform'], item['timeframe'], item['fxpair']]))
    request = scrapy.FormRequest.from_response(response, formnumber=0,
                                               formdata=formdata, callback=getData)
    return request
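# -- Note: parse_item above depends on a getValue helper and a getData
# callback that are not part of this corpus. getValue appears to read form
# field values by CSS id; a minimal sketch under that assumption (the
# selector handling is a guess, not the original implementation):
def getValue(css_id, response):
    # e.g. getValue('#tk', response) -> value attribute of <input id="tk" ...>
    return response.css('%s::attr(value)' % css_id).extract_first()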
def _parse_item(self, response):
    print('--------------------start item : %s' % response.url)
    item = ItemLoader(item=MeizituItem(), response=response)
    item.add_xpath('title', ITEM_TITLE)
    item.add_xpath('tags', ITEM_TAGS)
    item.add_value('url', response.url)
    item.add_xpath('day', ITEM_DAY)
    item.add_xpath('month_year', ITEM_MONTH_YEAR)
    item.add_xpath('image_urls', ITEM_IMAGE_URLS)
    return item.load_item()
def Loader_index(self, item_selector):
    l = ItemLoader(item={}, selector=item_selector)
    conver_img = l.get_xpath('.//*[@class="lz_img"]/img/@src')
    l.add_xpath('title', './/*[@class="k_list-lb-2"]/div[1]/a[1]/text()')
    l.add_xpath('url', './/*[@class="k_list-lb-2"]/div[1]/a/@href')
    l.add_value('preview', conver_img)
    l.add_css('date', '#k_list-lb-2-f::text', re=r'(\d{4}-\d{2}-\d{2})')
    l.add_value('image_urls', conver_img)
    return l.load_item()
def parse(self, response):
    sel = Selector(response)
    lugares = sel.xpath('//div[@id="hotellist_inner"]/div')
    # iterate over all the results
    for i, elem in enumerate(lugares):
        item = ItemLoader(Lugar(), elem)
        item.add_xpath('lugar', './/h3/a/span/text()')
        item.add_value('id', i)
        yield item.load_item()
def parse_product(self, response):
    p = ItemLoader(item=Product(), response=response)
    p.add_css('nome', 'h1 > span[itemprop=name]::text')
    p.add_value('url', response.url)
    p.add_css('descricaoLongaHtml', '.infoProdBox')
    p.add_css('descricaoLonga', '.infoProdBox')
    p.add_css('image', 'ul.a-carousel-list > li > img', re='src=[\"|\'](?P<src>[^\"\']+)[\"|\']')
    p.add_css('categorias', 'div[class=breadcrumb-box] span[itemprop=name]::text')
    yield p.load_item()
def parse_item(self, response):
    loader = ItemLoader(ChinazItem(), response)
    loader.add_value('url', response.url)
    loader.add_xpath('name', u'//span[@id="spanwillchuanwebName"]/following-sibling::text()')
    loader.add_xpath('domain', u'//a[@id="linkUrl"]/text()')
    loader.add_xpath('homepage', u'//a[@id="linkUrl"]/@href')
    loader.add_xpath('founded', u'//span[.="建站时间:"]/following-sibling::text()')
    loader.add_xpath('company', u'//span[.="网站所属:"]/following-sibling::text()')
    loader.add_xpath('location', u'//span[.="所属地区:"]/following-sibling::a//text()')
    loader.add_xpath('founder', u'//span[.="创始人/团队:"]/following-sibling::text()')
    loader.add_xpath('categories', u'//span[.="网站类型:"]/following-sibling::a//text()')
    loader.add_xpath('rating', u'//td[b="用户评分:"]/following-sibling::td/img/@src', re=r'star_(\d)')
    loader.add_xpath('keywords', u'//td[starts-with(b, "关 键 词")]/following-sibling::td/a/text()')
    loader.add_xpath('brief', u'//td[b="网站简介:"]/following-sibling::td/text()')
    loader.add_xpath('alexa_rank', u'//span[.="Alexa排名:"]/following-sibling::text()')
    loader.add_xpath('baidu_weight', u'//td[.="百度权重:"]/following-sibling::td/img/@alt')
    loader.add_xpath('google_pagerank', u'//td[.="PR值:"]/following-sibling::td/img/@alt')
    loader.add_xpath('chinaz_rank', u'//td[@class="scored"]/span/text()')
    loader.add_xpath('backlink_num', u'//span[.="网站反链数: "]/following-sibling::text()')
    loader.add_xpath('keyword_num', u'//a[@id="tdgjcs"]/text()')
    loader.add_xpath('domain_birth', u'//span[.="域名年限:"]/following-sibling::text()', re=r':([0-9-]+)\)')
    loader.add_xpath('baidu_idx_num', u'//span[.="百度收录:"]/following-sibling::text()')
    loader.add_xpath('google_idx_num', u'//span[.="谷歌收录:"]/following-sibling::text()')
    loader.add_xpath('sogou_idx_num', u'//span[.="搜狗收录:"]/following-sibling::text()')
    loader.add_xpath('introduction', u'//div[h3="公司简介"]/following-sibling::div[1]',
                     MapCompose(Compose(remove_tags, unicode.strip)))
    loader.add_xpath('snapshot', u'//figure/img/@src',
                     MapCompose(partial(urljoin, 'http://top.chinaz.com/')))
    return loader.load_item()
def parse_page(self, response):
    el = ItemLoader(item=AcademicNewsItem(), response=response)
    el.add_xpath('title', "//div[@class='title']/h1/text()")
    el.add_xpath('time_pub', "//span[@class='datetime']/text()")
    el.add_value('time_get', datetime.datetime.today().__format__("%Y-%m-%d %H:%M:%S"))
    el.add_xpath('author', "//div[@class='clear author']/text()")
    el.add_xpath('publisher', "//div[@class='clear author']/a[@target='_blank']/text()")
    el.add_xpath('source', "//div[@class='clear author']/a[@target='_blank']/text()")
    el.add_xpath('classf', "//div[@id='location']/a/text()")
    # soup = BeautifulSoup(response.body)
    el.add_xpath('body', "//div[@id='zoom']")
    el.add_value('url', response.url)
    return el.load_item()
def parse_page(self, response):
    el = ItemLoader(item=NewsItem(), response=response)
    el.add_xpath("title", "//div[@class='title']/h1/text()")
    el.add_xpath("time_pub", "//span[@class='datetime']/text()")
    el.add_value("time_get", datetime.datetime.today().__format__("%Y-%m-%d %H:%M:%S"))
    el.add_xpath("author", "//div[@class='clear author']/text()")
    el.add_xpath("publisher", "//div[@class='clear author']/a[@target='_blank']/text()")
    el.add_xpath("source", "//div[@class='clear author']/a[@target='_blank']/text()")
    el.add_xpath("classf", "//div[@id='location']/a/text()")
    # soup = BeautifulSoup(response.body)
    el.add_xpath("body", "//div[@id='zoom']")
    el.add_value("url", response.url)
    return el.load_item()
def parse_detail(self, response): print("response.url===", response.url) #具体值 url = response.url #使用ItemLoader类 item = ItemLoader(item=Meizitu2Item(), response=response) item.add_xpath("tilte", "//h2/a/text()") item.add_xpath("image_urls", '//div[@id="picture"]//img/@src') #添加值的方式 item.add_value("url", url) return item.load_item()
def parse(self, response):
    new_albums = response.css(self.filter_css)
    count = 0
    for new_album in new_albums:
        count += 1
        if count > self.count_limit:
            raise CloseSpider('done')
        loader = ItemLoader(KuwoScrapyItem(), new_album)
        loader.add_value('basic_source_info', '{}')
        loader.add_css('basic_source_name', self.name_css, TakeFirst())
        loader.add_css('basic_source_artist', self.artist_css, Join('&'))
        yield loader.load_item()
def _record_parse(self, response):
    fval = '//div[@class="innertube"]/table//tr[{0}]/td[2]{1}/text()'
    l = ItemLoader(item=AviationItem(), response=response)
    l.add_xpath('date', fval.format(2, ""))
    l.add_xpath('time', fval.format(3, ""))
    l.add_xpath('operator', fval.format(5, "/a"))
    l.add_xpath('flight_number', fval.format(20, ""))
    l.add_value('fatalities', response.meta['fatalities'])
    l.add_xpath('departure', fval.format(18, "/"))
    l.add_xpath('destination', fval.format(19, "/"))
    l.add_xpath('crash', fval.format(15, "/"))
    yield l.load_item()
def parse_item(self, response): l = ItemLoader(item=CoserItem(), response=response) l.add_xpath('name', "//div[@class='mb10 dib']/a/text()") l.add_xpath('info', "//div/p[@class='mb20']/text()") #l.add_xpath('image_urls',"//div[@class='content-img-wrap-inner']/img[@src]") l.add_value('url', response.url) # //div[@class='content-img-wrap']//img/@src # 抓不到,正则还是牛逼 urls = l.selector.re(r'src="(.+?.jpg)/w650') # urls = l.get_xpath("//div[@class='content-img-wrap']//img/@src") # urls = [url.replace('/w650', '') for url in urls] l.add_value('image_urls', urls) # l.add_xpath('image_urls',"//div/p[@class='mb20']/text()") yield l.load_item()
def parse(self, response):
    # crawl all display pages
    for link in self.link_extractor['page_down'].extract_links(response):
        yield Request(url=link.url, callback=self.parse)
    print response.url
    self.browser.get(response.url)
    time.sleep(5)
    url = str(response.url)
    etaoItem_loader = ItemLoader(item=EtaoItem(), response=response)
    etaoItem_loader.add_value('url', url)
    etaoItem_loader.add_xpath('title', self._x_query['title'])
    etaoItem_loader.add_xpath('name', self._x_query['name'])
    etaoItem_loader.add_xpath('price', self._x_query['price'])
    yield etaoItem_loader.load_item()
def dishparse(self, response):
    # item = DinnerItem()
    # item['dish_id'] = response.meta['id']
    # large_info = response.xpath("//div[@class='large_info']")
    # item['name'] = large_info.xpath("./div[@class='box']/h1/text()").extract()
    # item['tags'] = large_info.xpath("./div[contains(@class,'mgt20')]//a/text()").extract()
    # item['other'] = large_info.xpath("./ul//li/text()").extract()
    l = ItemLoader(item=DinnerItem(), response=response)
    l.add_value('dish_id', response.meta['id'])
    l.add_xpath('name', "//div[@class='large_info']/div[@class='box']/h1/text()")
    l.add_xpath('tags', "//div[@class='large_info']/div[contains(@class,'mgt20')]//a/text()")
    l.add_xpath('other', "//div[@class='large_info']/ul//li/text()")
    return l.load_item()
def parse_detail(self, response):
    l = ItemLoader(response.meta['item'], response)
    # l.add_xpath('fanhao', '//span[@class="list_text"]/em/b/a/text()')
    l.add_xpath('image_name', '//span[@class="list_text"]/em/b/a/text()')
    photo = response.xpath('//span[@class="list_img"]/a/img/@data-original').extract()
    # item = response.meta['item']
    # item['fanhao'] = selector.xpath('//span[@class="list_text"]/em/b/a/text()').extract()
    # photo = selector.xpath('//span[@class="list_img"]/a/img/@data-original').extract()
    img = []
    for p in photo:
        img.append('http://www.nh87.cn' + p)
    l.add_value('image_urls', img)
    # return the loaded item
    return l.load_item()