def parse_list(self, response):
    # Get seller attributes
    sel = Selector(response)
    self.page += 1
    for s in sel.xpath(Seller.base_xpath):
        seller_loader = ItemLoader(Seller(), selector=s)
        # iterate over fields and add xpaths to the seller_loader
        seller_loader.add_value('page', self.page)
        seller_loader.add_value('flag', 'Seller')
        for key, value in Seller.item_fields.iteritems():
            seller_loader.add_xpath(key, value)
        yield seller_loader.load_item()

    # Get commodity attributes
    for s in sel.xpath(Commodity.base_xpath):
        comm_loader = ItemLoader(Commodity(), selector=s)
        comm_loader.add_value('page', self.page)
        comm_loader.add_value('flag', 'Commodity')
        for key, value in Commodity.item_fields.iteritems():
            comm_loader.add_xpath(key, value)
        yield comm_loader.load_item()

    # Next page
    if sel.xpath(self.next_page_xpath):
        yield Request("http://spu.taobao.com/spu/3c/detail.htm" + sel.xpath(self.next_page_xpath).extract()[0],
                      callback=self.parse_list)
def parse(self, response):
    l = ItemLoader(item=timeItem(), response=response)
    #l.add_xpath('topnews', '//*[@id="article-container"]/div/div[1]/section/div/article[*]/div/p/text()')
    l.add_xpath('topnews', '//*[@id="article-container"]/div/div[1]/section/div/article[*]/div/h2/a/text()')
    l.add_xpath('topnews', '//*[@id="article-container"]/div/div[1]/section/div/article[1]/div/div/div[2]/div[*]/h3/a/text()')
    l.add_xpath('sectionnews', '//a[contains(@class,"home-columnists-title")]/text()')
    l.add_xpath('sectionnews', '//a[contains(@data-event,"hp-news")]/text()')
    x = l.load_item()

    nytdict = dict()
    datelist = []
    datalist = datetime.date.today()
    topnewslist = []
    sectionnewslist = []

    nytdict['date'] = str(datalist)

    for t in x['topnews']:
        topnewslist.append(str(t.encode('ascii', 'ignore')).strip())
    nytdict['topnews'] = topnewslist

    for t in x['sectionnews']:
        sectionnewslist.append(str(t.encode('ascii', 'ignore')).strip())
    nytdict['sectionnews'] = sectionnewslist

    filename = datetime.date.today()
    f = open('{}.json'.format(filename), 'w')
    json.dump(nytdict, f)
    return l.load_item()
def parse_review(self, response):
    sel = Selector(response)
    if not self._is_right_category(sel):
        self.log('Skip URL: %s' % response.url, level=log.INFO)
        return
    self.log('Parse URL: %s' % response.url, level=log.INFO)

    loader = ItemLoader(item=YelpReview(), selector=sel)
    loader.add_value('crawl_date', '%s' % datetime.utcnow())
    loader.add_value('page_url', response.url)
    # Loop over all the fields we need to extract.
    for field, selector in self._item_selectors.iteritems():
        loader.add_xpath(field, selector)
    master_review = loader.load_item()

    review_selectors = sel.xpath('//div[contains(@class, "review")][@itemprop="review"]')
    for rev_sel in review_selectors:
        review_loader = ItemLoader(item=master_review.copy(), selector=rev_sel)
        for field, selector in self._review_selectors.iteritems():
            review_loader.add_xpath(field, selector)
        yield review_loader.load_item()
    return
def parse_book_url(self, response):
    book_item = BookDetails(book_id="", book_type="pdf")
    bil = ItemLoader(item=book_item, response=response)
    bil.add_xpath("book_id", "/*//script/text()", re=r'bookId\s*:\s*(.*),.*')
    bil.add_xpath("book_path", "/*//script/text()", re=r'getDownloadUrl\s*:\s*\"(.*)\".*')
    #bil.get_xpath()
    bil.load_item()

    download_url = self.base_url + book_item['book_path'][0]
    post_data = "book_id=" + book_item['book_id'][0] + "&" + "type=" + book_item['book_type']
    #post_data = "book_id=" + "2759" + "&" + "type=" + book_item['book_type']

    # set header
    post_header = {}
    post_header["Content-Type"] = "application/x-www-form-urlencoded; charset=UTF-8"
    post_header["User-Agent"] = "Mozilla/5.0"
    #print post_header
    #print curl_cmd

    yield Request(download_url,
                  self.get_book_link,
                  headers=post_header,
                  method='POST',
                  body=post_data)
def get_new(self, response):
    sel = Selector(response)
    il = ItemLoader(item=New())
    il.add_value('tema', ['Marketing y Publicidad'])
    il.add_value('titulo', sel.xpath('//h1[@itemprop="headline"]/a/text()').extract())
    il.add_value('texto', sel.xpath('//div[@itemprop="articleBody"]').extract())
    il.add_value('fecha', sel.xpath('//div[@itemprop="datePublished"]/text()').extract())
    il.add_value('keywords', sel.xpath('//div[contains(@class,"nota-tags")]//h3/a/text()').extract())
    item = il.load_item()

    if 'titulo' not in item:
        iln = ItemLoader(item=New())
        iln.add_value('tema', ['Marketing y Publicidad'])
        iln.add_value('titulo', sel.xpath('//h1/text()').extract())
        iln.add_value('texto', sel.xpath('//div[@id="principal"]/div[@class="nota"]/div[3]').extract())
        iln.add_value('fecha', sel.xpath('//div[@class="fecha-nota"]/text()').extract())
        iln.add_value('keywords', sel.xpath('//div[contains(@class,"nota-tags")]//h3/a/text()').extract())
        item = iln.load_item()

    if 'keywords' not in item:
        item['keywords'] = ['Marketing y Publicidad']

    if 'fecha' in item:
        item['fecha'] = self.parse_date(item['fecha'])
    else:
        item['fecha'] = '10/05/2015'

    if 'titulo' in item and 'texto' in item:
        yield item
def parse_content(self, response):
    goods_loader = ItemLoader(item=AlibbItem(), response=response)
    url = str(response.url)
    goods_loader.add_value('url', url)
    goods_loader.add_value('url_hash', hashlib.sha1(url).hexdigest())
    goods_loader.add_xpath('name', self._x_query['title'].encode('utf-8'))

    # detail data
    iDetailDataPattern = re.compile("iDetailData.*};", re.DOTALL)
    detail_data_list = response.xpath('//script').re(iDetailDataPattern)
    detail_data = detail_data_list[0].replace("iDetailData = {", "{")
    detail_data = detail_data.replace("};", "}")
    detail_data = detail_data.replace("\t|\n|\\", "")
    detail_data_json = json.loads(detail_data)

    if len(detail_data_json) != 0:
        properties = detail_data_json['sku']['skuMap'].keys()
        goods_loader.add_value('properties', [property.replace(">", ",") for property in properties])
        for attribute in detail_data_json['sku']['skuProps']:
            attributes = {}
            options = [value['name'] for value in attribute['value']]
            attributes['name'] = attribute['prop']
            attributes['options'] = options
            goods_loader.add_value('attributes', attributes)
    else:
        goods_loader.add_value('attributes', "")

    price = response.xpath('//span[re:test(@class,"value price-length-\d$")]/text()').extract()
    goods_loader.add_value('price', price[0] if len(price) > 0 else detail_data_json['sku']['price'])

    # detail information
    detail_info_list = response.xpath(self._x_query['detail_info']).extract()
    goods_loader.add_value('parameters',
                           [list(info_list) for info_list in zip(detail_info_list[::2], detail_info_list[1::2])])
    print goods_loader.load_item()['url']

    # profile img
    profile_img_urls = response.xpath('//li/@data-imgs').re("original.*jpg")
    for urls in profile_img_urls:
        profile_img_url = urls.replace("original\":\"http", "http")
        goods_loader.add_value("boothes", profile_img_url)

    # big img
    for link in response.xpath('//*[@id="desc-lazyload-container"]/@data-tfs-url').extract():
        yield Request(url=link, meta={'item': goods_loader}, callback=self.parse_content_down)
def parse_product(self, response):
    product_url = response.url
    # sel = self.selenium
    #sel.open(response.url)
    #time.sleep(2.5)

    selector = Selector(response)
    # //*[@id="product_detail_view_1"]/div/div[6]/div[2]/span[2]
    price = selector.xpath('//*[@id="product_detail_view_1"]/div/div[7]/div[2]/span[2]/text()').extract()
    if not price:
        price = selector.xpath('//*[@id="product_detail_view_1"]/div/div[6]/div[2]/span[2]/text()').extract()
    if not price:
        price = selector.xpath('//*[@id="product_detail_view_1"]/div/div[5]/div[2]/span[2]/text()').extract()
    if not price:
        price = selector.xpath('//*[@id="product_detail_view_1"]/div/div[4]/div[2]/span[2]/text()').extract()

    l = ItemLoader(item=BillionPricesIndiaItem(), response=response)
    l.add_xpath('product_name', '//*[@id="inner"]/div[1]/div[1]/div/div/text()')
    l.add_xpath('quantity', '//*[@id="product_detail_view_1"]/div/div[1]/div/text()')
    l.add_xpath('category', '//*[@id="inner"]/div[1]/div[1]/div/a[1]/text()')
    l.add_xpath('product', '//*[@id="inner"]/div[1]/div[1]/div/a[2]/text()')

    item = l.load_item()
    item['product_url'] = product_url
    item['price'] = price
    item['vendor'] = 'Local Banya'
    item['city'] = 'Mumbai'
    item['state'] = 'Maharashtra'
    item['country'] = 'India'
    item['date'] = str(time.strftime("%d/%m/%Y"))
    return item
def parse_item(self, response): sel = Selector(response) # collect xpaths of each player (row in table) rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr') # loop through players for row in rows: loader = ItemLoader(SkatTOIItem(), selector=row) loader.default_input_processor = MapCompose() loader.default_output_processor = Join() # get unique NHL ID number from player's page URL num = row.xpath("td[2]/a/@href").extract() sNum = num[0][-7:] loader.add_value("nhl_num", sNum) # add season data loader.add_value("season", str(self.year)) # collect TOI stats after converting from m,mmm:ss to seconds i = 5 CATEG = ["es_toi", "sh_toi", "pp_toi", "toi"] while i < 12: i += 1 if i % 2 == 0: temp = row.xpath("td[" + str(i) + "]/text()").extract()[0] sTemp = temp.split(":") sTemp[0] = sTemp[0].replace(",", "") loader.add_value(CATEG[(i - 6) / 2], str(60 * int(sTemp[0]) + int(sTemp[1]))) else: pass # feed item to pipeline yield loader.load_item()
def parse_item(self, response): sel = Selector(response) # collect xpaths of each player (row in table) rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr') # loop through players for row in rows: loader = ItemLoader(SkatSOItem(), selector=row) loader.default_input_processor = MapCompose() loader.default_output_processor = Join() # get unique NHL ID number from player's page URL num = row.xpath("td[2]/a/@href").extract() sNum = num[0][-7:] loader.add_value("nhl_num", sNum) # add season data loader.add_value("season", str(self.year)) # collect stats loader.add_xpath("so_shots", ".//td[13]/text()") loader.add_xpath("so_goals", ".//td[14]/text()") loader.add_xpath("so_pct", ".//td[15]/text()") loader.add_xpath("game_deciding_goals", ".//td[16]/text()") # feed item to pipeline yield loader.load_item()
def process_row(self, row, task):
    stats = self.crawler.stats
    l = ItemLoader(WV_DrillingPermit())
    l.add_value(None, row)
    item = l.load_item()

    if item['API'] and item['permit_activity_type'] and item['permit_activity_date']:
        existing_item = self.db.loadItem(item, {
            'API': item['API'],
            'permit_activity_type': item['permit_activity_type'],
            'permit_activity_date': item['permit_activity_date']
        })
        if existing_item:
            stats.inc_value('_existing_count', spider=self)
        else:
            stats.inc_value('_new_count', spider=self)
            yield item

            dt = datetime.strptime(item['permit_activity_date'], '%Y-%m-%d %H:%M:%S')
            # if item['permit_activity_type'] in ('Permit Issued', 'Permit Commenced', 'Permit Completed'):
            if item['permit_activity_type'] in ('Permit Issued', 'Permits Issued') \
                    and datetime.now() - dt < timedelta(days=365):
                for item in self.create_feed_entry(item, task):
                    yield item
def parse(self, response):
    l = ItemLoader(item=Problem(), response=response)
    d = pyq(response.body)
    l.add_value('id', response.url[-4:])
    l.add_value('title', d('#content_body > center:nth-child(1) > span').text())
    l.add_value('body', d('#content_body').text())
    return l.load_item()
def parse(self, response):
    contents = response.xpath('//ul[@class="note-list"]/li/div[@class="content"]')
    for content in contents:
        l = ItemLoader(item=JianshuSummaryItem(), selector=content, response=response)
        l.add_xpath('title', 'a[@class="title"]/text()',
                    MapCompose(lambda i: i.replace('|', '.').replace('丨', '.')))
        l.add_xpath('link', 'a[@class="title"]/@href',
                    MapCompose(lambda i: urljoin(response.url, i)))
        l.add_xpath('author',
                    'div[@class="author"]/div[@class="info"]/a[@class="nickname"]/text()')
        l.add_xpath('author_url',
                    'div[@class="author"]/div[@class="info"]/a[@class="nickname"]/@href',
                    MapCompose(lambda i: urljoin(response.url, i)))
        l.add_xpath('timestamp',
                    'div[@class="author"]/div[@class="info"]/span[@class="time"]/@data-shared-at')
        l.add_xpath('read', 'div[@class="meta"]/a[1]/text()[2]', MapCompose(str.strip, int))
        l.add_xpath('reply', 'div[@class="meta"]/a[2]/text()[2]', MapCompose(str.strip, int))
        l.add_xpath('like', 'div[@class="meta"]/span[1]/text()', MapCompose(str.strip, int))
        l.add_xpath('money', 'div[@class="meta"]/span[2]/text()', MapCompose(str.strip, int))
        yield l.load_item()
def parse_content(self, response):
    '''Parse content pages.'''
    loader = ItemLoader(item=Rede(), response=response)

    # Usually, we are only interested in the first item, e.g. for title, place, etc.
    loader.default_output_processor = TakeFirst()

    # Add fields
    loader.add_value('link', response.url)
    loader.add_css('title', '.text h1', extract_text)

    # Test if text has an abstract
    abstract = response.css('.abstract')
    if abstract:
        loader.add_css('abstract', '.abstract', extract_text)
        loader.add_css('text', '.abstract ~ p:not(.picture)', extract_text, Join('\n'))
    else:
        loader.add_css('text', '.text p:not(.picture)', extract_text, Join('\n'))

    # Metadata are in dt/dd pairs.
    keys = response.css('dl dt::text').extract()
    values = response.css('dl dd::text').extract()
    for key, value in zip(keys, values):
        if key == 'Datum:':
            match = re.search(r'(\d{1,2}\.\d{1,2}\.\d{2,4})', value)
            if match:
                # '22.03.2011' format
                value = match.group(1)
                dt = datetime.strptime(value.encode(ENC), '%d.%m.%Y')
            else:
                # '22. März 2011' format
                dt = datetime.strptime(value.encode(ENC), '%d. %B %Y')
            loader.add_value('date', dt.date())
        elif key == 'Ort:':
            loader.add_value('place', value)

    return loader.load_item()
def parse_items(self, response):
    item = ItemLoader(Articulos(), response)
    item.add_xpath('title', '//*[@id="MainContainer"]/article/section[1]/div[1]/div/h2/text()')
    item.add_xpath('description', '//*[@id="MainContainer"]/article/section[1]/div[2]/ul/li[3]/text()')
    yield item.load_item()

# scrapy runspider multiplepages.py -o ../../resources/computrabajo.csv -t csv
def create_tag(self, feed_entry_id, tag, comment=''):
    # TODO: create tags
    l = ItemLoader(FeedEntryTag())
    l.add_value('feed_entry_id', feed_entry_id)
    l.add_value('tag', tag)
    l.add_value('comment', comment)
    return l.load_item()
def parse_item(self, response):
    # Parse pages like http://www.meizitu.com/a/5336.html and extract the image URLs
    l = ItemLoader(item=MeizituItem(), response=response)
    l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
    l.add_value('url', response.url)
    return l.load_item()
def parse_detail(self, response):
    url = response.url
    item = ItemLoader(item=MeizituItem(), response=response)
    item.add_xpath("title", "//h2/a/text()")
    item.add_xpath("image_urls", '//div[@id="picture"]//img/@src')
    item.add_value('url', url)
    return item.load_item()
def parse(self, response):
    data = json.loads(response.text)
    for key in data.keys():
        dataLenth = len(data[key])
        logging.info("total size: " + str(dataLenth))
        for i in range(1, dataLenth):
            logging.info("curIndex: " + str(i))
            content = {}
            logging.info("title: " + data[key][i]["title"])
            loader = ItemLoader(item=NetNews(), response=response)

            content["title"] = data[key][i]["title"]
            loader.add_value("news_title", content["title"])

            content["digest"] = data[key][i]["digest"]
            loader.add_value("news_digest", content["digest"])

            # print "recSource: " + data[key][i]["recSource"]
            content["label"] = data[key][i]["recSource"]
            loader.add_value("news_label", content["label"])

            content["imageUrl"] = data[key][i]["imgsrc"]
            loader.add_value("news_img_url", content["imageUrl"])

            content["id"] = data[key][i]["id"]
            loader.add_value("news_id", content["id"])

            detail = self.getNewsDetail(content["id"])
            # print "body: " + detail
            if len(detail) != 0:
                loader.add_value("news_detail", detail)

            # add a timestamp
            loader.add_value("timestamp", self.get_timestamp())
            yield loader.load_item()
def get_new(self, response): sel = Selector(response) il = ItemLoader(item=New()) il.add_value('tema', ['Marketing y Publicidad']) il.add_value('titulo', sel.xpath('//h1[@class="glr-post-title glr-mb-10"]/text()').extract()) il.add_value('texto', sel.xpath('//div[@class="glr-post-entry"]').extract()) il.add_value('fecha', sel.xpath('//span[@class="glr-left glr-post-date"]/text()').extract()) il.add_value('keywords', sel.xpath('//div[@class="post-tags"]//a/text()').extract()) item = il.load_item() if 'keywords' in item: pass else: item['keywords'] = ['Marketing y Publicidad'] if 'fecha' in item: item['fecha'] = self.parse_date(item['fecha']) else: item['fecha'] = '10/05/2015' if 'titulo' in item: if 'texto' in item: yield item ''' item = New() item['tema'] = 'Marketing y Publicidad' item['titulo'] = self.parse_html(sel.xpath('//h1[@class="glr-post-title glr-mb-10"]/text()').extract()[0].strip()) item['texto'] = self.parse_html(sel.xpath('//div[@class="glr-post-entry"]').extract()[0].strip()) item['fecha'] = self.parse_date(sel.xpath('//span[@class="glr-left glr-post-date"]/text()').extract()[0].strip()) ''' #yield item '''res = []
def parse(self, response):
    for sel in response.css("ul#channels-browse-content-grid > li"):
        loader = ItemLoader(YoutubeVideo(), selector=sel)
        loader.add_xpath('link', './/h3/a/@href')
        yield loader.load_item()
def parse(self, response):
    content = response.body
    page = response.url.split("/")[-1]

    """
    content = Selector(response=response).xpath("//div[@class='body textStyle']").extract()
    if (len(content)):
        content = content[0]
        # strip tags
        strip = StripTags()
        content = strip.filterTags(content)
        # write to a file
        filename = 'quotes-%s' % page
        with open(filename, 'w') as f:
            f.write(str(content))
        self.log('Saved file %s' % filename)
    """

    loader = ItemLoader(item=TutorialItem(), response=response)
    loader.add_xpath('title', "//title/text()")
    loader.add_xpath('content', "//div[@class='body textStyle']")
    data = loader.load_item()

    downFile = DownFile(data['content'][0], 'http://www.admin10000.com')
    downFile.downImgFile()

    mongo = Mongo("articles")
    mongo.setTable("admin10000")

    content = data['content'][0]
    # strip tags
    strip = StripTags()
    content = strip.filterTags(content)

    article = {'title': data['title'][0], 'content': content}
    mongo.add(article)
def parse_by_product(self, response):
    """
    For the 'Bundles' category, grab the product details for the
    first product listed.
    """
    self.selector = Selector(response)
    self.results = self.selector.xpath('//*[@id="ctl00_tdMainPanel"]')
    loader = ItemLoader(item=VisionsProduct(), selector=self.results[0])
    self.field_xpaths = {
        'product': ('div[contains(@class, "catalogueTitle")]'
                    '/h3/text()'),
        'price': ('div[@id="ctl00_ContentPlaceHolder1_pnl'
                  'Bundle"]/div[@id="divProductDetails"]/div'
                  '[contains(@class, "priceAddToCart")]/div[1]/span'
                  '[contains(@id, "SalePrice")]/text()')
    }

    # Extract and load product details
    loader.add_xpath('product', self.field_xpaths['product'])
    loader.add_xpath('price', self.field_xpaths['price'],
                     re='\$[\d]*[,]*[\d]*\.[\d]*')
    loader.add_value('availability', 'Not Limited/Clearance Item')

    # Because it's an individual product page, manually set the category
    self.category = '/'.join(['Home', response.url.split('/')[4]])
    loader.add_value('category', self.category)

    yield loader.load_item()
def parse(self, response):
    sel = response.xpath('.//*[@class="post_info"]')
    if not sel:
        self.log('posts are not find')
        return
    self.group_id = response.xpath('.//div[@id="group_followers"]/a/@href').re('group.=(\d+?)$')[0]

    for s in sel:
        wall_text = s.xpath('div[@class="wall_text"]')
        text = wall_text.xpath('div/div[@class="wall_post_text"]').extract()
        spam_words = get_spam_words_from_msg(text, self.spam_words_from_file)
        if spam_words:
            l = ItemLoader(item=VkItem(), selector=s, response=response)

            date = s.xpath('div[@class="replies"]/div/small/a[1]/span/text()').extract()
            date = l.get_value(date, MapCompose(normalize_date), TakeFirst())
            if is_date_less_last_date(date, self.days_count_to_parse):
                return

            l.add_value('id', wall_text.xpath('div/a/@data-from-id').extract())
            l.add_value('name', wall_text.xpath('div/a/text()').extract())
            l.add_value('text', text)
            l.add_value('date', date)
            l.add_value('words', spam_words)
            yield l.load_item()
            #ban => Request()

        replies_hidden = s.xpath('.//a[@class="wr_header"]/@onclick')
        if replies_hidden:
            url = get_url_hided_replies(replies_hidden[0].extract(), self.main_page)
            yield Request(url=url, callback=self.get_hided_items)
        else:
            replies = s.xpath('.//div[@class="reply_table"]').extract()
            for reply in replies:
                raw_html = ''.join(reply.splitlines()).encode('utf-8')
                html_response = HtmlResponse(url=response.url, body=raw_html)
                for i in self.get_replies_items(html_response):
                    yield i.load_item()

    yield Request(url=self.get_next_msgs_url(), method='POST', callback=self.parse,
                  body=self.get_post_body_for_next_msgs())
def get_new(self, response):
    sel = Selector(response)
    il = ItemLoader(item=New())
    il.add_value('tema', ['Marketing y Publicidad'])
    il.add_value('titulo', sel.xpath('//h1/text()').extract())
    il.add_value('texto', sel.xpath('//div[contains(@class,"post-detalle")]').extract())
    il.add_value('fecha', sel.xpath('//p[@itemprop="datePublished"]/text()').extract())
    il.add_value('keywords', sel.xpath('//div[contains(@class,"tags")]/a/text()').extract())
    item = il.load_item()

    if 'titulo' not in item:
        print item['titulo']
        print item['texto']

    if 'keywords' not in item:
        item['keywords'] = ['Marketing y Publicidad']

    if 'fecha' in item:
        item['fecha'] = self.parse_date(item['fecha'])
    else:
        item['fecha'] = '10/05/2015'

    if 'titulo' in item and 'texto' in item:
        yield item
def create_tag_items(self, task_id, item_id):
    tag_items = []
    l = ItemLoader(FeedEntryTag())
    l.add_value('feed_entry_id', item_id)
    l.add_value('tag', 'NRC')
    tag_items.append(l.load_item())

    nrc_tags = self.db.loadNrcTags(task_id)
    for t in nrc_tags:
        l = ItemLoader(FeedEntryTag())
        l.add_value('feed_entry_id', item_id)
        l.add_value('tag', t['tag'])
        l.add_value('comment', t['comment'])
        tag_items.append(l.load_item())
    return tag_items
def parse_movie(self, response):
    loader = ItemLoader(item=MovieItem(), response=response)
    loader.add_xpath('name',
                     '//div[@id="title-overview-widget"]/div[2]/div[2]/div/div[2]/div[2]/h1/text()')
    loader.add_xpath('year', "//h1/span[@id='titleYear']/a/text()")
    loader.add_xpath('rate',
                     "//div[@id='title-overview-widget']/div[2]/div[2]/div/div[1]/div[1]/div[1]/strong/span/text()")
    loader.add_xpath('director', "//div[2]/div[1]/div[2]/span/a/span/text()")
    loader.add_xpath('director', "//div[3]/div[1]/div[2]/span/a/span/text()")
    loader.add_xpath('storyline', "//div[@id='titleStoryLine']/div[1]/p/text()")

    user_review_url = response.xpath(
        "//div[@id='titleUserReviewsTeaser']/div/div[3]/a[2]/@href").extract()
    item = loader.load_item()
    user_review_another_url = response.xpath(
        "//div[@id='titleUserReviewsTeaser']/div/div[2]/a[2]/@href").extract()

    if user_review_url or user_review_another_url:
        full_url = 0
        if not user_review_another_url:
            full_url = urljoin(response.url, user_review_url.pop())
        elif not user_review_url:
            full_url = urljoin(response.url, user_review_another_url.pop())
        request = Request(urljoin(response.url, full_url),
                          callback=self.parse_audience_review)
        request.meta['item'] = item
        return request
    return item
def parse(self, response):
    items = ItemLoader(item=XsContentItem(), response=response)
    # chapter title
    items.add_xpath('title', '//*[@class="bookname"]/h1/text()')
    # body text
    items.add_xpath('text', '//*[@id="content"]/text()')
    yield items.load_item()
def parse_item(self, response): sel = Selector(response) # collect xpaths of each player (row in table) rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr') # prepare to adjust for shootout stats if necessary shootout = 0 if self.year > 2005: shootout = 1 # loop through players for row in rows: loader = ItemLoader(SkatEngItem(), selector=row) loader.default_input_processor = MapCompose() loader.default_output_processor = Join() # get unique NHL ID number from player's page URL num = row.xpath("td[2]/a/@href").extract() sNum = num[0][-7:] loader.add_value("nhl_num", sNum) # add season data loader.add_value("season", str(self.year)) # collect stats if shootout: loader.add_xpath("en_goals", ".//td[20]/text()") loader.add_xpath("ps_goals", ".//td[21]/text()") else: loader.add_xpath("en_goals", ".//td[21]/text()") loader.add_xpath("ps_goals", ".//td[22]/text()") # feed item to pipeline yield loader.load_item()
def parse_item(self, response): sel = Selector(response) # collect xpaths of each player (row in table) rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr') # loop through players for row in rows: loader = ItemLoader(SkatRTSItem(), selector=row) loader.default_input_processor = MapCompose() loader.default_output_processor = Join() # get unique NHL ID number from player's page URL num = row.xpath("td[2]/a/@href").extract() sNum = num[0][-7:] loader.add_value("nhl_num", sNum) # add season data loader.add_value("season", str(self.year)) # collect stats loader.add_xpath("hits", ".//td[6]/text()") loader.add_xpath("blocked_shots", ".//td[7]/text()") loader.add_xpath("missed_shots", ".//td[8]/text()") loader.add_xpath("giveaways", ".//td[9]/text()") loader.add_xpath("takeaways", ".//td[10]/text()") loader.add_xpath("faceoff_wins", ".//td[11]/text()") loader.add_xpath("faceoff_losses", ".//td[12]/text()") # feed item to pipeline yield loader.load_item()
def parse_item(self, response): """This method will not populate such fields: locality, mobile_number, country, email """ il = ItemLoader(item=UKBusinessItem(), response=response) il.add_value('url', unicode(response.url)) il.add_xpath('name', '//h3[@class="biz"]/text()') il.add_xpath('category', '//div[@id="breadcrumbs"]/a[2]/text()') bcon_list = response.xpath('//ul[@class="bcon"]/li') for li in bcon_list: li_text = cond_set_value(li.xpath('.//b/text()').extract()) if li_text == 'Tel:': phone_number = cond_set_value(li.xpath('text()').extract()) il.add_value('phone_number', phone_number) if li_text == 'Web:': website = cond_set_value(li.xpath('.//a/text()').extract()) il.add_value('website', website) if li_text == 'Fax:': fax_number = cond_set_value(li.xpath('text()').extract()) il.add_value('fax_number', fax_number) address_list = response.xpath('//ul[@class="bad"]/li/text()').extract() if address_list: address_without_postal_code = u', '.join(address_list[:-1]) postal_code = address_list[-1] il.add_value('address', address_without_postal_code) il.add_value('postal_code', postal_code) il.add_xpath('latitude', '//div[@id="lat"]/text()') il.add_xpath('longitude', '//div[@id="lng"]/text()') return il.load_item()
def detail(self, response):
    log.msg(response.url)
    hxs = HtmlXPathSelector(response)

    product_name = hxs.xpath('//*[@id="vip_content_section"]/div[2]/h1/text()').extract()
    # //*[@id="vip_content_section"]/div[2]/h1
    if (len(product_name) != 0):
        product_name = hxs.xpath('//*[@id="vip_content_section"]/div[2]/h1/text()').extract()[0]

    product_price = hxs.xpath('//*[@id="price-val"]/text()').extract()
    if (len(product_price) != 0):
        product_price = hxs.xpath('//*[@id="price-val"]/text()').extract()[0]

    if (len(product_price) != 0 or product_price != None) and (len(product_name) or product_name != None):
        l = ItemLoader(item=BillionPricesIndiaItem(), response=response)
        l.add_xpath('product_name', '//*[@id="vip_content_section"]/div[2]/h1/text()')
        # l.add_xpath('quantity', '//*[@id="product_detail_view_1"]/div/div[1]/div/text()')
        l.add_xpath('category', '//*[@id="cat_crum"]/@value')
        l.add_xpath('product', '//*[@id="overview_tab"]/div/div/p/text()')

        item = l.load_item()
        item['product_url'] = response.url
        item['price'] = product_price
        item['vendor'] = 'PepperFry'
        item['city'] = 'Mumbai'
        item['state'] = 'Maharashtra'
        item['country'] = 'India'
        item['date'] = str(time.strftime("%d/%m/%Y"))
        return item
def parse_item(self, response):
    xpath = './/div[@class="content_left"]'
    sel = response.xpath(xpath)
    if not sel:
        return

    l = ItemLoader(item=HabrahabrItem(), selector=sel, response=response)
    l.add_xpath('title', '//h1/span/text()')
    l.add_xpath('image_urls', '//div[@class="content html_format"]/img/@src')

    comments_items = []
    comments = sel.xpath('//div[starts-with(@class, "message html_format")]').extract()
    for comment in comments:
        comment_item = ItemLoader(item=HabrahabrComment(), selector=sel, response=response)
        comment_item.add_value('comment', comment)
        comments_items.append(comment_item.load_item())
    l.add_value('comments', comments_items)

    yield l.load_item()
def get_product_details(self, response):
    crumbs = self.get_breadcrumbs(response)
    loader = ItemLoader(item=VisionsProduct())
    loader.add_value('breadcrumbs', crumbs)
    loader.add_value('url', response.url)
    if isinstance(crumbs, basestring):
        loader.add_value('category', crumbs)

    # Ensure we aren't wasting time extracting from an empty page
    if extract_helper(response, self.EMPTY_PAGE_CHECK):
        for d in self.PRODUCT_DETAILS:
            if '_' not in d.name:  # Don't load price
                loader.add_value(d.name, 'N/A')
    else:
        productDetails = detailsRunner(self.PRODUCT_DETAILS, response=response)
        if not productDetails['price']:
            productDetails['price'] = productDetails['price_gif']
        productDetails.pop('price_gif')

        # Fix truncated image urls
        if productDetails['image']:
            productDetails['image'] = add_schema(response.url, productDetails['image'])

        for d in productDetails:
            loader.add_value(d, productDetails[d])

    yield loader.load_item()
def get_app(self, response):
    il = ItemLoader(item=PlayStoreItems(), response=response)
    il.add_css('app_id', '.details-wrapper::attr(data-docid)')
    il.add_css('name', '.document-title div::text')
    il.add_css('category', '.category span::text')
    il.add_css('category_url', '.category::attr(href)',
               Compose(lambda urls: [urljoin(response.url, url) for url in urls]))
    il.add_css('price', '.details-actions .price span::text')
    il.add_css('offers_in_app_purchases', '.inapp-msg::text')
    il.add_css('stars_count', '.stars-count::text')
    il.add_css('video', '.details-trailer > span::attr(data-video-url)')
    il.add_css('screenshots', '.screenshot::attr(src)')
    il.add_xpath('description', '//div[contains(@class, "show-more-content")]/div//text()')
    il.add_css('update_date', '[itemprop="datePublished"]::text')
    il.add_css('file_size', '[itemprop="fileSize"]::text')
    il.add_css('installs', '[itemprop="numDownloads"]::text')
    il.add_css('current_version', '[itemprop="softwareVersion"]::text')
    il.add_css('requires_android', '[itemprop="operatingSystems"]::text')
    il.add_css('offered_by', '[itemprop="author"] > a span::text')
    il.add_css('offered_by_url', '[itemprop="author"] > a::attr(href)',
               Compose(lambda urls: [urljoin(response.url, url) for url in urls]))
    yield il.load_item()
def parse(self, response):
    l = ItemLoader(item=JianshuArticleItem(), response=response)
    l.add_xpath('content', '//div[@class="article"]/div[@class="show-content"]/p/text()')
    l.add_value('url', response.url)
    return l.load_item()
def parse_item(self, response):
    # load a MeizituItem with an ItemLoader
    re = []
    l = ItemLoader(item=MeizituItem(), response=response)
    # name
    l.add_xpath('name', '//h2/a/text()')
    # tags
    l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p")
    # image links
    l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
    # url
    l.add_value('url', response.url)
    re.append(l.load_item())
    print re
    #return re
    return l.load_item()
def parse_item(self, response): sel = Selector(response) # collect xpaths of each player (row in table) rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr') for row in rows: loader = ItemLoader(GoalSTItem(), selector=row) loader.default_input_processor = MapCompose() loader.default_output_processor = Join() # get unique NHL ID number from player's page URL num = row.xpath('td[2]/a/@href').extract() sNum = num[0][-7:] loader.add_value('nhl_num', sNum) # add season data loader.add_value('season', str(self.year)) # collect additional stats loader.add_xpath('es_shots_against', './/td[6]/text()') loader.add_xpath('es_goals_against', './/td[7]/text()') loader.add_xpath('es_saves', './/td[8]/text()') loader.add_xpath('es_save_pct', './/td[9]/text()') loader.add_xpath('pp_shots_against', './/td[10]/text()') loader.add_xpath('pp_goals_against', './/td[11]/text()') loader.add_xpath('pp_saves', './/td[12]/text()') loader.add_xpath('pp_save_pct', './/td[13]/text()') loader.add_xpath('sh_shots_against', './/td[14]/text()') loader.add_xpath('sh_goals_against', './/td[15]/text()') loader.add_xpath('sh_saves', './/td[16]/text()') loader.add_xpath('sh_save_pct', './/td[17]/text()') # feed item to pipeline yield loader.load_item()
def parse_news(self, response):
    item = ItemLoader(item=NewsItem(), response=response)
    item.add_value('url', response.url)
    item.add_value('title', response.xpath("//h1[@class='single-post__title']/text()").extract()[0])
    item.add_value('content', response.xpath("//section[@class='article']/p/text()").extract())
    return item.load_item()
def parse(self, response):
    l = ItemLoader(item=MyItem(), response=response)
    l.add_xpath(
        "title",
        """//div[@class="carousel"]/div[@class="songlist-slides slide-page"]/ul[@class="list-songlist slide-item"]/li[@class="songlist-item"]/a[@class="lnk-songlist"]/@title""",
    )
    return l.load_item()
def parse_product(self, response):
    p = ItemLoader(item=Product(), response=response)
    p.add_css('nome', 'h1.livedata::text')
    p.add_value('url', response.url)
    p.add_css('descricaoLonga', '.desc-info')
    p.add_css('image', 'div.container-product-image a.image-link > img',
              re='src=[\"|\'](?P<src>[^\"\']+)[\"|\']')
    p.add_css('categorias', 'span[itemprop=title]::text')
    yield p.load_item()

# run in mongo:
#db.produto.remove({'categorias.0': {$exists: false}})
#db.produto.remove({'categorias.0': {$nin: [' Games', ' Livros', ' DVDs e Blu-ray']}})

# delete duplicated products:
#var duplicates = [];
#db.produto_novo.aggregate([
#  {"$group": {"_id": "$nome", "count": {"$sum": 1}, "dups": {"$addToSet": "$_id"}}},
#  {"$match": {"_id": {"$ne": null}, "count": {"$gt": 1}}}
#], {allowDiskUse: true}, {cursor: {}}
#).result.forEach(function(doc) {
#  doc.dups.shift();
#  doc.dups.forEach(function(dupId) {
#    duplicates.push(dupId);
#  })
#})
#printjson(duplicates);
#db.produto_novo.remove({_id: {$in: duplicates}})
def parse_parts2(self, response):
    log.msg("\tparse_parts time: %s" % int(time.time()), level=log.DEBUG)
    ua = response.request.headers['User-Agent']
    log.msg("\tua: %s" % ua, level=log.DEBUG)

    for part in response.css('table.parts > tbody > tr'):
        il = ItemLoader(item=CarPart(), selector=part)
        il.add_xpath('shop_city', "td[@class='shop']/a/text()")
        il.add_xpath('shop_name', "td[@class='shop']/a/strong/text()")

        shop_url = il.get_xpath("td[@class='shop']/a/@href", TakeFirst())
        photo_url = il.get_xpath("td[@class='photo']/a/@href", TakeFirst())
        il.add_value('shop_url', urljoin(self.main_url, shop_url))
        il.add_value('ext_link', urljoin(self.main_url, photo_url))

        il.add_xpath('info', "td[@class='info']//text()")
        il.add_xpath('price', "td[@class='price']//text()")
        il.add_value('brand', response.meta.get('brand'))
        il.add_value('model', response.meta.get('model'))
        il.add_value('car_part', response.meta.get('car_part'))
        il.add_value('category', response.meta.get('category'))

        item = il.load_item()
        if item.is_valid():
            yield item
def parse_item(self, response):
    request_again = self.error_handler(response)
    if request_again:
        return request_again

    il = ItemLoader(item=UKBusinessItem(), response=response)

    # From the OG section at the top
    il.add_xpath('name', '//meta[@property="og:title"]/@content')
    il.add_xpath('url', '//meta[@property="og:url"]/@content')
    il.add_xpath('latitude', '//meta[@property="og:latitude"]/@content')
    il.add_xpath('longitude', '//meta[@property="og:longitude"]/@content')
    il.add_xpath('address', '//meta[@property="og:street-address"]/@content')
    il.add_xpath('locality', '//meta[@property="og:locality"]/@content')
    il.add_xpath('postal_code', '//meta[@property="og:postal-code"]/@content')
    il.add_xpath('country', '//meta[@property="og:country-name"]/@content')

    # XPaths below are from the display
    il.add_xpath('name', '//span[@class="busname"]/text()')  # No OG for this
    il.add_xpath('phone_number', '//span[@class="bustel"]/text()')
    il.add_xpath('website', '//a[@id="linkWebsite"]/@href')
    il.add_xpath('address', '//span[@data-yext="address.address"]/text()')
    il.add_xpath('locality', '//span[@itemprop="addressLocality"]/text()')
    il.add_xpath('postal_code', '//span[@itemprop="postalCode"]/text()')

    # Unicoded so it can share an input processor with the rest
    il.add_value('url', unicode(response.url))
    return il.load_item()
def parse_course_item(self, response):
    url_obj = urlparse(response.url)

    l = ItemLoader(item=CourseItem(), response=response)
    l.default_input_processor = MapCompose(unicode.strip)
    l.default_output_processor = TakeFirst()

    l.add_xpath('code', "/html/head/meta[@name='DC.Subject.ProgramCode']/@content")
    l.add_xpath('name', "/html/head/meta[@name='DC.Subject.Description.Short']/@content")
    l.add_xpath('career', "/html/head/meta[@name='DC.Subject.Level']/@content")

    l.year_in = Identity()
    l.add_value('year', ppath.basename(ppath.dirname(url_obj.path)))

    l.add_value('src_url', unicode(response.url))
    l.add_xpath('uoc', "/html/head/meta[@name='DC.Subject.UOC']/@content")

    l.gened_in = MapCompose(unicode.strip, lambda s: s == 'Y')
    l.add_xpath('gened', "/html/head/meta[@name='DC.Subject.GenED']/@content")

    l.add_xpath('faculty', "/html/head/meta[@name='DC.Subject.Faculty']/@content")
    l.add_xpath('school', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[strong[text()[contains(.,'School')]]]/a/text()"))
    l.add_xpath('campus', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[strong[text()[contains(.,'Campus')]]]/text()"))
    l.add_xpath('prereqs_str', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[text()[contains(.,'Prerequisite:')]]/text()"),
        re=r'Prerequisite:\s(.+)')
    l.add_xpath('eftsl', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[strong[text()[contains(.,'EFTSL')]]]/text()"))
    l.add_xpath('description_markup', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/h2[text()='Description']/following-sibling::div"))

    course_item = l.load_item()
    yield course_item

    yield Request(
        url=response.xpath((
            "//div[@class='column content-col']/div[@class='internalContentWrapper']"
            "/div[@class='summary']//a[text()[contains(.,'Timetable')]]/@href")).extract()[0],
        callback=self.parse_class_item,
        meta=dict(course_identifier={k: course_item.get(k, None)
                                     for k in ('code', 'career', 'year', )}))
def parse_item(self, response): """Fields not populated by this method: email, mobile_number, latitude, longitude. """ il = ItemLoader(item=UKBusinessItem(), response=response) il.add_value("url", unicode(response.url)) il.add_xpath("name", './/h1[@class="title"]/a/text() | .//h1[@class="title"]/text()') address_text = response.xpath(".//address/text()[normalize-space()]").extract() address_text = [part.strip().rstrip(",") for part in address_text] address = ", ".join(address_text) il.add_value("address", address) il.add_xpath("postal_code", './/h3[@class="postcode"]/text()') il.add_xpath( "website", './/div[@class="contact-info"]//strong/a/@href |' './/div[@class="contact-info"]/ul/strong/span/text()', ) il.add_xpath("category", './/ul[contains(@class, "breadcrumb")]/li[last()]/a/text()') il.add_xpath("linkedin", './/ul[contains(@class, "social")]/li[@class="linkedIn"]/a/@href') il.add_xpath("description", './/div[@class="about-text"]/p/text()') phones_sp = response.xpath('.//div[@class="contact-info"]//li/span') for span in phones_sp: text = cond_set_value(span.xpath("text()[normalize-space()]").extract(), "") if "T:" in text: phone_number = cond_set_value(span.xpath(".//div/text()").extract()) il.add_value("phone_number", phone_number) if "F:" in text: fax_number = cond_set_value(span.xpath(".//div/text()").extract()) il.add_value("fax_number", fax_number) return il.load_item()
def parsePage(self, response):
    rentHouse = ItemLoader(item=RentItem(), response=response)
    rentHouse.add_value('id', self.name + '-' + response.url.split('/')[-1].split('.')[0])
    rentHouse.add_value('link', response.url)
    rentHouse.add_xpath('title', "//dl[@class = 'title']/dt/p/text()")
    return rentHouse.load_item()
def make_bot_task_error(self, task_id, code, message=''):
    t = ItemLoader(BotTaskError())
    t.message_in = lambda slist: [s[:1023] for s in slist]
    t.add_value('task_id', task_id)
    t.add_value('bot', self.name)
    t.add_value('code', code)
    t.add_value('message', message)
    return t.load_item()
def parse_item(self, response):
    l = ItemLoader(item=CrawlpictureItem(), response=response)
    l.add_xpath('name', '//h2/a/text()')
    l.add_css('tags', 'div.metaRight p::text')
    #l.add_xpath('image_urls', '//div[@id="picture"]/p/img/@src' or '//img[@class="scrollLoading"]/@src', Identity())
    l.add_css('image_urls', 'div.postContent img::attr(src)', Identity())
    l.add_value('url', response.url)
    return l.load_item()
def parse_item(self, response):
    for e in response.xpath('//table[@id="basic"]/tbody/tr'):
        l = ItemLoader(ProxyHunterItem(), selector=e)
        l.add_xpath('ip', 'td[2]/a/text()')
        l.add_xpath('port', 'td[3]/text()')
        l.add_xpath('prot', 'td[4]/a/text()')
        yield l.load_item()
def parse_item(self, response):
    l = ItemLoader(item=MeizituItem(), response=response)
    # l.add_xpath('name', '//div[@class="postContent"]/div[@id="picture"]/p/a/text()')
    # l.add_xpath('tags', '//div[@class="postContent"]')
    l.add_xpath('img_url', '//div[@class="text"]/p/br/img/@src', Identity())
    l.add_value('url', response.url)
    return l.load_item()
def parse_item(self, response):
    l = ItemLoader(item=MeizituItem(), response=response)
    l.add_xpath('name', '//h2/a/text()')
    l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p")
    l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
    l.add_value('url', response.url)
    return l.load_item()
def parse(self, response):
    item_list = []
    for a in response.css(".menu_box .menu_main h2"):
        l = ItemLoader(item=CategoryItem(), response=response)
        # l.add_css('category', ".menu_box .menu_main h2")
        l.add_value("category", a.extract(), self.get_text)
        item_list.append(l.load_item())
    return item_list
def parse2(self, response):
    item = json.loads(response.body_as_unicode())
    for i in range(len(item['list'])):
        data_tmp = item['list'][i]
        loader = ItemLoader(item=XqtestItem())
        loader.add_value('title', data_tmp['data'])
        org = loader.load_item()
        yield org
def test_load_item_using_default_loader(self):
    i = TestItem()
    i['summary'] = u'lala'
    il = ItemLoader(item=i)
    il.add_value('name', u'marta')
    item = il.load_item()
    assert item is i
    self.assertEqual(item['summary'], u'lala')
    self.assertEqual(item['name'], [u'marta'])
def parse(self, response):
    sel = Selector(response)
    articulos = sel.xpath('/html/body/div[2]/div/div/div/div[1]/div[3]/div')
    for i, elem in enumerate(articulos):
        item = ItemLoader(Articulos(), elem)
        item.add_xpath('title', './/h3/text()')
        item.add_value('id', i)
        yield item.load_item()