def parse_detail(self, response):
    url = response.url
    item = ItemLoader(item=MeizituItem(), response=response)
    item.add_xpath("title", "//h2/a/text()")
    item.add_xpath("image_urls", '//div[@id="picture"]//img/@src')
    item.add_value('url', url)
    return item.load_item()
def parse_items(self, response):
    item = ItemLoader(Articulos(), response)
    item.add_xpath('title', '//*[@id="MainContainer"]/article/section[1]/div[1]/div/h2/text()')
    item.add_xpath('description', '//*[@id="MainContainer"]/article/section[1]/div[2]/ul/li[3]/text()')
    yield item.load_item()
    # scrapy runspider multiplepages.py -o ../../resources/computrabajo.csv -t csv
def parse_item(self, response):
    # Parse pages like http://www.meizitu.com/a/5336.html and extract the image URLs
    l = ItemLoader(item=MeizituItem(), response=response)
    l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
    l.add_value('url', response.url)
    return l.load_item()
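The MeizituItem these loaders populate is not defined anywhere in this collection. A minimal sketch of what such an item could look like, with the field names taken from the add_xpath/add_value calls above; the processors and the Scrapy 1.x import paths are assumptions, not the original project's code:

from scrapy.item import Item, Field
from scrapy.loader.processors import Join, TakeFirst

class MeizituItem(Item):
    # Field names mirror the loader calls above; processors are assumptions.
    title = Field(output_processor=Join())
    image_urls = Field()   # list of image URLs, e.g. consumed by ImagesPipeline
    images = Field()       # filled in by ImagesPipeline if it is enabled
    url = Field(output_processor=TakeFirst())

ItemLoader picks up output_processor declared as Field metadata, so the loaders above would not need to change.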
def parse_by_product(self, response):
    """
    For the 'Bundles' category, grab the product details for the
    first product listed.
    """
    self.selector = Selector(response)
    self.results = self.selector.xpath('//*[@id="ctl00_tdMainPanel"]')
    loader = ItemLoader(item=VisionsProduct(), selector=self.results[0])
    self.field_xpaths = {
        'product': ('div[contains(@class, "catalogueTitle")]'
                    '/h3/text()'),
        'price': ('div[@id="ctl00_ContentPlaceHolder1_pnl'
                  'Bundle"]/div[@id="divProductDetails"]/div'
                  '[contains(@class, "priceAddToCart")]/div[1]/span'
                  '[contains(@id, "SalePrice")]/text()')
    }
    # Extract and load product details
    loader.add_xpath('product', self.field_xpaths['product'])
    loader.add_xpath('price', self.field_xpaths['price'],
                     re='\$[\d]*[,]*[\d]*\.[\d]*')
    loader.add_value('availability', 'Not Limited/Clearance Item')
    # Because it's an individual product page, manually set the category
    self.category = '/'.join(['Home', response.url.split('/')[4]])
    loader.add_value('category', self.category)
    yield loader.load_item()
def get_app(self, response):
    il = ItemLoader(item=PlayStoreItems(), response=response)
    il.add_css('app_id', '.details-wrapper::attr(data-docid)')
    il.add_css('name', '.document-title div::text')
    il.add_css('category', '.category span::text')
    il.add_css(
        'category_url', '.category::attr(href)',
        Compose(lambda urls: [urljoin(response.url, url) for url in urls]))
    il.add_css('price', '.details-actions .price span::text')
    il.add_css('offers_in_app_purchases', '.inapp-msg::text')
    il.add_css('stars_count', '.stars-count::text')
    il.add_css('video', '.details-trailer > span::attr(data-video-url)')
    il.add_css('screenshots', '.screenshot::attr(src)')
    il.add_xpath(
        'description',
        '//div[contains(@class, "show-more-content")]/div//text()')
    il.add_css('update_date', '[itemprop="datePublished"]::text')
    il.add_css('file_size', '[itemprop="fileSize"]::text')
    il.add_css('installs', '[itemprop="numDownloads"]::text')
    il.add_css('current_version', '[itemprop="softwareVersion"]::text')
    il.add_css('requires_android', '[itemprop="operatingSystems"]::text')
    il.add_css('offered_by', '[itemprop="author"] > a span::text')
    il.add_css(
        'offered_by_url', '[itemprop="author"] > a::attr(href)',
        Compose(lambda urls: [urljoin(response.url, url) for url in urls]))
    yield il.load_item()
def parse_list(self, response):
    # Get seller attributes
    sel = Selector(response)
    self.page += 1
    for s in sel.xpath(Seller.base_xpath):
        seller_loader = ItemLoader(Seller(), selector=s)
        # iterate over fields and add xpaths to the seller_loader
        seller_loader.add_value('page', self.page)
        seller_loader.add_value('flag', 'Seller')
        for key, value in Seller.item_fields.iteritems():
            seller_loader.add_xpath(key, value)
        yield seller_loader.load_item()
    # Get commodity attributes
    for s in sel.xpath(Commodity.base_xpath):
        comm_loader = ItemLoader(Commodity(), selector=s)
        comm_loader.add_value('page', self.page)
        comm_loader.add_value('flag', 'Commodity')
        for key, value in Commodity.item_fields.iteritems():
            comm_loader.add_xpath(key, value)
        yield comm_loader.load_item()
    # Next page
    if sel.xpath(self.next_page_xpath):
        yield Request("http://spu.taobao.com/spu/3c/detail.htm"
                      + sel.xpath(self.next_page_xpath).extract()[0],
                      callback=self.parse_list)
def parse_book_url(self, response):
    book_item = BookDetails(book_id="", book_type="pdf")
    bil = ItemLoader(item=book_item, response=response)
    bil.add_xpath("book_id", "/*//script/text()", re=r'bookId\s*:\s*(.*),.*')
    bil.add_xpath("book_path", "/*//script/text()", re=r'getDownloadUrl\s*:\s*\"(.*)\".*')
    #bil.get_xpath()
    bil.load_item()
    download_url = self.base_url + book_item['book_path'][0]
    post_data = "book_id=" + book_item['book_id'][0] + "&" + "type=" + book_item['book_type']
    #post_data = "book_id=" + "2759" + "&" + "type=" + book_item['book_type']
    # set header
    post_header = {}
    post_header["Content-Type"] = "application/x-www-form-urlencoded; charset=UTF-8"
    post_header["User-Agent"] = "Mozilla/5.0"
    #print post_header
    #print curl_cmd
    yield Request(download_url, self.get_book_link, headers=post_header,
                  method='POST', body=post_data)
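The hand-assembled POST body and Content-Type header above can also be expressed with Scrapy's FormRequest, which URL-encodes the form data and sets the header itself. A minimal sketch, assuming the same download_url, book_item fields and get_book_link callback as in the snippet:

from scrapy.http import FormRequest

# drop-in for the hand-built Request at the end of parse_book_url
yield FormRequest(
    download_url,
    callback=self.get_book_link,
    formdata={
        'book_id': book_item['book_id'][0],
        'type': book_item['book_type'],
    },
    headers={'User-Agent': 'Mozilla/5.0'},
)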
def parse(self, response):
    items = ItemLoader(item=XsContentItem(), response=response)
    # chapter title
    items.add_xpath('title', '//*[@class="bookname"]/h1/text()')
    # chapter body text
    items.add_xpath('text', '//*[@id="content"]/text()')
    yield items.load_item()
def parse(self, response):
    for sel in response.css("ul#channels-browse-content-grid > li"):
        loader = ItemLoader(YoutubeVideo(), selector=sel)
        loader.add_xpath('link', './/h3/a/@href')
        yield loader.load_item()
def parse(self, response):
    l = ItemLoader(item=JianshuArticleItem(), response=response)
    l.add_xpath(
        'content',
        '//div[@class="article"]/div[@class="show-content"]/p/text()')
    l.add_value('url', response.url)
    return l.load_item()
def parsePage(self, response):
    rentHouse = ItemLoader(item=RentItem(), response=response)
    rentHouse.add_value('id', self.name + '-' +
                        response.url.split('/')[-1].split('.')[0])
    rentHouse.add_value('link', response.url)
    rentHouse.add_xpath('title', "//dl[@class = 'title']/dt/p/text()")
    return rentHouse.load_item()
def parse(self, response):
    content = response.body
    page = response.url.split("/")[-1]
    """
    content = Selector(response=response).xpath("//div[@class='body textStyle']").extract()
    if (len(content)):
        content = content[0]
        # strip HTML tags
        strip = StripTags()
        content = strip.filterTags(content)
        # write to file
        filename = 'quotes-%s' % page
        with open(filename, 'w') as f:
            f.write(str(content))
            self.log('Saved file %s' % filename)
    """
    loader = ItemLoader(item=TutorialItem(), response=response)
    loader.add_xpath('title', "//title/text()")
    loader.add_xpath('content', "//div[@class='body textStyle']")
    data = loader.load_item()
    downFile = DownFile(data['content'][0], 'http://www.admin10000.com')
    downFile.downImgFile()
    mongo = Mongo("articles")
    mongo.setTable("admin10000")
    content = data['content'][0]
    # strip HTML tags
    strip = StripTags()
    content = strip.filterTags(content)
    article = {'title': data['title'][0], 'content': content}
    mongo.add(article)
def parse(self, response):
    l = ItemLoader(item=MyItem(), response=response)
    l.add_xpath(
        "title",
        """//div[@class="carousel"]/div[@class="songlist-slides slide-page"]/ul[@class="list-songlist slide-item"]/li[@class="songlist-item"]/a[@class="lnk-songlist"]/@title""",
    )
    return l.load_item()
def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    # prepare to adjust for shootout stats if necessary
    shootout = 0
    if self.year > 2005:
        shootout = 1
    # loop through players
    for row in rows:
        loader = ItemLoader(SkatEngItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get unique NHL ID number from player's page URL
        num = row.xpath("td[2]/a/@href").extract()
        sNum = num[0][-7:]
        loader.add_value("nhl_num", sNum)
        # add season data
        loader.add_value("season", str(self.year))
        # collect stats
        if shootout:
            loader.add_xpath("en_goals", ".//td[20]/text()")
            loader.add_xpath("ps_goals", ".//td[21]/text()")
        else:
            loader.add_xpath("en_goals", ".//td[21]/text()")
            loader.add_xpath("ps_goals", ".//td[22]/text()")
        # feed item to pipeline
        yield loader.load_item()
def parse_review(self, response):
    sel = Selector(response)
    if not self._is_right_category(sel):
        self.log('Skip URL: %s' % response.url, level=log.INFO)
        return
    self.log('Parse URL: %s' % response.url, level=log.INFO)
    loader = ItemLoader(item=YelpReview(), selector=sel)
    loader.add_value('crawl_date', '%s' % datetime.utcnow())
    loader.add_value('page_url', response.url)
    # Loop over all the fields we need to extract.
    for field, selector in self._item_selectors.iteritems():
        loader.add_xpath(field, selector)
    master_review = loader.load_item()
    review_selectors = sel.xpath('//div[contains(@class, "review")][@itemprop="review"]')
    for rev_sel in review_selectors:
        review_loader = ItemLoader(item=master_review.copy(), selector=rev_sel)
        for field, selector in self._review_selectors.iteritems():
            review_loader.add_xpath(field, selector)
        yield review_loader.load_item()
    return
def parse_item(self, response):
    for e in response.xpath('//table[@id="basic"]/tbody/tr'):
        l = ItemLoader(ProxyHunterItem(), selector=e)
        l.add_xpath('ip', 'td[2]/a/text()')
        l.add_xpath('port', 'td[3]/text()')
        l.add_xpath('prot', 'td[4]/a/text()')
        yield l.load_item()
def parse_item(self, response):
    l = ItemLoader(item=MeizituItem(), response=response)
    # l.add_xpath('name', '//div[@class="postContent"]/div[@id="picture"]/p/a/text()')
    # l.add_xpath('tags', '//div[@class="postContent"]')
    l.add_xpath('img_url', '//div[@class="text"]/p/br/img/@src', Identity())
    l.add_value('url', response.url)
    return l.load_item()
def parse_item(self, response):
    l = ItemLoader(item=CrawlpictureItem(), response=response)
    l.add_xpath('name', '//h2/a/text()')
    l.add_css('tags', 'div.metaRight p::text')
    #l.add_xpath('image_urls', '//div[@id="picture"]/p/img/@src' or '//img[@class="scrollLoading"]/@src', Identity())
    l.add_css('image_urls', 'div.postContent img::attr(src)', Identity())
    l.add_value('url', response.url)
    return l.load_item()
def parse_item(self, response):
    self.logger.info("parse_item url %s" % response.url)
    l = ItemLoader(item=ImgDownloadItem(), response=response)
    l.add_xpath('name', '//h1[@class="article-title"]/a/text()')
    # l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p")
    l.add_xpath('image_urls', "//article[@class='article-content']/p/img/@src", Identity())
    l.add_value('url', response.url)
    return l.load_item()
def parse_item(self, response):
    l = ItemLoader(item=CoserItem(), response=response)
    l.add_xpath('name', "//h1[@class='js-post-title']/text()")
    l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
    urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
    urls = [url.replace('/w650', '') for url in urls]
    l.add_value('image_urls', urls)
    l.add_value('url', response.url)
    return l.load_item()
def parse_final(self, response):
    # sel = Selector(response)
    # item = MeiziScrapyItem()
    # item["image_name"] = sel.xpath('/html/body/div[2]/div[1]/h2/text()').extract()[0]
    # return item
    l = ItemLoader(item=MeiziScrapyItem(), response=response)
    l.add_xpath('image_name', '/html/body/div[2]/div[1]/h2/text()')
    l.add_xpath('image_url', '//*[@id="content"]/a/img/@src')
    return l.load_item()
def parse_template(self, response):
    """
    Callback used by Scrapy to process downloaded responses

    //*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[4]/table/tbody/tr[10]/td[2]
    """
    response_body = response.body_as_unicode()
    # Checking if coffee beans are present in the source, since it shifts down the divs
    coffee = True if 'cups of coffee' in response_body else False
    prop_xpath = '//div[@class="info_wrapper"]//tr[td[@class="key"]/strong/text() = "{}:"]/td[@class="value"]/text()'
    substr_xpath = 'substring-after(normalize-space({}), "{}")'
    item_fields = {
        'item_hash': '//*[@id="offer_sku"]/text()',
        'title': '//*[@id="thing_name"]/text()',
        'thumbnail': '//*[@id="thing_image"]/@src',
        'description': '//*[@id="description"]',
        'creator': '//*[@id="product_manufacturer"]/text()',
        'when': prop_xpath.format('Released'),
        'bootstrap_version': substr_xpath.format(prop_xpath.format('Bootstrap'), 'Compatible with '),
        'cost_single': substr_xpath.format(
            '//*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[{}]/div[1]//span/text()'
            .format(3 if coffee else 2), '$'),
        'cost_multiple': substr_xpath.format(
            '//*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[{}]/div[2]//span/text()'
            .format(3 if coffee else 2), '$'),
        'cost_extended': substr_xpath.format(
            '//*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[{}]/div[3]//span/text()'
            .format(3 if coffee else 2), '$'),
        'purchases': '//div[@class="purchases"]/span[@class="count"]/text()',
    }
    selector = Selector(response)
    loader = ItemLoader(WrapBootstrapTemplate(), selector=selector)
    # define processors
    loader.default_input_processor = MapCompose(unicode.strip)
    loader.default_output_processor = Join()
    # iterate over fields and add xpaths to the loader
    for field, xpath in item_fields.iteritems():
        loader.add_xpath(field, xpath)
    yield loader.load_item()
def parse(self, response):
    l = ItemLoader(item=CoserItem(), response=response)
    l.add_xpath('name', "//h1[@class='js-post-title']/text()")
    l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
    urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
    urls = [url.replace('/w650', '') for url in urls]
    l.add_value('image_urls', urls)
    l.add_value('url', response.url)
    return l.load_item()
def parse(self, response):
    sel = Selector(response)
    articulos = sel.xpath('/html/body/div[2]/div/div/div/div[1]/div[3]/div')
    for i, elem in enumerate(articulos):
        item = ItemLoader(Articulos(), elem)
        item.add_xpath('title', './/h3/text()')
        item.add_value('id', i)
        yield item.load_item()
def parse(self, response):
    for e in response.xpath(
            '//table[@id="tbl_proxy_list"]//tr[count(td)=6]'):
        l = ItemLoader(ProxyHunterItem(), selector=e)
        l.add_value('prot', 'http')
        l.add_xpath('ip', 'td[1]', TakeFirst(), remove_tags, unicode.strip)
        l.add_xpath('port', 'td[2]', TakeFirst(), remove_tags, unicode.strip)
        yield l.load_item()
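The loader above repeats its cleanup chain (TakeFirst, remove_tags, strip) on every add_xpath call. A sketch of declaring the same processors once on an ItemLoader subclass instead; the import paths assume Scrapy 1.x, and remove_tags is assumed to come from w3lib.html:

from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst
from w3lib.html import remove_tags

class ProxyHunterLoader(ItemLoader):
    # take the first extracted value for every field unless overridden
    default_output_processor = TakeFirst()
    # per-field input processors: strip markup, then surrounding whitespace
    ip_in = MapCompose(remove_tags, lambda s: s.strip())
    port_in = MapCompose(remove_tags, lambda s: s.strip())

With such a loader, the calls above reduce to l.add_xpath('ip', 'td[1]') and l.add_xpath('port', 'td[2]').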
def parse_item2(self, response):
    l = ItemLoader(item=DmozItem(), response=response)
    l.add_xpath(
        'type',
        '//div[@class="location ask_main_location"]/span[@class="fl"]/a[last()]/text()'
    )
    l.add_xpath('type', '//div[@class="question"]/h2/text()')
    l.add_xpath('answer', '//div[@class="anwser"]/h2/text()')
    l.add_value('answer', '牛逼')
    yield l.load_item()
def parse(self, response):
    sel = Selector(response)
    preguntas = sel.xpath(
        '//div[@id="question-mini-list"]/div')  # this is a list of question blocks
    for i, elem in enumerate(preguntas):
        item = ItemLoader(Pregunta(), elem)  # elem provides the XPath context
        item.add_xpath('pregunta', './/h3/a/text()')
        item.add_value('id', i)
        yield item.load_item()
def Loader_index(self, item_selector):
    l = ItemLoader(item={}, selector=item_selector)
    conver_img = l.get_xpath('.//*[@class="lz_img"]/img/@src')
    l.add_xpath('title', './/*[@class="k_list-lb-2"]/div[1]/a[1]/text()')
    l.add_xpath('url', './/*[@class="k_list-lb-2"]/div[1]/a/@href')
    l.add_value('preview', conver_img)
    l.add_css('date', '#k_list-lb-2-f::text', re=r'(\d{4}-\d{2}-\d{2})')
    l.add_value('image_urls', conver_img)
    return l.load_item()
def parse_item(self, response):
    t_cl_item = response.xpath('//table[@class="item"]')
    for t in t_cl_item:
        l = ItemLoader(item=SSDItem(), selector=t)
        l.add_xpath('name', './/td[@class="l"]/a/text()')
        l.add_xpath('usual_price', './/div[@class="price"]/text()')
        l.add_xpath('club_price', './/div[@class="price club_price"]/text()')
        l.add_xpath('img_url', './/div[@class="photo"]/span/img/@src')
        l.add_xpath('url', './/td[@class="l"]/a/@href')
        yield l.load_item()
def parse(self, response):
    sel = Selector(response)
    lugares = sel.xpath('//div[@id="hotellist_inner"]/div')
    # iterate over all the places found
    for i, elem in enumerate(lugares):
        item = ItemLoader(Lugar(), elem)
        item.add_xpath('lugar', './/h3/a/span/text()')
        item.add_value('id', i)
        yield item.load_item()
def parse(self, response):
    direction = response.xpath('//li[@class="btn-schedules-active"][1]/text()').extract()
    day = response.xpath('//li[@class="btn-schedules-active"][2]/text()').extract()
    for sel in response.xpath('//tr'):
        loader = ItemLoader(item=RtdItem(), selector=sel)
        loader.default_output_processor = TakeFirst()
        loader.add_value('day', day)
        loader.add_value('direction', direction)
        loader.add_xpath('route', 'th/a/text()')
        loader.add_xpath('depart_time', 'td[1]/text()')
        loader.add_xpath('arrive_time', 'td[2]/text()')
        yield loader.load_item()
def parse_detail(self, response):
    print("response.url===", response.url)
    # the concrete URL value
    url = response.url
    # use the ItemLoader class
    item = ItemLoader(item=Meizitu2Item(), response=response)
    item.add_xpath("title", "//h2/a/text()")
    item.add_xpath("image_urls", '//div[@id="picture"]//img/@src')
    # add a value directly
    item.add_value("url", url)
    return item.load_item()
def parse_content(self, response):
    goods_loader = ItemLoader(item=AlibbItem(), response=response)
    url = str(response.url)
    goods_loader.add_value('url', url)
    goods_loader.add_value('url_hash', hashlib.sha1(url).hexdigest())
    goods_loader.add_xpath('name', self._x_query['title'].encode('utf-8'))
    # detail data
    iDetailDataPattern = re.compile("iDetailData.*};", re.DOTALL)
    detail_data_list = response.xpath('//script').re(iDetailDataPattern)
    detail_data = detail_data_list[0].replace("iDetailData = {", "{")
    detail_data = detail_data.replace("};", "}")
    detail_data = detail_data.replace("\t|\n|\\", "")
    detail_data_json = json.loads(detail_data)
    if len(detail_data_json) != 0:
        properties = detail_data_json['sku']['skuMap'].keys()
        goods_loader.add_value('properties', [property.replace(">", ",") for property in properties])
        for attribute in detail_data_json['sku']['skuProps']:
            attributes = {}
            options = [value['name'] for value in attribute['value']]
            attributes['name'] = attribute['prop']
            attributes['options'] = options
            goods_loader.add_value('attributes', attributes)
    else:
        goods_loader.add_value('attributes', "")
    price = response.xpath('//span[re:test(@class,"value price-length-\d$")]/text()').extract()
    goods_loader.add_value('price', price[0] if len(price) > 0 else detail_data_json['sku']['price'])
    # detail information
    detail_info_list = response.xpath(self._x_query['detail_info']).extract()
    goods_loader.add_value('parameters',
                           [list(info_list) for info_list in zip(detail_info_list[::2], detail_info_list[1::2])])
    print goods_loader.load_item()['url']
    # profile img
    profile_img_urls = response.xpath('//li/@data-imgs').re("original.*jpg")
    for urls in profile_img_urls:
        profile_img_url = urls.replace("original\":\"http", "http")
        goods_loader.add_value("boothes", profile_img_url)
    # big img
    for link in response.xpath('//*[@id="desc-lazyload-container"]/@data-tfs-url').extract():
        yield Request(url=link, meta={'item': goods_loader}, callback=self.parse_content_down)
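parse_content above forwards the partially filled goods_loader to self.parse_content_down through request meta, but that callback is not included in this collection. A minimal sketch of what it might look like; the image-URL regex and the detail_imgs field name are assumptions, not the original project's code:

def parse_content_down(self, response):
    # retrieve the ItemLoader that parse_content passed along in request meta
    goods_loader = response.meta['item']
    # pull whatever large-image URLs the lazy-loaded fragment exposes (pattern assumed)
    for img_url in response.selector.re(r'https?://[^"\\]+\.jpg'):
        goods_loader.add_value('detail_imgs', img_url)  # hypothetical field name
    yield goods_loader.load_item()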
def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    # instantiate parsing variables
    MONTHS = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
              'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
              'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}
    # loop through players
    for row in rows:
        loader = ItemLoader(GoalBioItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get unique NHL ID number from player's page URL
        num = row.xpath('td[2]/a/@href').extract()
        sNum = num[0][-7:]
        loader.add_value('nhl_num', sNum)
        # parse the name
        name = row.xpath('td[2]/a/text()').extract()
        sName = name[0].split(' ', 1)
        loader.add_value('first_name', sName[0])
        loader.add_value('last_name', sName[1])
        # collect birth year
        bDate = row.xpath('td[4]/text()').extract()[0]
        bYear = "19" + bDate[-2:]
        bMonth = MONTHS[bDate[:3]]
        bDay = bDate[4:6]
        loader.add_value('birthday', "%s-%s-%s" % (bYear, bMonth, bDay))
        # add other data points
        loader.add_value('position', 'G')
        loader.add_xpath('draft_year', './/td[12]/text()')
        loader.add_xpath('draft_position', './/td[14]/text()')
        # feed item to pipeline
        yield loader.load_item()
def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    # loop through players
    for row in rows:
        loader = ItemLoader(SkatPMItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get unique NHL ID number from player's page URL
        num = row.xpath("td[2]/a/@href").extract()
        sNum = num[0][-7:]
        loader.add_value("nhl_num", sNum)
        # add season data
        loader.add_value("season", str(self.year))
        # collect stats
        loader.add_xpath("team_goals_for", ".//td[14]/text()")
        loader.add_xpath("team_pp_goals_for", ".//td[15]/text()")
        loader.add_xpath("team_goals_against", ".//td[16]/text()")
        loader.add_xpath("team_pp_goals_against", ".//td[17]/text()")
        # feed item to pipeline
        yield loader.load_item()
def parse_item(self, response):
    # l = an ItemLoader wrapping a MeizituItem()
    l = ItemLoader(item=MeizituItem(), response=response)
    # name
    # l.add_xpath('name', '//h2/a/text()')
    # tags
    # l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p")
    # image links
    l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
    # url
    l.add_value('url', response.url)
    return l.load_item()
def parse_parts2(self, response):
    log.msg("\tparse_parts time: %s" % int(time.time()), level=log.DEBUG)
    ua = response.request.headers['User-Agent']
    log.msg("\tua: %s" % ua, level=log.DEBUG)
    for part in response.css('table.parts > tbody > tr'):
        il = ItemLoader(item=CarPart(), selector=part)
        il.add_xpath('shop_city', "td[@class='shop']/a/text()")
        il.add_xpath('shop_name', "td[@class='shop']/a/strong/text()")
        shop_url = il.get_xpath("td[@class='shop']/a/@href", TakeFirst())
        photo_url = il.get_xpath("td[@class='photo']/a/@href", TakeFirst())
        il.add_value('shop_url', urljoin(self.main_url, shop_url))
        il.add_value('ext_link', urljoin(self.main_url, photo_url))
        il.add_xpath('info', "td[@class='info']//text()")
        il.add_xpath('price', "td[@class='price']//text()")
        il.add_value('brand', response.meta.get('brand'))
        il.add_value('model', response.meta.get('model'))
        il.add_value('car_part', response.meta.get('car_part'))
        il.add_value('category', response.meta.get('category'))
        item = il.load_item()
        if item.is_valid():
            yield item
def parse_item(self, response):
    l = ItemLoader(item=CoserItem(), response=response)
    l.add_xpath('name', "//div[@class='mb10 dib']/a/text()")
    l.add_xpath('info', "//div/p[@class='mb20']/text()")
    #l.add_xpath('image_urls', "//div[@class='content-img-wrap-inner']/img[@src]")
    l.add_value('url', response.url)
    # //div[@class='content-img-wrap']//img/@src
    # the XPath above can't capture these, so fall back to a regex
    urls = l.selector.re(r'src="(.+?.jpg)/w650')
    # urls = l.get_xpath("//div[@class='content-img-wrap']//img/@src")
    # urls = [url.replace('/w650', '') for url in urls]
    l.add_value('image_urls', urls)
    # l.add_xpath('image_urls', "//div/p[@class='mb20']/text()")
    yield l.load_item()
def parse(self, response):
    l = ItemLoader(item=timeItem(), response=response)
    #l.add_xpath('topnews', '//*[@id="article-container"]/div/div[1]/section/div/article[*]/div/p/text()')
    l.add_xpath('topnews', '//*[@id="article-container"]/div/div[1]/section/div/article[*]/div/h2/a/text()')
    l.add_xpath('topnews', '//*[@id="article-container"]/div/div[1]/section/div/article[1]/div/div/div[2]/div[*]/h3/a/text()')
    l.add_xpath('sectionnews', '//a[contains(@class,"home-columnists-title")]/text()')
    l.add_xpath('sectionnews', '//a[contains(@data-event,"hp-news")]/text()')
    x = l.load_item()
    nytdict = dict()
    datelist = []
    datalist = datetime.date.today()
    topnewslist = []
    sectionnewslist = []
    nytdict['date'] = str(datalist)
    for t in x['topnews']:
        topnewslist.append(str(t.encode('ascii', 'ignore')).strip())
    nytdict['topnews'] = topnewslist
    for t in x['sectionnews']:
        sectionnewslist.append(str(t.encode('ascii', 'ignore')).strip())
    nytdict['sectionnews'] = sectionnewslist
    filename = datetime.date.today()
    f = open('{}.json'.format(filename), 'w')
    json.dump(nytdict, f)
    return l.load_item()
def parse(self, response):
    items = []
    for everyday in response.xpath('//ul/li/strong/a'):
        loader = ItemLoader(ProductItem(), everyday)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        loader.add_xpath('name', 'text()')
        loader.add_xpath('price', '@href')
        loader.add_xpath('stock', '@mon')
        loader.add_value('last_updated', 'today')  # you can also use literal values
        item = self.to_utf8(loader.load_item(),
                            *['name', 'price', 'stock', 'last_updated'])
        self.log(item['name'], log.INFO)
        items.append(item)
    return items
def parse_item(self, response):
    """This method does not populate the following fields:
    locality, mobile_number, country, email
    """
    il = ItemLoader(item=UKBusinessItem(), response=response)
    il.add_value('url', unicode(response.url))
    il.add_xpath('name', '//h3[@class="biz"]/text()')
    il.add_xpath('category', '//div[@id="breadcrumbs"]/a[2]/text()')
    bcon_list = response.xpath('//ul[@class="bcon"]/li')
    for li in bcon_list:
        li_text = cond_set_value(li.xpath('.//b/text()').extract())
        if li_text == 'Tel:':
            phone_number = cond_set_value(li.xpath('text()').extract())
            il.add_value('phone_number', phone_number)
        if li_text == 'Web:':
            website = cond_set_value(li.xpath('.//a/text()').extract())
            il.add_value('website', website)
        if li_text == 'Fax:':
            fax_number = cond_set_value(li.xpath('text()').extract())
            il.add_value('fax_number', fax_number)
    address_list = response.xpath('//ul[@class="bad"]/li/text()').extract()
    if address_list:
        address_without_postal_code = u', '.join(address_list[:-1])
        postal_code = address_list[-1]
        il.add_value('address', address_without_postal_code)
        il.add_value('postal_code', postal_code)
    il.add_xpath('latitude', '//div[@id="lat"]/text()')
    il.add_xpath('longitude', '//div[@id="lng"]/text()')
    return il.load_item()
def parse_product(self, response):
    product_url = response.url
    # sel = self.selenium
    # sel.open(response.url)
    # time.sleep(2.5)
    selector = Selector(response)
    # //*[@id="product_detail_view_1"]/div/div[6]/div[2]/span[2]
    price = selector.xpath('//*[@id="product_detail_view_1"]/div/div[7]/div[2]/span[2]/text()').extract()
    if not price:
        price = selector.xpath('//*[@id="product_detail_view_1"]/div/div[6]/div[2]/span[2]/text()').extract()
    if not price:
        price = selector.xpath(
            '//*[@id="product_detail_view_1"]/div/div[5]/div[2]/span[2]/text()').extract()
    if not price:
        price = selector.xpath(
            '//*[@id="product_detail_view_1"]/div/div[4]/div[2]/span[2]/text()').extract()
    l = ItemLoader(item=BillionPricesIndiaItem(), response=response)
    l.add_xpath('product_name', '//*[@id="inner"]/div[1]/div[1]/div/div/text()')
    l.add_xpath('quantity', '//*[@id="product_detail_view_1"]/div/div[1]/div/text()')
    l.add_xpath('category', '//*[@id="inner"]/div[1]/div[1]/div/a[1]/text()')
    l.add_xpath('product', '//*[@id="inner"]/div[1]/div[1]/div/a[2]/text()')
    item = l.load_item()
    item['product_url'] = product_url
    item['price'] = price
    item['vendor'] = 'Local Banya'
    item['city'] = 'Mumbai'
    item['state'] = 'Maharashtra'
    item['country'] = 'India'
    item['date'] = str(time.strftime("%d/%m/%Y"))
    return item
def parse(self, response):
    selector = Selector(response)
    # iterate over titles
    for page in selector.select(self.view):
        loader = ItemLoader(AmazonItem(), page)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.iem_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
def parse_item(self, response):
    # sel = Selector(response)
    # name = sel.xpath("//div[@class='main-image']/p/a/img/@alt").extract()[0]
    # print(name)
    l = ItemLoader(item=MzituItem(), response=response)
    l.add_xpath('image_urls', "//div[@class='main-image']/p/a/img/@src", Identity())
    l.add_xpath('name', "//div[@class='main-image']/p/a/img/@alt", Identity())
    # l.add_value('name', name)
    return l.load_item()
def parse_detail(self, response):
    l = ItemLoader(response.meta['item'], response)
    # l.add_xpath('fanhao', '//span[@class="list_text"]/em/b/a/text()')
    l.add_xpath('image_name', '//span[@class="list_text"]/em/b/a/text()')
    photo = response.xpath(
        '//span[@class="list_img"]/a/img/@data-original').extract()
    # item = response.meta['item']
    # item['fanhao'] = selector.xpath('//span[@class="list_text"]/em/b/a/text()').extract()
    # photo = selector.xpath('//span[@class="list_img"]/a/img/@data-original').extract()
    img = []
    for p in photo:
        img.append('http://www.nh87.cn' + p)
    l.add_value('image_urls', img)
    # return the loaded item
    return l.load_item()
def parse_item(self, response):
    xpath = './/div[@class="content_left"]'
    sel = response.xpath(xpath)
    if not sel:
        return
    l = ItemLoader(item=HabrahabrItem(), selector=sel, response=response)
    l.add_xpath('title', '//h1/span/text()')
    l.add_xpath('image_urls', '//div[@class="content html_format"]/img/@src')
    comments_items = []
    comments = sel.xpath('//div[starts-with(@class, "message html_format")]').extract()
    for comment in comments:
        comment_item = ItemLoader(item=HabrahabrComment(), selector=sel, response=response)
        comment_item.add_value('comment', comment)
        comments_items.append(comment_item.load_item())
    l.add_value('comments', comments_items)
    yield l.load_item()
def parse(self, response):
    items = ItemLoader(item=LiveItem(), response=response)
    for content in response.xpath('//*[@id="sortdetail-container"]/li/a'):
        i = ItemLoader(item=LiveItem(), selector=content)
        # stream title
        i.add_xpath('title', 'div[2]/span[1]/text()')
        # streamer username
        i.add_xpath('username', 'div[2]/span[2]/@title')
        # popularity / viewer count
        i.add_xpath('num', 'div[2]/span[4]/i/text()')
        # thumbnail image URL
        i.add_xpath('pic_addr', 'div[1]/img/@data-original')
        # relative URL of the live room
        i.add_xpath('addr', '@href')
        # streaming platform
        i.add_value('platform', 'panda')
        yield i.load_item()
def parse_item(self, response):
    """Fields not populated by this method:
    locality, country, mobile_number, category.
    """
    il = ItemLoader(item=UKBusinessItem(), response=response)
    il.add_value('url', unicode(response.url))
    il.add_xpath('name', '//div[@class="company_details"]/h3/text()')
    address_p = response.xpath(
        '//div[@class="fl company_details_inset1"]/p')
    address_first_part = address_p.xpath(
        'text()[normalize-space()]').extract()
    address_first_part = [ad.strip().replace('\\n\\r', '')
                          for ad in address_first_part]
    address_second_part = cond_set_value(
        address_p.xpath(
            './/a[contains(@href, "town")]/text()').extract(), '')
    address = ', '.join(address_first_part) + ', ' + address_second_part
    if address == ', ':
        return
    il.add_value('address', address)
    postal_code = ''.join(
        address_p.xpath(
            './/a[not(contains(@href, "town"))]/text()').extract())
    il.add_value('postal_code', postal_code)
    il.add_xpath('website', './/div[@class="company-website "]/a/text()')
    il.add_xpath('fax_number', './/span[@class="company-fax"]/text()')
    il.add_xpath('phone_number', './/span[@class="company-phno"]/@phone')
    mail_a = response.xpath('.//a[@value="Contact us"]/text()').extract()
    try:
        mail_a.remove('Contact Us')
    except:
        pass
    mail_a = [a.strip() for a in mail_a]
    mail = '@'.join(mail_a)
    il.add_value('email', mail)
    latitude = ''
    longitude = ''
    map_data = cond_set_value(response.xpath(
        './/p[@class="direction_map"]/iframe/@src').extract())
    if map_data:
        try:
            qs = parse_qs(map_data)
            coordinates = qs['ll'][0].split(',')
            latitude = coordinates[0]
            longitude = coordinates[1]
        except:
            pass
    il.add_value('latitude', latitude)
    il.add_value('longitude', longitude)
    return il.load_item()
def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://odds.500.com/index_jczq_2014-08-29.shtml
    """
    selector = Selector(response)
    # iterate over matches
    for match in selector.select(self.match_list_xpath):
        loader = ItemLoader(Match(), selector=match)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.match_fields.iteritems():
            loader.add_xpath(field, xpath)
        match_item = loader.load_item()
        match_item["game_date"] = self.game_date
        match_item["season_id"] = match_item["season_id"].split('-')[-1]
        match_item["teama_id"] = match_item["teama_id"].split('-')[-1]
        match_item["teamb_id"] = match_item["teamb_id"].split('-')[-1]
        if "score" in match_item:
            sa, sb = match_item["score"].split(':')
            match_item["score_a"] = sa
            match_item["score_b"] = sb
            match_item["result"] = "win" if sa > sb else "draw" if sa == sb else "lost"
        else:
            match_item["score_a"] = match_item["score_b"] = -1
            match_item["result"] = "none"
        yield match_item
        # scrape asia odds
        # id=454359&ctype=1&start=60&r=1&style=0&guojia=0
        for i in xrange(3):
            url = self.asia_odds_url % (match_item["match_id"], i * 30)
            request = scrapy.Request(url, callback=self.parse_asia_odds)
            request.meta['match_item'] = match_item
            yield request
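The season/team id splitting done after load_item() above could also be declared on the Match item itself as input processors. A minimal sketch under the assumption of Scrapy 1.x import paths; this is not the project's actual Match definition, which is not shown here:

from scrapy.item import Item, Field
from scrapy.loader.processors import MapCompose, Join

def last_token(value):
    # e.g. "jczq-123456" -> "123456"
    return value.split('-')[-1]

class Match(Item):
    match_id = Field()
    game_date = Field()
    score = Field()
    # keep only the id after the last '-'; ItemLoader reads these
    # input_processor/output_processor keys from the Field metadata
    season_id = Field(input_processor=MapCompose(lambda s: s.strip(), last_token),
                      output_processor=Join())
    teama_id = Field(input_processor=MapCompose(lambda s: s.strip(), last_token),
                     output_processor=Join())
    teamb_id = Field(input_processor=MapCompose(lambda s: s.strip(), last_token),
                     output_processor=Join())
    # score_a, score_b, result and the remaining fields are omitted from this sketch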
def parse(self, response):
    l = ItemLoader(item=googleItem(), response=response)
    l.add_xpath('news', '//span[contains(@class,"titletext")]/text()')
    x = l.load_item()
    #print(len(x['date']), len(x['topnews']), len(x['sectionnews']))
    nytdict = dict()
    datelist = []
    datalist = datetime.date.today()
    newslist = []
    nytdict['date'] = str(datalist)
    for t in x['news']:
        newslist.append(str(t.encode('ascii', 'ignore')))
    nytdict['news'] = newslist
    filename = datetime.date.today()
    f = open('{}.json'.format(filename), 'w')
    json.dump(nytdict, f)
    return l.load_item()
def detail(self, response):
    log.msg(response.url)
    hxs = HtmlXPathSelector(response)
    product_name = hxs.xpath('//*[@id="vip_content_section"]/div[2]/h1/text()').extract()
    # //*[@id="vip_content_section"]/div[2]/h1
    if (len(product_name) != 0):
        product_name = hxs.xpath('//*[@id="vip_content_section"]/div[2]/h1/text()').extract()[0]
    product_price = hxs.xpath('//*[@id="price-val"]/text()').extract()
    if (len(product_price) != 0):
        product_price = hxs.xpath('//*[@id="price-val"]/text()').extract()[0]
    if (len(product_price) != 0 or product_price != None) and (len(product_name) or product_name != None):
        l = ItemLoader(item=BillionPricesIndiaItem(), response=response)
        l.add_xpath('product_name', '//*[@id="vip_content_section"]/div[2]/h1/text()')
        # l.add_xpath('quantity', '//*[@id="product_detail_view_1"]/div/div[1]/div/text()')
        l.add_xpath('category', '//*[@id="cat_crum"]/@value')
        l.add_xpath('product', '//*[@id="overview_tab"]/div/div/p/text()')
        item = l.load_item()
        item['product_url'] = response.url
        item['price'] = product_price
        item['vendor'] = 'PepperFry'
        item['city'] = 'Mumbai'
        item['state'] = 'Maharashtra'
        item['country'] = 'India'
        item['date'] = str(time.strftime("%d/%m/%Y"))
        return item
def parse_item(self, response):
    l = ItemLoader(item=MeizituItem(), response=response)
    l.add_xpath('name', '//h2/a/text()')
    l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p")
    l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
    l.add_value('url', response.url)
    return l.load_item()
def parse_item(self, response):
    sel = Selector(response)
    # collect xpaths of each player (row in table)
    rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')
    for row in rows:
        loader = ItemLoader(GoalSOItem(), selector=row)
        loader.default_input_processor = MapCompose()
        loader.default_output_processor = Join()
        # get unique NHL ID number from player's page URL
        num = row.xpath('td[2]/a/@href').extract()
        sNum = num[0][-7:]
        loader.add_value('nhl_num', sNum)
        # add season data
        loader.add_value('season', str(self.year))
        # collect additional stats
        loader.add_xpath('so_wins', './/td[14]/text()')
        loader.add_xpath('so_losses', './/td[15]/text()')
        loader.add_xpath('so_shots_against', './/td[16]/text()')
        loader.add_xpath('so_goals_against', './/td[17]/text()')
        # feed item to pipeline
        yield loader.load_item()
def parse_content(self, response):
    bbsItem_loader = ItemLoader(item=BbsItem(), response=response)
    url = str(response.url)
    bbsItem_loader.add_value("url", url)
    bbsItem_loader.add_xpath("forum", self._x_query["forum"])
    bbsItem_loader.add_xpath("poster", self._x_query["poster"])
    bbsItem_loader.add_xpath("content", self._x_query["page_content"])
    return bbsItem_loader.load_item()