def parse(self, response):
    hxs = HtmlXPathSelector(response)
    if "openmarket" in response.url:
        rate = XPathItemLoader(item=FinanceIndex(), response=response)
        rate.add_value("name", "Tasa Objetivo FED")
        rate.add_value("unit", "%")
        rate.add_value("value", hxs.select("//td[@class='data'][3]/text()").re("\d+\.\d+"))
        #rate.update_only_if_change = True
        return [rate.load_item()]
    else:
        for line in response.body_as_unicode().splitlines():
            if "Federal funds (effective)" in line:
                rate = XPathItemLoader(item=FinanceIndex(), response=response)
                rate.add_value("name", "FED effective fund rate")
                rate.add_value("unit", "%")
                rate.add_value("value", hxs.select("//th[contains(text(), 'Federal funds')]/following-sibling::td/text()").re("\xa0(.*?)\xa0"))
                return [rate.load_item()]

def parse(self, response):
    """ This function parses a sample response. Some contracts are mingled
    with this docstring.

    @url http://www.haodf.com/yiyuan/shanghai/list.htm
    @returns items 21 21
    @returns requests 3 3
    @scrapes _hospitalName grade area city
    """
    hxs = HtmlXPathSelector(response)
    city_tree = hxs.select("//div[@id='el_tree_1000000']")
    # Used for hospital
    _cityName = city_tree.select("div[@class='kstl2']/a/text()").extract()[0]
    l = XPathItemLoader(CityItem(), city_tree)
    l.add_xpath('cityAreas', "div[@class='ksbd']/ul/li/a/text()")
    l.add_xpath('_cityName', "div[@class='kstl2']/a/text()")
    yield l.load_item()
    for url in city_tree.select("div[@class='kstl']/a/@href").extract():
        yield Request(url, callback=self.parse)
    area_list = hxs.select("//div[@id='el_result_content']/div/div[@class='bxmd']/div")
    hospital_list = area_list.select("div[@class='m_ctt_green']/ul/li/a")
    for hospital in hospital_list:
        l = XPathItemLoader(HospitalItem(), hospital)
        l.add_xpath('_hospitalName', "text()")
        featureList = hospital.select("following-sibling::span/text()").extract()
        if len(featureList) == 1:
            featureStr = featureList[0].strip()
            m = re.match(u"\((?P<grade>\S+)(|, 特色:(?P<feature>\S+))\)", featureStr)
            if m is not None:
                if m.groupdict()["grade"] is not None:
                    l.add_value('grade', m.groupdict()["grade"])
                if m.groupdict()["feature"] is not None:
                    l.add_value('feature', m.groupdict()["feature"])
        #l.add_xpath('feature', "following-sibling::span/text()")
        l.add_xpath('area', "parent::*/parent::*/parent::*/preceding-sibling::*[1]/attribute::id")
        l.add_value('city', _cityName)
        yield l.load_item()

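# Note: the @url/@returns/@scrapes lines in the docstring above are Scrapy
# spider contracts. A minimal sketch of how they are exercised, assuming the
# spider above is registered under the hypothetical name "haodf":
#
#   $ scrapy check haodf
#
# "scrapy check" fetches each contract's @url, runs the callback on the
# response, and asserts the declared item/request counts and scraped fields.
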
def get_question(self, selector, response):
    # both select function and selector's join function need to add a dot
    # to search relative to the current node
    question_loader = XPathItemLoader(item=LazyTweetQuestion(),
                                      selector=selector)
    question_loader.add_xpath('question_content', ''.join([
        './/span[@class="post-body"]',
        '//span[@class="post-status"]/descendant-or-self::text()'
    ]))
    # not useful
    question_loader.add_xpath('question_tags',
                              '//*[@id="post-tags"]/ul/li/a/text()')
    question_loader.add_xpath('asking_date',
                              './/span[@class="post-meta"]//span[@class="timestamp"]/text()')
    question_loader.add_value('asker', self.get_user(
        selector.select('.//span[@class="post-meta"]')))
    question_loader.add_xpath('number_of_answers', ''.join([
        './/span[@class="post-meta"]',
        '//a[last()]/text()'
    ]))
    question_loader.add_value('question_id', response.url.split('/')[-1])
    print question_loader.get_output_value('question_tags')
    return question_loader.load_item()

def parse(self, response):
    # Earlier non-loader version, kept commented out for reference:
    # hxs = HtmlXPathSelector(response)
    # ads = hxs.select('//div[@class="list-ads"]/a')
    # items = []
    # for ad in ads:
    #     item = LeboncoinItem()
    #     item['name'] = ad.select('div[@class="ad-lbc"]/div[@class="detail"]/div[@class="title"]/text()').re('^\s*([\w\s]+\w)\s*')
    #     item['photo'] = ad.select('div[@class="ad-lbc"]/div[@class="image"]/div[@class="image-and-nb"]/img/@src').extract()
    #     item['url'] = ad.select('@href').extract()
    #     self.log(item['name'])
    #     #print item['name'], ':', item['photo'], '--->', item['url']
    #     #html = '<div><div style="width:150px;height:250px;float:left;text-align:center">\
    #     #<img src="%s" alt="" /><br />\
    #     #<p><a href="%s">%s</a></p>\
    #     #</div></div>' % (''.join(item['photo']), ''.join(item['url']), ''.join(item['name']))
    #     ##print photo
    #     #items.append(item)
    #     ## put in filename
    #     #filename = response.url.split("/")[-4]
    #     #open('/tmp/lbc/' + filename + '.html', 'a').write(html)
    #return items
    #yield items
    hxs = HtmlXPathSelector(response)
    for qxs in hxs.select('//div[@class="list-ads"]/a'):
        loader = XPathItemLoader(LeboncoinItem(), selector=qxs)
        loader.add_xpath('name', 'div[@class="ad-lbc"]/div[@class="detail"]/div[@class="title"]/text()', re='^\s*([\w\s]+\w)\s*')
        loader.add_xpath('photo', 'div[@class="ad-lbc"]/div[@class="image"]/div[@class="image-and-nb"]/img/@src')
        loader.add_xpath('url', '@href')
        loader.add_value('category', response.url.split("/")[-4])
        yield loader.load_item()

def get_answer(self, selector, question_loader):
    answer_loader = XPathItemLoader(item=YahooAnswer(), selector=selector)
    answer_loader.add_xpath('answer_id', './@id')
    answer_loader.add_xpath('answer_content', './/div[@class="qa-container"]//div[@class="content"]//text()')
    answer_loader.add_value('answerer', self.get_user(selector))
    answer_loader.add_value('question_id', question_loader.get_output_value('question_id'))
    answer_loader.add_xpath('answering_date', ''.join([
        './/div[@class="qa-container"]//ul[@class="meta"]',
        '/li[1]/abbr/@title'
    ]))
    answer_loader.add_xpath('marks', ''.join([
        './/div[@class="utils-container"]',
        '//li[@class="rate-up"]',
        '//span[@class="seo-rated"]/text()'
    ]))
    answer_loader.add_xpath('marks', ''.join([
        './/div[@class="utils-container"]',
        '//li[@class="rate-up"]',
        '//span[@class="seo-rated"]//strong/text()'
    ]))
    # get the good number or bad number
    marks = answer_loader.get_output_value('marks')
    # print marks
    if marks.find('good') != -1:  # str.find() returns -1 when not found
        answer_loader.add_value('number_of_good_marks', marks.split(' ')[0])
    # bad numbers
    # is best answer
    answer_class = selector.select('./@class').extract()[0]
    if answer_class.find('best') != -1:
        answer_loader.add_value('is_best_answer', 1)
    else:
        answer_loader.add_value('is_best_answer', 0)
    return answer_loader.load_item()

def parse_series_item(self, response):
    hxs = HtmlXPathSelector(response)
    videos = hxs.select('//div[@class="vo1"]')
    for v in videos:
        l = XPathItemLoader(FySeriesItem(), v)
        series_id = self._get_series_id(response.url)
        text = v.select('dl[@class="vd1"]/dt[5]/text()').extract()
        episode_all = self._get_episode_all(text[0])
        l.add_xpath('title', 'div[@class="vd"]/text()[2]', MapCompose(unicode.strip), re='](.+)')
        l.add_xpath('image_url', 'dl[@class="vd1"]/dd/img/@src')
        l.add_xpath('director', 'dl[@class="vd1"]/dt[1]/text()', self._get_default, re='...(.+)')
        l.add_xpath('actor', 'dl[@class="vd1"]/dt[2]/text()', self._get_default, re='...(.+)')
        l.add_xpath('origin', 'dl[@class="vd1"]/dt[4]/text()', self._get_default, re='...(.+)')
        l.add_xpath('episode_count', 'dl[@class="vd1"]/dt[5]/text()', self._get_default, re='\d+')
        l.add_xpath('release_date', 'dl[@class="vd1"]/dt[6]/text()', self._get_default, re='...(.+)')
        l.add_xpath('description', 'dl[@class="vd4"][2]/dd/text()', MapCompose(unicode.strip, self._get_default))
        l.add_value('source_id', self.name + '_' + series_id)
        l.add_value('episode_all', episode_all)
        l.add_value('channel', 2)
        series = l.load_item()
        request = Request(self.episode_list_url + series_id, callback=self.parse_episode_list)
        request.meta['series'] = series
        yield request

def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses
    """
    selector = HtmlXPathSelector(response)
    details = urlparse(response.request.url)
    queryStr = {x.split('=')[0]: x.split('=')[1] for x in details.query.split("&")}
    print "\n", urllib.unquote(queryStr['p%5B%5D']).split("=")[1], queryStr['start']
    for deal in selector.select(self.deals_list_xpath):
        loader = XPathItemLoader(flipkartData(), selector=deal)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        # adding the request URL to the loader
        loader.add_value("requestURL", unicode(response.request.url, "utf-8"))
        # adding the category for the request
        loader.add_value("category", unicode(self.category))
        yield loader.load_item()

def parse(self, response):
    x = XmlXPathSelector(response)
    #x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    #programs = x.select('./body/outline[position()=4]/outline[position()<4]')
    programs = x.select('//body/outline/outline')
    podcastCount = str(len(programs))
    i = 0
    allitems = []
    for program in programs:
        i = i + 1
        l = XPathItemLoader(PodcastItem(), selector=program)
        l.add_xpath('id', 'concat("dpc_", ./@xmlUrl)')
        l.add_value('audioType', 'disco')
        l.add_xpath('brandId', './@xmlUrl')
        l.add_xpath('brandFeed', './@xmlUrl')
        l.add_xpath('brandName', './@title')
        l.add_xpath('brandDescription', './@description')
        l.add_xpath('brandHomepage', './@htmlUrl')
        self.log('Discovering dpc [%s of %s] feeds' % (i, podcastCount), level=log.INFO)
        item = l.load_item()
        yield item

def parse(self, response):
    x = XmlXPathSelector(response)
    x.register_namespace("im", "http://itunes.apple.com/rss")
    x.register_namespace('atom', 'http://www.w3.org/2005/Atom')
    feedCount = str(len(self.start_urls))
    self.i = self.i + 1
    self.log('Reading rss url [%s of %s]' % (self.i, feedCount), level=log.INFO)
    entries = x.select('//atom:entry')
    if entries:
        # an iTunes rss feed
        for entry in entries:
            id = entry.select('./atom:id/@im:id').extract()
            self.log('Entry %s' % (str(id)), level=log.INFO)
            yield Request('http://itunes.apple.com/lookup?id=' + id[0], callback=self.getItunesTrackJson)
    else:
        # a single feed
        l = XPathItemLoader(PodcastItem(), x)
        l.add_value('id', 'rssdisco_' + response.url)
        l.add_value('audioType', 'disco')
        l.add_value('brandFeed', response.url)
        l.add_xpath('brandName', '//./channel/title/text()')
        self.log('Feed from rss %s' % (response.url), level=log.INFO)
        item = l.load_item()
        yield item

def parse_page(self, response, chart, next_pages):
    hxs = HtmlXPathSelector(response)
    # parse every chart entry
    entries = []
    for item in hxs.select('//*[@class="printable-row"]'):
        loader = XPathItemLoader(SingleItem(), selector=item)
        loader.add_xpath('rank', 'div/div[@class="prank"]/text()')
        loader.add_xpath('track', 'div/div[@class="ptitle"]/text()')
        loader.add_xpath('artist', 'div/div[@class="partist"]/text()')
        loader.add_xpath('album', 'div/div[@class="palbum"]/text()')
        single = loader.load_item()
        entries.append(dict(single))
    chart['list'] += entries
    if len(next_pages) == 0:
        log.msg("Done with %s" % (chart['name']))
        yield chart
    else:
        next_page = next_pages.popleft()
        log.msg("Starting nextpage (%s) of %s - %s left" % (next_page, chart['name'], len(next_pages)))
        request = Request('http://www.billboard.com' + next_page,
                          callback=lambda r: self.parse_page(r, chart, next_pages))
        yield request

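# parse_page() above threads mutable state (the chart dict and a deque of the
# remaining page URLs) through a lambda callback. A sketch of how the first
# request could be seeded from a hypothetical chart-index callback; the URLs
# and chart name are illustrative assumptions, not taken from the spider:
from collections import deque

from scrapy.http import Request


def start_chart(self, response):  # sketch of a method on the same spider
    chart = {'name': 'Hot 100', 'list': []}
    next_pages = deque(['/charts/hot-100?page=2', '/charts/hot-100?page=3'])
    # bind chart and next_pages into the callback, as parse_page() expects
    yield Request('http://www.billboard.com/charts/hot-100?page=1',
                  callback=lambda r: self.parse_page(r, chart, next_pages))
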
def parse(self, response):
    url = response.url
    group_name = url[url.find("group"):].split("/")[1]
    hxs = HtmlXPathSelector(response)
    dls = hxs.select('//dl[@class="obu"]')
    items = []
    for dl in dls:
        item = GroupUserItem()
        l = XPathItemLoader(item=item, selector=dl)
        l.add_xpath("homepage", "dt/a/@href")
        l.add_xpath("image", "dt/a/img/@src")
        l.add_xpath("name", "dd/a/text()")
        l.add_value("group", group_name)
        yield l.load_item()
    links = hxs.select('//span[@class="next"]/a/@href').extract()
    for url in links:
        yield Request(url, callback=self.parse)
    if len(links) < 1:
        # fall back to a regex when the "next" link is not found via XPath
        p = re.compile('<span class="next">.*?<a href="(.+?)">', re.S)
        m = p.search(response.body_as_unicode())
        if m:
            url = m.group(1)
            yield Request(url, callback=self.parse)

def getRssFeedFromItunes(self, response):
    itunes1 = json.loads(response.body)
    metaData = response.meta["metaData"]
    podcastId = metaData["podcastId"]
    podcastName = metaData["podcastName"]
    genreName = metaData["genreName"]
    itunesPopularInGenre = metaData["itunesPopularInGenre"]
    self.log("%s %s %s" % (response.url, podcastId, podcastName), level=log.INFO)
    if itunes1["resultCount"] == 1:
        # should only ever be one as looking up by Id
        l = XPathItemLoader(PodcastItem(), response=response)
        l.add_value("id", "itunesglobal_" + itunes1["results"][0]["feedUrl"])
        l.add_value("audioType", "disco")
        l.add_value("brandName", podcastName)
        l.add_value("brandCategory", genreName)
        l.add_value("brandFeed", itunes1["results"][0]["feedUrl"])
        l.add_value("itunesPopularInGenre", str(itunesPopularInGenre))
        l.add_value("itunesTrackId", str(itunes1["results"][0]["trackId"]))
        l.add_value("itunesCollectionId", str(itunes1["results"][0]["collectionId"]))
        if "artistId" in itunes1["results"][0]:
            l.add_value("itunesArtistId", str(itunes1["results"][0]["artistId"]))
        item = l.load_item()
        yield item
    else:
        self.log("--FAILED itunes Json Discovering genre %s %s %s" % (response.url, podcastId, podcastName),
                 level=log.WARNING)
        return

def parse_materials(self, response):
    reportnum = response.request.meta['reportnum']
    text = unicode(response.body, response.encoding)
    hxs = HtmlXPathSelector(text=text)
    materials = hxs.select('//table[@class="t16Standard"]/tr')
    if len(materials) == 0:
        self.log('Materials data not present in response from {0}'.format(response.url), log.INFO)
    else:
        # Skip the first record because it is the header row
        materials.pop(0)
        if len(materials) == 0:
            self.log('No materials reports found in response {0}'.format(reportnum), log.INFO)
        else:
            self.log('Retrieved {0} materials records in report {1}'.format(len(materials), reportnum), log.INFO)
            for material in materials:
                l = XPathItemLoader(NrcScrapedMaterial(), material)
                l.name_in = lambda slist: [s[:32] for s in slist]
                l.add_value('reportnum', reportnum)
                # each field of the item carries its own 'xpath' metadata
                for name, params in NrcScrapedMaterial.fields.items():
                    if 'xpath' in params:
                        l.add_xpath(name, params['xpath'])
                item = l.load_item()
                yield item
    self.db.setBotTaskStatus(reportnum, self.name, 'DONE')

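# The metadata-driven loop above relies on each Field of NrcScrapedMaterial
# carrying its own 'xpath' entry; Scrapy Fields are plain dicts, so arbitrary
# metadata like this is allowed and surfaces through Item.fields. A minimal
# sketch of such an item declaration; the field names and xpaths here are
# illustrative assumptions, not the real schema:
from scrapy.item import Item, Field


class MaterialItemSketch(Item):
    reportnum = Field()                     # filled via add_value(), no xpath
    name = Field(xpath='td[1]/text()')      # picked up by the 'xpath' loop
    amount = Field(xpath='td[2]/text()')
    unit = Field(xpath='td[3]/text()')
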
def parse_rc(self, response):
    loader = XPathItemLoader(item=ParseRcItem(), response=response)
    id = self.parse_id_from_url(response.url)
    loader.add_value('questionId', id)
    loader.add_xpath('text', '//div[@class="text"]/text()')
    loader.add_xpath('text', '//div[@class="text"]/span/text()')
    loader.add_xpath('answerList', '//div[@class="item clearfix"]/span/text()')
    loader.add_xpath('choiceList', '//div[@class="item clearfix"]/b/text()')
    loader.add_xpath('answer', '//div[@class="answer clearfix hidden QuesHidden"]/b/text()')
    # loader.add_xpath('explanation', '//div[@id="DivExplain"]')
    item = loader.load_item()
    if len(item['text']) == 3:
        test = (item['text'][0] + '<span style="text-decoration:underline;">'
                + item['text'][2] + '</span>' + item['text'][1])
    else:
        test = item['text'][0]
    for filename in self.fileList:
        index = filename.find(id)
        if index != -1:
            f = open('/home/huwei/origin/rcarticle/' + filename)
            article = f.read()
            f.close()
            content = self.rc_content.format(
                article[24:len(article) - 4],
                item['questionId'][0], item['questionId'][0], test,
                item['questionId'][0], item['choiceList'][0], item['choiceList'][0], item['answerList'][0],
                item['questionId'][0], item['choiceList'][1], item['choiceList'][1], item['answerList'][1],
                item['questionId'][0], item['choiceList'][2], item['choiceList'][2], item['answerList'][2],
                item['questionId'][0], item['choiceList'][3], item['choiceList'][3], item['answerList'][3],
                item['questionId'][0], item['choiceList'][4], item['choiceList'][4], item['answerList'][4],
                item['questionId'][0], item['answer'][0])
            wf = open('/home/huwei/gmatclub/rc/' + id + '.html', 'w')
            wf.write(content)
            wf.close()
    return item

def parse(self, response):  # actually a method
    """
    Default callback used by Scrapy to process a downloaded response
    """
    # instantiate HtmlXPathSelector() with the response parameter
    selector = HtmlXPathSelector(response)
    # iterate over contents (multiple entries per page)
    for content in selector.xpath(self.content_list_xpath):
        loader = XPathItemLoader(RedditLearnPython(), selector=content)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)  # strip out whitespace from unicode strings
        loader.default_output_processor = Join()  # join data by a space
        # iterate over fields and add xpaths to the loader;
        # iteritems() yields the (k, v) pairs of the dict
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)  # add each field's xpath to the loader
        # load_item() grabs each item field (link, title, etc.), gets its
        # xpath, and processes the data with the input/output processors.
        # Yield each item, then move on to the next entry.
        yield loader.load_item()

def parse_sale(self, response):
    l = XPathItemLoader(item=SaleItem(), response=response)
    l.add_value('url', response.url)
    l.add_xpath('address', '//h1[@class="address"]/text()')
    l.add_xpath('price', '//div[@class="price"]/text()')
    l.add_xpath('sale_date', '//th[text()="Last sale:"]/../td/div[last()]/text()', re=r'on (\w+)')
    l.add_xpath('bedrooms', '//th[text()="Bedrooms:"]/../td/text()')
    l.add_xpath('bathrooms', '//th[text()="Bathrooms:"]/../td/text()', re=r'(\d+)')
    l.add_xpath('powder_rooms', '//th[text()="Bathrooms:"]/../td/text()', re=r', (\d+)')
    l.add_xpath('property_type', '//th[text()="Property type:"]/../td/text()')
    l.add_xpath('size', '//th[text()="Size:"]/../td/text()', re=r'([\d|,]+) sqft')
    l.add_xpath('lot', '//th[text()="Lot:"]/../td/text()')
    l.add_xpath('price_per_sf', '//th[text()="Price/sqft:"]/../td/text()')
    l.add_xpath('year_built', '//th[text()="Year built:"]/../td/text()')
    l.add_xpath('public_records', 'id("property_public_info_module")/ul/li/span/text()')
    return l.load_item()

def process_item(self, task_id):
    report = self.db.loadScrapedFullReport(task_id)
    if report is None:
        return
    text = report['full_report_body']
    # clamp every character to 7-bit ASCII
    text = "".join(chr(min(ord(c), 127)) for c in text)
    t = TextResponse(url=report['full_report_url'], body=text.encode('utf-8'))  # must have utf-8 here
    l = XPathItemLoader(NrcParsedReport(), response=t)
    l.add_value('reportnum', task_id)
    patterns = self.compile_patterns()
    for p in patterns:
        l.add_value(p[0], text, TakeFirst(), unicode.strip, re=p[1])
    county = l.get_output_value('county')
    pattern = self.get_area_code_pattern(county)
    if pattern:
        l.add_value('areaid', county)
        l.add_value('blockid', text, TakeFirst(), unicode.strip,
                    re="%s[\s]+(?:BLOCK[\s]+)?([\d]+)" % pattern)
        l.add_value('blockid', text, TakeFirst(), unicode.strip, re="BLOCK[\s]+([\d]+)")
    item = l.load_item()
    yield item
    self.item_completed(task_id)

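# process_item() above routes plain text through a synthetic TextResponse so
# the loader's re= extraction can be reused on non-HTML report bodies. A
# sketch of the (field, regex) pairs compile_patterns() would have to return
# for that loop to work; the patterns themselves are illustrative guesses:
def compile_patterns(self):  # sketch of a method on the same class
    return [
        ('county', r'COUNTY:\s*([A-Z ]+?)\s{2,}'),
        ('state', r'STATE:\s*([A-Z]{2})'),
        ('material', r'MATERIAL:\s*(\S.*)'),
    ]
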
def parse_page(self, response):
    x = HtmlXPathSelector(response)
    #x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    l = XPathItemLoader(PodcastItem(), x)
    l.add_xpath('id', 'concat("frc_", //head/link[@rel="alternate"][@type="application/rss+xml"])')
    l.add_value('audioType', 'disco')
    l.add_xpath('brandId', '//head/link[@rel="alternate"][@type="application/rss+xml"]')
    l.add_xpath('brandFeed', '//head/link[@rel="alternate"][@type="application/rss+xml"]')
    l.add_xpath('brandName', './/div[contains(@class, "article-full")]/h2/text()')
    l.add_xpath('brandImage', './/div[contains(@class, "article-full")]/div[@class="illustration"]/img/@src')
    l.add_xpath('brandDescription', './/div[contains(@class, "article-full")]/span[position()=1]/text()')
    l.add_value('brandHomepage', response.url)
    l.add_value('channelName', 'France Culture')
    l.add_value('channelHomepage', 'http://www.franceculture.fr/')
    l.add_value('channelImage', 'http://www.franceculture.fr/sites/all/themes/franceculture/images/logo.png')
    l.add_value('ownerId', 'FRR')
    l.add_value('ownerName', 'Radio France')
    l.add_value('ownerHomepage', 'http://www.radiofrance.fr/')
    l.add_value('ownerKey', 'frr')
    l.add_value('ownerImage', 'http://www.radiofrance.fr/fileadmin/templates/images/bloc_tete/logo.png')
    item = l.load_item()
    self.log('Discovering frr %s' % (item['brandName']), level=log.INFO)
    yield item

def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses
    """
    selector = HtmlXPathSelector(response)
    details = urlparse(response.request.url)
    queryStr = {x.split('=')[0]: x.split('=')[1] for x in details.query.split("&")}
    print "\n", queryStr['page']
    # iterate over deals
    for deal in selector.select(self.products_list_xpath):
        loader = XPathItemLoader(JabongData(), selector=deal)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        # adding the request URL to the loader
        loader.add_value("requestURL", unicode(response.request.url, "utf-8"))
        # adding the category for the request
        loader.add_value("category", unicode(self.category))
        yield loader.load_item()

def parse(self, response):  # actually a method
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    # instantiate HtmlXPathSelector() with the response parameter
    selector = HtmlXPathSelector(response)
    # iterate over deals (multiple deals per page)
    for deal in selector.xpath(self.deals_list_xpath):
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
        # define processors: an Item Loader contains one input processor and
        # one output processor for each (item) field
        loader.default_input_processor = MapCompose(unicode.strip)  # strip out whitespace from unicode strings
        loader.default_output_processor = Join()  # join data by a space
        # iterate over fields and add xpaths to the loader;
        # iteritems() yields the (k, v) pairs of the dict
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)  # add each field's xpath to the loader
        # load_item() grabs each item field (link, title, etc.), gets its
        # xpath, and processes the data with the input/output processors.
        # Yield each item, then move on to the next deal.
        yield loader.load_item()

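# The parse() above assumes the spider class defines deals_list_xpath and
# item_fields, and that LivingSocialDeal declares matching fields. A minimal
# sketch of those definitions; the xpaths here are illustrative assumptions:
from scrapy.item import Item, Field


class LivingSocialDealSketch(Item):
    title = Field()
    link = Field()

# on the spider class:
#     deals_list_xpath = '//ul[@id="deals-list"]/li'
#     item_fields = {
#         'title': './/a/div[@class="deal-details"]/h2/text()',
#         'link': './/a/@href',
#     }
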
def parse_item(self, response):
    l = XPathItemLoader(item=LocalItem(), response=response)
    l.add_xpath('company', '//*[@id="biz-vcard"]/div[2]/h1/span/text()')
    l.add_xpath('phone', '//*[@id="biz-vcard"]/div[5]/div[2]/address/p/strong/text()')
    l.add_xpath('locality', '//*[@id="biz-vcard"]/div[5]/div[2]/address/p/span[2]/text()')
    l.add_xpath('region', '//*[@id="biz-vcard"]/div[5]/div[2]/address/p/span[3]/text()')
    l.add_xpath('postalcode', '//*[@id="biz-vcard"]/div[5]/div[2]/address/p/span[4]/text()')
    res = l.load_item()
    results = {'name': '', 'address': '', 'phone': ''}
    if 'company' in res:
        results['name'] = res['company']
    if 'locality' in res:
        results['address'] = res['locality']
    if 'region' in res:
        results['address'] = results['address'] + res['region']
    if 'postalcode' in res:
        results['address'] = results['address'] + res['postalcode']
    if 'phone' in res:
        results['phone'] = res['phone']
    return res

def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    # Testing contracts:
    # @url http://www.livingsocial.com/cities/15-san-francisco
    # @returns items 1
    # @scrapes title link
    """
    selector = HtmlXPathSelector(response)
    # iterate over entries
    for entry in selector.xpath(self.entries_list_xpath):
        loader = XPathItemLoader(WGGesuchtEntry(), selector=entry)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
    cur_index = response.meta.get("cur_index", 1)
    new_url = re.sub("\d+.html", str(cur_index) + ".html", response.url)
    print("\n" + str(response.url) + "\n" + new_url + "\n")
    if cur_index < 59:
        yield Request(new_url, callback=self.parse, meta={"cur_index": cur_index + 1})

def parse_item(self, response, loop, fields):
    hxs = HtmlXPathSelector(response)
    self.macro.update({'URL': response.url})
    for e in hxs.select(loop or '(//*)[1]'):
        loader = XPathItemLoader(item=Item(), selector=e)
        for k, v in fields.iteritems():
            if 'value' in v:
                get_v_x = loader.get_value
                v_x = v.get('value')
            elif 'xpath' in v:
                get_v_x = loader.get_xpath
                v_x = v.get('xpath')
            else:
                log.msg(u'field [{}] should contain "value" or "xpath"'.format(k),
                        level=log.WARNING)
                continue
            val = get_v_x(self.macro.expand(v_x),
                          utils.convert_type(v.get('parse', {})),
                          re=v.get('regex'))
            if not val and 'default' in v:
                val = self.macro.expand(v.get('default'))
            qry = v.get('filter', {})
            if utils.filter_data(qry, val):
                loader.add_value(k, val)
            else:
                break
        else:
            # only yield the item when no field was rejected by its filter
            yield loader.load_item()

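# parse_item() above is configuration-driven. A sketch of the fields dict it
# consumes, inferred from the lookups inside the loop; every key and value
# below is an illustrative assumption, not the project's actual config:
example_fields = {
    'title': {
        'xpath': './/h1/text()',       # extracted with loader.get_xpath()
        'regex': r'\s*(.+?)\s*$',      # optional re= filter
        'parse': {},                   # passed through utils.convert_type()
        'default': '{URL}',            # macro-expanded fallback when empty
        'filter': {},                  # utils.filter_data() gate; break on fail
    },
    'source': {
        'value': '{URL}',              # literal value, macro-expanded
    },
}
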
def parse_items(self, hxs, chart, typeItem):
    # parse every chart entry
    chart_list = []
    for item in hxs.select('//div[contains(@class,"chart_listing")]/article'):
        loader = XPathItemLoader(typeItem, selector=item)
        loader.add_xpath('rank', 'header/span[contains(@class, "chart_position")]/text()')
        # ptitle yields the title for the type, so just set the title to
        # whatever the chart type is
        if 'artist' in chart['type'].lower():
            loader.add_xpath('artist', 'header/p[@class="chart_info"]/a/text()')
        else:
            loader.add_xpath(chart['type'].lower(), 'header/h1/text()')
            loader.add_xpath('artist', 'header/p[@class="chart_info"]/a/text()')
            loader.add_xpath('album', 'header/p[@class="chart_info"]/text()')
        single = loader.load_item()
        chart_list.append(dict(single))
    chart['list'] += chart_list
    return chart

def parse(self, response):
    x = HtmlXPathSelector(response)
    # x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    # programs = x.select('./body/outline[position()=4]/outline[position()<4]')
    programs = x.select('//div[@class="itemContainer"]')
    podcastCount = str(len(programs))
    i = 0
    allitems = []
    for program in programs:
        i = i + 1
        l = XPathItemLoader(PodcastItem(), selector=program)
        l.add_xpath("id", 'concat("svr_", .//span/h3/a[@class="programName"]/@href)')
        l.add_value("audioType", "disco")
        l.add_xpath("brandId", './/span/h3/a[@class="programName"]/@href')
        l.add_xpath("brandFeed", 'concat("http://sverigesradio.se/sida/poddradio.aspx", ./a/@href)')
        l.add_xpath("brandName", './/span/h3/a[@class="programName"]/text()')
        l.add_xpath("brandDescription", './/div[@class="views-field-field-emission-desc-courte-value"]/p/text()')
        l.add_xpath("brandHomepage", 'concat("http://sverigesradio.se/sida/poddradio.aspx", ./a/@href)')
        l.add_value("ownerId", "SR")
        l.add_value("ownerName", "Sveriges Radio")
        l.add_value("ownerHomepage", "http://sverigesradio.se/")
        l.add_value("ownerKey", "sr")
        l.add_value("ownerImage", "http://sverigesradio.se/diverse/appdata/isidor/images/news_images/3297/459929_87_56.jpg")
        self.log("Discovering svr [%s of %s] feeds" % (i, podcastCount), level=log.INFO)
        item = l.load_item()
        yield item

def parse_item(self, response):
    url_obj = urlparse(response.url)
    path = url_obj.path
    if path.endswith("/"):
        path = path[:-1]
    page = path.split("/")[-1]
    fullDomain = getDomainName(response.url)  # with HTTP or HTTPS
    domain = fullDomain.split("/")[-2]
    newpath = r'C:\\Users\\****\\scrapy_projects\\tutorial\\' + domain
    if not os.path.exists(newpath):
        os.makedirs(newpath)
    os.chdir(newpath)
    filename = '%s.html' % (domain + " " + page)
    with open(filename, 'wb') as f:
        f.write(response.body)
    links = 'links-%s.txt' % (domain + " " + page)
    content = 'contents-%s.txt' % (domain + " " + page)
    f1.write("\n")
    f1.write(domain + sep)
    f1.write(page + sep)
    # 16 whois attributes; query whois once and reuse the record instead of
    # issuing a separate network lookup per attribute
    w = whois.whois(response.url)
    f1.write(str(w.whois_server) + sep)
    f1.write(str(w.referral_url) + sep)
    f1.write(str(w.updated_date) + sep)
    f1.write(str(w.creation_date) + sep)
    f1.write(str(w.expiration_date) + sep)
    f1.write(str(w.name_servers) + sep)
    f1.write(str(w.status) + sep)
    f1.write(str(w.emails) + sep)
    f1.write(str(w.dnssec) + sep)
    f1.write(str(w.name) + sep)
    f1.write(str(w.org) + sep)
    f1.write(str(w.address) + sep)
    f1.write(str(w.city) + sep)
    f1.write(str(w.state) + sep)
    f1.write(str(w.zipcode) + sep)
    f1.write(str(w.country) + sep)
    extractLinks(links, response)
    countRelAbsHttpsLinks(links)
    countInOutLinks(links)
    countSlashes(links)
    imagePreloading(links)
    extractText(content, response)
    countSentences(content)
    checkGrammar(content)
    # Average word length (global_wc can be zero, so guard the division)
    if global_wc:
        f1.write(str("%.2f" % (global_wordLen / global_wc)) + sep)
    else:
        f1.write("0.00" + sep)
    # Number of words in the page:
    f1.write(str(global_wc) + sep)
    # Downloads images
    loader = XPathItemLoader(item=ImageItem(), response=response)
    loader.add_xpath('image_urls', '//img/@src')
    hashImages()  # calculates hashes of images downloaded by scrapy
    # Write label into the data file
    f1.write(my_dict.get(fullDomain, "redirect"))
    return loader.load_item()

def parse_item(self, response):
    l = XPathItemLoader(item=YellowPagesItem(), response=response)
    l.add_xpath('company', '//*[@id="main-content"]/div[1]/div[1]/h1/text()')
    l.add_xpath('st_add', '//*[@id="main-content"]/div[1]/div[1]/div/section[2]/div[1]/p[1]/text()')
    l.add_xpath('city', '//*[@id="main-content"]/div[1]/div[1]/div/section[2]/div[1]/p[2]/text()')
    l.add_xpath('phone', '//*[@id="main-content"]/div[1]/div[1]/div/section[2]/div[1]/p[3]/text()')
    #reviews left
    res = l.load_item()
    print("")
    print("")
    results = {'name': '', 'address': '', 'phone': ''}
    if 'company' in res:
        results['name'] = res['company']
    if 'st_add' in res:
        results['address'] = res['st_add']
    if 'city' in res:
        results['address'] = results['address'] + res['city']
    if 'phone' in res:
        results['phone'] = res['phone']
    print("")
    return res

def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    selector = HtmlXPathSelector(response)
    for deal in selector.xpath(self.deals_list_xpath):
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)  # strip out whitespace from unicode strings
        loader.default_output_processor = Join()  # join the data together by a space
        # iterate over fields and add xpaths to the loader;
        # iteritems() iterates the (key, value) pairs of a dictionary
        # (there are also iterkeys() and itervalues() functions)
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()  # yield each item and move on to the next
    # output as a json file: scrapy crawl livingsocial -o items.json

def parse_article(self, response):
    """
    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    selector = Selector(response)
    loader = XPathItemLoader(LeMondeArt(), selector=selector)
    self.log('\n\nA response from %s just arrived!' % response.url)
    # define processors
    text_input_processor = MapCompose(unicode.strip)
    loader.default_output_processor = Join()
    # Populate the LeMonde item with the item loader
    for field, xpath in self.article_item_fields.iteritems():
        try:
            loader.add_xpath(field, xpath, text_input_processor)
        except ValueError:
            self.log("XPath %s not found at url %s" % (xpath, response.url))
    #loader.add_value("Url", response.url)
    yield loader.load_item()

def parse(self, response):
    x = HtmlXPathSelector(response)
    #x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    #programs = x.select('./body/outline[position()=4]/outline[position()<4]')
    programs = x.select('//div[@class="item-list"]/ul/li[contains(@class,"views-row")]/div/div/div')
    podcastCount = str(len(programs))
    i = 0
    allitems = []
    for program in programs:
        i = i + 1
        l = XPathItemLoader(PodcastItem(), selector=program)
        l.add_xpath('id', 'concat("fri_", .//li/a[@class="rss"]/@href)')
        l.add_value('type', 'disco')
        l.add_xpath('brandId', './/li/a[@class="rss"]/@href')
        l.add_xpath('brandFeed', 'concat("http://www.franceinfo.fr", .//li[contains(@class,"link_rss")]/a[@class="rss"]/@href)')
        l.add_xpath('brandName', './/h3/a/text()')
        l.add_xpath('brandTimes', './/div[@class="views-field-field-emission-texte-diffusion-value"]/text()')
        l.add_xpath('brandDescription', './/div[@class="views-field-field-emission-desc-courte-value"]/p/text()')
        l.add_xpath('brandHomepage', './/h3/a/@href')
        l.add_value('channelId', 'franceinfo')
        l.add_xpath('channelName', '//head/meta[@property="og:site_name"]/@content')
        l.add_xpath('channelDescription', '//head/meta[@property="og:description"]/@content')
        l.add_xpath('channelImage', '//div[@id="header"]/div/span/a/img/@src')
        self.log('Discovering fri [%s of %s] feeds' % (i, podcastCount), level=log.INFO)
        item = l.load_item()
        yield item

def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    # Gives ability to select parts of response defined in deals_list_xpath
    selector = HtmlXPathSelector(response)
    # Iterate through found deals
    for deal in selector.xpath(self.deals_list_xpath):
        # Loads data into item fields defined in items.py
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
        # Define processors for clean up and joining elements
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # Iterate over item_fields dict and add xpaths to loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()

def parse_item(self, response):
    l = XPathItemLoader(item=TwitterBotItem(), response=response)
    print "###################"
    l.add_xpath('company', '//*[@class="trends-inner"]/div/div[2]/ul/li[1]/a/text()')
    l.add_xpath('street_address', '//*[@id="info-container"]/div[1]/dl/dd[1]/span[1]/text()')
    l.add_xpath('locality', '//*[@id="info-container"]/div[1]/dl/dd[1]/span[2]/text()')
    l.add_xpath('region', '//*[@id="info-container"]/div[1]/dl/dd[1]/span[3]/text()')
    l.add_xpath('postalcode', '//*[@id="info-container"]/div[1]/dl/dd[1]/span[4]/text()')
    res = l.load_item()
    results = {'name': '', 'address': ''}
    if 'company' in res:
        results['name'] = res['company']
    if 'street_address' in res:
        results['address'] = res['street_address']
    if 'locality' in res:
        results['address'] = results['address'] + res['locality']
    if 'region' in res:
        results['address'] = results['address'] + res['region']
    if 'postalcode' in res:
        results['address'] = results['address'] + res['postalcode']
    return res

def scrape_content_items(self, response):
    hxs = HtmlXPathSelector(response)
    stats = self.crawler.stats
    page_num = hxs.select('//*[@id="MainContent_DocumentList1_GridView1_PageCurrent"]/@value').extract()
    if page_num:
        page_num = page_num[0]
        self.log('%s Scraping page %s' % (response.meta['cookiejar'], page_num), log.INFO)
    else:
        self.log('%s No page number found' % (response.meta['cookiejar']), log.WARNING)
    stats.inc_value('_pages', spider=self)
    reports = hxs.select('//table[@id="MainContent_DocumentList1_GridView1"]//tr')
    for report in reports:
        l = XPathItemLoader(FracFocusScrape(), report)
        l.state_in = lambda slist: [s[:20] for s in slist]
        l.county_in = lambda slist: [s[:20] for s in slist]
        # each field of the item carries its own 'xpath' metadata
        for name, params in FracFocusScrape.fields.items():
            l.add_xpath(name, params['xpath'])
        item = l.load_item()
        if item.get('api'):
            if self.db.itemExists(item):
                stats.inc_value('_existing_count', spider=self)
            else:
                stats.inc_value('_new_count', spider=self)
                # print item['operator']
                yield item
    if not stats.get_value('_existing_count') and not stats.get_value('_new_count'):
        self.log('%s No records found' % (response.meta['cookiejar']), log.WARNING)

def parse_item(self, response):
    l = XPathItemLoader(item=BurrpItem(), response=response)
    l.add_xpath('company', '//*[@id="listings-details"]/section[2]/div/div[1]/div[1]/span/p/text()')
    l.add_xpath('phone', '//*[@id="listings-details"]/section[2]/div/div[1]/div[1]/div/ul/li[1]/strong/text()')
    l.add_xpath('address', '//*[@id="listings-details"]/section[2]/div/div[1]/div[1]/div/ul/li[2]/text()')
    l.add_xpath('region', '//*[@id="listings-details"]/section[2]/div/div[1]/div[1]/p/a/text()')
    l.add_xpath('cuisine1', '//*[@id="listings-details"]/section[2]/div/div[1]/div[1]/div/ul/li[3]/a[1]/text()')
    l.add_xpath('cuisine2', '//*[@id="listings-details"]/section[2]/div/div[1]/div[1]/div/ul/li[3]/a[2]/text()')
    res = l.load_item()
    results = {'name': '', 'address': '', 'phone': ''}
    if 'company' in res:
        results['name'] = res['company']
    if 'address' in res:
        results['address'] = res['address']
    if 'locality' in res:
        results['address'] = results['address'] + res['locality']
    if 'region' in res:
        results['address'] = results['address'] + res['region']
    if 'postalcode' in res:
        results['address'] = results['address'] + res['postalcode']
    return res

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    parse_prices = lambda l: filter(bool, [item.strip() for item in l])
    item_name = hxs.select("//input[contains(concat(' ', @class, ' '), ' search-within ')]/@value").extract()
    item_hash = hashlib.md5('%s::%s::%s' % (self.auction_id, item_name, self.name)).hexdigest()
    item_price = parse_prices(hxs.select("//div[2]//div[2]/text()").extract())
    loader = XPathItemLoader(item=SearchResultItem(), response=response)
    loader.add_value("id", item_hash)
    loader.add_value("auction_id", self.auction_id)
    loader.add_value("site", self.name)
    loader.add_xpath("name", "//input[contains(concat(' ', @class, ' '), ' search-within ')]/@value")
    loader.add_value("link", response.url)
    loader.add_value("price", item_price)
    return loader.load_item()

def parse_item(self, response):
    l = XPathItemLoader(item=FoursquareItem(), response=response)
    l.add_xpath('phone', '//*[@id="container"]/div/div[2]/div[1]/div[2]/div[3]/div[1]/div[2]/div[2]/span/text()')
    l.add_xpath('st_add', '//*[@id="container"]/div/div[2]/div[1]/div[2]/div[2]/div[2]/div[2]/div/span[1]/text()')
    l.add_xpath('locality', '//*[@id="container"]/div/div[2]/div[1]/div[2]/div[2]/div[2]/div[2]/div/span[2]/text()')
    l.add_xpath('state', '//*[@id="container"]/div/div[2]/div[1]/div[2]/div[2]/div[2]/div[2]/div/span[3]/text()')
    l.add_xpath('postalcode', '//*[@id="container"]/div/div[2]/div[1]/div[2]/div[2]/div[2]/div[2]/div/span[4]/text()')
    l.add_xpath('country', '//*[@id="container"]/div/div[2]/div[1]/div[2]/div[2]/div[2]/div[2]/div/text()[3]/text()')
    l.add_xpath('company', '//*[@id="container"]/div/div[2]/div[1]/div[2]/div[2]/div[2]/h1/text()')
    res = l.load_item()
    results = {'name': '', 'address': '', 'phone': '', 'timings': ''}
    if 'company' in res:
        results['name'] = res['company']
    if 'st_add' in res:
        results['address'] = res['st_add']
    if 'locality' in res:
        results['address'] = results['address'] + res['locality']
    if 'state' in res:
        results['address'] = results['address'] + res['state']
    if 'postalcode' in res:
        results['address'] = results['address'] + res['postalcode']
    if 'country' in res:
        results['address'] = results['address'] + res['country']
    if 'phone' in res:
        results['phone'] = res['phone']
    return results

def parse_listing(self, response):
    l = XPathItemLoader(item=ListingItem(), response=response)
    l.add_value("url", response.url)
    l.add_xpath("address", '//h1[@class="address"]/text()')
    l.add_xpath("price", '//div[@class="price"]/text()')
    l.add_xpath("bedrooms", '//th[text()="Bedrooms:"]/../td/text()')
    l.add_xpath("bathrooms", '//th[text()="Bathrooms:"]/../td/text()', re=r"(\d+)")
    l.add_xpath("powder_rooms", '//th[text()="Bathrooms:"]/../td/text()', re=r", (\d+)")
    l.add_xpath("property_type", '//th[text()="Property type:"]/../td/text()')
    l.add_xpath("size", '//th[text()="Size:"]/../td/text()', re=r"([\d|,]+) sqft")
    l.add_xpath("lot", '//th[text()="Lot:"]/../td/text()')
    l.add_xpath("price_per_sf", '//th[text()="Price/sqft:"]/../td/text()')
    l.add_xpath("year_built", '//th[text()="Year built:"]/../td/text()')
    l.add_xpath("date_listed", '//th[text()="Added on Trulia:"]/../td/text()')
    l.add_xpath("mls_id", '//th[text()="MLS/ID:"]/../td/text()')
    l.add_xpath("descriptive_title", '//h2[@class="descriptive_title"]/text()')
    l.add_xpath("description", '//div[@class="listing_description_module"]/text()')
    l.add_xpath("additional_fields", 'id("property_listing_details_module")/ul/li/span/text()')
    l.add_xpath("public_records", 'id("property_public_info_module")/ul/li/span/text()')
    return l.load_item()

def parse_item(self, response):
    l = XPathItemLoader(item=ZomatoItem(), response=response)
    l.add_xpath('phone1', '//*[@id="phoneNoString"]/div/span/span[1]/text()')
    l.add_xpath('company', '//html/body/div[3]/section/div/div[2]/div[2]/div[1]/h1/a/span/text()')
    l.add_xpath('phone2', '//*[@id="phoneNoString"]/div/span/span[2]/text()')
    l.add_xpath('address', '/html/body/div[3]/section/div/div[3]/div[3]/div[1]/div[2]/h4/text()[1]')
    l.add_xpath('review1', '//*[@id="my-reviews-container"]/div[1]/div[3]/div[1]/div[1]/div[3]/div/div[1]/div/text()')
    l.add_xpath('review2', '//*[@id="my-reviews-container"]/div[1]/div[3]/div[1]/div[2]/div[3]/div/div[1]/div/text()')
    l.add_xpath('timings', '//*[@id="mainframe"]/section/div[1]/div/div[2]/div[1]/div[2]/div/div[4]/div[2]/div[1]/span/text()')
    res = l.load_item()
    results = {'name': '', 'address': '', 'phone': '', 'review1': '', 'review2': '', 'timings': ''}
    if 'company' in res:
        results['name'] = res['company']
    if 'address' in res:
        results['address'] = res['address']
    if 'phone' in res:
        results['phone'] = res['phone']
    if 'review1' in res:
        results['review1'] = res['review1']
    if 'review2' in res:
        results['review2'] = res['review2']
    if 'timings' in res:
        results['timings'] = res['timings']
    return res

def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    selector = HtmlXPathSelector(response)
    # iterate over deals
    for deal in selector.select(self.deals_list_xpath):
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()

def list_item(self, response):
    # log.msg("the response URL:%s" % response.url, level=log.DEBUG)
    sel = Selector(text=response.body)
    result_list = sel.xpath("//div[@class='result-list-item__inner']").extract()
    # logging.info("the response URL:%s" % response.url)
    # with open("temp.txt", 'wb') as f:
    #     f.write(response.selector.xpath("//div[@class='result-list-item__inner']").extract())
    # result_list = response.xpath("//div[@class='result-list-item__inner']").extract()
    num = 0
    for result_item in result_list:
        num = num + 1
        log.msg("this is the %d item" % num, level=log.DEBUG)
        loader = XPathItemLoader(item=Person(), selector=Selector(text=result_item))
        loader.add_xpath('name', ".//h4[@class='member-title result-name']/text()")
        loader.add_xpath('age', ".//div[@class='primary-description truncated-line']/text()[1]")
        loader.add_xpath('bullet', ".//div[@class='primary-description truncated-line']/text()[2]")
        loader.add_xpath('fit', ".//ul[@class='unstyled-list'][1]/li[1]/text()")
        loader.add_xpath('nationnality', ".//ul[@class='unstyled-list'][1]/li[2]/text()")
        loader.add_xpath('price', ".//ul[@class='unstyled-list'][2]/li/text()")
        # items.append(item)
        yield loader.load_item()
    url = sel.xpath("//li[@class='pagination__next ']/a/@href").extract()
    # print url
    yield Request(url[0], callback=self.list_item)

def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    selector = HtmlXPathSelector(response)
    # iterate over deals
    for deal in selector.xpath(self.deals_list_xpath):
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()

def parse_item(self, response):
    l = XPathItemLoader(item=AsklailaItem(), response=response)
    l.add_xpath('company', '//*[@id="all-content"]/div[4]/div[1]/div[1]/div[1]/div[1]/h1/span/text()')
    l.add_xpath('st_add', '//*[@id="ldpAdrsDetails"]/p[2]/span/span[1]/text()')
    l.add_xpath('locality', '//*[@id="ldpAdrsDetails"]/p[2]/span/span[2]/a/title/text()')
    l.add_xpath('region', '//*[@id="ldpAdrsDetails"]/p[2]/span/span[3]/text()')
    l.add_xpath('postalcode', '//*[@id="ldpAdrsDetails"]/p[2]/span/span[4]/text()')
    l.add_xpath('phone', '//*[@id="ldpAdrsDetails"]/p[1]/span/span[1]/text()')
    res = l.load_item()
    results = {'name': '', 'address': '', 'phone': ''}
    if 'company' in res:
        results['name'] = res['company']
    if 'st_add' in res:
        results['address'] = res['st_add']
    if 'locality' in res:
        results['address'] = results['address'] + res['locality']
    if 'region' in res:
        results['address'] = results['address'] + res['region']
    if 'postalcode' in res:
        results['address'] = results['address'] + res['postalcode']
    if 'phone' in res:
        results['phone'] = res['phone']
    return results

def parse_item(self, response):
    #hxs = HtmlXPathSelector(response)
    l = XPathItemLoader(item=PytexasItem(), response=response)
    l.add_xpath('title', '//*/div[@class="span6"]/h2/text()')
    l.add_xpath('speaker', '//*/div[@class="span6"]/h3/text()')
    l.add_xpath('description', '//*/div[@class="span6"]/p[2]/text()')
    #l.add_value('last_updated', 'today')  # you can also use literal values
    return l.load_item()

def parse(self, response):
    gold = XPathItemLoader(item=FinanceIndex(), response=response)
    gold.add_value("name", "Oro Spot Cierre Londres")
    gold.add_value("unit", "USD")
    gold.add_xpath("value", "//td[@bgcolor='#cccc99'][1]//text()")
    return [gold.load_item()]

def parse(self, response):
    ubi = XPathItemLoader(item=FinanceIndex(), response=response)
    ubi.add_value("name", "Uruguay Bond Index")
    ubi.add_value("unit", "bps")
    ubi.add_xpath("value", "//span/text()")
    return [ubi.load_item()]

def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses
    """
    # with open('polydata/' + response.url.split('=')[1], 'wb') as f:
    #     f.write(response.body)
    # scraped_url_list = list()
    selector = HtmlXPathSelector(response)
    # iterate over sets
    for deal in selector.select(self.products_list_xpath):
        loader = XPathItemLoader(PolyvoreData(), selector=deal)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        # adding the request URL to the loader
        loader.add_value("requestURL", unicode(response.request.url, "utf-8"))
        # scraped_url_list.append(loader.load_item()['requestURL'])
        # load the parent item once and reuse its values for each list entry
        parent = loader.load_item()
        for item in deal.xpath('//*[@id="content"]/ul[1]/li'):
            ll = XPathItemLoader(PolyvoreData(), selector=item)
            # define processors
            ll.default_input_processor = MapCompose(unicode.strip)
            ll.default_output_processor = Join()
            for field, xpath in self.item_items.iteritems():
                ll.add_xpath(field, xpath)
            ll.add_value("requestURL", parent['requestURL'])
            ll.add_value("name", parent['name'])
            ll.add_value("numlikes", parent['numlikes'])
            yield ll.load_item()
        for item in deal.xpath('//*[@id="content"]/ul[2]/li'):
            ll = XPathItemLoader(PolyvoreData(), selector=item)
            # define processors
            ll.default_input_processor = MapCompose(unicode.strip)
            ll.default_output_processor = Join()
            for field, xpath in self.item_items.iteritems():
                ll.add_xpath(field, xpath)
            ll.add_value("requestURL", parent['requestURL'])
            ll.add_value("name", parent['name'])
            ll.add_value("numlikes", parent['numlikes'])
            yield ll.load_item()

def get_user(self, selector):
    user_loader = XPathItemLoader(item=LazyTweetUser(), selector=selector)
    user_loader.add_xpath('twitter_username', './a[1]/text()')
    user_loader.add_value('twitter_url', ''.join([
        r'http://twitter.com/',
        user_loader.get_output_value('twitter_username')
    ]))
    return user_loader.load_item()

def parse_doctor_detail(self, response):
    """ This function parses a sample response. Some contracts are mingled
    with this docstring.

    @url http://www.chunyuyisheng.com/doctor/clinic_web_31f4d70d2867b969
    @returns items 1 1
    @returns requests 0 0
    """
    hxs = HtmlXPathSelector(response)
    l = XPathItemLoader(CYDoctorItem(), hxs)
    l.add_xpath('_name', "//div[@class='bdHd']/h1/text()")
    shortdesc = hxs.select("//div[@id='mainColumn']//p[@class='bdFt']/text()").extract()
    if len(shortdesc) == 1:
        shortdescStr = shortdesc[0].strip()
        words = shortdescStr.split()
        if len(words) == 3:
            l.add_value('title', words[0])
            l.add_value('hospital', words[1])
            l.add_value('specialty', words[2])
        else:
            print("title/hospital/specialty error.")
    l.add_xpath('specialtyDesc', "//div[@id='docOtherInfo']/div[@class='infoCell'][1]//p[2]/text()")
    l.add_xpath('personalInfo', "//div[@id='docOtherInfo']/div[@class='infoCell'][2]//p[2]/text()")
    l.add_xpath('stars', "//p[@class='right starTxt']/text()")
    answer = hxs.select("//div[@id='resolvedData']/p[1]/a/text()").extract()
    if len(answer) == 1:
        answerStr = answer[0].strip().replace(u"\xa0", "")
        m = re.match(u"解答:(?P<answer_cnt>\d+)", answerStr)
        if m is not None and m.groupdict()["answer_cnt"] is not None:
            l.add_value('answers', m.groupdict()["answer_cnt"])
    review = hxs.select("//div[@id='resolvedData']/p[2]/text()").extract()
    if len(review) == 1:
        reviewStr = review[0].strip().replace(u"\xa0", "")
        m = re.match(u"评价:(?P<review_cnt>\d+)", reviewStr)
        if m is not None and m.groupdict()["review_cnt"] is not None:
            l.add_value('reviews', m.groupdict()["review_cnt"])
    # l.add_xpath('answers', "//div[@id='resolvedData']/p[1]/a/text()")
    # l.add_xpath('reviews', "//div[@id='resolvedData']/p[2]/text()")
    ret = l.load_item()
    print ret
    yield ret

def history(text, url):
    response = http.TextResponse(url=url, body=str(text.replace(u'\xa0', '')))
    h = XPathItemLoader(item=TCADValueHistoryItem(), response=response)
    h.add_xpath('year', '//td[1]/text()')
    h.add_xpath('value', '//td[4]/text()')
    return h.load_item()

def improvement(text, url):
    response = http.TextResponse(url=url, body=str(text))
    i = XPathItemLoader(item=TCADImprovementItem(), response=response)
    i.add_xpath('id', '//td[1]/text()')
    i.add_xpath('state_category', '//td[2]/text()')
    i.add_xpath('description', '//td[3]/text()')
    return i.load_item()

def parse(self, response):
    rate = XPathItemLoader(item=FinanceIndex(), response=response)
    rate.add_value("name", "Tasa Objetivo BCU")
    rate.add_value("unit", "%")
    # "8.75" is a literal, not an XPath expression, so use add_value()
    rate.add_value("value", "8.75")
    #rate.update_only_if_change = True
    return [rate.load_item()]

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    entries = hxs.select('//tr[contains(@class,"trusted tlistrow")]/td[contains(@class, "tlistname")]')
    for entry in entries:
        l = XPathItemLoader(item=TorrentItem(), selector=entry)
        l.add_xpath('torrent', 'a/@href')
        l.add_xpath('title', 'a[contains(@href, "nyaa")]/text()')
        yield l.load_item()

def parse_talk(self, response):
    loader = XPathItemLoader(item=Pybr8TalksItem(), response=response)
    loader.add_xpath('title', '//div[@id="proposal"]/h1/text()')
    loader.add_xpath('description', '//div[@class="twocolumn"]/div[2]/text()[2]')
    loader.add_xpath('author_name', '//div[@class="twocolumn"]/div/div[2]/h3/text()')
    loader.add_xpath('author_profile', '//div[@class="twocolumn"]/div/div[2]/text()[3]')
    return loader.load_item()

def parse(self, response):
    rate = XPathItemLoader(item=FinanceIndex(), response=response)
    rate.add_value("name", "Merval")
    rate.add_value("unit", "")
    hxs = HtmlXPathSelector(response)
    rate.add_value("value", hxs.select("//span[contains(@id,'UltimoMerval')]/text()")[0].extract())
    return [rate.load_item()]