def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses.

    Yields one loaded ``LivingSocialDeal`` for every node matched by
    ``self.deals_list_xpath``; the fields come from ``self.item_fields``.

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    page = HtmlXPathSelector(response)
    for deal_node in page.xpath(self.deals_list_xpath):
        deal_loader = XPathItemLoader(LivingSocialDeal(), selector=deal_node)

        # Processors: strip white-space from every extracted unicode string on
        # the way in, and join the collected pieces with a space on the way out.
        deal_loader.default_output_processor = Join()
        deal_loader.default_input_processor = MapCompose(unicode.strip)

        # iteritems() walks the (key, value) pairs of the dictionary; each pair
        # is a field name and the XPath that fills it.
        for field_name, field_xpath in self.item_fields.iteritems():
            deal_loader.add_xpath(field_name, field_xpath)

        # Emit this deal and move on to the next one.
        yield deal_loader.load_item()
    # To write the output as a JSON file: scrapy crawl livingsocial -o items.json
def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    Yields one ``WGGesuchtEntry`` per node matched by
    ``self.entries_list_xpath`` and then, while the page counter carried in
    ``response.meta["cur_index"]`` is below 59, a request for the next page.

    # Testing contracts:
    # @url http://www.livingsocial.com/cities/15-san-francisco
    # @returns items 1
    # @scrapes title link
    """
    selector = HtmlXPathSelector(response)

    # iterate over entries
    for entry in selector.xpath(self.entries_list_xpath):
        loader = XPathItemLoader(WGGesuchtEntry(), selector=entry)

        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()

        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()

    # Pagination: swap the trailing "<number>.html" of the current URL for the
    # next page index. The pattern is a raw string with an escaped dot so that
    # only a literal ".html" suffix is rewritten.
    cur_index = response.meta.get("cur_index", 1)
    new_url = re.sub(r"\d+\.html", str(cur_index) + ".html", response.url)
    print("\n" + str(response.url) + "\n" + new_url + "\n")
    if cur_index < 59:
        yield Request(new_url,
                      callback=self.parse,
                      meta={"cur_index": cur_index + 1})
def parse_item(self, response):
    """Load a ``TwitterBotItem`` (company and address parts) from the page.

    The previous version also assembled a local ``results`` dict from the
    loaded item but never used or returned it; that dead code is removed.
    The returned value is unchanged: the loaded item.
    """
    loader = XPathItemLoader(item=TwitterBotItem(), response=response)
    print("###################")
    loader.add_xpath('company', '//*[@class="trends-inner"]/div/div[2]/ul/li[1]/a/text()')
    loader.add_xpath('street_address', '//*[@id="info-container"]/div[1]/dl/dd[1]/span[1]/text()')
    loader.add_xpath('locality', '//*[@id="info-container"]/div[1]/dl/dd[1]/span[2]/text()')
    loader.add_xpath('region', '//*[@id="info-container"]/div[1]/dl/dd[1]/span[3]/text()')
    loader.add_xpath('postalcode', '//*[@id="info-container"]/div[1]/dl/dd[1]/span[4]/text()')
    return loader.load_item()
def parse(self, response):
    """Yield a ``GroupUserItem`` per member entry, then follow the "next" link.

    The group name is the path component that follows the one starting at
    the first occurrence of "group" in the response URL. When the XPath for
    the "next" link finds nothing, a regular expression over the raw body is
    tried as a fallback.
    """
    page_url = response.url
    group_name = page_url[page_url.find("group"):].split("/")[1]
    hxs = HtmlXPathSelector(response)

    for dl in hxs.select('//dl[@class="obu"]'):
        loader = XPathItemLoader(item=GroupUserItem(), selector=dl)
        loader.add_xpath("homepage", "dt/a/@href")
        loader.add_xpath("image", "dt/a/img/@src")
        loader.add_xpath("name", "dd/a/text()")
        loader.add_value("group", group_name)
        yield loader.load_item()

    links = hxs.select('//span[@class="next"]/a/@href').extract()
    for next_url in links:
        yield Request(next_url, callback=self.parse)

    if not links:
        # Fallback: the selector found no "next" anchor, so search the body text.
        match = re.search('<span class="next">.*?<a href="(.+?)">',
                          response.body_as_unicode(), re.S)
        if match:
            yield Request(match.group(1), callback=self.parse)
def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    deals = HtmlXPathSelector(response).xpath(self.deals_list_xpath)

    # One loader, and therefore one yielded item, per deal node.
    for node in deals:
        item_loader = XPathItemLoader(LivingSocialDeal(), selector=node)

        # Default processors applied to every field of the item.
        item_loader.default_input_processor = MapCompose(unicode.strip)
        item_loader.default_output_processor = Join()

        # Feed each configured (field, xpath) pair to the loader.
        for name, path in self.item_fields.iteritems():
            item_loader.add_xpath(name, path)

        yield item_loader.load_item()
def parse_materials(self, response):
    """Yield one ``NrcScrapedMaterial`` per data row of the materials table.

    The report number is read from the meta of the originating request and
    added to every item. After all rows are processed the bot task for that
    report is marked 'DONE' through ``self.db``.
    """
    reportnum = response.request.meta['reportnum']
    decoded = unicode(response.body, response.encoding)
    rows = HtmlXPathSelector(text=decoded).select('//table[@class="t16Standard"]/tr')

    if rows:
        # The first row is the table header, not a report record.
        rows.pop(0)
        if rows:
            self.log('Retrieved {0} materials records in report {1}'
                     .format(len(rows), reportnum), log.INFO)
        else:
            self.log('No materials reports found in response {0}'
                     .format(reportnum), log.INFO)
    else:
        self.log('Materials data not present in response from {0}'.format(response.url), log.INFO)

    def clip_names(slist):
        # Input processor for the "name" field: keep at most 32 characters.
        return [s[:32] for s in slist]

    for row in rows:
        row_loader = XPathItemLoader(NrcScrapedMaterial(), row)
        row_loader.name_in = clip_names
        row_loader.add_value('reportnum', reportnum)
        # Only fields that declare an 'xpath' in their metadata are scraped.
        for field_name, field_meta in NrcScrapedMaterial.fields.items():
            if 'xpath' in field_meta:
                row_loader.add_xpath(field_name, field_meta['xpath'])
        yield row_loader.load_item()

    self.db.setBotTaskStatus(reportnum, self.name, 'DONE')
def parse_item(self, response):
    """Load a ``BurrpItem`` (company, phone, address, region, cuisines).

    The previous version also built a local ``results`` dict from the loaded
    item and then discarded it; that dead code is removed. The return value
    is unchanged: the loaded item.
    """
    # Every field lives under the same details container.
    base = '//*[@id="listings-details"]/section[2]/div/div[1]/div[1]'
    loader = XPathItemLoader(item=BurrpItem(), response=response)
    loader.add_xpath('company', base + '/span/p/text()')
    loader.add_xpath('phone', base + '/div/ul/li[1]/strong/text()')
    loader.add_xpath('address', base + '/div/ul/li[2]/text()')
    loader.add_xpath('region', base + '/p/a/text()')
    loader.add_xpath('cuisine1', base + '/div/ul/li[3]/a[1]/text()')
    loader.add_xpath('cuisine2', base + '/div/ul/li[3]/a[2]/text()')
    return loader.load_item()
def parse_item(self, response):
    """Load a ``LocalItem`` (company, phone, locality, region, postal code).

    The previous version also built a local ``results`` dict that was never
    used or returned (and contained the no-op assignment
    ``results['phone'] = results['phone']``); that dead code is removed.
    The return value is unchanged: the loaded item.
    """
    loader = XPathItemLoader(item=LocalItem(), response=response)
    loader.add_xpath('company', '//*[@id="biz-vcard"]/div[2]/h1/span/text()')
    loader.add_xpath('phone', '//*[@id="biz-vcard"]/div[5]/div[2]/address/p/strong/text()')
    loader.add_xpath('locality', '//*[@id="biz-vcard"]/div[5]/div[2]/address/p/span[2]/text()')
    loader.add_xpath('region', '//*[@id="biz-vcard"]/div[5]/div[2]/address/p/span[3]/text()')
    loader.add_xpath('postalcode', '//*[@id="biz-vcard"]/div[5]/div[2]/address/p/span[4]/text()')
    return loader.load_item()
def parse_rc(self, response):
    """Scrape one question page into a ``ParseRcItem`` and render it to HTML.

    For every name in ``self.fileList`` that contains the question id, the
    article file is read and ``self.rc_content`` is formatted with the
    article body and the scraped question, five choices and the answer; the
    result is written to ``/home/huwei/gmatclub/rc/<id>.html``.

    Returns the loaded item.
    """
    loader = XPathItemLoader(item=ParseRcItem(), response=response)
    question_id = self.parse_id_from_url(response.url)
    loader.add_value('questionId', question_id)
    loader.add_xpath('text', '//div[@class="text"]/text()')
    loader.add_xpath('text', '//div[@class="text"]/span/text()')
    loader.add_xpath('answerList', '//div[@class="item clearfix"]/span/text()')
    loader.add_xpath('choiceList', '//div[@class="item clearfix"]/b/text()')
    loader.add_xpath('answer', '//div[@class="answer clearfix hidden QuesHidden"]/b/text()')
    # loader.add_xpath('explanation','//div[@id="DivExplain"]')
    item = loader.load_item()

    # With three text fragments the third one is rendered underlined between
    # the first and the second; otherwise only the first fragment is used.
    text_parts = item['text']
    if len(text_parts) == 3:
        question_html = (text_parts[0]
                         + '<span style="text-decoration:underline;">'
                         + text_parts[2] + '</span>' + text_parts[1])
    else:
        question_html = text_parts[0]

    for filename in self.fileList:
        if question_id not in filename:
            continue

        # "with" guarantees the handle is closed; the old code wrote
        # ``f.close`` without parentheses, so the file was never closed.
        with open('/home/huwei/origin/rcarticle/' + filename) as article_file:
            article = article_file.read()

        # Positional arguments for the template, in the order it expects:
        # article body, question id (twice), question text, then for each of
        # the five choices (id, letter, letter, answer text), and finally the
        # id with the correct answer.
        qid = item['questionId'][0]
        args = [article[24:len(article) - 4], qid, qid, question_html]
        for i in range(5):
            choice = item['choiceList'][i]
            args.extend([qid, choice, choice, item['answerList'][i]])
        args.extend([qid, item['answer'][0]])
        content = self.rc_content.format(*args)

        with open('/home/huwei/gmatclub/rc/' + question_id + '.html', 'w') as out_file:
            out_file.write(content)

    return item
def parse(self, response):
    """ Default callback used by Scrapy to process downloaded responses

    Yields one ``JabongData`` item per node matched by
    ``self.products_list_xpath``; each item also carries the request URL
    and ``self.category``.
    """
    selector = HtmlXPathSelector(response)
    request_url = response.request.url

    # Debug aid: echo the "page" query parameter. partition() tolerates
    # parameters without "=" and .get() tolerates a missing "page", so a URL
    # without a query string no longer aborts the callback.
    query = {}
    for pair in urlparse(request_url).query.split("&"):
        key, _, value = pair.partition("=")
        query[key] = value
    print("\n%s" % query.get('page'))

    # iterate over products
    for deal in selector.select(self.products_list_xpath):
        loader = XPathItemLoader(JabongData(), selector=deal)

        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()

        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)

        # adding the request URL to the loader
        loader.add_value("requestURL", unicode(request_url, "utf-8"))
        # adding the category for the request
        loader.add_value("category", unicode(self.category))
        yield loader.load_item()
def parse_article(self, response):
    """
    The lines below is a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    article_loader = XPathItemLoader(LeMondeArt(), selector=Selector(response))
    self.log('\n\nA response from %s just arrived!' % response.url)

    # Output values are joined; input text is stripped by a processor passed
    # explicitly with every add_xpath call below.
    article_loader.default_output_processor = Join()
    strip_text = MapCompose(unicode.strip)

    # Populate the LeMonde item; a ValueError from add_xpath is logged and the
    # remaining fields are still processed.
    for field_name, field_xpath in self.article_item_fields.iteritems():
        try:
            article_loader.add_xpath(field_name, field_xpath, strip_text)
        except ValueError:
            self.log("XPath %s not found at url %s" % (field_xpath, response.url))

    #loader.add_value("Url",response.url)
    yield article_loader.load_item()
def scrape_content_items(self, response):
    """Yield new ``FracFocusScrape`` items from the document grid of a page.

    Rows whose loaded item has an 'api' value are checked against
    ``self.db``: existing ones only bump the '_existing_count' stat, new
    ones bump '_new_count' and are yielded. A warning is logged when neither
    counter has a value after the rows are processed.

    NOTE(review): SOURCE was collapsed onto one line, so the nesting of
    ``yield item`` (inside the "new item" branch) is reconstructed - confirm.
    """
    hxs = HtmlXPathSelector(response)
    stats = self.crawler.stats

    page_values = hxs.select(
        '//*[@id="MainContent_DocumentList1_GridView1_PageCurrent"]/@value').extract()
    jar = response.meta['cookiejar']
    if not page_values:
        self.log('%s No page number found' % (jar), log.WARNING)
    else:
        self.log('%s Scraping page %s' % (jar, page_values[0]), log.INFO)

    stats.inc_value('_pages', spider=self)

    def clip20(slist):
        # Input processor: keep at most 20 characters of each value.
        return [s[:20] for s in slist]

    for row in hxs.select('//table[@id="MainContent_DocumentList1_GridView1"]//tr'):
        row_loader = XPathItemLoader(FracFocusScrape(), row)
        row_loader.state_in = clip20
        row_loader.county_in = clip20
        for field_name, field_meta in FracFocusScrape.fields.items():
            row_loader.add_xpath(field_name, field_meta['xpath'])

        item = row_loader.load_item()
        if not item.get('api'):
            continue
        if self.db.itemExists(item):
            stats.inc_value('_existing_count', spider=self)
        else:
            stats.inc_value('_new_count', spider=self)
            # print item['operator']
            yield item

    if not stats.get_value('_existing_count') and not stats.get_value('_new_count'):
        self.log('%s No records found' % (response.meta['cookiejar']), log.WARNING)
def parse(self, response):  # actually a method
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    # Wrap the response in an HtmlXPathSelector so it can be queried by XPath.
    html = HtmlXPathSelector(response)

    # A page holds several deals; each one gets its own loader.
    for deal_sel in html.xpath(self.deals_list_xpath):
        deal_loader = XPathItemLoader(LivingSocialDeal(), selector=deal_sel)

        # An Item Loader has one input and one output processor per item field.
        # Input: strip white-space from unicode strings.
        # Output: join the data with a space.
        deal_loader.default_input_processor = MapCompose(unicode.strip)
        deal_loader.default_output_processor = Join()

        # iteritems() iterates the (key, value) pairs of the dict; register the
        # XPath of each field with the loader.
        for item_field, item_xpath in self.item_fields.iteritems():
            deal_loader.add_xpath(item_field, item_xpath)

        # load_item() takes each item field (link, title, etc.), applies its
        # xpath and runs the data through the input/output processors. Yield
        # the item, then continue with the next deal.
        yield deal_loader.load_item()
def parse(self, response):
    """Handle either an iTunes RSS listing or a single podcast feed.

    If the document has ``atom:entry`` nodes, a lookup request (callback
    ``self.getItunesTrackJson``) is yielded for the ``im:id`` of each entry.
    Otherwise the response is treated as a single feed and one
    ``PodcastItem`` is yielded.
    """
    x = XmlXPathSelector(response)
    x.register_namespace("im", "http://itunes.apple.com/rss")
    x.register_namespace('atom', 'http://www.w3.org/2005/Atom')

    feedCount = str(len(self.start_urls))
    self.i = self.i + 1
    self.log('Reading rss url [%s of %s]' % (self.i, feedCount), level=log.INFO)

    entries = x.select('//atom:entry')
    if entries:
        # an itunes rss feed
        for entry in entries:
            track_ids = entry.select('./atom:id/@im:id').extract()
            self.log('Entry %s' % (str(track_ids)), level=log.INFO)
            if not track_ids:
                # An entry without im:id used to raise IndexError and abort
                # the whole feed; skip just that entry instead.
                self.log('Entry without im:id skipped in %s' % (response.url),
                         level=log.WARNING)
                continue
            yield Request('http://itunes.apple.com/lookup?id=' + track_ids[0],
                          callback=self.getItunesTrackJson)
    else:
        # a single feed
        l = XPathItemLoader(PodcastItem(), x)
        l.add_value('id', 'rssdisco_' + response.url)
        l.add_value('audioType', 'disco')
        l.add_value('brandFeed', response.url)
        l.add_xpath('brandName', '//./channel/title/text()')
        self.log('Feed from rss %s' % (response.url), level=log.INFO)
        yield l.load_item()
def get_answer(self, selector, question_loader):
    """Build a ``YahooAnswer`` item from one answer node.

    ``selector`` is the answer node; ``question_loader`` supplies the
    question id through its 'question_id' output value.
    """
    answer_loader = XPathItemLoader(item=YahooAnswer(), selector=selector)
    answer_loader.add_xpath('answer_id', './@id')
    answer_loader.add_xpath(
        'answer_content',
        './/div[@class="qa-container"]//div[@class="content"]//text()')
    answer_loader.add_value('answerer', self.get_user(selector))
    answer_loader.add_value('question_id',
                            question_loader.get_output_value('question_id'))
    answer_loader.add_xpath('answering_date', ''.join([
        './/div[@class="qa-container"]//ul[@class="meta"]',
        '/li[1]/abbr/@title'
    ]))
    answer_loader.add_xpath('marks', ''.join([
        './/div[@class="utils-container"]',
        '//li[@class="rate-up"]',
        '//span[@class="seo-rated"]/text()'
    ]))
    answer_loader.add_xpath('marks', ''.join([
        './/div[@class="utils-container"]',
        '//li[@class="rate-up"]',
        '//span[@class="seo-rated"]//strong/text()'
    ]))

    # get the good number or bad number
    marks = answer_loader.get_output_value('marks')
    # str.find() returns -1 (truthy) when the text is absent and 0 (falsy)
    # when it is at the start, so "if marks.find('good')" was inverted.
    # A membership test expresses the intended check.
    if 'good' in marks:
        answer_loader.add_value('number_of_good_marks', marks.split(' ')[0])
    # bad numbers

    # is best answer
    answer_class = selector.select('./@class').extract()[0]
    answer_loader.add_value('is_best_answer',
                            1 if 'best' in answer_class else 0)
    return answer_loader.load_item()
def parse(self, response):
    """Return a one-element list with the "Uruguay Bond Index" ``FinanceIndex``.

    Name and unit are fixed; the value is taken from the text of the
    ``span`` elements of the response.
    """
    loader = XPathItemLoader(item=FinanceIndex(), response=response)
    for field, constant in (("name", "Uruguay Bond Index"), ("unit", "bps")):
        loader.add_value(field, constant)
    loader.add_xpath("value", "//span/text()")
    index_item = loader.load_item()
    return [index_item]
def parse(self, response):
    """Return a one-element list with the London gold spot ``FinanceIndex``.

    Name ("Oro Spot Cierre Londres") and unit ("USD") are fixed; the value
    is the text under the first cell with the highlighted background colour.
    """
    value_xpath = "//td[@bgcolor='#cccc99'][1]//text()"
    spot = XPathItemLoader(item=FinanceIndex(), response=response)
    spot.add_value("name", "Oro Spot Cierre Londres")
    spot.add_value("unit", "USD")
    spot.add_xpath("value", value_xpath)
    result = [spot.load_item()]
    return result
def history(text, url):
    """Parse an HTML fragment into a ``TCADValueHistoryItem``.

    ``text`` is the markup and ``url`` the address it came from. Non-breaking
    spaces are removed, then the first table column is loaded as 'year' and
    the fourth as 'value'.
    """
    cleaned = text.replace(u'\xa0', '')
    # Encode explicitly: str() on a unicode string uses the ASCII codec on
    # Python 2 and raises UnicodeEncodeError on any other non-ASCII character.
    response = http.TextResponse(url=url,
                                 body=cleaned.encode('utf-8'),
                                 encoding='utf-8')
    h = XPathItemLoader(item=TCADValueHistoryItem(), response=response)
    h.add_xpath('year', '//td[1]/text()')
    h.add_xpath('value', '//td[4]/text()')
    return h.load_item()
def parse(self, response):
    """Yield a ``TorrentItem`` (torrent link and title) per trusted listing row."""
    name_cells_xpath = ('//tr[contains(@class,"trusted tlistrow")]'
                        '/td[contains(@class, "tlistname")]')
    for name_cell in HtmlXPathSelector(response).select(name_cells_xpath):
        torrent_loader = XPathItemLoader(item=TorrentItem(), selector=name_cell)
        torrent_loader.add_xpath('torrent', 'a/@href')
        torrent_loader.add_xpath('title', 'a[contains(@href, "nyaa")]/text()')
        yield torrent_loader.load_item()
def parse(self, response):
    """Yield one ``iWatchOnline`` item per node matched by ``self.links_list_xpath``.

    Fields and their XPaths come from ``self.episodes_field``.
    """
    matched_links = HtmlXPathSelector(response).select(self.links_list_xpath)
    for link_node in matched_links:
        episode_loader = XPathItemLoader(iWatchOnline(), selector=link_node)
        # Strip every extracted string, then join the pieces per field.
        episode_loader.default_input_processor = MapCompose(unicode.strip)
        episode_loader.default_output_processor = Join()
        for name, path in self.episodes_field.iteritems():
            episode_loader.add_xpath(name, path)
        yield episode_loader.load_item()
def parse_item(self, response):
    """Load a ``BookItem`` for store 4 from a book detail page."""
    l = XPathItemLoader(item=BookItem(), response=response)
    l.add_xpath('name', '//span[@id=\'ctl00_ContentPlaceHolderMainOrta_LabelAdi\']/text()')
    l.add_xpath('isbn', '//span[@id=\'ctl00_ContentPlaceHolderMainOrta_LabelIsbn\']/text()')
    l.add_xpath('author', '//span[@id=\'ctl00_ContentPlaceHolderMainOrta_LabelYazar\']/a/text()')
    l.add_xpath('publisher', '//a[@id=\'ctl00_ContentPlaceHolderMainOrta_HyperLinkYayinci\']/text()')
    # Pass the pattern by keyword: newer Scrapy loaders treat extra positional
    # arguments of add_xpath as processor callables, not as a regex.
    l.add_xpath('price', '//span[@class=\'fiyat\']/text()', re=u'(.*) TL')
    l.add_value('link', response.url)
    l.add_value('store', 4)
    return l.load_item()
def parse_item(self, response):
    """Load a ``HotfrogItem`` with the company text and return it.

    Two blank lines are printed before returning. The two further print
    calls that followed the ``return`` could never run and are removed.
    """
    l = XPathItemLoader(item=HotfrogItem(), response=response)
    l.add_xpath('company', '/html/body/center/table[2]/text()')
    res = l.load_item()
    print("")
    print("")
    return res
def parse(self, response):
    """Return one ``FinanceIndex`` (unit "%") per ``(name, pattern, pos)`` in ``rates``.

    The value is read from the ``pos``-th cell following the cell whose link
    text contains ``pattern``.
    """
    def load_rate(name, pattern, pos):
        # Build and load the item for a single configured rate.
        loader = XPathItemLoader(item=FinanceIndex(), response=response)
        loader.add_value("name", name)
        loader.add_value("unit", "%")
        loader.add_xpath(
            "value",
            "//a[contains(text(), '%s')]/parent::td/following-sibling::td[%d]/text()" % (pattern, pos))
        return loader.load_item()

    return [load_rate(name, pattern, pos) for name, pattern, pos in rates]
def get_UT_item(self, sel, user_url):
    ''' Build the user-topic relationship item (``ZhiHuU_T``).

    ``sel`` is the selector of the topic and ``user_url`` the page it was
    crawled from.
    '''
    # Third-last and second-last components of the URL, rooted at '/'.
    user_path = '/' + '/'.join(user_url.split('/')[-3:-1])

    relation = XPathItemLoader(item=ZhiHuU_T(), selector=sel)
    relation.add_value('crawled_from', user_url)
    relation.add_value('user_url', user_path)
    relation.add_xpath('topic_url', './/a[contains(@class, "zm-list-avatar-link")]/@href')
    return relation.load_item()
def parse_item(self, response):
    """Load a ``BookItem`` for store 5 from a book detail page."""
    l = XPathItemLoader(item=BookItem(), response=response)
    l.add_xpath('name', '//h1[@class=\'kitapad14pnt\']/b/text()')
    # Patterns go in by keyword: newer Scrapy loaders treat extra positional
    # arguments of add_xpath as processor callables, not as a regex.
    l.add_xpath('isbn', '//span[@class=\'kunye\']/text()', re=u'ISBN: ([0-9\-X]+)')
    l.add_xpath('author', '//span[@class=\'yazarad12pnt\']/a/span[@class=\'yazarad12pnt\']/text()')
    l.add_xpath('publisher', '//h3[@class=\'kapakyazisi\']/b/font/a/text()')
    l.add_xpath('price', '//span[@class="kapakyazisi"]/font/b/text()', re=u'(.*) TL')
    l.add_value('link', response.url)
    l.add_value('store', 5)
    return l.load_item()
def parse_item(self, response):
    """Load a ``BookItem`` for store 2 from a book detail page."""
    l = XPathItemLoader(item=BookItem(), response=response)
    l.add_xpath('name', '//div[@class=\'boxTanimisim\']/div/text()')
    # Patterns go in by keyword: newer Scrapy loaders treat extra positional
    # arguments of add_xpath as processor callables, not as a regex.
    l.add_xpath('isbn', '//div[@id=\'tanitimbox\']/text()', re=u'.*ISBN : ([0-9]+)')
    l.add_xpath('author', '//div[@class=\'boxTanimVideo\']/a/text()')
    l.add_xpath('publisher', '//h3[@class=\'boxTanimyayinevi\']/a/b/text()')
    l.add_xpath('price', '//b[@class=\'pricerange\']/text()', re=u'\s*([0-9,]*) TL \(KDV Dahil\)')
    l.add_value('link', response.url)
    l.add_value('store', 2)
    return l.load_item()
def parse_item(self, response):
    """Load a ``BookItem`` for store 3 from a book detail page."""
    l = XPathItemLoader(item=BookItem(), response=response)
    l.add_xpath('name', '//span[@class=\'kitapismi\']/text()')
    # Patterns go in by keyword: newer Scrapy loaders treat extra positional
    # arguments of add_xpath as processor callables, not as a regex.
    l.add_xpath('isbn', '//span[@class=\'normalkucuk\']/text()', re=u'ISBN:([0-9]+)')
    l.add_xpath('author', '//span/a[contains(@href, "/yazar/")]/text()')
    l.add_xpath('publisher', '//span/a[contains(@href, "/yayinevi/")]/text()')
    l.add_xpath('price', '//td/text()', re=u'Kitapyurdu Fiyatı:(.*) TL\.')
    l.add_value('link', response.url)
    l.add_value('store', 3)
    return l.load_item()
def get_user(self, selector):
    """Build a ``LazyTweetUser`` from the first anchor under ``selector``.

    The anchor text is stored as the Twitter user name, and the profile URL
    is formed by appending that name to the twitter.com root.
    """
    profile = XPathItemLoader(item=LazyTweetUser(), selector=selector)
    profile.add_xpath('twitter_username', './a[1]/text()')
    username = profile.get_output_value('twitter_username')
    profile.add_value('twitter_url', ''.join([r'http://twitter.com/', username]))
    return profile.load_item()
def parse_argument(self, response):
    """Load an ``Argument`` item (id, rating, essay markup) from the page.

    The id comes from ``self.parse_id_from_url``; when that yields a falsy
    value, -1 is stored instead.
    """
    argument = XPathItemLoader(item=Argument(), response=response)
    parsed_id = self.parse_id_from_url(response.url)
    argument.add_value('id', parsed_id if parsed_id else -1)
    argument.add_xpath('rating', '//b[@id="QuestionRateValue"]/text()')
    argument.add_xpath('essay', '//div[@class="essay"]')
    return argument.load_item()
def parse(self, response):
    """Return a one-element list with the "Tasa Objetivo BCU" ``FinanceIndex``.

    Name, unit and value are all fixed; nothing is read from the response
    body.
    """
    rate = XPathItemLoader(item=FinanceIndex(), response=response)
    rate.add_value("name", "Tasa Objetivo BCU")
    rate.add_value("unit", "%")
    # "8.75" is a constant, not an XPath expression: add it as a value instead
    # of evaluating it against the document with add_xpath.
    rate.add_value("value", u"8.75")
    #rate.update_only_if_change = True
    return [rate.load_item()]
def get_UT_item(self, sel, user_url):
    ''' Generate the u_t (user-topic) relationship item.

    Takes the selector of the topic and the user URL the topic was found on.
    '''
    url_parts = user_url.split('/')
    loader = XPathItemLoader(item=ZhiHuU_T(), selector=sel)
    values = (
        ('crawled_from', user_url),
        # Keep only the third-last and second-last URL components.
        ('user_url', '/' + '/'.join(url_parts[-3:-1])),
    )
    for field, value in values:
        loader.add_value(field, value)
    loader.add_xpath(
        'topic_url', './/a[contains(@class, "zm-list-avatar-link")]/@href')
    return loader.load_item()
def get_user(self, selector):
    """Build a ``YahooUser`` from the node in ``selector``.

    Returns the loaded item, or ``None`` when no user name was collected.
    The user id is the ``show`` parameter of the profile URL; it is only
    added when the URL is present and matches the expected profile pattern.
    """
    user_loader = XPathItemLoader(item=YahooUser(), selector=selector)
    user_loader.add_xpath(
        'user_name',
        './/span[contains(@class, "user")]//span[contains(@class, "fn")]/text()')
    user_loader.add_xpath('user_url',
                          './/span[@class="user"]//a[@class="url"]/@href')

    # Guard both failure modes: a missing URL used to make re.match raise
    # TypeError, and a non-matching URL made ``.group`` fail on None, so the
    # "return None" path below was never reached.
    user_url = user_loader.get_output_value('user_url')
    match = None
    if user_url:
        match = re.match(
            r'http://answers\.yahoo\.com/my/profile\?show=(.*)', user_url)
    if match:
        user_loader.add_value('user_id', match.group(1))

    if user_loader.get_collected_values('user_name'):
        return user_loader.load_item()
    return None
def parse(self, response):
    """Load a ``FinanceIndex`` (unit "%") for every entry of ``rates``.

    Each entry is a ``(name, pattern, pos)`` triple; the result is the list
    of loaded items in the same order.
    """
    # Template: cell number ``pos`` after the cell whose link text contains
    # ``pattern``.
    value_xpath = "//a[contains(text(), '%s')]/parent::td/following-sibling::td[%d]/text()"

    loaded = []
    for label, needle, column in rates:
        index_loader = XPathItemLoader(item=FinanceIndex(), response=response)
        index_loader.add_value("name", label)
        index_loader.add_value("unit", "%")
        index_loader.add_xpath("value", value_xpath % (needle, column))
        loaded.append(index_loader.load_item())
    return loaded
def parse(self, response):
    """Yield one ``SearchResults`` item per node matched by ``self.startup_results_xpath``."""
    html = HtmlXPathSelector(response)
    for startup_node in html.select(self.startup_results_xpath):
        result_loader = XPathItemLoader(SearchResults(), selector=startup_node)
        # Strip on input, join on output - applied to every field.
        result_loader.default_output_processor = Join()
        result_loader.default_input_processor = MapCompose(unicode.strip)
        for field_name, field_xpath in self.item_fields.iteritems():
            result_loader.add_xpath(field_name, field_xpath)
        yield result_loader.load_item()
def parse(self, response):
    """Yield one ``LivingSocialDeal`` per node matched by ``self.deals_list_xpath``."""
    # Look for the deals on the page.
    found_deals = HtmlXPathSelector(response).select(self.deals_list_xpath)
    for deal_sel in found_deals:
        deal_loader = XPathItemLoader(LivingSocialDeal(), selector=deal_sel)
        deal_loader.default_input_processor = MapCompose(unicode.strip)
        deal_loader.default_output_processor = Join()
        # Register every configured field with its XPath.
        for key, path in self.item_fields.iteritems():
            deal_loader.add_xpath(key, path)
        yield deal_loader.load_item()
def parse_links(self, response):
    """Load an ``AuctionsItem`` from an auction detail page.

    The id is the first ``lid=<digits>`` value found in the response URL;
    the XPath of every scraped field is read from ``settings``.
    """
    listing_ids = re.findall(r"lid=(\d+)", response.url)
    auction = XPathItemLoader(item=AuctionsItem(), response=response)
    auction.add_value("id", listing_ids[0])

    # (item field, settings key holding its XPath), in the original order.
    detail_fields = (
        ("auctioneer", 'AUCTION_AUCTIONEER'),
        ("contact_number", 'AUCTION_CONTACT_NUMBER'),
        ("date", 'AUCTION_DATE'),
        ("time", 'AUCTION_TIME'),
        ("location", 'AUCTION_LOCATION'),
    )
    for field_name, settings_key in detail_fields:
        auction.add_xpath(field_name, settings[settings_key])

    auction.add_value("link", response.url)
    auction.add_xpath("listing", settings['AUCTION_LISTING'])
    return auction.load_item()
def parse(self, response):
    """Yield a ``CompraLineaItem`` per row of the second 760-wide table.

    The purchase order number and the year are parsed from the ``wOCabc``
    and ``wEjercicio`` query parameters of the response URL and attached to
    every row as integers.
    """
    hxs = HtmlXPathSelector(response)
    query = urlparse(response.url).query
    orden_compra, anio = re.search(r'wOCabc=(\d+)&wEjercicio=(\d+)', query).groups()
    orden_compra_num = int(orden_compra)
    anio_num = int(anio)

    # First three cells of a row: quantity, amount, detail.
    column_fields = (('cantidad', 'td[1]/text()'),
                     ('importe', 'td[2]/text()'),
                     ('detalle', 'td[3]/text()'))

    for row in hxs.select('//table[contains(@width, "760")][2]/tr'):
        line_loader = XPathItemLoader(item=CompraLineaItem(), selector=row)
        for field_name, cell_xpath in column_fields:
            line_loader.add_xpath(field_name, cell_xpath)
        line_loader.add_value('orden_compra', orden_compra_num)
        line_loader.add_value('anio', anio_num)
        yield line_loader.load_item()
def get_question(self, selector, response):
    """Build a ``LazyTweetQuestion`` from the question node in ``selector``.

    The question id is the last path component of ``response.url``. The
    collected tags are printed before the item is returned.
    """
    # XPaths start with a dot so they are evaluated relative to ``selector``.
    meta_xpath = './/span[@class="post-meta"]'

    q_loader = XPathItemLoader(item=LazyTweetQuestion(), selector=selector)
    q_loader.add_xpath(
        'question_content',
        './/span[@class="post-body"]'
        '//span[@class="post-status"]/descendant-or-self::text()')
    # not useful
    q_loader.add_xpath('question_tags', '//*[@id="post-tags"]/ul/li/a/text()')
    q_loader.add_xpath(
        'asking_date',
        './/span[@class="post-meta"]//span[@class="timestamp"]/text()')

    asker = self.get_user(selector.select(meta_xpath))
    q_loader.add_value('asker', asker)

    q_loader.add_xpath(
        'number_of_answers',
        './/span[@class="post-meta"]'
        '//a[last()]/text()')
    q_loader.add_value('question_id', response.url.split('/')[-1])

    print(q_loader.get_output_value('question_tags'))
    return q_loader.load_item()
def parse_articles(self, response):
    """Load an ``Article`` (title, content, date, keywords, link) from the page.

    The author is fixed to 'Sainath' and the location to a single blank.
    """
    l = XPathItemLoader(item=Article(), response=response)
    l.add_xpath("title", "//h1[contains(@class,'detail-title')]/text()")
    l.add_xpath(
        "content",
        "//div[contains(@class,'article-text')]//p[contains(@class,'body')]")
    l.add_xpath("date", "//span[contains(@class,'dateline')]/text()")
    # A blank string is not a valid XPath expression, so the placeholder is
    # stored as a plain value rather than evaluated against the document.
    l.add_value("location", " ")
    l.add_xpath("keywords", "//div[@id='articleKeywords']/p/a/text()")
    l.add_value("link", response.url)
    l.add_value("author", 'Sainath')
    return l.load_item()
def parse(self, response):
    """Get response from start_urls"""
    deal_nodes = HtmlXPathSelector(response).xpath(self.xpath_for_deals)
    for deal_node in deal_nodes:
        social_loader = XPathItemLoader(LivingSocial(), selector=deal_node)

        # Processors shared by all fields: strip on input, join on output.
        social_loader.default_input_processor = MapCompose(unicode.strip)
        social_loader.default_output_processor = Join()

        # Each configured XPath is stripped of surrounding whitespace before
        # it is handed to the loader.
        for field_name, raw_xpath in self.item_fields.iteritems():
            social_loader.add_xpath(field_name, raw_xpath.strip())

        yield social_loader.load_item()
def parse_full_report(self, response):
    """Yield a ``NrcScrapedFullReport`` for the page and mark its task 'DONE'.

    The report number is taken from the 'standard_web inc_seq' query
    parameter of the response URL.
    """
    # Work around a bug where lxml cannot handle encode=WINDOWS-1252: decode
    # the body, re-encode it as utf-8 and wrap it in a new TextResponse,
    # because XPathItemLoader requires a Response object.
    decoded_body = unicode(response.body, response.encoding)
    utf8_response = TextResponse(url=response.url,
                                 body=decoded_body.encode('utf-8'),
                                 encoding='utf-8')

    report_loader = XPathItemLoader(NrcScrapedFullReport(), response=utf8_response)
    query_params = parse_qs(urlsplit(response.url).query)
    report_loader.add_value('reportnum', query_params['standard_web inc_seq'])
    report_loader.add_xpath('full_report_body', '//body')
    report_loader.add_value('full_report_url', response.url)

    item = report_loader.load_item()
    reportnum = item['reportnum']
    yield item
    self.db.setBotTaskStatus(reportnum, self.name, 'DONE')
def get_answer(self, selector, response):
    """Build a ``LazyTweetAnswer`` from the answer node in ``selector``.

    The question id is the last path component of ``response.url``. The
    collected answer content is printed before the item is returned.
    """
    answer_loader = XPathItemLoader(item=LazyTweetAnswer(), selector=selector)
    answer_loader.add_value('question_id', response.url.split('/')[-1])
    answer_loader.add_value(
        'answerer',
        self.get_user(selector.select('.//span[@class="answer-meta"]')))
    answer_loader.add_xpath('answer_content', ''.join([
        './/span[@class="answer-body"]',
        '//span[@class="answer-status"]//descendant-or-self::text()'
    ]))
    print(answer_loader.get_output_value('answer_content'))
    # The leftover debugging ``a = input()`` was removed: it blocked the crawl
    # waiting on stdin (and evaluates whatever is typed on Python 2).
    return answer_loader.load_item()
def parse(self, response):
    """Yield one ``LivingSocialDeal`` per node matched by ``self.deals_list_xpath``."""
    root = HtmlXPathSelector(response)

    # Walk over the deals found on the page.
    for current_deal in root.select(self.deals_list_xpath):
        ldr = XPathItemLoader(LivingSocialDeal(), selector=current_deal)

        # Processors: remove whitespace on input, join values on output.
        ldr.default_output_processor = Join()
        ldr.default_input_processor = MapCompose(unicode.strip)

        # Add the XPath of every field to the loader.
        for fld, fld_xpath in self.item_fields.iteritems():
            ldr.add_xpath(fld, fld_xpath)

        yield ldr.load_item()
def parse_doctor_detail(self, response):
    """
    This function parses a sample response. Some contracts are mingled
    with this docstring.

    Yields one ``CYDoctorItem``. The answer and review counters are only
    added when their text matches the expected "<label>:<digits>" form.

    @url http://www.chunyuyisheng.com/doctor/clinic_web_31f4d70d2867b969
    @returns items 1 1
    @returns requests 0 0
    """
    hxs = HtmlXPathSelector(response)

    l = XPathItemLoader(CYDoctorItem(), hxs)
    l.add_xpath('_name', ("//div[@class='bdHd']/h1/text()"))

    # The short description is expected to hold exactly three
    # whitespace-separated words: title, hospital, specialty.
    shortdesc = hxs.select(
        "//div[@id='mainColumn']//p[@class='bdFt']/text()").extract()
    if len(shortdesc) == 1:
        words = shortdesc[0].strip().split()
        if len(words) == 3:
            l.add_value('title', words[0])
            l.add_value('hospital', words[1])
            l.add_value('specialty', words[2])
        else:
            print("title/hostpital/special error.")

    l.add_xpath(
        'specialtyDesc',
        "//div[@id='docOtherInfo']/div[@class='infoCell'][1]//p[2]/text()")
    l.add_xpath(
        'personalInfo',
        "//div[@id='docOtherInfo']/div[@class='infoCell'][2]//p[2]/text()")
    l.add_xpath('stars', "//p[@class='right starTxt']/text()")

    answer = hxs.select(
        "//div[@id='resolvedData']/p[1]/a/text()").extract()
    if len(answer) == 1:
        answerStr = answer[0].strip().replace(u"\xa0", "")
        m = re.match(u"解答:(?P<answer_cnt>\d+)", answerStr)
        # re.match returns None when the text does not match; calling
        # groupdict() on it raised AttributeError.
        if m is not None:
            l.add_value('answers', m.group("answer_cnt"))

    review = hxs.select("//div[@id='resolvedData']/p[2]/text()").extract()
    if len(review) == 1:
        reviewStr = review[0].strip().replace(u"\xa0", "")
        m = re.match(u"评价:(?P<review_cnt>\d+)", reviewStr)
        if m is not None:
            l.add_value('reviews', m.group("review_cnt"))

    # l.add_xpath('answers', "//div[@id='resolvedData']/p[1]/a/text()")
    # l.add_xpath('reviews', "//div[@id='resolvedData']/p[2]/text()")
    ret = l.load_item()
    print(ret)
    yield ret
def search_results(self, response):
    """Yield new ``NrcScrapedReport`` items from a results page, then the next page.

    Reports that ``self.db`` already knows are logged and skipped. A new
    report is yielded and its bot task is marked 'DONE'. Finally a request
    for the next results page is yielded when a pagination link is present.

    NOTE(review): SOURCE was collapsed onto one line; the placement of
    ``yield item`` inside the "new report" branch is reconstructed - confirm.
    """
    decoded = unicode(response.body, response.encoding)
    hxs = HtmlXPathSelector(text=decoded)
    rows = hxs.select('//table[@class="t16Standard"]/tr')

    if rows:
        # The first row is the table header, not a report record.
        rows.pop(0)
        if rows:
            self.log('Retrieved {0} incident reports'.format(len(rows)), log.INFO)
        else:
            self.log('No incident reports found in response', log.WARNING)
    else:
        self.log('Incident report data not present in response', log.ERROR)

    for row in rows:
        report_loader = XPathItemLoader(NrcScrapedReport(), row)
        report_loader.context['base_url'] = response.url
        for field_name, field_meta in NrcScrapedReport.fields.items():
            report_loader.add_xpath(field_name, field_meta['xpath'])
        item = report_loader.load_item()

        if self.db.reportExists(item['reportnum']):
            self.log('Report {0} already exists. Skipping to next report.'.format(item['reportnum']), log.INFO)
            continue

        # These requests are built but not yielded; they are referenced only
        # by the disabled follow-up code kept below.
        f_request = Request(item['full_report_url'], callback=self.parse_full_report)
        m_request = Request(item['materials_url'], callback=self.parse_materials)
        yield item
        self.db.setBotTaskStatus(item['reportnum'], self.name, 'DONE')
        # if self.db.fullReportExists (item['reportnum']):
        #     self.log('Full report Report {0} already exists. Skipping download.'.format(item['reportnum']), log.INFO)
        # else:
        #     yield f_request
        #
        # if self.db.materialExists (item['reportnum']):
        #     self.log('Materials record(s) already exist for report {0}. Skipping download.'.format(item['reportnum']), log.INFO)
        # else:
        #     yield m_request

    # get next page of results
    next_links = hxs.select('//td[@class="pagination"][4]/a/@href')
    if len(next_links) > 0:
        yield Request(urljoin(response.url, next_links[0].extract()),
                      callback=self.search_results)
def parse(self, response):
    """Yield a ``TeoniteItem`` per node of ``self.data_list``, then follow ``self.next_page``."""
    page = HtmlXPathSelector(response)

    # One item per matched data node.
    for data_node in page.select(self.data_list):
        item_loader = XPathItemLoader(TeoniteItem(), selector=data_node)
        item_loader.default_output_processor = Join()
        item_loader.default_input_processor = MapCompose(str.strip)
        # Register each field's XPath with the loader.
        for field_name, field_xpath in self.item_fields.items():
            item_loader.add_xpath(field_name, field_xpath)
        yield item_loader.load_item()

    # Follow every "next page" node with this same callback.
    for next_node in page.select(self.next_page):
        yield response.follow(next_node, callback=self.parse)
def parse(self, response):
    """Yield a ``LinhaItem`` (line number and name) per row of the lines table.

    NOTE(review): the detail request built for each row is never yielded, so
    ``self.parse_item`` is not invoked from here and 'ida'/'volta' stay empty
    - confirm the intended hand-off before relying on those fields.
    """
    hxs = HtmlXPathSelector(response)
    for qxs in hxs.select(self.lista_linhas_xpath):
        loader = XPathItemLoader(LinhaItem(), selector=qxs)
        loader.add_xpath('linha', './td[1]/p//text()')
        loader.add_xpath('nome', './td[3]/p//text()')

        link = self.base_url + qxs.select('./td[3]//a/@href').extract()[0]
        # TODO: should keep the context and return the data of the next page,
        # but it looks like it is not being returned.
        request = Request(link, callback=self.parse_item)
        #pdb.set_trace()
        # The request was created just above without a ``meta`` argument, so
        # indexing meta['ida'] / meta['volta'] always raised KeyError and no
        # item was ever produced; .get() lets the row be emitted.
        loader.add_value('ida', request.meta.get('ida'))
        loader.add_value('volta', request.meta.get('volta'))
        yield loader.load_item()
def get_user(self, selector, response, label):
    """Load a StackOverflowUser from the "user-details" block under *selector*.

    ``response`` and ``label`` are accepted for the caller's convenience but
    are not read by this method.

    Returns the loaded item.  ``user_id`` is only populated when a user link
    was extracted.
    """
    details_xpath = './/div[contains(@class, "user-details")]'

    user_loader = XPathItemLoader(item=StackOverflowUser(), selector=selector)
    user_loader.add_xpath('user_name', details_xpath + '/a/text()')
    user_loader.add_xpath('user_link', details_xpath + '/a/@href')

    # Evaluate the output value once instead of once per use.
    user_link = user_loader.get_output_value('user_link')
    if user_link:
        # NOTE(review): user_id receives the link value itself; presumably
        # the item's field processors derive the id from it -- confirm.
        user_loader.add_value('user_id', user_link)

    return user_loader.load_item()
def parse_talk(self, response):
    """Build a Pybr8TalksItem from a talk page.

    Every field is filled from a fixed XPath evaluated against *response*.
    """
    field_xpaths = (
        ('title', '//div[@id="proposal"]/h1/text()'),
        ('description', '//div[@class="twocolumn"]/div[2]/text()[2]'),
        ('author_name', '//div[@class="twocolumn"]/div/div[2]/h3/text()'),
        ('author_profile', '//div[@class="twocolumn"]/div/div[2]/text()[3]'),
    )
    talk = XPathItemLoader(item=Pybr8TalksItem(), response=response)
    for field, xpath in field_xpaths:
        talk.add_xpath(field, xpath)
    return talk.load_item()
def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses.

    Yields one CrunchBaseEvent for every node matched by
    ``self.events_list_xpath``, filling each field from the XPath
    configured for it in ``self.item_fields``.
    """
    page = HtmlXPathSelector(response)

    for node in page.select(self.events_list_xpath):
        event_loader = XPathItemLoader(CrunchBaseEvent(), selector=node)

        # Processors applied to every field of the item.
        event_loader.default_input_processor = MapCompose(unicode.strip)
        event_loader.default_output_processor = Join()

        for name, expression in self.item_fields.iteritems():
            event_loader.add_xpath(name, expression)

        yield event_loader.load_item()
def parse(self, response):
    """Build a SearchResultItem describing the searched keyword on this page.

    The item id is the MD5 hex digest of the auction id, the extracted
    keyword value and the spider name joined with ``::``.
    """
    keywords_xpath = "//input[@id='ctl00_ctlPagePlaceHolder_Keywords']/@value"

    # NOTE(review): item_name is whatever .extract() returns and is
    # interpolated with %s as-is -- presumably a list, so the digest covers
    # its repr.  Existing ids depend on this; confirm before changing.
    item_name = HtmlXPathSelector(response).select(keywords_xpath).extract()
    digest_source = '%s::%s::%s' % (self.auction_id, item_name, self.name)
    item_hash = hashlib.md5(digest_source).hexdigest()

    result = XPathItemLoader(item=SearchResultItem(), response=response)
    result.add_value("id", item_hash)
    result.add_value("auction_id", self.auction_id)
    result.add_value("site", self.name)
    result.add_xpath("name", keywords_xpath)
    result.add_value("link", response.url)
    result.add_xpath("price", "//td[7]/text()")
    return result.load_item()
def parse(self, response):
    """Build a PersonItem from the search URL parameters and the result page.

    The name, city, state, zipcode and ``prop_ref`` fields are taken from
    the query string of ``response.url``; ``cities`` and ``age`` are
    extracted from the page body.

    Returns the loaded item, or ``None`` when any of the expected query
    parameters is missing from the URL.
    """
    # Scrapy does not allow assigning to Response.body, so build a cleaned
    # copy of the response with backslashes and 0xA0 bytes removed.
    cleaned_body = response.body.replace(b'\\', b'').replace(b'\xa0', b'')
    response = response.replace(body=cleaned_body)

    url_fields = (
        ('first_name', r'&qf=(\w+)&'),
        ('middle_name', r'&qmi=(\w+)&'),
        ('last_name', r'&qn=(\w+)&'),
        ('city', r'&qc=(\w+)&'),
        ('state', r'&qs=(\w+)&'),
        ('zipcode', r'&qz=(\d+)&'),
        ('prop_ref', r'&prop_ref=(\d+)'),
    )

    p = XPathItemLoader(item=PersonItem(), response=response)
    for field, pattern in url_fields:
        found = re.findall(pattern, response.url)
        if not found:
            # A required query parameter is absent: skip this response.
            return None
        p.add_value(field, found[0])

    p.add_xpath('cities', '//div[@class="addresses"]/p/b/text()[1]', re=r"([^\(]+)")
    p.add_xpath('age', '//div[@class="greenTopBoxLeft round12_12_0_0"]/p[@class="nameAge"]/text()[2]', re=r", Age (\d+)")
    return p.load_item()
def parse_faculty_detail(self, response):
    """
    Follow the "menzhen" listing links of a faculty page and yield one
    DoctorItem per doctor row of its ``doc_list_index`` table.

    Some contracts are mingled with this docstring.

    @url http://www.haodf.com/faculty/DE4rO-XCoLU0Jq1rbc1P6dS2aO.htm
    @returns items 21 21
    @returns requests 3 3
    @scrapes _name specialty title shortDesc
    """
    hxs = HtmlXPathSelector(response)

    extractor = SgmlLinkExtractor(
        allow=(r"/faculty/\S+/menzhen.htm\?orderby", ), unique=True)
    for link in extractor.extract_links(response):
        yield Request(link.url, callback=self.parse_faculty_detail)

    # Page-level values shared by every doctor on the page.
    specialty = hxs.select(
        "/html/body/div[3]/div/div[2]/div/a[3]/text()").extract()
    hospital = hxs.select(
        "/html/body/div[3]/div/div[2]/div/a[2]/text()").extract()

    name_xpath = "./td[@class='tda']/li/a[contains(@href, 'http://www.haodf.com/doctor/')]/text()"
    rows = hxs.select(
        "//table[@id='doc_list_index']/tr[descendant::td[contains(@class, 'tda')]]"
    )
    for row in rows:
        doctor = XPathItemLoader(DoctorItem(), row)

        names = row.select(name_xpath).extract()
        if names:
            print(names[0])

        doctor.add_xpath('_name', name_xpath)
        doctor.add_value('specialty', specialty)
        doctor.add_value('hospital', hospital)
        doctor.add_xpath('title', "./td[@class='tda']/li/p[1]/text()")
        doctor.add_xpath('acadamicDegree', "./td[@class='tda']/li/p[2]/text()")
        doctor.add_xpath('shortDesc', "./td[@class='tdb']/text()")
        # TODO: clinic time
        yield doctor.load_item()
def myparse(self, response):
    """Dump the page to disk and yield a MoviesClass item per detail block.

    The raw response body is written to ``<title>.html`` in the current
    working directory, where the title is the first ``title4`` link text
    under ``div#detailed``.  When no title is found the dump is skipped
    instead of raising.  Each ``div#detailed`` node is then loaded using the
    XPaths in ``self.mov_fields``.
    """
    print("myParse")
    selector = HtmlXPathSelector(response)
    detailed = selector.select('//div[@id="detailed"]')

    titles = detailed.select('.//div[@class="title4"]/a/text()').extract()
    if titles:
        title = titles[0].strip()
        # NOTE(review): the file name comes straight from page content; a
        # title containing path separators would escape the working
        # directory.  Consider sanitising it.
        with open(title + '.html', 'wb') as dump:
            dump.write(response.body)
        print(title)

    for deal in detailed:
        loader = XPathItemLoader(MoviesClass(), selector=deal)
        loader.default_input_processor = MapCompose(unicode.strip)
        # Only TakeFirst is effective; an earlier Join() assignment was
        # overwritten immediately and has been removed.
        loader.default_output_processor = TakeFirst()
        for field, xpath in self.mov_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
def parse(self, response):
    """Yield a LeboncoinItem for each ad link inside the "list-ads" block.

    ``name``, ``photo`` and ``url`` are extracted relative to the ad link;
    ``category`` is the fourth-from-last ``/``-separated part of the
    response URL.
    """
    page = HtmlXPathSelector(response)
    for ad in page.select('//div[@class="list-ads"]/a'):
        ad_loader = XPathItemLoader(LeboncoinItem(), selector=ad)
        ad_loader.add_xpath(
            'name',
            'div[@class="ad-lbc"]/div[@class="detail"]/div[@class="title"]/text()',
            re=r'^\s*([\w\s]+\w)\s*',
        )
        ad_loader.add_xpath(
            'photo',
            'div[@class="ad-lbc"]/div[@class="image"]/div[@class="image-and-nb"]/img/@src',
        )
        ad_loader.add_xpath('url', '@href')
        url_parts = response.url.split("/")
        ad_loader.add_value('category', url_parts[-4])
        yield ad_loader.load_item()
def parse_materials(self, response):
    """Parse the materials table of a report and yield one item per row.

    Each NrcScrapedMaterial gets its ``reportnum`` from the
    ``P3_SEQNOS:<digits>`` part of ``response.url``; the remaining fields
    are loaded from the row using the ``xpath`` entry of every item field
    that declares one.
    """
    text = unicode(response.body, response.encoding)
    hxs = HtmlXPathSelector(text=text)
    materials = hxs.select('//table[@class="t16Standard"]/tr')

    if not materials:
        self.log('Materials data not present in response from {0}'.format(response.url), log.INFO)
        return

    # The first row of the table is the header, not a materials record.
    materials.pop(0)
    if not materials:
        # This message used to say "incident reports" (copied from the
        # report parser), which was misleading in the crawl log.
        self.log('No materials records found in response', log.INFO)
        return

    self.log('Retrieved {0} materials records'.format(len(materials)), log.INFO)

    for material in materials:
        l = XPathItemLoader(NrcScrapedMaterial(), material)
        l.add_value('reportnum', response.url, TakeFirst(), re=r'P3_SEQNOS:(\d+)')
        for name, params in NrcScrapedMaterial.fields.items():
            if 'xpath' in params:
                l.add_xpath(name, params['xpath'])
        yield l.load_item()
def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses.

    Yields one LivingSocialDeal for every node matched by
    ``self.deals_list_xpath``.

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    page = HtmlXPathSelector(response)

    for node in page.xpath(self.deals_list_xpath):
        deal_loader = XPathItemLoader(LivingSocialDeal(), selector=node)

        # Processors applied to every field of the item.
        deal_loader.default_input_processor = MapCompose(unicode.strip)
        deal_loader.default_output_processor = Join()

        # Register the XPath configured for each item field.
        for name, expression in self.item_fields.iteritems():
            deal_loader.add_xpath(name, expression)

        yield deal_loader.load_item()
def parse(self, response):
    """
    Parse a city hospital listing: yield the CityItem, a request for every
    link in the city tree, and one HospitalItem per listed hospital.

    Some contracts are mingled with this docstring.

    @url http://www.haodf.com/yiyuan/shanghai/list.htm
    @returns items 21 21
    @returns requests 3 3
    @scrapes _hospitalName grade area city
    """
    hxs = HtmlXPathSelector(response)
    city_tree = hxs.select("//div[@id='el_tree_1000000']")

    # Kept for the hospital items built further down.
    city_name = city_tree.select("div[@class='kstl2']/a/text()").extract()[0]

    city_loader = XPathItemLoader(CityItem(), city_tree)
    city_loader.add_xpath('cityAreas', "div[@class='ksbd']/ul/li/a/text()")
    city_loader.add_xpath('_cityName', "div[@class='kstl2']/a/text()")
    yield city_loader.load_item()

    for url in city_tree.select("div[@class='kstl']/a/@href").extract():
        yield Request(url, callback=self.parse)

    area_list = hxs.select(
        "//div[@id='el_result_content']/div/div[@class='bxmd']/div")
    for hospital in area_list.select("div[@class='m_ctt_green']/ul/li/a"):
        hospital_loader = XPathItemLoader(HospitalItem(), hospital)
        hospital_loader.add_xpath('_hospitalName', "text()")

        # The sibling <span> holds "(grade)" or "(grade, 特色:feature)".
        features = hospital.select("following-sibling::span/text()").extract()
        if len(features) == 1:
            m = re.match(u"\((?P<grade>\S+)(|, 特色:(?P<feature>\S+))\)",
                         features[0].strip())
            if m is not None:
                groups = m.groupdict()
                for key in ('grade', 'feature'):
                    if groups[key] is not None:
                        hospital_loader.add_value(key, groups[key])

        hospital_loader.add_xpath(
            'area',
            "parent::*/parent::*/parent::*/preceding-sibling::*[1]/attribute::id"
        )
        hospital_loader.add_value('city', city_name)
        yield hospital_loader.load_item()
def parse(self, response):
    """Yield a ProveedorItem per table row, then follow the extracted links.

    Rows come from the table inside ``div#miListView``; the links are
    whatever ``self.extractor`` finds in the response, each requested with
    this method as the callback.
    """
    column_xpaths = (
        ('nombre', 'td[1]/text()'),
        ('domicilio', 'td[2]/text()'),
        ('cuit', 'td[3]/text()'),
        ('localidad', 'td[4]/text()'),
    )

    hxs = HtmlXPathSelector(response)
    for row in hxs.select('//div[@id="miListView"]/table/tr'):
        row_loader = XPathItemLoader(item=ProveedorItem(), selector=row)
        for field, xpath in column_xpaths:
            row_loader.add_xpath(field, xpath)
        yield row_loader.load_item()

    for link in self.extractor.extract_links(response):
        yield Request(link.url, callback=self.parse)
def parse_item(self, response):
    """Build a PytexasItem from the ``span6`` block of a talk page.

    ``title``, ``speaker`` and ``description`` are each filled from a fixed
    XPath evaluated against *response*.
    """
    field_xpaths = (
        ('title', '//*/div[@class="span6"]/h2/text()'),
        ('speaker', '//*/div[@class="span6"]/h3/text()'),
        ('description', '//*/div[@class="span6"]/p[2]/text()'),
    )
    talk = XPathItemLoader(item=PytexasItem(), response=response)
    for field, xpath in field_xpaths:
        talk.add_xpath(field, xpath)
    # Literal values can also be supplied with add_value().
    return talk.load_item()