def l2_parse_poets_list(self, response):
    '''
    Level 2: parse the poets list JSON and crawl to the next level,
    i.e. each individual poet's page and its subsections.

    Saves each poet's basic info via save_to_db_author(), then yields
    one API request per content type (ghazals/couplets/nazms) per poet.
    '''
    i = response.url.find('&startsWith=')
    # BUG FIX: the original logged response.url[i+12] — a single
    # character. Slice to log the whole startsWith value instead.
    # NOTE(review): if '&startsWith=' is absent, find() returns -1 and
    # this logs url[11:] — harmless for a debug line, but confirm the
    # parameter is always present on URLs routed here.
    self.logger.debug('l2_parse_poets_list: %s', response.url[i + 12:])

    data = json.loads(response.body)
    poets = data['Data']
    total = data['Total']
    count = len(poets)
    self.logger.info('l2_parse_poets_list: result has %d of %d poets',
                     count, total)

    for poet in poets:
        # Extract and persist the poet's basic info.
        poet_info = {
            'name': poet['Name'],
            'birth': poet['FromDate'],
            'death': poet['ToDate'],
            'url': self.BASE_URL_POETS + poet['SEOSlug'] + "/",
        }
        save_to_db_author(poet_info)

        # Crawl the poet's page: one request per content subsection.
        url_base = (self.domain_name + self.API_POET_READ + '?'
                    + '&sort=SortTitle-asc' + '&page=1' + '&lang=1'
                    + '&pageSize=10000' + '&id=' + poet['SEOSlug'])
        for info in ['ghazals', 'couplets', 'nazms']:
            url = url_base + '&info=' + info
            yield scrapy.Request(url, callback=self.l3_parse_poetry_list)
def parse_poet_page(self, response):
    '''
    Parse the poet page.

    1. Extract date of birth and date of death (when present)
    2. Save poet details to the DB
    3. Crawl further to scrape his/her articles (via parse_url_list)

    Any failure during extraction is logged (logger + JSON append to
    self.LOGFILE) and swallowed, so the follow-up crawl still runs.
    '''
    self.crawler.stats.inc_value('kangaroo/poet_page_visit')
    try:
        # extract()[0] raises IndexError when the xpath matches nothing;
        # a missing name aborts the whole extraction (outer handler).
        name = response.xpath(
            "//h1[@id='firstHeading']//text()").extract()[0]
        try:
            birth = response.xpath(
                "//div[@id='mw-content-text']/table[@id='kkparichay-box']//div[@id='kkparichay-dob']/text()"
            ).extract()[0]
        except IndexError:
            # Date of birth is optional — many pages omit it.
            birth = None
        try:
            death = response.xpath(
                "//div[@id='mw-content-text']/table[@id='kkparichay-box']//div[@id='kkparichay-dod']/text()"
            ).extract()[0]
        except IndexError:
            # Date of death is optional (poet may be alive / unknown).
            death = None

        data = {
            'name': name,
            'birth': birth,
            'death': death,
            'url': response.url.encode('utf-8'),
        }
        # Store the information in the DB.
        save_to_db_author(data)
        self.crawler.stats.inc_value('kangaroo/poet_found')
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; behavior otherwise unchanged.
        self.logger.error("parse_poet_page: %s", sys.exc_info()[0])
        _trace = ''
        for frame in traceback.extract_tb(sys.exc_info()[2]):
            fname, lineno, fn, text = frame
            self.logger.error("error in %s on line %d" % (fname, lineno))
            _trace = _trace + "error in %s on line %d" % (fname, lineno)
        with open(self.LOGFILE, "a") as outfile:
            t = time.asctime(time.localtime(time.time()))
            json.dump(
                {
                    'link': response.url,
                    'error': 'parsing poet failed',
                    'trace': _trace,
                    'time': t
                },
                outfile,
                indent=4)

    # Process the page for the poems/collections list regardless of
    # whether the poet details could be extracted.
    return self.parse_url_list(response)
def l2_parse_author_page(self, response):
    """
    Parse the author page.

    1. Extract date of birth and date of death (when present)
    2. Save author details to the DB
    3. Crawl further to scrape his/her articles — yields one request
       per article per language not yet stored in the DB

    Generator: yields scrapy.Request objects for article pages.
    """
    name = None
    birth = None   # date of birth
    death = None   # date of death

    # extract()[0] raises IndexError when the xpath matches nothing.
    # Narrowed from bare `except:`; replaced py2 `print` debugging with
    # self.logger for consistency with the other parse methods.
    try:
        name = response.xpath(
            "//h1[@id='firstHeading']//text()").extract()[0]
    except IndexError:
        self.logger.error(
            "l2_parse_author_page: name not found: %s", response.url)

    try:
        birth = response.xpath(
            "//div[@id='mw-content-text']/table[@id='kkparichay-box']//div[@id='kkparichay-dob']/text()"
        ).extract()[0]
    except IndexError:
        pass  # date of birth is optional

    try:
        death = response.xpath(
            "//div[@id='mw-content-text']/table[@id='kkparichay-box']//div[@id='kkparichay-dod']/text()"
        ).extract()[0]
    except IndexError:
        pass  # date of death is optional

    data = {
        'name': name,
        'birth': birth,
        'death': death,
        'url': response.url.encode('utf-8'),
    }
    # Store the information in the DB.
    save_to_db_author(data)

    # Extract article links from the author page and generate a request
    # for each article, once per language still missing from the DB.
    try:
        self.logger.debug(
            "l2_parse_author_page: extracting poem links from author page")
        articles = response.xpath(
            "//div[@id='mw-content-text']/ul/li/a/@href").extract()
        articles_links = [self.domain_name + x for x in articles]
        for url in articles_links:
            # Languages in which the content is already stored for
            # this url — crawl only the remaining ones.
            lang_list = get_language_list_for_url(url)
            for lang in (x for x in self.LANGUAGES if x not in lang_list):
                # NOTE(review): `lang` is not passed to the request; the
                # original issued one identical request per missing
                # language — preserved as-is, but confirm intent.
                yield scrapy.Request(url, callback=self.l3_parse_article_page)
    except Exception:
        # Narrowed from a bare `except:`; log the traceback instead of
        # printing it.
        self.logger.error(
            "l2_parse_author_page: unexpected error: %s", sys.exc_info()[0])
        for frame in traceback.extract_tb(sys.exc_info()[2]):
            fname, lineno, fn, text = frame
            self.logger.error("error in %s on line %d", fname, lineno)