Example #1
    # module-level imports assumed: import json, scrapy
    def l2_parse_poets_list(self, response):
        '''
        Level 2: parse the poets list and crawl to the next level,
        i.e. each individual poet's page and its subsections.
        '''
        # log the single-character 'startsWith' value (the index letter)
        i = response.url.find('&startsWith=')
        self.logger.debug('l2_parse_poets_list: %s', response.url[i + 12])

        data = json.loads(response.body)
        poets = data['Data']
        total = data['Total']
        errors = data['Errors']  # not used below
        count = len(poets)
        self.logger.info('l2_parse_poets_list: result has %d of %d poets', count, total)

        for poet in poets:
            # extract the poet's details and save them
            poet_info = {
                'name': poet['Name'],
                'birth': poet['FromDate'],
                'death': poet['ToDate'],
                'url': self.BASE_URL_POETS + poet['SEOSlug'] + '/',
            }
            save_to_db_author(poet_info)

            # crawl the poet's page: one request per content type
            url_base = (self.domain_name + self.API_POET_READ
                        + '?sort=SortTitle-asc&page=1&lang=1&pageSize=10000'
                        + '&id=' + poet['SEOSlug'])
            for info in ['ghazals', 'couplets', 'nazms']:
                url = url_base + '&info=' + info
                yield scrapy.Request(url, callback=self.l3_parse_poetry_list)
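
These examples persist author records through a save_to_db_author helper that the snippets do not define. Below is a minimal sketch of such a helper, assuming a SQLite backend; the poets.db file name, the authors table, and the upsert-by-URL behavior are all assumptions, not the original implementation.

    # hypothetical persistence helper; file name, table, and schema are assumptions
    import sqlite3

    DB_PATH = 'poets.db'

    def save_to_db_author(author):
        '''Insert or update a single author record, keyed on its URL.'''
        conn = sqlite3.connect(DB_PATH)
        try:
            conn.execute(
                'CREATE TABLE IF NOT EXISTS authors '
                '(url TEXT PRIMARY KEY, name TEXT, birth TEXT, death TEXT)')
            conn.execute(
                'INSERT OR REPLACE INTO authors (url, name, birth, death) '
                'VALUES (?, ?, ?, ?)',
                (author['url'], author['name'], author['birth'], author['death']))
            conn.commit()
        finally:
            conn.close()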
Example #2
    # module-level imports assumed: import json, sys, time, traceback
    def parse_poet_page(self, response):
        '''
        Parse the poet page:
        1. Extract the date of birth and the date of death
        2. Extract the year of birth and the year of death
        3. Save the poet's details
        4. Crawl further to scrape their articles
        '''
        self.crawler.stats.inc_value('kangaroo/poet_page_visit')

        try:
            name = response.xpath(
                "//h1[@id='firstHeading']//text()").extract()[0]
            try:
                birth = response.xpath(
                    "//div[@id='mw-content-text']/table[@id='kkparichay-box']//div[@id='kkparichay-dob']/text()"
                ).extract()[0]
            except IndexError:
                # date of birth not present on the page
                birth = None
            try:
                death = response.xpath(
                    "//div[@id='mw-content-text']/table[@id='kkparichay-box']//div[@id='kkparichay-dod']/text()"
                ).extract()[0]
            except IndexError:
                # date of death not present on the page
                death = None

            data = {
                'name': name,
                'birth': birth,
                'death': death,
                'url': response.url,
            }

            # store this information in the DB
            save_to_db_author(data)
            self.crawler.stats.inc_value('kangaroo/poet_found')

        except Exception:
            self.logger.error("parse_poet_page: %s", sys.exc_info()[0])
            _trace = ''
            for frame in traceback.extract_tb(sys.exc_info()[2]):
                fname, lineno, fn, text = frame
                self.logger.error("error in %s on line %d", fname, lineno)
                _trace += "error in %s on line %d" % (fname, lineno)
            # append one JSON record per failure to the error log
            with open(self.LOGFILE, "a") as outfile:
                t = time.asctime(time.localtime(time.time()))
                json.dump(
                    {
                        'link': response.url,
                        'error': 'parsing poet failed',
                        'trace': _trace,
                        'time': t
                    },
                    outfile,
                    indent=4)

        # Process the page for poems/collections list
        return self.parse_url_list(response)
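
Example #2 tracks progress through the crawler stats counters (kangaroo/poet_page_visit, kangaroo/poet_found). Below is a minimal sketch of how those counters could be read back after a run; the PoetSpider class name and its import path are placeholders, not names from the original code.

    from scrapy.crawler import CrawlerProcess

    from myproject.spiders import PoetSpider  # hypothetical import path

    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
    crawler = process.create_crawler(PoetSpider)
    process.crawl(crawler)
    process.start()  # blocks until the crawl finishes

    # read the counters incremented by parse_poet_page
    print(crawler.stats.get_value('kangaroo/poet_page_visit'))
    print(crawler.stats.get_value('kangaroo/poet_found'))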
Example #3
    # module-level imports assumed: import scrapy, sys, traceback
    def l2_parse_author_page(self, response):
        """
        Parse the author page
        1. Extract Date of birth and date of death
        2. Extract year of birth and death
        3. Save Author details
        4. Crawl further to scrap his/her articles
        """
        name = None
        birth = None   # date of birth
        death = None   # date of death

        try:
            name = response.xpath("//h1[@id='firstHeading']//text()").extract()[0]
        except IndexError:
            self.logger.error("l2_parse_author_page: could not extract the author name")

        try:
            birth = response.xpath("//div[@id='mw-content-text']/table[@id='kkparichay-box']//div[@id='kkparichay-dob']/text()").extract()[0]
        except IndexError:
            # date of birth not present on the page
            pass

        try:
            death = response.xpath("//div[@id='mw-content-text']/table[@id='kkparichay-box']//div[@id='kkparichay-dod']/text()").extract()[0]
        except IndexError:
            # date of death not present on the page
            pass

        data = {
            'name': name,
            'birth': birth,
            'death': death,
            'url': response.url,
        }

        # store this information in the DB
        save_to_db_author(data)

        ##
        # Extract article links from the author page and generate a request for each
        try:
            self.logger.debug("l2_parse_author_page: extracting poem links from the author page")
            articles = response.xpath("//div[@id='mw-content-text']/ul/li/a/@href").extract()

            articles_links = [self.domain_name + x for x in articles]
            for url in articles_links:
                # Check whether an entry for ``url`` already exists in the db,
                # and get the list of languages the content is already stored in.
                lang_list = get_language_list_for_url(url)
                # Crawl the poetry page only for the remaining languages.
                # NOTE: ``lang`` is not passed to the request here; Scrapy's
                # duplicate filter will collapse the identical requests.
                for lang in (x for x in self.LANGUAGES if x not in lang_list):
                    yield scrapy.Request(url, callback=self.l3_parse_article_page)
        except Exception:
            self.logger.error("l2_parse_author_page: nothing found on the author page")
            self.logger.error("l2_parse_author_page: unexpected error: %s", sys.exc_info()[0])
            for frame in traceback.extract_tb(sys.exc_info()[2]):
                fname, lineno, fn, text = frame
                self.logger.error("error in %s on line %d", fname, lineno)