Example #1
    def parse_poetry_list(self, response):
        '''
        Parse poet's poetry list
        '''
        i = response.url.find('&id=')
        self.logger.debug('parse_poetry_list: %s', response.url[i + 4:])

        data = json.loads(response.body)

        poetries = data['Data']
        total = data['Total']
        errors = data['Errors']
        count = len(poetries)
        self.logger.info('parse_poetry_list: result has %d of %d poetries',
                         count, total)

        for poetry in poetries:
            # Extract info about the poem and crawl its page
            content_slug = poetry['ContentSlug']
            type_slug = poetry['TypeSlug']

            # Create the URL
            url = self.domain_name + type_slug + '/' + content_slug

            # Check whether an entry for ``url`` already exists in the db,
            # and get the list of languages in which the content is stored.
            lang_list = get_language_list_for_url(url)

            # Now crawl the poetry page only for the remaining languages
            for lang in (x for x in self.LANGUAGES if x not in lang_list):
                url_t = url + '?lang=' + lang
                yield scrapy.Request(url_t, callback=self.parse_poetry)
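Every example here calls a helper `get_language_list_for_url` whose definition is not shown. A minimal sketch of what it could look like, assuming the crawled content is tracked in an SQLite table `poem(url, language)` (the database path, table, and column names are hypothetical):

import sqlite3


def get_language_list_for_url(url, db_path='kangaroo.db'):
    # Return the languages already stored for ``url``.
    # Hypothetical schema: poem(url TEXT, language TEXT).
    with sqlite3.connect(db_path) as conn:
        rows = conn.execute(
            "SELECT language FROM poem WHERE url = ?", (url,)).fetchall()
    return [row[0] for row in rows]

The spiders only need membership tests against the result, so returning a plain list (or set) of language codes is enough.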
Example #2
    def l3_parse_poetry_list(self, response):
        '''
        Level 3: parse poet's poetry list
        '''
        i = response.url.find('&id=')
        self.logger.debug('l3_parse_poetry_list: %s', response.url[i + 4:])

        data = json.loads(response.body)

        poetries = data['Data']
        total = data['Total']
        errors = data['Errors']
        count = len(poetries)
        self.logger.info('l3_parse_poetry_list: result has %d of %d poetries',
                         count, total)

        for poetry in poetries:
            # Extract info about the poem and crawl its page
            content_slug = poetry['ContentSlug']
            type_slug = poetry['TypeSlug']

            # Create the URL
            url = self.domain_name + type_slug + '/' + content_slug

            # Check whether an entry for ``url`` already exists in the db,
            # and get the list of languages in which the content is stored.
            lang_list = get_language_list_for_url(url)

            # Now crawl the poetry page only for the remaining languages
            for lang in (x for x in self.LANGUAGES if x not in lang_list):
                url_t = url + '?lang=' + lang
                yield scrapy.Request(url_t, callback=self.l4_parse_poetry)
Example #3
    def parse_url_list(self, response):
        '''
        Find the poem/collection/book list and generate a request for each item.
        '''
        self.logger.debug("parse_url_list: extracting poem/collection links.")
        self.crawler.stats.inc_value('kangaroo/list_page_visit')

        urls = set()
        try:
            # All URLs in the main body content div, i.e. mw-content-text
            urls_all = response.xpath(
                "//div[@id='mw-content-text']//a/@href").extract()
            urls_all = set(urls_all)

            # Exclude the URLs present in breadcrumbs and info boxes
            urls_exclude = response.xpath(
                "//div[@id='mw-content-text']//div[@id='kkrachna' or @class='kkrachna' or @id='extrainfobox' or @class='noarticletext']//a/@href"
            ).extract()
            urls_exclude = set(urls_exclude)

            urls = urls_all - urls_exclude
            self.logger.debug("parse_url_list: %d urls found.", len(urls))

        except Exception:
            print("ERROR: parse_url_list: ", sys.exc_info()[0])
            _trace = ''
            for frame in traceback.extract_tb(sys.exc_info()[2]):
                fname, lineno, fn, text = frame
                print("ERROR: error in %s on line %d" % (fname, lineno))
                _trace = _trace + "error in %s on line %d" % (fname, lineno)
            with open(self.LOGFILE, "a") as outfile:
                t = time.asctime(time.localtime(time.time()))
                json.dump(
                    {
                        'link': response.url,
                        'error': 'parse_url_list failed',
                        'trace': _trace,
                        'time': t
                    },
                    outfile,
                    indent=4)

        urls = [self.domain_name + x for x in urls]

        for url in urls:
            # Check whether an entry for ``url`` already exists in the db,
            # and get the list of languages in which the content is stored.
            lang_list = get_language_list_for_url(url)
            # Crawl the poetry page only if content for some language is
            # still missing; ``url`` itself is language-independent here,
            # so a single request per URL is enough.
            if any(x not in lang_list for x in self.LANGUAGES):
                yield scrapy.Request(url, callback=self.parse_poetry_page)
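Examples #3 through #5 repeat the same traceback-formatting and JSON-logging boilerplate in their except blocks. One way to factor it out, sketched here with a hypothetical helper name `log_parse_error`:

import json
import sys
import time
import traceback


def log_parse_error(logfile, link, error):
    # Append one JSON record describing the current exception to ``logfile``.
    # Must be called from inside an ``except`` block so sys.exc_info() is set.
    _trace = ''.join(
        "error in %s on line %d" % (frame[0], frame[1])
        for frame in traceback.extract_tb(sys.exc_info()[2]))
    with open(logfile, "a") as outfile:
        json.dump(
            {
                'link': link,
                'error': error,
                'trace': _trace,
                'time': time.asctime(time.localtime(time.time())),
            },
            outfile,
            indent=4)

With this in place, the except block in parse_url_list reduces to a single log_parse_error(self.LOGFILE, response.url, 'parse_url_list failed') call.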
Example #4
    def l3_parse_article_page(self, response):
        """
        First check whether the page contains div[@class='poem'] in the XPath.
        1. If found, extract the poem and save it in the database.
        2. If not found, the page lists the poems in a journal, so extract
           the article links and crawl each of them.
        """
        p = response.xpath(
            "//div[@id='mw-content-text']/div[@class='poem']//p").extract()
        if p:
            print("DBG:: l3_parse_article_page: Extracting poem from article page")
            poem = " ".join(p)
            try:
                h1 = response.xpath(
                    "//h1[@id='firstHeading']//text()").extract()[0]
                title = h1
                author = h1.split('/')[-1]

                data = {}
                data['poem'] = poem
                data['url'] = response.url
                data['title'] = title
                data['author'] = author.title()
                data['language'] = 'hi'  # content of this site is in Hindi

                # Store this information in the DB
                save_to_db_poem(data)

            except IndexError:
                print("ERROR:: l3_parse_article_page: Title not found")
        else:
            # Extract article links from the author page and generate a
            # request for each
            try:
                print("DBG:: l3_parse_article_page: Extracting poem links from author page")
                articles = response.xpath(
                    "//div[@id='mw-content-text']/ul/li/a/@href").extract()

                articles_links = [self.domain_name + x for x in articles]
                for url in articles_links:
                    # Check whether an entry for ``url`` already exists in the db,
                    # and get the list of languages in which the content is stored.
                    lang_list = get_language_list_for_url(url)
                    # Crawl the article page only if content for some language
                    # is still missing
                    if any(x not in lang_list for x in self.LANGUAGES):
                        yield scrapy.Request(url, callback=self.l3_parse_article_page)
            except Exception:
                print("DBG:: Nothing found in Author page!")
                print("ERROR: l3_parse_article_page: Unexpected error:", sys.exc_info()[0])
                for frame in traceback.extract_tb(sys.exc_info()[2]):
                    fname, lineno, fn, text = frame
                    print("DBG:: Error in %s on line %d" % (fname, lineno))
Example #5
    def l2_parse_author_page(self, response):
        """
        Parse the author page
        1. Extract date of birth and date of death
        2. Extract year of birth and death
        3. Save author details
        4. Crawl further to scrape his/her articles
        """
        name = None
        # date of birth
        birth = None
        # date of death
        death = None

        try:
            name = response.xpath(
                "//h1[@id='firstHeading']//text()").extract()[0]
        except IndexError:
            print("ERROR:: l2_parse_author_page: name not found")

        try:
            birth = response.xpath(
                "//div[@id='mw-content-text']/table[@id='kkparichay-box']//div[@id='kkparichay-dob']/text()").extract()[0]
        except IndexError:
            pass

        try:
            death = response.xpath(
                "//div[@id='mw-content-text']/table[@id='kkparichay-box']//div[@id='kkparichay-dod']/text()").extract()[0]
        except IndexError:
            pass

        data = {}
        data['name'] = name
        data['birth'] = birth
        data['death'] = death
        data['url'] = response.url

        # Store this information in the DB
        save_to_db_author(data)

        ##
        # Parse the page, find the article list, and generate a request for
        # each article extracted from the author page
        try:
            print("DBG:: l2_parse_author_page: Extracting poem links from author page")
            articles = response.xpath(
                "//div[@id='mw-content-text']/ul/li/a/@href").extract()

            articles_links = [self.domain_name + x for x in articles]
            for url in articles_links:
                # Check whether an entry for ``url`` already exists in the db,
                # and get the list of languages in which the content is stored.
                lang_list = get_language_list_for_url(url)
                # Crawl the article page only if content for some language
                # is still missing
                if any(x not in lang_list for x in self.LANGUAGES):
                    yield scrapy.Request(url, callback=self.l3_parse_article_page)
        except Exception:
            print("l2_parse_author_page: Nothing found in Author page!")
            print("ERROR: l2_parse_author_page: Unexpected error:", sys.exc_info()[0])
            for frame in traceback.extract_tb(sys.exc_info()[2]):
                fname, lineno, fn, text = frame
                print("DBG:: Error in %s on line %d" % (fname, lineno))