Example #1
 def l4_parse_poetry(self, response):
     '''
     Level 4: parse poetry page, extract poetry, and save.
     '''
     
     try:
         # extract the poem; match classes with contains() because the site's
         # class attributes include trailing whitespace (e.g. 'PoemTextHost  ')
         stanza_selectors = response.xpath("//div[contains(@class,'poemContainer')]/div[contains(@class,'PoemTextHost')]/div[contains(@class,'PoemDisplay') and contains(@class,'OrgTextDisplay')]/div")
         poem = ''
         for s in stanza_selectors:
             line_selectors = s.xpath("./p")
             for l in line_selectors:
                 line = l.xpath(".//text()").extract()
                 line = ''.join(line)
                 line = line.strip()
                 poem = poem + line + '\n'
             poem = poem + '\n'
         poem = poem[:-1]  # strip the trailing '\n' from the poem
         
         # extract title of the poem
         title = response.xpath("//div[@class='shayariContainerDiv']/div[@class='left_pan_shayari']/div[@class='shayari_first']/h1/text()").extract()[0]
         
         # extract the poet name from the profile link (not from the display
         # heading) because the name must be in English
         poet_href = response.xpath("//div[@class='artist_img']//a/@href").extract()[0]
         p = re.compile(r'poets/(.+)/')  # e.g. href="/poets/anjum-tarazi/?lang=Hi"
         poet = p.search(poet_href).group(1)
         poet = poet.replace('-', ' ')
         
         # check response.url for language information: https://.....xyz/?lang=hi
         tmp = response.url
         tmp = tmp.split('?')
         url = tmp[0]
         language = tmp[1].split('=')[1]
         
         data = {}
         data['poem'] = poem
         data['url'] = url
         data['title'] = title.strip()
         data['author'] = poet.title()
         data['language'] = language
          
         # Store this information in the DB
         save_to_db_poem(data)
         
     except Exception:
         print("ERROR: l4_parse_poetry: Unexpected error: %s" % sys.exc_info()[0])
         _trace = ''
         for frame in traceback.extract_tb(sys.exc_info()[2]):
             fname, lineno, fn, text = frame
             print("DBG:: Error in %s on line %d" % (fname, lineno))
             _trace = _trace + "Error in %s on line %d" % (fname, lineno)
         with open(self.LOGFILE, "a") as outfile:
             t = time.asctime( time.localtime(time.time()) )
             json.dump({'link': response.url, 'error': 'parsing failed', 'trace': _trace, 'time': t}, outfile, indent=4)
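
The stanza/line walk above is easy to test outside a spider. Below is a minimal Python 3 sketch using parsel (the selector library underneath Scrapy) against inline HTML; the markup mirrors the classes quoted above, and contains(@class, ...) is used because the site's class attributes carry trailing whitespace that defeats an exact @class= match.

    from parsel import Selector  # standalone selector library used by Scrapy

    html = """
    <div class="poemContainer"><div class="PoemTextHost  ">
      <div class="PoemDisplay OrgTextDisplay ">
        <div><p>first line</p><p>second line</p></div>
        <div><p>third line</p></div>
      </div>
    </div></div>
    """

    sel = Selector(text=html)
    stanzas = sel.xpath("//div[contains(@class,'PoemDisplay') and "
                        "contains(@class,'OrgTextDisplay')]/div")
    poem = ''
    for stanza in stanzas:
        for line in stanza.xpath("./p"):
            poem += ''.join(line.xpath(".//text()").getall()).strip() + '\n'
        poem += '\n'
    poem = poem[:-1]  # drop the trailing newline, as the spider does
    print(poem)  # prints two stanzas separated by a blank line
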
Example #2
 def l4_parse_poetry(self, response):
     '''
     Level 4: parse poetry page, extract poetry, and save.
     '''
     
     try:
         # extract data
         content = response.xpath("//div[@id='ImageHost']//div[@class='PoemDisplay']/span").extract()[0]
         title = response.xpath("//div[@class='shayariContainerDiv']/div[@class='left_pan_shayari']/div[@class='shayari_first']/h1/text()").extract()[0]
         
         # take the poet name from the profile link (not from the display
         # heading) because the name must be in English
         poet_href = response.xpath("//div[@class='artist_img']//a/@href").extract()[0]
         p = re.compile(r'poets/(.+)/')  # e.g. href="/poets/anjum-tarazi/?lang=Hi"
         poet = p.search(poet_href).group(1)
         poet = poet.replace('-', ' ')
         
         # remove outer span tag
         poem = remove_tags(content, which_ones=('span',))  # tag names must be an iterable; a bare 'span' would be read as the tags s, p, a, n (w3lib)
     except Exception:
         print("ERROR: l4_parse_poetry: failed to extract data")
         with open("exception_poetry.txt", "a") as outfile:
             json.dump({'link': response.url, 'error': 'parsing failed'}, outfile, indent=4)
         return  # poem/title/poet are undefined beyond this point
     
     try:
         # check response.url for language information: https://.....xyz/?lang=hi
         tmp = response.url
         tmp = tmp.split('?')
         url = tmp[0]
         language = tmp[1].split('=')[1]
         
         # Correct the line order of the poem and save it
         refined_poem = refine_poetry(poem, url, language)
         if refined_poem:
             data = {}
             data['poem'] = refined_poem
             data['url'] = url
             data['title'] = title
             data['author'] = poet.title()
             data['language'] = language
             
             # Store this information in the DB
             save_to_db_poem(data)
         else:
             with open("exception_poetry.txt", "a") as outfile:
                 json.dump({'link': response.url, 'error': 'refine_poetry failed'}, outfile, indent=4)
     
     except Exception:
         print("ERROR: l4_parse_poetry: Unexpected error: %s" % sys.exc_info()[0])
         for frame in traceback.extract_tb(sys.exc_info()[2]):
             fname, lineno, fn, text = frame
             print("DBG:: Error in %s on line %d" % (fname, lineno))
         with open("exception_poetry.txt", "a") as outfile:
             # str() because an exception class is not JSON-serializable
             json.dump({'link': response.url, 'error': str(sys.exc_info()[0])}, outfile, indent=4)
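
Two patterns recur in these snippets: pulling the poet's slug out of the profile href, and recovering the language from the ?lang= query. A Python 3 sketch of both using only the standard library (poet_from_href and split_url_and_language are hypothetical names; default_lang is an assumption the original does not make, it simply raises IndexError when the query string is missing and falls into the except branch):

    import re
    from urllib.parse import urlsplit, parse_qs

    def poet_from_href(href):
        """Turn a profile link like '/poets/anjum-tarazi/?lang=Hi' into 'Anjum Tarazi'."""
        # [^/]+ instead of the greedy .+ used above, so the match cannot
        # swallow a nested path segment
        m = re.search(r'poets/([^/]+)/', href)
        return m.group(1).replace('-', ' ').title() if m else None

    def split_url_and_language(page_url, default_lang='hi'):
        """Return (base_url, language) without the IndexError risk of split('?')."""
        parts = urlsplit(page_url)
        base_url = '%s://%s%s' % (parts.scheme, parts.netloc, parts.path)
        return base_url, parse_qs(parts.query).get('lang', [default_lang])[0]

    print(poet_from_href('/poets/anjum-tarazi/?lang=Hi'))
    # Anjum Tarazi
    print(split_url_and_language('https://example.org/poem/?lang=hi'))
    # ('https://example.org/poem/', 'hi')
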
Example #3
    def l3_parse_article_page(self,response):
        """
        First check whether the page contains div[@class='poem'] in the XPath.
        1. If found, extract the poem and save it in the database.
        2. If not found, call l2_parse_author_page again, because the page contains a list of poems in a journal.
        """
        try:
            print "DBG:: l3_parse_article_page: Extracting poem from Article page"
            p = response.xpath("//div[@id='mw-content-text']/div[@class='poem']//p").extract()
            poem = " ".join(x.encode('utf-8') for x in p)
            try:
                h1 = response.xpath("//h1[@id='firstHeading']//text()").extract()[0].encode('utf-8')
                title = h1
                author = h1.split('/')[-1]                
                
                data = {}
                data['poem'] = poem
                data['url'] = response.url.encode('utf-8')        
                data['title'] = title
                data['author'] = author.title()                
                data['language'] = 'hi'  # content of this site is in Hindi
                
                # Store this information in the DB
                save_to_db_poem(data)

            except Exception:
                print("ERROR:: l3_parse_article_page: Title not found")
        except Exception:
            # Extract article links from the Author page and generate a request for each
            try:
                print "DBG:: l3_parse_article_page: Extracting poem links from Author page"
                articles = response.xpath("//div[@id='mw-content-text']/ul/li/a/@href").extract()
        
                articles_links = [self.domain_name+x for x in articles]
                for url in articles_links:
                    # Check whether an entry for ``url`` exists in the db,
                    # and find the languages in which its content is already stored.
                    lang_list = get_language_list_for_url(url)
                    # Crawl the poetry page only for the remaining languages.
                    for lang in (x for x in self.LANGUAGES if x not in lang_list):
                        yield scrapy.Request(url, callback=self.l3_parse_article_page)
            except Exception:
                print("DBG:: Nothing found in Author page!")
                print("ERROR: l3_parse_article_page: Unexpected error: %s" % sys.exc_info()[0])
                for frame in traceback.extract_tb(sys.exc_info()[2]):
                    fname, lineno, fn, text = frame
                    print("DBG:: Error in %s on line %d" % (fname, lineno))
Example #4
 def parse_article_page(self, response):
     self.count_visit_article = self.count_visit_article + 1
     try:
         print "Extracting poem ", self.count_visit_article, " from Article page"
         p = response.xpath("/html/body/table//tr[2]/td/table//tr/td/pre").extract()
         poem = " ".join(x.encode('utf-8') for x in p)
         
         if poem:
             title = response.xpath("/html/body/table//tr[2]/td/table//tr/td/p/strong/text()").extract()[0].encode('utf-8')
             
             self.count_articles = self.count_articles + 1
             
             data = {}
             data['index'] = self.count_articles
             data['title'] = title
             data['author'] = ''
             data['poem'] = poem
             data['url'] = response.url.encode('utf-8')
             
             # Store this information in the DB
             save_to_db_poem(data)
             
         else:
             print "First method failed trying another xpath"
             p = response.xpath("/html/body/center/p").extract()
             poem = " ".join(x.encode('utf-8') for x in p)
             
             if poem:
                 title = response.xpath("/html/body/center/p[1]/strong/text()").extract()[0].encode('utf-8')
                 author = response.xpath("/html/body/center/p[1]/a/em/text()").extract()[0].encode('utf-8')
                 self.count_articles = self.count_articles + 1
                 
                 data = {}
                 data['index'] = self.count_articles
                 data['title'] = "".join(x.encode('utf-8') for x in title)
                 data['author'] = "".join(x.encode('utf-8') for x in author)
                 data['poem'] = poem
                 data['url'] = response.url.encode('utf-8')
                 
                 # Store this information in the DB
                 save_to_db_poem(data)
             
             else:
                 print "Both method failed write it in file for further processing"
     except Exception:
         print("Error on Article page!")
         with open("up_exception_article.txt", "a") as outfile:
             json.dump({'index': self.count_visit_article, 'link': response.url}, outfile, indent=4)
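
The control flow above, try a primary XPath and fall back to a second one when it matches nothing, can be factored into a tiny helper. A Python 3 sketch with parsel (first_match is a hypothetical name, not part of the spider):

    from parsel import Selector

    def first_match(sel, *xpaths):
        """Return the results of the first XPath that matches anything."""
        for xp in xpaths:
            found = sel.xpath(xp).getall()
            if found:
                return found
        return []

    sel = Selector(text='<center><p><strong>Title</strong> text</p></center>')
    print(first_match(sel, '//pre', '//center/p'))
    # ['<p><strong>Title</strong> text</p>']  <- the fallback matched
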
Example #5
    def parse_poetry(self, response):
        '''
        Parse poetry page, extract poetry, and save.
        '''
        self.logger.debug("parse_poetry: IN.")
        try:
            # extract poem
            stanza_selectors = response.xpath(
                "//div[contains(@class,'mainContentBody')]/div[contains(@class,'poemPageContentBody')]/div/div"
            )
            poem = ''
            for s in stanza_selectors:
                line_selectors = s.xpath(".//p")
                for l in line_selectors:
                    line = l.xpath(".//text()").extract()
                    line = ''.join(line)
                    line = line.strip()
                    poem = poem + line + '\n'
                poem = poem + '\n'
            poem = poem[:-1]  # strip the trailing '\n' from the poem

            # extract title of the poem
            title = response.xpath(
                "//div[contains(@class,'mainContentBody')]/div[contains(@class,'poemPageContentHeader')]/h1/text()"
            ).extract()[0]

            # extract poet name
            # poet name must be in english.
            poet_href = response.xpath(
                "//div[contains(@class,'mainContentBody')]/div[contains(@class,'poemPageContentHeader')]//a[contains(@class,'ghazalAuthor')]/@href"
            ).extract()[0]
            p = re.compile(r'poets/(.+)/')  # e.g. href="/poets/anjum-tarazi/?lang=Hi"
            poet = p.search(poet_href).group(1)
            poet = poet.replace('-', ' ')

            # check response.url for language information: https://.....xyz/?lang=hi
            tmp = response.url
            tmp = tmp.split('?')
            url = tmp[0]
            language = tmp[1].split('=')[1]

            data = {}
            data['poem'] = poem
            data['url'] = url
            data['title'] = title.strip()
            data['author'] = poet.title()
            data['language'] = language

            # Store this information in the DB
            save_to_db_poem(data)

        except Exception:
            self.logger.error("parse_poetry: %s", sys.exc_info()[0])
            _trace = ''
            for frame in traceback.extract_tb(sys.exc_info()[2]):
                fname, lineno, fn, text = frame
                self.logger.error("error in %s on line %d" % (fname, lineno))
                _trace = _trace + "error in %s on line %d" % (fname, lineno)
            with open(self.LOGFILE, "a") as outfile:
                t = time.asctime(time.localtime(time.time()))
                json.dump(
                    {
                        'link': response.url,
                        'error': 'parsing poetry failed',
                        'trace': _trace,
                        'time': t
                    },
                    outfile,
                    indent=4)
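
The handler above appends one indented JSON object per failure, so the log file ends up holding concatenated JSON objects rather than a single document; read it back with a streaming parser or split it heuristically. A self-contained Python 3 sketch of the same logging (log_failure is a hypothetical name):

    import json
    import sys
    import time
    import traceback

    def log_failure(logfile, link, error):
        """Append one JSON record describing the exception currently being handled."""
        trace = ''.join('error in %s on line %d\n' % (f.filename, f.lineno)
                        for f in traceback.extract_tb(sys.exc_info()[2]))
        with open(logfile, 'a') as outfile:
            json.dump({'link': link, 'error': error, 'trace': trace,
                       'time': time.asctime()}, outfile, indent=4)

    try:
        1 / 0
    except ZeroDivisionError:
        log_failure('errors.log', 'https://example.org/poem', 'parsing poetry failed')
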
Example #6
    def parse_poetry_page(self, response):
        """
        Parse the poetry page.
        1. First check whether the page contains poetry.
        2. If no poetry is found, call parse_url_list, because the page may contain a list of poems/collections/books.
        3. If poetry is found, extract the poem and save it in the database.
        """
        self.logger.debug("parse_poetry_page: IN.")
        self.crawler.stats.inc_value('kangaroo/poetry_page_visit')
        flag_poetry_found = True

        ##
        # Find out whether the page contains poetry
        try:
            p = response.xpath(
                "//div[@id='mw-content-text']/div[@class='poem']//p").extract(
                )
            if len(p) is 0:
                # in some pages, the poetry is not in div[@class='poem']
                # e.g. http://www.kavitakosh.org/kk/%E0%A4%AE%E0%A4%A8_%E0%A4%B2%E0%A4%BE%E0%A4%97%E0%A5%8D%E0%A4%AF%E0%A5%8B_%E0%A4%AE%E0%A5%87%E0%A4%B0%E0%A5%8B_%E0%A4%AF%E0%A4%BE%E0%A4%B0_%E0%A4%AB%E0%A4%BC%E0%A4%95%E0%A5%80%E0%A4%B0%E0%A5%80_%E0%A4%AE%E0%A5%87%E0%A4%82_/_%E0%A4%95%E0%A4%AC%E0%A5%80%E0%A4%B0
                p = response.xpath("//div[@id='mw-content-text']//p").extract()

                if len(p):
                    # Now check the length of the text under the <p> tags,
                    # because the page may contain empty <p> tags or a one-line
                    # note stating that the poetry is not available.
                    # We assume a real poem is longer than ARTICLE_MIN_LEN.
                    p_t = response.xpath(
                        "//div[@id='mw-content-text']//p/text()").extract()
                    p_t = "".join(x.encode('utf-8') for x in p_t)
                    if len(p_t) <= ARTICLE_MIN_LEN:
                        flag_poetry_found = False
                else:
                    flag_poetry_found = False
            else:
                # Check the length of the article
                p_t = response.xpath(
                    "//div[@id='mw-content-text']/div[@class='poem']//p/text()"
                ).extract()
                p_t = "".join(x.encode('utf-8') for x in p_t)
                if len(p_t) <= ARTICLE_MIN_LEN:
                    flag_poetry_found = False

        except Exception:
            self.logger.error("parse_poetry_page: xpath error.")
            flag_poetry_found = False

        ##
        # If poetry not found...
        if not flag_poetry_found:
            # The page may contain a list of poems/collections/books
            self.logger.info(
                'parse_poetry_page: no poetry found on this page.')
            return self.parse_url_list(response)

        ##
        # If Poetry found...
        # This is a poetry page; extract the poetry
        try:
            h1 = response.xpath(
                "//h1[@id='firstHeading']//text()").extract()[0].encode('utf-8')
            h1_list = h1.split('/')
            title = '/'.join(h1_list[:-1])
            poet = h1_list[-1]
            # Process and create Poetry
            poem = " ".join(x.encode('utf-8') for x in p)

            data = {}
            data['poem'] = poem
            data['url'] = response.url.encode('utf-8')
            data['title'] = title
            data['author'] = poet.strip()
            data['language'] = 'hi'  # content of this site is in Hindi

            # Store this information in the DB
            save_to_db_poem(data)
            self.crawler.stats.inc_value('kangaroo/poetry_found')

        except Exception:
            self.logger.error("parse_poetry_page: %s", sys.exc_info()[0])
            _trace = ''
            for frame in traceback.extract_tb(sys.exc_info()[2]):
                fname, lineno, fn, text = frame
                self.logger.error("error in %s on line %d" % (fname, lineno))
                _trace = _trace + "error in %s on line %d" % (fname, lineno)
            with open(self.LOGFILE, "a") as outfile:
                t = time.asctime(time.localtime(time.time()))
                json.dump(
                    {
                        'link': response.url,
                        'error': 'parsing poetry failed',
                        'trace': _trace,
                        'time': t
                    },
                    outfile,
                    indent=4)
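
The ARTICLE_MIN_LEN check is the heart of the poetry-or-listing decision: pages whose <p> text is too short are treated as stubs or link lists and handed to parse_url_list. A Python 3 sketch of the heuristic (looks_like_poetry is a hypothetical name, and the threshold of 100 characters is an assumption; the real constant is defined elsewhere in the project):

    ARTICLE_MIN_LEN = 100  # assumed value; the project defines the real one

    def looks_like_poetry(paragraph_texts, min_len=ARTICLE_MIN_LEN):
        """True when the joined <p> text is long enough to be a poem,
        filtering out empty paragraphs and 'not available' notices."""
        return len(''.join(paragraph_texts)) > min_len

    print(looks_like_poetry(['Yah rachna abhi uplabdh nahin hai.']))  # False
    print(looks_like_poetry(['ek pankti hai\n'] * 20))                # True
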