Python retrieveSource Examples, seed.buildSeed.retrieveSource Python Examples

Example #1

0

Show file

File: tsdDaily.py Project: clapmyhands/ntuwebcrawler

    def crawlFrontierL2(self, currentQuery):
        baseURL = "http://www.thesundaily.my"
        seedURL = "http://www.thesundaily.my"  # Where the seed links are located
        xpath = "//div[@id='primary']//ul//li//a"

        date = currentQuery.split('-')  # date
        pageExists = 1
        page = 0

        allReturnedURL = list()
        allCatURL = list()
        catCount = 0

        date = currentQuery.split('-')  # date

        allCatURL = buildSeed.crawlFrontierL1(self, currentQuery, baseURL,
                                              xpath, seedURL)

        print("Found %d categories" % len(allCatURL))

        for catURL in allCatURL:

            articleURL = catURL[0]
            articleCount = 0
            category = catURL[2]

            while page < 5:
                doc = buildSeed.retrieveSource(
                    self, articleURL + "?page=" + str(page))

                for returnedArticleUrl in doc.xpath(
                        '//div[@class="view-content"]//h2[@class="node-title"]//a'
                ):

                    url = baseURL + str(returnedArticleUrl.attrib.get(
                        'href'))  # Getting attribute value

                    allReturnedURL.append([
                        url,
                        str(returnedArticleUrl.text_content()), category,
                        str(date[0]) + str(date[1]) + str(date[2])
                    ])

                    articleCount += 1

                catCount += 1

                #if len(allReturnedURL) == 100:
                #    break
                page += 1
                print(
                    "Collecting articles for category %s, page %d, total %d." %
                    (category, page, len(allReturnedURL)))

        return allReturnedURL

Example #2

0

Show file

    def crawlFrontierL2(self, currentQuery):
        baseURL = "http://www.utusan.com.my"
        seedURL = "http://www.utusan.com.my/special/arkib"  # Where the seed links are located
        xpath = "//div[@class='menu menuTop menuTwo']//ul//a"

        allReturnedURL = list()
        allCatURL = list()
        catCount = 0

        date = currentQuery.split('-')  # date

        allCatURL = buildSeed.crawlFrontierL1(self, currentQuery, baseURL,
                                              xpath, seedURL)

        print("Found %d categories" % len(allCatURL))

        for catURL in allCatURL:

            articleURL = catURL[0]
            articleCount = 0
            category = catURL[2]

            if category == "terkini" or category == "video":
                continue

            doc = buildSeed.retrieveSource(self, articleURL)

            for returnedArticleUrl in doc.xpath(
                    '//div[@class="element teaser"]//h2//a'):

                url = baseURL + str(returnedArticleUrl.attrib.get(
                    'href'))  # Getting attribute value

                allReturnedURL.append([
                    url,
                    str(returnedArticleUrl.text_content()), category,
                    str(date[0]) + str(date[1]) + str(date[2])
                ])

                articleCount += 1

            catCount += 1

            #if len(allReturnedURL) == 100:
            #    break

            print("%d Articles for category %d. %s" %
                  (articleCount, catCount, category))

        return allReturnedURL

Example #3

0

Show file

File: tsdDaily.py Project: clapmyhands/ntuwebcrawler

 def crawlFrontierL2(self, currentQuery):
     baseURL = "http://www.thesundaily.my"
     seedURL = "http://www.thesundaily.my" # Where the seed links are located
     xpath = "//div[@id='primary']//ul//li//a"
     
     date      = currentQuery.split('-') # date
     pageExists = 1
     page = 0
     
     allReturnedURL = list()
     allCatURL = list()
     catCount = 0
     
     date = currentQuery.split('-') # date
     
     allCatURL = buildSeed.crawlFrontierL1(self, currentQuery, baseURL, xpath, seedURL)
     
     print("Found %d categories" % len(allCatURL))     
     
     for catURL in allCatURL:
             
             articleURL = catURL[0]
             articleCount = 0
             category = catURL[2]
             
             while page < 5:
                 doc = buildSeed.retrieveSource(self, articleURL + "?page=" + str(page))                
                             
                 for returnedArticleUrl in doc.xpath('//div[@class="view-content"]//h2[@class="node-title"]//a') :
                     
                     url = baseURL + str(returnedArticleUrl.attrib.get('href')) # Getting attribute value
 
                     allReturnedURL.append([url, str(returnedArticleUrl.text_content()), category, str(date[0]) + str(date[1]) + str(date[2]) ] )
                     
                     articleCount += 1
                     
                 catCount += 1
                 
                 #if len(allReturnedURL) == 100:
                 #    break
                 page += 1
                 print("Collecting articles for category %s, page %d, total %d." % (category, page, len(allReturnedURL)))
             
     return allReturnedURL

Example #4

0

Show file

File: utusanDaily.py Project: clapmyhands/ntuwebcrawler

    def crawlFrontierL2(self, currentQuery):
        baseURL = "http://www.utusan.com.my"
        seedURL = "http://www.utusan.com.my/special/arkib" # Where the seed links are located
        xpath = "//div[@class='menu menuTop menuTwo']//ul//a"
        
        allReturnedURL = list()
        allCatURL = list()
        catCount = 0
        
        date = currentQuery.split('-') # date
        
        allCatURL = buildSeed.crawlFrontierL1(self, currentQuery, baseURL, xpath, seedURL)
        
        print("Found %d categories" % len(allCatURL))     
        
        for catURL in allCatURL:
                
                articleURL = catURL[0]
                articleCount = 0
                category = catURL[2]
                
                if category == "terkini" or category == "video":
                    continue
                
                doc = buildSeed.retrieveSource(self, articleURL)                
                
                for returnedArticleUrl in doc.xpath('//div[@class="element teaser"]//h2//a') :
                    
                    url = baseURL + str(returnedArticleUrl.attrib.get('href')) # Getting attribute value

                    allReturnedURL.append([url, str(returnedArticleUrl.text_content()), category, str(date[0]) + str(date[1]) + str(date[2]) ] )
                    
                    articleCount += 1
                    
                catCount += 1
                
                #if len(allReturnedURL) == 100:
                #    break
                
                print("%d Articles for category %d. %s" % (articleCount, catCount, category))
                
        return allReturnedURL