def crawlFrontierL2(self, currentQuery): baseURL = "http://www.thesundaily.my" seedURL = "http://www.thesundaily.my" # Where the seed links are located xpath = "//div[@id='primary']//ul//li//a" date = currentQuery.split('-') # date pageExists = 1 page = 0 allReturnedURL = list() allCatURL = list() catCount = 0 date = currentQuery.split('-') # date allCatURL = buildSeed.crawlFrontierL1(self, currentQuery, baseURL, xpath, seedURL) print("Found %d categories" % len(allCatURL)) for catURL in allCatURL: articleURL = catURL[0] articleCount = 0 category = catURL[2] while page < 5: doc = buildSeed.retrieveSource( self, articleURL + "?page=" + str(page)) for returnedArticleUrl in doc.xpath( '//div[@class="view-content"]//h2[@class="node-title"]//a' ): url = baseURL + str(returnedArticleUrl.attrib.get( 'href')) # Getting attribute value allReturnedURL.append([ url, str(returnedArticleUrl.text_content()), category, str(date[0]) + str(date[1]) + str(date[2]) ]) articleCount += 1 catCount += 1 #if len(allReturnedURL) == 100: # break page += 1 print( "Collecting articles for category %s, page %d, total %d." % (category, page, len(allReturnedURL))) return allReturnedURL
def crawlFrontierL2(self, currentQuery): baseURL = "http://www.utusan.com.my" seedURL = "http://www.utusan.com.my/special/arkib" # Where the seed links are located xpath = "//div[@class='menu menuTop menuTwo']//ul//a" allReturnedURL = list() allCatURL = list() catCount = 0 date = currentQuery.split('-') # date allCatURL = buildSeed.crawlFrontierL1(self, currentQuery, baseURL, xpath, seedURL) print("Found %d categories" % len(allCatURL)) for catURL in allCatURL: articleURL = catURL[0] articleCount = 0 category = catURL[2] if category == "terkini" or category == "video": continue doc = buildSeed.retrieveSource(self, articleURL) for returnedArticleUrl in doc.xpath( '//div[@class="element teaser"]//h2//a'): url = baseURL + str(returnedArticleUrl.attrib.get( 'href')) # Getting attribute value allReturnedURL.append([ url, str(returnedArticleUrl.text_content()), category, str(date[0]) + str(date[1]) + str(date[2]) ]) articleCount += 1 catCount += 1 #if len(allReturnedURL) == 100: # break print("%d Articles for category %d. %s" % (articleCount, catCount, category)) return allReturnedURL
def crawlFrontierL2(self, currentQuery): baseURL = "http://www.thesundaily.my" seedURL = "http://www.thesundaily.my" # Where the seed links are located xpath = "//div[@id='primary']//ul//li//a" date = currentQuery.split('-') # date pageExists = 1 page = 0 allReturnedURL = list() allCatURL = list() catCount = 0 date = currentQuery.split('-') # date allCatURL = buildSeed.crawlFrontierL1(self, currentQuery, baseURL, xpath, seedURL) print("Found %d categories" % len(allCatURL)) for catURL in allCatURL: articleURL = catURL[0] articleCount = 0 category = catURL[2] while page < 5: doc = buildSeed.retrieveSource(self, articleURL + "?page=" + str(page)) for returnedArticleUrl in doc.xpath('//div[@class="view-content"]//h2[@class="node-title"]//a') : url = baseURL + str(returnedArticleUrl.attrib.get('href')) # Getting attribute value allReturnedURL.append([url, str(returnedArticleUrl.text_content()), category, str(date[0]) + str(date[1]) + str(date[2]) ] ) articleCount += 1 catCount += 1 #if len(allReturnedURL) == 100: # break page += 1 print("Collecting articles for category %s, page %d, total %d." % (category, page, len(allReturnedURL))) return allReturnedURL
def crawlFrontierL2(self, currentQuery): baseURL = "http://www.utusan.com.my" seedURL = "http://www.utusan.com.my/special/arkib" # Where the seed links are located xpath = "//div[@class='menu menuTop menuTwo']//ul//a" allReturnedURL = list() allCatURL = list() catCount = 0 date = currentQuery.split('-') # date allCatURL = buildSeed.crawlFrontierL1(self, currentQuery, baseURL, xpath, seedURL) print("Found %d categories" % len(allCatURL)) for catURL in allCatURL: articleURL = catURL[0] articleCount = 0 category = catURL[2] if category == "terkini" or category == "video": continue doc = buildSeed.retrieveSource(self, articleURL) for returnedArticleUrl in doc.xpath('//div[@class="element teaser"]//h2//a') : url = baseURL + str(returnedArticleUrl.attrib.get('href')) # Getting attribute value allReturnedURL.append([url, str(returnedArticleUrl.text_content()), category, str(date[0]) + str(date[1]) + str(date[2]) ] ) articleCount += 1 catCount += 1 #if len(allReturnedURL) == 100: # break print("%d Articles for category %d. %s" % (articleCount, catCount, category)) return allReturnedURL