def test_categorize_relative_urls_provided(self):
    """A relative feed href is resolved against the base_url given to feed()."""
    parser = LinkParser()
    parser.feed(
        base_url='http://feeds.huffingtonpost.com',
        html=''' <link rel="alternate" type="application/rss+xml" title="The Full Feed" href="huffingtonpost/raw_feed" /> ''',
    )
    # The parser should report the base URL it was handed...
    self.assertEqual(parser.find_base_url(), 'http://feeds.huffingtonpost.com')
    # ...and the relative href should have been categorized as an RSS feed.
    self.assertListEqual([SAMPLE_RSS], parser.data[RSS_KEY])
def xtest_500plus_links(self):
    """Disabled (x-prefixed) regression test against a large recorded page.

    Feeds a saved HTML fixture and compares the extracted feed URLs,
    order-insensitively, with a saved JSON snapshot.
    """
    parser = LinkParser()
    parser.feed(read_file('01_input.html'), timeout=60)
    expected = json.loads(read_file('01_output.json'))
    for key in (RSS_KEY, ATOM_KEY):
        self.assertSetEqual(set(expected[key]), set(parser.data[key]))
def test_multiple_runs(self):
    """Results accumulate across successive feed() calls on one parser."""
    parser = LinkParser()
    # First document: a canonical link plus a relative RSS alternate.
    parser.feed(''' <link rel="canonical" href="http://feeds.huffingtonpost.com" /> <link rel="alternate" type="application/rss+xml" title="The Full Feed" href="huffingtonpost/raw_feed" /> ''')
    # Second document: an absolute Atom alternate.
    parser.feed('<link rel="alternate" type="application/atom+xml" href="http://feeds.feedburner.com/PTCC" />')
    self.assertListEqual([SAMPLE_RSS], parser.data[RSS_KEY])
    self.assertListEqual([SAMPLE_ATOM], parser.data[ATOM_KEY])
def crawlAllUrl(self, outputFlag=False, crawlAmountLimit=CRAWL_AMOUNT_LIMIT):
    """Drain the shared URL queue, crawling each URL in turn.

    Runs until the queue is empty or crawlAmountLimit pages have been
    crawled in total (the counter is shared across worker threads).

    outputFlag       -- forwarded to crawlUrl
    crawlAmountLimit -- upper bound on Crawler.crawledAmount
    """
    while len(Crawler.urlList) > 0:
        # Pop under the lock so two workers never take the same URL;
        # try/finally guarantees the lock is released even if pop raises.
        Crawler.urlRecordLock.acquire()
        try:
            url = Crawler.urlList.pop()
            pathname = self.url2Pathname(url)
            Crawler.urlNotDone.pop(pathname)
            if Crawler.crawledAmount >= crawlAmountLimit:
                break
        finally:
            Crawler.urlRecordLock.release()
        result = self.crawlUrl(NORMAL_SITE, url, outputFlag)
        try:
            urlArr = urlparse.urlparse(url)
            # A failed fetch counts against the site's error budget.
            if result == False:
                Crawler.urlRecordLock.acquire()
                try:
                    # .get replaces the deprecated has_key() check.
                    Crawler.errorCounter[urlArr.netloc] = \
                        Crawler.errorCounter.get(urlArr.netloc, 0) + 1
                finally:
                    Crawler.urlRecordLock.release()
                continue
            # BUG FIX: the original indexed errorCounter directly, which
            # raised KeyError for any site that had never failed (entries
            # are only created on failure). The broad except below then
            # swallowed it and the successfully fetched page was never
            # parsed. Default to 0 errors for unknown sites.
            if Crawler.errorCounter.get(urlArr.netloc, 0) > MIN_ERRORS_ALLOWED_FOR_A_SITE:
                continue
            # Build the directory part of the URL so relative hrefs can be
            # resolved: strip a trailing path segment that looks like a
            # filename (i.e. contains a dot).
            _path = urlArr.path
            rightMostSlashIndex = _path.rfind('/')
            replaced = _path[rightMostSlashIndex:len(_path)]
            if replaced.find('.') != -1:
                _path = _path.replace(replaced, '')
            hostPath = urlArr.scheme + '://' + urlArr.netloc + _path
            parser = LinkParser()
            parser.setFlag(NORMAL_SITE)
            parser.setHostPath(hostPath)
            parser.feed(result)
            urlList = parser.hrefsList
            # Record the discovered URLs and bump the shared counter
            # atomically; finally keeps the lock from leaking on error.
            Crawler.urlRecordLock.acquire()
            try:
                self.addUrlList(urlList)
                Crawler.crawledAmount += 1
            finally:
                Crawler.urlRecordLock.release()
            parser.close()
        except Exception:
            # Best-effort: log the failure against the URL and move on to
            # the next queue entry rather than killing the worker thread.
            self.reportError(url, msg[ERROR_HTML_PARSE])
def __crawlPage(self, pageName):
    """Fetch a single page, collect its links, and append a site-map entry."""
    fullPageName = pageName
    # Absolute URLs are fetched directly; anything else is relative to
    # the configured domain.
    if pageName.find('://') != -1:
        page = self.__getPage(pageName)
    else:
        page = self.__getPage(self.domain + pageName)
    # Normalise a leading '/' in '/page1/page2'-style link urls so we
    # don't end up with http://site//page1/page2.
    if pageName.startswith('/'):
        pageName = pageName[1:]
    linkParser = LinkParser(self.domain)
    linkParser.feed(page)
    pageLinks = linkParser.getLinks()
    self.discovered = self.discovered.union(pageLinks)
    # Store links as a list so the map can be JSON-serialised later.
    self.map.append({'page': fullPageName, 'links': list(pageLinks)})
def main():
    """Entry point: read a keyword, seed the crawl from the search engine,
    then run THREAD_NUM worker threads until the crawl completes."""
    initResult = init.initGlobal()
    if initResult != False:
        # Read the search keyword and encode spaces for the query string.
        print("Please enter your keyword")
        keyword = raw_input()
        keyword = keyword.replace(' ', '+')
        # BUG FIX: the original created a Crawler before this branch and
        # immediately discarded it by re-creating one here; construct it once.
        crawler = Crawler()
        startTime = time.time()
        crawler.loadRecord(LOG_OF_CRAWLED_URL)
        crawler.loadRecord(LOG_OF_CRAWLED_CONTENT)
        crawler.addSearchEngineUrl(keyword)
        # Seed the URL queue with links from the search results page.
        htmlcode = crawler.crawlUrl(GOOGLE)
        parser = LinkParser()
        parser.setFlag(GOOGLE)
        parser.feed(htmlcode)
        top10 = parser.hrefsList
        crawler.addUrlList(top10, GOOGLE)
        parser.close()
        # BUG FIX: '<= THREAD_NUM' spawned THREAD_NUM + 1 workers,
        # contradicting the stated intent of THREAD_NUM threads.
        threadPool = []
        while len(threadPool) < THREAD_NUM:
            th = threading.Thread(None, crawl)
            threadPool.append(th)
        for item in threadPool:
            item.start()
        for item in threadPool:
            item.join()
        crawler.flush()
        endTime = time.time()
        print("time used:")
        print(endTime - startTime)
        # Wait for Enter so console output stays visible before exiting.
        keyword = raw_input()
def test_empty_href(self):
    """Valueless href attributes are skipped; real hrefs are still collected."""
    parser = LinkParser()
    markup = '<a href>test</a><link href><a href="' + SAMPLE_RSS + '"></a>'
    parser.feed(markup)
    self.assertListEqual([SAMPLE_RSS], parser.data[RSS_KEY])