    def test_categorize_relative_urls_provided(self):
        p = LinkParser()
        p.feed(base_url='http://feeds.huffingtonpost.com',
               html='''
            <link rel="alternate" type="application/rss+xml" title="The Full Feed" href="huffingtonpost/raw_feed" />
        ''')
        self.assertEqual(p.find_base_url(), 'http://feeds.huffingtonpost.com')
        self.assertListEqual([SAMPLE_RSS], p.data[RSS_KEY])
    def xtest_500plus_links(self):
        p = LinkParser()

        input_html = read_file('01_input.html')
        p.feed(input_html, timeout=60)
        output_json = read_file('01_output.json')
        data = json.loads(output_json)

        self.assertSetEqual(set(data[RSS_KEY]), set(p.data[RSS_KEY]))
        self.assertSetEqual(set(data[ATOM_KEY]), set(p.data[ATOM_KEY]))
    def test_multiple_runs(self):
        p = LinkParser()
        p.feed('''
            <link rel="canonical" href="http://feeds.huffingtonpost.com" />
            <link rel="alternate" type="application/rss+xml" title="The Full Feed" href="huffingtonpost/raw_feed" />
        ''')
        p.feed(
            '<link rel="alternate" type="application/atom+xml" href="http://feeds.feedburner.com/PTCC" />'
        )
        self.assertListEqual([SAMPLE_RSS], p.data[RSS_KEY])
        self.assertListEqual([SAMPLE_ATOM], p.data[ATOM_KEY])
    def crawlAllUrl(self, outputFlag=False, crawlAmountLimit=CRAWL_AMOUNT_LIMIT):
        while len(Crawler.urlList) > 0:
            # lock the shared records while taking the next url off the queue
            Crawler.urlRecordLock.acquire()
            url = Crawler.urlList.pop()
            pathname = self.url2Pathname(url)
            Crawler.urlNotDone.pop(pathname)

            if Crawler.crawledAmount >= crawlAmountLimit:
                Crawler.urlRecordLock.release()
                break
            Crawler.urlRecordLock.release()

            result = self.crawlUrl(NORMAL_SITE, url, outputFlag)
            try:
                urlArr = urlparse.urlparse(url)
                # if the url could not be crawled, count the error against its site and skip it
                if result is False:
                    Crawler.urlRecordLock.acquire()
                    if urlArr.netloc in Crawler.errorCounter:
                        Crawler.errorCounter[urlArr.netloc] += 1
                    else:
                        Crawler.errorCounter[urlArr.netloc] = 1
                    Crawler.urlRecordLock.release()
                    continue
                # do not parse pages from sites that have already failed more often than allowed
                if Crawler.errorCounter.get(urlArr.netloc, 0) > MIN_ERRORS_ALLOWED_FOR_A_SITE:
                    continue

                # strip a trailing file name (e.g. /dir/page.html -> /dir) so that
                # relative links on the page resolve against the right base path
                _path = urlArr.path
                rightMostSlashIndex = _path.rfind('/')
                replaced = _path[rightMostSlashIndex:]
                if replaced.find('.') != -1:
                    _path = _path.replace(replaced, '')
                hostPath = urlArr.scheme + '://' + urlArr.netloc + _path

                parser = LinkParser()
                parser.setFlag(NORMAL_SITE)
                parser.setHostPath(hostPath)
                parser.feed(result)
                urlList = parser.hrefsList

                Crawler.urlRecordLock.acquire()
                self.addUrlList(urlList)
                Crawler.crawledAmount += 1
                Crawler.urlRecordLock.release()

                parser.close()

            except Exception as e:
                #print(e)
                self.reportError(url, msg[ERROR_HTML_PARSE])
    def __crawlPage(self, pageName):
        fullPageName = pageName
        # Some link urls come in the format /page1/page2; strip the leading
        # slash before building the request so we avoid http://site//page1/page2
        if pageName.startswith('/'):
            pageName = pageName[1:]
        # If the page is already an absolute url go directly there,
        # otherwise prepend the domain
        if '://' in pageName:
            page = self.__getPage(pageName)
        else:
            page = self.__getPage(self.domain + pageName)
        parser = LinkParser(self.domain)
        parser.feed(page)
        pageLinks = parser.getLinks()
        self.discovered = self.discovered.union(pageLinks)
        # Convert links to a list for later json serialisation
        self.map.append({'page': fullPageName, 'links': list(pageLinks)})
def main():
    initResult = init.initGlobal()
    if initResult != False:
        # read the search keyword from the user
        print("Please enter your keyword")
        keyword = raw_input()
        keyword = keyword.replace(' ', '+')

        # start crawling from the search engine results
        crawler = Crawler()
        startTime = time.time()
        crawler.loadRecord(LOG_OF_CRAWLED_URL)
        crawler.loadRecord(LOG_OF_CRAWLED_CONTENT)
        crawler.addSearchEngineUrl(keyword)
        htmlcode = crawler.crawlUrl(GOOGLE)
        parser = LinkParser()
        parser.setFlag(GOOGLE)
        parser.feed(htmlcode)
        top10 = parser.hrefsList
        crawler.addUrlList(top10, GOOGLE)

        parser.close()
        threadPool = []
        # run the work with THREAD_NUM threads
        while len(threadPool) < THREAD_NUM:
            th = threading.Thread(target=crawl)
            threadPool.append(th)
            
        # start all worker threads, then wait for them to finish
        for item in threadPool:
            item.start()
        for item in threadPool:
            item.join()
              
        crawler.flush()
        endTime = time.time()
        print("time used:")
        print(endTime - startTime)
        # wait for the user to press Enter before exiting
        raw_input()
    def test_empty_href(self):
        p = LinkParser()
        p.feed('<a href>test</a><link href><a href="' + SAMPLE_RSS + '"></a>')
        self.assertListEqual([SAMPLE_RSS], p.data[RSS_KEY])