def test_categorize_relative_urls_provided(self):
    """A relative feed href is resolved against the base_url given to feed()."""
    parser = LinkParser()
    parser.feed(
        base_url='http://feeds.huffingtonpost.com',
        html=''' <link rel="alternate" type="application/rss+xml" title="The Full Feed" href="huffingtonpost/raw_feed" /> ''',
    )
    # The parser should report the base URL it was handed...
    self.assertEqual(parser.find_base_url(), 'http://feeds.huffingtonpost.com')
    # ...and the relative href should have been categorized as an RSS feed.
    self.assertListEqual([SAMPLE_RSS], parser.data[RSS_KEY])
def xtest_500plus_links(self):
    """Disabled (x-prefixed) regression test against a large recorded page.

    Feeds a saved HTML fixture and compares the extracted feed URLs,
    order-insensitively, with a saved JSON snapshot.
    """
    parser = LinkParser()
    parser.feed(read_file('01_input.html'), timeout=60)
    expected = json.loads(read_file('01_output.json'))
    for key in (RSS_KEY, ATOM_KEY):
        self.assertSetEqual(set(expected[key]), set(parser.data[key]))
def test_multiple_runs(self):
    """Results accumulate across successive feed() calls on one parser."""
    parser = LinkParser()
    # First document: a canonical link plus a relative RSS alternate.
    parser.feed(''' <link rel="canonical" href="http://feeds.huffingtonpost.com" /> <link rel="alternate" type="application/rss+xml" title="The Full Feed" href="huffingtonpost/raw_feed" /> ''')
    # Second document: an absolute Atom alternate.
    parser.feed('<link rel="alternate" type="application/atom+xml" href="http://feeds.feedburner.com/PTCC" />')
    self.assertListEqual([SAMPLE_RSS], parser.data[RSS_KEY])
    self.assertListEqual([SAMPLE_ATOM], parser.data[ATOM_KEY])
def crawlAllUrl(self, outputFlag=False, crawlAmountLimit=CRAWL_AMOUNT_LIMIT):
    """Drain the shared URL queue, crawling each URL in turn.

    Runs until the queue is empty or crawlAmountLimit pages have been
    crawled in total (the counter is shared across worker threads).

    outputFlag       -- forwarded to crawlUrl
    crawlAmountLimit -- upper bound on Crawler.crawledAmount
    """
    while len(Crawler.urlList) > 0:
        # Pop under the lock so two workers never take the same URL;
        # try/finally guarantees the lock is released even if pop raises.
        Crawler.urlRecordLock.acquire()
        try:
            url = Crawler.urlList.pop()
            pathname = self.url2Pathname(url)
            Crawler.urlNotDone.pop(pathname)
            if Crawler.crawledAmount >= crawlAmountLimit:
                break
        finally:
            Crawler.urlRecordLock.release()
        result = self.crawlUrl(NORMAL_SITE, url, outputFlag)
        try:
            urlArr = urlparse.urlparse(url)
            # A failed fetch counts against the site's error budget.
            if result == False:
                Crawler.urlRecordLock.acquire()
                try:
                    # .get replaces the deprecated has_key() check.
                    Crawler.errorCounter[urlArr.netloc] = \
                        Crawler.errorCounter.get(urlArr.netloc, 0) + 1
                finally:
                    Crawler.urlRecordLock.release()
                continue
            # BUG FIX: the original indexed errorCounter directly, which
            # raised KeyError for any site that had never failed (entries
            # are only created on failure). The broad except below then
            # swallowed it and the successfully fetched page was never
            # parsed. Default to 0 errors for unknown sites.
            if Crawler.errorCounter.get(urlArr.netloc, 0) > MIN_ERRORS_ALLOWED_FOR_A_SITE:
                continue
            # Build the directory part of the URL so relative hrefs can be
            # resolved: strip a trailing path segment that looks like a
            # filename (i.e. contains a dot).
            _path = urlArr.path
            rightMostSlashIndex = _path.rfind('/')
            replaced = _path[rightMostSlashIndex:len(_path)]
            if replaced.find('.') != -1:
                _path = _path.replace(replaced, '')
            hostPath = urlArr.scheme + '://' + urlArr.netloc + _path
            parser = LinkParser()
            parser.setFlag(NORMAL_SITE)
            parser.setHostPath(hostPath)
            parser.feed(result)
            urlList = parser.hrefsList
            # Record the discovered URLs and bump the shared counter
            # atomically; finally keeps the lock from leaking on error.
            Crawler.urlRecordLock.acquire()
            try:
                self.addUrlList(urlList)
                Crawler.crawledAmount += 1
            finally:
                Crawler.urlRecordLock.release()
            parser.close()
        except Exception:
            # Best-effort: log the failure against the URL and move on to
            # the next queue entry rather than killing the worker thread.
            self.reportError(url, msg[ERROR_HTML_PARSE])
def __crawlPage(self, pageName):
    """Fetch a single page, collect its links, and append a site-map entry."""
    fullPageName = pageName
    # Absolute URLs are fetched directly; anything else is relative to
    # the configured domain.
    if pageName.find('://') != -1:
        page = self.__getPage(pageName)
    else:
        page = self.__getPage(self.domain + pageName)
    # Normalise a leading '/' in '/page1/page2'-style link urls so we
    # don't end up with http://site//page1/page2.
    if pageName.startswith('/'):
        pageName = pageName[1:]
    linkParser = LinkParser(self.domain)
    linkParser.feed(page)
    pageLinks = linkParser.getLinks()
    self.discovered = self.discovered.union(pageLinks)
    # Store links as a list so the map can be JSON-serialised later.
    self.map.append({'page': fullPageName, 'links': list(pageLinks)})
def main():
    """Entry point: read a keyword, seed the crawl from the search engine,
    then run THREAD_NUM worker threads until the crawl completes."""
    initResult = init.initGlobal()
    if initResult != False:
        # Read the search keyword and encode spaces for the query string.
        print("Please enter your keyword")
        keyword = raw_input()
        keyword = keyword.replace(' ', '+')
        # BUG FIX: the original created a Crawler before this branch and
        # immediately discarded it by re-creating one here; construct it once.
        crawler = Crawler()
        startTime = time.time()
        crawler.loadRecord(LOG_OF_CRAWLED_URL)
        crawler.loadRecord(LOG_OF_CRAWLED_CONTENT)
        crawler.addSearchEngineUrl(keyword)
        # Seed the URL queue with links from the search results page.
        htmlcode = crawler.crawlUrl(GOOGLE)
        parser = LinkParser()
        parser.setFlag(GOOGLE)
        parser.feed(htmlcode)
        top10 = parser.hrefsList
        crawler.addUrlList(top10, GOOGLE)
        parser.close()
        # BUG FIX: '<= THREAD_NUM' spawned THREAD_NUM + 1 workers,
        # contradicting the stated intent of THREAD_NUM threads.
        threadPool = []
        while len(threadPool) < THREAD_NUM:
            th = threading.Thread(None, crawl)
            threadPool.append(th)
        for item in threadPool:
            item.start()
        for item in threadPool:
            item.join()
        crawler.flush()
        endTime = time.time()
        print("time used:")
        print(endTime - startTime)
        # Wait for Enter so console output stays visible before exiting.
        keyword = raw_input()
def test_empty_href(self):
    """Valueless href attributes are skipped; real hrefs are still collected."""
    parser = LinkParser()
    markup = '<a href>test</a><link href><a href="' + SAMPLE_RSS + '"></a>'
    parser.feed(markup)
    self.assertListEqual([SAMPLE_RSS], parser.data[RSS_KEY])