Python UrlManager.get_url Examples

Programming Language: Python

Namespace/Package Name: url_manager

Class/Type: UrlManager

Method/Function: get_url

Examples at hotexamples.com: 3

Python UrlManager.get_url - 3 examples found. These are the top-rated real-world Python examples of url_manager.UrlManager.get_url, extracted from open-source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

UrlManager(27)

add_new_urls(14)

get_new_url(12)

add_new_url(11)

has_new_url(9)

old_url_size(3)

get_url(2)

add_urls(1)

has_url_node(1)

show(1)

save_progress(1)

save_process(1)

save_2_file(1)

reset(1)

put_many(1)

put(1)

old_urls_size(1)

add_new_next_urls(1)

isEmpty_new_urls(1)

has_url(1)

complete(1)

add_old_urls(1)

has_new_rul(1)

get_url_node(1)

add_new_next_url(1)

get_new_urls(1)

add_url(1)

get_new_next_url(1)

get_new_next_count(1)

get_main_seed_url(1)

get(1)

fetch(1)

update(1)

Example #1

Show file

File: wikisum.py Project: TPeterW/summariser

class Crawler(object):
    """Resolve keywords by chaining a URL manager, a downloader and a parser.

    For each keyword the manager supplies a URL, the downloader fetches the
    page at that URL, and the parser's ``search`` result for that page is
    returned to the caller.
    """

    def __init__(self):
        # Collaborator classes are defined outside this block.
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def get_urls(self, keywords):
        """Crawl every keyword and collect the results in a dict.

        Args:
            keywords: Iterable of hashable keywords.

        Returns:
            A dict mapping each keyword to whatever ``crawl`` returned for
            it. A keyword that appears more than once is crawled each time
            and the last result is kept.
        """
        return {word: self.crawl(word) for word in keywords}

    def crawl(self, word):
        """Fetch the page for ``word`` and return the parser's search result.

        Args:
            word: Keyword handed to ``self.manager.get_url``.

        Returns:
            The value returned by ``self.parser.search`` for the downloaded
            page.
        """
        url = self.manager.get_url(word)
        page = self.downloader.download(url)

        return self.parser.search(page)

Example #2

Show file

File: wikisum.py Project: TPeterW/summariser

class Crawler(object):
    """Keyword crawler built from a URL manager, a downloader and a parser."""

    def __init__(self):
        # The three collaborators are instantiated with no arguments; their
        # classes live outside this block.
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def get_urls(self, keywords):
        """Return a dict mapping each keyword to its ``crawl`` result.

        Args:
            keywords: Iterable of hashable keywords; each one is crawled in
                iteration order.

        Returns:
            A dict keyed by keyword. For repeated keywords the result of the
            last crawl wins.
        """
        data = {}
        for word in keywords:
            data[word] = self.crawl(word)

        return data

    def crawl(self, word):
        """Download the page associated with ``word`` and search it.

        Args:
            word: Keyword passed to ``self.manager.get_url`` to obtain the
                URL to download.

        Returns:
            Whatever ``self.parser.search`` returns for the downloaded page.
        """
        page = self.downloader.download(self.manager.get_url(word))

        return self.parser.search(page)

Example #3

Show file

class SpiderMain():
    """Crawl loop that routes every queued URL to a keyword-selected handler.

    ``craw`` seeds the URL manager with a root URL and then, for each URL the
    manager hands back, builds a handler name ``_get_from_<tag>`` from the
    keywords found in the URL (see ``_parse_url``) and calls that handler.
    Handlers either queue more URLs or collect data for the final output.
    """

    def __init__(self):
        # Collaborator classes are defined outside this block.
        self.urlDownLoader = HtmlDownloader()
        self.htmlParser = HtmlParser()
        self.urlManager = UrlManager()
        self.jsondataParser = JsonData_Parser()
        self.htmlOutPuter = HtmlOutPuter()

    def _download_and_parse(self, url, page_type):
        """Download ``url`` and parse the content as a ``page_type`` page."""
        return self.htmlParser.parse(
            htmlContent=self.urlDownLoader.download(url), type=page_type)

    def _get_from_discover_toplist(self, url):
        """Queue the URLs parsed from a 'discover_toplist' page."""
        self.urlManager.add_new_urls(
            self._download_and_parse(url, 'discover_toplist'))

    def _get_from_discover_artist(self, url):
        """Queue the URLs parsed from a 'discover_artist' page."""
        self.urlManager.add_new_urls(
            self._download_and_parse(url, 'discover_artist'))

    def _get_from_artist(self, url):
        """Print each name parsed from an 'artist' page and queue its URLs.

        The parser result is iterated with ``.items()``, so it is expected to
        be a mapping of name -> URLs.
        """
        results = self._download_and_parse(url, 'artist')
        for name, urls in results.items():
            print(name)
            self.urlManager.add_new_urls(urls)

    def _get_from_song(self, url):
        """Collect the parsed JSON data for a 'song' page under its name."""
        name = self._download_and_parse(url, 'song')
        # The message prefix means "Collecting:".
        print("正收集：" + name)
        comments = self.jsondataParser.parse(
            self.urlDownLoader.downloadJsonData(url))
        self.htmlOutPuter.collect_datas({name: comments})

    def _parse_url(self, url):
        """Build the handler tag for ``url`` from the keywords it contains.

        'discover', 'artist' and 'toplist' are joined with '_' in that order
        when present. 'song' is then appended directly, without a separator.

        Args:
            url: URL string to inspect.

        Returns:
            The tag string, e.g. 'discover_toplist', 'artist' or 'song';
            an empty string when no keyword is present.
        """
        tag = '_'.join(
            keyword for keyword in ('discover', 'artist', 'toplist')
            if keyword in url)
        if 'song' in url:
            # NOTE(review): no '_' separator here, so a URL that contains
            # 'song' together with another keyword yields e.g. 'artistsong',
            # which matches no handler - confirm this is intended.
            tag += 'song'
        return tag

    def craw(self, rootUrl, direction=""):
        """Crawl from ``rootUrl`` until the queue is empty, then write output.

        Args:
            rootUrl: Starting URL. If it contains '#', the '#' and the single
                character after it are removed before queuing.
            direction: Passed through unchanged to
                ``self.htmlOutPuter.output_html``.
        """
        pos = rootUrl.find('#')
        if pos != -1:
            # Presumably turns '.../#/path' into '.../path'; the character
            # after '#' is dropped whatever it is.
            rootUrl = rootUrl[:pos] + rootUrl[pos + 2:]
        self.urlManager.add_new_url(rootUrl)
        while self.urlManager.has_new_url():
            url = self.urlManager.get_url()
            # Default to None so a URL without a matching handler is skipped
            # instead of raising AttributeError.
            handler = getattr(self, '_get_from_' + self._parse_url(url), None)
            if handler is not None:
                handler(url)
        self.htmlOutPuter.output_html(direction=direction)