예제 #1
0
class SpiderMan(object):
    """Top-level crawler that wires together the URL manager, downloader,
    parser and output writer, and drives the crawl loop."""

    def __init__(self):
        self.manager = URLManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        """Crawl outward from ``root_url`` until the queue drains or 100
        URLs have been processed, then write the collected data as HTML."""
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                self._crawl_one()
            except Exception:
                print('爬取失败')
        self.output.output_html()

    def _crawl_one(self):
        """Fetch, parse and store a single URL taken from the queue."""
        url = self.manager.get_new_url()
        page = self.downloader.download(url)
        # NOTE(review): the downloaded `page` is never used and the parser
        # instance is called directly — this likely should pass the HTML to
        # a parse method (e.g. self.parser.parser(url, page)); confirm
        # against HtmlParser before changing.
        links, data = self.parser(url)
        self.manager.add_new_url(links)
        self.output.store_data(data)
        print('已经抓取了{}个连接'.format(self.manager.old_url_size()))
예제 #2
0
 def __init__(self):
     """Create the crawler's collaborators: URL queue manager, page
     downloader, HTML parser and data output writer."""
     # Each collaborator is constructed with no arguments; they are wired
     # together later by the crawl loop, not here.
     self.manager = URLManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.output = DataOutput()
예제 #3
0
import configparser
import datetime

import URLManager
import fetcher
"""
Feed the fetcher with url
"""
if __name__ == '__main__':
    # Read crawler settings (initial page, etc.) from the config file.
    config = configparser.ConfigParser()
    config.read('crawler.config')

    initial_page = config["SITE"]["initial_page"]
    print("Master started. Initial page {}".format(initial_page))

    # init — time how long seeding the queue and building the fetcher takes.
    start_time = datetime.datetime.now()

    url_manager = URLManager.URLManager()
    # NOTE(review): positional args presumably mean depth/score/parent
    # (0, 0, -1) — confirm against URLManager.insert_url.
    url_manager.insert_url(initial_page, 0, 0, -1)

    # Renamed from `fetcher` so the instance no longer shadows the
    # imported `fetcher` module.
    page_fetcher = fetcher.Fetcher(url_manager)

    end_time = datetime.datetime.now()
    delta = end_time - start_time
    print("Init time", delta)

    # start crawling: drain the queue sequentially.
    while url_manager.has_next_url():  # TODO: change if parallel
        print("queue size", url_manager.get_size())
        next_url = url_manager.get_next_url()

        print("fetching", next_url)
        page_fetcher.get_page(next_url)
예제 #4
0
 def standardpage(self):
     num_of_urls = len(self.standard_content_pages)
     winning_url = URLManager.getRandom(num_of_urls) - 1
     self.client.get(self.standard_content_pages[winning_url])
     print "standard content type"
예제 #5
0
 def detail(self):
     num_of_details = len(self.program_details)
     winning_detail = URLManager.getRandom(num_of_details) - 1
     response = self.client.get(self.program_details[winning_detail])
     print response.status_code
예제 #6
0
 def sort(self):
     num_of_param = len(self.sort_param)
     winning_param = URLManager.getRandom(num_of_param) - 1
     response = self.client.get(self.sort_param[winning_param])
     print response.status_code