class SpiderMan(object):
    """Crawl scheduler tying together a URL manager, downloader, parser and output.

    The four collaborators are created in ``__init__`` and stored on the
    instance as ``manager``, ``downloader``, ``parser`` and ``output``.
    """

    def __init__(self):
        self.manager = URLManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url, max_urls=100):
        """Crawl starting from ``root_url`` until the queue is empty or the cap is hit.

        Args:
            root_url: First URL handed to the URL manager.
            max_urls: Stop once ``manager.old_url_size()`` reaches this value.
                Defaults to 100, the limit that used to be hard-coded.

        A failure while handling one URL is reported and the loop moves on to
        the next URL; ``output.output_html()`` is called once after the loop.
        """
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < max_urls):
            # Kept outside the try so the failure report can name the URL even
            # when get_new_url() itself is what raised.
            new_url = None
            try:
                new_url = self.manager.get_new_url()
                # NOTE(review): the downloaded html is never handed to the
                # parser below -- confirm the parser's real entry point.
                html = self.downloader.download(new_url)
                new_urls, data = self.parser(new_url)
                # NOTE(review): new_urls looks like a collection but goes to the
                # single-URL method name -- confirm against the URL manager.
                self.manager.add_new_url(new_urls)
                self.output.store_data(data)
                print('已经抓取了{}个连接'.format(self.manager.old_url_size()))
            except Exception as exc:
                # Best-effort crawl: keep going, but say which URL failed and
                # why instead of discarding the error.
                print('爬取失败: {} ({!r})'.format(new_url, exc))
        self.output.output_html()
def __init__(self):
    """Instantiate the four collaborators and store them on the instance."""
    # URL bookkeeping.
    self.manager = URLManager()
    # Page retrieval.
    self.downloader = HtmlDownloader()
    # Page parsing.
    self.parser = HtmlParser()
    # Result storage / export.
    self.output = DataOutput()
import configparser
import datetime

import URLManager
import fetcher

# Master process: feeds the fetcher with URLs taken from the URL manager.

if __name__ == '__main__':
    config = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it parsed; stop with a clear message instead of failing
    # later with KeyError: 'SITE'.
    if not config.read('crawler.config'):
        raise SystemExit("Cannot read configuration file 'crawler.config'")
    initial_page = config["SITE"]["initial_page"]
    print("Master started. Initial page {}".format(initial_page))

    # init
    start_time = datetime.datetime.now()
    url_manager = URLManager.URLManager()
    # NOTE(review): meaning of the trailing 0, 0, -1 arguments is defined by
    # URLManager.insert_url -- not visible here.
    url_manager.insert_url(initial_page, 0, 0, -1)
    # Named page_fetcher so the imported `fetcher` module is not rebound.
    page_fetcher = fetcher.Fetcher(url_manager)
    end_time = datetime.datetime.now()
    delta = end_time - start_time
    print("Init time", delta)

    # start crawling
    while url_manager.has_next_url():  # TODO: change if parallel
        print("queue size", url_manager.get_size())
        next_url = url_manager.get_next_url()
        print("fetching", next_url)
        page_fetcher.get_page(next_url)
def standardpage(self):
    """Request one entry of ``self.standard_content_pages`` via ``self.client``.

    The entry is picked with ``URLManager.getRandom(len(pages)) - 1``.
    """
    num_of_urls = len(self.standard_content_pages)
    # NOTE(review): presumably getRandom(n) returns a value in 1..n, hence
    # the -1 to get a list index -- confirm against URLManager.
    winning_url = URLManager.getRandom(num_of_urls) - 1
    self.client.get(self.standard_content_pages[winning_url])
    # Parenthesised single-argument print: same output on Python 2, and valid
    # syntax on Python 3 (the bare print statement is not).
    print("standard content type")
def detail(self):
    """Request one entry of ``self.program_details`` and print the status code.

    The entry is picked with ``URLManager.getRandom(len(details)) - 1``.
    """
    num_of_details = len(self.program_details)
    # NOTE(review): presumably getRandom(n) returns a value in 1..n, hence
    # the -1 to get a list index -- confirm against URLManager.
    winning_detail = URLManager.getRandom(num_of_details) - 1
    response = self.client.get(self.program_details[winning_detail])
    # Parenthesised single-argument print: same output on Python 2, and valid
    # syntax on Python 3 (the bare print statement is not).
    print(response.status_code)
def sort(self):
    """Request one entry of ``self.sort_param`` and print the status code.

    The entry is picked with ``URLManager.getRandom(len(params)) - 1``.
    """
    num_of_param = len(self.sort_param)
    # NOTE(review): presumably getRandom(n) returns a value in 1..n, hence
    # the -1 to get a list index -- confirm against URLManager.
    winning_param = URLManager.getRandom(num_of_param) - 1
    response = self.client.get(self.sort_param[winning_param])
    # Parenthesised single-argument print: same output on Python 2, and valid
    # syntax on Python 3 (the bare print statement is not).
    print(response.status_code)