import logging

import demjson


class Spider:
    def __init__(self):
        # Wire up the four collaborators: URL queue, downloader,
        # parser, and result collector.
        self.urls = Urlmanager()
        self.parse = HtmlParse()
        self.htmldownload = HtmlDownloader()
        self.data_output = DataOutPut()

    def craw(self, root):
        # Seed the queue with the first five paginated listing URLs.
        for i in range(1, 6):
            url = root + '?page=' + str(i)
            self.urls.add_new_url(url)
        # Drain the queue: download, parse, and collect each page.
        # (The original looped `while self.urls.is_empty()`, which is
        # inverted; the condition must be negated for the loop to run.)
        while not self.urls.is_empty():
            new_url = self.urls.get_url()
            try:
                logging.info(new_url)
                html_content = self.htmldownload.download_html(new_url)
                data = self.parse.parse(html_content)
                self.data_output.collect_data(data)
            except Exception:
                # Log the failure rather than silently swallowing it.
                logging.exception('failed to crawl %s', new_url)
        print(demjson.encode(self.data_output.datas))
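The Spider above depends on four helper classes that are not shown in this section. As a minimal sketch only, assuming a set-backed URL queue, a requests-based downloader, and a list-backed collector, they might look like this (the parser body is a placeholder, since the real parse() logic is site-specific):

import requests


class Urlmanager:
    def __init__(self):
        # Track pending and visited URLs so nothing is crawled twice.
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def is_empty(self):
        return len(self.new_urls) == 0

    def get_url(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url


class HtmlDownloader:
    def download_html(self, url):
        # Fetch the page, raising on HTTP errors so craw() can log them.
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        return resp.text


class HtmlParse:
    def parse(self, html_content):
        # Placeholder: extract whatever fields the target site exposes.
        return {'length': len(html_content)}


class DataOutPut:
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        self.datas.append(data)

With these in place, the crawler can be driven with a single call, e.g. Spider().craw('http://example.com/list'), which fetches pages 1 through 5 of the listing and prints the collected results as JSON.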