# Example #1
class Spider:
    """Coordinate URL queueing, page download, parsing, and data output.

    Collaborators (defined elsewhere in the project):
    Urlmanager (URL queue), HtmlDownloader (fetch), HtmlParse (extract),
    DataOutPut (accumulate results in ``datas``).
    """

    def __init__(self):
        self.urls = Urlmanager()              # queue of URLs still to crawl
        self.parse = HtmlParse()              # HTML -> structured data
        self.htmldownload = HtmlDownloader()  # URL -> HTML content
        self.data_output = DataOutPut()       # collects parsed records

    def craw(self, root):
        """Crawl pages 1-5 under *root* and print all collected data as JSON.

        :param root: base URL; '?page=N' is appended for N in 1..5.
        """
        # Seed the queue with the paginated listing URLs.
        for page in range(1, 6):
            self.urls.add_new_url(root + '?page=' + str(page))
        # BUG FIX: original condition was `while self.urls.is_empty()`,
        # which never enters the loop once URLs are queued; we must crawl
        # while the queue is NOT empty.
        while not self.urls.is_empty():
            new_url = self.urls.get_url()
            logging.info(new_url)
            try:
                html_content = self.htmldownload.download_html(new_url)
                data = self.parse.parse(html_content)
                self.data_output.collect_data(data)
            except Exception:
                # Best-effort crawl: keep going on a bad page, but record
                # the failure instead of silently swallowing it (the
                # original bare `except: pass` also caught SystemExit and
                # KeyboardInterrupt).
                logging.exception('failed to process %s', new_url)
        # print(...) is valid in both Python 2 and 3 for a single argument.
        print(demjson.encode(self.data_output.datas))
# Example #2
 def __init__(self):
     """Wire up the crawler's collaborators.

     Each attribute is an independently constructed helper object
     (classes are defined elsewhere in the project).
     """
     # Output sink first: everything the crawl collects lands here.
     self.data_output = DataOutPut()
     # Fetch/parse pipeline.
     self.htmldownload = HtmlDownloader()
     self.parse = HtmlParse()
     # Work queue of URLs still to be crawled.
     self.urls = Urlmanager()