コード例 #1
0
ファイル: crawler.py プロジェクト: lambokini/dp-python
class Crawler():
    """Small rule-driven crawler built on a URL queue and a page store.

    URLs are popped from ``self.queue``, fetched with ``self.downloader``,
    stored through ``self.webpagedb``, and the links found on each page are
    filtered by the regex rules registered with :meth:`add_rules` before
    being added back as new seeds.
    """

    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None  # most recently parsed WebPage, if any
        self.init_database()
        # Maps a compiled URL regex to the list of compiled link regexes
        # that apply to pages whose URL matches it.
        self.rules = {}

    def uninit_database(self):
        """Close the three databases opened by :meth:`init_database`."""
        self.queue.close_db()
        self.webpagedb.close_db()
        self.feedsdb.close_db()

    def init_database(self):
        """Open the queue, web page and feeds databases."""
        self.queue = QueueDB('queue.db', 0)
        self.webpagedb = WebpageDB('webpage.db')
        # NOTE(review): presumably the "already seen" URL store, judging by
        # the file name -- confirm against FeedsDB.
        self.feedsdb = FeedsDB('duplcheck.db')

    def add_seeds(self, links):
        """Record *links* in the feeds database and push them on the queue."""
        self.feedsdb.add_urls(links)
        self.queue.push_urls(links)

    def add_rules(self, rules):
        """Replace the current rule set with compiled versions of *rules*.

        ``rules`` maps a URL regex string to an iterable of link regex
        strings. Any previously registered rules are discarded. Every
        pattern is echoed to stdout as it is compiled, followed by a blank
        line after each rule.
        """
        self.rules = {}
        for url, inurls in rules.items():
            print(url)
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                print(u)
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn
            print("")

    def get_patterns_from_rules(self, url):
        """Return the link patterns of every rule whose URL regex matches *url*.

        The result is a new list without duplicates, in first-seen order.
        It is empty when no rule matches.
        """
        patns = []
        seen = set()
        for purl, ru in self.rules.items():
            if purl.match(url) is None:
                continue
            for patn in ru:
                if patn not in seen:
                    seen.add(patn)
                    patns.append(patn)
        return patns

    def start(self, max_pages=1):
        """Run the crawl loop.

        Pops URLs from the queue until it returns ``None`` or until
        ``max_pages`` pages have been downloaded and stored. The default of
        1 stops after the first stored page; pass ``None`` to keep going
        until the queue is empty. Whenever the loop continues, it pauses
        for three seconds before the next URL.
        """
        processed = 0
        while True:
            url = self.queue.pop_url()
            print(url)
            if url is None:
                print("crawling task is done.")
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            if html is not None:
                self.webpage = WebPage(url, html)
                self.webpagedb.html2db(url, html)
                self.webpage.parse_links()
                ruptn = self.get_patterns_from_rules(url)
                print(ruptn)
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
                self.add_seeds(links)
                processed += 1
                if max_pages is not None and processed >= max_pages:
                    break
            self.mysleep(3)

    def mysleep(self, n):
        """Sleep for *n* seconds, printing a progress line after each second."""
        for i in range(n):
            time.sleep(1)
            print("sleep %s of %s" % (i, n))
コード例 #2
0
ファイル: crawler.py プロジェクト: lambokini/dp-python
 def init_database(self):
     """Open the queue, web page and feeds databases and attach them to self."""
     queue_file, pages_file, feeds_file = 'queue.db', 'webpage.db', 'duplcheck.db'
     # Construct and attach one store at a time, in this order.
     self.queue = QueueDB(queue_file, 0)
     self.webpagedb = WebpageDB(pages_file)
     self.feedsdb = FeedsDB(feeds_file)