Example #1
0
    def __init__(self):
        print '.. init Reptile'
        _config = Config()
        _Reptile.__init__(self, _config.getint('reptile', 'page_num'))
        self.curPageUrl = ''
        startpages =  _config.get('reptile', 'startpage').split()
        _netlocs = []
        for url in startpages:
            self._queue.put(url)
        print '.. init startpages: ', startpages

        self._sourceparser = SourceParser(startpages)
Example #2
0
class Reptile(_Reptile):
    '''
    main reptile
    '''
    def __init__(self):
        print '.. init Reptile'
        _config = Config()
        _Reptile.__init__(self, _config.getint('reptile', 'page_num'))
        self.curPageUrl = ''
        startpages =  _config.get('reptile', 'startpage').split()
        _netlocs = []
        for url in startpages:
            self._queue.put(url)
        print '.. init startpages: ', startpages

        self._sourceparser = SourceParser(startpages)

    def matchUrl(self, url):
        print 'match url:', url
        return self._sourceparser.matchUrl(url)

    def run(self):
        print '.. run'
        while not self._queue.empty():
            time.sleep(random.randint(5,20))
            print '.. while not run'
            url = self._queue.get()
            self._sourceparser.setCurPageUrl(url)
            #if not self.outPageRange():
            #if True:
            print '.. post: ', url
            _source = self.requestSource(url)
            if not _source:
                continue
            print '.. get: source length ', len(_source)
            self._sourceparser.setSource(_source)
            self._sourceparser.saveSource(self.downloadedPageNum)
            _absurls = self._sourceparser.getAbsUrls()
            for url in _absurls:
                self.inQueue(url)