Beispiel #1
0
class Crawler:
    '''
    Coordinates a pool of crawl threads that work on one shared URL queue.

    The crawler owns the queue, the recommender system and the list of
    visited pages, and hands all three to every _CrawlThread it creates.
    '''

    def __init__(self, urllist, recsys):
        '''
        Constructs a new crawler using the specified url list and the specified
        recommender system to rate the relevance of the downloaded pages

        @param urllist: A list of urls to start with
        @param recsys: A recommender system to rate pages
        '''
        self.__urllist = urllist
        self.__queue = PriorityQueue()
        self.__recsys = recsys
        # Worker threads created by crawl(); used by save() and abort().
        self.__threads = []
        # Shared with every worker thread (passed to _CrawlThread).
        self.__visited = []

    def crawl(self, threads=2, block=True):
        '''
        Starts crawling using the (optional) specified number of threads.
        Blocks until the queue has been fully processed unless block is False.

        @param threads: The number of threads used for crawling, default is 2

        @param block: False, if the method should be non-blocking (True by default)
        '''
        for i in range(threads):
            t = _CrawlThread(i, self.__queue, self.__recsys, self.__visited)
            self.__threads.append(t)
            # The attribute is spelled "daemon"; the previous "deamon" only
            # created an unused attribute and left the flag untouched.
            # NOTE(review): assumes _CrawlThread derives from
            # threading.Thread, where the flag must be set before start().
            t.daemon = True
            t.start()
        # Seed the queue with the start urls; the second argument is
        # presumably the initial priority/relevance -- confirm against
        # PriorityQueue.enqueue.
        for url in self.__urllist:
            self.__queue.enqueue(url, 1.)
        # Honour the documented non-blocking mode instead of always waiting.
        if block:
            self.__queue.join()

    def save(self):
        '''
        Sends a request to all running threads to save the current state of
        their libraries
        '''
        for t in self.__threads:
            t.request_save()

    def abort(self):
        '''
        Requests every crawling thread to stop.

        Whether the downloaded webpages are saved on stop is decided inside
        _CrawlThread.request_stop and is not visible from this class.
        '''
        for t in self.__threads:
            t.request_stop()
Beispiel #2
0
 def __init__(self, urllist, recsys):
     '''
     Set up a crawler from its seed urls and its page rater.

     Only state is initialised here: the inputs are stored, an empty
     PriorityQueue is created and the thread and visited lists start
     out empty.

     @param urllist: A list of urls to start with
     @param recsys: A recommender system to rate pages
     '''
     # Caller-supplied collaborators.
     self.__urllist, self.__recsys = urllist, recsys
     # Fresh, crawler-owned containers.
     self.__queue = PriorityQueue()
     self.__threads, self.__visited = [], []