Esempio n. 1
0
class Manager(threading.Thread):
    """
    Dispatch queued tweet-download requests to worker threads.

    The thread loops until ``status.terminate`` becomes true.  On every pass
    it pulls the next 'download' request from the request queue and starts a
    RetrieveTweets thread for that (user, page) pair, first starting a
    StoreUser thread when the user has no record in the database yet.

    The loop ends when exit() is called or when the queue returns no further
    request (the manager then sets ``status.terminate`` itself).  Before
    returning, run() joins every other live thread.

    NOTE(review): earlier documentation stated that the manager blocks when
    the maximum number of simultaneous requests is underway and busy-waits on
    the twitter rate limit.  No such throttling is implemented in this class;
    presumably it lives in the worker classes or in ``status`` - confirm.
    """

    def __init__(self, server_url='http://127.0.0.1:5984', db_name='hashmapd'):
        """
        Open the CouchDB database and the request queue stored in it.

        server_url -- base URL of the CouchDB server
        db_name    -- name of the database on that server
        """
        threading.Thread.__init__(self)
        self.db = couchdb.Server(server_url)[db_name]
        self.request_queue = RequestQueue(server_url=server_url, db_name=db_name)
        logger.debug('Manager: downloading tweets from %s, database %s',
                     server_url, db_name)

    def notify_completed(self, thread):
        """
        Record that the download handled by `thread` finished successfully.

        thread -- the worker that finished; its request_id and screen_name
                  attributes are read here
        """
        # report to the queue that the job finished successfully
        row = self.db[thread.request_id]
        self.request_queue.completed_request(row, thread.request_id)
        # create hash request if completed downloading user requests
        self.create_hash_request_if_finished(thread)
        # print notification of completion
        status.info('Manager: retrieved tweets', thread)

    def notify_failed(self, thread, err, notify_error=False):
        """
        Record that the download handled by `thread` failed.

        thread       -- the worker that failed
        err          -- the error, interpolated into the status message
        notify_error -- report through status.error() when true, otherwise
                        through status.info()
        """
        # back off before anything else is attempted
        status.backoff()
        # report to the queue that the job failed
        row = self.db[thread.request_id]
        self.request_queue.failed_request(row, thread.request_id)
        # a failed request still counts as "no longer pending" for the user
        self.create_hash_request_if_finished(thread)
        # print error message
        message = 'Manager: error (%s) retrieving tweets' % err
        if notify_error:
            status.error(message, thread)
        else:
            status.info(message, thread)

    def delete_request_doc(self, thread):
        """Delete the request document identified by `thread.request_id`."""
        doc = self.db[thread.request_id]
        self.db.delete(doc)
        logger.debug('Deleted request for missing tweets (%s,%s)',
                     thread.screen_name, thread.page)

    def create_hash_request_if_finished(self, thread):
        """
        Queue a hash request for the user once their downloads are done.

        The 'queue/queued_user_download_requests' view is consulted; when it
        holds no entry for `thread.screen_name`, a hash request for that user
        is added to the request queue.
        """
        results = self.db.view('queue/queued_user_download_requests', reduce=False)
        if len(results[thread.screen_name]) == 0:
            self.request_queue.add_hash_request(thread.screen_name)

    def run(self):
        """
        Process download requests until termination, then join other threads.
        """
        while not status.terminate:
            # get the next request
            logger.debug('Manager: get the next request')
            next_request = self.request_queue.next('download')
            if next_request is None:
                # nothing left to download: flag termination and let the
                # loop condition end the loop
                logger.info('Request queue is empty - no more users to download')
                status.terminate = True
                continue

            screen_name = next_request['username']
            page = next_request['page']
            request_id = next_request.id
            logger.debug('Manager: Next request is for %s, page %s',
                         screen_name, page)

            # if there is no entry in the db for this user, create one
            if screen_name not in self.db:
                logger.debug('Create record for user %s', screen_name)
                # NOTE(review): both branches build the same StoreUser; the
                # only difference is that the constructor runs while holding
                # the module-level `lock` when status.hits() is falsy -
                # presumably a rate-limit guard; confirm.
                if status.hits():
                    store_thread = StoreUser(screen_name, self.db, api)
                else:
                    with lock:
                        store_thread = StoreUser(screen_name, self.db, api)
                store_thread.start()

            # one worker thread per requested page of tweets
            worker = RetrieveTweets(self, screen_name, page, self.db, request_id)
            logger.debug('Start thread %s downloading tweets for %s, page %s',
                         worker.name, screen_name, page)
            worker.daemon = True
            worker.start()

        # Wait until all threads have finished.  run() executes on the
        # Manager thread, so current_thread() is this Manager: the loop skips
        # only itself and joins every other live thread, which includes the
        # process's main thread.
        this_thread = threading.current_thread()
        for other in threading.enumerate():
            if other is this_thread:
                continue
            logger.debug('Joining thread %s', other.name)
            other.join()

        logger.info('Exited download_tweets.py')

    def exit(self):
        """Ask the run() loop to stop after its current iteration."""
        status.terminate = True
        logger.info('Terminating threads')