Example #1
def test_queue_adds_requests_for_username():
    # -- ARRANGE --
    # variables used to test
    no_to_add = 5

    # set up result
    result = []

    def add_result(key, value):
        result.append(value)

    # create a mock server
    couchdb.client.Database.__setitem__ = Mock()
    couchdb.client.Database.__setitem__.side_effect = add_result

    # set up target
    request_queue = RequestQueue("http://127.0.0.1:5984", "hashmapd")

    # -- ACT --
    for i in xrange(no_to_add):
        request_queue.add_download_requests_for_username("utunga")

    # -- ASSERT --
    # ensure that each added request was stored with a request time, the right username, and sequential page numbers
    for i in xrange(no_to_add):
        for j in xrange(request_queue.n_pages):
            doc = result[request_queue.n_pages * i + j]
            assert doc["request_time"] is not None
            assert doc["username"] == "utunga"
            assert doc["page"] == (j + 1)
Example #2
def __init__(self, server_url='http://127.0.0.1:5984', db_name='hashmapd'):
    threading.Thread.__init__(self)
    self.db = couchdb.Server(server_url)[db_name]
    self.request_queue = RequestQueue(server_url=server_url, db_name=db_name)
    logger.debug('Manager: downloading tweets from %s, database %s'%(server_url, db_name))
Example #3
class Manager(threading.Thread):
    """
    Loop until the user terminates the program. Obtain tweet requests from the
    queue and spawn worker threads to retrieve the data.

    Blocks when the maximum number of simultaneous requests is underway.
    Currently busy-waits when there are no requests on the queue, and also
    busy-waits when the Twitter rate limit is reached.
    """
    
    def __init__(self, server_url='http://127.0.0.1:5984', db_name='hashmapd'):
        threading.Thread.__init__(self)
        self.db = couchdb.Server(server_url)[db_name]
        self.request_queue = RequestQueue(server_url=server_url, db_name=db_name)
        logger.debug('Manager: downloading tweets from %s, database %s'%(server_url, db_name))
    
    def notify_completed(self, thread):
        # report to the queue that the job finished successfully
        row = self.db[thread.request_id]
        self.request_queue.completed_request(row, thread.request_id)
        # create hash request if completed downloading user requests
        self.create_hash_request_if_finished(thread)
        # print notification of completion
        status.info('Manager: retrieved tweets', thread)
    
    def notify_failed(self, thread, err, notify_error=False):
        #backoff
        status.backoff()
        # report to the queue that the job failed
        row = self.db[thread.request_id]
        self.request_queue.failed_request(row, thread.request_id)
        # create hash request if completed downloading user requests
        self.create_hash_request_if_finished(thread)
        # print error message
        if notify_error:
            status.error('Manager: error (%s) retrieving tweets'%err, thread)
        else:
            status.info('Manager: error (%s) retrieving tweets'%err, thread)
    
    def delete_request_doc(self, thread):
        doc = self.db[thread.request_id]
        self.db.delete(doc)
        logger.debug('Deleted request for missing tweets (' + str(thread.screen_name) + ',' + str(thread.page) + ')')

    def create_hash_request_if_finished(self, thread):
        # if there are no more pending download requests for this user,
        # create a new hash request for the user
        results = self.db.view('queue/queued_user_download_requests', reduce=False)
        if len(results[thread.screen_name]) == 0:
            self.request_queue.add_hash_request(thread.screen_name)
    
    def run(self):
        # obtain a twitter screen name from db that needs data downloaded
        # spawn a thread for each page of downloads
        while not status.terminate:
            # get the next request
            logger.debug('Manager: get the next request')
            next_request = self.request_queue.next('download')
            if next_request is None:
                logger.info('Request queue is empty - no more users to download')
                status.terminate = True
                continue
            logger.debug('Manager: Next request is for %s, page %s'%(next_request['username'], next_request['page']))
            screen_name = next_request['username']
            page = next_request['page']
            request_id = next_request.id
            
            # if there is no entry in the db for this user, create one 
            if screen_name not in self.db:
                logger.debug('Create record for user %s'%screen_name)
                hits = status.hits()
                if hits:
                    thread = StoreUser(screen_name, self.db, api)
                else:
                    with lock:
                        thread = StoreUser(screen_name, self.db, api)
                thread.start()
            
            thread = RetrieveTweets(self, screen_name, page, self.db, request_id)
            logger.debug('Start thread %s downloading tweets for %s, page %s'%(thread.getName(), screen_name, page))
            thread.setDaemon(True)
            thread.start()
        
        # wait until all threads have finished
        main_thread = threading.current_thread()
        for thread in threading.enumerate():
            if thread is main_thread:
                continue
            logger.debug('Joining thread %s', thread.getName())
            thread.join() 
        
        logger.info('Exited download_tweets.py')
    
    def exit(self):
        status.terminate = True
        logger.info('Terminating threads')
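For context, here is a minimal sketch of how this Manager might be driven from a script. The __main__ guard and the KeyboardInterrupt handling are assumptions rather than part of the snippet, and it presumes status and the request classes are already imported.

if __name__ == '__main__':
    manager = Manager(server_url='http://127.0.0.1:5984', db_name='hashmapd')
    manager.setDaemon(True)        # do not let the manager keep the process alive on its own
    manager.start()
    try:
        while manager.isAlive():   # poll with a timeout so Ctrl-C is still delivered promptly
            manager.join(1.0)
    except KeyboardInterrupt:
        manager.exit()             # sets status.terminate so run() winds down its workers
        manager.join()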
Example #4
def test_download_requests_dequeued_in_order():
    # -- ARRANGE --
    # variables used to test
    no_to_remove = 5

    # set up the result list and an in-memory stand-in for the couch documents
    docs = {}
    result = []

    def get_from_db(item):
        return docs[item]

    def store_in_db(key, value):
        docs[key] = value

    # returns a list of values taken from the dummy values dictionary, formatted
    # and ordered in the same way as the relevant view results would be
    def get_view_results(view, reduce, descending):
        if view == "queue/queued_download_requests":
            return generate_view("queued", "download")
        elif view == "queue/underway_download_requests":
            return generate_view("underway", "download")
        elif view == "queue/queued_hash_requests":
            return generate_view("queued", "hash")
        elif view == "queue/underway_hash_requests":
            return generate_view("underway", "hash")

    def generate_view(doc_status, queue_name):
        if doc_status == "queued":
            view_results = []
            for k, v in docs.iteritems():
                if k != "_design/queue" and "started_time" not in v:
                    view_results.append(
                        couchdb.client.Row(
                            id=str(k),
                            key=v["request_time"],
                            value={"id": str(k), "username": "******", "page": 1, "type": queue_name + "_request"},
                        )
                    )
            view_results.sort()
            return view_results
        elif doc_status == "underway":
            view_results = []
            for k, v in docs.iteritems():
                if k != "_design/queue":
                    try:
                        view_results.append(
                            couchdb.client.Row(
                                id=str(k),
                                key=v["started_time"],
                                value={"id": str(k), "username": "******", "page": 1, "type": queue_name + "_request"},
                            )
                        )
                    except KeyError:
                        pass
            view_results.sort()
            return view_results

    # populate the mock queue with entries
    for i in xrange(no_to_remove):
        docs[str(i)] = {
            "request_time": datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f"),
            "username": "******",
            "page": 1,
            "type": "download_request",
        }
        time.sleep(0.01)  # space out the timestamps so the dequeue order is unambiguous

    # create a mock server and mock views (that contain dummy values)
    couchdb.client.Database.__getitem__ = Mock()
    couchdb.client.Database.__getitem__.side_effect = get_from_db
    couchdb.client.Database.__setitem__ = Mock()
    couchdb.client.Database.__setitem__.side_effect = store_in_db
    couchdb.client.Database.view = Mock()
    couchdb.client.Database.view.side_effect = get_view_results

    # set up target
    request_queue = RequestQueue("http://127.0.0.1:5984", "hashmapd")

    # -- ACT --
    for i in xrange(no_to_remove):
        result.append(request_queue.next("download"))

    # -- ASSERT --
    # ensure that the requests were popped off the queue in the correct order
    for i in xrange(no_to_remove - 1):
        assert result[i]["request_time"] < result[i + 1]["request_time"]