Example #1
0
    def __init__(self,
                 geocodeUserConfig,
                 inputQueue=None,
                 outputQueue=None,
                 dataCollection=None,
                 userAnalysisList=None):
        """ Sets up the gate thread's queues, configuration and counters.

            @param geocodeUserConfig must be a UserGeocodeConfig instance.
            @param inputQueue queue to read from; a new QueueEx is created when None.
            @param outputQueue queue to write to; a new QueueEx is created when None.
            @param dataCollection required DataCollection instance (asserted).
            @param userAnalysisList stored as given, None included. """
        threadName = self.__class__.__name__
        super(FollowerExtractorGateThread, self).__init__(threadName,
                                                          criticalThread=True)

        # Fall back to private queues when the caller supplies none.
        inputQueue = QueueEx() if inputQueue is None else inputQueue
        outputQueue = QueueEx() if outputQueue is None else outputQueue

        assert dataCollection is not None
        assert isinstance(dataCollection, DataCollection)
        assert isinstance(geocodeUserConfig, UserGeocodeConfig)

        self.geocode_user_config = geocodeUserConfig
        self.input_queue = inputQueue
        self.output_queue = outputQueue

        # The data collection is used to check for users which already have followers.
        self.data_collection = dataCollection
        self.user_analysis_list = userAnalysisList

        # Counters, plus the timer constructed from the configured logging frequency.
        self.num_dropped = self.num_processed = 0
        self.log_num_dropped_timer = Timer(
            Configuration.LOG_DROP_AMOUNT_FREQ_MS, False)
Example #2
0
    def __init__(self, webSocket, onRegisteredFunc=None):
        """ Wraps a raw web socket connection.

            @param webSocket the underlying socket object, must not be None.
            @param onRegisteredFunc optional callback forwarded to the base class constructor. """
        super(WebSocket, self).__init__(onRegisteredFunc=onRegisteredFunc,
                                        processSignalFunc=self.onUpdate)

        assert webSocket is not None

        self.web_socket = webSocket
        self.controls = {}
        self.cleanup_funcs = list()
        self.is_cleaned_up = False

        # Polled by pingFreqLimited() to decide when to ping the client.
        self.pingTimer = Timer(4000, False)
Example #3
0
def getCursorSizeSlow(cursor):
    """ Returns the number of documents a cursor yields, via cursor.count(True).

        The count is retried when it raises AutoReconnect; the cursor is rewound
        before each retry. At most five attempts are made in total, after which
        the last AutoReconnect is re-raised to the caller.

        @param cursor the cursor to count, or None.
        @return the count, or 0 when cursor is None. """
    if cursor is None:
        return 0

    # Only start timing once we know there is work to do.
    timer = Timer()

    logger.info('Retrieving cursor size...')

    maxAttempts = 5
    attempt = 1
    while True:
        try:
            sizeOfCursor = cursor.count(True)
            break
        except AutoReconnect as e:
            logger.error(
                'Failed to retrieve cursor size, AutoReconnect exception: %s (%d of %d attempts), errors: %s'
                % (e.message, attempt, maxAttempts, unicode(e.errors)))

            # Give up once the last permitted attempt has failed; previously one
            # extra attempt was made after "N of N attempts" had been logged.
            if attempt >= maxAttempts:
                raise

            attempt += 1
            cursor.rewind()

    logger.info('Successfully retrieved size of cursor: %d in %dms' %
                (sizeOfCursor, timer.time_since_constructed))

    return sizeOfCursor
Example #4
0
    def __init__(self,
                 geocodeUserConfig,
                 outputQueue=None,
                 twitterSession=None,
                 onTerminateFunc=None,
                 userAnalysisList=None):
        """ Sets up the follower extractor thread.

            @param geocodeUserConfig must be a UserGeocodeConfig instance.
            @param outputQueue queue to write to; a new QueueEx is created when None.
            @param twitterSession must be a TwitterSession instance (asserted).
            @param onTerminateFunc forwarded to the base class constructor.
            @param userAnalysisList defaults to a new empty list when None. """
        threadName = '%s_%s' % (self.__class__.__name__, str(getUniqueId()))
        super(FollowerExtractorThread, self).__init__(threadName,
                                                      onTerminateFunc,
                                                      criticalThread=False)

        outputQueue = QueueEx() if outputQueue is None else outputQueue
        userAnalysisList = list() if userAnalysisList is None else userAnalysisList

        assert isinstance(twitterSession, TwitterSession)
        assert isinstance(geocodeUserConfig, UserGeocodeConfig)

        def isSessionActive():
            # Handed to the input queue as its "continue running" check.
            return twitterSession.is_session_active

        def onPositionChanged(item, position, lastPoppedItem):
            # Report the queue position on the user's progress object and
            # forward the user to the output queue; items without a user are ignored.
            user = getUser(item)
            if user is not None:
                assert isinstance(user, User)
                progress = user.follower_enrichment_progress
                progress.onQueuePositionChange(user, position, lastPoppedItem)
                self.output_queue.put(user)

        self.input_queue = QueueNotify(isSessionActive, 2, onPositionChanged)
        self.output_queue = outputQueue
        self.twitter_session = twitterSession
        self.geocode_user_config = geocodeUserConfig
        self.user_analysis_list = userAnalysisList

        # Performance counters.
        self.num_followers_processed = self.num_followers_geocoded = 0
        self.num_followees_processed = 0
        self.log_performance_timer = Timer(60000, False)
Example #5
0
    def __init__(self, feed, outputQueue=None, initialData=None):
        """ Sets up the twitter thread around a feed.

            @param feed a TwitterFeed; when None a feed is built from empty lists,
                        a DummyIterable and None.
            @param outputQueue queue to write to; a new QueueEx is created when None.
            @param initialData optional iterable of items; each one is deep copied,
                               given this thread's twitter session and placed
                               on the output queue. """
        threadName = '%s_%s' % (self.__class__.__name__, str(getUniqueId()))
        super(TwitterThread, self).__init__(threadName, criticalThread=False)

        if feed is None:
            feed = TwitterFeed([], [], [], DummyIterable(), None)

        outputQueue = QueueEx() if outputQueue is None else outputQueue

        assert isinstance(feed, TwitterFeed)
        session = feed.twitter_session
        assert isinstance(session, TwitterSession)

        # The feed doubles as this thread's input queue.
        self.input_queue = feed
        self.twitter_session = session
        self.twitter_feed = feed

        self.output_queue = outputQueue

        if initialData is not None:
            for original in initialData:
                # Work on a copy so the caller's object is left untouched.
                restored = copy.deepcopy(original)

                assert isinstance(getUser(restored), User)

                logger.info('Retrieved tweet/user from file: %s' % restored)

                restored.setTwitterSession(self.twitter_session)
                self.output_queue.put(restored)

        # Counters.
        self.num_dropped = self.num_processed = 0
        self.num_twitter_geocoded_place = 0
        self.num_twitter_geocoded_coordinate = 0
        self.num_twitter_geocoded_both = 0
        self.num_not_twitter_geocoded = 0
        self.num_no_location = self.num_geocodeable = 0
        self.log_num_dropped_timer = Timer(
            Configuration.LOG_DROP_AMOUNT_FREQ_MS, False)
Example #6
0
    def __init__(self,
                 geocodeConfig,
                 inputQueue=None,
                 successOutputQueue=None,
                 primaryFailureOutputQueue=None,
                 highLoadFailureOutputQueue=None,
                 inMemoryOnly=None):
        """ Sets up the cache geocoding thread.

            @param geocodeConfig must be a UserGeocodeConfig instance.
            @param inputQueue queue to read from; a new QueueEx is created when None.
            @param successOutputQueue a new QueueEx is created when None.
            @param primaryFailureOutputQueue a new QueueEx is created when None.
            @param highLoadFailureOutputQueue stored as given; no default is created.
            @param inMemoryOnly when truthy the thread name gains a '_MEMORY_ONLY' suffix. """
        nameSuffix = '_MEMORY_ONLY' if inMemoryOnly else ''
        super(GeocodeFromCacheThread, self).__init__(
            self.__class__.__name__ + nameSuffix, criticalThread=True)

        assert isinstance(geocodeConfig, UserGeocodeConfig)

        inputQueue = QueueEx() if inputQueue is None else inputQueue
        successOutputQueue = QueueEx() if successOutputQueue is None else successOutputQueue
        primaryFailureOutputQueue = (QueueEx() if primaryFailureOutputQueue is None
                                     else primaryFailureOutputQueue)

        self.geocode_config = geocodeConfig
        self.in_memory_only = inMemoryOnly

        self.input_queue = inputQueue
        self.success_output_queue = successOutputQueue
        self.primary_failure_output_queue = primaryFailureOutputQueue
        self.high_load_failure_output_queue = highLoadFailureOutputQueue

        # Counters.
        self.num_processed = 0
        self.num_failed_over = 0
        self.num_dropped_from_success = 0
        self.num_dropped_from_primary_failure = 0
        self.log_timer = Timer(Configuration.LOG_DROP_AMOUNT_FREQ_MS, False)

        # The configured wait time is in milliseconds; store it in seconds.
        waitTimeMs = Configuration.GEOCODE_FROM_CACHE_THREAD_WAIT_TIME_MS
        self.sleep_time = float(waitTimeMs) / 1000.0
Example #7
0
    def __init__(self, webSocket, onRegisteredFunc=None):
        """ Wraps a raw web socket connection.

            @param webSocket the underlying socket object, must not be None.
            @param onRegisteredFunc optional callback forwarded to the base class constructor. """
        super(WebSocket, self).__init__(onRegisteredFunc=onRegisteredFunc,
                                        processSignalFunc=self.onUpdate)

        assert webSocket is not None

        self.web_socket = webSocket
        self.controls = {}
        self.cleanup_funcs = list()
        self.is_cleaned_up = False

        # Polled by pingFreqLimited() to decide when to ping the client.
        self.pingTimer = Timer(4000, False)
Example #8
0
        def func(templateArguments, *args, **kwargs):
            """ Attaches a new AsyncWorker to a socket's tunnel and returns its queue.

                Expects 'tunnel_id' and 'socket_id' in kwargs (KeyError otherwise).
                templateArguments is not used here.

                On failure (unknown socket, unknown tunnel id, or a tunnel whose
                event is already set) the error is logged, the worker is finished
                and its queue is still returned.

                @return worker.queue in every case. """
            if self.on_display_usage_func is not None:
                displayUsageFuncCallTimer = Timer(1000, True)

                def theDisplayUsageFunc():
                    # Only forward to the real callback when the timer ticks.
                    if displayUsageFuncCallTimer.ticked():
                        self.on_display_usage_func(
                            self, packArguments(*args, **kwargs))

                onDisplayUsageFunc = theDisplayUsageFunc
            else:
                onDisplayUsageFunc = None

            # > 1 for buffering, ensure we are always sending not send -> wait for database -> send.
            worker = AsyncWorker(2, onDisplayUsageFunc)

            def abort(message):
                # Shared failure path: log, finish the worker, hand back its queue.
                logger.error(message)
                worker.on_finish()
                return worker.queue

            tunnelId = kwargs['tunnel_id']
            socketId = kwargs['socket_id']
            self.setResponseHeaders(tunnelId)

            logger.info('Tunnel %s on socket %s has started opening' %
                        (tunnelId, socketId))

            # The socket ID is formatted with %s below (as in the message above) so
            # that a non-integer ID cannot raise TypeError while reporting an error.
            socket = self.sockets.get(socketId, None)
            if socket is None:
                return abort(
                    'Bulk download attempted but no matching socket with ID: %s found'
                    % socketId)

            tunnelEvent = socket.tunnel_events.get(tunnelId, None)
            if tunnelEvent is None:
                return abort('Invalid tunnel ID received: %s' % tunnelId)

            if tunnelEvent.is_set():
                return abort(
                    'Attempted to assign two bulk download providers to one socket with ID: %s, and tunnel ID: %s'
                    % (socketId, tunnelId))

            socket.tunnels[tunnelId] = worker
            tunnelEvent.set()

            return worker.queue
Example #9
0
class GeocodeFromCacheThread(BaseThread):
    """ Thread which geocodes the users of incoming items from the cache.

        Successfully geocoded items go to the success output queue; failures go
        to the primary failure output queue or, failing that, to the optional
        high load failure output queue. Statistics are logged periodically. """

    def __init__(self,
                 geocodeConfig,
                 inputQueue=None,
                 successOutputQueue=None,
                 primaryFailureOutputQueue=None,
                 highLoadFailureOutputQueue=None,
                 inMemoryOnly=None):
        """ @param geocodeConfig must be a UserGeocodeConfig instance.
            @param inputQueue queue to read from; a new QueueEx is created when None.
            @param successOutputQueue a new QueueEx is created when None.
            @param primaryFailureOutputQueue a new QueueEx is created when None.
            @param highLoadFailureOutputQueue optional; stored as given, may be None.
            @param inMemoryOnly passed through to the cache lookup; when truthy the
                                thread name gains a '_MEMORY_ONLY' suffix and the
                                per-item sleep is skipped. """
        if inMemoryOnly:
            inMemoryOnlyStr = '_MEMORY_ONLY'
        else:
            inMemoryOnlyStr = ''

        super(GeocodeFromCacheThread, self).__init__(
            '%s%s' % (self.__class__.__name__, inMemoryOnlyStr),
            criticalThread=True)

        assert isinstance(geocodeConfig, UserGeocodeConfig)

        if inputQueue is None:
            inputQueue = QueueEx()
        if successOutputQueue is None:
            successOutputQueue = QueueEx()
        if primaryFailureOutputQueue is None:
            primaryFailureOutputQueue = QueueEx()

        self.input_queue = inputQueue
        self.success_output_queue = successOutputQueue
        self.primary_failure_output_queue = primaryFailureOutputQueue
        self.high_load_failure_output_queue = highLoadFailureOutputQueue
        self.geocode_config = geocodeConfig

        self.num_dropped_from_success = 0
        self.num_dropped_from_primary_failure = 0
        self.num_failed_over = 0
        self.log_timer = Timer(Configuration.LOG_DROP_AMOUNT_FREQ_MS, False)

        self.num_processed = 0
        self.in_memory_only = inMemoryOnly

        # The configured wait time is in milliseconds; time.sleep takes seconds.
        self.sleep_time = float(
            Configuration.GEOCODE_FROM_CACHE_THREAD_WAIT_TIME_MS) / 1000.0

    def _logStatistics(self):
        """ Logs the counters gathered since the previous report and resets them. """
        numProcessed = self.num_processed
        numDroppedFromSuccess = self.num_dropped_from_success
        numDroppedFromPrimaryFailure = self.num_dropped_from_primary_failure
        numFailedOver = self.num_failed_over
        total = (numProcessed + numDroppedFromSuccess +
                 numDroppedFromPrimaryFailure + numFailedOver)

        def percentageOfTotal(count):
            # Avoid dividing by zero when nothing was handled in this period.
            if total == 0:
                return 0
            return float(count) / float(total) * 100.0

        percentageSuccess = percentageOfTotal(numProcessed)
        percentageFailedOver = percentageOfTotal(numFailedOver)
        percentageDroppedFromSuccess = percentageOfTotal(numDroppedFromSuccess)
        percentageDroppedFromPrimaryFailure = percentageOfTotal(
            numDroppedFromPrimaryFailure)

        outputQueueSize = self.success_output_queue.qsize()
        failOverOutputQueueSize = self.primary_failure_output_queue.qsize()

        geocodeDataInMemoryCacheSize = getGeocodeDataInMemoryCacheSize()
        geocodeQueryInMemoryCacheSize = getGeocodeQueryInMemoryCacheSize()

        self.num_dropped_from_success = 0
        self.num_dropped_from_primary_failure = 0
        self.num_processed = 0
        self.num_failed_over = 0

        logger.info(
            'Geocoded %d items (%.2f%%), failed over %d items (%.2f%%), dropped successful geocode items %d items (%.2f%%), dropped failed geocode items %d items (%.2f%%) - success output queue size: %d, fail over output queue size: %d - geocode cache size: %d, place cache size %d'
            % (numProcessed, percentageSuccess, numFailedOver,
               percentageFailedOver, numDroppedFromSuccess,
               percentageDroppedFromSuccess,
               numDroppedFromPrimaryFailure,
               percentageDroppedFromPrimaryFailure, outputQueueSize,
               failOverOutputQueueSize, geocodeDataInMemoryCacheSize,
               geocodeQueryInMemoryCacheSize))

    def _run(self):
        """ Geocodes each item read from the input queue and routes it onwards. """
        for item in self.input_queue:
            if not self.in_memory_only:
                time.sleep(self.sleep_time)

            user = getUser(item)
            assert user is not None

            user.clearGeocode(
                True)  # in case previously geocoded by in memory.

            if user.is_geocoded:
                success = True
            else:
                success = user.geocodeLocationFromCache(
                    self.geocode_config, self.in_memory_only)

            # Reported before the current item is counted, so it lands in the next report.
            if self.log_timer.ticked():
                self._logStatistics()

            if success:
                if self.success_output_queue.qsize(
                ) < Configuration.ANALYSIS_INPUT_THREAD_SIZE_CAP:
                    self.num_processed += 1
                    self.success_output_queue.put(item)
                else:
                    self.num_dropped_from_success += 1
            else:
                # Make sure the queue doesn't get too full, we only geocode once a second.
                # Only users with a location or a twitter place are failed over.
                canFailOver = (
                    self.primary_failure_output_queue.qsize() <=
                    Configuration.GEOCODE_FROM_CACHE_PRIMARY_FAILURE_OUTPUT_QUEUE_SIZE
                    and (user.has_location or user.has_twitter_place))

                if canFailOver:
                    self.num_failed_over += 1
                    self.primary_failure_output_queue.put(item)
                else:
                    # Count the item even when there is no high load queue to take it;
                    # previously such items were discarded without being counted,
                    # which skewed the logged percentages.
                    self.num_dropped_from_primary_failure += 1
                    if self.high_load_failure_output_queue is not None:
                        self.high_load_failure_output_queue.put(item)
import unittest
import requests
from api.config import Configuration, GE_MAP_QUEST, GE_GOOGLE
from api.core.utility import Timer
from api.geocode.geocode_shared import GeocodeResult, GeocodeResultGoogle, BadGeocodeException
import logging
import itertools

logger = logging.getLogger(__name__)

__author__ = 'Michael Pryor'

# One request every two seconds.
# Open MapQuest confirmed that one request per second is okay, but we limit to one every two seconds to be nice.
geocode_from_external_timer_omq    = Timer.rate_limited(60,120*1000)

# 2500 requests per day (24 hours).
# This works out as once every 35 seconds.
geocode_from_external_timer_google = Timer.rate_limited(2500,24*60*60*1000)

def _geocodeFromExternalOMQ(query, countryCode=None, acceptableTypes=None):
    """ Uses open map quest to do a location search, e.g. if query
        is London then information about London city will be returned.

        Note this method restricts itself to 1 call per second."""
    if query is None:
        return None

    geocode_from_external_timer_omq.waitForTick()
    try:
        url = "http://open.mapquestapi.com/nominatim/v1/search"
Example #11
0
def processCursor(cursor,
                  constructObjectFunc,
                  onIterationFunc=None,
                  cursorSize=None,
                  getCurrentIterationFunc=None):
    """ Reads every item from a cursor, converting each with constructObjectFunc.

        Without onIterationFunc the non-None converted objects are collected and
        returned. With onIterationFunc nothing is collected; each object is handed
        to the callback instead as
        onIterationFunc(iteration, endIteration, isFinished, object, 'base'),
        and a callback result of False stops the read early.

        The cursor is always closed before returning.

        @param cursor the cursor to read, may be None.
        @param constructObjectFunc called with each raw cursor item.
        @param onIterationFunc optional progress/consumer callback.
        @param cursorSize optional end bound used for progress reporting.
        @param getCurrentIterationFunc optional function mapping a converted object
               to its iteration value; returning None skips the object. Defaults to
               a counter starting at 1.
        @return list of converted objects, or None when the cursor is None or
                nothing was collected. """
    try:
        if cursor is None:
            return None

        timer = Timer()
        collected = []
        reportProgress = onIterationFunc is not None

        if getCurrentIterationFunc is None:
            # Default numbering: 1, 2, 3, ... in the order objects are seen.
            callCount = [0]

            def countCalls(obj):
                callCount[0] += 1
                return callCount[0]

            getCurrentIterationFunc = countCalls

        stoppedEarly = False
        boundsKnown = False
        iteration = 0
        offset = 0
        endIteration = cursorSize

        if reportProgress:
            onIterationFunc(offset, endIteration, False, None, 'base')

        for rawItem in cursor:
            currentObject = constructObjectFunc(rawItem)

            if not reportProgress:
                # Results are only kept when there is no iteration func; with one
                # we may be processing more rows than fit in memory.
                if currentObject is not None:
                    collected.append(currentObject)
                continue

            iteration = getCurrentIterationFunc(currentObject)
            if iteration is None:
                continue

            if cursorSize is not None:
                if not boundsKnown:
                    # Iterations don't have to be 0 indexed; rebase on the first one seen.
                    offset = iteration - 1
                    endIteration = cursorSize - offset
                    boundsKnown = True

                iteration -= offset

            keepGoing = onIterationFunc(iteration, endIteration, False,
                                        currentObject, 'base')
            if keepGoing is False:
                stoppedEarly = True
                break

        # Signal that we're finished.
        if reportProgress:
            finalIteration = iteration if stoppedEarly else endIteration
            onIterationFunc(finalIteration, endIteration, True, None, 'base')

        logger.info('Successfully processed cursor in %dms' %
                    timer.time_since_constructed)

        elapsedMs = timer.time_since_constructed
        logger.info('Successfully read %d items from cache (%d) in %dms' %
                    (len(collected), timer.__hash__(), elapsedMs))

        return collected if collected else None
    finally:
        if cursor is not None:
            cursor.close()
Example #12
0
def cursorItemsFromCache(instanceId,
                         getCollectionFunc,
                         placeId=None,
                         epochMsStartRange=None,
                         epochMsEndRange=None,
                         pageNum=None,
                         pageSize=None,
                         typeSpecificQuery=None,
                         projection=None,
                         sortByTimestamp=None,
                         typeSpecificHint=None):
    """ Builds a cursor over cached items of one instance.

        @param instanceId passed to getCollectionFunc, must not be None.
        @param getCollectionFunc returns the collection for an instance ID.
        @param placeId optional mapping with 'placeId' and 'providerId' keys to filter on.
        @param epochMsStartRange optional inclusive lower timestamp bound ($gte).
        @param epochMsEndRange optional exclusive upper timestamp bound ($lt).
        @param pageNum paging is applied only when both pageNum and pageSize are given.
        @param pageSize see pageNum.
        @param typeSpecificQuery optional extra query terms merged into the query.
        @param projection optional projection object; when its do_query attribute
               is False no query is made and None is returned.
        @param sortByTimestamp sort ascending by timestamp; defaults to True.
        @param typeSpecificHint optional index hint overriding the computed one.
        @return the cursor, with an upper_bound_timestamp attribute attached,
                or None when the projection disables the query. """
    if sortByTimestamp is None:
        sortByTimestamp = True

    epochMsStartRange, epochMsEndRange = fixEpochMsRange(epochMsStartRange, epochMsEndRange)

    upperBoundTimestamp = getEpochMs() if epochMsEndRange is None else epochMsEndRange

    if projection is not None and projection.do_query is False:
        return None

    assert instanceId is not None
    assert getCollectionFunc is not None
    collection = getCollectionFunc(instanceId)

    logFormatting = 'IN:%s, P:%s, ES:%s, EE:%s, PN:%s, PS:%s, T:%s, P:%s' % (instanceId, placeId, epochMsStartRange, epochMsEndRange, pageNum, pageSize, typeSpecificQuery, projection)

    timer = Timer()
    logger.info('Attempting to read items from cache (%d) -- %s' % (timer.__hash__(),logFormatting))

    # Timestamp range: [epochMsStartRange, epochMsEndRange).
    timestampBounds = {}
    if epochMsEndRange is not None:
        timestampBounds['$lt'] = epochMsEndRange
    if epochMsStartRange is not None:
        timestampBounds['$gte'] = epochMsStartRange
    hasTimestampBounds = len(timestampBounds) > 0

    query = {}
    if hasTimestampBounds:
        query['timestamp'] = timestampBounds

    hasPlace = placeId is not None
    if hasPlace:
        query.update({'geocode.placeId' : placeId['placeId'],
                      'geocode.providerId' : placeId['providerId']})

    # MongoDB sometimes gets it wrong, particularly with geocode.placeId,
    # so unless the caller supplies a hint we derive one from the filters used.
    if typeSpecificHint is not None:
        hint = typeSpecificHint
    else:
        hintFields = []
        if hasPlace:
            hintFields.append(('geocode.placeId', pymongo.ASCENDING))
        if hasTimestampBounds:
            hintFields.append(('timestamp', pymongo.ASCENDING))
        hint = hintFields if hintFields else None

    if typeSpecificQuery is not None:
        query.update(typeSpecificQuery)

    findArguments = [query]
    if projection is not None:
        findArguments.append(projection.projection)
    cursor = collection.find(*findArguments, timeout=False).hint(hint)

    if sortByTimestamp:
        cursor = cursor.sort([('timestamp', pymongo.ASCENDING)])

    if pageSize is not None and pageNum is not None:
        cursor = cursor.skip(pageSize * pageNum).limit(pageSize)

    # We use this to calculate progress through the cursor,
    # It is more efficient than using cursor.count.
    cursor.upper_bound_timestamp = upperBoundTimestamp

    logger.info('Successfully setup cursor in %dms -- %s' % (timer.time_since_constructed,logFormatting))

    if Configuration.MONGO_EXPLAINS_ENABLED:
        logger.critical('Tweet/User Explain: %s' % unicode(cursor.explain()))

    return cursor
Example #13
0
        except KeyboardInterrupt:
            pass

        print 'Finished!'
        sys.exit(0)

    if args.show_database_storage_usage:
        f = open('db_results.txt', 'w')
        sys.stdout = f

        theStep = 1000 * 60 * 15
        print 'Running in show database storage mode, update every %dms' % theStep
        print
        f.flush()

        updateTimer = Timer(theStep,True)
        try:
            while True:
                updateTimer.waitForTick()

                collections = getCollections()
                print 'The time: %s'%  unicode(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
                print 'Available collections: %s' % unicode(collections)
                print
                print
                print 'Database statistics: %s' % unicode(getDatabase().command({'dbStats' : 1}))

                for collection in collections:
                    print '%*s collection statistics: %s' % (20, collection, unicode(getDatabase().command({'collStats' : collection})))

                print
    def manageSocket(self, webSocket, tupleArguments, socketId):
        """ Streams cached user/tweet data for one instance to the client in batches.

            Options are read from request.GET (batch size, which kind of data to
            send, place/provider filter and epoch range). Data is read from the
            cache and written to tunnels opened on the web socket, while progress
            bars are updated via javascript executed on the main control.

            @param webSocket the socket to stream to; its controls must contain self.key.
            @param tupleArguments sequence whose first element is the instance ID.
            @param socketId not used in this method.
            @return always False. """
        instanceId = tupleArguments[0]

        mainControl = webSocket.controls[self.key]
        assert isinstance(mainControl, DocumentControl)

        # Request options.
        bytesPerBatch       =        parseInteger(request.GET.batchSizeBytes, maximum=1024 * 1024 * 256, default=1024 * 1024 * 1)
        tweetInfo           =        parseBoolean(request.GET.tweet_info, False)
        followerInfo        =        parseBoolean(request.GET.follower_info_full, False)
        followerInfoShort   =        parseBoolean(request.GET.follower_info_short, False)
        providerId          =        parseInteger(request.GET.provider_id)
        placeId             =        parseInteger(request.GET.place_id)
        startEpoch          =        parseInteger(request.GET.start_epoch)
        endEpoch            =        parseInteger(request.GET.end_epoch)

        # A place filter is only applied when both IDs were supplied.
        if placeId is not None and providerId is not None:
            placeCacheId = GeocodeResultAbstract.buildCacheId(providerId, placeId)
        else:
            placeCacheId = None

        # Exactly one mode is left enabled, in priority order: full follower info,
        # tweet info, short follower info. Full follower info is the default.
        if followerInfo:
            tweetInfo = False
            followerInfoShort = False
        elif tweetInfo:
            followerInfo = False
            followerInfoShort = False
        elif followerInfoShort:
            followerInfo = False
            tweetInfo = False
        else:
            followerInfo = True


        userTunnelId = 'user_tunnel'
        tweetTunnelId = None

        # The tweet tunnel is only used in tweet mode.
        if tweetInfo:
            tweetTunnelId = 'tweet_tunnel'

        def openRequiredTunnels():
            # Tweet mode opens tunnels via openTunnels; other modes open just the user tunnel.
            if tweetInfo:
                return self.openTunnels(webSocket)
            else:
                return self.openTunnel(userTunnelId, webSocket)

        if not openRequiredTunnels():
            logger.error('Failed to open initial tunnels')
            return False

        # Per-mode settings for the cache query and the output format.
        if tweetInfo:
            followerIdsFlag = False
            followeeIdsFlag = False
            analysisFlag = False
            isFollowersLoadedRequirement = None
            associatedWithTweetRequirement = True
            recursiveCacheFlag = False
            followerIdsProjection = None
            outputType = 1 # for csv.
        elif followerInfo:
            followerIdsFlag = True
            followeeIdsFlag = True
            analysisFlag = True
            isFollowersLoadedRequirement = True
            associatedWithTweetRequirement = None
            recursiveCacheFlag = True
            followerIdsProjection = None # this gives us all data on each follower.
            outputType = 2
        elif followerInfoShort:
            followerIdsFlag = True
            followeeIdsFlag = True
            followerIdsProjection = NoQueryProjection()
            analysisFlag = True
            isFollowersLoadedRequirement = True
            associatedWithTweetRequirement = None
            recursiveCacheFlag = True
            outputType = 3
        else:
            raise NotImplementedError()

        # NOTE(review): positional arguments; their meaning is defined by
        # UserProjection's constructor, which is not visible here.
        userProjection = UserProjection(True,
                                        True,
                                        None,
                                        True,
                                        followerIdsFlag,
                                        followerIdsProjection,
                                        followeeIdsFlag,
                                        UserProjection.Id(),
                                        True,
                                        False,
                                        False,
                                        True,
                                        True,
                                        False,
                                        False,
                                        False,
                                        False,
                                        analysisFlag)

        # Single-element lists are used as mutable cells shared with the closures below.
        isFirstIteration = [True]

        twitterInstance = self.application.twitter_instances.getInstanceByInstanceKey(instanceId)
        if twitterInstance is None:
            return False

        twitterSession = twitterInstance.twitter_thread.twitter_session
        progressBarTotalId = 'progress-bar-total'
        progressBarCurrentBatchId = 'progress-bar-current-batch'

        signaler = EventSignaler(self.key, [webSocket])

        # Limits how often the progress bars are refreshed between batches.
        updateProgressBarFreq = Timer(400,True)

        def sendData(tunnelId, data):
            # Each record is sent as one CRLF terminated line.
            self.sendDataOnTunnel(webSocket, tunnelId, (unicode(data) + '\r\n'))

        def sendHeader():
            # Header for the user tunnel and, in tweet mode, for the tweet tunnel.
            sendData(userTunnelId, getUserHeader(outputType))

            if tweetTunnelId is not None:
                sendData(tweetTunnelId, getTweetHeader())

        def doProgressBarChange(percentage, progressBarId):
            # Sets the width of the given progress bar element on the client.
            mainControl.executeJavascript('$("#%s").width("%.3f%%");' % (progressBarId, percentage))

        sendHeader()

        # Running count of data sent in the current batch, and the count at the
        # time the progress bars were last updated.
        counter = [0]
        previousCounter = [0]
        def updateSocket(controls,
                         data,
                         bytesCounter=counter,
                         bytesPerBatch=bytesPerBatch,
                         previousCounter=previousCounter,
                         isFirstIteration=isFirstIteration):
            # Signal handler: sends one user and/or tweet to the client and ends
            # the current batch when it is full or the read has finished.
            user = data['user_data']
            tweet = data['tweet_data']
            percentage = data['percentage']
            isFinished = data['isFinished']

            # Only type checked; mainControl is what is used below.
            control = controls[self.key]
            assert isinstance(control, DocumentControl)

            def updateProgressBars():
                previousCounter[0] = thisCounter = bytesCounter[0]

                percentageCurrentBatch = float(thisCounter) / float(bytesPerBatch) * 100
                percentageTotal = percentage

                # Once the total is complete, show the batch as complete too.
                if percentageTotal >= 100:
                    percentageCurrentBatch = 100

                # During the first batch the batch bar never lags behind the total bar.
                if isFirstIteration[0] and percentageCurrentBatch < percentageTotal:
                    percentageCurrentBatch = percentageTotal

                doProgressBarChange(percentageTotal, progressBarTotalId)
                doProgressBarChange(percentageCurrentBatch, progressBarCurrentBatchId)

            # Refresh only when the counter has changed and the timer has ticked.
            if previousCounter[0] != bytesCounter[0] and updateProgressBarFreq.ticked():
                updateProgressBars()

            dataToSendToClient = ''
            if user is not None:
                assert isinstance(user,User)
                dataToSendToClient = getUserRepresentation(user, outputType)
                sendData(userTunnelId, dataToSendToClient)

            if tweet is not None:
                assert isinstance(tweet, Tweet)
                dataToSendToClient = getTweetRepresentation(tweet)
                sendData(tweetTunnelId, dataToSendToClient)

            # NOTE(review): when both a user and a tweet were sent, only the tweet
            # representation's length is counted here - confirm this is intended.
            dataLength = len(dataToSendToClient)
            bytesCounter[0] += dataLength

            # End of batch: notify the client, close the tunnels and either reopen
            # them for the next batch or finish and clean up the socket.
            if bytesCounter[0] > bytesPerBatch or isFinished:
                updateProgressBars()
                isFirstIteration[0] = False

                bytesCounter[0] = 0
                mainControl.executeJavascript('onBatchEnd();')

                self.closeTunnels(webSocket)

                if not isFinished:
                    logger.debug('Waiting to receive next data provider')
                    if not openRequiredTunnels():
                        logger.warning('Failed to reinitialize tunnel slots')
                        webSocket.cleanup()
                        return

                    sendHeader()
                else:
                    mainControl.executeJavascript('onFinished();')

                    webSocket.cleanup()

        def onCacheIteration(iteration, total, isFinished, data, iteratorId):
            # Called by the cache readers for each item; returns False once the
            # socket has been cleaned up, True otherwise.

            # Don't write followee data to output as it would duplicate alot of data.
            if iteratorId == 'followee':
                data = None

            running = not webSocket.is_cleaned_up
            if running:
                # We need to do this so that if the client closes the socket we are notified.
                webSocket.pingFreqLimited()

                percentage = getPercentage(iteration, total)
                # dataId is only referenced by the commented out log line below.
                dataId = None
                if data is not None:
                    dataId = data.id
                #logger.info('iteration %.2f of %.2f (%.1f%%) - it: %s, userId: %s' % (iteration, total, percentage,iteratorId,dataId))

                user = None
                tweet = None
                if data is None:
                    pass
                elif isinstance(data, User):
                    user = data
                elif isinstance(data, Tweet):
                    tweet = data
                    if tweet.has_user:
                        user = tweet.user
                else:
                    logger.error('Invalid data from cache, type: %s' % type(data))
                    return running

                signaler.signalEvent({SignalActions.SOCKET: updateSocket, 'percentage' : percentage, 'user_data' : user, 'tweet_data' : tweet, 'isFinished' : isFinished})
                # Yield so that other greenlets get a chance to run.
                gevent.sleep(0)
            else:
                logger.debug('Ending cache download prematurely')

            return running

        logger.debug('Starting to read data from cache...')

        # This makes sure the search is finite.
        epochNow = getEpochMs()
        if endEpoch is None or endEpoch > epochNow:
            endEpoch = epochNow

        # Follower modes read users; tweet mode reads tweets with their user data.
        if followerInfo or followerInfoShort:
            readUsersFromCache(twitterSession,
                               instanceId,
                               placeId = placeCacheId,
                               epochMsStartRange=startEpoch,
                               epochMsEndRange=endEpoch,
                               isFollowersLoadedRequirement=isFollowersLoadedRequirement,
                               associatedWithTweetRequirement=associatedWithTweetRequirement,
                               onIterationFunc=onCacheIteration,
                               recursive=recursiveCacheFlag,
                               userProjection=userProjection)
        else:
            readTweetsFromCache(twitterSession,
                                instanceId,
                                placeId = placeCacheId,
                                epochMsStartRange=startEpoch,
                                epochMsEndRange=endEpoch,
                                onIterationFunc=onCacheIteration,
                                retrieveUserData=True,
                                userProjection=userProjection)

        # We want to cleanup everything now since we are done.
        return False
Example #15
0
class WebSocket(EventHandler):
    """ Base class for all web socket interactions.

        Wraps a raw web socket object, exposing json sending, receiving,
        a rate limited ping and a set of cleanup callbacks which are run
        once when the connection is found to be dead. """
    class OP:
        """ Contains all operation codes, indicating what message is for.
            These codes are passed to the client directly via templating, so
            no need to modify elsewhere. """
        ADD_MARKER = 1
        ADD_LINE = 2
        REMOVE_ITEM = 3

        ADD_ROW = 4
        UPDATE_ROW = 6
        SET_HEADER = 7

        SET_ELEMENT_INNER_HTML = 8
        EXECUTE_JAVASCRIPT = 9

        PING = 0

    def __init__(self, webSocket, onRegisteredFunc=None):
        """ @param webSocket the underlying socket object, must not be None and
                             must provide send and receive methods.
            @param onRegisteredFunc forwarded unchanged to the EventHandler constructor. """
        super(WebSocket, self).__init__(processSignalFunc=self.onUpdate, onRegisteredFunc=onRegisteredFunc)

        assert webSocket is not None

        self.web_socket = webSocket
        self.is_cleaned_up = False

        # Controls registered on this socket, keyed by control name.
        self.controls = dict()

        # Limits how often pingFreqLimited actually pings
        # (4000 is presumably milliseconds -- TODO confirm against Timer).
        self.pingTimer = Timer(4000,False)

        # Callables invoked with this socket as their only argument on cleanup.
        self.cleanup_funcs = []

    def ping(self):
        """ Sends a ping to the client and expects 'PING_BACK' in response,
            cleaning up the socket if anything else (or nothing) comes back. """
        self.send({'static_op' : WebSocket.OP.PING})

        # If sending already failed the socket has been cleaned up; there is
        # no point waiting for a reply from a dead connection.
        if self.is_cleaned_up:
            return

        pingBack = self.receive()

        if pingBack != 'PING_BACK':
            self.cleanup()

    def pingFreqLimited(self):
        """ Pings the client, but only when the ping timer has ticked. """
        if self.pingTimer.ticked():
            self.ping()

    def send(self, data):
        """ Sends a dictionary to the client in json form.
            Any failure while sending cleans up the socket and is logged rather than raised.
            @param data a dictionary to be sent to the client. """
        dataToSend = json.dumps(data)

        try:
            self.web_socket.send(dataToSend)
        except Exception as e:
            self.cleanup()
            logger.debug('Web socket connection terminated while sending, reason: %s, exception type %s' % (e, type(e)))

    def receive(self):
        """ Receives the next message from the client.
            @return the received message, or None if receiving failed (in which
                    case the socket is cleaned up and the failure is logged). """
        try:
            return self.web_socket.receive()
        except Exception as e:
            self.cleanup()
            logger.debug('Web socket connection terminated while receiving, reason: %s, exception type %s' % (e, type(e)))
            return None

    def onUpdate(self, signaler, data):
        """ Signal handler; if the signal data carries a socket action it is
            invoked with this socket's controls and the signal data.
            @param signaler the object which raised the signal (unused here).
            @param data dictionary of signal data, may be None. """
        if data is None:
            return

        if SignalActions.SOCKET in data:
            data[SignalActions.SOCKET](self.controls, data)

    def cleanup(self):
        """ Marks the socket as cleaned up and runs the cleanup callbacks.
            Safe to call repeatedly; the callbacks only run on the first call.
            Previously a single failed ping could run every callback up to three times
            (failed send, failed receive, then the ping reply mismatch). """
        if self.is_cleaned_up:
            return

        # Set the flag before running callbacks so that a callback which ends
        # up back in cleanup (e.g. via a failed send) does not recurse.
        self.is_cleaned_up = True

        # Do not unregister from all here, the thread managing the socket does this
        # See WebSocketGroup.processWebSocket. This is important; we want the unregistering
        # to be done from a different thread to the one which the send operation originated from.
        # This avoids a problem where we might be iterating through event signalers, signal an event
        # but then the event signaler collection decreases in size as one is cleaned up. If from a different thread
        # it will change size after we have finished iterating through it.
        for item in self.cleanup_funcs:
            item(self)

    def addControls(self, controls):
        """ Registers every control in the given iterable, see addControl. """
        for control in controls:
            self.addControl(control)

    def addControl(self, control):
        """ Registers a control under its control name and links it back to this socket.
            @param control a Control instance. """
        assert isinstance(control, Control)
        self.controls[control.control_name] = control
        control.web_socket = self
Example #16
0
class TwitterThread(BaseThread):
    """ Thread which iterates over a twitter feed, keeps counters describing
        the geocode state of each tweet and forwards tweets which have a user
        to the output queue, dropping them when that queue is at its cap. """

    def __init__(self, feed, outputQueue=None, initialData=None):
        """ @param feed a TwitterFeed to read tweets from; when None a feed
                        built around a DummyIterable is used instead.
            @param outputQueue queue which receives tweets, a new QueueEx by default.
            @param initialData optional iterable of items which are deep copied,
                               bound to this thread's twitter session and queued immediately. """
        threadName = self.__class__.__name__ + "_" + str(getUniqueId())
        super(TwitterThread, self).__init__(threadName, criticalThread=False)

        feed = TwitterFeed([], [], [], DummyIterable(), None) if feed is None else feed
        outputQueue = QueueEx() if outputQueue is None else outputQueue

        assert isinstance(feed, TwitterFeed)
        assert isinstance(feed.twitter_session, TwitterSession)

        # The feed is iterated directly, so it doubles as the input queue.
        self.input_queue = feed
        self.twitter_session = feed.twitter_session
        self.twitter_feed = feed

        self.output_queue = outputQueue

        if initialData is not None:
            for sourceItem in initialData:
                queuedItem = copy.deepcopy(sourceItem)

                owner = getUser(queuedItem)
                assert isinstance(owner, User)

                logger.info('Retrieved tweet/user from file: %s' % queuedItem)

                queuedItem.setTwitterSession(self.twitter_session)

                self.output_queue.put(queuedItem)

        # Counters, reset each time the statistics are logged.
        self.num_dropped = 0
        self.num_processed = 0
        self.num_twitter_geocoded_place = 0
        self.num_twitter_geocoded_coordinate = 0
        self.num_twitter_geocoded_both = 0
        self.num_not_twitter_geocoded = 0
        self.num_no_location = 0
        self.num_geocodeable = 0
        self.log_num_dropped_timer = Timer(
            Configuration.LOG_DROP_AMOUNT_FREQ_MS, False)

    def _onFailure(self, e):
        """ Shuts down the owning instance and defers to the base class,
            unless the instance has shutdown after no usage disabled, in which
            case the failure is only logged. """
        parentInstance = self.twitter_session.parent_instance

        if parentInstance.enable_shutdown_after_no_usage:
            logger.error(
                'Twitter stream thread has failed for instance %s, shutting down instance'
                % self.twitter_session.instance_key)
            self.twitter_session.parent_instance.shutdownInstance()
            super(TwitterThread, self)._onFailure(e)
        else:
            logger.error(
                "Failure limit reached on instance, but not shutting it down because it is a core instance"
            )

    def _onRestart(self, e):
        """ Logs the failure and restarts the feed's connection. """
        logger.error(
            'Twitter stream thread has failed for instance %s, restarting stream'
            % self.twitter_session.instance_key)
        self.twitter_feed.restartConnection()

    def _logAndResetStatistics(self):
        """ Logs the counters gathered since the last call and resets them all to zero. """
        processed = self.num_processed
        dropped = self.num_dropped
        seen = processed + dropped
        percentageDropped = 0 if seen == 0 else float(dropped) / float(seen) * 100.0
        queueSize = self.output_queue.qsize()

        # Snapshot in the order expected by the second log message below.
        geocodeCounts = (self.num_not_twitter_geocoded,
                         self.num_twitter_geocoded_place,
                         self.num_twitter_geocoded_coordinate,
                         self.num_twitter_geocoded_both,
                         self.num_geocodeable,
                         self.num_no_location)

        self.num_no_location = 0
        self.num_twitter_geocoded_place = 0
        self.num_twitter_geocoded_coordinate = 0
        self.num_twitter_geocoded_both = 0
        self.num_not_twitter_geocoded = 0
        self.num_geocodeable = 0

        self.num_dropped = 0
        self.num_processed = 0

        # GCQ = geocode cache queue.
        logger.info(
            'Processed %d items, dropped %d items (%.2f%%) from GCQ (queue size: %d)'
            % (processed, dropped, percentageDropped, queueSize))

        logger.info(
            'Initial tweet state: no twitter geocode %d, twitter place %d, twitter coord %d, twitter place and coord %d, num geocodeable %d, num not geocodeable %d'
            % geocodeCounts)

    def _run(self):
        """ Main loop: reads tweets from the feed until stopped, updating
            counters and forwarding tweets which have a user.
            @return 0 if the thread was found to be stopped while iterating. """
        for tweet in self.input_queue:
            if self.stopped:
                return 0

            if tweet is None:
                continue

            if self.log_num_dropped_timer.ticked():
                self._logAndResetStatistics()

            assert isinstance(tweet, Tweet)

            hasPlace = tweet.has_twitter_place
            hasCoordinate = tweet.coordinate is not None

            # Classify by which twitter supplied geocode data is present.
            if hasCoordinate and hasPlace:
                self.num_twitter_geocoded_both += 1
            elif hasCoordinate:
                self.num_twitter_geocoded_coordinate += 1
            elif hasPlace:
                self.num_twitter_geocoded_place += 1
            else:
                self.num_not_twitter_geocoded += 1

            # Tweets without a user are counted above but never forwarded.
            if not tweet.has_user:
                continue

            if tweet.user.has_location or hasCoordinate or hasPlace:
                self.num_geocodeable += 1
            else:
                self.num_no_location += 1

            belowCap = self.output_queue.qsize() < Configuration.GEOCODE_FROM_CACHE_INPUT_THREAD_SIZE_CAP
            if belowCap:
                self.num_processed += 1
                self.output_queue.put(tweet)
            else:
                self.num_dropped += 1

    def stop(self):
        """ Stops the thread and closes the twitter session, if there is one. """
        super(TwitterThread, self).stop()
        session = self.twitter_session
        if session is not None:
            session.close()
Example #17
0
def processCursor(cursor, constructObjectFunc, onIterationFunc=None, cursorSize=None, getCurrentIterationFunc=None):
    """ Iterates through a cursor, building an object from each item.

        Without an iteration function the built objects are collected and
        returned. With one, each object is handed to the iteration function
        instead and nothing is collected, so that very large cursors do not
        have to fit in memory. The cursor is always closed before returning.

        @param cursor the cursor to iterate, may be None.
        @param constructObjectFunc called with each cursor item, returns the object to use.
        @param onIterationFunc optional callback taking (iteration, endIteration, isFinished,
                               object, 'base'); returning False stops iteration early.
        @param cursorSize optional total size, used as the end iteration reported to the callback.
        @param getCurrentIterationFunc optional function mapping an object to its iteration number;
                                       returning None skips the callback for that object.
                                       Defaults to a simple counter starting at 1.
        @return list of built objects, or None if the cursor was None or nothing was collected. """
    if cursor is None:
        return None

    try:
        stopwatch = Timer()
        collected = []
        reportProgress = onIterationFunc is not None

        if getCurrentIterationFunc is None:
            counter = [0]

            def countingIterationFunc(obj, counter=counter):
                counter[0] += 1
                return counter[0]

            getCurrentIterationFunc = countingIterationFunc

        stoppedEarly = False
        position = 0
        positionOffset = 0
        lastPosition = cursorSize
        boundsKnown = False

        if reportProgress:
            onIterationFunc(positionOffset, lastPosition, False, None, 'base')

        for rawItem in cursor:
            built = constructObjectFunc(rawItem)

            if not reportProgress:
                # Only collect results when there is no iteration func.
                # This matters when processing millions of rows
                # (more than we can fit in memory).
                if built is not None:
                    collected.append(built)
                continue

            position = getCurrentIterationFunc(built)
            if position is None:
                continue

            if cursorSize is not None:
                if not boundsKnown:
                    # Iterations don't have to be 0 indexed, so rebase on the first one seen.
                    positionOffset = position - 1
                    lastPosition = cursorSize - positionOffset
                    boundsKnown = True

                position -= positionOffset

            if onIterationFunc(position, lastPosition, False, built, 'base') is False:
                stoppedEarly = True
                break

        # Signal that we're finished.
        if reportProgress:
            if not stoppedEarly:
                position = lastPosition

            onIterationFunc(position, lastPosition, True, None, 'base')

        processingTime = stopwatch.time_since_constructed
        logger.info('Successfully processed cursor in %dms' % processingTime)

        readTime = stopwatch.time_since_constructed
        logger.info('Successfully read %d items from cache (%d) in %dms' % (len(collected),stopwatch.__hash__(),readTime))

        return collected if len(collected) != 0 else None
    finally:
        cursor.close()
Example #18
0
class FollowerExtractorGateThread(BaseThread):
    """ Gate in front of the follower extractor threads.

        Reads items from its input queue, forwards tweets and already
        enriched users to the output queue and hands users which qualify for
        follower enrichment to the extractor thread of their twitter session. """

    # We have a follower extractor per twitter session so that we can do more
    # than one user at a time. Shared by all gate instances and guarded by the lock below.
    follower_extractor_threads = dict()
    _follower_extractor_threads_lock = RLock()

    def __init__(self,
                 geocodeUserConfig,
                 inputQueue=None,
                 outputQueue=None,
                 dataCollection=None,
                 userAnalysisList=None):
        """ @param geocodeUserConfig a UserGeocodeConfig, passed on to extractor threads.
            @param inputQueue queue of items to gate, a new QueueEx by default.
            @param outputQueue queue receiving forwarded items, a new QueueEx by default.
            @param dataCollection a DataCollection (required despite the default),
                                  used to check for users which already have followers.
            @param userAnalysisList passed on to extractor threads. """
        super(FollowerExtractorGateThread,
              self).__init__(self.__class__.__name__, criticalThread=True)

        inputQueue = QueueEx() if inputQueue is None else inputQueue
        outputQueue = QueueEx() if outputQueue is None else outputQueue

        assert dataCollection is not None
        assert isinstance(dataCollection, DataCollection)
        assert isinstance(geocodeUserConfig, UserGeocodeConfig)

        self.input_queue = inputQueue
        self.output_queue = outputQueue
        self.geocode_user_config = geocodeUserConfig

        # We use the data collection to check for users which already have followers.
        self.data_collection = dataCollection

        self.user_analysis_list = userAnalysisList

        # Counters, reset each time the statistics are logged.
        self.num_dropped = 0
        self.num_processed = 0
        self.log_num_dropped_timer = Timer(
            Configuration.LOG_DROP_AMOUNT_FREQ_MS, False)

    def getExtractorThreadByTwitterSession(self, twitterSession):
        """ Returns the extractor thread serving the given twitter session,
            creating, registering and starting one if none exists yet.
            @return the extractor thread, or None if the session is not active. """
        if not twitterSession.is_session_active:
            return None

        registryLock = FollowerExtractorGateThread._follower_extractor_threads_lock

        def findRegistered():
            return FollowerExtractorGateThread.follower_extractor_threads.get(
                twitterSession, None)

        registered = criticalSection(registryLock, findRegistered)
        if registered is not None:
            return registered

        def unregisterOnTerminate():
            # Runs when the extractor thread terminates; removes it from the registry.
            def removeFromRegistry():
                del FollowerExtractorGateThread.follower_extractor_threads[
                    twitterSession]

            criticalSection(
                FollowerExtractorGateThread._follower_extractor_threads_lock,
                removeFromRegistry)

        spawned = FollowerExtractorThread(
            self.geocode_user_config,
            outputQueue=self.output_queue,
            twitterSession=twitterSession,
            onTerminateFunc=unregisterOnTerminate,
            userAnalysisList=self.user_analysis_list)

        def addToRegistry():
            FollowerExtractorGateThread.follower_extractor_threads[
                twitterSession] = spawned

        criticalSection(registryLock, addToRegistry)

        spawned.start()
        return spawned

    def shouldProcessUser(self, user):
        """ @return True if the user is not None and is not already
                    in the data collection as a deep user object. """
        return user is not None and not self.data_collection.isDeepUserObjectIn(user)

    def _isInInfluenceArea(self, user, session):
        """ Checks whether any of the user's locations (geocode, its country,
            its continent) matches one of the session instance's influence
            source cache ids or lies strictly inside one of its influence
            source rectangles. """
        matched = False

        geocode = user.location_geocode
        candidates = [geocode]
        if geocode.country is not None:
            candidates.append(user.location_geocode.country)
        if user.location_geocode.continent is not None:
            candidates.append(user.location_geocode.continent)

        sourceIds = session.parent_instance.influence_source_cache_ids
        sourceRectangles = session.parent_instance.influence_source_rectangles

        if sourceIds is not None:
            for sourceId in sourceIds:
                for candidate in candidates:
                    if candidate == sourceId:
                        matched = True
                        break

        if matched or sourceRectangles is None:
            return matched

        for bounds in sourceRectangles:
            south = bounds[0]
            east = bounds[1]
            north = bounds[2]
            west = bounds[3]

            for candidate in candidates:
                point = candidate.coordinate
                inside = south < point[0] < north and \
                         east < point[1] < west

                if inside is True:
                    matched = True
                    break

        return matched

    def addUser(self, user, maxQueueSize=None, restrictInfluenceArea=True):
        """ Queues a user for follower enrichment on the extractor thread of its twitter session.
            @param user the User to queue.
            @param maxQueueSize if not None, the user is rejected when the
                                extractor's queue is larger than this.
            @param restrictInfluenceArea when True, the user is rejected unless
                                         it is located in the instance's influence area.
            @return True if the user was queued, False if it was rejected. """
        assert isinstance(user, User)
        if user.id is None or not self.shouldProcessUser(user):
            return False

        extractor = self.getExtractorThreadByTwitterSession(
            user.twitter_session)
        if extractor is None:
            return False

        targetQueue = extractor.input_queue
        targetSession = extractor.twitter_session

        queueIsFull = maxQueueSize is not None and targetQueue.qsize() > maxQueueSize
        if queueIsFull:
            return False

        if not user.is_geocoded:
            return False

        if restrictInfluenceArea is True:
            if not self._isInInfluenceArea(user, targetSession):
                return False

        user.queued_for_follower_enrichment = True
        FollowerExtractorGateThread.lastPlace = user.location_geocode.place_id

        targetQueue.put(user)
        return True

    def _logDropStatistics(self):
        """ Logs the processed/dropped counters gathered since the last call and resets them. """
        dropped = self.num_dropped
        processed = self.num_processed
        seen = dropped + processed
        percentageDropped = 0 if seen == 0 else float(dropped) / float(seen) * 100.0
        queueSize = self.output_queue.qsize()

        self.num_dropped = 0
        self.num_processed = 0

        # AQ = analysis queue
        logger.info(
            'Processed %d items, dropped %d items (%.2f%%) from AQ (queue size %d)'
            % (processed, dropped, percentageDropped, queueSize))

    def _run(self):
        """ Main loop: forwards tweets and already enriched users to the
            output queue and offers the remaining users for follower enrichment. """
        for item in self.input_queue:
            user = getUser(item)
            assert user is not None
            assert isinstance(user, User)

            if self.log_num_dropped_timer.ticked():
                self._logDropStatistics()

            # Always add tweets, even if we don't extract the followers of the user.
            if isinstance(item, Tweet):
                belowCap = self.output_queue.qsize() < Configuration.ANALYSIS_INPUT_THREAD_SIZE_CAP
                if belowCap:
                    self.num_processed += 1
                    self.output_queue.put(item)
                else:
                    self.num_dropped += 1

            # Already has followers loaded, maybe this came back from
            # geocoder, so we can put it in our output queue safely.
            # NOTE(review): a Tweet whose user already has followers was
            # also queued just above, so it looks like it is forwarded
            # twice -- confirm whether that is intended.
            if user.is_followers_loaded or user.is_followee:
                # Don't drop follower information, since that is so valuable.
                self.output_queue.put(item)
                continue

            if not Configuration.AUTO_ENRICH_FOLLOWER_INFO_ENABLED:
                continue

            # Skip users with too few or too many followers; a limit of 0 disables that check.
            minimumFollowers = Configuration.FOLLOWER_ENRICHMENT_GATE_THREAD_MINIMUM_FOLLOWERS
            maximumFollowers = Configuration.FOLLOWER_ENRICHMENT_GATE_THREAD_MAXIMUM_FOLLOWERS
            belowMinimum = minimumFollowers != 0 and user.num_followers < minimumFollowers
            aboveMaximum = maximumFollowers != 0 and user.num_followers > maximumFollowers
            if belowMinimum or aboveMaximum:
                continue

            # The queue size limit makes sure the extractor queue doesn't get too full.
            self.addUser(user,
                         Configuration.FOLLOWER_ENRICHMENT_QUEUE_SIZE)
Example #19
0
    @classmethod
    def GeocodeFollowers(cls, includeFolloweeData, followeeDataProjection,
                         includeFollowersData, followersDataProjection):
        """ Alternate constructor. Builds a geocode projection from the
            followee arguments via UserProjection.Geocode and passes it,
            together with the follower arguments, positionally to the class
            constructor. The meaning of each positional flag is defined by
            the constructor, which is not visible here. """
        followeeGeocodeProjection = UserProjection.Geocode(
            includeFolloweeData, followeeDataProjection)

        constructorArgs = (False,
                           includeFollowersData,
                           followersDataProjection,
                           True,
                           followeeGeocodeProjection,
                           False,
                           None)
        constructorArgs += (False,) * 9
        constructorArgs += (True,)

        return cls(*constructorArgs)

    @classmethod
    def ExcludeRecursiveData(cls, dataProjection=None):
        """ Alternate constructor. Passes the given data projection as the
            third positional argument to the class constructor, alongside a
            fixed set of flags. The meaning of each positional flag is
            defined by the constructor, which is not visible here. """
        leadingArgs = (True, True, dataProjection, True, False, None, False,
                       None)
        trailingFlags = (True,) * 10
        return cls(*(leadingArgs + trailingFlags))


# Module level timer configured with Configuration.LOG_DROP_AMOUNT_FREQ_MS.
# Judging by its name it presumably throttles how often user cache write
# performance is logged; its usage is not visible here -- TODO confirm.
logUserWritePerformanceTimer = Timer(Configuration.LOG_DROP_AMOUNT_FREQ_MS,
                                     False)


def writeUserToCache(user, doUpdate):
    assert isinstance(user, User)

    # Used with $set operation.
    setFields = dict()

    # Used $addToSet operation.
    addToSetFields = dict()

    if user.is_followers_loaded:
        setFields.update({'is_followers_loaded': True})

    if user.is_followee:
Example #20
0
class FollowerExtractorThread(BaseThread):
    """ Thread which retrieves the followers of queued users for a single
        twitter session, pushing the users and their followers to the output
        queue and recording temporal entries for geocoded user/follower pairs. """

    def __init__(self,
                 geocodeUserConfig,
                 outputQueue=None,
                 twitterSession=None,
                 onTerminateFunc=None,
                 userAnalysisList=None):
        """ @param geocodeUserConfig a UserGeocodeConfig used when geocoding followers from cache.
            @param outputQueue queue receiving users and followers, a new QueueEx by default.
            @param twitterSession the TwitterSession this thread serves (required despite the default).
            @param onTerminateFunc forwarded to the BaseThread constructor.
            @param userAnalysisList callables which take a user and return a
                                    UserAnalysis or None, empty by default. """
        threadName = self.__class__.__name__ + "_" + str(getUniqueId())
        super(FollowerExtractorThread, self).__init__(threadName,
                                                      onTerminateFunc,
                                                      criticalThread=False)

        outputQueue = QueueEx() if outputQueue is None else outputQueue
        userAnalysisList = list() if userAnalysisList is None else userAnalysisList

        assert isinstance(twitterSession, TwitterSession)
        assert isinstance(geocodeUserConfig, UserGeocodeConfig)

        def isSessionStillActive():
            return twitterSession.is_session_active

        def onQueuePositionChanged(item, position, lastPoppedItem):
            # Tell the user about its new position in the queue and push the update.
            queuedUser = getUser(item)
            if queuedUser is None:
                return

            assert isinstance(queuedUser, User)
            queuedUser.follower_enrichment_progress.onQueuePositionChange(
                queuedUser, position, lastPoppedItem)
            self.output_queue.put(queuedUser)

        self.input_queue = QueueNotify(isSessionStillActive, 2,
                                       onQueuePositionChanged)
        self.output_queue = outputQueue
        self.twitter_session = twitterSession
        self.user_analysis_list = userAnalysisList
        self.geocode_user_config = geocodeUserConfig

        # Counters, reset each time performance is logged.
        self.num_followers_processed = 0
        self.num_followers_geocoded = 0
        self.num_followees_processed = 0
        self.log_performance_timer = Timer(60000, False)

    def _onFailure(self, e):
        """ Logs the failure, shuts down the owning instance and defers to the base class. """
        logger.error(
            'Follower extractor thread has failed for instance %s, shutting down instance'
            % self.twitter_session.instance_key)
        self.twitter_session.parent_instance.shutdownInstance()
        super(FollowerExtractorThread, self)._onFailure(e)

    def _logPerformance(self):
        """ Logs the counters gathered since the last call and resets them. """
        followeesProcessed = self.num_followees_processed
        followersProcessed = self.num_followers_processed
        followersGeocoded = self.num_followers_geocoded
        self.num_followees_processed = 0
        self.num_followers_processed = 0
        self.num_followers_geocoded = 0

        logger.info(
            'Num followees processed %d, num followers processed %d, num followers geocoded %d'
            % (followeesProcessed, followersProcessed, followersGeocoded))

    def _run(self):
        """ Main loop: for each queued user without followers loaded,
            retrieves follower ids and followers, runs the configured
            analysers over each follower and pushes progress to the output
            queue. Stops the thread once the input queue is exhausted. """
        for queuedItem in self.input_queue:
            user = getUser(queuedItem)
            assert user is not None

            if user.is_followers_loaded:
                continue

            if user.twitter_session is None:
                logger.error(
                    'User reached enrichment thread with no twitter session')
                continue

            instance = user.twitter_session.parent_instance
            instanceKey = user.instance_key
            instanceStartTime = instance.constructed_at
            temporalCollection = getTemporalInfluenceCollection(instanceKey)

            # Build the analysers which apply to this user.
            analysers = list()
            for analysisFactory in self.user_analysis_list:
                analyser = analysisFactory(user)
                if analyser is None:
                    continue

                assert isinstance(analyser, UserAnalysis)
                analysers.append(analyser)

            def onFollowerIdsIteration(userId, iteration, totalIterations):
                if not self.twitter_session.is_session_active:
                    return False

                #logger.info('Retrieved ids of user %d/%d' % (iteration, totalIterations))
                self.output_queue.put(user)

                return True

            def recordTemporalEntries(follower):
                # One entry per (user geocode cache id, follower geocode result) pair.
                timeId = getTimeIdFromTimestamp(instanceStartTime,
                                                Configuration.TEMPORAL_STEP,
                                                getEpochMs())

                userCacheIds = user.location_geocode.all_geocode_results_cache_id
                followerResults = follower.location_geocode.all_geocode_results

                for cacheId in userCacheIds:
                    userPlaceId = GeocodeResultAbstract.getPlaceIdFromCacheId(
                        cacheId)
                    userProviderId = GeocodeResultAbstract.getProviderIdFromCacheId(
                        cacheId)

                    for followerResult in followerResults:
                        followerPlaceId = followerResult.place_id
                        followerProviderId = followerResult.provider_id
                        followerPlaceType = followerResult.place_type

                        instance.addTemporalEntry(temporalCollection, timeId,
                                                  userProviderId, userPlaceId,
                                                  followerProviderId,
                                                  followerPlaceId,
                                                  followerPlaceType)

            def onFollowersIteration(userId, iteration, totalIterations,
                                     followersFromLastIteration):
                for follower in (followersFromLastIteration
                                 if followersFromLastIteration is not None
                                 else ()):
                    if not self.twitter_session.is_session_active:
                        return False

                    assert isinstance(follower, User)
                    follower.is_followee = True

                    follower.geocodeLocationFromCache(
                        self.geocode_user_config, False)
                    self.output_queue.put(follower)

                    # Follower is now ready to be analysed.
                    for analyser in analysers:
                        analyser.onFollower(follower)

                    self.num_followers_processed += 1

                    if user.is_geocoded and follower.is_geocoded:
                        self.num_followers_geocoded += 1
                        recordTemporalEntries(follower)

                self.output_queue.put(user)
                return True

            # Retrieve followers.
            #logger.info('Attempting to retrieve followers for user: %s' % user)
            user.getFollowerIds(onFollowerIdsIteration)
            followers = user.getFollowers(onFollowersIteration)

            for analyser in analysers:
                user.addAnalyser(analyser)

            user.queued_for_follower_enrichment = False

            if followers is None:
                logger.error(
                    'Failed to retrieve followers for user: %s - explanation: %s, %s, %s'
                    % (user.last_follower_enrichment_error,
                       user.is_followers_loaded, user.is_follower_ids_loaded,
                       user))
            #else:
            #logger.info('Retrieved %d followers for user %s' % (len(followers), user))

            # Push update.
            self.num_followees_processed += 1
            self.output_queue.put(user)

            if self.log_performance_timer.ticked():
                self._logPerformance()

        # Prevent this thread from being restarted.
        self.stop()
Example #21
0
def cursorItemsFromCache(instanceId,
                         getCollectionFunc,
                         placeId=None,
                         epochMsStartRange=None,
                         epochMsEndRange=None,
                         pageNum=None,
                         pageSize=None,
                         typeSpecificQuery=None,
                         projection=None,
                         sortByTimestamp=None,
                         typeSpecificHint=None):
    """ Sets up a cursor over cached items of an instance.

        @param instanceId identifies the instance, must not be None.
        @param getCollectionFunc maps the instance id to the collection to query, must not be None.
        @param placeId optional dictionary with 'providerId' and 'placeId' keys to filter by.
        @param epochMsStartRange optional inclusive lower bound on the timestamp.
        @param epochMsEndRange optional exclusive upper bound on the timestamp.
        @param pageNum page to return, only used together with pageSize.
        @param pageSize number of items per page, only used together with pageNum.
        @param typeSpecificQuery optional extra query fields, applied last so they take precedence.
        @param projection optional projection; if its do_query is False no query is made.
        @param sortByTimestamp whether to sort ascending by timestamp, True by default.
        @param typeSpecificHint optional index hint overriding the one chosen here.
        @return the cursor, with upper_bound_timestamp set on it, or None if
                the projection indicates no query should be made. """
    sortByTimestamp = True if sortByTimestamp is None else sortByTimestamp

    epochMsStartRange, epochMsEndRange = fixEpochMsRange(
        epochMsStartRange, epochMsEndRange)

    upperBoundTimestamp = getEpochMs() if epochMsEndRange is None else epochMsEndRange

    if projection is not None and projection.do_query is False:
        return None

    assert instanceId is not None
    assert getCollectionFunc is not None
    collection = getCollectionFunc(instanceId)

    logFormatting = 'IN:%s, P:%s, ES:%s, EE:%s, PN:%s, PS:%s, T:%s, P:%s' % (
        instanceId, placeId, epochMsStartRange, epochMsEndRange, pageNum,
        pageSize, typeSpecificQuery, projection)

    stopwatch = Timer()
    logger.info('Attempting to read items from cache (%d) -- %s' %
                (stopwatch.__hash__(), logFormatting))

    query = dict()

    # Timestamp range, only included if at least one bound was given.
    timestampBounds = dict()
    if epochMsEndRange is not None:
        timestampBounds['$lt'] = epochMsEndRange
    if epochMsStartRange is not None:
        timestampBounds['$gte'] = epochMsStartRange

    hasTimestampBounds = len(timestampBounds) > 0
    if hasTimestampBounds:
        query['timestamp'] = timestampBounds

    hasPlace = placeId is not None
    if hasPlace:
        providerIdValue = placeId['providerId']
        placeIdValue = placeId['placeId']
        query['geocode.providerId'] = providerIdValue
        query['geocode.placeId'] = placeIdValue

    # MongoDB sometimes gets it wrong, particularly with geocode.placeId,
    # so tell it which index to use unless the caller already has.
    if typeSpecificHint is not None:
        hint = typeSpecificHint
    else:
        hintFields = []
        if hasPlace:
            hintFields.append(('geocode.placeId', pymongo.ASCENDING))
        if hasTimestampBounds:
            hintFields.append(('timestamp', pymongo.ASCENDING))
        hint = hintFields if hintFields else None

    if typeSpecificQuery is not None:
        query.update(typeSpecificQuery)

    findArgs = [query] if projection is None else [query, projection.projection]
    cursor = collection.find(*findArgs).hint(hint)

    if sortByTimestamp:
        cursor = cursor.sort([('timestamp', pymongo.ASCENDING)])

    if pageSize is not None and pageNum is not None:
        cursor = cursor.skip(pageSize * pageNum).limit(pageSize)

    # We use this to calculate progress through the cursor,
    # It is more efficient than using cursor.count.
    cursor.upper_bound_timestamp = upperBoundTimestamp

    setupTime = stopwatch.time_since_constructed
    logger.info('Successfully setup cursor in %dms -- %s' %
                (setupTime, logFormatting))

    if Configuration.MONGO_EXPLAINS_ENABLED:
        logger.critical('Tweet/User Explain: %s' % unicode(cursor.explain()))

    return cursor
import unittest
import requests
from api.config import Configuration, GE_MAP_QUEST, GE_GOOGLE
from api.core.utility import Timer
from api.geocode.geocode_shared import GeocodeResult, GeocodeResultGoogle, BadGeocodeException
import logging
import itertools

logger = logging.getLogger(__name__)

__author__ = 'Michael Pryor'

# Open map quest limiter: 60 requests per 120 seconds, i.e. 1 every two seconds.
# Open map quest confirmed that 1 per second is okay, we stay at half of that to be nice.
geocode_from_external_timer_omq = Timer.rate_limited(60, 2 * 60 * 1000)

# Google limiter: 2500 requests per day (24 hours),
# which works out at roughly one request every 35 seconds.
geocode_from_external_timer_google = Timer.rate_limited(
    2500, 1000 * 60 * 60 * 24)


def _geocodeFromExternalOMQ(query, countryCode=None, acceptableTypes=None):
    """ Uses open map quest to do a location search, e.g. if query
        is London then information about London city will be returned.

        Note this method is throttled by geocode_from_external_timer_omq, which is
        set up for 1 call every two seconds (60 per 120 seconds), not 1 per second.

        @param query the location text to search for. When None, None is returned
                     straight away, before the rate limiter is touched. """
    if query is None:
        return None

    # Presumably blocks until the shared open map quest limiter allows another request.
    geocode_from_external_timer_omq.waitForTick()
Example #23
0
class WebSocket(EventHandler):
    """ Base class for all web socket interactions. """
    class OP:
        """ Contains all operation codes, indicating what message is for.
            These codes are passed to the client directly via templating, so
            no need to modify elsewhere. """
        # Adding/removing items (presumably map markers and lines - TODO confirm against client).
        ADD_MARKER = 1
        ADD_LINE = 2
        REMOVE_ITEM = 3

        # Row and header operations.
        # NOTE(review): 5 is not assigned anywhere in this class - confirm whether it is
        # reserved or retired before reusing it.
        ADD_ROW = 4
        UPDATE_ROW = 6
        SET_HEADER = 7

        # Generic page manipulation.
        SET_ELEMENT_INNER_HTML = 8
        EXECUTE_JAVASCRIPT = 9

        # Sent by WebSocket.ping under the 'static_op' key.
        PING = 0

    def __init__(self, webSocket, onRegisteredFunc=None):
        """ Wraps a raw web socket connection.
            onUpdate is handed to the base class as the signal processing function.
            @param webSocket the underlying connection object, must not be None.
            @param onRegisteredFunc optional callback, forwarded untouched to the base class. """
        super(WebSocket, self).__init__(onRegisteredFunc=onRegisteredFunc,
                                        processSignalFunc=self.onUpdate)

        assert webSocket is not None

        self.web_socket = webSocket
        self.is_cleaned_up = False

        # Controls registered through addControl, keyed by control name.
        self.controls = {}

        # Checked by pingFreqLimited to throttle pings (presumably 4000ms - TODO confirm units).
        self.pingTimer = Timer(4000, False)

        # Callables invoked with this socket as only argument when cleanup runs.
        self.cleanup_funcs = list()

    def ping(self):
        """ Sends a ping to the client and waits for the 'PING_BACK' reply.
            The socket is cleaned up if the ping cannot be sent, the reply cannot
            be received, or the reply is not the expected one.

            send and receive already call cleanup themselves when they fail, so this
            method takes care not to run the cleanup functions a second or third time
            for the same failure. """
        self.send({'static_op': WebSocket.OP.PING})

        # A failed send has already cleaned up; there is no point waiting
        # for a reply on a dead connection.
        if self.is_cleaned_up:
            return

        pingBack = self.receive()

        # A failed receive returns None and has already cleaned up, only clean up
        # here when we genuinely received an unexpected reply.
        if pingBack != 'PING_BACK' and not self.is_cleaned_up:
            self.cleanup()

    def pingFreqLimited(self):
        """ Pings the client, but only when the ping timer reports a tick;
            otherwise does nothing. """
        timerTicked = self.pingTimer.ticked()
        if not timerTicked:
            return

        self.ping()

    def send(self, data):
        """ Serialises a dictionary to json and pushes it to the client.
            Any failure while sending cleans up this socket and is logged at debug level
            rather than raised.
            @param data a dictionary to be sent to the client. """
        payload = json.dumps(data)

        try:
            self.web_socket.send(payload)
        except Exception as error:
            # Clean up first, then report why the connection went away.
            self.cleanup()
            failureDetails = (error, type(error))
            logger.debug(
                'Web socket connection terminated while sending, reason: %s, exception type %s'
                % failureDetails)

    def receive(self):
        """ Reads the next message from the client.
            @return whatever the underlying socket's receive returns, or None when receiving
                    failed, in which case this socket has been cleaned up and the failure
                    logged at debug level. """
        try:
            message = self.web_socket.receive()
        except Exception as error:
            # Clean up first, then report why the connection went away.
            self.cleanup()
            failureDetails = (error, type(error))
            logger.debug(
                'Web socket connection terminated while receiving, reason: %s, exception type %s'
                % failureDetails)
            return None

        return message

    def onUpdate(self, signaler, data):
        """ Signal processing function handed to the base class on construction.
            When data carries an entry under SignalActions.SOCKET, that entry is called
            with this socket's controls and the data itself.
            @param signaler source of the signal, not used here.
            @param data signal payload, None is ignored. """
        if data is not None and SignalActions.SOCKET in data:
            socketAction = data[SignalActions.SOCKET]
            socketAction(self.controls, data)

    def cleanup(self):
        """ Flags this socket as cleaned up and runs every registered cleanup function,
            passing this socket as the only argument. """
        self.is_cleaned_up = True

        # Unregistering from event signalers is deliberately NOT done here; the thread
        # managing the socket does it, see WebSocketGroup.processWebSocket.
        # It matters that unregistering happens on a different thread to the one the
        # send operation originated from: otherwise we could be iterating through the
        # event signalers, signal an event, and have the collection shrink underneath us
        # as this socket is cleaned up. Done from a different thread, the collection
        # only changes size once we have finished iterating through it.
        for cleanupFunc in self.cleanup_funcs:
            cleanupFunc(self)

    def addControls(self, controls):
        """ Registers several controls in one go, in iteration order.
            @param controls iterable of controls, each one is passed to addControl. """
        for singleControl in controls:
            self.addControl(singleControl)

    def addControl(self, control):
        """ Registers a single control with this socket.
            @param control a Control instance; it is stored in self.controls under its
                           control_name and has this socket assigned to its web_socket. """
        assert isinstance(control, Control)
        # A control with an already registered control_name replaces the earlier entry.
        self.controls[control.control_name] = control
        # Back reference from the control to this socket.
        control.web_socket = self