Ejemplo n.º 1
0
def getCursorSizeSlow(cursor):
    """Return the size of `cursor`, retrying when the connection drops.

    The size is obtained with ``cursor.count(True)``. Every AutoReconnect
    failure is logged and the cursor is rewound before the next try; once
    ``maxAttempts`` failures have been logged, the following failure is
    re-raised to the caller.

    Returns 0 when `cursor` is None, otherwise the value reported by count.
    """
    timer = Timer()

    if cursor is None:
        return 0

    logger.info('Retrieving cursor size...')

    maxAttempts = 5
    failuresLogged = 0
    while True:
        try:
            sizeOfCursor = cursor.count(True)
            break
        except AutoReconnect as e:
            # Out of retries: let the original exception propagate.
            if failuresLogged >= maxAttempts:
                raise

            failuresLogged += 1
            logger.error(
                'Failed to retrieve cursor size, AutoReconnect exception: %s (%d of %d attempts), errors: %s'
                % (e.message, failuresLogged, maxAttempts, unicode(e.errors)))
            cursor.rewind()

    elapsedMs = timer.time_since_constructed
    logger.info('Successfully retrieved size of cursor: %d in %dms' %
                (sizeOfCursor, elapsedMs))

    return sizeOfCursor
Ejemplo n.º 2
0
    def __init__(self,
                 geocodeUserConfig,
                 inputQueue=None,
                 outputQueue=None,
                 dataCollection=None,
                 userAnalysisList=None):
        """Initialise the gate thread's queues, configuration and counters.

        geocodeUserConfig -- must be a UserGeocodeConfig.
        inputQueue / outputQueue -- queues to use; a new QueueEx is created
            for each one left as None.
        dataCollection -- required DataCollection instance.
        userAnalysisList -- stored as given (may be None).
        """
        threadName = self.__class__.__name__
        super(FollowerExtractorGateThread, self).__init__(threadName,
                                                          criticalThread=True)

        # Fall back to fresh queues when the caller supplied none.
        inputQueue = QueueEx() if inputQueue is None else inputQueue
        outputQueue = QueueEx() if outputQueue is None else outputQueue

        assert dataCollection is not None
        assert isinstance(dataCollection, DataCollection)
        assert isinstance(geocodeUserConfig, UserGeocodeConfig)

        self.geocode_user_config = geocodeUserConfig
        self.input_queue = inputQueue
        self.output_queue = outputQueue

        # The data collection is used to check for users which already have
        # followers.
        self.data_collection = dataCollection
        self.user_analysis_list = userAnalysisList

        # Counters, plus the timer built from the configured drop-logging
        # frequency.
        self.num_dropped = 0
        self.num_processed = 0
        self.log_num_dropped_timer = Timer(
            Configuration.LOG_DROP_AMOUNT_FREQ_MS, False)
Ejemplo n.º 3
0
    def __init__(self, webSocket, onRegisteredFunc=None):
        """Wrap `webSocket` and prepare per-socket bookkeeping.

        webSocket -- the underlying socket object; must not be None.
        onRegisteredFunc -- forwarded unchanged to the base class.

        The base class is told to deliver signals to ``self.onUpdate``.
        """
        super(WebSocket, self).__init__(processSignalFunc=self.onUpdate,
                                        onRegisteredFunc=onRegisteredFunc)

        assert webSocket is not None
        self.web_socket = webSocket

        # Bookkeeping containers start out empty.
        self.controls = {}
        self.cleanup_funcs = list()
        self.is_cleaned_up = False

        # Timer built with 4000 and False; presumably the ping interval in
        # milliseconds -- confirm against Timer.
        self.pingTimer = Timer(4000, False)
Ejemplo n.º 4
0
        def func(templateArguments, *args, **kwargs):
            """Attach a new AsyncWorker to the tunnel slot named in the request.

            Reads ``tunnel_id`` and ``socket_id`` from `kwargs`, sets the
            response headers for the tunnel, then validates that the socket
            exists, that it knows the tunnel, and that the tunnel has not
            already been claimed. On success the worker is stored in
            ``socket.tunnels`` and the tunnel event is set.

            Always returns ``worker.queue``. On any validation failure the
            error is logged and ``worker.on_finish()`` is called before the
            queue is returned.

            `templateArguments` is accepted but not used here.
            """
            if self.on_display_usage_func is not None:
                # Timer(1000, True): presumably limits the usage callback to
                # one call per second -- confirm against Timer.
                displayUsageFuncCallTimer = Timer(1000, True)

                def theDisplayUsageFunc():
                    if displayUsageFuncCallTimer.ticked():
                        self.on_display_usage_func(
                            self, packArguments(*args, **kwargs))

                onDisplayUsageFunc = theDisplayUsageFunc
            else:
                onDisplayUsageFunc = None

            # > 1 for buffering, ensure we are always sending not send -> wait for database -> send.
            worker = AsyncWorker(2, onDisplayUsageFunc)

            tunnelId = kwargs['tunnel_id']
            socketId = kwargs['socket_id']
            self.setResponseHeaders(tunnelId)

            logger.info('Tunnel %s on socket %s has started opening' %
                        (tunnelId, socketId))

            def reject(message):
                # Shared failure path: log, release the worker, hand back its
                # queue.
                logger.error(message)
                worker.on_finish()
                return worker.queue

            socket = self.sockets.get(socketId, None)
            if socket is None:
                # %s rather than %d: the ID is formatted with %s in the info
                # log above, and %d would raise TypeError for a non-numeric
                # ID, hiding the real error and skipping worker.on_finish().
                return reject(
                    'Bulk download attempted but no matching socket with ID: %s found'
                    % socketId)

            tunnelEvent = socket.tunnel_events.get(tunnelId, None)
            if tunnelEvent is None:
                return reject('Invalid tunnel ID received: %s' % tunnelId)

            if tunnelEvent.is_set():
                return reject(
                    'Attempted to assign two bulk download providers to one socket with ID: %s, and tunnel ID: %s'
                    % (socketId, tunnelId))

            socket.tunnels[tunnelId] = worker
            tunnelEvent.set()

            return worker.queue
Ejemplo n.º 5
0
    def __init__(self,
                 geocodeUserConfig,
                 outputQueue=None,
                 twitterSession=None,
                 onTerminateFunc=None,
                 userAnalysisList=None):
        """Initialise the follower extractor thread.

        geocodeUserConfig -- must be a UserGeocodeConfig.
        outputQueue -- queue for results; a new QueueEx is created when None.
        twitterSession -- must be a TwitterSession (the None default fails
            the assertion below).
        onTerminateFunc -- forwarded positionally to the base class.
        userAnalysisList -- defaults to a new empty list.
        """
        threadName = "_".join([self.__class__.__name__, str(getUniqueId())])
        super(FollowerExtractorThread, self).__init__(threadName,
                                                      onTerminateFunc,
                                                      criticalThread=False)

        outputQueue = QueueEx() if outputQueue is None else outputQueue
        userAnalysisList = list() if userAnalysisList is None else userAnalysisList

        assert isinstance(twitterSession, TwitterSession)
        assert isinstance(geocodeUserConfig, UserGeocodeConfig)

        def continueRunningCheck():
            # Handed to the input queue, which decides when to call it.
            return twitterSession.is_session_active

        def notifyPositionFunc(item, position, lastPoppedItem):
            # Report the queue position on the item's user, then forward the
            # user to the output queue. Items without a user are ignored.
            user = getUser(item)
            if user is not None:
                assert isinstance(user, User)
                user.follower_enrichment_progress.onQueuePositionChange(
                    user, position, lastPoppedItem)
                self.output_queue.put(user)

        self.input_queue = QueueNotify(continueRunningCheck, 2,
                                       notifyPositionFunc)
        self.output_queue = outputQueue

        self.geocode_user_config = geocodeUserConfig
        self.twitter_session = twitterSession
        self.user_analysis_list = userAnalysisList

        # Performance counters and the timer that accompanies them.
        self.num_followers_processed = 0
        self.num_followers_geocoded = 0
        self.num_followees_processed = 0
        self.log_performance_timer = Timer(60000, False)
Ejemplo n.º 6
0
    def __init__(self, feed, outputQueue=None, initialData=None):
        """Initialise the Twitter thread around `feed`.

        feed -- a TwitterFeed; when None an empty placeholder feed is built
            (whose session is None, so the session assertion below fails).
        outputQueue -- queue for results; a new QueueEx is created when None.
        initialData -- optional iterable of items; each one is deep-copied,
            bound to this thread's Twitter session and put on the output
            queue straight away.
        """
        threadName = "_".join([self.__class__.__name__, str(getUniqueId())])
        super(TwitterThread, self).__init__(threadName, criticalThread=False)

        if feed is None:
            feed = TwitterFeed([], [], [], DummyIterable(), None)

        outputQueue = QueueEx() if outputQueue is None else outputQueue

        assert isinstance(feed, TwitterFeed)
        assert isinstance(feed.twitter_session, TwitterSession)

        # The feed doubles as this thread's input queue.
        self.input_queue = feed
        self.twitter_session = feed.twitter_session
        self.twitter_feed = feed

        self.output_queue = outputQueue

        for originalItem in (initialData if initialData is not None else ()):
            # Work on a private copy so the caller's objects stay untouched.
            clonedItem = copy.deepcopy(originalItem)

            user = getUser(clonedItem)
            assert isinstance(user, User)

            logger.info('Retrieved tweet/user from file: %s' % clonedItem)

            clonedItem.setTwitterSession(self.twitter_session)
            self.output_queue.put(clonedItem)

        # Statistics counters, all starting at zero.
        self.num_dropped = self.num_processed = 0
        self.num_twitter_geocoded_place = 0
        self.num_twitter_geocoded_coordinate = 0
        self.num_twitter_geocoded_both = 0
        self.num_not_twitter_geocoded = 0
        self.num_no_location = self.num_geocodeable = 0

        self.log_num_dropped_timer = Timer(
            Configuration.LOG_DROP_AMOUNT_FREQ_MS, False)
Ejemplo n.º 7
0
    def __init__(self,
                 geocodeConfig,
                 inputQueue=None,
                 successOutputQueue=None,
                 primaryFailureOutputQueue=None,
                 highLoadFailureOutputQueue=None,
                 inMemoryOnly=None):
        """Initialise the geocode-from-cache thread.

        geocodeConfig -- must be a UserGeocodeConfig.
        inputQueue, successOutputQueue, primaryFailureOutputQueue -- each is
            replaced by a new QueueEx when None.
        highLoadFailureOutputQueue -- stored as given (may be None).
        inMemoryOnly -- stored as given; when truthy the thread name gets a
            '_MEMORY_ONLY' suffix.
        """
        nameSuffix = '_MEMORY_ONLY' if inMemoryOnly else ''
        super(GeocodeFromCacheThread, self).__init__(
            self.__class__.__name__ + nameSuffix, criticalThread=True)

        assert isinstance(geocodeConfig, UserGeocodeConfig)

        self.input_queue = QueueEx() if inputQueue is None else inputQueue
        self.success_output_queue = (QueueEx() if successOutputQueue is None
                                     else successOutputQueue)
        self.primary_failure_output_queue = (
            QueueEx() if primaryFailureOutputQueue is None
            else primaryFailureOutputQueue)
        self.high_load_failure_output_queue = highLoadFailureOutputQueue

        self.geocode_config = geocodeConfig
        self.in_memory_only = inMemoryOnly

        # Counters and the timer that accompanies their logging.
        self.num_processed = 0
        self.num_failed_over = 0
        self.num_dropped_from_success = 0
        self.num_dropped_from_primary_failure = 0
        self.log_timer = Timer(Configuration.LOG_DROP_AMOUNT_FREQ_MS, False)

        # Configured wait time converted from milliseconds to seconds.
        waitMs = Configuration.GEOCODE_FROM_CACHE_THREAD_WAIT_TIME_MS
        self.sleep_time = float(waitMs) / 1000.0
Ejemplo n.º 8
0
def processCursor(cursor,
                  constructObjectFunc,
                  onIterationFunc=None,
                  cursorSize=None,
                  getCurrentIterationFunc=None):
    """Drain `cursor`, building one object per row, then close the cursor.

    cursor -- iterable with a ``close()`` method; None yields None at once.
    constructObjectFunc -- called with each raw row to build an object.
    onIterationFunc -- optional progress callback invoked as
        ``onIterationFunc(iteration, endIteration, isFinished, obj, 'base')``:
        once before the loop, once per row, and once at the end with
        ``isFinished=True``. A per-row return value of ``False`` stops the
        loop early.
    cursorSize -- optional expected number of rows, used to report progress
        relative to the first row's iteration number.
    getCurrentIterationFunc -- maps an object to its iteration number;
        defaults to a 1-based running counter. Returning None skips the row.

    Returns the list of non-None objects when no `onIterationFunc` is given
    and at least one was produced; otherwise None. With a callback nothing
    is collected, so the return value is always None.
    """
    try:
        if cursor is None:
            return None

        timer = Timer()
        collected = []
        reportProgress = onIterationFunc is not None

        if getCurrentIterationFunc is None:
            rowsSeen = [0]

            def countRows(obj, rowsSeen=rowsSeen):
                rowsSeen[0] += 1
                return rowsSeen[0]

            getCurrentIterationFunc = countRows

        stoppedEarly = False
        iteration = 0
        iterationOffset = 0
        endIteration = cursorSize
        boundsKnown = False

        if reportProgress:
            onIterationFunc(iterationOffset, endIteration, False, None, 'base')

        for rawItem in cursor:
            currentObject = constructObjectFunc(rawItem)

            if not reportProgress:
                # Objects are only collected when there is no callback; with
                # a callback there may be more rows than fit in memory.
                if currentObject is not None:
                    collected.append(currentObject)
                continue

            iteration = getCurrentIterationFunc(currentObject)
            if iteration is None:
                continue

            if cursorSize is not None:
                if not boundsKnown:
                    # Iterations don't have to be 0 indexed; anchor on the
                    # first one seen.
                    iterationOffset = iteration - 1
                    endIteration = cursorSize - iterationOffset
                    boundsKnown = True

                iteration -= iterationOffset

            keepGoing = onIterationFunc(iteration, endIteration, False,
                                        currentObject, 'base')
            if keepGoing is False:
                stoppedEarly = True
                break

        # Signal that we're finished.
        if reportProgress:
            finalIteration = iteration if stoppedEarly else endIteration
            onIterationFunc(finalIteration, endIteration, True, None, 'base')

        processedMs = timer.time_since_constructed
        logger.info('Successfully processed cursor in %dms' % processedMs)

        readMs = timer.time_since_constructed
        logger.info('Successfully read %d items from cache (%d) in %dms' %
                    (len(collected), timer.__hash__(), readMs))

        return collected if len(collected) != 0 else None
    finally:
        if cursor is not None:
            cursor.close()
Ejemplo n.º 9
0
def cursorItemsFromCache(instanceId,
                         getCollectionFunc,
                         placeId=None,
                         epochMsStartRange=None,
                         epochMsEndRange=None,
                         pageNum=None,
                         pageSize=None,
                         typeSpecificQuery=None,
                         projection=None,
                         sortByTimestamp=None,
                         typeSpecificHint=None):
    """Build a cursor over the cached items of one instance.

    instanceId -- required; passed to `getCollectionFunc` to get the
        collection.
    placeId -- optional mapping with 'providerId' and 'placeId' keys used to
        filter on the geocode fields.
    epochMsStartRange / epochMsEndRange -- optional timestamp bounds
        (start inclusive via ``$gte``, end exclusive via ``$lt``), first
        normalised by ``fixEpochMsRange``.
    pageNum / pageSize -- when both are given, skip/limit paging is applied.
    typeSpecificQuery -- extra query terms merged into the filter last.
    projection -- optional object with ``do_query`` and ``projection``
        attributes; ``do_query is False`` short-circuits to None.
    sortByTimestamp -- defaults to True; sorts ascending by timestamp.
    typeSpecificHint -- index hint overriding the one chosen here.

    Returns the configured cursor, tagged with an ``upper_bound_timestamp``
    attribute, or None when the projection disables the query.
    """
    if sortByTimestamp is None:
        sortByTimestamp = True

    epochMsStartRange, epochMsEndRange = fixEpochMsRange(
        epochMsStartRange, epochMsEndRange)

    upperBoundTimestamp = epochMsEndRange
    if upperBoundTimestamp is None:
        upperBoundTimestamp = getEpochMs()

    if projection is not None and projection.do_query is False:
        return None

    assert instanceId is not None
    assert getCollectionFunc is not None
    collection = getCollectionFunc(instanceId)

    logValues = (instanceId, placeId, epochMsStartRange, epochMsEndRange,
                 pageNum, pageSize, typeSpecificQuery, projection)
    logFormatting = 'IN:%s, P:%s, ES:%s, EE:%s, PN:%s, PS:%s, T:%s, P:%s' % logValues

    timer = Timer()
    logger.info('Attempting to read items from cache (%d) -- %s' %
                (timer.__hash__(), logFormatting))

    # Timestamp window: only the bounds that were actually supplied.
    timeBounds = {}
    if epochMsEndRange is not None:
        timeBounds['$lt'] = epochMsEndRange
    if epochMsStartRange is not None:
        timeBounds['$gte'] = epochMsStartRange
    hasTimeBounds = len(timeBounds) > 0

    query = {}
    if hasTimeBounds:
        query['timestamp'] = timeBounds

    hasPlace = placeId is not None
    if hasPlace:
        providerValue = placeId['providerId']
        placeValue = placeId['placeId']
        query['geocode.providerId'] = providerValue
        query['geocode.placeId'] = placeValue

    # MongoDB sometimes gets it wrong, particularly with geocode.placeId.
    if typeSpecificHint is not None:
        hint = typeSpecificHint
    else:
        hint = []
        if hasPlace:
            hint.append(('geocode.placeId', pymongo.ASCENDING))
        if hasTimeBounds:
            hint.append(('timestamp', pymongo.ASCENDING))
        if not hint:
            hint = None

    if typeSpecificQuery is not None:
        query.update(typeSpecificQuery)

    if projection is None:
        cursor = collection.find(query)
    else:
        cursor = collection.find(query, projection.projection)
    cursor = cursor.hint(hint)

    if sortByTimestamp:
        cursor = cursor.sort([('timestamp', pymongo.ASCENDING)])

    if pageSize is not None and pageNum is not None:
        cursor = cursor.skip(pageSize * pageNum)
        cursor = cursor.limit(pageSize)

    # We use this to calculate progress through the cursor,
    # It is more efficient than using cursor.count.
    cursor.upper_bound_timestamp = upperBoundTimestamp

    setupMs = timer.time_since_constructed
    logger.info('Successfully setup cursor in %dms -- %s' %
                (setupMs, logFormatting))

    if Configuration.MONGO_EXPLAINS_ENABLED:
        logger.critical('Tweet/User Explain: %s' % unicode(cursor.explain()))

    return cursor
Ejemplo n.º 10
0
    @classmethod
    def GeocodeFollowers(cls, includeFolloweeData, followeeDataProjection,
                         includeFollowersData, followersDataProjection):
        """Alternate constructor: projection for geocoding followers.

        A nested geocode projection is built with ``UserProjection.Geocode``
        from the followee arguments. The follower arguments are passed to the
        constructor directly, alongside fixed flags. The meaning of each
        positional flag is defined by the constructor, which is not visible
        here.
        """
        geocodeProjection = UserProjection.Geocode(includeFolloweeData,
                                                   followeeDataProjection)
        constructorArgs = (
            False, includeFollowersData, followersDataProjection, True,
            geocodeProjection,
            False, None,
        ) + (False,) * 9 + (True,)
        return cls(*constructorArgs)

    @classmethod
    def ExcludeRecursiveData(cls, dataProjection=None):
        """Alternate constructor taking one optional nested data projection.

        `dataProjection` is passed as the third positional constructor
        argument; every other argument is a fixed value. The meaning of each
        positional flag is defined by the constructor, which is not visible
        here.
        """
        leadingArgs = (True, True, dataProjection, True, False, None, False,
                       None)
        trailingFlags = (True,) * 10
        return cls(*(leadingArgs + trailingFlags))


# Module-level timer shared across calls, built with the configured
# LOG_DROP_AMOUNT_FREQ_MS interval. Presumably used to rate-limit the
# logging of user-write performance -- confirm against its users in this
# module.
logUserWritePerformanceTimer = Timer(Configuration.LOG_DROP_AMOUNT_FREQ_MS,
                                     False)


def writeUserToCache(user, doUpdate):
    assert isinstance(user, User)

    # Used with $set operation.
    setFields = dict()

    # Used $addToSet operation.
    addToSetFields = dict()

    if user.is_followers_loaded:
        setFields.update({'is_followers_loaded': True})

    if user.is_followee:
Ejemplo n.º 11
0
        except KeyboardInterrupt:
            pass

        print 'Finished!'
        sys.exit(0)

    if args.show_database_storage_usage:
        f = open('db_results.txt', 'w')
        sys.stdout = f

        theStep = 1000 * 60 * 15
        print 'Running in show database storage mode, update every %dms' % theStep
        print
        f.flush()

        updateTimer = Timer(theStep,True)
        try:
            while True:
                updateTimer.waitForTick()

                collections = getCollections()
                print 'The time: %s'%  unicode(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
                print 'Available collections: %s' % unicode(collections)
                print
                print
                print 'Database statistics: %s' % unicode(getDatabase().command({'dbStats' : 1}))

                for collection in collections:
                    print '%*s collection statistics: %s' % (20, collection, unicode(getDatabase().command({'collStats' : collection})))

                print
Ejemplo n.º 12
0
    def manageSocket(self, webSocket, tupleArguments, socketId):
        """Stream cached users or tweets to the client in size-limited batches.

        webSocket -- socket wrapper whose ``controls[self.key]`` is this
            page's DocumentControl.
        tupleArguments -- sequence whose first element is the instance key.
        socketId -- not used by this method.

        Options are read from the request's GET parameters: batchSizeBytes,
        tweet_info, follower_info_full, follower_info_short, provider_id,
        place_id, start_epoch and end_epoch. Exactly one output mode is
        active; full follower info wins over tweet info, which wins over
        short follower info, and full follower info is the default.

        Rows are pushed through the user tunnel (and the tweet tunnel in
        tweet mode). Once more than ``bytesPerBatch`` characters have been
        sent, or the cache read finishes, the tunnels are closed and, unless
        finished, reopened for the next batch.

        Returns False on every visible path, including the early exits when
        the tunnels cannot be opened or the instance is unknown. The final
        return is a request to clean everything up; see the comment there.
        """
        instanceId = tupleArguments[0]

        mainControl = webSocket.controls[self.key]
        assert isinstance(mainControl, DocumentControl)

        bytesPerBatch = parseInteger(request.GET.batchSizeBytes, maximum=1024 * 1024 * 256, default=1024 * 1024 * 1)
        tweetInfo = parseBoolean(request.GET.tweet_info, False)
        followerInfo = parseBoolean(request.GET.follower_info_full, False)
        followerInfoShort = parseBoolean(request.GET.follower_info_short, False)
        providerId = parseInteger(request.GET.provider_id)
        placeId = parseInteger(request.GET.place_id)
        startEpoch = parseInteger(request.GET.start_epoch)
        endEpoch = parseInteger(request.GET.end_epoch)

        if placeId is not None and providerId is not None:
            placeCacheId = GeocodeResultAbstract.buildCacheId(providerId, placeId)
        else:
            placeCacheId = None

        # Normalise the mode flags so that exactly one of them is set.
        if followerInfo:
            tweetInfo = False
            followerInfoShort = False
        elif tweetInfo:
            followerInfo = False
            followerInfoShort = False
        elif followerInfoShort:
            followerInfo = False
            tweetInfo = False
        else:
            followerInfo = True

        userTunnelId = 'user_tunnel'
        tweetTunnelId = None

        if tweetInfo:
            tweetTunnelId = 'tweet_tunnel'

        def openRequiredTunnels():
            # Tweet mode opens the tunnels via openTunnels; the other modes
            # only open the user tunnel.
            if tweetInfo:
                return self.openTunnels(webSocket)
            else:
                return self.openTunnel(userTunnelId, webSocket)

        if not openRequiredTunnels():
            logger.error('Failed to open initial tunnels')
            return False

        if tweetInfo:
            followerIdsFlag = False
            followeeIdsFlag = False
            analysisFlag = False
            isFollowersLoadedRequirement = None
            associatedWithTweetRequirement = True
            recursiveCacheFlag = False
            followerIdsProjection = None
            outputType = 1  # for csv.
        elif followerInfo:
            followerIdsFlag = True
            followeeIdsFlag = True
            analysisFlag = True
            isFollowersLoadedRequirement = True
            associatedWithTweetRequirement = None
            recursiveCacheFlag = True
            followerIdsProjection = None  # this gives us all data on each follower.
            outputType = 2
        elif followerInfoShort:
            followerIdsFlag = True
            followeeIdsFlag = True
            followerIdsProjection = NoQueryProjection()
            analysisFlag = True
            isFollowersLoadedRequirement = True
            associatedWithTweetRequirement = None
            recursiveCacheFlag = True
            outputType = 3
        else:
            # Defensive only: the normalisation above always sets one mode.
            raise NotImplementedError()

        userProjection = UserProjection(True,
                                        True,
                                        None,
                                        True,
                                        followerIdsFlag,
                                        followerIdsProjection,
                                        followeeIdsFlag,
                                        UserProjection.Id(),
                                        True,
                                        False,
                                        False,
                                        True,
                                        True,
                                        False,
                                        False,
                                        False,
                                        False,
                                        analysisFlag)

        # One-element lists act as mutable cells shared with the closures
        # below (Python 2 has no `nonlocal`).
        isFirstIteration = [True]

        twitterInstance = self.application.twitter_instances.getInstanceByInstanceKey(instanceId)
        if twitterInstance is None:
            return False

        twitterSession = twitterInstance.twitter_thread.twitter_session
        progressBarTotalId = 'progress-bar-total'
        progressBarCurrentBatchId = 'progress-bar-current-batch'

        signaler = EventSignaler(self.key, [webSocket])

        updateProgressBarFreq = Timer(400, True)

        def sendData(tunnelId, data):
            self.sendDataOnTunnel(webSocket, tunnelId, (unicode(data) + '\r\n'))

        def sendHeader():
            sendData(userTunnelId, getUserHeader(outputType))

            if tweetTunnelId is not None:
                sendData(tweetTunnelId, getTweetHeader())

        def doProgressBarChange(percentage, progressBarId):
            mainControl.executeJavascript('$("#%s").width("%.3f%%");' % (progressBarId, percentage))

        sendHeader()

        counter = [0]
        previousCounter = [0]

        def updateSocket(controls,
                         data,
                         bytesCounter=counter,
                         bytesPerBatch=bytesPerBatch,
                         previousCounter=previousCounter,
                         isFirstIteration=isFirstIteration):
            # Signal handler: sends one row (user and/or tweet), updates the
            # progress bars and rolls over to a new batch when needed.
            user = data['user_data']
            tweet = data['tweet_data']
            percentage = data['percentage']
            isFinished = data['isFinished']

            control = controls[self.key]
            assert isinstance(control, DocumentControl)

            def updateProgressBars():
                previousCounter[0] = thisCounter = bytesCounter[0]

                # The batch size comes from the request; a zero value would
                # otherwise divide by zero here, so treat it as a full batch.
                if bytesPerBatch:
                    percentageCurrentBatch = float(thisCounter) / float(bytesPerBatch) * 100
                else:
                    percentageCurrentBatch = 100
                percentageTotal = percentage

                if percentageTotal >= 100:
                    percentageCurrentBatch = 100

                if isFirstIteration[0] and percentageCurrentBatch < percentageTotal:
                    percentageCurrentBatch = percentageTotal

                doProgressBarChange(percentageTotal, progressBarTotalId)
                doProgressBarChange(percentageCurrentBatch, progressBarCurrentBatchId)

            if previousCounter[0] != bytesCounter[0] and updateProgressBarFreq.ticked():
                updateProgressBars()

            # Count everything sent in this call. Previously the tweet row
            # overwrote the user row before its length was taken, so user
            # rows went uncounted whenever a tweet accompanied them.
            sentLength = 0
            if user is not None:
                assert isinstance(user, User)
                userRow = getUserRepresentation(user, outputType)
                sendData(userTunnelId, userRow)
                sentLength += len(userRow)

            if tweet is not None:
                assert isinstance(tweet, Tweet)
                tweetRow = getTweetRepresentation(tweet)
                sendData(tweetTunnelId, tweetRow)
                sentLength += len(tweetRow)

            bytesCounter[0] += sentLength

            if bytesCounter[0] > bytesPerBatch or isFinished:
                updateProgressBars()
                isFirstIteration[0] = False

                bytesCounter[0] = 0
                mainControl.executeJavascript('onBatchEnd();')

                self.closeTunnels(webSocket)

                if not isFinished:
                    logger.debug('Waiting to receive next data provider')
                    if not openRequiredTunnels():
                        logger.warning('Failed to reinitialize tunnel slots')
                        webSocket.cleanup()
                        return

                    sendHeader()
                else:
                    mainControl.executeJavascript('onFinished();')

                    webSocket.cleanup()

        def onCacheIteration(iteration, total, isFinished, data, iteratorId):
            # Per-row callback for the cache readers. The return value tells
            # the reader whether to keep going (False once the socket has
            # been cleaned up).

            # Don't write followee data to output as it would duplicate alot of data.
            if iteratorId == 'followee':
                data = None

            running = not webSocket.is_cleaned_up
            if running:
                # We need to do this so that if the client closes the socket we are notified.
                webSocket.pingFreqLimited()

                percentage = getPercentage(iteration, total)

                user = None
                tweet = None
                if data is None:
                    pass
                elif isinstance(data, User):
                    user = data
                elif isinstance(data, Tweet):
                    tweet = data
                    if tweet.has_user:
                        user = tweet.user
                else:
                    logger.error('Invalid data from cache, type: %s' % type(data))
                    return running

                signaler.signalEvent({SignalActions.SOCKET: updateSocket, 'percentage' : percentage, 'user_data' : user, 'tweet_data' : tweet, 'isFinished' : isFinished})
                # Yield to other greenlets between rows.
                gevent.sleep(0)
            else:
                logger.debug('Ending cache download prematurely')

            return running

        logger.debug('Starting to read data from cache...')

        # This makes sure the search is finite.
        epochNow = getEpochMs()
        if endEpoch is None or endEpoch > epochNow:
            endEpoch = epochNow

        if followerInfo or followerInfoShort:
            readUsersFromCache(twitterSession,
                               instanceId,
                               placeId=placeCacheId,
                               epochMsStartRange=startEpoch,
                               epochMsEndRange=endEpoch,
                               isFollowersLoadedRequirement=isFollowersLoadedRequirement,
                               associatedWithTweetRequirement=associatedWithTweetRequirement,
                               onIterationFunc=onCacheIteration,
                               recursive=recursiveCacheFlag,
                               userProjection=userProjection)
        else:
            readTweetsFromCache(twitterSession,
                                instanceId,
                                placeId=placeCacheId,
                                epochMsStartRange=startEpoch,
                                epochMsEndRange=endEpoch,
                                onIterationFunc=onCacheIteration,
                                retrieveUserData=True,
                                userProjection=userProjection)

        # We want to cleanup everything now since we are done.
        return False