Example #1
0
def writeTweetToCache(tweet):
    placeId = None
    if tweet.user.is_geocoded:
        placeId = tweet.user.location_geocode.all_geocode_results_cache_id

    timer = getEpochMs()

    collection = getTweetCollection(tweet.instance_key)

    collection.ensure_index([
        ('timestamp', pymongo.ASCENDING)
    ])  # for cache download where no place is specified.
    collection.ensure_index([('geocode.placeId', pymongo.ASCENDING),
                             ('timestamp', pymongo.ASCENDING)])

    _writeItemToCache(getTweetCollection, None, tweet.instance_key, tweet.data,
                      tweet.isDataNew, tweet.timestamp, placeId)
    tweet.isDataNew = False

    writingToDatabaseTime = getEpochMs() - timer

    global logTweetWritePerformanceTimer
    if logTweetWritePerformanceTimer.ticked():
        logger.info('Writing tweet to database took %dms' %
                    writingToDatabaseTime)
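Every example in this section measures elapsed time with getEpochMs(), but the helper itself never appears. A minimal sketch of what it is assumed to return (the current Unix epoch in integer milliseconds), consistent with expressions like getEpochMs() - timer feeding a '%dms' log message:

import time

def getEpochMs():
    # Assumed behaviour: current Unix epoch time as whole milliseconds.
    # time.time() returns seconds as a float, so scale and truncate.
    return int(time.time() * 1000)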
Example #2
0
        def func(templateArguments, instance):
            instance = self.application.twitter_instances.getInstanceByInstanceKey(instance)
            if instance is None:
                abort(404, "No active search stream found at this address")

            assert isinstance(instance, TwitterInstance)

            keywords = instance.twitter_thread.twitter_feed.keywords
            geographicalSetupString = instance.geographic_setup_string

            if keywords is None:
                keywords = ''
                keywordsDisplay = '[None]'
            else:
                keywords = ','.join(keywords)
                keywordsDisplay = keywords

            instanceDescription = getInstanceDescription(instance, False)
            instanceDescriptionWithPrefix = getInstanceDescription(instance, True)

            homeLink = getHomeLink(Configuration.PROJECT_NAME)

            templateArguments.update({'instance_description' : instanceDescription,
                                      'instance_description_with_prefix' : instanceDescriptionWithPrefix,
                                      'home_link' : homeLink,
                                      'instance_name': instance.instance_key,
                                      'keywords': keywords,
                                      'keywords_display' : keywordsDisplay,
                                      'instance_map_data' : geographicalSetupString,
                                      'post_address': WEBSITE_ROOT_HTTP + '/manage_instance', # for terminate instance button.
                                      'login_address' : OAuthSignIn.link_info.getPageLink(),
                                      'start_epoch' : instance.constructed_at,
                                      'server_current_epoch' : getEpochMs()})

            return template('locations-map.tpl', templateArguments)
Example #3
0
    def __init__(self, constructedAt=None):
        object.__init__(self)
        self.touch()

        if constructedAt is None:
            self.constructed_at = getEpochMs()
        else:
            self.constructed_at = constructedAt
Example #4
0
        def func(templateArguments, instance, location, provider):
            twitterInstance = self.application.twitter_instances.getInstanceByInstanceKey(
                instance)
            if twitterInstance is None:
                abort(404, "No active search stream found at this address")

            geocode = geocodeFromCacheById(
                GeocodeResultAbstract.buildCacheId(provider, location))
            assert geocode is not None

            instanceDescription = getInstanceDescription(twitterInstance)
            instanceLink = getInstanceLink(twitterInstance)

            homeLink = getHomeLink(Configuration.PROJECT_NAME)

            templateArguments.update(
                {
                    'home_link': homeLink,
                    'instance': instance,
                    'location': location,
                    'provider': provider,
                    'instance_link': instanceLink,
                    'instance_description': instanceDescription,
                    'place': geocode.display_name_short,
                    'place_coord': geocode.coordinate,
                    'startEpoch': twitterInstance.constructed_at,
                    'server_current_epoch': getEpochMs(),
                    'max_tweets': Configuration.MAX_CLIENT_LIVE_TWEETS
                }
            )  # Client needs to offset to this epoch in case its clock is wrong.

            if geocode.has_bounding_box:
                templateArguments.update(
                    {'place_bounding_box': geocode.bounding_box})

            if geocode.has_country:
                templateArguments.update({
                    'place_country_link':
                    LocationsPage.link_info.getPageLink(
                        instance, geocode.country.place_id,
                        geocode.country.provider_id)
                })
                templateArguments.update(
                    {'place_country': geocode.country.display_name_short})

            if geocode.has_continent:
                templateArguments.update({
                    'place_continent_link':
                    LocationsPage.link_info.getPageLink(
                        instance, geocode.continent.place_id,
                        geocode.continent.provider_id)
                })
                templateArguments.update(
                    {'place_continent': geocode.continent.display_name_short})

            return template('location.tpl', templateArguments)
Example #5
0
def fixEpochMsRange(epochMsStartRange, epochMsEndRange):
    if epochMsStartRange is not None and epochMsStartRange > getEpochMs():
        logger.warn('Attempt was made to read from cache with a future epoch, this could cause read/write collision: %d' % epochMsStartRange)
        return None

    if (epochMsStartRange is not None and epochMsEndRange is not None) and epochMsEndRange < epochMsStartRange:
        logger.warn('End epoch is less than start epoch - this is invalid')
        return None

    return epochMsStartRange, epochMsEndRange
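Note that fixEpochMsRange returns None for an invalid range rather than a (start, end) tuple, so a caller that unpacks the result directly (as cursorItemsFromCache does later) will raise a TypeError on bad input. A hedged usage sketch, with hypothetical input values, that checks first:

requestedStart, requestedEnd = 1500000000000, 1500000060000  # hypothetical epoch ms values
fixedRange = fixEpochMsRange(requestedStart, requestedEnd)
if fixedRange is None:
    # Invalid or future range; fall back to an unconstrained query.
    epochMsStartRange = epochMsEndRange = None
else:
    epochMsStartRange, epochMsEndRange = fixedRange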
Example #6
0
def writeTweetToCache(tweet):
    placeId = None
    if tweet.user.is_geocoded:
        placeId = tweet.user.location_geocode.all_geocode_results_cache_id

    timer = getEpochMs()

    collection = getTweetCollection(tweet.instance_key)

    collection.ensure_index([('timestamp', pymongo.ASCENDING)]) # for cache download where no place is specified.
    collection.ensure_index([('geocode.placeId', pymongo.ASCENDING), ('timestamp', pymongo.ASCENDING)])

    _writeItemToCache(getTweetCollection, None, tweet.instance_key, tweet.data, tweet.isDataNew, tweet.timestamp, placeId)
    tweet.isDataNew = False

    writingToDatabaseTime = getEpochMs() - timer

    global logTweetWritePerformanceTimer
    if logTweetWritePerformanceTimer.ticked():
        logger.info('Writing tweet to database took %dms' % writingToDatabaseTime)
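In Examples #1 and #6, logTweetWritePerformanceTimer.ticked() rate-limits the performance log line, and the same pattern reappears later with logUserWritePerformanceTimer. The timer class itself is not shown; a sketch of the assumed ticked() behaviour, under a hypothetical name so it is not mistaken for the real class:

class RateLimitedTimer(object):  # hypothetical; the real timer class is not part of these examples
    def __init__(self, periodMs, startTicked=False):
        self.periodMs = periodMs
        # If startTicked is set, the first call to ticked() fires immediately.
        self.lastTickMs = 0 if startTicked else getEpochMs()

    def ticked(self):
        # Return True at most once per periodMs, False otherwise.
        nowMs = getEpochMs()
        if nowMs - self.lastTickMs >= self.periodMs:
            self.lastTickMs = nowMs
            return True
        return False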
Example #7
0
def fixEpochMsRange(epochMsStartRange, epochMsEndRange):
    if epochMsStartRange is not None and epochMsStartRange > getEpochMs():
        logger.warn(
            'Attempt was made to read from cache with a future epoch, this could cause read/write collision: %d'
            % epochMsStartRange)
        return None

    if (epochMsStartRange is not None and epochMsEndRange
            is not None) and epochMsEndRange < epochMsStartRange:
        logger.warn('End epoch is less than start epoch - this is invalid')
        return None

    return epochMsStartRange, epochMsEndRange
Example #8
0
        def func(templateArguments, instance):
            instance = self.application.twitter_instances.getInstanceByInstanceKey(
                instance)
            if instance is None:
                abort(404, "No active search stream found at this address")

            assert isinstance(instance, TwitterInstance)

            keywords = instance.twitter_thread.twitter_feed.keywords
            geographicalSetupString = instance.geographic_setup_string

            if keywords is None:
                keywords = ''
                keywordsDisplay = '[None]'
            else:
                keywords = ','.join(keywords)
                keywordsDisplay = keywords

            instanceDescription = getInstanceDescription(instance, False)
            instanceDescriptionWithPrefix = getInstanceDescription(
                instance, True)

            homeLink = getHomeLink(Configuration.PROJECT_NAME)

            templateArguments.update({
                'instance_description':
                instanceDescription,
                'instance_description_with_prefix':
                instanceDescriptionWithPrefix,
                'home_link':
                homeLink,
                'instance_name':
                instance.instance_key,
                'keywords':
                keywords,
                'keywords_display':
                keywordsDisplay,
                'instance_map_data':
                geographicalSetupString,
                'post_address':
                WEBSITE_ROOT_HTTP +
                '/manage_instance',  # for terminate instance button.
                'login_address':
                OAuthSignIn.link_info.getPageLink(),
                'start_epoch':
                instance.constructed_at,
                'server_current_epoch':
                getEpochMs()
            })

            return template('locations-map.tpl', templateArguments)
Example #9
0
        def func(templateArguments, instance, location, provider):
            twitterInstance = self.application.twitter_instances.getInstanceByInstanceKey(instance)
            if twitterInstance is None:
                abort(404, "No active search stream found at this address")

            geocode = geocodeFromCacheById(GeocodeResultAbstract.buildCacheId(provider,location))
            assert geocode is not None

            instanceDescription = getInstanceDescription(twitterInstance)
            instanceLink = getInstanceLink(twitterInstance)

            homeLink = getHomeLink(Configuration.PROJECT_NAME)

            templateArguments.update({'home_link' : homeLink,
                                      'instance' : instance,
                                      'location' : location,
                                      'provider' : provider,
                                      'instance_link' : instanceLink,
                                      'instance_description' : instanceDescription,
                                      'place' : geocode.display_name_short,
                                      'place_coord' : geocode.coordinate,
                                      'startEpoch' : twitterInstance.constructed_at,
                                      'server_current_epoch' : getEpochMs(),
                                      'max_tweets' : Configuration.MAX_CLIENT_LIVE_TWEETS}) # Client needs to offset to this epoch in case its clock is wrong.

            if geocode.has_bounding_box:
                templateArguments.update({'place_bounding_box' : geocode.bounding_box})

            if geocode.has_country:
                templateArguments.update({'place_country_link' : LocationsPage.link_info.getPageLink(instance, geocode.country.place_id, geocode.country.provider_id)})
                templateArguments.update({'place_country' : geocode.country.display_name_short})

            if geocode.has_continent:
                templateArguments.update({'place_continent_link' : LocationsPage.link_info.getPageLink(instance, geocode.continent.place_id, geocode.continent.provider_id)})
                templateArguments.update({'place_continent' : geocode.continent.display_name_short})

            return template('location.tpl', templateArguments)
Example #10
0
            def addTemporalEntryForCurrentUser(follower):
                timeId = getTimeIdFromTimestamp(startTime,
                                                Configuration.TEMPORAL_STEP,
                                                getEpochMs())

                userCacheIds = user.location_geocode.all_geocode_results_cache_id
                followerGeocodeResults = follower.location_geocode.all_geocode_results

                for userCacheId in userCacheIds:
                    userPlaceId = GeocodeResultAbstract.getPlaceIdFromCacheId(
                        userCacheId)
                    userProviderId = GeocodeResultAbstract.getProviderIdFromCacheId(
                        userCacheId)

                    for followerGeocodeResult in followerGeocodeResults:
                        followerPlaceId = followerGeocodeResult.place_id
                        followerProviderId = followerGeocodeResult.provider_id
                        followerPlaceType = followerGeocodeResult.place_type

                        instance.addTemporalEntry(temporalCollection, timeId,
                                                  userProviderId, userPlaceId,
                                                  followerProviderId,
                                                  followerPlaceId,
                                                  followerPlaceType)
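This helper, and the influence-data handlers below, bucket timestamps with getTimeIdFromTimestamp(baseEpoch, Configuration.TEMPORAL_STEP, epoch). The function is not included in these examples; a sketch of the assumed bucketing (fixed-width windows measured from the base epoch):

def getTimeIdFromTimestamp(baseEpoch, stepMs, timestamp):
    # Assumed behaviour: index of the stepMs-wide window that timestamp falls
    # into, counted from baseEpoch. Both timestamps are epoch milliseconds.
    return int((timestamp - baseEpoch) // stepMs)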
Example #11
0
        def func(templateArguments, instance):
            twitterInstance = self.application.twitter_instances.getInstanceByInstanceKey(
                instance)
            if twitterInstance is None:
                return dict()

            baseEpoch = twitterInstance.constructed_at

            start_epoch = parseInteger(request.GET.start_epoch, default=None)
            end_epoch = parseInteger(request.GET.end_epoch, default=None)
            source_place_id = parseInteger(request.GET.source_place_id)
            source_provider_id = parseInteger(request.GET.source_provider_id)

            if source_place_id is None:
                logger.error(
                    'Invalid place ID specified while providing influence data: %s'
                    % unicode(source_place_id))
                return dict()

            source_cache_id = GeocodeResultAbstract.buildCacheId(
                source_provider_id, source_place_id)

            temporalCollection = getTemporalInfluenceCollection(instance)

            if start_epoch is not None:
                start_time_id = getTimeIdFromTimestamp(
                    baseEpoch, Configuration.TEMPORAL_STEP, start_epoch)
            else:
                start_time_id = None

            if end_epoch is not None:
                end_time_id = getTimeIdFromTimestamp(
                    baseEpoch, Configuration.TEMPORAL_STEP, end_epoch)
            else:
                end_time_id = None

            timerMs = getEpochMs()
            cacheData = getTemporalRange(temporalCollection,
                                         start_time_id,
                                         end_time_id,
                                         source_cache_id,
                                         preciseFromBack=True,
                                         preciseFromFront=True)
            logger.info('Took %dms to read temporal range data' %
                        (getEpochMs() - timerMs))

            timerMs = getEpochMs()

            geocodeByPlaceType = dict()
            totalsByPlaceType = dict()

            if cacheData is not None:
                for providerId, providerIdData in cacheData.iteritems():
                    providerId = int(providerId)

                    for destination, count in providerIdData.iteritems():
                        split = destination.split('_')
                        placeType = int(split[0])
                        placeId = int(split[1])

                        record = [placeId, providerId, None, None, count, None]

                        geocodeByPlaceType.setdefault(placeType,
                                                      list()).append(record)

                # Process only the records we are going to display.
                for placeType, records in geocodeByPlaceType.iteritems():
                    aux = sorted(records, key=lambda x: x[4], reverse=True)
                    aux = aux[:Configuration.
                              DISPLAY_MAX_NUM_INFLUENCE_RECORDS_PER_PLACE_TYPE]
                    geocodeByPlaceType[placeType] = aux

                    for record in aux:
                        cacheId = GeocodeResultAbstract.buildCacheId(
                            record[1], record[0])
                        geocode = geocodeFromCacheById(cacheId)

                        record[2] = geocode.display_name
                        record[3] = geocode.coordinate
                        count = record[4]
                        record[5] = geocode.bounding_box

                        totalsByPlaceType[placeType] = totalsByPlaceType.get(
                            placeType, 0) + count

            def getResultPart(placeType):
                return {
                    'geocode_list': geocodeByPlaceType.get(placeType, list()),
                    'total': totalsByPlaceType.get(placeType, 0)
                }

            resultData = dict()
            resultData['city'] = getResultPart(
                GeocodeResultAbstract.PlaceTypes.CITY)
            resultData['country'] = getResultPart(
                GeocodeResultAbstract.PlaceTypes.COUNTRY)
            resultData['continent'] = getResultPart(
                GeocodeResultAbstract.PlaceTypes.CONTINENT)

            logger.info('Took %dms to build temporal range result data' %
                        (getEpochMs() - timerMs))

            return {'json': resultData}
Example #12
0
def cursorItemsFromCache(instanceId, getCollectionFunc, placeId=None, epochMsStartRange=None, epochMsEndRange=None, pageNum=None, pageSize=None, typeSpecificQuery=None, projection=None, sortByTimestamp=None, typeSpecificHint=None):
    if sortByTimestamp is None:
        sortByTimestamp = True

    epochMsStartRange, epochMsEndRange = fixEpochMsRange(epochMsStartRange, epochMsEndRange)

    if epochMsEndRange is None:
        upperBoundTimestamp = getEpochMs()
    else:
        upperBoundTimestamp = epochMsEndRange

    if projection is not None and projection.do_query is False:
        return None

    assert instanceId is not None
    assert getCollectionFunc is not None
    collection = getCollectionFunc(instanceId)

    logFormatting = 'IN:%s, P:%s, ES:%s, EE:%s, PN:%s, PS:%s, T:%s, P:%s' % (instanceId, placeId, epochMsStartRange, epochMsEndRange, pageNum, pageSize, typeSpecificQuery, projection)

    timer = Timer()
    logger.info('Attempting to read items from cache (%d) -- %s' % (timer.__hash__(),logFormatting))

    findDic = dict()

    timestampDic = None
    if epochMsEndRange is not None:
        if timestampDic is None:
            timestampDic = dict()

        timestampDic.update({'$lt' : epochMsEndRange})

    if epochMsStartRange is not None:
        if timestampDic is None:
            timestampDic = dict()

        timestampDic.update({'$gte' : epochMsStartRange})

    if timestampDic is not None:
        findDic.update({'timestamp' : timestampDic})

    if placeId is not None:
        findDic.update({'geocode.placeId' : placeId['placeId'],
                        'geocode.providerId' : placeId['providerId']})

    # MongoDB sometimes gets it wrong, particularly with geocode.placeId.
    if typeSpecificHint is None:
        if timestampDic is not None:
            if placeId is not None:
                hint = [('geocode.placeId', pymongo.ASCENDING), ('timestamp', pymongo.ASCENDING)]
            else:
                hint = [('timestamp', pymongo.ASCENDING)]
        else:
            if placeId is not None:
                hint = [('geocode.placeId', pymongo.ASCENDING)]
            else:
                hint = None
    else:
        hint = typeSpecificHint

    if typeSpecificQuery is not None:
        findDic.update(typeSpecificQuery)

    if projection is None:
        cursor = collection.find(findDic,timeout=False).hint(hint)
    else:
        cursor = collection.find(findDic, projection.projection,timeout=False).hint(hint)

    if sortByTimestamp:
        cursor = cursor.sort([('timestamp', pymongo.ASCENDING)])

    if pageSize is not None and pageNum is not None:
        cursor = cursor.skip(pageSize*pageNum).limit(pageSize)

    # We use this to calculate progress through the cursor;
    # it is more efficient than using cursor.count.
    cursor.upper_bound_timestamp = upperBoundTimestamp

    timeTaken = timer.time_since_constructed
    logger.info('Successfully setup cursor in %dms -- %s' % (timeTaken,logFormatting))

    if Configuration.MONGO_EXPLAINS_ENABLED:
        logger.critical('Tweet/User Explain: %s' % unicode(cursor.explain()))

    return cursor
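The upper_bound_timestamp attribute attached to the cursor lets a consumer estimate progress from each item's timestamp instead of paying for cursor.count(). How it is consumed is not shown; a sketch of the assumed calculation, with hypothetical argument names:

def estimateCursorProgress(itemTimestampMs, lowerBoundTimestampMs, cursor):
    # Fraction of the queried time window already covered by a
    # timestamp-sorted cursor. Hypothetical helper for illustration only.
    span = cursor.upper_bound_timestamp - lowerBoundTimestampMs
    if span <= 0:
        return 1.0
    return min(1.0, float(itemTimestampMs - lowerBoundTimestampMs) / span)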
Example #13
0
 def construct_age(self):
     return getEpochMs() - self.constructed_at
Example #14
0
 def touch(self):
     self.timestamp = getEpochMs()
Example #15
0
        def func(templateArguments, instance):
            twitterInstance = self.application.twitter_instances.getInstanceByInstanceKey(instance)
            if twitterInstance is None:
                return dict()

            baseEpoch = twitterInstance.constructed_at

            start_epoch = parseInteger(request.GET.start_epoch, default=None)
            end_epoch = parseInteger(request.GET.end_epoch, default=None)
            source_place_id = parseInteger(request.GET.source_place_id)
            source_provider_id = parseInteger(request.GET.source_provider_id)

            if source_place_id is None:
                logger.error('Invalid place ID specified while providing influence data: %s' % unicode(source_place_id))
                return dict()

            source_cache_id = GeocodeResultAbstract.buildCacheId(source_provider_id, source_place_id)

            temporalCollection = getTemporalInfluenceCollection(instance)

            if start_epoch is not None:
                start_time_id = getTimeIdFromTimestamp(baseEpoch, Configuration.TEMPORAL_STEP, start_epoch)
            else:
                start_time_id = None

            if end_epoch is not None:
                end_time_id = getTimeIdFromTimestamp(baseEpoch, Configuration.TEMPORAL_STEP, end_epoch)
            else:
                end_time_id = None

            timerMs = getEpochMs()
            cacheData = getTemporalRange(temporalCollection, start_time_id, end_time_id, source_cache_id, preciseFromBack=True, preciseFromFront=True)
            logger.info('Took %dms to read temporal range data' % (getEpochMs() - timerMs))

            timerMs = getEpochMs()

            geocodeByPlaceType = dict()
            totalsByPlaceType = dict()

            if cacheData is not None:
                for providerId, providerIdData in cacheData.iteritems():
                    providerId = int(providerId)

                    for destination, count in providerIdData.iteritems():
                        split = destination.split('_')
                        placeType = int(split[0])
                        placeId = int(split[1])

                        record = [placeId,
                                  providerId,
                                  None,
                                  None,
                                  count,
                                  None]

                        geocodeByPlaceType.setdefault(placeType,list()).append(record)

                # Process only the records we are going to display.
                for placeType, records in geocodeByPlaceType.iteritems():
                    aux = sorted(records, key=lambda x: x[4], reverse=True)
                    aux = aux[:Configuration.DISPLAY_MAX_NUM_INFLUENCE_RECORDS_PER_PLACE_TYPE]
                    geocodeByPlaceType[placeType] = aux

                    for record in aux:
                        cacheId = GeocodeResultAbstract.buildCacheId(record[1], record[0])
                        geocode = geocodeFromCacheById(cacheId)

                        record[2] = geocode.display_name
                        record[3] = geocode.coordinate
                        count = record[4]
                        record[5] = geocode.bounding_box

                        totalsByPlaceType[placeType] = totalsByPlaceType.get(placeType,0) + count

            def getResultPart(placeType):
                return {'geocode_list' : geocodeByPlaceType.get(placeType,list()), 'total' : totalsByPlaceType.get(placeType, 0)}

            resultData = dict()
            resultData['city'] =        getResultPart(GeocodeResultAbstract.PlaceTypes.CITY)
            resultData['country'] =     getResultPart(GeocodeResultAbstract.PlaceTypes.COUNTRY)
            resultData['continent'] =   getResultPart(GeocodeResultAbstract.PlaceTypes.CONTINENT)

            logger.info('Took %dms to build temporal range result data' % (getEpochMs() - timerMs))

            return {'json' : resultData}
Example #16
0
def cursorItemsFromCache(instanceId,
                         getCollectionFunc,
                         placeId=None,
                         epochMsStartRange=None,
                         epochMsEndRange=None,
                         pageNum=None,
                         pageSize=None,
                         typeSpecificQuery=None,
                         projection=None,
                         sortByTimestamp=None,
                         typeSpecificHint=None):
    if sortByTimestamp is None:
        sortByTimestamp = True

    epochMsStartRange, epochMsEndRange = fixEpochMsRange(
        epochMsStartRange, epochMsEndRange)

    if epochMsEndRange is None:
        upperBoundTimestamp = getEpochMs()
    else:
        upperBoundTimestamp = epochMsEndRange

    if projection is not None and projection.do_query is False:
        return None

    assert instanceId is not None
    assert getCollectionFunc is not None
    collection = getCollectionFunc(instanceId)

    logFormatting = 'IN:%s, P:%s, ES:%s, EE:%s, PN:%s, PS:%s, T:%s, P:%s' % (
        instanceId, placeId, epochMsStartRange, epochMsEndRange, pageNum,
        pageSize, typeSpecificQuery, projection)

    timer = Timer()
    logger.info('Attempting to read items from cache (%d) -- %s' %
                (timer.__hash__(), logFormatting))

    findDic = dict()

    timestampDic = None
    if epochMsEndRange is not None:
        if timestampDic is None:
            timestampDic = dict()

        timestampDic.update({'$lt': epochMsEndRange})

    if epochMsStartRange is not None:
        if timestampDic is None:
            timestampDic = dict()

        timestampDic.update({'$gte': epochMsStartRange})

    if timestampDic is not None:
        findDic.update({'timestamp': timestampDic})

    if placeId is not None:
        findDic.update(
            dict({
                'geocode.providerId': placeId['providerId'],
                'geocode.placeId': placeId['placeId']
            }))

    # MongoDB sometimes gets it wrong, particularly with geocode.placeId.
    if typeSpecificHint is None:
        if timestampDic is not None:
            if placeId is not None:
                hint = [('geocode.placeId', pymongo.ASCENDING),
                        ('timestamp', pymongo.ASCENDING)]
            else:
                hint = [('timestamp', pymongo.ASCENDING)]
        else:
            if placeId is not None:
                hint = [('geocode.placeId', pymongo.ASCENDING)]
            else:
                hint = None
    else:
        hint = typeSpecificHint

    if typeSpecificQuery is not None:
        findDic.update(typeSpecificQuery)

    if projection is None:
        cursor = collection.find(findDic).hint(hint)
    else:
        cursor = collection.find(findDic, projection.projection).hint(hint)

    if sortByTimestamp:
        cursor = cursor.sort([('timestamp', pymongo.ASCENDING)])

    if pageSize is not None and pageNum is not None:
        cursor = cursor.skip(pageSize * pageNum).limit(pageSize)

    # We use this to calculate progress through the cursor;
    # it is more efficient than using cursor.count.
    cursor.upper_bound_timestamp = upperBoundTimestamp

    timeTaken = timer.time_since_constructed
    logger.info('Successfully setup cursor in %dms -- %s' %
                (timeTaken, logFormatting))

    if Configuration.MONGO_EXPLAINS_ENABLED:
        logger.critical('Tweet/User Explain: %s' % unicode(cursor.explain()))

    return cursor
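A hedged usage sketch of cursorItemsFromCache: read the last hour of tweets for one place, 500 per page. The instance key, place/provider IDs and the consumer are placeholders; getTweetCollection is the same collection getter used in writeTweetToCache above.

nowMs = getEpochMs()
cursor = cursorItemsFromCache('example-instance-key',      # hypothetical instance key
                              getTweetCollection,
                              placeId={'placeId': 12345, 'providerId': 1},  # hypothetical IDs
                              epochMsStartRange=nowMs - 60 * 60 * 1000,
                              epochMsEndRange=nowMs,
                              pageNum=0,
                              pageSize=500)
if cursor is not None:
    for item in cursor:
        handleCachedItem(item)  # hypothetical consumer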
Example #17
0
 def age(self):
     """ @return the age of the item. """
     return getEpochMs() - self.timestamp
Example #18
0
            def __init__(self, age=None):
                Timestamped.__init__(self)

                if age is not None:
                    self.timestamp = getEpochMs() - age
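Examples #14 and #17 read like methods of a small Timestamped mixin, which Example #18 then subclasses with an optional age offset. A sketch of how the base class is assumed to fit together (whether age is exposed as a property is not shown in the source):

class Timestamped(object):
    def __init__(self):
        self.touch()

    def touch(self):
        # Refresh the timestamp to 'now'.
        self.timestamp = getEpochMs()

    def age(self):
        """ @return the age of the item. """
        return getEpochMs() - self.timestamp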
Example #19
0
def writeUserToCache(user, doUpdate):
    assert isinstance(user, User)

    # Used with $set operation.
    setFields = dict()

    # Used with $addToSet operation.
    addToSetFields = dict()

    if user.is_followers_loaded:
        setFields.update({'is_followers_loaded' : True})

    if user.is_followee:
        setFields.update({'is_followee' : True})

        followeeIds = [x.id for x in user.known_followees]
        addToSetFields.update({'known_followees' : {'$each' : followeeIds}})

    if user.has_twitter_place:
        setFields.update({'twitter_place' : user.twitter_place.data})

    if user.is_associated_with_tweet:
        setFields.update({'is_associated_with_tweet' : True})

    if user.last_follower_enrichment_error is not None:
        setFields.update({'last_follower_enrichment_error' : user.last_follower_enrichment_error})

    if user.queued_for_follower_enrichment:
        p = user.follower_enrichment_progress

        queue_progress, user_progress, user_id_progress, enrichment_progress_description, queue_waiting_for_user = p.getTuple()
        if queue_waiting_for_user is not None:
            queue_waiting_for_user = queue_waiting_for_user.id

        setFields.update({'queued_for_follower_enrichment' : user.queued_for_follower_enrichment})

        setFields.update({'follower_enrichment_progress' : (queue_progress,
                                                            user_progress,
                                                            user_id_progress,
                                                            enrichment_progress_description,
                                                            queue_waiting_for_user)})
    else:
        # Remove redundant information.
        if user.is_followers_loaded:
            setFields.update({'follower_enrichment_progress' : None})
            setFields.update({'queued_for_follower_enrichment' : False})

    placeId = None
    if user.is_geocoded:
        placeId = user.location_geocode.all_geocode_results_cache_id

        if user.geocode_bias is not None:
            setFields.update({'geocode_bias' : user.geocode_bias})

        if user.geocoded_from is not None:
            setFields.update({'geocoded_from' : user.geocoded_from})

    if user.has_analysers:
        analysis = [{x[0] : x[1].results_cacheable} for x in user.analysers.iteritems()]
        addToSetFields.update({'analysis' : {'$each' : analysis}})

    theQuery = dict()
    if len(setFields) > 0:
        theQuery.update({'$set' : setFields})

    if len(addToSetFields) > 0:
        theQuery.update({'$addToSet' : addToSetFields})


    collection = getUserCollection(user.instance_key)

    timer = getEpochMs()

    # This is for the user page where followers are looked up.
    # Not sure if sparse=True does anything; the pymongo docs are not clear on how to create a sparse index.
    collection.ensure_index([('known_followees', pymongo.ASCENDING)], sparse = True)

    # For short follow information download.
    # Note: Only place ID is used because indexes are expensive on database RAM,
    # and we don't really need to do provider ID too since it is extremely rare
    # that two providers will have the same place ID. Also note I had some trouble
    # getting MongoDB to use an index with provider ID in it (not sure why, but it
    # wouldn't use the index).
    collection.ensure_index([('is_followers_loaded', pymongo.ASCENDING), ('timestamp', pymongo.ASCENDING)], sparse = True)
    collection.ensure_index([('geocode.placeId', pymongo.ASCENDING), ('is_followers_loaded', pymongo.ASCENDING), ('timestamp', pymongo.ASCENDING)], sparse = True)

    ensureIndexTime = getEpochMs() - timer
    timer = getEpochMs()

    _writeItemToCache(getUserCollection, user.id, user.instance_key, user.data, user.isDataNew, user.timestamp, placeId, theQuery, doUpdate)

    writingToDatabaseTime = getEpochMs() - timer

    # This is an optimization: the next time we see this same user object we won't push its data.
    user.isDataNew = False

    global logUserWritePerformanceTimer
    if logUserWritePerformanceTimer.ticked():
        logger.info('Writing user to database took %dms ensuring index, %dms writing to database' % (ensureIndexTime, writingToDatabaseTime))
Example #20
0
def writeUserToCache(user, doUpdate):
    assert isinstance(user, User)

    # Used with $set operation.
    setFields = dict()

    # Used with $addToSet operation.
    addToSetFields = dict()

    if user.is_followers_loaded:
        setFields.update({'is_followers_loaded': True})

    if user.is_followee:
        setFields.update({'is_followee': True})

        followeeIds = [x.id for x in user.known_followees]
        addToSetFields.update({'known_followees': {'$each': followeeIds}})

    if user.has_twitter_place:
        setFields.update({'twitter_place': user.twitter_place.data})

    if user.is_associated_with_tweet:
        setFields.update({'is_associated_with_tweet': True})

    if user.last_follower_enrichment_error is not None:
        setFields.update({
            'last_follower_enrichment_error':
            user.last_follower_enrichment_error
        })

    if user.queued_for_follower_enrichment:
        p = user.follower_enrichment_progress

        queue_progress, user_progress, user_id_progress, enrichment_progress_description, queue_waiting_for_user = p.getTuple(
        )
        if queue_waiting_for_user is not None:
            queue_waiting_for_user = queue_waiting_for_user.id

        setFields.update({
            'queued_for_follower_enrichment':
            user.queued_for_follower_enrichment
        })

        setFields.update({
            'follower_enrichment_progress':
            (queue_progress, user_progress, user_id_progress,
             enrichment_progress_description, queue_waiting_for_user)
        })
    else:
        # Remove redundant information.
        if user.is_followers_loaded:
            setFields.update({'follower_enrichment_progress': None})
            setFields.update({'queued_for_follower_enrichment': False})

    placeId = None
    if user.is_geocoded:
        placeId = user.location_geocode.all_geocode_results_cache_id

        if user.geocode_bias is not None:
            setFields.update({'geocode_bias': user.geocode_bias})

        if user.geocoded_from is not None:
            setFields.update({'geocoded_from': user.geocoded_from})

    if user.has_analysers:
        analysis = [{
            x[0]: x[1].results_cacheable
        } for x in user.analysers.iteritems()]
        addToSetFields.update({'analysis': {'$each': analysis}})

    theQuery = dict()
    if len(setFields) > 0:
        theQuery.update({'$set': setFields})

    if len(addToSetFields) > 0:
        theQuery.update({'$addToSet': addToSetFields})

    collection = getUserCollection(user.instance_key)

    timer = getEpochMs()

    # This is for the user page where followers are looked up.
    # Not sure if sparse=True does anything; the pymongo docs are not clear on how to create a sparse index.
    collection.ensure_index([('known_followees', pymongo.ASCENDING)],
                            sparse=True)

    # For short follow information download.
    # Note: Only place ID is used because indexes are expensive on database RAM,
    # and we don't really need to do provider ID too since it is extremely rare
    # that two providers will have the same place ID. Also note I had some trouble
    # getting MongoDB to use an index with provider ID in it (not sure why, but it
    # wouldn't use the index properly, see: https://stackoverflow.com/questions/41085666/mongodb-explains-totalkeysexamined-more-than-limit).
    collection.ensure_index([('is_followers_loaded', pymongo.ASCENDING),
                             ('timestamp', pymongo.ASCENDING)],
                            sparse=True)
    collection.ensure_index([('geocode.placeId', pymongo.ASCENDING),
                             ('is_followers_loaded', pymongo.ASCENDING),
                             ('timestamp', pymongo.ASCENDING)],
                            sparse=True)

    ensureIndexTime = getEpochMs() - timer
    timer = getEpochMs()

    _writeItemToCache(getUserCollection, user.id, user.instance_key, user.data,
                      user.isDataNew, user.timestamp, placeId, theQuery,
                      doUpdate)

    writingToDatabaseTime = getEpochMs() - timer

    # This is an optimization: the next time we see this same user object we won't push its data.
    user.isDataNew = False

    global logUserWritePerformanceTimer
    if logUserWritePerformanceTimer.ticked():
        logger.info(
            'Writing user to database took %dms ensuring index, %dms writing to database'
            % (ensureIndexTime, writingToDatabaseTime))
Example #21
0
    def manageSocket(self, webSocket, tupleArguments, socketId):
        instanceId = tupleArguments[0]

        mainControl = webSocket.controls[self.key]
        assert isinstance(mainControl, DocumentControl)

        bytesPerBatch       =        parseInteger(request.GET.batchSizeBytes, maximum=1024 * 1024 * 256, default=1024 * 1024 * 1)
        tweetInfo           =        parseBoolean(request.GET.tweet_info, False)
        followerInfo        =        parseBoolean(request.GET.follower_info_full, False)
        followerInfoShort   =        parseBoolean(request.GET.follower_info_short, False)
        providerId          =        parseInteger(request.GET.provider_id)
        placeId             =        parseInteger(request.GET.place_id)
        startEpoch          =        parseInteger(request.GET.start_epoch)
        endEpoch            =        parseInteger(request.GET.end_epoch)

        if placeId is not None and providerId is not None:
            placeCacheId = GeocodeResultAbstract.buildCacheId(providerId, placeId)
        else:
            placeCacheId = None

        if followerInfo:
            tweetInfo = False
            followerInfoShort = False
        elif tweetInfo:
            followerInfo = False
            followerInfoShort = False
        elif followerInfoShort:
            followerInfo = False
            tweetInfo = False
        else:
            followerInfo = True


        userTunnelId = 'user_tunnel'
        tweetTunnelId = None

        if tweetInfo:
            tweetTunnelId = 'tweet_tunnel'

        def openRequiredTunnels():
            if tweetInfo:
                return self.openTunnels(webSocket)
            else:
                return self.openTunnel(userTunnelId, webSocket)

        if not openRequiredTunnels():
            logger.error('Failed to open initial tunnels')
            return False

        if tweetInfo:
            followerIdsFlag = False
            followeeIdsFlag = False
            analysisFlag = False
            isFollowersLoadedRequirement = None
            associatedWithTweetRequirement = True
            recursiveCacheFlag = False
            followerIdsProjection = None
            outputType = 1 # for csv.
        elif followerInfo:
            followerIdsFlag = True
            followeeIdsFlag = True
            analysisFlag = True
            isFollowersLoadedRequirement = True
            associatedWithTweetRequirement = None
            recursiveCacheFlag = True
            followerIdsProjection = None # this gives us all data on each follower.
            outputType = 2
        elif followerInfoShort:
            followerIdsFlag = True
            followeeIdsFlag = True
            followerIdsProjection = NoQueryProjection()
            analysisFlag = True
            isFollowersLoadedRequirement = True
            associatedWithTweetRequirement = None
            recursiveCacheFlag = True
            outputType = 3
        else:
            raise NotImplementedError()

        userProjection = UserProjection(True,
                                        True,
                                        None,
                                        True,
                                        followerIdsFlag,
                                        followerIdsProjection,
                                        followeeIdsFlag,
                                        UserProjection.Id(),
                                        True,
                                        False,
                                        False,
                                        True,
                                        True,
                                        False,
                                        False,
                                        False,
                                        False,
                                        analysisFlag)

        isFirstIteration = [True]

        twitterInstance = self.application.twitter_instances.getInstanceByInstanceKey(instanceId)
        if twitterInstance is None:
            return False

        twitterSession = twitterInstance.twitter_thread.twitter_session
        progressBarTotalId = 'progress-bar-total'
        progressBarCurrentBatchId = 'progress-bar-current-batch'

        signaler = EventSignaler(self.key, [webSocket])

        updateProgressBarFreq = Timer(400,True)

        def sendData(tunnelId, data):
            self.sendDataOnTunnel(webSocket, tunnelId, (unicode(data) + '\r\n'))

        def sendHeader():
            sendData(userTunnelId, getUserHeader(outputType))

            if tweetTunnelId is not None:
                sendData(tweetTunnelId, getTweetHeader())

        def doProgressBarChange(percentage, progressBarId):
            mainControl.executeJavascript('$("#%s").width("%.3f%%");' % (progressBarId, percentage))

        sendHeader()

        counter = [0]
        previousCounter = [0]
        def updateSocket(controls,
                         data,
                         bytesCounter=counter,
                         bytesPerBatch=bytesPerBatch,
                         previousCounter=previousCounter,
                         isFirstIteration=isFirstIteration):
            user = data['user_data']
            tweet = data['tweet_data']
            percentage = data['percentage']
            isFinished = data['isFinished']

            control = controls[self.key]
            assert isinstance(control, DocumentControl)

            def updateProgressBars():
                previousCounter[0] = thisCounter = bytesCounter[0]

                percentageCurrentBatch = float(thisCounter) / float(bytesPerBatch) * 100
                percentageTotal = percentage

                if percentageTotal >= 100:
                    percentageCurrentBatch = 100

                if isFirstIteration[0] and percentageCurrentBatch < percentageTotal:
                    percentageCurrentBatch = percentageTotal

                doProgressBarChange(percentageTotal, progressBarTotalId)
                doProgressBarChange(percentageCurrentBatch, progressBarCurrentBatchId)

            if previousCounter[0] != bytesCounter[0] and updateProgressBarFreq.ticked():
                updateProgressBars()

            dataToSendToClient = ''
            if user is not None:
                assert isinstance(user,User)
                dataToSendToClient = getUserRepresentation(user, outputType)
                sendData(userTunnelId, dataToSendToClient)

            if tweet is not None:
                assert isinstance(tweet, Tweet)
                dataToSendToClient = getTweetRepresentation(tweet)
                sendData(tweetTunnelId, dataToSendToClient)

            dataLength = len(dataToSendToClient)
            bytesCounter[0] += dataLength

            if bytesCounter[0] > bytesPerBatch or isFinished:
                updateProgressBars()
                isFirstIteration[0] = False

                bytesCounter[0] = 0
                mainControl.executeJavascript('onBatchEnd();')

                self.closeTunnels(webSocket)

                if not isFinished:
                    logger.debug('Waiting to receive next data provider')
                    if not openRequiredTunnels():
                        logger.warning('Failed to reinitialize tunnel slots')
                        webSocket.cleanup()
                        return

                    sendHeader()
                else:
                    mainControl.executeJavascript('onFinished();')

                    webSocket.cleanup()

        def onCacheIteration(iteration, total, isFinished, data, iteratorId):
            # Don't write followee data to output as it would duplicate a lot of data.
            if iteratorId == 'followee':
                data = None

            running = not webSocket.is_cleaned_up
            if running:
                # We need to do this so that if the client closes the socket we are notified.
                webSocket.pingFreqLimited()

                percentage = getPercentage(iteration, total)
                dataId = None
                if data is not None:
                    dataId = data.id
                #logger.info('iteration %.2f of %.2f (%.1f%%) - it: %s, userId: %s' % (iteration, total, percentage,iteratorId,dataId))

                user = None
                tweet = None
                if data is None:
                    pass
                elif isinstance(data, User):
                    user = data
                elif isinstance(data, Tweet):
                    tweet = data
                    if tweet.has_user:
                        user = tweet.user
                else:
                    logger.error('Invalid data from cache, type: %s' % type(data))
                    return running

                signaler.signalEvent({SignalActions.SOCKET: updateSocket, 'percentage' : percentage, 'user_data' : user, 'tweet_data' : tweet, 'isFinished' : isFinished})
                gevent.sleep(0)
            else:
                logger.debug('Ending cache download prematurely')

            return running

        logger.debug('Starting to read data from cache...')

        # This makes sure the search is finite.
        epochNow = getEpochMs()
        if endEpoch is None or endEpoch > epochNow:
            endEpoch = epochNow

        if followerInfo or followerInfoShort:
            readUsersFromCache(twitterSession,
                               instanceId,
                               placeId = placeCacheId,
                               epochMsStartRange=startEpoch,
                               epochMsEndRange=endEpoch,
                               isFollowersLoadedRequirement=isFollowersLoadedRequirement,
                               associatedWithTweetRequirement=associatedWithTweetRequirement,
                               onIterationFunc=onCacheIteration,
                               recursive=recursiveCacheFlag,
                               userProjection=userProjection)
        else:
            readTweetsFromCache(twitterSession,
                                instanceId,
                                placeId = placeCacheId,
                                epochMsStartRange=startEpoch,
                                epochMsEndRange=endEpoch,
                                onIterationFunc=onCacheIteration,
                                retrieveUserData=True,
                                userProjection=userProjection)

        # We want to clean up everything now since we are done.
        return False