def writeTweetToCache(tweet):
    """Persist one tweet into the per-instance MongoDB cache collection.

    Ensures the indexes needed for later cache reads exist, writes the tweet
    payload, and periodically logs how long the database write took.
    """
    # Tag the record with the author's geocode cache id when available.
    placeId = tweet.user.location_geocode.all_geocode_results_cache_id if tweet.user.is_geocoded else None

    startMs = getEpochMs()

    collection = getTweetCollection(tweet.instance_key)
    # for cache download where no place is specified.
    collection.ensure_index([('timestamp', pymongo.ASCENDING)])
    # for place-filtered, time-ordered cache reads.
    collection.ensure_index([('geocode.placeId', pymongo.ASCENDING),
                             ('timestamp', pymongo.ASCENDING)])

    _writeItemToCache(getTweetCollection, None, tweet.instance_key, tweet.data,
                      tweet.isDataNew, tweet.timestamp, placeId)
    # Optimization: don't re-push identical payload data on later writes.
    tweet.isDataNew = False

    elapsedMs = getEpochMs() - startMs
    global logTweetWritePerformanceTimer
    if logTweetWritePerformanceTimer.ticked():
        logger.info('Writing tweet to database took %dms' % elapsedMs)
def func(templateArguments, instance):
    """Render the locations-map page for the given twitter instance key."""
    instance = self.application.twitter_instances.getInstanceByInstanceKey(instance)
    if instance is None:
        abort(404, "No active search stream found at this address")
    assert isinstance(instance, TwitterInstance)

    keywords = instance.twitter_thread.twitter_feed.keywords
    geographicalSetupString = instance.geographic_setup_string

    if keywords is None:
        keywords = ''
        keywordsDisplay = '[None]'
    else:
        keywords = ','.join(keywords)
        keywordsDisplay = keywords

    templateArguments.update({
        'instance_description': getInstanceDescription(instance, False),
        'instance_description_with_prefix': getInstanceDescription(instance, True),
        'home_link': getHomeLink(Configuration.PROJECT_NAME),
        'instance_name': instance.instance_key,
        'keywords': keywords,
        'keywords_display': keywordsDisplay,
        'instance_map_data': geographicalSetupString,
        # for terminate instance button.
        'post_address': WEBSITE_ROOT_HTTP + '/manage_instance',
        'login_address': OAuthSignIn.link_info.getPageLink(),
        'start_epoch': instance.constructed_at,
        'server_current_epoch': getEpochMs(),
    })
    return template('locations-map.tpl', templateArguments)
def __init__(self, constructedAt=None):
    """Initialise the object: stamp it and record its construction time.

    constructedAt -- epoch ms to record as construction time; defaults to now.
    """
    object.__init__(self)
    self.touch()
    self.constructed_at = getEpochMs() if constructedAt is None else constructedAt
def func(templateArguments, instance, location, provider):
    """Render the location page for one geocoded place of an instance."""
    twitterInstance = self.application.twitter_instances.getInstanceByInstanceKey(instance)
    if twitterInstance is None:
        abort(404, "No active search stream found at this address")

    geocode = geocodeFromCacheById(GeocodeResultAbstract.buildCacheId(provider, location))
    assert geocode is not None

    templateArguments.update({
        'home_link': getHomeLink(Configuration.PROJECT_NAME),
        'instance': instance,
        'location': location,
        'provider': provider,
        'instance_link': getInstanceLink(twitterInstance),
        'instance_description': getInstanceDescription(twitterInstance),
        'place': geocode.display_name_short,
        'place_coord': geocode.coordinate,
        'startEpoch': twitterInstance.constructed_at,
        # Client needs to offset to this epoch in case its clock is wrong.
        'server_current_epoch': getEpochMs(),
        'max_tweets': Configuration.MAX_CLIENT_LIVE_TWEETS,
    })

    if geocode.has_bounding_box:
        templateArguments['place_bounding_box'] = geocode.bounding_box

    if geocode.has_country:
        templateArguments['place_country_link'] = LocationsPage.link_info.getPageLink(
            instance, geocode.country.place_id, geocode.country.provider_id)
        templateArguments['place_country'] = geocode.country.display_name_short

    if geocode.has_continent:
        templateArguments['place_continent_link'] = LocationsPage.link_info.getPageLink(
            instance, geocode.continent.place_id, geocode.continent.provider_id)
        templateArguments['place_continent'] = geocode.continent.display_name_short

    return template('location.tpl', templateArguments)
def fixEpochMsRange(epochMsStartRange, epochMsEndRange):
    """Validate an epoch-ms range before it is used to query the cache.

    epochMsStartRange -- inclusive lower bound in epoch ms, or None.
    epochMsEndRange -- exclusive upper bound in epoch ms, or None.

    Returns a (start, end) tuple.  An invalid range (start in the future, or
    end before start) is replaced by (None, None) so the caller falls back to
    an unfiltered query.  A tuple is always returned: callers such as
    cursorItemsFromCache unpack the result directly, so the previous bare
    ``return None`` raised TypeError at the unpack site.
    """
    if epochMsStartRange is not None and epochMsStartRange > getEpochMs():
        logger.warn('Attempt was made to read from cache with a future epoch, this could cause read/write collision: %d' % epochMsStartRange)
        return None, None

    if (epochMsStartRange is not None and epochMsEndRange is not None) and epochMsEndRange < epochMsStartRange:
        logger.warn('End epoch is less than start epoch - this is invalid')
        return None, None

    return epochMsStartRange, epochMsEndRange
def writeTweetToCache(tweet):
    """Persist one tweet into the per-instance MongoDB cache collection.

    Ensures the indexes needed for later cache reads exist, writes the tweet
    payload, and periodically logs how long the database write took.
    """
    # Tag the record with the author's geocode cache id when available.
    placeId = tweet.user.location_geocode.all_geocode_results_cache_id if tweet.user.is_geocoded else None

    startMs = getEpochMs()

    collection = getTweetCollection(tweet.instance_key)
    # for cache download where no place is specified.
    collection.ensure_index([('timestamp', pymongo.ASCENDING)])
    # for place-filtered, time-ordered cache reads.
    collection.ensure_index([('geocode.placeId', pymongo.ASCENDING),
                             ('timestamp', pymongo.ASCENDING)])

    _writeItemToCache(getTweetCollection, None, tweet.instance_key, tweet.data,
                      tweet.isDataNew, tweet.timestamp, placeId)
    # Optimization: don't re-push identical payload data on later writes.
    tweet.isDataNew = False

    elapsedMs = getEpochMs() - startMs
    global logTweetWritePerformanceTimer
    if logTweetWritePerformanceTimer.ticked():
        logger.info('Writing tweet to database took %dms' % elapsedMs)
def fixEpochMsRange(epochMsStartRange, epochMsEndRange):
    """Validate an epoch-ms range before it is used to query the cache.

    epochMsStartRange -- inclusive lower bound in epoch ms, or None.
    epochMsEndRange -- exclusive upper bound in epoch ms, or None.

    Returns a (start, end) tuple.  An invalid range (start in the future, or
    end before start) is replaced by (None, None) so the caller falls back to
    an unfiltered query.  A tuple is always returned: callers such as
    cursorItemsFromCache unpack the result directly, so the previous bare
    ``return None`` raised TypeError at the unpack site.
    """
    if epochMsStartRange is not None and epochMsStartRange > getEpochMs():
        logger.warn(
            'Attempt was made to read from cache with a future epoch, this could cause read/write collision: %d'
            % epochMsStartRange)
        return None, None

    if (epochMsStartRange is not None and epochMsEndRange is not None) and epochMsEndRange < epochMsStartRange:
        logger.warn('End epoch is less than start epoch - this is invalid')
        return None, None

    return epochMsStartRange, epochMsEndRange
def func(templateArguments, instance):
    """Render the locations-map page for the given twitter instance key."""
    instance = self.application.twitter_instances.getInstanceByInstanceKey(instance)
    if instance is None:
        abort(404, "No active search stream found at this address")
    assert isinstance(instance, TwitterInstance)

    keywords = instance.twitter_thread.twitter_feed.keywords
    geographicalSetupString = instance.geographic_setup_string

    if keywords is None:
        keywords = ''
        keywordsDisplay = '[None]'
    else:
        keywords = ','.join(keywords)
        keywordsDisplay = keywords

    templateArguments.update({
        'instance_description': getInstanceDescription(instance, False),
        'instance_description_with_prefix': getInstanceDescription(instance, True),
        'home_link': getHomeLink(Configuration.PROJECT_NAME),
        'instance_name': instance.instance_key,
        'keywords': keywords,
        'keywords_display': keywordsDisplay,
        'instance_map_data': geographicalSetupString,
        # for terminate instance button.
        'post_address': WEBSITE_ROOT_HTTP + '/manage_instance',
        'login_address': OAuthSignIn.link_info.getPageLink(),
        'start_epoch': instance.constructed_at,
        'server_current_epoch': getEpochMs(),
    })
    return template('locations-map.tpl', templateArguments)
def func(templateArguments, instance, location, provider):
    """Render the location page for one geocoded place of an instance."""
    twitterInstance = self.application.twitter_instances.getInstanceByInstanceKey(instance)
    if twitterInstance is None:
        abort(404, "No active search stream found at this address")

    geocode = geocodeFromCacheById(GeocodeResultAbstract.buildCacheId(provider, location))
    assert geocode is not None

    templateArguments.update({
        'home_link': getHomeLink(Configuration.PROJECT_NAME),
        'instance': instance,
        'location': location,
        'provider': provider,
        'instance_link': getInstanceLink(twitterInstance),
        'instance_description': getInstanceDescription(twitterInstance),
        'place': geocode.display_name_short,
        'place_coord': geocode.coordinate,
        'startEpoch': twitterInstance.constructed_at,
        # Client needs to offset to this epoch in case its clock is wrong.
        'server_current_epoch': getEpochMs(),
        'max_tweets': Configuration.MAX_CLIENT_LIVE_TWEETS,
    })

    if geocode.has_bounding_box:
        templateArguments['place_bounding_box'] = geocode.bounding_box

    if geocode.has_country:
        templateArguments['place_country_link'] = LocationsPage.link_info.getPageLink(
            instance, geocode.country.place_id, geocode.country.provider_id)
        templateArguments['place_country'] = geocode.country.display_name_short

    if geocode.has_continent:
        templateArguments['place_continent_link'] = LocationsPage.link_info.getPageLink(
            instance, geocode.continent.place_id, geocode.continent.provider_id)
        templateArguments['place_continent'] = geocode.continent.display_name_short

    return template('location.tpl', templateArguments)
def addTemporalEntryForCurrentUser(follower):
    """Record temporal influence entries linking the enclosing user's geocoded
    locations to every geocode result of the given follower.

    Uses startTime, user, instance and temporalCollection from the enclosing
    scope.
    """
    timeId = getTimeIdFromTimestamp(startTime, Configuration.TEMPORAL_STEP, getEpochMs())
    userCacheIds = user.location_geocode.all_geocode_results_cache_id
    followerResults = follower.location_geocode.all_geocode_results

    for cacheId in userCacheIds:
        sourcePlaceId = GeocodeResultAbstract.getPlaceIdFromCacheId(cacheId)
        sourceProviderId = GeocodeResultAbstract.getProviderIdFromCacheId(cacheId)
        for result in followerResults:
            instance.addTemporalEntry(temporalCollection, timeId,
                                      sourceProviderId, sourcePlaceId,
                                      result.provider_id, result.place_id,
                                      result.place_type)
def func(templateArguments, instance):
    """Build influence data (per-place-type geocode lists and totals) for an
    instance over an optional epoch range, returned as JSON-ready dict."""
    twitterInstance = self.application.twitter_instances.getInstanceByInstanceKey(instance)
    if twitterInstance is None:
        return dict()

    baseEpoch = twitterInstance.constructed_at

    start_epoch = parseInteger(request.GET.start_epoch, default=None)
    end_epoch = parseInteger(request.GET.end_epoch, default=None)
    source_place_id = parseInteger(request.GET.source_place_id)
    source_provider_id = parseInteger(request.GET.source_provider_id)

    if source_place_id is None:
        logger.error('Invalid place ID specified while providing influence data: %s' % unicode(source_place_id))
        return dict()

    source_cache_id = GeocodeResultAbstract.buildCacheId(source_provider_id, source_place_id)
    temporalCollection = getTemporalInfluenceCollection(instance)

    # Translate the requested epochs (if any) into time-step ids.
    start_time_id = getTimeIdFromTimestamp(baseEpoch, Configuration.TEMPORAL_STEP, start_epoch) if start_epoch is not None else None
    end_time_id = getTimeIdFromTimestamp(baseEpoch, Configuration.TEMPORAL_STEP, end_epoch) if end_epoch is not None else None

    timerMs = getEpochMs()
    cacheData = getTemporalRange(temporalCollection, start_time_id, end_time_id,
                                 source_cache_id, preciseFromBack=True, preciseFromFront=True)
    logger.info('Took %dms to read temporal range data' % (getEpochMs() - timerMs))

    timerMs = getEpochMs()

    geocodeByPlaceType = dict()
    totalsByPlaceType = dict()

    if cacheData is not None:
        # Records are [placeId, providerId, displayName, coord, count, bbox];
        # names/coords/bboxes are filled in later, only for displayed records.
        for providerId, providerIdData in cacheData.iteritems():
            providerId = int(providerId)
            for destination, count in providerIdData.iteritems():
                split = destination.split('_')
                placeType = int(split[0])
                placeId = int(split[1])
                record = [placeId, providerId, None, None, count, None]
                geocodeByPlaceType.setdefault(placeType, list()).append(record)

    # Process only the records we are going to display.
    for placeType, records in geocodeByPlaceType.iteritems():
        topRecords = sorted(records, key=lambda x: x[4], reverse=True)
        topRecords = topRecords[:Configuration.DISPLAY_MAX_NUM_INFLUENCE_RECORDS_PER_PLACE_TYPE]
        geocodeByPlaceType[placeType] = topRecords

        for record in topRecords:
            cacheId = GeocodeResultAbstract.buildCacheId(record[1], record[0])
            geocode = geocodeFromCacheById(cacheId)
            record[2] = geocode.display_name
            record[3] = geocode.coordinate
            count = record[4]
            record[5] = geocode.bounding_box
            totalsByPlaceType[placeType] = totalsByPlaceType.get(placeType, 0) + count

    def getResultPart(placeType):
        # One result section per place type: displayed records plus total.
        return {'geocode_list': geocodeByPlaceType.get(placeType, list()),
                'total': totalsByPlaceType.get(placeType, 0)}

    resultData = dict()
    resultData['city'] = getResultPart(GeocodeResultAbstract.PlaceTypes.CITY)
    resultData['country'] = getResultPart(GeocodeResultAbstract.PlaceTypes.COUNTRY)
    resultData['continent'] = getResultPart(GeocodeResultAbstract.PlaceTypes.CONTINENT)

    logger.info('Took %dms to build temporal range result data' % (getEpochMs() - timerMs))

    return {'json': resultData}
def cursorItemsFromCache(instanceId, getCollectionFunc, placeId=None, epochMsStartRange=None, epochMsEndRange=None, pageNum=None, pageSize=None, typeSpecificQuery=None, projection=None, sortByTimestamp=None, typeSpecificHint=None):
    """Build a MongoDB cursor over cached items for an instance.

    Filters optionally by place and epoch-ms range, applies paging and a
    timestamp sort, and forces an index hint matching the filters in use.
    Returns the cursor (with upper_bound_timestamp attached for progress
    calculation), or None when the projection says not to query at all.
    """
    if sortByTimestamp is None:
        sortByTimestamp = True

    epochMsStartRange, epochMsEndRange = fixEpochMsRange(epochMsStartRange, epochMsEndRange)

    # Upper bound used by callers to estimate progress through the cursor;
    # more efficient than cursor.count.
    upperBoundTimestamp = getEpochMs() if epochMsEndRange is None else epochMsEndRange

    if projection is not None and projection.do_query is False:
        return None

    assert instanceId is not None
    assert getCollectionFunc is not None

    collection = getCollectionFunc(instanceId)

    logFormatting = 'IN:%s, P:%s, ES:%s, EE:%s, PN:%s, PS:%s, T:%s, P:%s' % (instanceId, placeId, epochMsStartRange, epochMsEndRange, pageNum, pageSize, typeSpecificQuery, projection)
    timer = Timer()
    logger.info('Attempting to read items from cache (%d) -- %s' % (timer.__hash__(), logFormatting))

    findDic = dict()

    # Assemble the timestamp range sub-query, if any bound was supplied.
    timestampDic = None
    if epochMsEndRange is not None:
        timestampDic = {'$lt': epochMsEndRange}
    if epochMsStartRange is not None:
        if timestampDic is None:
            timestampDic = dict()
        timestampDic['$gte'] = epochMsStartRange
    if timestampDic is not None:
        findDic['timestamp'] = timestampDic

    if placeId is not None:
        findDic['geocode.placeId'] = placeId['placeId']
        findDic['geocode.providerId'] = placeId['providerId']

    # MongoDB sometimes gets it wrong, particularly with geocode.placeId,
    # so pick an explicit hint matching the filters in use.
    if typeSpecificHint is not None:
        hint = typeSpecificHint
    elif timestampDic is not None and placeId is not None:
        hint = [('geocode.placeId', pymongo.ASCENDING), ('timestamp', pymongo.ASCENDING)]
    elif timestampDic is not None:
        hint = [('timestamp', pymongo.ASCENDING)]
    elif placeId is not None:
        hint = [('geocode.placeId', pymongo.ASCENDING)]
    else:
        hint = None

    if typeSpecificQuery is not None:
        findDic.update(typeSpecificQuery)

    if projection is None:
        cursor = collection.find(findDic, timeout=False).hint(hint)
    else:
        cursor = collection.find(findDic, projection.projection, timeout=False).hint(hint)

    if sortByTimestamp:
        cursor = cursor.sort([('timestamp', pymongo.ASCENDING)])

    if pageSize is not None and pageNum is not None:
        cursor = cursor.skip(pageSize * pageNum).limit(pageSize)

    cursor.upper_bound_timestamp = upperBoundTimestamp

    timeTaken = timer.time_since_constructed
    logger.info('Successfully setup cursor in %dms -- %s' % (timeTaken, logFormatting))

    if Configuration.MONGO_EXPLAINS_ENABLED:
        logger.critical('Tweet/User Explain: %s' % unicode(cursor.explain()))

    return cursor
def construct_age(self):
    """Return the time in ms elapsed since this object was constructed."""
    return getEpochMs() - self.constructed_at
def touch(self):
    """Refresh the item's timestamp to the current epoch ms."""
    self.timestamp = getEpochMs()
def func(templateArguments, instance):
    """Build influence data (per-place-type geocode lists and totals) for an
    instance over an optional epoch range, returned as JSON-ready dict."""
    twitterInstance = self.application.twitter_instances.getInstanceByInstanceKey(instance)
    if twitterInstance is None:
        return dict()

    baseEpoch = twitterInstance.constructed_at

    start_epoch = parseInteger(request.GET.start_epoch, default=None)
    end_epoch = parseInteger(request.GET.end_epoch, default=None)
    source_place_id = parseInteger(request.GET.source_place_id)
    source_provider_id = parseInteger(request.GET.source_provider_id)

    if source_place_id is None:
        logger.error('Invalid place ID specified while providing influence data: %s' % unicode(source_place_id))
        return dict()

    source_cache_id = GeocodeResultAbstract.buildCacheId(source_provider_id, source_place_id)
    temporalCollection = getTemporalInfluenceCollection(instance)

    # Translate the requested epochs (if any) into time-step ids.
    start_time_id = getTimeIdFromTimestamp(baseEpoch, Configuration.TEMPORAL_STEP, start_epoch) if start_epoch is not None else None
    end_time_id = getTimeIdFromTimestamp(baseEpoch, Configuration.TEMPORAL_STEP, end_epoch) if end_epoch is not None else None

    timerMs = getEpochMs()
    cacheData = getTemporalRange(temporalCollection, start_time_id, end_time_id,
                                 source_cache_id, preciseFromBack=True, preciseFromFront=True)
    logger.info('Took %dms to read temporal range data' % (getEpochMs() - timerMs))

    timerMs = getEpochMs()

    geocodeByPlaceType = dict()
    totalsByPlaceType = dict()

    if cacheData is not None:
        # Records are [placeId, providerId, displayName, coord, count, bbox];
        # names/coords/bboxes are filled in later, only for displayed records.
        for providerId, providerIdData in cacheData.iteritems():
            providerId = int(providerId)
            for destination, count in providerIdData.iteritems():
                split = destination.split('_')
                placeType = int(split[0])
                placeId = int(split[1])
                record = [placeId, providerId, None, None, count, None]
                geocodeByPlaceType.setdefault(placeType, list()).append(record)

    # Process only the records we are going to display.
    for placeType, records in geocodeByPlaceType.iteritems():
        topRecords = sorted(records, key=lambda x: x[4], reverse=True)
        topRecords = topRecords[:Configuration.DISPLAY_MAX_NUM_INFLUENCE_RECORDS_PER_PLACE_TYPE]
        geocodeByPlaceType[placeType] = topRecords

        for record in topRecords:
            cacheId = GeocodeResultAbstract.buildCacheId(record[1], record[0])
            geocode = geocodeFromCacheById(cacheId)
            record[2] = geocode.display_name
            record[3] = geocode.coordinate
            count = record[4]
            record[5] = geocode.bounding_box
            totalsByPlaceType[placeType] = totalsByPlaceType.get(placeType, 0) + count

    def getResultPart(placeType):
        # One result section per place type: displayed records plus total.
        return {'geocode_list': geocodeByPlaceType.get(placeType, list()),
                'total': totalsByPlaceType.get(placeType, 0)}

    resultData = dict()
    resultData['city'] = getResultPart(GeocodeResultAbstract.PlaceTypes.CITY)
    resultData['country'] = getResultPart(GeocodeResultAbstract.PlaceTypes.COUNTRY)
    resultData['continent'] = getResultPart(GeocodeResultAbstract.PlaceTypes.CONTINENT)

    logger.info('Took %dms to build temporal range result data' % (getEpochMs() - timerMs))

    return {'json': resultData}
def cursorItemsFromCache(instanceId, getCollectionFunc, placeId=None, epochMsStartRange=None, epochMsEndRange=None, pageNum=None, pageSize=None, typeSpecificQuery=None, projection=None, sortByTimestamp=None, typeSpecificHint=None):
    """Build a MongoDB cursor over cached items for an instance.

    Filters optionally by place and epoch-ms range, applies paging and a
    timestamp sort, and forces an index hint matching the filters in use.
    Returns the cursor (with upper_bound_timestamp attached for progress
    calculation), or None when the projection says not to query at all.
    """
    if sortByTimestamp is None:
        sortByTimestamp = True

    epochMsStartRange, epochMsEndRange = fixEpochMsRange(epochMsStartRange, epochMsEndRange)

    # Upper bound used by callers to estimate progress through the cursor;
    # more efficient than cursor.count.
    upperBoundTimestamp = getEpochMs() if epochMsEndRange is None else epochMsEndRange

    if projection is not None and projection.do_query is False:
        return None

    assert instanceId is not None
    assert getCollectionFunc is not None

    collection = getCollectionFunc(instanceId)

    logFormatting = 'IN:%s, P:%s, ES:%s, EE:%s, PN:%s, PS:%s, T:%s, P:%s' % (instanceId, placeId, epochMsStartRange, epochMsEndRange, pageNum, pageSize, typeSpecificQuery, projection)
    timer = Timer()
    logger.info('Attempting to read items from cache (%d) -- %s' % (timer.__hash__(), logFormatting))

    findDic = dict()

    # Assemble the timestamp range sub-query, if any bound was supplied.
    timestampDic = None
    if epochMsEndRange is not None:
        timestampDic = {'$lt': epochMsEndRange}
    if epochMsStartRange is not None:
        if timestampDic is None:
            timestampDic = dict()
        timestampDic['$gte'] = epochMsStartRange
    if timestampDic is not None:
        findDic['timestamp'] = timestampDic

    if placeId is not None:
        findDic['geocode.providerId'] = placeId['providerId']
        findDic['geocode.placeId'] = placeId['placeId']

    # MongoDB sometimes gets it wrong, particularly with geocode.placeId,
    # so pick an explicit hint matching the filters in use.
    if typeSpecificHint is not None:
        hint = typeSpecificHint
    elif timestampDic is not None and placeId is not None:
        hint = [('geocode.placeId', pymongo.ASCENDING), ('timestamp', pymongo.ASCENDING)]
    elif timestampDic is not None:
        hint = [('timestamp', pymongo.ASCENDING)]
    elif placeId is not None:
        hint = [('geocode.placeId', pymongo.ASCENDING)]
    else:
        hint = None

    if typeSpecificQuery is not None:
        findDic.update(typeSpecificQuery)

    if projection is None:
        cursor = collection.find(findDic).hint(hint)
    else:
        cursor = collection.find(findDic, projection.projection).hint(hint)

    if sortByTimestamp:
        cursor = cursor.sort([('timestamp', pymongo.ASCENDING)])

    if pageSize is not None and pageNum is not None:
        cursor = cursor.skip(pageSize * pageNum).limit(pageSize)

    cursor.upper_bound_timestamp = upperBoundTimestamp

    timeTaken = timer.time_since_constructed
    logger.info('Successfully setup cursor in %dms -- %s' % (timeTaken, logFormatting))

    if Configuration.MONGO_EXPLAINS_ENABLED:
        logger.critical('Tweet/User Explain: %s' % unicode(cursor.explain()))

    return cursor
def age(self):
    """Return the age of the item in ms (now minus its timestamp)."""
    return getEpochMs() - self.timestamp
def __init__(self, age=None):
    """Initialise as Timestamped, optionally back-dating the timestamp.

    age -- if given, the timestamp is set so the item appears this many ms old.
    """
    Timestamped.__init__(self)
    if age is not None:
        self.timestamp = getEpochMs() - age
def writeUserToCache(user, doUpdate):
    """Persist a user's state into the per-instance MongoDB cache collection.

    Builds a $set/$addToSet update query from the user's flags, ensures the
    indexes needed for follower lookups exist, and periodically logs timing.
    """
    assert isinstance(user, User)

    setFields = dict()        # used with $set operation.
    addToSetFields = dict()   # used with $addToSet operation.

    if user.is_followers_loaded:
        setFields['is_followers_loaded'] = True

    if user.is_followee:
        setFields['is_followee'] = True
        followeeIds = [x.id for x in user.known_followees]
        addToSetFields['known_followees'] = {'$each': followeeIds}

    if user.has_twitter_place:
        setFields['twitter_place'] = user.twitter_place.data

    if user.is_associated_with_tweet:
        setFields['is_associated_with_tweet'] = True

    if user.last_follower_enrichment_error is not None:
        setFields['last_follower_enrichment_error'] = user.last_follower_enrichment_error

    if user.queued_for_follower_enrichment:
        p = user.follower_enrichment_progress
        queue_progress, user_progress, user_id_progress, enrichment_progress_description, queue_waiting_for_user = p.getTuple()
        if queue_waiting_for_user is not None:
            queue_waiting_for_user = queue_waiting_for_user.id

        setFields['queued_for_follower_enrichment'] = user.queued_for_follower_enrichment
        setFields['follower_enrichment_progress'] = (queue_progress, user_progress, user_id_progress, enrichment_progress_description, queue_waiting_for_user)
    else:
        # Remove redundant information.
        if user.is_followers_loaded:
            setFields['follower_enrichment_progress'] = None
            setFields['queued_for_follower_enrichment'] = False

    placeId = user.location_geocode.all_geocode_results_cache_id if user.is_geocoded else None

    if user.geocode_bias is not None:
        setFields['geocode_bias'] = user.geocode_bias

    if user.geocoded_from is not None:
        setFields['geocoded_from'] = user.geocoded_from

    if user.has_analysers:
        analysis = [{x[0]: x[1].results_cacheable} for x in user.analysers.iteritems()]
        addToSetFields['analysis'] = {'$each': analysis}

    theQuery = dict()
    if setFields:
        theQuery['$set'] = setFields
    if addToSetFields:
        theQuery['$addToSet'] = addToSetFields

    collection = getUserCollection(user.instance_key)

    startMs = getEpochMs()
    # This is for the user page where followers are looked up.
    # Not sure if sparse=True does anything, pymongo docs not clear on how to create sparse index.
    collection.ensure_index([('known_followees', pymongo.ASCENDING)], sparse=True)
    # For short follow information download.
    # Note: Only place ID is used because indexes are expensive on database RAM,
    # and we don't really need to do provider ID too since it is extremely rare
    # that two providers will have the same place ID. Also note I had some trouble
    # getting MongoDB to use an index with provider ID in it (not sure why, but it
    # wouldn't use the index).
    collection.ensure_index([('is_followers_loaded', pymongo.ASCENDING),
                             ('timestamp', pymongo.ASCENDING)], sparse=True)
    collection.ensure_index([('geocode.placeId', pymongo.ASCENDING),
                             ('is_followers_loaded', pymongo.ASCENDING),
                             ('timestamp', pymongo.ASCENDING)], sparse=True)
    ensureIndexTime = getEpochMs() - startMs

    startMs = getEpochMs()
    _writeItemToCache(getUserCollection, user.id, user.instance_key, user.data,
                      user.isDataNew, user.timestamp, placeId, theQuery, doUpdate)
    writingToDatabaseTime = getEpochMs() - startMs

    # This is an optimization, the next time we see this same user object we won't push its data.
    user.isDataNew = False

    global logUserWritePerformanceTimer
    if logUserWritePerformanceTimer.ticked():
        logger.info('Writing user to database took %dms ensuring index, %dms writing to database' % (ensureIndexTime, writingToDatabaseTime))
def writeUserToCache(user, doUpdate):
    """Persist a user's state into the per-instance MongoDB cache collection.

    Builds a $set/$addToSet update query from the user's flags, ensures the
    indexes needed for follower lookups exist, and periodically logs timing.
    """
    assert isinstance(user, User)

    setFields = dict()        # used with $set operation.
    addToSetFields = dict()   # used with $addToSet operation.

    if user.is_followers_loaded:
        setFields['is_followers_loaded'] = True

    if user.is_followee:
        setFields['is_followee'] = True
        followeeIds = [x.id for x in user.known_followees]
        addToSetFields['known_followees'] = {'$each': followeeIds}

    if user.has_twitter_place:
        setFields['twitter_place'] = user.twitter_place.data

    if user.is_associated_with_tweet:
        setFields['is_associated_with_tweet'] = True

    if user.last_follower_enrichment_error is not None:
        setFields['last_follower_enrichment_error'] = user.last_follower_enrichment_error

    if user.queued_for_follower_enrichment:
        p = user.follower_enrichment_progress
        queue_progress, user_progress, user_id_progress, enrichment_progress_description, queue_waiting_for_user = p.getTuple()
        if queue_waiting_for_user is not None:
            queue_waiting_for_user = queue_waiting_for_user.id

        setFields['queued_for_follower_enrichment'] = user.queued_for_follower_enrichment
        setFields['follower_enrichment_progress'] = (queue_progress, user_progress, user_id_progress, enrichment_progress_description, queue_waiting_for_user)
    else:
        # Remove redundant information.
        if user.is_followers_loaded:
            setFields['follower_enrichment_progress'] = None
            setFields['queued_for_follower_enrichment'] = False

    placeId = user.location_geocode.all_geocode_results_cache_id if user.is_geocoded else None

    if user.geocode_bias is not None:
        setFields['geocode_bias'] = user.geocode_bias

    if user.geocoded_from is not None:
        setFields['geocoded_from'] = user.geocoded_from

    if user.has_analysers:
        analysis = [{x[0]: x[1].results_cacheable} for x in user.analysers.iteritems()]
        addToSetFields['analysis'] = {'$each': analysis}

    theQuery = dict()
    if setFields:
        theQuery['$set'] = setFields
    if addToSetFields:
        theQuery['$addToSet'] = addToSetFields

    collection = getUserCollection(user.instance_key)

    startMs = getEpochMs()
    # This is for the user page where followers are looked up.
    # Not sure if sparse=True does anything, pymongo docs not clear on how to create sparse index.
    collection.ensure_index([('known_followees', pymongo.ASCENDING)], sparse=True)
    # For short follow information download.
    # Note: Only place ID is used because indexes are expensive on database RAM,
    # and we don't really need to do provider ID too since it is extremely rare
    # that two providers will have the same place ID. Also note I had some trouble
    # getting MongoDB to use an index with provider ID in it (not sure why, but it
    # wouldn't use the index properly, see: https://stackoverflow.com/questions/41085666/mongodb-explains-totalkeysexamined-more-than-limit).
    collection.ensure_index([('is_followers_loaded', pymongo.ASCENDING),
                             ('timestamp', pymongo.ASCENDING)], sparse=True)
    collection.ensure_index([('geocode.placeId', pymongo.ASCENDING),
                             ('is_followers_loaded', pymongo.ASCENDING),
                             ('timestamp', pymongo.ASCENDING)], sparse=True)
    ensureIndexTime = getEpochMs() - startMs

    startMs = getEpochMs()
    _writeItemToCache(getUserCollection, user.id, user.instance_key, user.data,
                      user.isDataNew, user.timestamp, placeId, theQuery, doUpdate)
    writingToDatabaseTime = getEpochMs() - startMs

    # This is an optimization, the next time we see this same user object we won't push its data.
    user.isDataNew = False

    global logUserWritePerformanceTimer
    if logUserWritePerformanceTimer.ticked():
        logger.info('Writing user to database took %dms ensuring index, %dms writing to database' % (ensureIndexTime, writingToDatabaseTime))
def manageSocket(self, webSocket, tupleArguments, socketId):
    """Stream cached tweet/follower data for an instance down a web socket.

    Parses the request's query parameters to pick one of three download modes
    (tweet info, full follower info, short follower info), opens the needed
    tunnels, then iterates the cache pushing batched CSV rows and progress
    updates to the client.  Returns False when done so the socket is closed.
    """
    instanceId = tupleArguments[0]
    mainControl = webSocket.controls[self.key]
    assert isinstance(mainControl, DocumentControl)

    bytesPerBatch = parseInteger(request.GET.batchSizeBytes, maximum=1024 * 1024 * 256, default=1024 * 1024 * 1)
    tweetInfo = parseBoolean(request.GET.tweet_info, False)
    followerInfo = parseBoolean(request.GET.follower_info_full, False)
    followerInfoShort = parseBoolean(request.GET.follower_info_short, False)
    providerId = parseInteger(request.GET.provider_id)
    placeId = parseInteger(request.GET.place_id)
    startEpoch = parseInteger(request.GET.start_epoch)
    endEpoch = parseInteger(request.GET.end_epoch)

    if placeId is not None and providerId is not None:
        placeCacheId = GeocodeResultAbstract.buildCacheId(providerId, placeId)
    else:
        placeCacheId = None

    # The three modes are mutually exclusive; full follower info wins,
    # and is also the default when nothing was requested.
    if followerInfo:
        tweetInfo = False
        followerInfoShort = False
    elif tweetInfo:
        followerInfo = False
        followerInfoShort = False
    elif followerInfoShort:
        followerInfo = False
        tweetInfo = False
    else:
        followerInfo = True

    userTunnelId = 'user_tunnel'
    tweetTunnelId = 'tweet_tunnel' if tweetInfo else None

    def openRequiredTunnels():
        # Tweet mode needs both tunnels; the others only the user tunnel.
        if tweetInfo:
            return self.openTunnels(webSocket)
        return self.openTunnel(userTunnelId, webSocket)

    if not openRequiredTunnels():
        logger.error('Failed to open initial tunnels')
        return False

    # Per-mode cache read/projection configuration.
    if tweetInfo:
        followerIdsFlag = False
        followeeIdsFlag = False
        analysisFlag = False
        isFollowersLoadedRequirement = None
        associatedWithTweetRequirement = True
        recursiveCacheFlag = False
        followerIdsProjection = None
        outputType = 1  # for csv.
    elif followerInfo:
        followerIdsFlag = True
        followeeIdsFlag = True
        analysisFlag = True
        isFollowersLoadedRequirement = True
        associatedWithTweetRequirement = None
        recursiveCacheFlag = True
        followerIdsProjection = None  # this gives us all data on each follower.
        outputType = 2
    elif followerInfoShort:
        followerIdsFlag = True
        followeeIdsFlag = True
        followerIdsProjection = NoQueryProjection()
        analysisFlag = True
        isFollowersLoadedRequirement = True
        associatedWithTweetRequirement = None
        recursiveCacheFlag = True
        outputType = 3
    else:
        raise NotImplementedError()

    userProjection = UserProjection(True, True, None, True, followerIdsFlag,
                                    followerIdsProjection, followeeIdsFlag,
                                    UserProjection.Id(), True, False, False,
                                    True, True, False, False, False, False,
                                    analysisFlag)

    isFirstIteration = [True]

    twitterInstance = self.application.twitter_instances.getInstanceByInstanceKey(instanceId)
    if twitterInstance is None:
        return False

    twitterSession = twitterInstance.twitter_thread.twitter_session

    progressBarTotalId = 'progress-bar-total'
    progressBarCurrentBatchId = 'progress-bar-current-batch'

    signaler = EventSignaler(self.key, [webSocket])
    updateProgressBarFreq = Timer(400, True)

    def sendData(tunnelId, data):
        # All rows are unicode and terminated CSV-style.
        self.sendDataOnTunnel(webSocket, tunnelId, (unicode(data) + '\r\n'))

    def sendHeader():
        sendData(userTunnelId, getUserHeader(outputType))
        if tweetTunnelId is not None:
            sendData(tweetTunnelId, getTweetHeader())

    def doProgressBarChange(percentage, progressBarId):
        mainControl.executeJavascript('$("#%s").width("%.3f%%");' % (progressBarId, percentage))

    sendHeader()

    counter = [0]
    previousCounter = [0]

    def updateSocket(controls, data, bytesCounter=counter, bytesPerBatch=bytesPerBatch, previousCounter=previousCounter, isFirstIteration=isFirstIteration):
        # Push one cache item to the client and maintain batch/progress state.
        user = data['user_data']
        tweet = data['tweet_data']
        percentage = data['percentage']
        isFinished = data['isFinished']

        control = controls[self.key]
        assert isinstance(control, DocumentControl)

        def updateProgressBars():
            previousCounter[0] = thisCounter = bytesCounter[0]
            percentageCurrentBatch = float(thisCounter) / float(bytesPerBatch) * 100
            percentageTotal = percentage
            if percentageTotal >= 100:
                percentageCurrentBatch = 100
            if isFirstIteration[0] and percentageCurrentBatch < percentageTotal:
                percentageCurrentBatch = percentageTotal
            doProgressBarChange(percentageTotal, progressBarTotalId)
            doProgressBarChange(percentageCurrentBatch, progressBarCurrentBatchId)

        # Rate-limited progress update while data keeps arriving.
        if previousCounter[0] != bytesCounter[0] and updateProgressBarFreq.ticked():
            updateProgressBars()

        dataToSendToClient = ''
        if user is not None:
            assert isinstance(user, User)
            dataToSendToClient = getUserRepresentation(user, outputType)
            sendData(userTunnelId, dataToSendToClient)
        if tweet is not None:
            assert isinstance(tweet, Tweet)
            dataToSendToClient = getTweetRepresentation(tweet)
            sendData(tweetTunnelId, dataToSendToClient)

        bytesCounter[0] += len(dataToSendToClient)

        # End of batch (or of the whole download): flush progress, close
        # tunnels, and either re-open for the next batch or finish up.
        if bytesCounter[0] > bytesPerBatch or isFinished:
            updateProgressBars()
            isFirstIteration[0] = False
            bytesCounter[0] = 0
            mainControl.executeJavascript('onBatchEnd();')
            self.closeTunnels(webSocket)

            if not isFinished:
                logger.debug('Waiting to receive next data provider')
                if not openRequiredTunnels():
                    logger.warning('Failed to reinitialize tunnel slots')
                    webSocket.cleanup()
                    return
                sendHeader()
            else:
                mainControl.executeJavascript('onFinished();')
                webSocket.cleanup()

    def onCacheIteration(iteration, total, isFinished, data, iteratorId):
        # Don't write followee data to output as it would duplicate alot of data.
        if iteratorId == 'followee':
            data = None

        running = not webSocket.is_cleaned_up
        if running:
            # We need to do this so that if the client closes the socket we are notified.
            webSocket.pingFreqLimited()

            percentage = getPercentage(iteration, total)
            dataId = None
            if data is not None:
                dataId = data.id

            user = None
            tweet = None
            if data is None:
                pass
            elif isinstance(data, User):
                user = data
            elif isinstance(data, Tweet):
                tweet = data
                if tweet.has_user:
                    user = tweet.user
            else:
                logger.error('Invalid data from cache, type: %s' % type(data))
                return running

            signaler.signalEvent({SignalActions.SOCKET: updateSocket,
                                  'percentage': percentage,
                                  'user_data': user,
                                  'tweet_data': tweet,
                                  'isFinished': isFinished})
            gevent.sleep(0)
        else:
            logger.debug('Ending cache download prematurely')

        return running

    logger.debug('Starting to read data from cache...')

    # This makes sure the search is finite.
    epochNow = getEpochMs()
    if endEpoch is None or endEpoch > epochNow:
        endEpoch = epochNow

    if followerInfo or followerInfoShort:
        readUsersFromCache(twitterSession, instanceId, placeId=placeCacheId,
                           epochMsStartRange=startEpoch, epochMsEndRange=endEpoch,
                           isFollowersLoadedRequirement=isFollowersLoadedRequirement,
                           associatedWithTweetRequirement=associatedWithTweetRequirement,
                           onIterationFunc=onCacheIteration,
                           recursive=recursiveCacheFlag,
                           userProjection=userProjection)
    else:
        readTweetsFromCache(twitterSession, instanceId, placeId=placeCacheId,
                            epochMsStartRange=startEpoch, epochMsEndRange=endEpoch,
                            onIterationFunc=onCacheIteration,
                            retrieveUserData=True,
                            userProjection=userProjection)

    # We want to cleanup everything now since we are done.
    return False