def writeTweetsForDay(currentDay):
    '''Dump every tweet created on currentDay to a JSON-per-line file, then gzip it.

    Each record carries the tweet id, text, string timestamp, and the author's
    screen name. Tweets whose uid cannot be resolved to a screen name are
    skipped.
    currentDay -- datetime marking midnight of the day to export.
    '''
    fileName = houston_data_folder + FileIO.getFileByDay(currentDay)
    # 86399 seconds = 23:59:59, i.e. up to the last second of currentDay.
    for tweet in tweets.find({'ca': {'$gt': currentDay, '$lt': currentDay + timedelta(seconds=86399)}},
                             fields=['ca', 'tx', 'uid']):
        screenName = GenerateHoustonTweetsData.getScreenName(tweet['uid'])
        # Fix: identity comparison instead of "!= None".
        if screenName is not None:
            data = {'id': tweet['_id'],
                    'text': tweet['tx'],
                    'created_at': getStringRepresentationForTweetTimestamp(tweet['ca']),
                    # Fix: reuse the already-resolved screenName instead of a
                    # second (redundant) getScreenName lookup per tweet.
                    'user': {'screen_name': screenName}}
            FileIO.writeToFileAsJson(data, fileName)
    os.system('gzip %s' % fileName)
def iterateTweetsFromHouston(houstonDataStartTime=datetime(2010, 11, 1), houstonDataEndTime=datetime(2011, 5, 30)):
    '''Yield every Houston tweet stored in the day-wise gzipped files
    between the two dates (inclusive), in chronological file order.
    '''
    day = houstonDataStartTime
    oneDay = timedelta(days=1)
    while day <= houstonDataEndTime:
        tweetFile = houston_twitter_stream_settings.twitter_users_tweets_folder + '%s.gz' % FileIO.getFileByDay(day)
        for tweet in TwitterIterators.iterateFromFile(tweetFile):
            yield tweet
        day += oneDay
def iterateTweetsFromExperts(expertsDataStartTime=datetime(2011, 3, 19), expertsDataEndTime=datetime(2011, 4, 12)):
    '''Yield tweets authored by known experts between the two dates (inclusive).

    Returns as soon as an expert tweet timestamped past expertsDataEndTime is
    seen, since the day files are read in chronological order.
    '''
    expertIds = getExperts()
    day = expertsDataStartTime
    while day <= expertsDataEndTime:
        tweetFile = experts_twitter_stream_settings.twitter_users_tweets_folder + '%s.gz' % FileIO.getFileByDay(day)
        for tweet in TwitterIterators.iterateFromFile(tweetFile):
            # Guard clauses replace the original nested ifs; semantics unchanged.
            if tweet['user']['id_str'] not in expertIds:
                continue
            if getDateTimeObjectFromTweetTimestamp(tweet['created_at']) > expertsDataEndTime:
                return
            yield tweet
        day += timedelta(days=1)
def writeClusters(hdStreamClusteringObject, currentMessageTime): print '\n\n\nEntering:', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters) iterationData = {'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime), 'clusters': map(TwitterCrowdsSpecificMethods.getClusterInMapFormat, [cluster for cluster, _ in sorted(StreamCluster.iterateByAttribute(hdStreamClusteringObject.clusters.values(), 'length'), key=itemgetter(1), reverse=True)]), 'settings': Settings.getSerialzedObject(hdStreamClusteringObject.stream_settings) } FileIO.writeToFileAsJson(iterationData, hdStreamClusteringObject.stream_settings['lsh_clusters_folder']+FileIO.getFileByDay(currentMessageTime)) print 'Leaving: ', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
def iterateHoustonClusters(startingDay=datetime(2010, 11, 1), endingDay=datetime(2010, 11, 19)):
    '''Yield (timestamp, cluster) pairs read from the Houston lsh_clusters
    files, one day file at a time, between startingDay and endingDay inclusive.
    '''
    day = startingDay
    while day <= endingDay:
        clusterFile = houston_twitter_stream_settings.lsh_clusters_folder + FileIO.getFileByDay(day)
        for record in FileIO.iterateJsonFromFile(clusterFile):
            observedAt = getDateTimeObjectFromTweetTimestamp(record['time_stamp'])
            for clusterMap in record['clusters']:
                yield (observedAt, TwitterCrowdsSpecificMethods.getClusterFromMapFormat(clusterMap))
        day += timedelta(days=1)
# NOTE(review): this redefines iterateTweetsFromHouston, shadowing an earlier
# identical definition in this file — consider deleting one of the two copies.
def iterateTweetsFromHouston(houstonDataStartTime=datetime(2010, 11, 1), houstonDataEndTime=datetime(2011, 5, 30)):
    '''Yield every Houston tweet from the day-wise gzipped files between
    the two dates (inclusive).
    '''
    currentDay = houstonDataStartTime
    while currentDay <= houstonDataEndTime:
        dayFile = houston_twitter_stream_settings.twitter_users_tweets_folder + '%s.gz' % FileIO.getFileByDay(currentDay)
        for tweet in TwitterIterators.iterateFromFile(dayFile):
            yield tweet
        currentDay += timedelta(days=1)
# NOTE(review): this redefines iterateTweetsFromExperts, shadowing an earlier
# identical definition in this file — consider deleting one of the two copies.
def iterateTweetsFromExperts(expertsDataStartTime=datetime(2011, 3, 19), expertsDataEndTime=datetime(2011, 4, 12)):
    '''Yield tweets authored by known experts between the two dates
    (inclusive); stops at the first expert tweet past expertsDataEndTime.
    '''
    expertIds = getExperts()
    currentDay = expertsDataStartTime
    while currentDay <= expertsDataEndTime:
        dayFile = experts_twitter_stream_settings.twitter_users_tweets_folder + '%s.gz' % FileIO.getFileByDay(currentDay)
        for tweet in TwitterIterators.iterateFromFile(dayFile):
            if tweet['user']['id_str'] not in expertIds:
                continue
            if getDateTimeObjectFromTweetTimestamp(tweet['created_at']) > expertsDataEndTime:
                return
            yield tweet
        currentDay += timedelta(days=1)