def writeTweetsForDay(currentDay):
    """Export one day of tweets from the `tweets` collection to a gzipped file.

    Selects every tweet whose 'ca' (creation time) lies in the half-open
    window [currentDay, currentDay + 1 day), writes one JSON record per tweet
    whose author's screen name can be resolved, and finally gzips the output
    file.

    currentDay -- datetime marking the start of the 24-hour window to export.
    """
    fileName = houston_data_folder + FileIO.getFileByDay(currentDay)
    # Half-open range: includes a tweet stamped exactly at the start of the
    # day and everything up to (but excluding) the start of the next day, so
    # consecutive days neither overlap nor leave a gap at the boundaries.
    dayWindow = {'$gte': currentDay, '$lt': currentDay + timedelta(days=1)}
    for tweet in tweets.find({'ca': dayWindow}, fields=['ca', 'tx', 'uid']):
        # Resolve the screen name once and reuse it in the record below.
        screenName = GenerateHoustonTweetsData.getScreenName(tweet['uid'])
        if screenName is None:
            # Tweets whose author cannot be resolved are skipped.
            continue
        data = {
            'id': tweet['_id'],
            'text': tweet['tx'],
            'created_at': getStringRepresentationForTweetTimestamp(tweet['ca']),
            'user': {'screen_name': screenName},
        }
        FileIO.writeToFileAsJson(data, fileName)
    # Compress the finished day file in place (produces fileName + '.gz').
    os.system('gzip %s' % fileName)
Exemple #2
0
 def iterateTweetsFromHouston(houstonDataStartTime=datetime(2010, 11, 1),
                              houstonDataEndTime=datetime(2011, 5, 30)):
     """Yield tweets from the Houston daily '.gz' files, day by day.

     Starting at houstonDataStartTime and stepping one day at a time while
     the current day is <= houstonDataEndTime, opens that day's file via
     TwitterIterators.iterateFromFile and yields each tweet it produces.
     """
     day = houstonDataStartTime
     while not day > houstonDataEndTime:
         # One archive per day, named by FileIO.getFileByDay plus '.gz'.
         dailyArchive = (
             houston_twitter_stream_settings.twitter_users_tweets_folder
             + '%s.gz' % FileIO.getFileByDay(day))
         for tweet in TwitterIterators.iterateFromFile(dailyArchive):
             yield tweet
         day = day + timedelta(days=1)
Exemple #3
0
 def iterateTweetsFromExperts(expertsDataStartTime=datetime(2011, 3, 19),
                              expertsDataEndTime=datetime(2011, 4, 12)):
     """Yield tweets written by known experts from the daily '.gz' files.

     Walks the days from expertsDataStartTime while the current day is
     <= expertsDataEndTime. Tweets whose user 'id_str' is not in the
     collection returned by getExperts() are skipped. The generator stops
     completely at the first expert tweet whose 'created_at' is later than
     expertsDataEndTime.
     """
     expertIds = getExperts()
     day = expertsDataStartTime
     while day <= expertsDataEndTime:
         dailyArchive = (
             experts_twitter_stream_settings.twitter_users_tweets_folder
             + '%s.gz' % FileIO.getFileByDay(day))
         for tweet in TwitterIterators.iterateFromFile(dailyArchive):
             if tweet['user']['id_str'] not in expertIds:
                 continue
             createdAt = getDateTimeObjectFromTweetTimestamp(
                 tweet['created_at'])
             if createdAt > expertsDataEndTime:
                 # Past the end of the requested window: end the generator.
                 return
             yield tweet
         day = day + timedelta(days=1)
 def writeClusters(hdStreamClusteringObject, currentMessageTime):
     print '\n\n\nEntering:', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
     iterationData = {'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                      'clusters': map(TwitterCrowdsSpecificMethods.getClusterInMapFormat, [cluster for cluster, _ in sorted(StreamCluster.iterateByAttribute(hdStreamClusteringObject.clusters.values(), 'length'), key=itemgetter(1), reverse=True)]),
                      'settings': Settings.getSerialzedObject(hdStreamClusteringObject.stream_settings)
                      }
     FileIO.writeToFileAsJson(iterationData, hdStreamClusteringObject.stream_settings['lsh_clusters_folder']+FileIO.getFileByDay(currentMessageTime))
     print 'Leaving: ', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
 def iterateHoustonClusters(startingDay=datetime(2010,11,1), endingDay=datetime(2010,11,19)):
     """Yield (time, cluster) pairs read back from the Houston clusters files.

     For each day from startingDay while the day is <= endingDay, reads the
     JSON records in that day's file. Each record carries a 'time_stamp' and
     a list of 'clusters'; every cluster map is converted with
     TwitterCrowdsSpecificMethods.getClusterFromMapFormat and yielded together
     with the record's parsed time stamp.
     """
     day = startingDay
     while day <= endingDay:
         clustersFile = (houston_twitter_stream_settings.lsh_clusters_folder
                         + FileIO.getFileByDay(day))
         for snapshot in FileIO.iterateJsonFromFile(clustersFile):
             snapshotTime = getDateTimeObjectFromTweetTimestamp(snapshot['time_stamp'])
             for clusterMap in snapshot['clusters']:
                 yield (snapshotTime, TwitterCrowdsSpecificMethods.getClusterFromMapFormat(clusterMap))
         day = day + timedelta(days=1)
 def iterateTweetsFromHouston(houstonDataStartTime=datetime(2010,11,1), houstonDataEndTime=datetime(2011,5,30)):
     """Stream tweets from the Houston per-day '.gz' files in date order.

     Visits each day from houstonDataStartTime while the day is
     <= houstonDataEndTime, and yields every tweet that
     TwitterIterators.iterateFromFile produces for that day's file.
     """
     cursor = houstonDataStartTime
     while cursor <= houstonDataEndTime:
         dayFileName = '%s.gz' % FileIO.getFileByDay(cursor)
         tweetsOfDay = TwitterIterators.iterateFromFile(
             houston_twitter_stream_settings.twitter_users_tweets_folder + dayFileName)
         for tweet in tweetsOfDay:
             yield tweet
         cursor = cursor + timedelta(days=1)
 def iterateTweetsFromExperts(expertsDataStartTime=datetime(2011,3,19), expertsDataEndTime=datetime(2011,4,12)):
     """Stream expert-authored tweets from the per-day '.gz' files.

     Visits each day from expertsDataStartTime while the day is
     <= expertsDataEndTime. Only tweets whose user 'id_str' is contained in
     the result of getExperts() are considered. Iteration ends outright as
     soon as one of those tweets has a 'created_at' later than
     expertsDataEndTime.
     """
     expertIds = getExperts()
     cursor = expertsDataStartTime
     while cursor <= expertsDataEndTime:
         dayFileName = '%s.gz' % FileIO.getFileByDay(cursor)
         tweetsOfDay = TwitterIterators.iterateFromFile(
             experts_twitter_stream_settings.twitter_users_tweets_folder + dayFileName)
         for tweet in tweetsOfDay:
             if tweet['user']['id_str'] not in expertIds:
                 continue
             if getDateTimeObjectFromTweetTimestamp(tweet['created_at']) > expertsDataEndTime:
                 # First expert tweet beyond the window terminates the stream.
                 return
             yield tweet
         cursor = cursor + timedelta(days=1)