def iterateUserDocuments(fileName):
    """Yield one aggregated phrase-id vector per user found in a JSON tweet file.

    Every tweet produced by ``FileIO.iterateJsonFromFile(fileName)`` is
    converted to a message using ``default_experts_twitter_stream_settings``.
    Each phrase of the message vector is replaced by a string id ("0", "1",
    ... assigned in order of first appearance across the whole file) and the
    re-keyed vector is added, with ``+=``, to the running ``Vector`` of the
    tweet's author. Authors are keyed by their lower-cased screen name.

    Nothing is yielded until the input has been read completely; after that
    one ``(screen_name, vector)`` pair is yielded per user.
    """
    vectorsByUser = defaultdict(Vector)
    phraseIds = {}
    for tweetJson in FileIO.iterateJsonFromFile(fileName):
        message = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(
            tweetJson, **default_experts_twitter_stream_settings)
        phraseVector = message.vector
        idVector = Vector()
        for phrase in phraseVector:
            # setdefault hands out the next sequential id only the first time
            # a phrase is seen; later lookups return the id already stored.
            phraseId = phraseIds.setdefault(phrase, str(len(phraseIds)))
            idVector[phraseId] = phraseVector[phrase]
        screenName = tweetJson['user']['screen_name'].lower()
        vectorsByUser[screenName] += idVector
    for screenName, userVector in vectorsByUser.iteritems():
        yield screenName, userVector
def iterateTweetUsersAfterCombiningTweets(fileName, **stream_settings):
    """Combine all tweets of each user into one phrase-id vector and yield them.

    Tweets come from ``TweetFiles.iterateTweetsFromGzip(fileName)`` and are
    converted to messages with the caller's ``stream_settings``. Phrases are
    replaced by string ids ("0", "1", ... in order of first appearance across
    the file); the re-keyed vector of every tweet is added, with ``+=``, to
    the ``Vector`` kept for the author's lower-cased screen name.

    The generator yields ``(screen_name, vector)`` pairs, one per user, and
    only starts yielding once every tweet has been consumed.
    """
    combinedByUser = defaultdict(Vector)
    idOfPhrase = {}
    for tweetJson in TweetFiles.iterateTweetsFromGzip(fileName):
        message = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(
            tweetJson, **stream_settings)
        phraseVector = message.vector
        idVector = Vector()
        for phrase in phraseVector:
            # A phrase gets the next free id on first sight and keeps it.
            phraseId = idOfPhrase.setdefault(phrase, str(len(idOfPhrase)))
            idVector[phraseId] = phraseVector[phrase]
        author = tweetJson['user']['screen_name'].lower()
        combinedByUser[author] += idVector
    for author, combinedVector in combinedByUser.iteritems():
        yield author, combinedVector
def _iterateUserDocuments(self):
    """Yield ``(screen_name, vector)`` pairs built from ``self.rawDataFileName``.

    Tweets are read with ``TweetFiles.iterateTweetsFromGzip`` and converted to
    messages using ``self.stream_settings``. Each phrase in a message vector
    is mapped to a string id ("0", "1", ... in order of first appearance) and
    the re-keyed vector is added, with ``+=``, to the ``Vector`` stored under
    the author's lower-cased screen name.

    All tweets are consumed before the first pair is yielded.
    """
    vectorsByUser = defaultdict(Vector)
    phraseIds = {}
    for tweetJson in TweetFiles.iterateTweetsFromGzip(self.rawDataFileName):
        # self.stream_settings is read per tweet, exactly as the loop always did.
        message = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(
            tweetJson, **self.stream_settings)
        phraseVector = message.vector
        idVector = Vector()
        for phrase in phraseVector:
            # First sighting of a phrase allocates the next sequential id.
            phraseId = phraseIds.setdefault(phrase, str(len(phraseIds)))
            idVector[phraseId] = phraseVector[phrase]
        screenName = tweetJson["user"]["screen_name"].lower()
        vectorsByUser[screenName] += idVector
    for screenName, userVector in vectorsByUser.iteritems():
        yield screenName, userVector
def iterateUserDocuments(fileName):
    """Aggregate the tweets in a JSON file into one phrase-id vector per user.

    Tweets are read with ``FileIO.iterateJsonFromFile(fileName)`` and turned
    into messages using ``default_experts_twitter_stream_settings``. Phrases
    are renamed to string ids ("0", "1", ... by order of first appearance in
    the file) and each tweet's re-keyed vector is accumulated with ``+=`` into
    the ``Vector`` of its author, identified by lower-cased screen name.

    Yields ``(screen_name, vector)`` pairs once the whole file has been read.
    """
    perUserVectors = defaultdict(Vector)
    knownPhraseIds = {}
    for tweetJson in FileIO.iterateJsonFromFile(fileName):
        message = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(
            tweetJson, **default_experts_twitter_stream_settings)
        phraseVector = message.vector
        idVector = Vector()
        for phrase in phraseVector:
            # Unknown phrases receive the next id; known ones reuse theirs.
            phraseId = knownPhraseIds.setdefault(
                phrase, str(len(knownPhraseIds)))
            idVector[phraseId] = phraseVector[phrase]
        userKey = tweetJson['user']['screen_name'].lower()
        perUserVectors[userKey] += idVector
    for userKey, aggregatedVector in perUserVectors.iteritems():
        yield userKey, aggregatedVector
def iterateTweetUsersAfterCombiningTweets(fileName, **stream_settings):
    """Yield each tweeting user together with the sum of their tweet vectors.

    Reads tweets via ``TweetFiles.iterateTweetsFromGzip(fileName)``, converts
    each one to a message with ``stream_settings`` and re-keys the message
    vector from phrases to string ids ("0", "1", ... by order of first
    appearance in the file). The re-keyed vectors are accumulated with ``+=``
    per author, where the author key is the lower-cased screen name.

    Yields ``(screen_name, vector)`` pairs after the input is exhausted.
    """
    perUserVectors = defaultdict(Vector)
    knownPhraseIds = {}
    for tweetJson in TweetFiles.iterateTweetsFromGzip(fileName):
        message = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(
            tweetJson, **stream_settings)
        phraseVector = message.vector
        idVector = Vector()
        for phrase in phraseVector:
            # Unknown phrases receive the next id; known ones reuse theirs.
            phraseId = knownPhraseIds.setdefault(
                phrase, str(len(knownPhraseIds)))
            idVector[phraseId] = phraseVector[phrase]
        userKey = tweetJson['user']['screen_name'].lower()
        perUserVectors[userKey] += idVector
    for userKey, aggregatedVector in perUserVectors.iteritems():
        yield userKey, aggregatedVector
def iteratePhrases(fileName='/mnt/chevron/kykamath/data/twitter/tweets_by_trends/2011_2_6.gz'):
    """Yield ``(phrase, approximate_epoch)`` for every non-empty phrase in a tweet file.

    Args:
        fileName: Path handed to ``TweetFiles.iterateTweetsFromGzip``. It
            defaults to the data file this function was originally hard-wired
            to, so calling ``iteratePhrases()`` with no arguments behaves as
            before.

    Yields:
        Tuples ``(phrase, epoch)`` where ``phrase`` is a non-empty key of the
        message vector and ``epoch`` is
        ``GeneralMethods.approximateEpoch(<epoch of message.timeStamp>, 60)``.
        NOTE(review): presumably this is the message time rounded to a
        60-second bucket -- confirm against ``GeneralMethods``.

    Tweets are converted with the module-level ``settings``; messages whose
    vector is falsy are skipped.
    """
    for tweet in TweetFiles.iterateTweetsFromGzip(fileName):
        message = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(tweet, **settings)
        if not message.vector:
            continue
        for phrase in message.vector:
            if phrase != '':
                messageEpoch = GeneralMethods.getEpochFromDateTimeObject(message.timeStamp)
                yield (phrase, GeneralMethods.approximateEpoch(messageEpoch, 60))
def test_convertTweetJSONToMessage(self):
    # Converting the fixture tweet with twitter_stream_settings must produce a
    # message whose vector holds exactly these four phrases, each with value 1.
    # (Kept as comments: a docstring would change unittest's verbose output.)
    convertedMessage = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(
        self.tweet, **twitter_stream_settings)
    expectedVector = {
        'project': 1,
        'cluster': 1,
        'streams': 1,
        'highdimensional': 1,
    }
    self.assertEqual(expectedVector, convertedMessage.vector)