# Note: these snippets assume project-level imports from the surrounding
# codebase (TweetFiles, FileIO, Vector, TwitterCrowdsSpecificMethods,
# GeneralMethods, HDSkipStreamClustering, and the stream settings dicts);
# only stdlib imports are shown here.
import time
from collections import defaultdict


def getStatsForCDA():
    # Run HDSkipStreamClustering over the experts tweet stream, recording
    # cluster-analysis statistics every 30 seconds.
    global previousTime
    default_experts_twitter_stream_settings['cluster_analysis_method'] = clusterAnalysis
    default_experts_twitter_stream_settings['cluster_analysis_frequency_in_seconds'] = 30
    clustering = HDSkipStreamClustering(**default_experts_twitter_stream_settings)
    previousTime = time.time()
    clustering.cluster(TweetFiles.iterateTweetsFromGzip(
        '/mnt/chevron/kykamath/data/twitter/lsh_clustering/clustering_quality_experts_folder/data/1000000.gz'))
def generateData():
    # Shard the 1M-tweet gzip archive into JSON files of 10,000 tweets each;
    # Python 2 integer division maps tweet i to file i / 10000.
    i = 0
    for line in TweetFiles.iterateTweetsFromGzip(
            '/mnt/chevron/kykamath/data/twitter/lsh_clustering/clustering_quality_experts_folder/data/1000000.gz'):
        FileIO.writeToFileAsJson(line, time_to_process_points + '10000/%s' % (i / 10000))
        i += 1
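# Quick illustration (not part of the original code; shardForIndex is a
# hypothetical helper) of the sharding above: integer division maps tweet
# index i to shard i / 10000, so each output file holds 10,000 consecutive
# tweets.
def shardForIndex(i, shardSize=10000):
    return i // shardSize  # 0..9999 -> 0, 10000..19999 -> 1, ...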
def iterateTweetUsersAfterCombiningTweets(fileName, **stream_settings):
    # Aggregate every user's tweets into a single vector. Phrases are mapped
    # to compact string ids so all combined vectors share one id space.
    dataForAggregation = defaultdict(Vector)
    textToIdMap = {}
    for tweet in TweetFiles.iterateTweetsFromGzip(fileName):
        textVector = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(tweet, **stream_settings).vector
        textIdVector = Vector()
        for phrase in textVector:
            if phrase not in textToIdMap:
                textToIdMap[phrase] = str(len(textToIdMap))
            textIdVector[textToIdMap[phrase]] = textVector[phrase]
        dataForAggregation[tweet['user']['screen_name'].lower()] += textIdVector
    for k, v in dataForAggregation.iteritems():
        yield k, v
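# A minimal, self-contained sketch (all names and the sample data below are
# hypothetical) of the same aggregation idea: plain dicts stand in for the
# project's Vector class, and phrases are mapped to compact string ids shared
# across all users.
def combineTweetsByUser(userPhraseWeights):
    perUser = defaultdict(lambda: defaultdict(float))
    phraseIds = {}
    for user, phraseWeights in userPhraseWeights:
        for phrase, weight in phraseWeights.items():
            if phrase not in phraseIds:
                phraseIds[phrase] = str(len(phraseIds))
            perUser[user.lower()][phraseIds[phrase]] += weight
    return perUser

# Example: both 'Alice' entries collapse into one vector keyed by phrase ids.
# combineTweetsByUser([('Alice', {'lsh': 1.0}), ('alice', {'lsh': 2.0}),
#                      ('bob', {'stream': 1.0})])
# -> {'alice': {'0': 3.0}, 'bob': {'1': 1.0}}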
def _iterateUserDocuments(self):
    # Method form of iterateTweetUsersAfterCombiningTweets above: aggregates
    # each user's tweets into one vector, reading the instance's raw data file
    # and stream settings.
    dataForAggregation = defaultdict(Vector)
    textToIdMap = {}
    for tweet in TweetFiles.iterateTweetsFromGzip(self.rawDataFileName):
        textVector = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(tweet, **self.stream_settings).vector
        textIdVector = Vector()
        for phrase in textVector:
            if phrase not in textToIdMap:
                textToIdMap[phrase] = str(len(textToIdMap))
            textIdVector[textToIdMap[phrase]] = textVector[phrase]
        dataForAggregation[tweet['user']['screen_name'].lower()] += textIdVector
    for k, v in dataForAggregation.iteritems():
        yield k, v
def iteratePhrases():
    # Yield a (phrase, epoch) pair for every non-empty phrase in the trend
    # tweets, with the epoch approximated to a 60-second bucket.
    for tweet in TweetFiles.iterateTweetsFromGzip('/mnt/chevron/kykamath/data/twitter/tweets_by_trends/2011_2_6.gz'):
        message = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(tweet, **settings)
        if message.vector:
            for phrase in message.vector:
                if phrase != '':
                    yield (phrase, GeneralMethods.approximateEpoch(
                        GeneralMethods.getEpochFromDateTimeObject(message.timeStamp), 60))
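# Hedged sketch (an assumption, not the project's actual implementation) of
# the bucketing iteratePhrases relies on: if GeneralMethods.approximateEpoch
# floors an epoch to its bucket boundary, the stdlib equivalent for 60-second
# buckets would be:
def approximateEpochSketch(epoch, bucketSeconds=60):
    return int(epoch - epoch % bucketSeconds)
# e.g. approximateEpochSketch(1297000065) == 1297000020, the start of that minute.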
def iterateFromFile(file):
    # Thin wrapper: iterate tweets from a gzip file. ('file' shadows the
    # Python 2 builtin; kept as-is to preserve the existing signature.)
    for tweet in TweetFiles.iterateTweetsFromGzip(file):
        yield tweet