def performanceForCDAAt(noOfTweets, fileName, **stream_settings):
    """Cluster the tweets read from fileName and return evaluation metrics.

    An HDStreaminClustering is built from stream_settings and run over
    TwitterIterators.iterateFromFile(fileName). Clusters whose
    documentsInCluster has fewer than
    stream_settings['cluster_filter_threshold'] keys are dropped.

    Returns the result of Evaluation.getEvaluationMetrics called with
    noOfTweets, the list of per-cluster key collections and the elapsed
    wall-clock seconds.
    """
    streamClusterer = HDStreaminClustering(**stream_settings)

    # Only the cluster() call is timed; construction of the clusterer and
    # the filtering below are not part of the measurement.
    startedAt = time.time()
    streamClusterer.cluster(TwitterIterators.iterateFromFile(fileName))
    finishedAt = time.time()

    documentClusters = []
    for _, cluster in streamClusterer.clusters.iteritems():
        memberKeys = cluster.documentsInCluster.keys()
        if len(memberKeys) >= stream_settings['cluster_filter_threshold']:
            documentClusters.append(memberKeys)

    elapsedSeconds = finishedAt - startedAt
    return Evaluation.getEvaluationMetrics(noOfTweets, documentClusters, elapsedSeconds)
 def _tweetIterator(self):
     """Return an iterator of (screen_name, text) pairs for self.fileName + '.gz'.

     Every tweet's text is turned into phrases via
     getWordsFromRawEnglishMessage and getPhrases (bounded by the
     'min_phrase_length' / 'max_phrase_length' stream settings). Spaces
     inside each phrase are replaced with unique_string, and the phrases
     are space-joined and appended to the text collected for that user.
     Users whose tweets never produce a phrase do not appear in the result.
     """
     textByUser = {}
     for tweet in TwitterIterators.iterateFromFile(self.fileName+'.gz'):
         screenName = tweet['user']['screen_name']
         words = getWordsFromRawEnglishMessage(tweet['text'])
         rawPhrases = getPhrases(words, self.stream_settings['min_phrase_length'], self.stream_settings['max_phrase_length'])
         # Replacing inner spaces keeps each phrase a single token once
         # the phrases are joined with spaces below.
         tokens = [rawPhrase.replace(' ', unique_string) for rawPhrase in rawPhrases]
         if not tokens:
             continue
         joined = ' '.join(tokens)
         if screenName in textByUser:
             textByUser[screenName] += ' ' + joined
         else:
             textByUser[screenName] = joined
     return textByUser.iteritems()
 def _tweetWithTimestampIterator(self):
     """Return an iterator of (screen_name, record) pairs for self.fileName + '.gz'.

     Each record is a dict with the keys 'user', 'id', 'created_at' and
     'text'. 'id' and 'created_at' are overwritten by every tweet of the
     user, so they hold the values of the last tweet read for that user.
     'text' starts as a single space and grows by the space-joined phrases
     of each tweet (inner spaces of a phrase replaced with unique_string).
     Unlike the text, the record itself is created even when a tweet
     yields no phrases.
     """
     recordByUser = defaultdict(dict)
     for tweet in TwitterIterators.iterateFromFile(self.fileName+'.gz'):
         screenName = tweet['user']['screen_name']
         record = recordByUser[screenName]
         record['user'] = {'screen_name': screenName}
         record['id'] = tweet['id']
         record['created_at'] = tweet['created_at']
         # Seed the text once per user; later tweets only append to it.
         record.setdefault('text', ' ')
         words = getWordsFromRawEnglishMessage(tweet['text'])
         rawPhrases = getPhrases(words, self.stream_settings['min_phrase_length'], self.stream_settings['max_phrase_length'])
         tokens = [rawPhrase.replace(' ', unique_string) for rawPhrase in rawPhrases]
         if tokens:
             record['text'] += ' ' + ' '.join(tokens)
     return recordByUser.iteritems()
Esempio n. 4
0
 def performanceForCDAAt(noOfTweets, fileName, **stream_settings):
     """Run HDStreaminClustering over the tweets in fileName and score it.

     The clusterer is configured with stream_settings and fed the tweets
     yielded by TwitterIterators.iterateFromFile(fileName). Only clusters
     with at least stream_settings['cluster_filter_threshold'] entries in
     documentsInCluster are passed on.

     Returns whatever Evaluation.getEvaluationMetrics produces for
     (noOfTweets, kept clusters, seconds spent inside cluster()).
     """
     clusterer = HDStreaminClustering(**stream_settings)
     tweets = TwitterIterators.iterateFromFile(fileName)

     # The timer brackets the cluster() call only.
     begin = time.time()
     clusterer.cluster(tweets)
     end = time.time()

     keysPerCluster = (
         cluster.documentsInCluster.keys()
         for _, cluster in clusterer.clusters.iteritems()
     )
     documentClusters = [
         keys for keys in keysPerCluster
         if len(keys) >= stream_settings['cluster_filter_threshold']
     ]
     return Evaluation.getEvaluationMetrics(noOfTweets, documentClusters,
                                            end - begin)
Esempio n. 5
0
 def _tweetIterator(self):
     """Group the phrases of every tweet in self.fileName + '.gz' by user.

     Returns an iterator of (screen_name, text) pairs, where text is the
     space-joined sequence of all phrases extracted from that user's
     tweets, in the order the tweets were read. Spaces inside a phrase are
     replaced with unique_string. Users without any phrase are left out.
     """
     userMap = {}
     for tweet in TwitterIterators.iterateFromFile(self.fileName + '.gz'):
         author = tweet['user']['screen_name']
         extracted = getPhrases(
             getWordsFromRawEnglishMessage(tweet['text']),
             self.stream_settings['min_phrase_length'],
             self.stream_settings['max_phrase_length'])
         tokens = [item.replace(' ', unique_string) for item in extracted]
         if tokens:
             # Collect tokens per user first; they are joined once below.
             userMap.setdefault(author, []).extend(tokens)
     # Replace each token list with its joined text in place, so the
     # returned iterator is over the same dict that was filled above.
     for author in list(userMap):
         userMap[author] = ' '.join(userMap[author])
     return userMap.iteritems()
Esempio n. 6
0
 def _tweetWithTimestampIterator(self):
     """Build one record per user from the tweets in self.fileName + '.gz'.

     Returns an iterator of (screen_name, record) pairs. A record holds
     'user' ({'screen_name': ...}), plus 'id' and 'created_at' taken from
     the most recently read tweet of that user, and 'text': a string that
     begins with a single space and accumulates the space-joined phrases
     of all the user's tweets (spaces inside a phrase are replaced with
     unique_string).
     """
     userMap = defaultdict(dict)
     for tweet in TwitterIterators.iterateFromFile(self.fileName + '.gz'):
         author = tweet['user']['screen_name']
         entry = userMap[author]

         # Metadata is refreshed on every tweet, so the last one wins.
         entry['user'] = {'screen_name': author}
         entry['id'] = tweet['id']
         entry['created_at'] = tweet['created_at']
         if 'text' not in entry:
             entry['text'] = ' '

         words = getWordsFromRawEnglishMessage(tweet['text'])
         extracted = getPhrases(
             words,
             self.stream_settings['min_phrase_length'],
             self.stream_settings['max_phrase_length'])
         tokens = [item.replace(' ', unique_string) for item in extracted]
         if tokens:
             entry['text'] += ' ' + ' '.join(tokens)
     return userMap.iteritems()