def generateStatsForMRKMeansClusteringQuality():
    ''' Write k-means MR clustering stats for a fixed list of input lengths.

    For every length a TweetsFile is built with
    experts_twitter_stream_settings; the value returned by its
    generateStatsForKMeansMRClustering() is written, together with the
    serialized stream settings, to TweetsFile.mr_stats_file through
    FileIO.writeToFileAsJson.
    '''
    for length in (90000, 100000, 200000, 300000, 400000, 500000):
        # One pre-formatted argument: prints the same text as the former
        # `print 'Generating stats for: ', i` statement on Python 2 and is
        # also valid on Python 3.
        print('Generating stats for:  %s' % (length,))
        tf = TweetsFile(length, **experts_twitter_stream_settings)
        stats = {
            'mr_k_means': tf.generateStatsForKMeansMRClustering(),
            'settings': Settings.getSerialzedObject(tf.stream_settings),
        }
        FileIO.writeToFileAsJson(stats, TweetsFile.mr_stats_file)
 def writeClusters(hdStreamClusteringObject, currentMessageTime):
     ''' Write the current clusters of a stream clustering object as JSON.

     hdStreamClusteringObject: object exposing phraseTextAndDimensionMap,
         phraseTextToPhraseObjectMap, clusters (a dict) and stream_settings.
     currentMessageTime: timestamp of the message being processed; it is
         used for the 'time_stamp' field and to pick the output file name
         through FileIO.getFileByDay.

     The output file is stream_settings['lsh_clusters_folder'] followed by
     FileIO.getFileByDay(currentMessageTime). The sizes of the two phrase
     maps and of the cluster dict are printed before and after the write.
     '''
     clustering = hdStreamClusteringObject
     # Single pre-formatted argument: same console text as the former print
     # statement on Python 2, and valid syntax on Python 3.
     print('\n\n\nEntering: %s %s %s %s' % (
         currentMessageTime,
         len(clustering.phraseTextAndDimensionMap),
         len(clustering.phraseTextToPhraseObjectMap),
         len(clustering.clusters)))
     timeStamp = getStringRepresentationForTweetTimestamp(currentMessageTime)
     # Pairs yielded by iterateByAttribute are ordered by their second
     # element, largest first.
     rankedClusters = sorted(
         StreamCluster.iterateByAttribute(clustering.clusters.values(), 'length'),
         key=itemgetter(1), reverse=True)
     iterationData = {
         'time_stamp': timeStamp,
         # A list comprehension instead of map(): map() returns a lazy
         # iterator on Python 3, which is not a usable JSON payload.
         'clusters': [TwitterCrowdsSpecificMethods.getClusterInMapFormat(cluster)
                      for cluster, _ in rankedClusters],
         'settings': Settings.getSerialzedObject(clustering.stream_settings),
     }
     outputFile = (clustering.stream_settings['lsh_clusters_folder']
                   + FileIO.getFileByDay(currentMessageTime))
     FileIO.writeToFileAsJson(iterationData, outputFile)
     print('Leaving:  %s %s %s %s' % (
         currentMessageTime,
         len(clustering.phraseTextAndDimensionMap),
         len(clustering.phraseTextToPhraseObjectMap),
         len(clustering.clusters)))
 def generateStatsForDefaultStreamSettings():
     ''' Write streaming LSH clustering stats using the default settings.

     Lengths are base * multiplier for base in 10**3, 10**4, 10**5 and
     multiplier in 1..9. For each length a TweetsFile is built with
     default_experts_twitter_stream_settings and the value returned by
     generateStatsForStreamingLSHClustering() is written, with the
     serialized stream settings, to TweetsFile.default_stats_file.
     '''
     for base in (10**3, 10**4, 10**5):
         for multiplier in range(1, 10):
             length = base * multiplier
             # Pre-formatted so the line prints identically on Python 2 and
             # is valid on Python 3 (the print statement was Python 2 only).
             print('Generating stats for:  %s' % (length,))
             tf = TweetsFile(length, **default_experts_twitter_stream_settings)
             stats = {
                 'streaming_lsh': tf.generateStatsForStreamingLSHClustering(),
                 'settings': Settings.getSerialzedObject(tf.stream_settings),
             }
             FileIO.writeToFileAsJson(stats, TweetsFile.default_stats_file)
 def generateStatsForQualityComparisonWithSSA():
     ''' Write SSA-MR stats used for the quality comparison with SSA.

     Only the length 1000000 is processed. A TweetsFile is built with
     experts_twitter_stream_settings and the value returned by
     getStatsForSSAMR() is written, with the serialized stream settings,
     to TweetsFile.stats_file.
     '''
     # Alternative length sweep left commented out by the author:
     #   for length in [i*j for i in 10**3, 10**4, 10**5 for j in range(1, 10)]:
     for length in (1000000,):
         # Pre-formatted so the line prints identically on Python 2 and is
         # valid on Python 3 (the print statement was Python 2 only).
         print("Generating stats for:  %s" % (length,))
         tf = TweetsFile(length, **experts_twitter_stream_settings)
         # Fuller variant left commented out by the author:
         #   stats = {'ssa': tf.getStatsForSSA(), 'ssa_mr': tf.getStatsForSSAMR(), 'streaming_lsh': KMeansTweetsFile(length, **experts_twitter_stream_settings).generateStatsForStreamingLSHClustering(), 'settings': Settings.getSerialzedObject(tf.stream_settings)}
         stats = {
             "ssa_mr": tf.getStatsForSSAMR(),
             "settings": Settings.getSerialzedObject(tf.stream_settings),
         }
         FileIO.writeToFileAsJson(stats, TweetsFile.stats_file)
Beispiel #5
0
 def generateStatsForMRKMeansClusteringQuality():
     ''' Write k-means MR clustering stats for a fixed list of input lengths.

     Each length gets its own TweetsFile built with
     experts_twitter_stream_settings. The value returned by
     generateStatsForKMeansMRClustering() and the serialized stream
     settings are written to TweetsFile.mr_stats_file through
     FileIO.writeToFileAsJson.
     '''
     lengths = (90000, 100000, 200000, 300000, 400000, 500000)
     for length in lengths:
         # The print statement was Python 2 only; a single formatted
         # argument prints the same text there and also runs on Python 3.
         print('Generating stats for:  %s' % (length,))
         tf = TweetsFile(length, **experts_twitter_stream_settings)
         payload = {
             'mr_k_means': tf.generateStatsForKMeansMRClustering(),
             'settings': Settings.getSerialzedObject(tf.stream_settings),
         }
         FileIO.writeToFileAsJson(payload, TweetsFile.mr_stats_file)
    def generateStatsForOptimized():
        ''' Write HD LSH clustering stats for the expert stream settings.

        For each of the lengths 1000000, 1100000 and 1200000 a TweetsFile
        is built with experts_twitter_stream_settings. The value returned
        by generateStatsForHDLSHClustering() is written under the key
        'streaming_lsh', with the serialized stream settings, to the file
        hd_clustering_performance_folder + 'cda'.
        '''
        # Alternative length sweeps left commented out by the author:
        #   for i in [10**3, 10**4, 10**5]:
        #   for i in [10**6]:
        #       for j in range(1, 10):
        for length in (1000000, 1100000, 1200000):
            # The print statement was Python 2 only; a single formatted
            # argument prints the same text there and also runs on Python 3.
            print('Generating stats for:  %s' % (length,))
            tf = TweetsFile(length, **experts_twitter_stream_settings)
            stats = {
                'streaming_lsh': tf.generateStatsForHDLSHClustering(),
                'settings': Settings.getSerialzedObject(tf.stream_settings),
            }
            FileIO.writeToFileAsJson(stats, hd_clustering_performance_folder + 'cda')
 def generateStatsForQualityComparisonWithSSA():
     ''' Write SSA-MR stats used for the quality comparison with SSA.

     The loop currently covers the single length 1000000. The value
     returned by TweetsFile.getStatsForSSAMR() and the serialized stream
     settings are written to TweetsFile.stats_file.
     '''
     # Alternative length sweep left commented out by the author:
     #   for length in [i*j for i in 10**3, 10**4, 10**5 for j in range(1, 10)]:
     for length in (1000000,):
         # The print statement was Python 2 only; a single formatted
         # argument prints the same text there and also runs on Python 3.
         print('Generating stats for:  %s' % (length,))
         tf = TweetsFile(length, **experts_twitter_stream_settings)
         # Fuller variant left commented out by the author:
         #   stats = {'ssa': tf.getStatsForSSA(), 'ssa_mr': tf.getStatsForSSAMR(), 'streaming_lsh': KMeansTweetsFile(length, **experts_twitter_stream_settings).generateStatsForStreamingLSHClustering(), 'settings': Settings.getSerialzedObject(tf.stream_settings)}
         ssaMrStats = tf.getStatsForSSAMR()
         serializedSettings = Settings.getSerialzedObject(tf.stream_settings)
         FileIO.writeToFileAsJson(
             {'ssa_mr': ssaMrStats, 'settings': serializedSettings},
             TweetsFile.stats_file)
Beispiel #8
0
 def generateStatsForCDA():
     ''' Write CDA performance stats for every (length, file) pair.

     Pairs come from GenerateStats.lengthAndFileIterator(). For each pair
     GenerateStats.performanceForCDAAt is called with
     experts_twitter_stream_settings; its result is stored under the key
     CDA, next to the serialized settings, and written to
     getPerformanceFile(CDA).
     '''
     for length, fileName in GenerateStats.lengthAndFileIterator():
         # The print statement was Python 2 only; a single formatted
         # argument prints the same text there and also runs on Python 3.
         print('Generating stats for:  %s' % (length,))
         performance = GenerateStats.performanceForCDAAt(
             length, fileName, **experts_twitter_stream_settings)
         serializedSettings = Settings.getSerialzedObject(
             experts_twitter_stream_settings)
         stats = {CDA: performance, 'settings': serializedSettings}
         FileIO.writeToFileAsJson(stats, getPerformanceFile(CDA))
Beispiel #9
0
 def generateStatsForOptimized():
     ''' Write HD LSH clustering stats for the expert stream settings.

     Lengths 1000000, 1100000 and 1200000 are processed in turn. Each run
     builds a TweetsFile with experts_twitter_stream_settings and writes
     the result of generateStatsForHDLSHClustering() (key 'streaming_lsh')
     plus the serialized stream settings to
     hd_clustering_performance_folder + 'cda'.
     '''
     # Alternative length sweeps left commented out by the author:
     #   for i in [10**3, 10**4, 10**5]:
     #   for i in [10**6]:
     #       for j in range(1, 10):
     outputFile = hd_clustering_performance_folder + 'cda'
     for length in (1000000, 1100000, 1200000):
         # Pre-formatted so the line prints identically on Python 2 and is
         # valid on Python 3 (the print statement was Python 2 only).
         print('Generating stats for:  %s' % (length,))
         tf = TweetsFile(length, **experts_twitter_stream_settings)
         stats = {
             'streaming_lsh': tf.generateStatsForHDLSHClustering(),
             'settings': Settings.getSerialzedObject(tf.stream_settings),
         }
         FileIO.writeToFileAsJson(stats, outputFile)
Beispiel #10
0
 def generateStatsForDefaultStreamSettings():
     ''' Write streaming LSH clustering stats using the default settings.

     The lengths covered are i * j for i in 10**3, 10**4, 10**5 and j in
     1..9, in that nesting order. Each run builds a TweetsFile with
     default_experts_twitter_stream_settings and writes the result of
     generateStatsForStreamingLSHClustering() plus the serialized stream
     settings to TweetsFile.default_stats_file.
     '''
     lengths = [i * j for i in (10**3, 10**4, 10**5) for j in range(1, 10)]
     for length in lengths:
         # The print statement was Python 2 only; a single formatted
         # argument prints the same text there and also runs on Python 3.
         print('Generating stats for:  %s' % (length,))
         tf = TweetsFile(length, **default_experts_twitter_stream_settings)
         lshStats = tf.generateStatsForStreamingLSHClustering()
         serializedSettings = Settings.getSerialzedObject(tf.stream_settings)
         FileIO.writeToFileAsJson(
             {'streaming_lsh': lshStats, 'settings': serializedSettings},
             TweetsFile.default_stats_file)
    def generateStatsForUnOptimized():
        ''' Write HD LSH clustering stats for the default stream settings.

        For each of the lengths 1000000, 1100000 and 1200000 a TweetsFile
        is built with default_experts_twitter_stream_settings. The value
        returned by generateStatsForHDLSHClustering() is written under
        'streaming_lsh', with the serialized stream settings, to
        hd_clustering_performance_folder + 'cda_unopt'. The 'clusters'
        entry is then removed and the remaining stats are printed.
        '''
        # Alternative length sweeps left commented out by the author:
        #   for i in [10**3, 10**4, 10**5]:
        #   for i in [10**6]:
        #       for j in range(1, 10):
        for length in (1000000, 1100000, 1200000):
            # The print statement was Python 2 only; a single formatted
            # argument prints the same text there and also runs on Python 3.
            print('Generating stats for:  %s' % (length,))
            # Setting override left commented out by the author:
            #   default_experts_twitter_stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
            tf = TweetsFile(length, **default_experts_twitter_stream_settings)
            performance = tf.generateStatsForHDLSHClustering()
            stats = {
                'streaming_lsh': performance,
                'settings': Settings.getSerialzedObject(tf.stream_settings),
            }
            FileIO.writeToFileAsJson(stats, hd_clustering_performance_folder + 'cda_unopt')
            # 'clusters' is dropped only after the write call, so it is
            # still present in the object passed to the writer.
            del performance['clusters']
            print(performance)
Beispiel #12
0
 def generateStatsForUnOptimized():
     ''' Write HD LSH clustering stats for the default stream settings.

     Lengths 1000000, 1100000 and 1200000 are processed in turn. Each run
     builds a TweetsFile with default_experts_twitter_stream_settings,
     writes the result of generateStatsForHDLSHClustering() (key
     'streaming_lsh') plus the serialized stream settings to
     hd_clustering_performance_folder + 'cda_unopt', then deletes the
     'clusters' entry from the result and prints what is left.
     '''
     # Alternative length sweeps left commented out by the author:
     #   for i in [10**3, 10**4, 10**5]:
     #   for i in [10**6]:
     #       for j in range(1, 10):
     outputFile = hd_clustering_performance_folder + 'cda_unopt'
     for length in (1000000, 1100000, 1200000):
         # Pre-formatted so the line prints identically on Python 2 and is
         # valid on Python 3 (the print statement was Python 2 only).
         print('Generating stats for:  %s' % (length,))
         # Setting override left commented out by the author:
         #   default_experts_twitter_stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
         tf = TweetsFile(length, **default_experts_twitter_stream_settings)
         performance = tf.generateStatsForHDLSHClustering()
         serializedSettings = Settings.getSerialzedObject(tf.stream_settings)
         FileIO.writeToFileAsJson(
             {'streaming_lsh': performance, 'settings': serializedSettings},
             outputFile)
         # Removed after the write call, so the writer still receives the
         # 'clusters' entry; only the console summary omits it.
         del performance['clusters']
         print(performance)
Beispiel #13
0
 def thresholdForDocumentToBeInCluterEstimation(stats_file,
                                                **stream_settings):
     ''' Estimate the cluster threshold by varying
     threshold_for_document_to_be_in_cluster.

     stats_file: file the stats are written to via FileIO.writeToFileAsJson.
     stream_settings: keyword settings forwarded to KMeansTweetsFile; the
         key 'threshold_for_document_to_be_in_cluster' is overwritten on
         every iteration.

     For every length i * j (i in 10**3, 10**4, 10**5; j in 1..9) and every
     t in 16..20 the threshold is set to t * 0.05 and the result of
     generateStatsForStreamingLSHClustering() is written with the
     serialized settings.

     Author's note: run this on a document set of size 100K.
     '''
     # The tuple is parenthesised: the bare `for i in 10**3, 10**4, 10**5`
     # form inside a list comprehension is Python 2 only syntax.
     lengths = [i * j for i in (10**3, 10**4, 10**5) for j in range(1, 10)]
     for length in lengths:
         # Lower threshold range left commented out by the author:
         #   for t in range(1, 16):
         for t in range(16, 21):
             threshold = t * 0.05
             stream_settings['threshold_for_document_to_be_in_cluster'] = threshold
             # Pre-formatted so the line prints identically on Python 2 and
             # is valid on Python 3 (the print statement was Python 2 only).
             print('%s %s' % (length, threshold))
             tweetsFile = KMeansTweetsFile(length, **stream_settings)
             stats = {
                 'streaming_lsh': tweetsFile.generateStatsForStreamingLSHClustering(),
                 'settings': Settings.getSerialzedObject(stream_settings),
             }
             FileIO.writeToFileAsJson(stats, stats_file)
    def thresholdForDocumentToBeInCluterEstimation(stats_file, **stream_settings):
        ''' Estimate the cluster threshold by varying
        threshold_for_document_to_be_in_cluster.

        stats_file: file the stats are written to via
            FileIO.writeToFileAsJson.
        stream_settings: keyword settings forwarded to KMeansTweetsFile;
            'threshold_for_document_to_be_in_cluster' is overwritten on
            every iteration with t * 0.05 for t in 16..20.

        Lengths covered are i * j for i in 10**3, 10**4, 10**5 and j in
        1..9. Each combination writes the result of
        generateStatsForStreamingLSHClustering() and the serialized
        settings.

        Author's note: run this on a document set of size 100K.
        '''
        # Parentheses around the tuple keep the comprehension valid on
        # Python 3; the bare comma form is accepted by Python 2 only.
        for length in [i * j for i in (10 ** 3, 10 ** 4, 10 ** 5) for j in range(1, 10)]:
            # Lower threshold range left commented out by the author:
            #   for t in range(1, 16):
            for t in range(16, 21):
                stream_settings['threshold_for_document_to_be_in_cluster'] = t * 0.05
                # The print statement was Python 2 only; a single formatted
                # argument prints the same text there and runs on Python 3.
                print('%s %s' % (
                    length,
                    stream_settings['threshold_for_document_to_be_in_cluster']))
                lshStats = KMeansTweetsFile(length, **stream_settings).generateStatsForStreamingLSHClustering()
                serializedSettings = Settings.getSerialzedObject(stream_settings)
                FileIO.writeToFileAsJson(
                    {'streaming_lsh': lshStats, 'settings': serializedSettings},
                    stats_file)
 def generateStatsForCDA():
     ''' Write CDA performance stats for every (length, file) pair.

     GenerateStats.lengthAndFileIterator() supplies the pairs. Each one is
     passed to GenerateStats.performanceForCDAAt together with
     experts_twitter_stream_settings, and the result is written under the
     key CDA, alongside the serialized settings, to
     getPerformanceFile(CDA).
     '''
     for length, fileName in GenerateStats.lengthAndFileIterator():
         # Pre-formatted so the line prints identically on Python 2 and is
         # valid on Python 3 (the print statement was Python 2 only).
         print('Generating stats for:  %s' % (length,))
         performance = GenerateStats.performanceForCDAAt(
             length, fileName, **experts_twitter_stream_settings)
         stats = {
             CDA: performance,
             'settings': Settings.getSerialzedObject(experts_twitter_stream_settings),
         }
         FileIO.writeToFileAsJson(stats, getPerformanceFile(CDA))