def generateStatsForMRKMeansClusteringQuality():
    '''
    Collect MR k-means clustering stats for a fixed list of corpus sizes.

    For every size a TweetsFile is created with
    experts_twitter_stream_settings. The value returned by its
    generateStatsForKMeansMRClustering() and the serialized stream settings
    are passed, as one dict, to FileIO.writeToFileAsJson together with
    TweetsFile.mr_stats_file.
    '''
    corpus_sizes = (90000, 100000, 200000, 300000, 400000, 500000)
    for num_tweets in corpus_sizes:
        # A single pre-formatted argument makes print(...) emit the same text
        # whether it is parsed as a statement or as a function call.
        print('%s %s' % ('Generating stats for: ', num_tweets))
        tweets_file = TweetsFile(num_tweets, **experts_twitter_stream_settings)
        clustering_stats = tweets_file.generateStatsForKMeansMRClustering()
        serialized_settings = Settings.getSerialzedObject(tweets_file.stream_settings)
        FileIO.writeToFileAsJson(
            {'mr_k_means': clustering_stats, 'settings': serialized_settings},
            TweetsFile.mr_stats_file)
def writeClusters(hdStreamClusteringObject, currentMessageTime):
    '''
    Dump the clusters held by a stream-clustering object for one time stamp.

    The record handed to FileIO.writeToFileAsJson contains:
      - 'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime)
      - 'clusters': every cluster converted with
        TwitterCrowdsSpecificMethods.getClusterInMapFormat, ordered by the
        second element of the pairs produced by
        StreamCluster.iterateByAttribute(..., 'length'), in descending order
      - 'settings': the serialized stream settings

    The target path is stream_settings['lsh_clusters_folder'] followed by
    FileIO.getFileByDay(currentMessageTime). A progress line with the sizes
    of the two phrase maps and of the cluster map is printed before and
    after the write.
    '''
    def current_sizes():
        # Re-read on every call so the 'Leaving' line reports the sizes as
        # they are after the write.
        return (len(hdStreamClusteringObject.phraseTextAndDimensionMap),
                len(hdStreamClusteringObject.phraseTextToPhraseObjectMap),
                len(hdStreamClusteringObject.clusters))

    # A single pre-formatted argument makes print(...) emit the same text
    # whether it is parsed as a statement or as a function call.
    print('%s %s %s %s %s' % (('\n\n\nEntering:', currentMessageTime) + current_sizes()))

    time_stamp = getStringRepresentationForTweetTimestamp(currentMessageTime)
    to_map_format = TwitterCrowdsSpecificMethods.getClusterInMapFormat
    ranked_pairs = sorted(
        StreamCluster.iterateByAttribute(hdStreamClusteringObject.clusters.values(), 'length'),
        key=itemgetter(1),
        reverse=True)
    clusters_in_map_format = [to_map_format(cluster) for cluster, _ in ranked_pairs]
    serialized_settings = Settings.getSerialzedObject(hdStreamClusteringObject.stream_settings)

    iterationData = {
        'time_stamp': time_stamp,
        'clusters': clusters_in_map_format,
        'settings': serialized_settings,
    }
    output_path = (hdStreamClusteringObject.stream_settings['lsh_clusters_folder']
                   + FileIO.getFileByDay(currentMessageTime))
    FileIO.writeToFileAsJson(iterationData, output_path)

    print('%s %s %s %s %s' % (('Leaving: ', currentMessageTime) + current_sizes()))
def generateStatsForDefaultStreamSettings():
    '''
    Collect streaming-LSH clustering stats under the default stream settings.

    Corpus sizes are base * factor for base in 10**3, 10**4, 10**5 and
    factor in 1..9. For each size a TweetsFile is created with
    default_experts_twitter_stream_settings, and the value returned by its
    generateStatsForStreamingLSHClustering() plus the serialized stream
    settings are passed to FileIO.writeToFileAsJson together with
    TweetsFile.default_stats_file.
    '''
    magnitudes = (10**3, 10**4, 10**5)
    # Same visiting order as two nested loops: 1000..9000, 10000..90000, ...
    corpus_sizes = [base * factor for base in magnitudes for factor in range(1, 10)]
    for num_tweets in corpus_sizes:
        # A single pre-formatted argument makes print(...) emit the same text
        # whether it is parsed as a statement or as a function call.
        print('%s %s' % ('Generating stats for: ', num_tweets))
        tweets_file = TweetsFile(num_tweets, **default_experts_twitter_stream_settings)
        clustering_stats = tweets_file.generateStatsForStreamingLSHClustering()
        serialized_settings = Settings.getSerialzedObject(tweets_file.stream_settings)
        FileIO.writeToFileAsJson(
            {'streaming_lsh': clustering_stats, 'settings': serialized_settings},
            TweetsFile.default_stats_file)
def generateStatsForQualityComparisonWithSSA():
    '''
    Collect the SSA map-reduce stats used for the quality comparison.

    A TweetsFile is created with experts_twitter_stream_settings for each
    enabled corpus size (currently only 1000000). The value returned by its
    getStatsForSSAMR() and the serialized stream settings are passed to
    FileIO.writeToFileAsJson together with TweetsFile.stats_file.
    '''
    # Disabled alternatives:
    #   - sweep over i*j for i in 10**3, 10**4, 10**5 and j in range(1, 10)
    #   - additionally recording 'ssa' (tf.getStatsForSSA()) and
    #     'streaming_lsh' (KMeansTweetsFile(...).generateStatsForStreamingLSHClustering())
    corpus_sizes = (1000000,)
    for num_tweets in corpus_sizes:
        # A single pre-formatted argument makes print(...) emit the same text
        # whether it is parsed as a statement or as a function call.
        print('%s %s' % ('Generating stats for: ', num_tweets))
        tweets_file = TweetsFile(num_tweets, **experts_twitter_stream_settings)
        ssa_mr_stats = tweets_file.getStatsForSSAMR()
        serialized_settings = Settings.getSerialzedObject(tweets_file.stream_settings)
        stats = {'ssa_mr': ssa_mr_stats, 'settings': serialized_settings}
        FileIO.writeToFileAsJson(stats, TweetsFile.stats_file)
def generateStatsForMRKMeansClusteringQuality():
    '''
    Collect MR k-means clustering stats for a fixed list of corpus sizes.

    For every size a TweetsFile is created with
    experts_twitter_stream_settings. The value returned by its
    generateStatsForKMeansMRClustering() and the serialized stream settings
    are passed, as one dict, to FileIO.writeToFileAsJson together with
    TweetsFile.mr_stats_file.
    '''
    corpus_sizes = (90000, 100000, 200000, 300000, 400000, 500000)
    for num_tweets in corpus_sizes:
        # A single pre-formatted argument makes print(...) emit the same text
        # whether it is parsed as a statement or as a function call.
        print('%s %s' % ('Generating stats for: ', num_tweets))
        tweets_file = TweetsFile(num_tweets, **experts_twitter_stream_settings)
        clustering_stats = tweets_file.generateStatsForKMeansMRClustering()
        serialized_settings = Settings.getSerialzedObject(tweets_file.stream_settings)
        FileIO.writeToFileAsJson(
            {'mr_k_means': clustering_stats, 'settings': serialized_settings},
            TweetsFile.mr_stats_file)
def generateStatsForOptimized():
    '''
    Collect HD-LSH clustering stats with the experts stream settings.

    For each corpus size a TweetsFile is created with
    experts_twitter_stream_settings. The value returned by its
    generateStatsForHDLSHClustering() and the serialized stream settings are
    passed to FileIO.writeToFileAsJson, with the target given by
    hd_clustering_performance_folder + 'cda'.
    '''
    # Disabled alternatives: sweeping i in [10**3, 10**4, 10**5] (or [10**6])
    # combined with j in range(1, 10).
    corpus_sizes = (1000000, 1100000, 1200000)
    for num_tweets in corpus_sizes:
        # A single pre-formatted argument makes print(...) emit the same text
        # whether it is parsed as a statement or as a function call.
        print('%s %s' % ('Generating stats for: ', num_tweets))
        tweets_file = TweetsFile(num_tweets, **experts_twitter_stream_settings)
        clustering_stats = tweets_file.generateStatsForHDLSHClustering()
        serialized_settings = Settings.getSerialzedObject(tweets_file.stream_settings)
        FileIO.writeToFileAsJson(
            {'streaming_lsh': clustering_stats, 'settings': serialized_settings},
            hd_clustering_performance_folder + 'cda')
def generateStatsForQualityComparisonWithSSA():
    '''
    Collect the SSA map-reduce stats used for the quality comparison.

    A TweetsFile is created with experts_twitter_stream_settings for each
    enabled corpus size (currently only 1000000). The value returned by its
    getStatsForSSAMR() and the serialized stream settings are passed to
    FileIO.writeToFileAsJson together with TweetsFile.stats_file.
    '''
    # Disabled alternatives:
    #   - sweep over i*j for i in 10**3, 10**4, 10**5 and j in range(1, 10)
    #   - additionally recording 'ssa' (tf.getStatsForSSA()) and
    #     'streaming_lsh' (KMeansTweetsFile(...).generateStatsForStreamingLSHClustering())
    corpus_sizes = (1000000,)
    for num_tweets in corpus_sizes:
        # A single pre-formatted argument makes print(...) emit the same text
        # whether it is parsed as a statement or as a function call.
        print('%s %s' % ('Generating stats for: ', num_tweets))
        tweets_file = TweetsFile(num_tweets, **experts_twitter_stream_settings)
        ssa_mr_stats = tweets_file.getStatsForSSAMR()
        serialized_settings = Settings.getSerialzedObject(tweets_file.stream_settings)
        stats = {'ssa_mr': ssa_mr_stats, 'settings': serialized_settings}
        FileIO.writeToFileAsJson(stats, TweetsFile.stats_file)
def generateStatsForCDA():
    '''
    Collect CDA performance stats for every (length, file) pair.

    Pairs come from GenerateStats.lengthAndFileIterator(). For each pair the
    value returned by GenerateStats.performanceForCDAAt(...) is stored under
    the CDA key, next to the serialized experts_twitter_stream_settings, and
    the dict is passed to FileIO.writeToFileAsJson with
    getPerformanceFile(CDA) as the target.
    '''
    for num_documents, source_file in GenerateStats.lengthAndFileIterator():
        # A single pre-formatted argument makes print(...) emit the same text
        # whether it is parsed as a statement or as a function call.
        print('%s %s' % ('Generating stats for: ', num_documents))
        cda_performance = GenerateStats.performanceForCDAAt(
            num_documents, source_file, **experts_twitter_stream_settings)
        serialized_settings = Settings.getSerialzedObject(experts_twitter_stream_settings)
        stats = {CDA: cda_performance, 'settings': serialized_settings}
        FileIO.writeToFileAsJson(stats, getPerformanceFile(CDA))
def generateStatsForOptimized():
    '''
    Collect HD-LSH clustering stats with the experts stream settings.

    For each corpus size a TweetsFile is created with
    experts_twitter_stream_settings. The value returned by its
    generateStatsForHDLSHClustering() and the serialized stream settings are
    passed to FileIO.writeToFileAsJson, with the target given by
    hd_clustering_performance_folder + 'cda'.
    '''
    # Disabled alternatives: sweeping i in [10**3, 10**4, 10**5] (or [10**6])
    # combined with j in range(1, 10).
    corpus_sizes = (1000000, 1100000, 1200000)
    for num_tweets in corpus_sizes:
        # A single pre-formatted argument makes print(...) emit the same text
        # whether it is parsed as a statement or as a function call.
        print('%s %s' % ('Generating stats for: ', num_tweets))
        tweets_file = TweetsFile(num_tweets, **experts_twitter_stream_settings)
        clustering_stats = tweets_file.generateStatsForHDLSHClustering()
        serialized_settings = Settings.getSerialzedObject(tweets_file.stream_settings)
        FileIO.writeToFileAsJson(
            {'streaming_lsh': clustering_stats, 'settings': serialized_settings},
            hd_clustering_performance_folder + 'cda')
def generateStatsForDefaultStreamSettings():
    '''
    Collect streaming-LSH clustering stats under the default stream settings.

    Corpus sizes are base * factor for base in 10**3, 10**4, 10**5 and
    factor in 1..9. For each size a TweetsFile is created with
    default_experts_twitter_stream_settings, and the value returned by its
    generateStatsForStreamingLSHClustering() plus the serialized stream
    settings are passed to FileIO.writeToFileAsJson together with
    TweetsFile.default_stats_file.
    '''
    magnitudes = (10**3, 10**4, 10**5)
    # Same visiting order as two nested loops: 1000..9000, 10000..90000, ...
    corpus_sizes = [base * factor for base in magnitudes for factor in range(1, 10)]
    for num_tweets in corpus_sizes:
        # A single pre-formatted argument makes print(...) emit the same text
        # whether it is parsed as a statement or as a function call.
        print('%s %s' % ('Generating stats for: ', num_tweets))
        tweets_file = TweetsFile(num_tweets, **default_experts_twitter_stream_settings)
        clustering_stats = tweets_file.generateStatsForStreamingLSHClustering()
        serialized_settings = Settings.getSerialzedObject(tweets_file.stream_settings)
        FileIO.writeToFileAsJson(
            {'streaming_lsh': clustering_stats, 'settings': serialized_settings},
            TweetsFile.default_stats_file)
def generateStatsForUnOptimized():
    '''
    Collect HD-LSH clustering stats with the default stream settings.

    For each corpus size a TweetsFile is created with
    default_experts_twitter_stream_settings. The value returned by its
    generateStatsForHDLSHClustering() and the serialized stream settings are
    passed to FileIO.writeToFileAsJson, with the target given by
    hd_clustering_performance_folder + 'cda_unopt'. After the write the
    'clusters' entry is removed from the performance object, and what
    remains is printed.
    '''
    # Disabled alternatives: sweeping i in [10**3, 10**4, 10**5] (or [10**6])
    # combined with j in range(1, 10).
    corpus_sizes = (1000000, 1100000, 1200000)
    for num_tweets in corpus_sizes:
        # A single pre-formatted argument makes print(...) emit the same text
        # whether it is parsed as a statement or as a function call.
        print('%s %s' % ('Generating stats for: ', num_tweets))
        # default_experts_twitter_stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
        tweets_file = TweetsFile(num_tweets, **default_experts_twitter_stream_settings)
        performance = tweets_file.generateStatsForHDLSHClustering()
        serialized_settings = Settings.getSerialzedObject(tweets_file.stream_settings)
        FileIO.writeToFileAsJson(
            {'streaming_lsh': performance, 'settings': serialized_settings},
            hd_clustering_performance_folder + 'cda_unopt')
        # Drop the 'clusters' entry only after the record has been written,
        # so the written record still contains it; the console summary omits it.
        del performance['clusters']
        print(performance)
def generateStatsForUnOptimized():
    '''
    Collect HD-LSH clustering stats with the default stream settings.

    For each corpus size a TweetsFile is created with
    default_experts_twitter_stream_settings. The value returned by its
    generateStatsForHDLSHClustering() and the serialized stream settings are
    passed to FileIO.writeToFileAsJson, with the target given by
    hd_clustering_performance_folder + 'cda_unopt'. After the write the
    'clusters' entry is removed from the performance object, and what
    remains is printed.
    '''
    # Disabled alternatives: sweeping i in [10**3, 10**4, 10**5] (or [10**6])
    # combined with j in range(1, 10).
    corpus_sizes = (1000000, 1100000, 1200000)
    for num_tweets in corpus_sizes:
        # A single pre-formatted argument makes print(...) emit the same text
        # whether it is parsed as a statement or as a function call.
        print('%s %s' % ('Generating stats for: ', num_tweets))
        # default_experts_twitter_stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
        tweets_file = TweetsFile(num_tweets, **default_experts_twitter_stream_settings)
        performance = tweets_file.generateStatsForHDLSHClustering()
        serialized_settings = Settings.getSerialzedObject(tweets_file.stream_settings)
        FileIO.writeToFileAsJson(
            {'streaming_lsh': performance, 'settings': serialized_settings},
            hd_clustering_performance_folder + 'cda_unopt')
        # Drop the 'clusters' entry only after the record has been written,
        # so the written record still contains it; the console summary omits it.
        del performance['clusters']
        print(performance)
def thresholdForDocumentToBeInCluterEstimation(stats_file, **stream_settings):
    '''
    Estimate the cluster threshold by varying the
    threshold_for_document_to_be_in_cluster value.

    For every corpus size (i * j for i in 10**3, 10**4, 10**5 and j in 1..9)
    and every step t in 16..20 the setting is overwritten with t * 0.05, a
    KMeansTweetsFile is built with the updated settings, and the value
    returned by its generateStatsForStreamingLSHClustering() plus the
    serialized settings are passed to FileIO.writeToFileAsJson.

    stats_file: target handed to FileIO.writeToFileAsJson for every record.
    stream_settings: stream settings; the threshold entry is replaced on
        every iteration.

    Original author note: run this on a document set of size 100K.
    '''
    threshold_key = 'threshold_for_document_to_be_in_cluster'
    corpus_sizes = [i * j for i in (10**3, 10**4, 10**5) for j in range(1, 10)]
    for length in corpus_sizes:
        # for t in range(1, 16):
        for step in range(16, 21):
            stream_settings[threshold_key] = step * 0.05
            # A single pre-formatted argument makes print(...) emit the same
            # text whether it is parsed as a statement or as a function call.
            print('%s %s' % (length, stream_settings[threshold_key]))
            tweets_file = KMeansTweetsFile(length, **stream_settings)
            clustering_stats = tweets_file.generateStatsForStreamingLSHClustering()
            serialized_settings = Settings.getSerialzedObject(stream_settings)
            stats = {'streaming_lsh': clustering_stats, 'settings': serialized_settings}
            FileIO.writeToFileAsJson(stats, stats_file)
def thresholdForDocumentToBeInCluterEstimation(stats_file, **stream_settings):
    '''
    Estimate the cluster threshold by varying the
    threshold_for_document_to_be_in_cluster value.

    For every corpus size (i * j for i in 10**3, 10**4, 10**5 and j in 1..9)
    and every step t in 16..20 the setting is overwritten with t * 0.05, a
    KMeansTweetsFile is built with the updated settings, and the value
    returned by its generateStatsForStreamingLSHClustering() plus the
    serialized settings are passed to FileIO.writeToFileAsJson.

    stats_file: target handed to FileIO.writeToFileAsJson for every record.
    stream_settings: stream settings; the threshold entry is replaced on
        every iteration.

    Original author note: run this on a document set of size 100K.
    '''
    threshold_key = 'threshold_for_document_to_be_in_cluster'
    corpus_sizes = [i * j for i in (10**3, 10**4, 10**5) for j in range(1, 10)]
    for length in corpus_sizes:
        # for t in range(1, 16):
        for step in range(16, 21):
            stream_settings[threshold_key] = step * 0.05
            # A single pre-formatted argument makes print(...) emit the same
            # text whether it is parsed as a statement or as a function call.
            print('%s %s' % (length, stream_settings[threshold_key]))
            tweets_file = KMeansTweetsFile(length, **stream_settings)
            clustering_stats = tweets_file.generateStatsForStreamingLSHClustering()
            serialized_settings = Settings.getSerialzedObject(stream_settings)
            stats = {'streaming_lsh': clustering_stats, 'settings': serialized_settings}
            FileIO.writeToFileAsJson(stats, stats_file)
def generateStatsForCDA():
    '''
    Collect CDA performance stats for every (length, file) pair.

    Pairs come from GenerateStats.lengthAndFileIterator(). For each pair the
    value returned by GenerateStats.performanceForCDAAt(...) is stored under
    the CDA key, next to the serialized experts_twitter_stream_settings, and
    the dict is passed to FileIO.writeToFileAsJson with
    getPerformanceFile(CDA) as the target.
    '''
    for num_documents, source_file in GenerateStats.lengthAndFileIterator():
        # A single pre-formatted argument makes print(...) emit the same text
        # whether it is parsed as a statement or as a function call.
        print('%s %s' % ('Generating stats for: ', num_documents))
        cda_performance = GenerateStats.performanceForCDAAt(
            num_documents, source_file, **experts_twitter_stream_settings)
        serialized_settings = Settings.getSerialzedObject(experts_twitter_stream_settings)
        stats = {CDA: cda_performance, 'settings': serialized_settings}
        FileIO.writeToFileAsJson(stats, getPerformanceFile(CDA))