def generateStatsForMRKMeansClusteringQuality(): for i in [90000, 100000, 200000, 300000, 400000, 500000]: print 'Generating stats for: ',i tf = TweetsFile(i, **experts_twitter_stream_settings) FileIO.writeToFileAsJson({'mr_k_means': tf.generateStatsForKMeansMRClustering(), 'settings': Settings.getSerialzedObject(tf.stream_settings)}, TweetsFile.mr_stats_file)
def writeClusters(hdStreamClusteringObject, currentMessageTime): print '\n\n\nEntering:', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters) iterationData = {'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime), 'clusters': map(TwitterCrowdsSpecificMethods.getClusterInMapFormat, [cluster for cluster, _ in sorted(StreamCluster.iterateByAttribute(hdStreamClusteringObject.clusters.values(), 'length'), key=itemgetter(1), reverse=True)]), 'settings': Settings.getSerialzedObject(hdStreamClusteringObject.stream_settings) } FileIO.writeToFileAsJson(iterationData, hdStreamClusteringObject.stream_settings['lsh_clusters_folder']+FileIO.getFileByDay(currentMessageTime)) print 'Leaving: ', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
def generateStatsForDefaultStreamSettings(): for i in [10**3, 10**4, 10**5]: for j in range(1, 10): print 'Generating stats for: ',i*j tf = TweetsFile(i*j, **default_experts_twitter_stream_settings) FileIO.writeToFileAsJson({'streaming_lsh': tf.generateStatsForStreamingLSHClustering(), 'settings': Settings.getSerialzedObject(tf.stream_settings)}, TweetsFile.default_stats_file)
def generateStatsForQualityComparisonWithSSA(): # for length in [i*j for i in 10**3, 10**4, 10**5 for j in range(1, 10)]: for length in [1000000]: print "Generating stats for: ", length tf = TweetsFile(length, **experts_twitter_stream_settings) # stats = {'ssa': tf.getStatsForSSA(), 'ssa_mr': tf.getStatsForSSAMR(), 'streaming_lsh': KMeansTweetsFile(length, **experts_twitter_stream_settings).generateStatsForStreamingLSHClustering(), 'settings': Settings.getSerialzedObject(tf.stream_settings)} stats = {"ssa_mr": tf.getStatsForSSAMR(), "settings": Settings.getSerialzedObject(tf.stream_settings)} FileIO.writeToFileAsJson(stats, TweetsFile.stats_file)
def generateStatsForMRKMeansClusteringQuality(): for i in [90000, 100000, 200000, 300000, 400000, 500000]: print 'Generating stats for: ', i tf = TweetsFile(i, **experts_twitter_stream_settings) FileIO.writeToFileAsJson( { 'mr_k_means': tf.generateStatsForKMeansMRClustering(), 'settings': Settings.getSerialzedObject(tf.stream_settings) }, TweetsFile.mr_stats_file)
def generateStatsForOptimized(): # for i in [10**3, 10**4, 10**5]: for length in [1000000, 1100000, 1200000]: # for i in [10**6]: # for j in range(1, 10): print 'Generating stats for: ', length tf = TweetsFile(length, **experts_twitter_stream_settings) FileIO.writeToFileAsJson({'streaming_lsh': tf.generateStatsForHDLSHClustering(), 'settings': Settings.getSerialzedObject(tf.stream_settings)}, hd_clustering_performance_folder+'cda')
def generateStatsForQualityComparisonWithSSA(): # for length in [i*j for i in 10**3, 10**4, 10**5 for j in range(1, 10)]: for length in [1000000]: print 'Generating stats for: ', length tf = TweetsFile(length, **experts_twitter_stream_settings) # stats = {'ssa': tf.getStatsForSSA(), 'ssa_mr': tf.getStatsForSSAMR(), 'streaming_lsh': KMeansTweetsFile(length, **experts_twitter_stream_settings).generateStatsForStreamingLSHClustering(), 'settings': Settings.getSerialzedObject(tf.stream_settings)} stats = { 'ssa_mr': tf.getStatsForSSAMR(), 'settings': Settings.getSerialzedObject(tf.stream_settings) } FileIO.writeToFileAsJson(stats, TweetsFile.stats_file)
def generateStatsForCDA(): for length, fileName in GenerateStats.lengthAndFileIterator(): print 'Generating stats for: ', length performance = GenerateStats.performanceForCDAAt( length, fileName, **experts_twitter_stream_settings) stats = { CDA: performance, 'settings': Settings.getSerialzedObject(experts_twitter_stream_settings) } FileIO.writeToFileAsJson(stats, getPerformanceFile(CDA))
def generateStatsForOptimized(): # for i in [10**3, 10**4, 10**5]: for length in [1000000, 1100000, 1200000]: # for i in [10**6]: # for j in range(1, 10): print 'Generating stats for: ', length tf = TweetsFile(length, **experts_twitter_stream_settings) FileIO.writeToFileAsJson( { 'streaming_lsh': tf.generateStatsForHDLSHClustering(), 'settings': Settings.getSerialzedObject(tf.stream_settings) }, hd_clustering_performance_folder + 'cda')
def generateStatsForDefaultStreamSettings(): for i in [10**3, 10**4, 10**5]: for j in range(1, 10): print 'Generating stats for: ', i * j tf = TweetsFile(i * j, **default_experts_twitter_stream_settings) FileIO.writeToFileAsJson( { 'streaming_lsh': tf.generateStatsForStreamingLSHClustering(), 'settings': Settings.getSerialzedObject(tf.stream_settings) }, TweetsFile.default_stats_file)
def generateStatsForUnOptimized(): # for i in [10**3, 10**4, 10**5]: for length in [1000000, 1100000, 1200000]: # for i in [10**6]: # for j in range(1, 10): print 'Generating stats for: ', length # default_experts_twitter_stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod tf = TweetsFile(length, **default_experts_twitter_stream_settings) performance = tf.generateStatsForHDLSHClustering() FileIO.writeToFileAsJson({'streaming_lsh': performance, 'settings': Settings.getSerialzedObject(tf.stream_settings)}, hd_clustering_performance_folder+'cda_unopt') del performance['clusters'] print performance
def generateStatsForUnOptimized(): # for i in [10**3, 10**4, 10**5]: for length in [1000000, 1100000, 1200000]: # for i in [10**6]: # for j in range(1, 10): print 'Generating stats for: ', length # default_experts_twitter_stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod tf = TweetsFile(length, **default_experts_twitter_stream_settings) performance = tf.generateStatsForHDLSHClustering() FileIO.writeToFileAsJson( { 'streaming_lsh': performance, 'settings': Settings.getSerialzedObject(tf.stream_settings) }, hd_clustering_performance_folder + 'cda_unopt') del performance['clusters'] print performance
def thresholdForDocumentToBeInCluterEstimation(stats_file, **stream_settings): ''' Estimate thresold for the clusters by varying the threshold_for_document_to_be_in_cluster value. Run this on a document set of size 100K. ''' for length in [ i * j for i in 10**3, 10**4, 10**5 for j in range(1, 10) ]: # for t in range(1, 16): for t in range(16, 21): stream_settings[ 'threshold_for_document_to_be_in_cluster'] = t * 0.05 print length, stream_settings[ 'threshold_for_document_to_be_in_cluster'] stats = { 'streaming_lsh': KMeansTweetsFile(length, **stream_settings). generateStatsForStreamingLSHClustering(), 'settings': Settings.getSerialzedObject(stream_settings) } FileIO.writeToFileAsJson(stats, stats_file)
def thresholdForDocumentToBeInCluterEstimation(stats_file, **stream_settings): ''' Estimate thresold for the clusters by varying the threshold_for_document_to_be_in_cluster value. Run this on a document set of size 100K. ''' for length in [i * j for i in 10 ** 3, 10 ** 4, 10 ** 5 for j in range(1, 10)]: # for t in range(1, 16): for t in range(16, 21): stream_settings['threshold_for_document_to_be_in_cluster'] = t * 0.05 print length, stream_settings['threshold_for_document_to_be_in_cluster'] stats = {'streaming_lsh': KMeansTweetsFile(length, **stream_settings).generateStatsForStreamingLSHClustering(), 'settings': Settings.getSerialzedObject(stream_settings)} FileIO.writeToFileAsJson(stats, stats_file)
def generateStatsForCDA(): for length, fileName in GenerateStats.lengthAndFileIterator(): print 'Generating stats for: ',length performance = GenerateStats.performanceForCDAAt(length, fileName, **experts_twitter_stream_settings) stats = {CDA: performance, 'settings': Settings.getSerialzedObject(experts_twitter_stream_settings)} FileIO.writeToFileAsJson(stats, getPerformanceFile(CDA))
# General twitter stream settings. time_unit_in_seconds=timedelta(seconds=5*60) twitter_stream_settings = Settings( stream_id='twitter_stream', # Unique id to represent the stream. dimensions=0, # Number of maximum dimensions to consider at a time. Make sue this is prime. This is also equal to the number of top phrases that will be considered for crowd discovery. min_phrase_length=2, # Minumum lenght of phrases. For example min_phrase_length=1 and max_phrase_length=1 will result in only unigrams as features. max_phrase_length=2, # Maximum lenght of phrases. For example min_phrase_length=1 and max_phrase_length=2 will result in both unigrams and bigrams as features. phrase_decay_coefficient=0.75, # The rate at which phrases decays. stream_decay_coefficient=0.75, # The rate at which stream decays. stream_cluster_decay_coefficient=0.5, # The rate at which a cluster decays. time_unit_in_seconds=time_unit_in_seconds, # This value will be used to determine the length of unit time intervals. # dimension_update_frequency_in_seconds=timedelta(seconds=15*60) dimension_update_frequency_in_seconds=None, # Every these many seconds, old phrases are pruned and new dimensions are created. # max_phrase_inactivity_time_in_seconds=timedelta(seconds=30*60) max_phrase_inactivity_time_in_seconds=None, # Time after which a phrase can be considered old and need not be tracked. cluster_analysis_frequency_in_seconds=time_unit_in_seconds*3, # Every these many seconds current clusters will be analyzed. cluster_filtering_frequency_in_seconds=time_unit_in_seconds*3, # Every these many seconds current clusters will be filtered. cluster_inactivity_time_in_seconds=None, # Clusters that have not added users below this are removed. # Cluster pruning properties. cluster_filter_attribute = 'length', # The attribute based on which stream clusters will be pruned. 'length' => Size of clusters; score => streaming cluster score. cluster_filter_threshold = 0, # Value for the cluster filter threshold. All clusters with attribute values below this will be pruned. cluster_merging_jaccard_distance_threshold = 0.75 # Clusters are merged if the jaccard similarity is above this value. ) # Streaming LSH clustering specific settings. streaming_lsh_settings=Settings(