def plotClusteringSpeed(saveFig=True):
    dataToPlot = {'k_means': {'x': [], 'y': []},
                  'mr_k_means': {'x': [], 'y': []},
                  'streaming_lsh': {'x': [], 'y': []}}
    for data in FileIO.iterateJsonFromFile(TweetsFile.combined_stats_file):
        for k in plotSettings:
            dataToPlot[k]['x'].append(data[k]['no_of_documents'])
            dataToPlot[k]['y'].append(data[k]['iteration_time'])
    for k in plotSettings:
        plt.loglog(dataToPlot[k]['x'], dataToPlot[k]['y'],
                   label=plotSettings[k]['label'], color=plotSettings[k]['color'], lw=2)
    plt.legend(loc=4)
    plt.xlabel(getLatexForString('\# of documents'))
    plt.ylabel(getLatexForString('Running time (s)'))
    plt.title(getLatexForString('Running time comparison for Streaming LSH with k-Means'))
    plt.xlim(xmax=500000)
    # plt.show()
    if saveFig:
        plt.savefig('speedComparisonWithKMeans.pdf')
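# Hedged sketch of getLatexForString (defined elsewhere in the repository).
# Judging from labels like '\# of documents', it is assumed to wrap plain
# text in inline-math markers, escaping spaces so matplotlib's mathtext
# renders the string; treat this as an assumption, not the actual helper.
def getLatexForString(string):
    return '$%s$' % string.replace(' ', '\\ ')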
def plotClusteringQuality():
    del plotSettings['ssa_mr']
    speedStats = dict([(k, {'f1': [], 'nmi': [], 'purity': []}) for k in plotSettings])
    for data in FileIO.iterateJsonFromFile(TweetsFile.stats_file):
        for k in speedStats:
            for metric in speedStats['ssa']:
                speedStats[k][metric].append(data[k][metric])
    dataForPlot = dict([(k, []) for k in plotSettings])
    for k, v in speedStats.iteritems():
        print k
        for k1, v1 in v.iteritems():
            if type(v1[0]) != type([]):
                print k1, '(%0.2f %0.2f)' % (np.mean(v1), np.var(v1))
                dataForPlot[k] += [np.mean(v1)]
            else:
                print k1, ['(%0.2f %0.2f)' % (np.mean(z), np.var(z)) for z in zip(*v1)]
                dataForPlot[k] += [np.mean(z) for z in zip(*v1)]
    ind, width = np.arange(5), 0.1
    rects, i = [], 0
    for k in dataForPlot:
        rects.append(plt.bar(ind + i * width, dataForPlot[k], width, color=plotSettings[k]['color']))
        i += 1
    plt.ylabel(getLatexForString('Score'))
    plt.title(getLatexForString('Clustering quality comparison for Streaming LSH with SSA'))
    plt.xticks(ind + width, ('$F$', '$Precision$', '$Recall$', '$Purity$', '$NMI$'))
    plt.legend([r[0] for r in rects], [plotSettings[k]['label'] for k in plotSettings], loc=4)
    # plt.show()
    plt.savefig('qualityComparisonWithSSA.pdf')
def plotICDFDimensionsInactivityThreshold(self, returnAxisValuesOnly=True):
    '''
    Plot P(inactivity duration > threshold time unit) and find the time
    unit at which this probability becomes low.
    Experts stream 0.25 129
    Houston stream 0.25 144
    '''
    dataX, dataY, total = set(), defaultdict(list), []
    for line in list(FileIO.iterateJsonFromFile(self.dimensionInActivityTimeFile)):
        data = dict((int(k), v) for k, v in line[ParameterEstimation.dimensionInActivityTimeId].iteritems())
        total.append(sum(data.values()))
        for i in data:
            dataY[i].append(data[i])
            dataX.add(i)
    totalInstancesObserved = float(sum(total))
    x = sorted(dataX)
    y = getInverseCumulativeDistribution([sum(dataY[k]) / totalInstancesObserved for k in x])
    plt.plot(x, y, label=getLatexForString(self.stream_settings['plot_label']),
             color=self.stream_settings['plot_color'], lw=2)
    plt.ylabel(r'$P\ (\ inactivity\ duration\ \geq\ inactivity\ duration\ threshold\ )$')
    plt.xlabel(getLatexForString('Inactivity duration threshold'))
    plt.title(getLatexForString('Inactivity analysis for dimensions.'))
    plt.legend()
    if returnAxisValuesOnly:
        plt.show()
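# Hedged sketches of the distribution helpers used in this section; the
# real implementations live elsewhere in the repository. From their use
# here, getCumulativeDistribution is assumed to return P(X <= x_k) and
# getInverseCumulativeDistribution the complementary P(X >= x_k), given
# point probabilities already ordered by x.
def getCumulativeDistribution(probabilities):
    return [sum(probabilities[:i + 1]) for i in range(len(probabilities))]

def getInverseCumulativeDistribution(probabilities):
    return [sum(probabilities[i:]) for i in range(len(probabilities))]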
def plotGrowthOfPhrasesInTime(self, returnAxisValuesOnly=True):
    '''
    This plot tells us the time when the number of phrases in the stream
    stabilizes. Consider the time after we have seen the maximum number of
    phrases to determine dimensions. But, if these phrases increase
    linearly with time, it shows that we have infinite dimensions, which
    motivates us to have a way to determine the number of dimensions.
    numberOfTimeUnits=10*24*12
    '''
    x, y = [], []
    for line in FileIO.iterateJsonFromFile(self.dimensionsEstimationFile):
        x.append(getDateTimeObjectFromTweetTimestamp(line['time_stamp']))
        y.append(line['total_number_of_phrases'])
    x = x[:numberOfTimeUnits]
    y = y[:numberOfTimeUnits]
    plt.subplot(111).yaxis.set_major_formatter(FuncFormatter(lambda x, i: '%0.1f' % (x / 10. ** 6)))
    plt.text(0.0, 1.01, getLatexForString('10^6'), transform=plt.gca().transAxes)
    plt.ylabel(getLatexForString('\# of dimensions'))
    plt.xlabel(getLatexForString(xlabelTimeUnits))
    plt.title(getLatexForString('Growth in dimensions with increasing time.'))
    plt.plot(y, color=self.stream_settings['plot_color'],
             label=getLatexForString(self.stream_settings['plot_label']), lw=2)
    plt.legend(loc=4)
    if returnAxisValuesOnly:
        plt.show()
def plotClusteringSpeed(saveFig=True):
    plotSettings = {
        'k_means': {'label': 'Iterative k-means', 'color': '#FD0006'},
        'mr_k_means': {'label': 'MR k-means', 'color': '#5AF522'},
        'streaming_lsh': {'label': 'Stream CDA', 'color': '#7109AA'},
    }
    dataToPlot = {
        'k_means': {'x': [], 'y': []},
        'mr_k_means': {'x': [], 'y': []},
        'streaming_lsh': {'x': [], 'y': []},
    }
    for data in FileIO.iterateJsonFromFile(TweetsFile.combined_stats_file):
        for k in plotSettings:
            dataToPlot[k]['x'].append(data[k]['no_of_documents'])
            dataToPlot[k]['y'].append(data[k]['iteration_time'])
    for k in plotSettings:
        plt.loglog(dataToPlot[k]['x'], dataToPlot[k]['y'],
                   label=plotSettings[k]['label'], color=plotSettings[k]['color'], lw=2)
    plt.legend(loc=4)
    if saveFig:
        # Labels and title are set only when saving; callers that embed
        # this plot in a subplot set their own.
        plt.xlabel(getLatexForString('\# of documents'))
        plt.ylabel(getLatexForString('Running time (s)'))
        plt.title(getLatexForString('Running time comparison for Streaming LSH with k-Means'))
    plt.xlim(xmin=800, xmax=100000)
    plt.xticks([])
    # plt.show()
    if saveFig:
        plt.savefig('speedComparisonWithKMeans.pdf')
def plotQualityWithKMeansAndSSA():
    del plotSettings['ssa_mr']
    speedStats = dict([(k, {'f1': [], 'nmi': [], 'purity': []}) for k in plotSettings])
    for data in FileIO.iterateJsonFromFile(TweetsFile.stats_file):
        for k in speedStats:
            for metric in speedStats['ssa']:
                speedStats[k][metric].append(data[k][metric])
    for k in speedStats:
        del speedStats[k]['f1']
    speedStats.update(dict([(k, {'f1': [], 'nmi': [], 'purity': []}) for k in kMeansPlotSettings]))
    k = 'k_means'
    for data in FileIO.iterateJsonFromFile(TweetsFile.combined_stats_file):
        for metric in speedStats['k_means']:
            speedStats[k][metric].append(data[k][metric])
    for k in speedStats:
        if 'f1' in speedStats[k]:
            del speedStats[k]['f1']
    dataForPlot = dict([(k, []) for k in speedStats])
    for k in speedStats:
        for k1 in speedStats[k]:
            dataForPlot[k] += [np.mean(speedStats[k][k1])]
    # del dataForPlot['k_means']
    print dataForPlot
    ind, width = np.arange(2), 0.1
    rects, i = [], 1
    plotSettings.update(kMeansPlotSettings)
    for k in dataForPlot:
        rects.append(plt.bar(ind + i * width, dataForPlot[k], width, color=plotSettings[k]['color']))
        i += 1
    plt.ylabel(getLatexForString('Score'))
    plt.title(getLatexForString('Clustering quality comparison for Streaming LSH with SSA'))
    plt.xticks(ind + 2 * width, ('$Purity$', '$NMI$'))
    plt.legend([r[0] for r in rects], [plotSettings[k]['label'] for k in plotSettings], loc=4)
    # plt.show()
    plt.savefig('qualityComparisonAll.pdf')
def plotClusteringSpeed(saveFig=True):
    dataToPlot = dict([(k, {'x': [], 'y': []}) for k in plotSettings])
    for data in FileIO.iterateJsonFromFile(TweetsFile.stats_file):
        for k in plotSettings:
            dataToPlot[k]['x'].append(data[k]['no_of_documents'])
            dataToPlot[k]['y'].append(data[k]['iteration_time'])
    for k in plotSettings:
        plt.loglog(dataToPlot[k]['x'], movingAverage(dataToPlot[k]['y'], 1),
                   label=plotSettings[k]['label'], color=plotSettings[k]['color'], lw=2)
    print dataToPlot['streaming_lsh']['x'][10]
    print dataToPlot['streaming_lsh']['y'][10]
    plt.legend(loc=4)
    if saveFig:
        # Labels and title are set only when saving; callers that embed
        # this plot in a subplot set their own.
        plt.xlabel(getLatexForString('\# of documents'))
        plt.ylabel(getLatexForString('Running time (s)'))
        plt.title(getLatexForString('Running time comparison for Streaming LSH with SSA'))
    plt.xlim(xmin=500, xmax=600000)
    # plt.show()
    if saveFig:
        plt.savefig('speedComparisonWithSSA.pdf')
def plotCDFClustersLagDistribution(self, returnAxisValuesOnly=True):
    '''
    This determines the time after which a cluster can be considered
    decayed and hence removed.
    Experts stream [ 0.66002386  0.07035227] 0.1 82
    Houston stream [ 0.73800037  0.05890473] 0.1 29 458 (# of time units)
    Experts stream [ 0.66002386  0.07035227] 0.2 15 71 (# of time units)
    Houston stream [ 0.73756656  0.05883258] 0.2 3
    '''
    def calculateInActivityTimeFor(params, probabilityOfInactivity):
        return int(CurveFit.inverseOfIncreasingExponentialFunction(params, 1 - probabilityOfInactivity))
    data = list(FileIO.iterateJsonFromFile(self.hdsClustering.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId]))[-1]
    total = float(sum(data['lag_between_streams_added_to_cluster'].values()))
    x = sorted(map(int, data['lag_between_streams_added_to_cluster'].keys()))
    y = getCumulativeDistribution([data['lag_between_streams_added_to_cluster'][str(i)] / total for i in x])
    exponentialCurveParams = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
    print self.stream_settings['plot_label'], exponentialCurveParams, calculateInActivityTimeFor(exponentialCurveParams, 0.2)
    plt.plot(x, y, 'o',
             label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]),
             color=self.stream_settings['plot_color'])
    plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, exponentialCurveParams, x),
             color=self.stream_settings['plot_color'], lw=2)
    plt.ylabel(r'$P\ (\ lag\ \leq\ TU\ )$')
    plt.xlabel(getLatexForString(xlabelTimeUnits))
    plt.title(getLatexForString('CDF for clusters lag distribution.'))
    plt.ylim((0, 1.2))
    plt.legend(loc=4)
    if returnAxisValuesOnly:
        plt.show()
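# Hedged sketch of the CurveFit helper used throughout this section; the
# real class lives elsewhere in the repository. The plot labels
# ('%0.2fx^{%0.2f}' and '%0.2fx^{-%0.2f}') suggest the "increasing" and
# "decreasing exponential" functions are power laws a*x**b and a*x**(-b),
# fitted here with scipy's curve_fit; treat every signature below as an
# assumption rather than the actual implementation.
import numpy as np
from scipy.optimize import curve_fit

class CurveFit(object):
    @staticmethod
    def increasingExponentialFunction(params, x):
        a, b = params
        return a * np.power(x, b)

    @staticmethod
    def decreasingExponentialFunction(params, x):
        a, b = params
        return a * np.power(x, -b)

    @staticmethod
    def inverseOfIncreasingExponentialFunction(params, y):
        # Solve y = a * x**b for x.
        a, b = params
        return np.power(y / a, 1. / b)

    @staticmethod
    def inverseOfDecreasingExponentialFunction(params, y):
        # Solve y = a * x**(-b) for x.
        a, b = params
        return np.power(a / y, 1. / b)

    @staticmethod
    def getParamsAfterFittingData(x, y, function, initialParams):
        params, _ = curve_fit(lambda x, a, b: function((a, b), x),
                              np.asarray(x, dtype=float), np.asarray(y, dtype=float),
                              p0=initialParams)
        return params

    @staticmethod
    def getYValues(function, params, x):
        return [function(params, i) for i in x]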
def plotMessagesProcessedWithTime(iterators):
    time_limit = 2300
    for iterator, info in iterators:
        dataX, dataY = [], []
        if not info['id'].startswith('ssa'):
            for data in iterator:
                if data['iteration_time'] < time_limit:
                    dataX.append(data['iteration_time'])
                    dataY.append(data['number_of_messages'])
        else:
            iteration_time = 0
            for data in iterator:
                if data['batch_size'] == 10000:
                    iteration_time += data['iteration_time']
                    if iteration_time < time_limit:
                        dataX.append(iteration_time)
                        dataY.append(data['number_of_messages'])
        # print info, dataX, dataY
        plt.plot(dataX, [y / 10 ** 3 for y in dataY], lw=2, label=info['label'], color=info['color'])
    # plt.xlim(xmin=15, xmax=3000)
    plt.legend(loc=2)
    plt.xlabel(getLatexForString('Time (s)'))
    plt.ylabel(getLatexForString('\# of messages (10^3)'))
    plt.title(getLatexForString('Message processing rate'))
    plt.plot()
    plt.savefig('messagesProcessedWithTime.pdf')
    plt.savefig('messagesProcessedWithTime.eps')
def getClusteringQuality():
    '''
    no_of_documents: 300000
    k_means
        f1, p, r ['(0.95 0.04)', '(0.95 0.04)', '(0.95 0.04)']
        purity (0.95 0.04) nmi (0.94 0.04)
    streaming_lsh
        f1, p, r ['(0.67 0.01)', '(0.71 0.01)', '(0.64 0.02)']
        purity (0.96 0.00) nmi (0.87 0.00)
    '''
    del plotSettings['mr_k_means']
    del plotSettings['default_streaming_lsh']
    speedStats = dict([(k, {'f1': [], 'nmi': [], 'purity': []}) for k in plotSettings])
    for data in FileIO.iterateJsonFromFile(TweetsFile.combined_stats_file):
        for k in speedStats:
            for metric in speedStats['k_means']:
                speedStats[k][metric].append(data[k][metric])
    # Adding this because final value of f1 is 0 instead of tuple at 300K documents.
    speedStats['k_means']['f1'][-1] = [0., 0., 0.]
    dataForPlot = dict([(k, []) for k in plotSettings])
    for k, v in speedStats.iteritems():
        print k
        for k1, v1 in v.iteritems():
            if type(v1[0]) != type([]):
                print k1, '(%0.2f %0.2f)' % (np.mean(v1), np.var(v1))
                dataForPlot[k] += [np.mean(v1)]
            else:
                print k1, ['(%0.2f %0.2f)' % (np.mean(z), np.var(z)) for z in zip(*v1)]
                dataForPlot[k] += [np.mean(z) for z in zip(*v1)]
    ind, width = np.arange(5), 0.1
    rects, i = [], 0
    for k in dataForPlot:
        rects.append(plt.bar(ind + i * width, dataForPlot[k], width, color=plotSettings[k]['color']))
        i += 1
    plt.ylabel(getLatexForString('Score'))
    plt.title(getLatexForString('Clustering quality comparison for Streaming LSH with k-Means'))
    plt.xticks(ind + width, ('$F$', '$Precision$', '$Recall$', '$Purity$', '$NMI$'))
    plt.legend([r[0] for r in rects], [plotSettings[k]['label'] for k in plotSettings], loc=4)
    plt.show()
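# A hypothetical example of one line in TweetsFile.combined_stats_file,
# as implied by the loops above; the field names appear in this section,
# the metric values echo the docstring, and the iteration times are
# purely illustrative. 'f1' is recorded as an [f1, precision, recall]
# triple, which is why the plotting code transposes it with zip(*v1).
example_record = {
    'k_means': {'no_of_documents': 300000, 'iteration_time': 120.0,
                'f1': [0.95, 0.95, 0.95], 'purity': 0.95, 'nmi': 0.94},
    'streaming_lsh': {'no_of_documents': 300000, 'iteration_time': 2.5,
                      'f1': [0.67, 0.71, 0.64], 'purity': 0.96, 'nmi': 0.87},
}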
def plotSpeedWithKMeansAndSSA():
    plt.subplot(211)
    QualityComparisonWithKMeans.plotClusteringSpeed(saveFig=False)
    plt.title(getLatexForString('Running time comparisons for CDA Algorithm'))
    plt.ylabel(getLatexForString('Running time (s)'))
    plt.subplot(212)
    QualityComparisonWithSSA.plotClusteringSpeed(saveFig=False)
    plt.xlabel(getLatexForString('\# of documents.'))
    plt.ylabel(getLatexForString('Running time (s)'))
    plt.savefig('runningTimeComparisonAll.pdf')
def crowdSizeToLifeSpanPlot(self, returnAxisValuesOnly=True):
    AnalyzeData.reset()
    AnalyzeData.constructCrowdDataStructures(self.stream_settings['data_iterator'])
    crowdSizeX, lifeSpanY = [], []
    for crowd in AnalyzeData.crowdMap:
        crowdSizeX.append(AnalyzeData.crowdMap[crowd].crowdSize)
        lifeSpanY.append(AnalyzeData.crowdMap[crowd].lifespan)
    plt.loglog(crowdSizeX, lifeSpanY, 'o', color=self.stream_settings['plot_color'],
               label=self.stream_settings['plot_label'])
    plt.xlabel(getLatexForString('Crowd Size'))
    plt.ylabel(getLatexForString('Lifespan'))
    plt.title(getLatexForString('Crowd size vs. lifespan'))
    plt.legend()
    if returnAxisValuesOnly:
        plt.show()
def plotJustifyTrie(self):
    pltInfo = {JustifyTrie.with_trie: {'label': getLatexForString('With prefix tree'), 'color': '#7109AA', 'type': '-'},
               JustifyTrie.with_sorted_list: {'label': getLatexForString('With sorted list'), 'color': '#5AF522', 'type': '-'}}
    experimentsData = {JustifyTrie.with_trie: {'iteration_time': [], 'quality': [], 'total_clusters': []},
                       JustifyTrie.with_sorted_list: {'iteration_time': [], 'quality': [], 'total_clusters': []}}
    loadExperimentsData(experimentsData, JustifyTrie.stats_file)
    plt.subplot(312)
    numberOfPoints = plotRunningTime(experimentsData, pltInfo, JustifyTrie.with_trie, JustifyTrie.with_sorted_list)
    plt.xlim(xmax=200)
    plt.xticks([], tick1On=False)
    plt.ylim(ymin=1, ymax=35000)
    plt.legend(loc=2, ncol=2)
    plt.xlabel(getLatexForString('Time'))
    plt.subplot(311)
    plotClusters(experimentsData, numberOfPoints, pltInfo)
    plt.xticks([], tick1On=False)
    plt.xlim(xmax=200)
    plt.title(getLatexForString('Impact of using prefix tree'))
    plt.subplot(313)
    plotQuality(experimentsData, numberOfPoints, pltInfo)
    plt.xlim(xmax=200)
    plt.savefig('justifyTrie.pdf')
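# Hedged sketch of loadExperimentsData (defined elsewhere): the Justify*
# plots expect, per experiment type, parallel lists of running time,
# purity and cluster counts. The record fields are inferred from
# plotJustifyDimensionsEstimation2 below; treat the exact key names as
# assumptions.
def loadExperimentsData(experimentsData, statsFile):
    for data in FileIO.iterateJsonFromFile(statsFile):
        experimentType = data['iteration_parameters']['type']
        if experimentType in experimentsData:
            experimentsData[experimentType]['iteration_time'].append(data['iteration_time'])
            experimentsData[experimentType]['quality'].append(data['purity'])
            experimentsData[experimentType]['total_clusters'].append(data['no_of_clusters'])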
def crowdSizeDistribution(self, returnAxisValuesOnly=True):
    AnalyzeData.reset()
    AnalyzeData.constructCrowdDataStructures(self.stream_settings['data_iterator'])
    # np.histogram returns (counts, bin_edges); plot counts against the left bin edges.
    y, x = np.histogram([AnalyzeData.crowdMap[crowd].crowdSize for crowd in AnalyzeData.crowdMap], bins=15)
    plt.semilogy(x[:-1], y, color=self.stream_settings['plot_color'], lw=2,
                 label=self.stream_settings['plot_label'])
    plt.xlabel(getLatexForString('Crowd Size'))
    plt.ylabel(getLatexForString('\# of crowds'))
    plt.title(getLatexForString('Crowd size distribution'))
    plt.legend()
    plt.xlim(xmax=300)
    plt.savefig('crowdSizeDistribution.pdf')
    plt.savefig('crowdSizeDistribution.eps')
def plotJustifyExponentialDecay(self):
    pltInfo = {JustifyExponentialDecay.with_decay: {'label': getLatexForString('With decay'), 'color': '#7109AA', 'type': '-'},
               JustifyExponentialDecay.without_decay: {'label': getLatexForString('Without decay'), 'color': '#5AF522', 'type': '-'}}
    experimentsData = {JustifyExponentialDecay.with_decay: {'iteration_time': [], 'quality': [], 'total_clusters': []},
                       JustifyExponentialDecay.without_decay: {'iteration_time': [], 'quality': [], 'total_clusters': []}}
    loadExperimentsData(experimentsData, JustifyExponentialDecay.stats_file)
    numberOfPoints = 275
    plt.subplot(311)
    plotClusters(experimentsData, numberOfPoints, pltInfo)
    plt.title(getLatexForString('Impact of exponential decay'))
    plt.xlim(xmax=275)
    plt.subplot(312)
    plotRunningTime(experimentsData, pltInfo, JustifyExponentialDecay.with_decay, JustifyExponentialDecay.without_decay)
    plt.xticks([], tick1On=False)
    plt.xlim(xmax=275)
    plt.legend(loc=2, ncol=2)
    plt.ylabel(getLatexForString('Running time (s)'))
    plt.subplot(313)
    plotQuality(experimentsData, numberOfPoints, pltInfo)
    plt.xlim(xmax=275)
    plt.xlabel(getLatexForString('Time'))
    plt.savefig('justifyExponentialDecay.pdf')
def runningTimesWithCDA(*iterators, **kwargs):
    loc = kwargs.get('loc', 1)
    fileName = kwargs.get('file_name', 'running_times.eps')
    xmax = kwargs.get('xmax', None)
    semilog = kwargs.get('log', False)
    xmin = kwargs.get('xmin', None)
    title = kwargs.get('title', None)
    dataX, dataYValues = [], []
    for id, iterator in iterators:
        dataX, dataY = [], []
        for data in iterator:
            if data['no_of_documents'] != 900000:
                # Keep only points within xmax when an xmax is given.
                if xmax:
                    if data['no_of_documents'] <= xmax:
                        dataX.append(data['no_of_documents'])
                        dataY.append(data['iteration_time'])
                else:
                    dataX.append(data['no_of_documents'])
                    dataY.append(data['iteration_time'])
        if not semilog:
            plt.loglog(dataX, dataY, label=algorithm_info[id]['label'], color=algorithm_info[id]['color'],
                       lw=2, marker=algorithm_info[id]['marker'])
        else:
            plt.plot(dataX, dataY, label=algorithm_info[id]['label'], color=algorithm_info[id]['color'],
                     lw=2, marker=algorithm_info[id]['marker'])
        dataYValues.append(dataY)
    dataDifference = []
    for i in range(len(dataX)):
        dataDifference.append(dataYValues[1][i] - dataYValues[0][i])
    plt.plot(dataX, dataDifference, label='Difference', lw=2)
    # for i, j in zip(dataX, dataDifference): print i, j
    plt.legend(loc=loc)
    plt.xlabel(getLatexForString('Length of information stream (10^4)'), fontsize=20)
    plt.ylabel(getLatexForString('Running time (s)'), fontsize=20)
    # plt.title(getLatexForString(title))
    locs, labels = plt.xticks()
    plt.xticks(locs, map(lambda x: '%d' % (x / 10000), locs))
    if xmax:
        plt.xlim(xmax=xmax)
    if xmin:
        plt.xlim(xmin=xmin)
    # plt.savefig(fileName + '.eps')
    # plt.savefig(fileName + '.pdf')
    plt.savefig(fileName + '.png')
def plotDimensionsEstimation(self, returnAxisValuesOnly=True):
    def calculateDimensionsFor(params, percentageOfNewDimensions):
        '''
        numberOfTimeUnits=10*24*12
        Experts stream [ 1.17707899e+03  1.03794580e+00] 76819
        Houston stream [ 2.73913900e+03  1.02758516e+00] 195731
        '''
        print getSmallestPrimeNumberGreaterThan(int(CurveFit.inverseOfDecreasingExponentialFunction(params, percentageOfNewDimensions)))
    dataDistribution = defaultdict(list)
    for line in FileIO.iterateJsonFromFile(self.dimensionsEstimationFile):
        for k, v in line[ParameterEstimation.dimensionsEstimationId].iteritems():
            k = int(k)
            if k not in dataDistribution:
                dataDistribution[k] = [0., 0.]
            dataDistribution[k][0] += v
            dataDistribution[k][1] += 1
    x, y = [], []
    for k in sorted(dataDistribution):
        if k > 1000:
            x.append(k)
            y.append((dataDistribution[k][0] / dataDistribution[k][1]) / k)
    x, y = x[:numberOfTimeUnits], y[:numberOfTimeUnits]
    exponentialCurveParams = CurveFit.getParamsAfterFittingData(x, y, CurveFit.decreasingExponentialFunction, [1., 1.])
    print self.stream_settings['plot_label'], exponentialCurveParams, calculateDimensionsFor(exponentialCurveParams, 0.01)
    plt.ylabel(getLatexForString('\% of decaying dimensions'))
    plt.xlabel(getLatexForString('\# of dimensions'))
    plt.title(getLatexForString('Dimension stability with increasing number of dimensions.'))
    plt.semilogy(x, y, 'o', color=self.stream_settings['plot_color'],
                 label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{-%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]),
                 lw=2)
    plt.semilogy(x, CurveFit.getYValues(CurveFit.decreasingExponentialFunction, exponentialCurveParams, x),
                 color=self.stream_settings['plot_color'], lw=2)
    plt.legend()
    if returnAxisValuesOnly:
        plt.show()
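# Worked check for calculateDimensionsFor above, assuming the decreasing
# power-law form a*x**(-b) implied by the '%0.2fx^{-%0.2f}' plot label:
# for the Experts stream parameters recorded in the docstring,
#   (1.17707899e+03 / 0.01) ** (1 / 1.03794580) is roughly 7.7e4,
# and the smallest prime above that is the reported 76819 dimensions.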
def plotJustifyNotUsingVanillaLSH(self):
    pltInfo = {JustifyNotUsingVanillaLSH.with_modified_lsh: {'label': getLatexForString('Modified LSH'), 'color': '#7109AA', 'type': '-'},
               JustifyNotUsingVanillaLSH.with_vanilla_lsh: {'label': getLatexForString('Plain LSH'), 'color': '#5AF522', 'type': '-'}}
    experimentsData = {JustifyNotUsingVanillaLSH.with_modified_lsh: {'iteration_time': [], 'quality': [], 'total_clusters': []},
                       JustifyNotUsingVanillaLSH.with_vanilla_lsh: {'iteration_time': [], 'quality': [], 'total_clusters': []}}
    loadExperimentsData(experimentsData, JustifyNotUsingVanillaLSH.stats_file)
    # loadExperimentsData(experimentsData, 'temp/modified_lsh_need_analysis')
    numberOfPoints = 275
    plt.subplot(312)
    plotRunningTime(experimentsData, pltInfo, JustifyNotUsingVanillaLSH.with_modified_lsh,
                    JustifyNotUsingVanillaLSH.with_vanilla_lsh, semilog=True)
    plt.xlim(xmax=270)
    plt.xticks([], tick1On=False)
    plt.ylim(ymin=1, ymax=5000)
    plt.legend(loc=2, ncol=2)
    # plt.xlabel(getLatexForString('Time'))
    plt.subplot(313)
    plotQuality(experimentsData, numberOfPoints, pltInfo)
    plt.xlabel(getLatexForString('Time'))
    plt.ylim(ymin=0.72)
    plt.xlim(xmax=270)
    plt.subplot(311)
    plotClusters(experimentsData, numberOfPoints, pltInfo)
    plt.title(getLatexForString('Impact of modified LSH'))
    plt.xticks([], tick1On=False)
    plt.xlim(xmax=270)
    plt.savefig('justifyNotUsingVanillaLSH.pdf')
def runningTimesWithCDA(*iterators, **kwargs):
    loc = kwargs.get('loc', 1)
    fileName = kwargs.get('file_name', 'running_times.eps')
    xmax = kwargs.get('xmax', None)
    semilog = kwargs.get('log', False)
    xmin = kwargs.get('xmin', None)
    title = kwargs.get('title', None)
    dataX, dataYValues = [], []
    for id, iterator in iterators:
        dataX, dataY = [], []
        for data in iterator:
            if data['no_of_documents'] != 900000:
                # Keep only points within xmax when an xmax is given.
                if xmax:
                    if data['no_of_documents'] <= xmax:
                        dataX.append(data['no_of_documents'])
                        dataY.append(data['iteration_time'])
                else:
                    dataX.append(data['no_of_documents'])
                    dataY.append(data['iteration_time'])
        if not semilog:
            plt.loglog(dataX, dataY, label=algorithm_info[id]['label'], color=algorithm_info[id]['color'],
                       lw=2, marker=algorithm_info[id]['marker'])
        else:
            plt.plot(dataX, dataY, label=algorithm_info[id]['label'], color=algorithm_info[id]['color'],
                     lw=2, marker=algorithm_info[id]['marker'])
        dataYValues.append(dataY)
    dataDifference = []
    for i in range(len(dataX)):
        dataDifference.append(dataYValues[1][i] - dataYValues[0][i])
    plt.plot(dataX, dataDifference, label='Difference', lw=2)
    # for i, j in zip(dataX, dataDifference): print i, j
    plt.legend(loc=loc)
    plt.xlabel(getLatexForString('Length of information stream (10^4)'))
    plt.ylabel(getLatexForString('Running time (s)'))
    plt.title(getLatexForString(title))
    locs, labels = plt.xticks()
    plt.xticks(locs, map(lambda x: '%d' % (x / 10000), locs))
    if xmax:
        plt.xlim(xmax=xmax)
    if xmin:
        plt.xlim(xmin=xmin)
    plt.savefig(fileName + '.eps')
    plt.savefig(fileName + '.pdf')
def plotRunningTime(experimentsData, pltInfo, key1, key2, semilog=True):
    dataY1, dataY2 = [], []
    for y1, y2 in zip(experimentsData[key1]['iteration_time'], experimentsData[key2]['iteration_time']):
        dataY1.append(y1)
        dataY2.append(y2)
    numberOfPoints = len(dataY1)
    window = 20
    for k, dataY in zip([key1, key2], [dataY1, dataY2]):
        if semilog:
            plt.semilogy(range(numberOfPoints)[:-window],
                         movingAverage(dataY[:numberOfPoints], window)[:-window],
                         pltInfo[k]['type'], label=pltInfo[k]['label'], color=pltInfo[k]['color'], lw=2)
        else:
            plt.plot(range(numberOfPoints)[:-window],
                     movingAverage(dataY[:numberOfPoints], window)[:-window],
                     pltInfo[k]['type'], label=pltInfo[k]['label'], color=pltInfo[k]['color'], lw=2)
    plt.ylabel(getLatexForString('Running time (s)'))
    return numberOfPoints
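# Hedged sketch of movingAverage (defined elsewhere): plotRunningTime and
# plotQuality pair its output with range(numberOfPoints), so it is
# assumed to return a same-length trailing-window mean.
import numpy as np

def movingAverage(data, window):
    return [np.mean(data[max(0, i - window + 1):i + 1]) for i in range(len(data))]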
def subPlot(id, timeUnit):
    # Nested helper: relies on params, x, y and self from the enclosing
    # plot method's scope.
    plt.subplot(id)
    print timeUnit, calculatePercentageOfDecayedPhrasesFor(params, timeUnit)
    plt.plot(x, y, 'o',
             label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (params[0], params[1]),
             color=self.stream_settings['plot_color'])
    plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, params, x),
             color=self.stream_settings['plot_color'], lw=2)
def plotJustifyMemoryPruning(self):
    pltInfo = {JustifyMemoryPruning.with_memory_pruning: {'label': getLatexForString('With pruning'), 'color': '#7109AA', 'type': '-'},
               JustifyMemoryPruning.without_memory_pruning: {'label': getLatexForString('Without pruning'), 'color': '#5AF522', 'type': '-'}}
    experimentsData = {JustifyMemoryPruning.with_memory_pruning: {'iteration_time': [], 'quality': [], 'total_clusters': []},
                       JustifyMemoryPruning.without_memory_pruning: {'iteration_time': [], 'quality': [], 'total_clusters': []}}
    loadExperimentsData(experimentsData, JustifyMemoryPruning.stats_file)
    numberOfPoints = 275
    plt.subplot(312)
    plotRunningTime(experimentsData, pltInfo, JustifyMemoryPruning.with_memory_pruning, JustifyMemoryPruning.without_memory_pruning)
    plt.legend(loc=2, ncol=2)
    plt.xticks([], tick1On=False)
    plt.xlim(xmax=270)
    plt.subplot(313)
    plotQuality(experimentsData, numberOfPoints, pltInfo)
    plt.xlabel(getLatexForString('Time'))
    plt.xlim(xmax=270)
    plt.subplot(311)
    plotClusters(experimentsData, numberOfPoints, pltInfo)
    plt.title(getLatexForString('Impact of memory pruning'))
    plt.xticks([], tick1On=False)
    plt.xlim(xmax=270)
    plt.savefig('justifyMemoryPruning.pdf')
def plotThresholdForDocumentToBeInCluster(self, statsFile):
    dataToPlot = dict(('%0.2f' % (t * 0.05), {'iteration_time': [], 'purity': [], 'nmi': []}) for t in range(1, 21))
    for data in FileIO.iterateJsonFromFile(statsFile):
        threshold = '%0.2f' % data['settings']['threshold_for_document_to_be_in_cluster']
        for k in dataToPlot[threshold]:
            dataToPlot[threshold][k] += [data['streaming_lsh'][k]]
    for t in dataToPlot:
        for k in dataToPlot[t]:
            dataToPlot[t][k] = np.mean(dataToPlot[t][k])
    dataX = sorted([float(i) for i in dataToPlot])[:-1]
    print dataX
    # Plot iteration time.
    plt.subplot(211)
    plt.plot(dataX, [dataToPlot['%0.2f' % x]['iteration_time'] for x in dataX], lw=2, color='k')
    plt.ylabel(getLatexForString('Time (s)'))
    plt.title(getLatexForString('Estimation of \epsilon^\prime for Stream SSA'))
    plt.subplot(212)
    for metric, label, color in [('nmi', 'NMI', '#F60018'), ('purity', 'Purity', '#25D500')]:
        plt.plot(dataX, [dataToPlot['%0.2f' % x][metric] for x in dataX], label=label, color=color, lw=2)
    plt.ylabel(getLatexForString('Score'))
    plt.xlabel(getLatexForString('Similarity threshold (\epsilon^\prime)'))
    plt.legend(loc=4)
    plt.show()
def plotJustifyDimensionsEstimation(self):
    runningTimeData, purityData = defaultdict(list), defaultdict(list)
    for data in FileIO.iterateJsonFromFile(JustifyDimensionsEstimation.stats_file):
        if data['iteration_parameters']['dimensions'] < data['no_of_observed_dimensions']:
            no_of_dimensions = data['iteration_parameters']['dimensions']
            runningTimeData[no_of_dimensions].append(data['iteration_time'])
            purityData[no_of_dimensions].append(data['purity'])
    plt.subplot(111)
    dataX, dataY = [], []
    del purityData[169991]
    del purityData[39989]
    plt.title(getLatexForString('Impact of dimension estimation'))
    for k in sorted(purityData):
        dataX.append(k)
        dataY.append(np.mean(purityData[k]))
    plt.semilogx(dataX, [0.96] * len(dataX), '--', label=getLatexForString('Top n dimensions'), color='#7109AA', lw=2)
    plt.semilogx(dataX, [np.mean(dataY)] * len(dataX), '--', color='#5AF522', lw=2)
    plt.semilogx(dataX, dataY, '-x', label=getLatexForString('Fixed dimensions'), color='#5AF522', lw=2)
    plt.ylim(0.8, 1.0)
    plt.xlim(7000, 203000)
    plt.xlabel(getLatexForString('\# of dimensions'))
    plt.ylabel(getLatexForString('Purity'))
    plt.legend(loc=3)
    plt.savefig('justifyDimensionsEstimation.pdf')
def plotCDFDimensionsLagDistribution(self, returnAxisValuesOnly=True):
    '''
    Inactivity time is the time after which there is a high probability
    that a dimension will not appear. Find the time_unit that gives this
    probability.

    Cumulative distribution function
    (http://en.wikipedia.org/wiki/Cumulative_distribution_function)
    lag = time between occurrences of two dimensions (similar to inactivity_time)
    F(time_unit) = P(lag <= time_unit)
    time_unit = F_inv(P(lag <= time_unit))

    Given P(inactivity_time > time_unit), determine time_unit as shown:
    P(inactivity_time <= time_unit) = 1 - P(inactivity_time > time_unit)
    inactivity_time = F_inv(P(inactivity_time <= time_unit))

    numberOfTimeUnits=10*24*12
    Experts stream [ 0.23250341  0.250209  ] 0.25 107
    Houston stream [ 0.16948096  0.30751358] 0.25 126
    Experts stream [ 0.23250341  0.250209  ] 0.1, 223
    Houston stream [ 0.16948096  0.30751358] 0.1, 228

    Compared to other values these values are pretty close to each other.
    This is expected. Irrespective of the size of the streams, the phrases
    have the same lifetime and hence decay close to each other.
    '''
    def calculateInActivityTimeFor(params, probabilityOfInactivity):
        return int(CurveFit.inverseOfIncreasingExponentialFunction(params, 1 - probabilityOfInactivity))
    data = list(FileIO.iterateJsonFromFile(self.dimensionInActivityTimeFile))[numberOfTimeUnits]
    total = float(sum(data[ParameterEstimation.dimensionInActivityTimeId].values()))
    x = sorted(map(int, data[ParameterEstimation.dimensionInActivityTimeId].keys()))
    y = getCumulativeDistribution([data[ParameterEstimation.dimensionInActivityTimeId][str(i)] / total for i in x])
    print len(x)
    exponentialCurveParams = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
    print self.stream_settings['plot_label'], exponentialCurveParams, calculateInActivityTimeFor(exponentialCurveParams, 0.1)
    plt.plot(x, y, 'o',
             label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]),
             color=self.stream_settings['plot_color'])
    plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, exponentialCurveParams, x),
             color=self.stream_settings['plot_color'], lw=2)
    plt.ylabel(r'$P\ (\ lag\ \leq\ TU\ )$')
    plt.xlabel(getLatexForString(xlabelTimeUnits))
    plt.title(getLatexForString('CDF for dimension lag distribution.'))
    plt.ylim((0, 1.2))
    plt.legend(loc=4)
    if returnAxisValuesOnly:
        plt.show()
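# Worked check, assuming the power-law form a*x**b sketched with CurveFit
# earlier: for the Experts stream parameters recorded in the docstring,
#   calculateInActivityTimeFor([0.23250341, 0.250209], 0.25)
#     = int((0.75 / 0.23250341) ** (1 / 0.250209)) = 107,
# matching the reported inactivity threshold of 107 time units.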
def runningTimes(*iterators, **kwargs):
    loc = kwargs.get('loc', 1)
    fileName = kwargs.get('file_name', 'running_times.eps')
    xmax = kwargs.get('xmax', None)
    semilog = kwargs.get('log', False)
    xmin = kwargs.get('xmin', None)
    title = kwargs.get('title', None)
    for id, iterator in iterators:
        dataX, dataY = [], []
        for data in iterator:
            if data['no_of_documents'] != 900000:
                # Keep only points within xmax when an xmax is given.
                if xmax:
                    if data['no_of_documents'] <= xmax:
                        dataX.append(data['no_of_documents'])
                        dataY.append(data['iteration_time'])
                else:
                    dataX.append(data['no_of_documents'])
                    dataY.append(data['iteration_time'])
        if not semilog:
            plt.loglog(dataX, dataY, label=algorithm_info[id]['label'], color=algorithm_info[id]['color'],
                       lw=2, marker=algorithm_info[id]['marker'])
        else:
            plt.semilogx(dataX, dataY, label=algorithm_info[id]['label'], color=algorithm_info[id]['color'],
                         lw=2, marker=algorithm_info[id]['marker'])
    plt.legend(loc=loc)
    plt.xlabel(getLatexForString('Length of information stream'))
    plt.ylabel(getLatexForString('Running time (s)'))
    plt.title(getLatexForString(title))
    if xmax:
        plt.xlim(xmax=xmax)
    if xmin:
        plt.xlim(xmin=xmin)
    plt.savefig(fileName + '.eps')
    plt.savefig(fileName + '.pdf')
def plotClusters(experimentsData, numberOfPoints, pltInfo):
    for k in experimentsData:
        window = 4
        plt.semilogy(range(numberOfPoints)[:-window],
                     movingAverage(experimentsData[k]['total_clusters'][:numberOfPoints], window)[:-window],
                     pltInfo[k]['type'], label=pltInfo[k]['label'], color=pltInfo[k]['color'], lw=2)
    plt.ylabel(getLatexForString('Clusters in memory'))
def plotICDFClustersLagDistribution(self, returnAxisValuesOnly=True):
    '''
    Experts stream 0.25 199
    Houston stream 0.25 152
    '''
    self.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId] = \
        self.stream_settings['parameter_estimation_folder'] + ClusteringParametersEstimation.clusterLagDistributionId
    dataX, dataY, total = set(), defaultdict(list), []
    for line in list(FileIO.iterateJsonFromFile(self.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId])):
        print line.keys()
        data = dict((int(k), v) for k, v in line[ClusteringParametersEstimation.clusterLagDistributionId].iteritems())
        total.append(sum(data.values()))
        for i in data:
            dataY[i].append(data[i])
            dataX.add(i)
    totalInstancesObserved = float(sum(total))
    x = sorted(dataX)
    y = getInverseCumulativeDistribution([sum(dataY[k]) / totalInstancesObserved for k in x])
    plt.plot(x, y, label=getLatexForString(self.stream_settings['plot_label']),
             color=self.stream_settings['plot_color'], lw=2)
    if self.stream_settings['plot_label'] == 'Houston stream':
        plt.plot([0, x[-1]], [1, 0], '--', color='#5AF522', lw=2)
    plt.ylabel(r'$P\ (\ inactivity\ duration\ \geq\ inactivity\ duration\ threshold\ )$')
    plt.xlabel(getLatexForString('Inactivity duration threshold'))
    plt.title(getLatexForString('Inactivity analysis for crowds.'))
    plt.legend()
    if returnAxisValuesOnly:
        plt.show()
def plotDimensionsUpdateFrequencyEstimation(self, returnAxisValuesOnly=True):
    '''
    numberOfTimeUnits=10*24*12
    Experts stream 12
    Houston stream 2
    '''
    dataDistribution = defaultdict(list)
    for line in FileIO.iterateJsonFromFile(self.dimensionsUpdateFrequencyFile):
        for k, v in line[ParameterEstimation.dimensionsUpdateFrequencyId].iteritems():
            k = int(k) / self.timeUnitInSeconds.seconds
            if k not in dataDistribution:
                dataDistribution[k] = [0., 0.]
            dataDistribution[k][0] += v
            dataDistribution[k][1] += 1
    x, y, x1, y1 = [], [], [], []
    for k in sorted(dataDistribution):
        x.append(k)
        y.append(dataDistribution[k][0] / dataDistribution[k][1])
        x1.append(k)
        y1.append((dataDistribution[k][0] / dataDistribution[k][1]) / k)
    x = x[:numberOfTimeUnits]
    y = y[:numberOfTimeUnits]
    x1 = x1[:numberOfTimeUnits]
    y1 = y1[:numberOfTimeUnits]

    def subPlot(id):
        plt.subplot(id)
        inactivityCoordinates = max(zip(x1, y1), key=itemgetter(1))
        plt.semilogx(x1, y1, '-', color=self.stream_settings['plot_color'],
                     label=getLatexForString(self.stream_settings['plot_label'] + ' (Update frequency=%d TU)' % inactivityCoordinates[0]), lw=2)
        plt.semilogx([inactivityCoordinates[0]], [inactivityCoordinates[1]], 'o', alpha=0.7, color='r')
        plt.subplot(id).yaxis.set_major_formatter(FuncFormatter(lambda x, i: '%0.1f' % (x / 10. ** 3)))
        plt.yticks((min(y1), max(y1)))
        print self.stream_settings['plot_label'], inactivityCoordinates[0]

    plt.subplot(311)
    plt.title(getLatexForString('Dimensions update frequency estimation'))
    plt.semilogx(x, y, '-', color=self.stream_settings['plot_color'],
                 label=getLatexForString(self.stream_settings['plot_label']), lw=2)
    plt.subplot(311).yaxis.set_major_formatter(FuncFormatter(lambda x, i: '%0.1f' % (x / 10. ** 5)))
    plt.text(0.0, 1.01, getLatexForString('10^5'), transform=plt.gca().transAxes)
    plt.ylabel(getLatexForString('\# of decayed dimensions'))
    if self.stream_settings['stream_id'] == 'experts_twitter_stream':
        subPlot(312)
    else:
        subPlot(313)
        plt.xlabel(getLatexForString(xlabelTimeUnits))
    plt.ylabel(getLatexForString('Rate of DD (10^3)'))
    plt.legend(loc=3)
    if returnAxisValuesOnly:
        plt.show()
def plotQuality():
    kmeans = (0.79, 0.78, 0.80, 0.80, 0.79)
    cda_it = (0.98, 0.93, 0.81, 0.84, 0.79)
    cda_unopt = (0.95, 0.85, 0.86, 0.84, 0.87)
    cda = (0.96, 0.88, 0.86, 0.85, 0.88)
    # kmeans = (0.79, 0.78)
    # cda_it = (0.98, 0.93)
    # cda_unopt = (0.95, 0.85)
    # cda = (0.96, 0.88)
    N = len(kmeans)
    ind = np.arange(N)  # the x locations for the groups
    width = 0.1  # the width of the bars
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # rectsKmeans = ax.bar(ind, kmeans, width, color='#FF7A7A', label='k-means', hatch='\\')
    # rectsCdaIt = ax.bar(ind + width, cda_it, width, color='#FF7AEB', label='Iterative CDA', hatch='/')
    # rectsCdaUnopt = ax.bar(ind + 2 * width, cda_unopt, width, color='#7A7AFF', label='Stream-CDA', hatch='-')
    # rectsCda = ax.bar(ind + 3 * width, cda, width, color='#B0B0B0', label='Tailored Stream-CDA', hatch='x')
    rectsKmeans = ax.bar(ind, kmeans, width, color='#DCDCDC', label=getLatexForString('k-means'), hatch='\\')
    rectsCdaIt = ax.bar(ind + width, cda_it, width, color='#808080', label=getLatexForString('Iterative CDA'), hatch='/')
    rectsCdaUnopt = ax.bar(ind + 2 * width, cda_unopt, width, color='#778899', label=getLatexForString('Stream-CDA'), hatch='-')
    rectsCda = ax.bar(ind + 3 * width, cda, width, color='#2F4F4F', label=getLatexForString('Tailored Stream-CDA'), hatch='x')
    ax.set_ylabel(getLatexForString('Score'))
    ax.set_title(getLatexForString('Quality of crowds discovered'))
    ax.set_xticks(ind + width)
    ax.set_xticklabels(('Purity', 'NMI', 'F1', 'Precision', 'Recall'))
    # ax.set_xticklabels((getLatexForString('Purity'), getLatexForString('NMI')))
    plt.legend(loc=8, ncol=2)
    plt.savefig('crowds_quality.pdf')
def plotJustifyDimensionsEstimation2(self):
    pltInfo = {JustifyDimensionsEstimation.top_n_dimension: {'label': getLatexForString('Temporally significant'), 'color': '#7109AA', 'type': '-'},
               JustifyDimensionsEstimation.first_n_dimension: {'label': getLatexForString('By occurrence'), 'color': '#5AF522', 'type': '-'}}
    experimentsData = {JustifyDimensionsEstimation.top_n_dimension: defaultdict(dict),
                       JustifyDimensionsEstimation.first_n_dimension: defaultdict(dict)}
    for data in FileIO.iterateJsonFromFile(JustifyDimensionsEstimation.stats_file_2):
        # for data in FileIO.iterateJsonFromFile('temp/dimensions_need_analysis_2'):
        # if 'dimensions' in data['iteration_parameters']:
        dimension = data['iteration_parameters']['dimensions']
        type = data['iteration_parameters']['type']
        if dimension not in experimentsData[type]:
            experimentsData[type][dimension] = {'iteration_time': [], 'quality': [], 'total_clusters': []}
        experimentsData[type][dimension]['iteration_time'].append(data['iteration_time'])
        experimentsData[type][dimension]['quality'].append(data['purity'])
        experimentsData[type][dimension]['total_clusters'].append(data['no_of_clusters'])
    lshData = dict([(k, np.mean(experimentsData[JustifyDimensionsEstimation.top_n_dimension][76819][k]))
                    for k in experimentsData[JustifyDimensionsEstimation.top_n_dimension][76819]])
    del experimentsData[JustifyDimensionsEstimation.top_n_dimension][76819]
    print lshData
    plotData = {JustifyDimensionsEstimation.top_n_dimension: defaultdict(list),
                JustifyDimensionsEstimation.first_n_dimension: defaultdict(list)}
    for type in experimentsData:
        for dimension in sorted(experimentsData[type]):
            plotData[type]['dataX'].append(dimension)
            for k in experimentsData[type][dimension]:
                plotData[type][k].append(np.mean(experimentsData[type][dimension][k]))
    plt.subplot(311)
    for type in experimentsData:
        plt.semilogy([x / 10 ** 3 for x in plotData[type]['dataX']],
                     movingAverage(plotData[type]['total_clusters'], 4),
                     color=pltInfo[type]['color'], label=pltInfo[type]['label'], lw=2)
    plt.semilogy([x / 10 ** 3 for x in plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']],
                 [lshData['total_clusters']] * len(plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']),
                 '--', color='#FF1300', label=getLatexForString('Top-76819 dimensions'), lw=2)
    plt.ylim(ymin=1)
    plt.subplot(312)
    for type in experimentsData:
        plt.semilogy([x / 10 ** 3 for x in plotData[type]['dataX']],
                     movingAverage(plotData[type]['iteration_time'], 4),
                     color=pltInfo[type]['color'], label=pltInfo[type]['label'], lw=2)
    plt.semilogy([x / 10 ** 3 for x in plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']],
                 [lshData['iteration_time']] * len(plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']),
                 '--', color='#FF1300', label=getLatexForString('Top-76819'), lw=2)
    plt.ylim(ymin=1, ymax=1500)
    plt.legend(loc=2, ncol=2)
    plt.subplot(313)
    for type in experimentsData:
        plt.plot([x / 10 ** 3 for x in plotData[type]['dataX']],
                 movingAverage(plotData[type]['quality'], 4),
                 color=pltInfo[type]['color'], label=pltInfo[type]['label'], lw=2)
    plt.ylabel('$Mean\ purity\ per\ iteration$')
    plt.title(getLatexForString('Impact of dimension ranking'))
    plt.xlabel('$\#\ number\ of\ dimensions\ (10^3)$')
    plt.plot([x / 10 ** 3 for x in plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']],
             [lshData['quality']] * len(plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']),
             '--', color='#FF1300', label=getLatexForString('Top-76819 dimensions'), lw=2)
    plt.ylim(ymin=0.80, ymax=1.0)
    plt.savefig('justifyDimensionsEstimation2.pdf')
def sampleCrowds(self):
    # Set dates for experts as startingDay=datetime(2011,3,19), endingDay=datetime(2011,3,30)
    # with a minimum of 7 users at a time.
    AnalyzeData.reset()
    AnalyzeData.constructCrowdDataStructures(self.stream_settings['data_iterator'])
    fig = plt.figure()
    ax = fig.gca()
    # expectedTags = set(['#redsox', '#mlb', '#sfgiants', '#49ers', '#mariners', '#twins', '#springtraining', '#mets', '#reds'])
    # expectedTags = set(['#ctia']); title = 'CTIA 2011'
    # expectedTags = set(['#55', '#hcr', '#hcrbday', '#oklahomas', '#aca', '#hcworks', '#npr', '#teaparty'])
    # expectedTags = set(['#budget11', '#taxdodgers', '#budget', '#pmqs', '#budget11', '#indybudget'])
    # expectedTags = set(['#egypt2dc', '#libyan', '#yemen', '#egypt', '#syria', '#gaddaficrimes', '#damascus', '#jan25',
    #                     '#daraa', '#feb17', '#gaddafi', '#libya', '#feb17', '#gadhafi', '#muslimbrotherhood', '#gaddafis']); title = 'Middle East'
    expectedTags = set(['#libya'])
    title = 'Libya'
    for crowd in self._filteredCrowdIterator():
        if expectedTags.intersection(set(list(crowd.hashtagDimensions))):
            x, y = zip(*[(datetime.fromtimestamp(clusterGenerationTime),
                          len(crowd.clusters[clusterGenerationTime].documentsInCluster))
                         for clusterGenerationTime in sorted(crowd.clusters)])
            plt.plot_date(x, y, '-', color=GeneralMethods.getRandomColor(), lw=2,
                          label=' '.join([crowd.crowdId] + list(crowd.hashtagDimensions)[:1]))
    fig.autofmt_xdate(rotation=30)
    ax.xaxis.set_major_locator(matplotlib.dates.HourLocator(interval=24))
    ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%a %d %b'))
    # plt.legend()
    plt.xlim((datetime(2011, 3, 19), datetime(2011, 3, 30)))
    plt.title(getLatexForString('Crowds for ' + title))
    plt.ylabel(getLatexForString('Crowd size'))
    plt.show()
def plotQuality(experimentsData, numberOfPoints, pltInfo):
    for k in experimentsData:
        dataY = movingAverage(experimentsData[k]['quality'][:numberOfPoints], 4)
        plt.plot(range(numberOfPoints), [np.mean(dataY)] * numberOfPoints, '--', color=pltInfo[k]['color'], lw=2)
        plt.plot(range(numberOfPoints), dataY, pltInfo[k]['type'], label=pltInfo[k]['label'], color=pltInfo[k]['color'], lw=2)
    plt.ylabel(getLatexForString('Purity'))
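# movingAverage smooths the noisy per-iteration quality series before
# plotting. It is defined elsewhere; since the callers plot its output against
# range(numberOfPoints), it must preserve length. A minimal sketch under that
# assumption (the exact window handling is a guess):
def movingAverage(values, window):
    # Average each point with up to window - 1 preceding points,
    # returning a list of the same length as the input.
    smoothed = []
    for i in xrange(len(values)):
        start = max(0, i - window + 1)
        smoothed.append(np.mean(values[start:i + 1]))
    return smoothed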
def plotClusteringSpeed(saveFig=True):
    dataToPlot = dict([(k, {'x': [], 'y': []}) for k in plotSettings])
    for data in FileIO.iterateJsonFromFile(TweetsFile.stats_file):
        for k in plotSettings:
            dataToPlot[k]['x'].append(data[k]['no_of_documents'])
            dataToPlot[k]['y'].append(data[k]['iteration_time'])
    for k in plotSettings:
        # A window of 1 makes movingAverage a no-op; kept so smoothing is one edit away.
        plt.loglog(dataToPlot[k]['x'], movingAverage(dataToPlot[k]['y'], 1), label=plotSettings[k]['label'], color=plotSettings[k]['color'], lw=2)
    # Debug: spot-check one point on the streaming LSH curve.
    print dataToPlot['streaming_lsh']['x'][10]
    print dataToPlot['streaming_lsh']['y'][10]
    plt.legend(loc=4)
    plt.xlabel(getLatexForString('\# of documents'))
    plt.ylabel(getLatexForString('Running time (s)'))
    plt.title(getLatexForString('Running time comparison for Streaming LSH with SSA'))
    plt.xlim(xmin=500, xmax=600000)
    # plt.show()
    if saveFig:
        plt.savefig('speedComparisonWithSSA.pdf')
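# plotSettings maps each algorithm key in the stats file to its plot style.
# Its definition is not in this file; a plausible shape, inferred from the
# lookups above (the keys match the stats file, but the labels and colors
# here are assumptions):
plotSettings = {
    'k_means': {'label': getLatexForString('k-means'), 'color': '#FF7A7A'},
    'mr_k_means': {'label': getLatexForString('MR k-means'), 'color': '#7A7AFF'},
    'streaming_lsh': {'label': getLatexForString('Streaming LSH'), 'color': '#5AF522'},
}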
def plotQuality():
    kmeans = (0.79, 0.78, 0.80, 0.80, 0.79)
    cda_it = (0.98, 0.93, 0.81, 0.84, 0.79)
    cda_unopt = (0.95, 0.85, 0.86, 0.84, 0.87)
    cda = (0.96, 0.88, 0.86, 0.85, 0.88)
    # Two-metric variant, kept for reference:
    # kmeans = (0.79, 0.78)
    # cda_it = (0.98, 0.93)
    # cda_unopt = (0.95, 0.85)
    # cda = (0.96, 0.88)
    N = len(kmeans)
    ind = np.arange(N)  # the x locations for the groups
    width = 0.1  # the width of the bars
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Colored variant, kept for reference:
    # rectsKmeans = ax.bar(ind, kmeans, width, color='#FF7A7A', label='k-means', hatch='\\')
    # rectsCdaIt = ax.bar(ind + width, cda_it, width, color='#FF7AEB', label='Iterative CDA', hatch='/')
    # rectsCdaUnopt = ax.bar(ind + 2 * width, cda_unopt, width, color='#7A7AFF', label='Stream-CDA', hatch='-')
    # rectsCda = ax.bar(ind + 3 * width, cda, width, color='#B0B0B0', label='Tailored Stream-CDA', hatch='x')
    rectsKmeans = ax.bar(ind, kmeans, width, color='#DCDCDC', label=getLatexForString('k-means'), hatch='\\')
    rectsCdaIt = ax.bar(ind + width, cda_it, width, color='#808080', label=getLatexForString('Iterative CDA'), hatch='/')
    rectsCdaUnopt = ax.bar(ind + 2 * width, cda_unopt, width, color='#778899', label=getLatexForString('Stream-CDA'), hatch='-')
    rectsCda = ax.bar(ind + 3 * width, cda, width, color='#2F4F4F', label=getLatexForString('Tailored Stream-CDA'), hatch='x')
    ax.set_ylabel(getLatexForString('Score'))
    ax.set_title(getLatexForString('Quality of crowds discovered'))
    ax.set_xticks(ind + width)
    ax.set_xticklabels(('Purity', 'NMI', 'F1', 'Precision', 'Recall'))
    # ax.set_xticklabels((getLatexForString('Purity'), getLatexForString('NMI')))
    plt.legend(loc=8, ncol=2)
    plt.savefig('crowds_quality.pdf')
def plotPercentageOfDimensionsWithinALag(self, returnAxisValuesOnly=True):
    '''
    This gives us the percentage of phrases we can lose every time we prune phrases.

    Measures the percentage of dimensions having lag less than TU. So, at the end
    of the 10th day, almost y% of phrases can be removed, with some probability
    that they will not occur again.

    numberOfTimeUnits=10*24*12
    With 75% probability.
    Experts stream [ 0.0097055   0.81888514] 107 0.554497397565
    Houston stream [ 0.00943499  0.825918  ] 126 0.487757815615
    With 90% probability.
    Experts stream [ 0.0097055   0.81888514] 223 0.187150798756
    Houston stream [ 0.00943499  0.825918  ] 228 0.164007589276
    '''
    def calculatePercentageOfDecayedPhrasesFor(params, timeUnit):
        return 1 - CurveFit.increasingExponentialFunction(params, timeUnit)
    dataDistribution = {}
    currentTimeUnit = 0
    for data in list(FileIO.iterateJsonFromFile(self.dimensionInActivityTimeFile))[:numberOfTimeUnits]:
        totalDimensions = float(sum(data['phrases_lag_distribution'].values()))
        for k, v in data['phrases_lag_distribution'].iteritems():
            k = int(k)
            if k not in dataDistribution:
                dataDistribution[k] = [0] * numberOfTimeUnits
            dataDistribution[k][currentTimeUnit] = v / totalDimensions
        currentTimeUnit += 1
    x = sorted(dataDistribution)
    y = getCumulativeDistribution([np.mean(dataDistribution[k]) for k in x])
    params = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
    print self.stream_settings['plot_label'], params,
    def subPlot(id, timeUnit):
        plt.subplot(id)
        print timeUnit, calculatePercentageOfDecayedPhrasesFor(params, timeUnit)
        plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (params[0], params[1]), color=self.stream_settings['plot_color'])
        plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, params, x), color=self.stream_settings['plot_color'], lw=2)
    if self.stream_settings['stream_id'] == 'experts_twitter_stream':
        subPlot(111, 107)
        plt.title(getLatexForString('Percentage of phrases within a lag'))
    else:
        subPlot(111, 126)
        plt.xlabel(getLatexForString(xlabelTimeUnits))
    plt.ylabel(r'$\%\ of\ phrases\ with\ lag\ \leq\ TU$')
    plt.legend(loc=4)
    if returnAxisValuesOnly:
        plt.show()
def plotICDFClustersLagDistribution(self, returnAxisValuesOnly=True):
    '''
    Experts stream 0.25 199
    Houston stream 0.25 152
    '''
    self.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId] = self.stream_settings['parameter_estimation_folder'] + ClusteringParametersEstimation.clusterLagDistributionId
    dataX, dataY, total = set(), defaultdict(list), []
    for line in list(FileIO.iterateJsonFromFile(self.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId])):
        # print line.keys()  # debug
        data = dict((int(k), v) for k, v in line[ClusteringParametersEstimation.clusterLagDistributionId].iteritems())
        total.append(sum(data.values()))
        for i in data:
            dataY[i].append(data[i])
            dataX.add(i)
    totalInstancesObserved = float(sum(total))
    x = sorted(dataX)
    y = getInverseCumulativeDistribution([sum(dataY[k]) / totalInstancesObserved for k in x])
    plt.plot(x, y, label=getLatexForString(self.stream_settings['plot_label']), color=self.stream_settings['plot_color'], lw=2)
    if self.stream_settings['plot_label'] == 'Houston stream':
        plt.plot([0, x[-1]], [1, 0], '--', color='#5AF522', lw=2)
    plt.ylabel(r'$P\ (\ inactivity\ duration\ \geq\ inactivity\ duration\ threshold\ )$')
    plt.xlabel(getLatexForString('Inactivity duration threshold'))
    plt.title(getLatexForString('Inactivity analysis for crowds.'))
    plt.legend()
    if returnAxisValuesOnly:
        plt.show()
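# getCumulativeDistribution and getInverseCumulativeDistribution turn the
# normalized mass at each lag into P(lag <= threshold) and P(lag >= threshold)
# curves. They are defined elsewhere; a minimal sketch under that reading
# (the bodies are assumptions):
def getCumulativeDistribution(probabilities):
    # Running sum: element i is P(X <= x_i).
    cumulative, runningTotal = [], 0.
    for p in probabilities:
        runningTotal += p
        cumulative.append(runningTotal)
    return cumulative

def getInverseCumulativeDistribution(probabilities):
    # Complementary tail: element i is P(X >= x_i).
    total = sum(probabilities)
    inverse, runningTotal = [], 0.
    for p in probabilities:
        inverse.append(total - runningTotal)
        runningTotal += p
    return inverse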
def plotPercentageOfClustersWithinALag(self, returnAxisValuesOnly=True):
    '''
    458 Experts stream [ 0.01860266  0.70639136] 15 0.874004297177
    80 Houston stream [ 0.0793181   0.47644004] 3 0.866127308876
    '''
    def calculatePercentageOfDecayedPhrasesFor(params, timeUnit):
        return 1 - CurveFit.increasingExponentialFunction(params, timeUnit)
    dataDistribution = {}
    currentTimeUnit = 0
    # dataFile = '/mnt/chevron/kykamath/data/twitter/lsh_crowds/houston_stream/parameter_estimation/cluster_lag_distribution'
    dataFile = self.hdsClustering.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId]
    lines = list(FileIO.iterateJsonFromFile(dataFile))
    numberOfTimeUnits = len(lines)
    for data in lines:
        totalClusters = float(sum(data[ClusteringParametersEstimation.clusterLagDistributionId].values()))
        for k, v in data[ClusteringParametersEstimation.clusterLagDistributionId].iteritems():
            k = int(k)
            if k not in dataDistribution:
                dataDistribution[k] = [0] * numberOfTimeUnits
            dataDistribution[k][currentTimeUnit] = v / totalClusters
        currentTimeUnit += 1
    x = sorted(dataDistribution)
    print numberOfTimeUnits,
    y = getCumulativeDistribution([np.mean(dataDistribution[k]) for k in x])
    params = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
    print self.stream_settings['plot_label'], params,
    def subPlot(id, timeUnit):
        plt.subplot(id)
        print timeUnit, calculatePercentageOfDecayedPhrasesFor(params, timeUnit)
        plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (params[0], params[1]), color=self.stream_settings['plot_color'])
        plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, params, x), color=self.stream_settings['plot_color'], lw=2)
    if self.stream_settings['stream_id'] == 'experts_twitter_stream':
        subPlot(111, 15)
        plt.title(getLatexForString('Percentage of clusters within a lag'))
    else:
        subPlot(111, 3)
        plt.xlabel(getLatexForString(xlabelTimeUnits))
    plt.ylabel(r'$\%\ of\ clusters\ with\ lag\ \leq\ TU$')
    plt.legend(loc=4)
    if returnAxisValuesOnly:
        plt.show()