def performanceForCDAAt(noOfTweets, fileName, **stream_settings):
    clustering = HDStreaminClustering(**stream_settings)
    ts = time.time()
    clustering.cluster(TwitterIterators.iterateFromFile(fileName))
    te = time.time()
    # Keep only clusters that hold at least cluster_filter_threshold documents.
    documentClusters = [
        cluster.documentsInCluster.keys()
        for k, cluster in clustering.clusters.iteritems()
        if len(cluster.documentsInCluster) >= stream_settings['cluster_filter_threshold']
    ]
    return Evaluation.getEvaluationMetrics(noOfTweets, documentClusters, te - ts)
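A minimal invocation sketch; the tweet-file path and the settings value shown are placeholders, and in practice stream_settings must carry every key HDStreaminClustering expects, not just the one used directly above:

# Hypothetical invocation; '/path/to/tweets' and the settings value are
# illustrative assumptions, not values from the original project.
stream_settings = {'cluster_filter_threshold': 1}
metrics = performanceForCDAAt(100000, '/path/to/tweets', **stream_settings)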
Example #3
    def generateStatsForHDLSHClustering(self):
        print 'HD LSH'

        # Helper used only by the commented-out clustering variants below.
        def _getDocumentFromTuple((user, text)):
            vector, words = Vector(), text.split()
            for word in words[1:]:
                if word not in vector: vector[word] = 1
                else: vector[word] += 1
            return Document(user, vector)

        self.stream_settings['convert_data_to_message_method'] = \
            TwitterCrowdsSpecificMethods.convertTweetJSONToMessage
        self.stream_settings['cluster_analysis_method'] = emptyClusterAnalysisMethod
#        self.stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
        # Drop tweets with empty text, then sort chronologically by the parsed
        # timestamp (itemgetter(1); the original itemgetter(0) would compare
        # the raw tweet dicts instead of the timestamps built for sorting).
        self.documents = [
            tw[1] for tw in self._tweetWithTimestampIterator()
            if tw[1]['text'].strip() != ''
        ]
        self.documents = [
            tw[0] for tw in sorted(
                [(t, getDateTimeObjectFromTweetTimestamp(t['created_at']))
                 for t in self.documents],
                key=itemgetter(1))
        ]
        clustering = HDStreaminClustering(**self.stream_settings)
        ts = time.time()
#        for tweet in self.documents: clustering.getClusterAndUpdateExistingClusters(_getDocumentFromTuple(tweet))
#        clustering.cluster([_getDocumentFromTuple(d) for d in self.documents])
        clustering.cluster(self.documents)
        te = time.time()
        documentClusters = [
            cluster.documentsInCluster.keys()
            for k, cluster in clustering.clusters.iteritems()
            if len(cluster.documentsInCluster) >=
               self.stream_settings['cluster_filter_threshold']
        ]
        return self.getEvaluationMetrics(documentClusters, te - ts)
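For intuition, the helper's term counting is equivalent to this self-contained sketch, using a plain dict in place of the project's Vector (note that words[1:] deliberately skips the first token of the text):

def term_counts(text):
    # Count every word after the first token, as _getDocumentFromTuple does.
    counts = {}
    for word in text.split()[1:]:
        counts[word] = counts.get(word, 0) + 1
    return counts

print term_counts('tweet_id hello world hello')  # {'hello': 2, 'world': 1}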
Example #6
class ClusteringParametersEstimation():
    clusterLagDistributionId = 'cluster_lag_distribution'

    def __init__(self, **stream_settings):
        stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId] = \
            stream_settings['parameter_estimation_folder'] + \
            ClusteringParametersEstimation.clusterLagDistributionId
        self.stream_settings = stream_settings
        self.hdsClustering = HDStreaminClustering(**self.stream_settings)
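    # Illustration of the derived settings key (the folder value below is a
    # hypothetical example; the code concatenates directly, so the folder
    # value is expected to end with a path separator):
    #   parameter_estimation_folder = '/data/param_est/'
    #   => stream_settings['cluster_lag_distribution_file']
    #        == '/data/param_est/cluster_lag_distribution'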

    def run(self, iterator):
        self.hdsClustering.cluster(iterator)

    @staticmethod
    def emptyClusterFilteringMethod(hdStreamClusteringObject, currentMessageTime):
        pass

    @staticmethod
    def clusterLagDistributionMethod(hdStreamClusteringObject, currentMessageTime):
        # Histogram of how many time units each live cluster has gone without
        # receiving a new stream.
        lagDistribution = defaultdict(int)
        for cluster in hdStreamClusteringObject.clusters.values():
            lag = DateTimeAirthematic.getDifferenceInTimeUnits(
                currentMessageTime, cluster.lastStreamAddedTime,
                hdStreamClusteringObject.stream_settings['time_unit_in_seconds'].seconds)
            lagDistribution[str(lag)] += 1
        print currentMessageTime, len(hdStreamClusteringObject.clusters)
        iterationData = {
            'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
            'settings': pprint.pformat(hdStreamClusteringObject.stream_settings),
            ClusteringParametersEstimation.clusterLagDistributionId: lagDistribution,
            'lag_between_streams_added_to_cluster':
                hdStreamClusteringObject.stream_settings['lag_between_streams_added_to_cluster'],
        }
#        print hdStreamClusteringObject.stream_settings['lag_between_streams_added_to_cluster']
        FileIO.writeToFileAsJson(
            iterationData,
            hdStreamClusteringObject.stream_settings[
                '%s_file' % ClusteringParametersEstimation.clusterLagDistributionId])

    def plotCDFClustersLagDistribution(self, returnAxisValuesOnly=True):
        '''
        This determines the time after which a cluster can be considered
        decayed and hence removed.

        Experts stream [ 0.66002386  0.07035227] 0.1 82
        Houston stream [ 0.73800037  0.05890473] 0.1 29

        458 (# of time units) Experts stream [ 0.66002386  0.07035227] 0.2 15
        71 (# of time units) Houston stream [ 0.73756656  0.05883258] 0.2 3
        '''
        def calculateInActivityTimeFor(params, probabilityOfInactivity):
            return int(CurveFit.inverseOfIncreasingExponentialFunction(
                params, 1 - probabilityOfInactivity))

        # Use the lag histogram from the last (most complete) iteration only.
        data = list(FileIO.iterateJsonFromFile(self.hdsClustering.stream_settings[
            '%s_file' % ClusteringParametersEstimation.clusterLagDistributionId]))[-1]
        total = float(sum(data['lag_between_streams_added_to_cluster'].values()))
        x = sorted(map(int, data['lag_between_streams_added_to_cluster'].keys()))
        y = getCumulativeDistribution(
            [data['lag_between_streams_added_to_cluster'][str(i)] / total for i in x])
        exponentialCurveParams = CurveFit.getParamsAfterFittingData(
            x, y, CurveFit.increasingExponentialFunction, [1., 1.])
        print self.stream_settings['plot_label'], exponentialCurveParams, \
            calculateInActivityTimeFor(exponentialCurveParams, 0.2)
        plt.plot(x, y, 'o',
                 label=getLatexForString(self.stream_settings['plot_label']) +
                       getLatexForString(' (%0.2fx^{%0.2f})') %
                       (exponentialCurveParams[0], exponentialCurveParams[1]),
                 color=self.stream_settings['plot_color'])
        plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction,
                                        exponentialCurveParams, x),
                 color=self.stream_settings['plot_color'], lw=2)
        plt.ylabel(r'$P\ (\ lag\ \leq\ TU\ )$')
        plt.xlabel(getLatexForString(xlabelTimeUnits))
        plt.title(getLatexForString('CDF for clusters lag distribution.'))
        plt.ylim((0, 1.2))
        plt.legend(loc=4)
        if returnAxisValuesOnly: plt.show()
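    # Worked check (hedged): the plot label ' (%0.2fx^{%0.2f})' suggests the
    # fitted curve is f(x) = a * x ** b. Under that assumption the inverse in
    # calculateInActivityTimeFor solves a * x ** b = 1 - p, i.e.
    #     x = ((1 - p) / a) ** (1 / b)
    # With the Experts-stream params above, a, b = 0.66002386, 0.07035227:
    #     p = 0.2 -> x ~ 15.4 -> 15 time units (matches "0.2 15");
    #     p = 0.1 -> x ~ 82 (matches "0.1 82").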

    def plotPercentageOfClustersWithinALag(self, returnAxisValuesOnly=True):
        '''
        458 Experts stream [ 0.01860266  0.70639136] 15 0.874004297177
        80 Houston stream [ 0.0793181   0.47644004] 3 0.866127308876
        '''
        def calculatePercentageOfDecayedPhrasesFor(params, timeUnit):
            return 1 - CurveFit.increasingExponentialFunction(params, timeUnit)

        dataDistribution = {}
        currentTimeUnit = 0
#        dataFile = '/mnt/chevron/kykamath/data/twitter/lsh_crowds/houston_stream/parameter_estimation/cluster_lag_distribution'
        dataFile = self.hdsClustering.stream_settings[
            '%s_file' % ClusteringParametersEstimation.clusterLagDistributionId]
        lines = list(FileIO.iterateJsonFromFile(dataFile))
        numberOfTimeUnits = len(lines)
        # For every lag value, track the fraction of clusters at that lag in
        # each time unit.
        for data in lines:
            totalClusters = float(sum(
                data[ClusteringParametersEstimation.clusterLagDistributionId].values()))
            for k, v in data[
                    ClusteringParametersEstimation.clusterLagDistributionId].iteritems():
                k = int(k)
                if k not in dataDistribution:
                    dataDistribution[k] = [0] * numberOfTimeUnits
                dataDistribution[k][currentTimeUnit] = v / totalClusters
            currentTimeUnit += 1
        x = sorted(dataDistribution)
        print numberOfTimeUnits,
        y = getCumulativeDistribution([np.mean(dataDistribution[k]) for k in x])
        params = CurveFit.getParamsAfterFittingData(
            x, y, CurveFit.increasingExponentialFunction, [1., 1.])
        print self.stream_settings['plot_label'], params,

        def subPlot(subplotId, timeUnit):
            plt.subplot(subplotId)
            print timeUnit, calculatePercentageOfDecayedPhrasesFor(params, timeUnit)
            plt.plot(x, y, 'o',
                     label=getLatexForString(self.stream_settings['plot_label']) +
                           getLatexForString(' (%0.2fx^{%0.2f})') % (params[0], params[1]),
                     color=self.stream_settings['plot_color'])
            plt.plot(x, CurveFit.getYValues(
                         CurveFit.increasingExponentialFunction, params, x),
                     color=self.stream_settings['plot_color'], lw=2)

        if self.stream_settings['stream_id'] == 'experts_twitter_stream':
            subPlot(111, 15)
            plt.title(getLatexForString('Percentage of clusters within a lag'))
        else:
            subPlot(111, 3)
            plt.xlabel(getLatexForString(xlabelTimeUnits))
        plt.ylabel(r'$\%\ of\ clusters\ with\ lag\ \leq\ TU$')
        plt.legend(loc=4)
        if returnAxisValuesOnly: plt.show()
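    # Worked check (hedged), under the same f(x) = a * x ** b assumption:
    # 1 - 0.01860266 * 15 ** 0.70639136 ~ 0.874, matching the docstring line
    # "458 Experts stream [ 0.01860266  0.70639136] 15 0.874004297177".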

    @staticmethod
    def thresholdForDocumentToBeInCluterEstimation(stats_file, **stream_settings):
        ''' Estimate the threshold for the clusters by varying the
        threshold_for_document_to_be_in_cluster value.
        Run this on a document set of size 100K.
        '''
        for length in [i * j for i in (10 ** 3, 10 ** 4, 10 ** 5)
                       for j in range(1, 10)]:
#            for t in range(1, 16):
            for t in range(16, 21):
                stream_settings['threshold_for_document_to_be_in_cluster'] = t * 0.05
                print length, stream_settings['threshold_for_document_to_be_in_cluster']
                stats = {
                    'streaming_lsh': KMeansTweetsFile(
                        length, **stream_settings).generateStatsForStreamingLSHClustering(),
                    'settings': Settings.getSerialzedObject(stream_settings),
                }
                FileIO.writeToFileAsJson(stats, stats_file)
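For reference, a self-contained sketch of the sweep that thresholdForDocumentToBeInCluterEstimation performs; the values follow directly from the loops above:

lengths = [i * j for i in (10 ** 3, 10 ** 4, 10 ** 5) for j in range(1, 10)]
thresholds = [t * 0.05 for t in range(16, 21)]
print len(lengths), min(lengths), max(lengths)  # 27 sizes, from 1000 to 900000
print thresholds  # approximately [0.80, 0.85, 0.90, 0.95, 1.00]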