def iterateExpertClusters(startingDay=datetime(2011, 3, 19), endingDay=datetime(2011, 3, 30)):
#    def iterateExpertClusters(startingDay=datetime(2011,3,19), endingDay=datetime(2011,4,7)):
    while startingDay <= endingDay:
        for line in FileIO.iterateJsonFromFile(experts_twitter_stream_settings.lsh_clusters_folder + FileIO.getFileByDay(startingDay)):
            currentTime = getDateTimeObjectFromTweetTimestamp(line['time_stamp'])
            for clusterMap in line['clusters']: yield (currentTime, TwitterCrowdsSpecificMethods.getClusterFromMapFormat(clusterMap))
        startingDay += timedelta(days=1)
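Every snippet on this page funnels raw Twitter timestamps through getDateTimeObjectFromTweetTimestamp. The helper itself is not listed here; a minimal sketch, assuming the standard created_at format returned by the Twitter API (e.g. 'Sat Mar 19 04:57:32 +0000 2011'), might look like this:

from datetime import datetime

def getDateTimeObjectFromTweetTimestamp(timestamp):
    # Assumption: timestamps use Twitter's created_at format,
    # e.g. 'Sat Mar 19 04:57:32 +0000 2011', parsed here as a naive UTC datetime.
    return datetime.strptime(timestamp, '%a %b %d %H:%M:%S +0000 %Y')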
Code example #2
def iterateHashtagObjectInstances(line):
    data = cjson.decode(line)
    l = None
    if 'geo' in data: l = data['geo']
    else: l = data['bb']
    t = time.mktime(getDateTimeObjectFromTweetTimestamp(data['t']).timetuple())
    for h in data['h']: yield h.lower(), [getLattice(l, ACCURACY), t]
def getStreamStats(streamTweetsIterator):
    ''' 30-day
        Experts stats:
        # of users:  4804
        # of tweets:  1614510
        # of tweets per tu (mean, var):  186.497631974 7860.12570191
        
        Houston stats
        # of users:  107494
        # of tweets:  15946768
        # of tweets per tu (mean, var):  1730.33506944 4834419.37341
        
        10-day
        Experts stats
        # of users:  4674
        # of tweets:  608798
        # of tweets per tu (mean, var):  190.726190476 8132.75460228
        Houston stats
        # of users:  39618
        # of tweets:  2139829
        # of tweets per tu (mean, var):  619.163483796 94450.7334004

    '''
    numberOfTweets, users, distributionPerTU = 0, set(), defaultdict(int)
    for tweet in streamTweetsIterator: 
        users.add(tweet['user']['screen_name'])
        distributionPerTU[GeneralMethods.getEpochFromDateTimeObject(getDateTimeObjectFromTweetTimestamp(tweet['created_at']))//300]+=1
        numberOfTweets+=1
    print '# of users: ', len(users)
    print '# of tweets: ', numberOfTweets 
    print '# of tweets per tu (mean, var): ', np.mean(distributionPerTU.values()), np.var(distributionPerTU.values())
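A hypothetical invocation, assuming the experts iterator from code example #8 below and a ten-day window roughly matching the docstring:

# Hypothetical usage; the dates below are illustrative only.
getStreamStats(iterateTweetsFromExperts(datetime(2011, 3, 19), datetime(2011, 3, 29)))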
Code example #4
 def plotGrowthOfPhrasesInTime(self, returnAxisValuesOnly=True):
     '''
     This plot tells us the time when the number of phrases in the stream stabilizes.
     Consider the time after we have seen the maximum number of phrases to determine dimensions.
     But, if these phrases increase linearly with time, it shows that we have infinite
     dimensions, and hence this motivates us to have a way to determine the number of
     dimensions.
     
     numberOfTimeUnits=10*24*12
     '''
     x, y = [], []
     [(x.append(getDateTimeObjectFromTweetTimestamp(line['time_stamp'])),
       y.append(line['total_number_of_phrases']))
      for line in FileIO.iterateJsonFromFile(self.dimensionsEstimationFile)]
     x = x[:numberOfTimeUnits]
     y = y[:numberOfTimeUnits]
     plt.subplot(111).yaxis.set_major_formatter(
         FuncFormatter(lambda x, i: '%0.1f' % (x / 10.**6)))
     plt.text(0.0,
              1.01,
              getLatexForString('10^6'),
              transform=plt.gca().transAxes)
     plt.ylabel(getLatexForString('\# of dimensions')), plt.xlabel(
         getLatexForString(xlabelTimeUnits)), plt.title(
             getLatexForString(
                 'Growth in dimensions with increasing time.'))
     plt.plot(y,
              color=self.stream_settings['plot_color'],
              label=getLatexForString(self.stream_settings['plot_label']),
              lw=2)
     plt.legend(loc=4)
     if returnAxisValuesOnly: plt.show()
Code example #5
def iterate_hashtag_occurrences_with_high_accuracy_lid(line):
    data = cjson.decode(line)
    l = None
    if 'geo' in data: l = data['geo']
    else: l = data['bb']
    t = time.mktime(getDateTimeObjectFromTweetTimestamp(data['t']).timetuple())
    lid = getLatticeLid(l, accuracy=0.0001)
    for h in data['h']: yield h.lower(), [lid, GeneralMethods.approximateEpoch(t, TIME_UNIT_IN_SECONDS)]
Code example #6
 def convertTweetJSONToMessage(tweet, **twitter_stream_settings):
     tweetTime = getDateTimeObjectFromTweetTimestamp(tweet['created_at'])
     message = Message(tweet['user']['screen_name'], tweet['id'], tweet['text'], tweetTime)
     message.vector = Vector()
     for phrase in getPhrases(getWordsFromRawEnglishMessage(tweet['text']), twitter_stream_settings['min_phrase_length'], twitter_stream_settings['max_phrase_length']):
         if phrase not in message.vector: message.vector[phrase]=0
         message.vector[phrase]+=1
     return message
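A hypothetical call, using a minimal tweet dict and assumed phrase-length settings, to show how the keyword settings are forwarded (the project's getPhrases and getWordsFromRawEnglishMessage helpers are assumed to be importable):

# Hypothetical usage; the settings values below are illustrative assumptions.
sample_tweet = {'user': {'screen_name': 'example_user'}, 'id': 1,
                'text': 'lsh clustering of twitter streams',
                'created_at': 'Sat Mar 19 04:57:32 +0000 2011'}
message = convertTweetJSONToMessage(sample_tweet, min_phrase_length=1, max_phrase_length=3)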
Code example #7
def iterateHashtagObjectInstances(line):
    data = cjson.decode(line)
    l = None
    if 'geo' in data: l = data['geo']
    else: l = data['bb']
    t = time.mktime(getDateTimeObjectFromTweetTimestamp(data['t']).timetuple())
    point = getLattice(l, LATTICE_ACCURACY)
#    if isWithinBoundingBox(point, BOUNDING_BOX):
    for h in data['h']: yield h.lower(), [point, t]
Code example #8
 def iterateTweetsFromExperts(expertsDataStartTime=datetime(2011,3,19), expertsDataEndTime=datetime(2011,4,12)):
     experts = getExperts()
     currentTime = expertsDataStartTime
     while currentTime <= expertsDataEndTime:
         for tweet in TwitterIterators.iterateFromFile(experts_twitter_stream_settings.twitter_users_tweets_folder+'%s.gz'%FileIO.getFileByDay(currentTime)):
             if tweet['user']['id_str'] in experts:
                 if getDateTimeObjectFromTweetTimestamp(tweet['created_at']) <= expertsDataEndTime : yield tweet
                 else: return
         currentTime+=timedelta(days=1)
Code example #9
def iterateHashtagObjectInstances(line):
    data = cjson.decode(line)
    l = None
    if 'geo' in data: l = data['geo']
    else: l = data['bb']
    t =  GeneralMethods.approximateEpoch(time.mktime(getDateTimeObjectFromTweetTimestamp(data['t']).timetuple()), TIME_UNIT_IN_SECONDS)
    if isWithinBoundingBox(l, BOUNDARY):
        point = getLatticeLid(l, LATTICE_ACCURACY)
        if point!='0.0000_0.0000':
            for h in data['h']: yield h.lower(), [point, t]
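getLatticeLid is not shown on this page; judging from the '0.0000_0.0000' sentinel above, it appears to encode a lattice point as a 'lat_lng' string. A rough sketch, under that assumption:

def getLatticeLid(point, accuracy):
    # Assumption: snap each coordinate to the accuracy grid and join the pair as
    # 'lat_lng', so a missing (0, 0) location becomes the '0.0000_0.0000' sentinel.
    return '%0.4f_%0.4f' % (accuracy * round(point[0] / accuracy),
                            accuracy * round(point[1] / accuracy))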
Code example #11
 def getClusterFromMapFormat(clusterMap):
     dummyMessage = Message(1, '', '', datetime.now())
     dummyMessage.vector=Vector({})
     dummyStream=Stream(1, dummyMessage)
     cluster = StreamCluster(dummyStream)
     cluster.clusterId = clusterMap['clusterId']
     cluster.lastStreamAddedTime = getDateTimeObjectFromTweetTimestamp(clusterMap['lastStreamAddedTime'])
     cluster.mergedClustersList = clusterMap['mergedClustersList']
     cluster.documentsInCluster = clusterMap['streams']
     for k,v in clusterMap['dimensions'].iteritems(): cluster[k]=v
     return cluster
Code example #12
def iterate_reduced_tweets(line):
    data = cjson.decode(line)
    loc = None
    if 'geo' in data: loc = data['geo']
    else: loc = data['bb']
    uid = data['id'] if data.get('id') is not None else None
    time1 = time.mktime(getDateTimeObjectFromTweetTimestamp(data['t']).timetuple())  # convert the parsed timestamp to seconds since the epoch
# The commented-out filters below restrict tweets to bounding boxes (continental US, then two New York-area boxes).
#    if loc[0]>24.52 and loc[0]<49.38 and loc[1]<-66.95 and loc[1]>-124.77:
#    if loc[0]>40.48 and loc[0]<40.90 and loc[1]<-73.69 and loc[1]>-74.25:
#    if loc[0]>40.7022 and loc[0]<40.807 and loc[1]<-73.927 and loc[1]>-74.0218:
    for h in data['h']: yield h.lower(), (loc, time1, uid)
Code example #13
def iterateHashtagObjectInstances(line, all_locations = False):
    data = cjson.decode(line)
    l = None
    if 'geo' in data: l = data['geo']
    else: l = data['bb']
    t = time.mktime(getDateTimeObjectFromTweetTimestamp(data['t']).timetuple())
    point = getLattice(l, LOCATION_ACCURACY)
    if not all_locations:
        lattice_lid = getLatticeLid(point, LOCATION_ACCURACY)
        if lattice_lid in VALID_LOCATIONS_LIST:
            for h in data['h']: yield h.lower(), [point, t]
    else:
        for h in data['h']: yield h.lower(), [point, t]
Code example #15
File: time_series.py Project: hbarthwal/infolab
 def read_checkins(self, _, line):
     if line != '':
         data = decode(line)
         #  If the tweet is geolocated with valid coordinates
         #  then we put it in the checkins bucket for the
         #  corresponding user
         if data['c'] != 'N' and data['c'] != [0.0, 0.0]:
             timestamp = data['t']
             date_time_object = getDateTimeObjectFromTweetTimestamp(timestamp)
             timestamp = mktime(date_time_object.timetuple())
             tweet_id = data['tid']
             checkin = {'tid' :  str(tweet_id) , 't' : timestamp}
             yield data['u'], checkin
Code example #16
def parse_stream():    
    stream = tweetstream.FilterStream(USER_NAME, PASSWORD, locations=LOCATIONS) 
    for tweet in stream:
#        try:
            geo = ParseGeoData(tweet)
            if geo: 
                hashtags = ParseHashtags(tweet)
                if hashtags: 
                    checkin_object = GetCheckinObject(tweet)
                    checkin_object['h'] = hashtags
                    checkin_object[geo[0]] = geo[1]
                    FileIO.writeToFileAsJson(
                                                 checkin_object, 
                                                 GetOutputFile(getDateTimeObjectFromTweetTimestamp(tweet['created_at']))
                                             )
Code example #18
def iterate_hashtag_with_words(line):
    data = cjson.decode(line)
    if data["h"]:
        l = None
        if "geo" in data:
            l = data["geo"]
        else:
            l = data["bb"]
        words = filter(lambda w: w[0] != "#", getWordsFromRawEnglishMessage(data["tx"]))
        words = filter(lambda (w, pos): pos == "NN" or pos == "NP", nltk.pos_tag(words))
        words = map(itemgetter(0), words)

        t = time.mktime(getDateTimeObjectFromTweetTimestamp(data["t"]).timetuple())
        for h in data["h"]:
            yield h.lower(), words, l, t
Code example #20
    def generateStatsForHDLSHClustering(self):
        print 'HD LSH'

        def _getDocumentFromTuple((user, text)):
            vector, words = Vector(), text.split()
            for word in words[1:]:
                if word not in vector: vector[word] = 1
                else: vector[word] += 1
            return Document(user, vector)

        self.stream_settings[
            'convert_data_to_message_method'] = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage
        self.stream_settings[
            'cluster_analysis_method'] = emptyClusterAnalysisMethod
        #        self.stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
        self.documents = [
            tw[1] for tw in list(self._tweetWithTimestampIterator())
            if tw[1]['text'].strip() != ''
        ]
        self.documents = [
            # Sort tweets chronologically by their parsed created_at timestamp.
            tw[0] for tw in sorted(
                [(t, getDateTimeObjectFromTweetTimestamp(t['created_at']))
                 for t in self.documents],
                key=itemgetter(1))
        ]
        clustering = HDStreaminClustering(**self.stream_settings)
        ts = time.time()
        #        for tweet in self.documents: clustering.getClusterAndUpdateExistingClusters(_getDocumentFromTuple(tweet))
        #        clustering.cluster([_getDocumentFromTuple(d) for d in self.documents])
        clustering.cluster(self.documents)
        te = time.time()
        documentClusters = [
            cluster.documentsInCluster.keys()
            for k, cluster in clustering.clusters.iteritems()
            if len(cluster.documentsInCluster.keys()) >=
            self.stream_settings['cluster_filter_threshold']
        ]
        return self.getEvaluationMetrics(documentClusters, te - ts)
Code example #22
    def analyzeJustifyExponentialDecay(self):
        global evaluation
        experimentsData = {JustifyExponentialDecay.with_decay: {}, JustifyExponentialDecay.without_decay: {}}
        for data in FileIO.iterateJsonFromFile(JustifyExponentialDecay.stats_file): experimentsData[data['iteration_parameters']['type']][getDateTimeObjectFromTweetTimestamp(data['iteration_parameters']['current_time'])]=data['clusters']
        qualityData = []
        for k1, k2 in zip(sorted(experimentsData[JustifyExponentialDecay.with_decay]), sorted(experimentsData[JustifyExponentialDecay.without_decay])):
            qualityData.append((k1, evaluation.getEvaluationMetrics(experimentsData[JustifyExponentialDecay.with_decay][k1], None, None)['purity']-evaluation.getEvaluationMetrics(experimentsData[JustifyExponentialDecay.without_decay][k1], None, None)['purity']))
        keyTime = sorted(qualityData, key=itemgetter(1))[-1][0]
        clusterWithDecay = [i for i in experimentsData[JustifyExponentialDecay.with_decay][keyTime] if len(i)>=3]
        clusterWithOutDecay = [i for i in experimentsData[JustifyExponentialDecay.without_decay][keyTime] if len(i)>=3]
#        for c in clusterWithDecay:
#            print c, [evaluation.expertsToClassMap[i.lower()] for i in c]

        interestedCluster = set(['Zap2it', 'ESPNAndyKatz', 'comingsoonnet', '950KJR', 'ginasmith888', 'UKCoachCalipari', 'SportsFanz', 'David_Henrie'])
        for c in clusterWithOutDecay:
            if len(set(c).intersection(interestedCluster))>0: 
#                print c, [evaluation.expertsToClassMap[i.lower()] for i in c]
                setString = ', '.join(['%s (%s)'%(i, evaluation.expertsToClassMap[i.lower()]) for i in sorted(c)]).replace(' ', '\\ ').replace('_', '\\_')
                print keyTime, '&', setString, '\\\\'
            
        clustersDiscoveredEarlierByDecay = {}
        for kt in sorted(experimentsData[JustifyExponentialDecay.with_decay]):
            for c in experimentsData[JustifyExponentialDecay.with_decay][kt]:
                c=sorted(c)
                if len(set(c).intersection(interestedCluster))>0: 
                    classes = [evaluation.expertsToClassMap[i.lower()] for i in c if i.lower() in evaluation.expertsToClassMap]
                    if sorted([(k, len(list(g))/float(len(classes))) for k,g in groupby(sorted(classes))], key=itemgetter(1))[-1][1]>0.7:
                        if kt>datetime(2011,3,19) and kt<=keyTime: clustersDiscoveredEarlierByDecay[kt]=c
        observedStrings = set()
        for k in sorted(clustersDiscoveredEarlierByDecay): 
            setString = ', '.join(['%s (%s)'%(i, evaluation.expertsToClassMap[i.lower()]) for i in sorted(clustersDiscoveredEarlierByDecay[k])]).replace(' ', '\\ ').replace('_', '\\_')
            if setString not in observedStrings: print k, '&', setString, '\\\\'; observedStrings.add(setString)
Code example #25
File: mr_hotSpots.py Project: kykamath/users_and_geo
def getCheckinObject(line):
    data = cjson.decode(line)
    data['t'] = time.mktime((getDateTimeObjectFromTweetTimestamp(data['t'])-datetime.timedelta(hours=5)).timetuple())
    data['l'] = data['geo']; del data['geo']
    return data
Code example #26
 def _ParseHashtagObjects(checkin):
     if 'geo' in checkin: point = checkin['geo']
     else: point = checkin['bb']
     # Adding 30 minutes because stream appears to be delayed by 30 minutes
     t = time.mktime(getDateTimeObjectFromTweetTimestamp(checkin['t']).timetuple()) + 1800.
     for h in checkin['h']: yield h.lower(), [point, t]