def test_append(self):
    """Appending a later cluster should extend the crowd's cluster map,
    lifespan, and end time while leaving its start time unchanged."""
    later = test_time + timedelta(days=1)
    self.crowd.append(self.cluster, later)
    # Both formation epochs must now be present as keys.
    expected_epochs = [
        GeneralMethods.getEpochFromDateTimeObject(test_time),
        GeneralMethods.getEpochFromDateTimeObject(later),
    ]
    self.assertEqual(expected_epochs, sorted(self.crowd.clusters.keys()))
    first_epoch = GeneralMethods.getEpochFromDateTimeObject(test_time)
    self.assertEqual(StreamCluster, type(self.crowd.clusters[first_epoch]))
    # Two distinct formation days -> lifespan of 2 (presumably in days; verify against Crowd).
    self.assertEqual(2, self.crowd.lifespan)
    self.assertEqual(getStringRepresentationForTweetTimestamp(test_time),
                     getStringRepresentationForTweetTimestamp(self.crowd.startTime))
    self.assertEqual(getStringRepresentationForTweetTimestamp(later),
                     getStringRepresentationForTweetTimestamp(self.crowd.endTime))
def __init__(self, cluster, clusterFormationTime):
    """Create a crowd seeded with *cluster*, keyed by its formation epoch."""
    self.crowdId = cluster.clusterId
    formation_epoch = GeneralMethods.getEpochFromDateTimeObject(clusterFormationTime)
    self.clusters = {formation_epoch: cluster}
    # Fresh crowd: not ended yet, no incoming crowds, no outgoing crowd.
    self.ends = False
    self.inComingCrowds = []
    self.outGoingCrowd = None
def combineLocationGraphs(graphMap, startingGraphId, startingTime, intervalInSeconds, linear=True, **kwargs):
    """Combine the per-time-unit graphs in *graphMap* that cover the interval of
    *intervalInSeconds* ending at *startingTime*, returning the merged graph.

    When ``linear`` is False, power-of-two "logarithmic" summary graphs are used
    where available so fewer graphs need to be merged; otherwise graphs are
    collected one time unit at a time.

    Extra keyword arguments are forwarded to combineGraphList.
    """
    # Number of unit graphs needed to span the interval (round up on a partial unit).
    if intervalInSeconds % TIME_UNIT_IN_SECONDS == 0 and int(intervalInSeconds / TIME_UNIT_IN_SECONDS) != 0:
        numberOfGraphs = int(intervalInSeconds / TIME_UNIT_IN_SECONDS)
    else:
        numberOfGraphs = int(intervalInSeconds / TIME_UNIT_IN_SECONDS) + 1
    graphId = GeneralMethods.approximateEpoch(GeneralMethods.getEpochFromDateTimeObject(startingTime), TIME_UNIT_IN_SECONDS)
    currentLogarithmicId = LocationGraphs.getLogarithmicGraphId(startingGraphId, graphId)
    currentCollectedGraphs = 0
    graphIdsToCombine = []
    while currentCollectedGraphs != numberOfGraphs and currentLogarithmicId > 0:
        # Largest power of two not exceeding the number of graphs still needed.
        numberOfGraphsToCollect = 2 ** int(math.log(numberOfGraphs - currentCollectedGraphs, 2))
        if not linear and currentLogarithmicId % 2 == 0:
            # Candidate step sizes: every power of two that divides the current id.
            # list(...) wrappers keep this correct on Python 3, where map/filter
            # return iterators; on Python 2 the semantics are unchanged.
            indices = [1] + list(map(lambda j: 2 ** j,
                                     filter(lambda j: currentLogarithmicId % (2 ** j) == 0,
                                            range(1, int(math.log(currentLogarithmicId + 1, 2)) + 1))))
            if max(indices) > numberOfGraphsToCollect and numberOfGraphsToCollect in indices:
                index = numberOfGraphsToCollect
            else:
                index = max(indices)
        else:
            index = 1
        logGraphId = '%s_%s' % (LocationGraphs.getGraphId(startingGraphId, currentLogarithmicId), index)
        if logGraphId in graphMap:
            graphIdsToCombine.append(logGraphId)
        # Always advance, even when the graph is missing, so the loop terminates.
        currentLogarithmicId -= index
        currentCollectedGraphs += index
    # Merge the widest summary graphs first (descending step size).
    graphIdsToCombine = sorted(graphIdsToCombine, key=lambda graph_id: int(graph_id.split('_')[1]), reverse=True)
    graphsToCombine = [graphMap[graph_id] for graph_id in graphIdsToCombine]
    return combineGraphList(graphsToCombine, **kwargs)
def getStreamStats(streamTweetsIterator): ''' 30-day Experts stats: # of users: 4804 # of tweets: 1614510 # of tweets per tu (mean, var): 186.497631974 7860.12570191 Houston stats # of users: 107494 # of tweets: 15946768 # of tweets per tu (mean, var): 1730.33506944 4834419.37341 10-day Experts stats # of users: 4674 # of tweets: 608798 # of tweets per tu (mean, var): 190.726190476 8132.75460228 Houston stats # of users: 39618 # of tweets: 2139829 # of tweets per tu (mean, var): 619.163483796 94450.7334004 ''' numberOfTweets, users, distributionPerTU = 0, set(), defaultdict(int) for tweet in streamTweetsIterator: users.add(tweet['user']['screen_name']) distributionPerTU[GeneralMethods.getEpochFromDateTimeObject(getDateTimeObjectFromTweetTimestamp(tweet['created_at']))//300]+=1 numberOfTweets+=1 print '# of users: ', len(users) print '# of tweets: ', numberOfTweets print '# of tweets per tu (mean, var): ', np.mean(distributionPerTU.values()), np.var(distributionPerTU.values())
def test_append(self):
    """Verify append() registers a second cluster under its formation epoch and
    updates the crowd's lifespan, start time, and end time."""
    self.crowd.append(self.cluster, test_time + timedelta(days=1))
    # Both formation epochs must now be present as keys of the cluster map.
    self.assertEqual([
        GeneralMethods.getEpochFromDateTimeObject(test_time),
        GeneralMethods.getEpochFromDateTimeObject(test_time + timedelta(days=1))
    ], sorted(self.crowd.clusters.keys()))
    self.assertEqual(
        StreamCluster,
        type(self.crowd.clusters[GeneralMethods.getEpochFromDateTimeObject(
            test_time)]))
    # Two distinct formation days -> lifespan of 2 (presumably counted in days;
    # verify against the Crowd implementation).
    self.assertEqual(2, self.crowd.lifespan)
    self.assertEqual(
        getStringRepresentationForTweetTimestamp(test_time),
        getStringRepresentationForTweetTimestamp(self.crowd.startTime))
    self.assertEqual(
        getStringRepresentationForTweetTimestamp(test_time + timedelta(days=1)),
        getStringRepresentationForTweetTimestamp(self.crowd.endTime))
def iteratePhrases(tweetsFile='/mnt/chevron/kykamath/data/twitter/tweets_by_trends/2011_2_6.gz',
                   timeUnitInSeconds=60):
    """Yield (phrase, approximated epoch) pairs for every non-empty phrase in
    every tweet of a gzipped tweet file.

    tweetsFile and timeUnitInSeconds were previously hard-coded; they are now
    parameters whose defaults preserve the original behavior.
    """
    for tweet in TweetFiles.iterateTweetsFromGzip(tweetsFile):
        message = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(tweet, **settings)
        # Skip tweets that produced no phrase vector.
        if message.vector:
            for phrase in message.vector:
                if phrase != '':
                    # Snap the message timestamp down to its time-unit boundary.
                    yield (phrase,
                           GeneralMethods.approximateEpoch(
                               GeneralMethods.getEpochFromDateTimeObject(message.timeStamp),
                               timeUnitInSeconds))
def append(self, cluster, clusterFormationTime):
    """Record *cluster* in this crowd under its formation-time epoch."""
    epoch = GeneralMethods.getEpochFromDateTimeObject(clusterFormationTime)
    self.clusters[epoch] = cluster
def append(self, cluster, clusterFormationTime):
    """Add *cluster* to the crowd, keyed by the epoch of its formation time."""
    key = GeneralMethods.getEpochFromDateTimeObject(clusterFormationTime)
    self.clusters[key] = cluster
def __init__(self, cluster, clusterFormationTime):
    """Initialize a crowd from its seed *cluster*, adopting the cluster's id
    and mapping the formation epoch to the cluster."""
    self.crowdId = cluster.clusterId
    seed_epoch = GeneralMethods.getEpochFromDateTimeObject(clusterFormationTime)
    self.clusters = {seed_epoch: cluster}
    # No lifecycle links yet: crowd is open-ended with no incoming or outgoing crowds.
    self.ends = False
    self.inComingCrowds = []
    self.outGoingCrowd = None