def iterateExpertClusters(startingDay=datetime(2011, 3, 19), endingDay=datetime(2011, 3, 30)):
    # def iterateExpertClusters(startingDay=datetime(2011,3,19), endingDay=datetime(2011,4,7)):
    while startingDay <= endingDay:
        for line in FileIO.iterateJsonFromFile(experts_twitter_stream_settings.lsh_clusters_folder + FileIO.getFileByDay(startingDay)):
            currentTime = getDateTimeObjectFromTweetTimestamp(line['time_stamp'])
            for clusterMap in line['clusters']:
                yield (currentTime, TwitterCrowdsSpecificMethods.getClusterFromMapFormat(clusterMap))
        startingDay += timedelta(days=1)

def iterateHashtagObjectInstances(line):
    data = cjson.decode(line)
    if 'geo' in data:
        l = data['geo']
    else:
        l = data['bb']
    t = time.mktime(getDateTimeObjectFromTweetTimestamp(data['t']).timetuple())
    for h in data['h']:
        yield h.lower(), [getLattice(l, ACCURACY), t]

def getStreamStats(streamTweetsIterator):
    '''
    30-day Experts stats:
        # of users: 4804
        # of tweets: 1614510
        # of tweets per tu (mean, var): 186.497631974 7860.12570191
    Houston stats:
        # of users: 107494
        # of tweets: 15946768
        # of tweets per tu (mean, var): 1730.33506944 4834419.37341
    10-day Experts stats:
        # of users: 4674
        # of tweets: 608798
        # of tweets per tu (mean, var): 190.726190476 8132.75460228
    Houston stats:
        # of users: 39618
        # of tweets: 2139829
        # of tweets per tu (mean, var): 619.163483796 94450.7334004
    '''
    numberOfTweets, users, distributionPerTU = 0, set(), defaultdict(int)
    for tweet in streamTweetsIterator:
        users.add(tweet['user']['screen_name'])
        distributionPerTU[GeneralMethods.getEpochFromDateTimeObject(getDateTimeObjectFromTweetTimestamp(tweet['created_at'])) // 300] += 1
        numberOfTweets += 1
    print '# of users: ', len(users)
    print '# of tweets: ', numberOfTweets
    print '# of tweets per tu (mean, var): ', np.mean(distributionPerTU.values()), np.var(distributionPerTU.values())

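# Hypothetical usage of the stats helper above: the "Experts stats" in its docstring suggest
# pairing it with the expert-tweet iterator defined later in this listing, e.g.
#
#     getStreamStats(iterateTweetsFromExperts())
#
# (illustrative only; the actual driver code is not shown here).
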
def plotGrowthOfPhrasesInTime(self, returnAxisValuesOnly=True):
    '''
    This plot tells us the time when the number of phrases in the stream stabilizes.
    Consider the time after we have seen the maximum number of phrases to determine dimensions.
    But, if these phrases increase linearly with time, it shows that we have infinite dimensions,
    and hence this motivates us to have a way to determine the number of dimensions.
    numberOfTimeUnits=10*24*12
    '''
    x, y = [], []
    for line in FileIO.iterateJsonFromFile(self.dimensionsEstimationFile):
        x.append(getDateTimeObjectFromTweetTimestamp(line['time_stamp']))
        y.append(line['total_number_of_phrases'])
    x = x[:numberOfTimeUnits]
    y = y[:numberOfTimeUnits]
    plt.subplot(111).yaxis.set_major_formatter(FuncFormatter(lambda x, i: '%0.1f' % (x / 10. ** 6)))
    plt.text(0.0, 1.01, getLatexForString('10^6'), transform=plt.gca().transAxes)
    plt.ylabel(getLatexForString('\# of dimensions'))
    plt.xlabel(getLatexForString(xlabelTimeUnits))
    plt.title(getLatexForString('Growth in dimensions with increasing time.'))
    plt.plot(y, color=self.stream_settings['plot_color'], label=getLatexForString(self.stream_settings['plot_label']), lw=2)
    plt.legend(loc=4)
    if returnAxisValuesOnly:
        plt.show()

def iterate_hashtag_occurrences_with_high_accuracy_lid(line):
    data = cjson.decode(line)
    if 'geo' in data:
        l = data['geo']
    else:
        l = data['bb']
    t = time.mktime(getDateTimeObjectFromTweetTimestamp(data['t']).timetuple())
    lid = getLatticeLid(l, accuracy=0.0001)
    for h in data['h']:
        yield h.lower(), [lid, GeneralMethods.approximateEpoch(t, TIME_UNIT_IN_SECONDS)]

def convertTweetJSONToMessage(tweet, **twitter_stream_settings):
    tweetTime = getDateTimeObjectFromTweetTimestamp(tweet['created_at'])
    message = Message(tweet['user']['screen_name'], tweet['id'], tweet['text'], tweetTime)
    message.vector = Vector()
    for phrase in getPhrases(getWordsFromRawEnglishMessage(tweet['text']), twitter_stream_settings['min_phrase_length'], twitter_stream_settings['max_phrase_length']):
        if phrase not in message.vector:
            message.vector[phrase] = 0
        message.vector[phrase] += 1
    return message

def iterateHashtagObjectInstances(line):
    data = cjson.decode(line)
    if 'geo' in data:
        l = data['geo']
    else:
        l = data['bb']
    t = time.mktime(getDateTimeObjectFromTweetTimestamp(data['t']).timetuple())
    point = getLattice(l, LATTICE_ACCURACY)
    # if isWithinBoundingBox(point, BOUNDING_BOX):
    for h in data['h']:
        yield h.lower(), [point, t]

def iterateTweetsFromExperts(expertsDataStartTime=datetime(2011, 3, 19), expertsDataEndTime=datetime(2011, 4, 12)):
    experts = getExperts()
    currentTime = expertsDataStartTime
    while currentTime <= expertsDataEndTime:
        for tweet in TwitterIterators.iterateFromFile(experts_twitter_stream_settings.twitter_users_tweets_folder + '%s.gz' % FileIO.getFileByDay(currentTime)):
            if tweet['user']['id_str'] in experts:
                if getDateTimeObjectFromTweetTimestamp(tweet['created_at']) <= expertsDataEndTime:
                    yield tweet
                else:
                    return
        currentTime += timedelta(days=1)

def iterateHashtagObjectInstances(line):
    data = cjson.decode(line)
    if 'geo' in data:
        l = data['geo']
    else:
        l = data['bb']
    t = GeneralMethods.approximateEpoch(time.mktime(getDateTimeObjectFromTweetTimestamp(data['t']).timetuple()), TIME_UNIT_IN_SECONDS)
    if isWithinBoundingBox(l, BOUNDARY):
        point = getLatticeLid(l, LATTICE_ACCURACY)
        if point != '0.0000_0.0000':
            for h in data['h']:
                yield h.lower(), [point, t]

def getClusterFromMapFormat(clusterMap):
    dummyMessage = Message(1, '', '', datetime.now())
    dummyMessage.vector = Vector({})
    dummyStream = Stream(1, dummyMessage)
    cluster = StreamCluster(dummyStream)
    cluster.clusterId = clusterMap['clusterId']
    cluster.lastStreamAddedTime = getDateTimeObjectFromTweetTimestamp(clusterMap['lastStreamAddedTime'])
    cluster.mergedClustersList = clusterMap['mergedClustersList']
    cluster.documentsInCluster = clusterMap['streams']
    for k, v in clusterMap['dimensions'].iteritems():
        cluster[k] = v
    return cluster

def iterate_reduced_tweets(line):
    data = cjson.decode(line)
    if 'geo' in data:
        loc = data['geo']
    else:
        loc = data['bb']
    if data['id'] is not None:
        uid = data['id']
        # Convert the parsed tweet timestamp to seconds since the epoch.
        time1 = time.mktime(getDateTimeObjectFromTweetTimestamp(data['t']).timetuple())
        # The commented-out filters below restrict tweets to specific bounding boxes
        # (the first corresponds to the continental US).
        # if loc[0]>24.52 and loc[0]<49.38 and loc[1]<-66.95 and loc[1]>-124.77:
        # if loc[0]>40.48 and loc[0]<40.90 and loc[1]<-73.69 and loc[1]>-74.25:
        # if loc[0]>40.7022 and loc[0]<40.807 and loc[1]<-73.927 and loc[1]>-74.0218:
        for h in data['h']:
            yield h.lower(), (loc, time1, uid)

def iterateHashtagObjectInstances(line, all_locations=False):
    data = cjson.decode(line)
    if 'geo' in data:
        l = data['geo']
    else:
        l = data['bb']
    t = time.mktime(getDateTimeObjectFromTweetTimestamp(data['t']).timetuple())
    point = getLattice(l, LOCATION_ACCURACY)
    if not all_locations:
        lattice_lid = getLatticeLid(point, LOCATION_ACCURACY)
        if lattice_lid in VALID_LOCATIONS_LIST:
            for h in data['h']:
                yield h.lower(), [point, t]
    else:
        for h in data['h']:
            yield h.lower(), [point, t]

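# The lattice helpers (getLattice, getLatticeLid) used by the hashtag iterators above are not
# defined in this listing. Below is a rough, hypothetical sketch of the interface they appear to
# expose, assuming they snap a [lat, lon] point to a grid of the given accuracy and render it as
# a 'lat_lon' string id (consistent with the '0.0000_0.0000' check earlier); this is an
# illustration, not the project's actual implementation.
def getLattice(point, accuracy):
    # Snap each coordinate to the nearest multiple of `accuracy`.
    return [round(c / accuracy) * accuracy for c in point]

def getLatticeLid(point, accuracy):
    # Render the snapped point as a string id, e.g. '29.7600_-95.3700'.
    return '%0.4f_%0.4f' % tuple(getLattice(point, accuracy))
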
def read_checkins(self, _, line):
    if line != '':
        data = decode(line)
        # If the tweet is geolocated with valid coordinates, then we put it in the
        # checkins bucket for the corresponding user.
        if data['c'] != 'N' and data['c'] != [0.0, 0.0]:
            timestamp = data['t']
            date_time_object = getDateTimeObjectFromTweetTimestamp(timestamp)
            timestamp = mktime(date_time_object.timetuple())
            tweet_id = data['tid']
            checkin = {'tid': str(tweet_id), 't': timestamp}
            yield data['u'], checkin

def parse_stream():
    stream = tweetstream.FilterStream(USER_NAME, PASSWORD, locations=LOCATIONS)
    for tweet in stream:
        # try:
        geo = ParseGeoData(tweet)
        if geo:
            hashtags = ParseHashtags(tweet)
            if hashtags:
                checkin_object = GetCheckinObject(tweet)
                checkin_object['h'] = hashtags
                checkin_object[geo[0]] = geo[1]
                FileIO.writeToFileAsJson(
                    checkin_object,
                    GetOutputFile(getDateTimeObjectFromTweetTimestamp(tweet['created_at']))
                )

def iterate_hashtag_with_words(line):
    data = cjson.decode(line)
    if data["h"]:
        if "geo" in data:
            l = data["geo"]
        else:
            l = data["bb"]
        words = filter(lambda w: w[0] != "#", getWordsFromRawEnglishMessage(data["tx"]))
        # Keep only words tagged as nouns (NN) or proper nouns (NP).
        words = filter(lambda (w, pos): pos == "NN" or pos == "NP", nltk.pos_tag(words))
        words = map(itemgetter(0), words)
        t = time.mktime(getDateTimeObjectFromTweetTimestamp(data["t"]).timetuple())
        for h in data["h"]:
            yield h.lower(), words, l, t

def generateStatsForHDLSHClustering(self):
    print 'HD LSH'
    def _getDocumentFromTuple((user, text)):
        vector, words = Vector(), text.split()
        for word in words[1:]:
            if word not in vector:
                vector[word] = 1
            else:
                vector[word] += 1
        return Document(user, vector)
    self.stream_settings['convert_data_to_message_method'] = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage
    self.stream_settings['cluster_analysis_method'] = emptyClusterAnalysisMethod
    # self.stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
    self.documents = [tw[1] for tw in list(self._tweetWithTimestampIterator()) if tw[1]['text'].strip() != '']
    # Sort the tweets chronologically by their parsed timestamps.
    self.documents = [tw[0] for tw in sorted([(t, getDateTimeObjectFromTweetTimestamp(t['created_at'])) for t in self.documents], key=itemgetter(1))]
    clustering = HDStreaminClustering(**self.stream_settings)
    ts = time.time()
    # for tweet in self.documents: clustering.getClusterAndUpdateExistingClusters(_getDocumentFromTuple(tweet))
    # clustering.cluster([_getDocumentFromTuple(d) for d in self.documents])
    clustering.cluster(self.documents)
    te = time.time()
    documentClusters = [cluster.documentsInCluster.keys()
                        for k, cluster in clustering.clusters.iteritems()
                        if len(cluster.documentsInCluster.keys()) >= self.stream_settings['cluster_filter_threshold']]
    return self.getEvaluationMetrics(documentClusters, te - ts)

def analyzeJustifyExponentialDecay(self):
    global evaluation
    experimentsData = {JustifyExponentialDecay.with_decay: {}, JustifyExponentialDecay.without_decay: {}}
    for data in FileIO.iterateJsonFromFile(JustifyExponentialDecay.stats_file):
        experimentsData[data['iteration_parameters']['type']][getDateTimeObjectFromTweetTimestamp(data['iteration_parameters']['current_time'])] = data['clusters']
    qualityData = []
    for k1, k2 in zip(sorted(experimentsData[JustifyExponentialDecay.with_decay]), sorted(experimentsData[JustifyExponentialDecay.without_decay])):
        qualityData.append((k1,
                            evaluation.getEvaluationMetrics(experimentsData[JustifyExponentialDecay.with_decay][k1], None, None)['purity'] -
                            evaluation.getEvaluationMetrics(experimentsData[JustifyExponentialDecay.without_decay][k1], None, None)['purity']))
    keyTime = sorted(qualityData, key=itemgetter(1))[-1][0]
    clusterWithDecay = [i for i in experimentsData[JustifyExponentialDecay.with_decay][keyTime] if len(i) >= 3]
    clusterWithOutDecay = [i for i in experimentsData[JustifyExponentialDecay.without_decay][keyTime] if len(i) >= 3]
    # for c in clusterWithDecay:
    #     print c, [evaluation.expertsToClassMap[i.lower()] for i in c]
    interestedCluster = set(['Zap2it', 'ESPNAndyKatz', 'comingsoonnet', '950KJR', 'ginasmith888', 'UKCoachCalipari', 'SportsFanz', 'David_Henrie'])
    for c in clusterWithOutDecay:
        if len(set(c).intersection(interestedCluster)) > 0:
            # print c, [evaluation.expertsToClassMap[i.lower()] for i in c]
            setString = ', '.join(['%s (%s)' % (i, evaluation.expertsToClassMap[i.lower()]) for i in sorted(c)]).replace(' ', '\\ ').replace('_', '\\_')
            print keyTime, '&', setString, '\\\\'
    clustersDiscoveredEarlierByDecay = {}
    for kt in sorted(experimentsData[JustifyExponentialDecay.with_decay]):
        for c in experimentsData[JustifyExponentialDecay.with_decay][kt]:
            c = sorted(c)
            if len(set(c).intersection(interestedCluster)) > 0:
                classes = [evaluation.expertsToClassMap[i.lower()] for i in c if i.lower() in evaluation.expertsToClassMap]
                if sorted([(k, len(list(g)) / float(len(classes))) for k, g in groupby(sorted(classes))], key=itemgetter(1))[-1][1] > 0.7:
                    if kt > datetime(2011, 3, 19) and kt <= keyTime:
                        clustersDiscoveredEarlierByDecay[kt] = c
    observedStrings = set()
    for k in sorted(clustersDiscoveredEarlierByDecay):
        setString = ', '.join(['%s (%s)' % (i, evaluation.expertsToClassMap[i.lower()]) for i in sorted(clustersDiscoveredEarlierByDecay[k])]).replace(' ', '\\ ').replace('_', '\\_')
        if setString not in observedStrings:
            print k, '&', setString, '\\\\'
            observedStrings.add(setString)

def getCheckinObject(line):
    data = cjson.decode(line)
    data['t'] = time.mktime((getDateTimeObjectFromTweetTimestamp(data['t']) - datetime.timedelta(hours=5)).timetuple())
    data['l'] = data['geo']
    del data['geo']
    return data

def _ParseHashtagObjects(checkin):
    if 'geo' in checkin:
        point = checkin['geo']
    else:
        point = checkin['bb']
    # Adding 30 minutes because the stream appears to be delayed by 30 minutes.
    t = time.mktime(getDateTimeObjectFromTweetTimestamp(checkin['t']).timetuple()) + 1800.
    for h in checkin['h']:
        yield h.lower(), [point, t]

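# Every snippet in this listing relies on getDateTimeObjectFromTweetTimestamp, whose definition
# is not shown. Below is a minimal sketch of such a helper, assuming timestamps arrive in
# Twitter's standard created_at format (e.g. 'Sat Mar 19 04:27:13 +0000 2011'); illustrative
# only, not necessarily the project's actual implementation.
from datetime import datetime

def getDateTimeObjectFromTweetTimestamp(timestamp):
    # The literal '+0000' in the format string matches the fixed UTC offset in the raw string.
    return datetime.strptime(timestamp, '%a %b %d %H:%M:%S +0000 %Y')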