#Standard-library and third-party imports used throughout this module.
#The project-specific helpers (TweetSnap, GMM_clustering, Count, PoissonRate,
#GeographicalEntropy, get_vocabulary, gmt_to_local, T_Tokenizer, GetPlaceName)
#are assumed to come from the project's own modules; their import paths are not
#shown in this extract.
import time
import operator
import cPickle
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec


def visualize_word(db, timeWindow, given_word, **kwargs):
    print "COLLECTING TWEETS...."
    TS = TweetSnap(db=db, timeWindow=timeWindow, Placename2Geocode=False)
    print "COLLECTION OVER...."

    TIME_START = kwargs.get("TIME_START", time.gmtime(0))
    TIME_END = kwargs.get("TIME_END", time.gmtime(time.time()))

    if isinstance(TIME_START, str):
        TIME_START = time.gmtime(time.mktime(time.strptime(TIME_START, "%d %b %H:%M %Z %Y")))
    if isinstance(TIME_END, str):
        TIME_END = time.gmtime(time.mktime(time.strptime(TIME_END, "%d %b %H:%M %Z %Y")))

    TIME_DIFF = time.mktime(TIME_START) - time.mktime(TS.time_start)
    if TIME_DIFF > 0:
        TS.move_on(TIME_DIFF - timeWindow)

    Volume = []
    TimeWindow = []
    Word = []

    while TS.time_start < TIME_END and not TS.end:
        #Capture nextSnap and initialize time_start of next snap
        snap = TS.next()
        if len(snap['TEXT']) < 100:
            continue
        #gmm = GMM_clustering()
        for item in given_word:
            #1. Virality (disabled)
            #Virality.append(PoissonRate(snap,given_word=item))
            #2. Locality (disabled)
            #gmm.Snap = snap
            #gmm.build_clusters()
            #Locality.append(GeographicalEntropy(snap,gmm.labels,given_word=item))
            #3. Volume
            Volume.append(Count(snap, given_word=item))
            #4. TimeWindow
            TimeWindow.append(snap['TimeWindow'][0])
            #5. Word
            Word.append(item)

    #Prepare Dataframe from the columns actually collected above; the Virality
    #and Locality scores are commented out, so including empty lists for them
    #would raise a length-mismatch error in the DataFrame constructor.
    df = pd.DataFrame({'Volume': Volume, 'TimeWindow': TimeWindow, 'Word': Word})
    return df
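#A minimal usage sketch (not from the original source): track the tweet volume
#of a few words over one day. The db handle, the example words, and the date
#strings are placeholder assumptions; TIME_START/TIME_END must follow the
#"%d %b %H:%M %Z %Y" format parsed by visualize_word above.
def _demo_visualize_word(db):
    df = visualize_word(db, 60 * 10, ['marathon', 'bruins'],
                        TIME_START="15 Apr 10:00 EDT 2014",
                        TIME_END="15 Apr 22:00 EDT 2014")
    #One row per (word, time window): print each word's volume series
    for word, group in df.groupby('Word'):
        print word, group['Volume'].tolist()
    return df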
def set_SnapIter(db, timeWindow, **kwargs):
    print "COLLECTING TWEETS...."
    TS = TweetSnap(db=db, timeWindow=timeWindow, Placename2Geocode=False)
    print "COLLECTION OVER...."

    TIME_START = kwargs.get("TIME_START", time.gmtime(0))
    TIME_END = kwargs.get("TIME_END", time.gmtime(time.time()))

    if isinstance(TIME_START, str):
        TIME_START = time.gmtime(time.mktime(time.strptime(TIME_START, "%d %b %H:%M %Z %Y")))
    if isinstance(TIME_END, str):
        TIME_END = time.gmtime(time.mktime(time.strptime(TIME_END, "%d %b %H:%M %Z %Y")))

    TIME_DIFF = time.mktime(TIME_START) - time.mktime(TS.time_start)
    if TIME_DIFF > 0:
        TS.move_on(TIME_DIFF - timeWindow)

    return TS
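#Sketch of stepping through snapshots by hand with set_SnapIter; the db handle
#and the TIME_START string are placeholder assumptions.
def _demo_snap_iter(db):
    TS = set_SnapIter(db, 60 * 10, TIME_START="15 Apr 10:00 EDT 2014")
    while not TS.end:
        snap = TS.next()
        print snap['TimeWindow'][0], len(snap['TEXT'])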
def visualize_timeframe(db, timeWindow, **kwargs):
    print "COLLECTING TWEETS...."
    TS = TweetSnap(db=db, timeWindow=timeWindow, Placename2Geocode=False)
    print "COLLECTION OVER...."

    TIME_START = kwargs.get("TIME_START", time.gmtime(0))
    TIME_END = kwargs.get("TIME_END", time.gmtime(time.time()))
    VocabSize = kwargs.get("VocabSize", 500)

    if isinstance(TIME_START, str):
        TIME_START = time.gmtime(time.mktime(time.strptime(TIME_START, "%d %b %H:%M %Z %Y")))
    if isinstance(TIME_END, str):
        TIME_END = time.gmtime(time.mktime(time.strptime(TIME_END, "%d %b %H:%M %Z %Y")))

    TIME_DIFF = time.mktime(TIME_START) - time.mktime(TS.time_start)
    if TIME_DIFF > 0:
        TS.move_on(TIME_DIFF - timeWindow)

    #Create Dataframe
    df = pd.DataFrame(columns=['Virality', 'Locality', 'Volume', 'Words', 'TimeWindow'])

    while TS.time_start < TIME_END and not TS.end:
        #Capture nextSnap and initialize time_start of next snap
        snap = TS.next()
        if len(snap['TEXT']) < 100:
            continue

        gmm = GMM_clustering()

        #1. Virality
        Virality = te(snap)
        #2. Locality
        gmm.Snap = snap
        gmm.build_clusters()
        Locality = GeographicalEntropy(snap, gmm.labels)
        #3. Volume
        Volume = Count(snap)

        #Keep only words that rank in the top VocabSize under every measure
        #HotWords= set(dict(Virality.most_common(HotWordSize)).keys())&set(dict(Broadcast.most_common(HotWordSize)).keys())&set(dict(Locality.most_common(HotWordSize)).keys())&set(dict(Prevalence.most_common(HotWordSize)).keys())&set(dict(Volume.most_common(HotWordSize)).keys())
        Words = list(set(dict(Virality.most_common(VocabSize)).keys())
                     & set(dict(Locality.most_common(VocabSize)).keys())
                     & set(dict(Volume.most_common(VocabSize)).keys()))
        if not len(Words) > 0:
            continue

        Virality = [Virality[key] if key in Virality.keys() else 0 for key in Words]
        #Broadcast = [Broadcast[key] if key in Broadcast.keys() else 0 for key in HotWords]
        Locality = [Locality[key] if key in Locality.keys() else 0 for key in Words]
        #Prevalence = [Prevalence[key] if key in Prevalence.keys() else 0 for key in HotWords]
        Volume = [Volume[key] if key in Volume.keys() else 0 for key in Words]

        #4. TimeWindow
        TimeWindow = [snap['TimeWindow'][0]] * len(Words)

        #Append to Dataframe
        df = df.append({'Virality': Virality, 'Locality': Locality, 'Volume': Volume,
                        'Words': Words, 'TimeWindow': TimeWindow}, ignore_index=True)

    return df
def newsworthy_words(db, timeWindow, **kwargs):
    print "COLLECTING TWEETS...."
    TS = TweetSnap(db=db, timeWindow=timeWindow, Placename2Geocode=False)
    print "COLLECTION OVER...."

    TIME_START = kwargs.get("TIME_START", time.gmtime(0))
    TIME_END = kwargs.get("TIME_END", time.gmtime(time.time()))
    HotWordSize = kwargs.get("HotWordSize", 25)

    if isinstance(TIME_START, str):
        TIME_START = time.gmtime(time.mktime(time.strptime(TIME_START, "%d %b %H:%M %Z %Y")))
    if isinstance(TIME_END, str):
        TIME_END = time.gmtime(time.mktime(time.strptime(TIME_END, "%d %b %H:%M %Z %Y")))

    TIME_DIFF = time.mktime(TIME_START) - time.mktime(TS.time_start)
    if TIME_DIFF > 0:
        TS.move_on(TIME_DIFF - timeWindow)

    Day = {}

    while TS.time_start < TIME_END and not TS.end:
        #Capture nextSnap and initialize time_start of next snap
        snap = TS.next()
        if len(snap['TEXT']) < 100:
            continue

        gmm = GMM_clustering()

        #1. Virality
        Virality = PoissonRate(snap)
        #2. DeltaVolume
        #Broadcast = DeltaVolume(snap0,snap)
        #3. Locality
        gmm.Snap = snap
        gmm.build_clusters()
        Locality = GeographicalEntropy(snap, gmm.labels)
        #4. Prevalence
        #Prevalence = Ttest(snap)
        #5. Count
        Volume = Count(snap)

        #Prepare Dataframe
        #Union of the top HotWordSize words under each measure:
        #HotWords= list(set(dict(Virality.most_common(HotWordSize)).keys()+dict(Broadcast.most_common(HotWordSize)).keys()+dict(Locality.most_common(HotWordSize)).keys()+dict(Prevalence.most_common(HotWordSize)).keys()+dict(Volume.most_common(HotWordSize)).keys()))
        #Intersection of the top HotWordSize words under each active measure:
        HotWords = list(set(dict(Virality.most_common(HotWordSize)).keys())
                        & set(dict(Locality.most_common(HotWordSize)).keys())
                        & set(dict(Volume.most_common(HotWordSize)).keys()))
        if not len(HotWords) > 0:
            continue

        Virality = [Virality[key] if key in Virality.keys() else 0 for key in HotWords]
        #Broadcast = [Broadcast[key] if key in Broadcast.keys() else 0 for key in HotWords]
        Locality = [Locality[key] if key in Locality.keys() else 0 for key in HotWords]
        #Prevalence = [Prevalence[key] if key in Prevalence.keys() else 0 for key in HotWords]
        Volume = [Volume[key] if key in Volume.keys() else 0 for key in HotWords]

        #Optional rescaling of each measure to [0,100] before summing
        #scaler = preprocessing.MinMaxScaler([0,100]).fit_transform
        #scaledVirality = list(scaler(np.array([Virality]).T).flatten())
        #scaledLocality = list(scaler(np.array([Locality]).T).flatten())
        #scaledVolume = list(scaler(np.array([Volume],dtype=np.float16).T).flatten())

        Score = [vi + lo + vo for vi, lo, vo in zip(Virality, Locality, Volume)]

        df = pd.DataFrame({'Words': HotWords, 'Virality': Virality, 'Locality': Locality,
                           'Volume': Volume, 'Score': Score})
        #df_scaled = pd.DataFrame({'Words':HotWords,'Virality':scaledVirality,'Locality':scaledLocality,'Volume':scaledVolume,'Score':Score})
        Day['to'.join(snap['TimeWindow'])] = df

    return Day
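#Sketch: rank the hot words newsworthy_words returns for each time window.
#The db handle and date strings are placeholders; DataFrame.sort is the old
#pandas API, matching the .append() calls used elsewhere in this module.
def _demo_newsworthy_words(db):
    Day = newsworthy_words(db, 60 * 10,
                           TIME_START="15 Apr 10:00 EDT 2014",
                           TIME_END="15 Apr 22:00 EDT 2014",
                           HotWordSize=25)
    for window, df in Day.items():
        print window
        print df.sort('Score', ascending=False).head()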
def print_vocabulary_report(db, scale=60*20, **kwargs):
    print "COLLECTING TWEETS...."
    TS = TweetSnap(db=db, timeWindow=scale, Placename2Geocode=False)
    print "COLLECTION OVER...."

    TIME_START = kwargs.get("TIME_START", time.gmtime(0))
    TIME_END = kwargs.get("TIME_END", time.gmtime(time.time()))
    HotWordSize = kwargs.get("HotWordSize", 8)

    if isinstance(TIME_START, str):
        TIME_START = time.gmtime(time.mktime(time.strptime(TIME_START, "%d %b %H:%M %Z %Y")))
    if isinstance(TIME_END, str):
        TIME_END = time.gmtime(time.mktime(time.strptime(TIME_END, "%d %b %H:%M %Z %Y")))

    TIME_DIFF = time.mktime(TIME_START) - time.mktime(TS.time_start)
    if TIME_DIFF > 0:
        TS.move_on(TIME_DIFF - scale)

    volume = []
    HotWordsList = []
    ColorGradient = {}
    TweetCountDict = {}
    TimeList = []

    while TS.time_start < TIME_END and not TS.end:
        #Capture nextSnap and initialize time_start of next snap
        snap = TS.next()
        timeWindow = gmt_to_local(TS.time_start, make_string=True, format='%a %H:%M')

        #Volume of tweets
        volume.append(len(snap['LOC']))

        #HotWords list for this window
        Vocab_dict = dict(get_vocabulary(snap['TEXT']).most_common(HotWordSize))
        TimeList.append(timeWindow)

        ColorGradient[timeWindow] = {}
        for word in Vocab_dict.keys():
            ColorGradient[timeWindow][word] = Vocab_dict[word] / float(sum(Vocab_dict.values()))
            if word in TweetCountDict.keys():
                TweetCountDict[word] += Vocab_dict[word]
            else:
                TweetCountDict[word] = Vocab_dict[word]

    SortedTweetCount = sorted(TweetCountDict.iteritems(), key=operator.itemgetter(1))
    WordList = [item[0] for item in SortedTweetCount]
    TweetCountArray = np.array([item[1] for item in SortedTweetCount], dtype=int)
    del SortedTweetCount

    ColorMap = np.empty([len(WordList), len(TimeList)], dtype=float)
    for rw, word in enumerate(WordList):
        for cl, timeWindow in enumerate(TimeList):
            if word in ColorGradient[timeWindow].keys():
                ColorMap[rw][cl] = ColorGradient[timeWindow][word]
            else:
                ColorMap[rw][cl] = 0

    ###PRINT RESULTS
    gs = gridspec.GridSpec(2, 2, width_ratios=[1, 2], height_ratios=[1, 4])
    gs.update(left=0.05, right=0.48, wspace=5e-41, hspace=5e-41)

    fig1 = plt.figure(figsize=(36, 90), dpi=200)
    ax0 = fig1.add_subplot(gs[0, 1])
    ax1 = fig1.add_subplot(gs[1, 1])
    ax2 = fig1.add_subplot(gs[1, 0])
    ax3 = fig1.add_subplot(gs[0, 0])

    #TweetVolume
    ax0.grid(True, 'major', color='w', linestyle='-', linewidth=0.7)
    ax0.grid(True, 'minor', color='0.92', linestyle='-', linewidth=0.35)
    ax0.set_axis_bgcolor('0.95')
    ASCII_WordList = [word.encode('ascii', 'ignore') for word in WordList]
    ax0.plot(np.arange(len(TimeList)), volume, label='NumberOfTweets', linewidth=0.75)
    ax0.legend(loc='upper left', ncol=4)
    ax0.set_xlim(0, len(TimeList) - 1)
    ax0.xaxis.tick_top()
    ax0.yaxis.tick_right()
    ax0.set_xticks(np.arange(0, len(TimeList), 5))
    ax0.set_xticklabels(TimeList, rotation='vertical')

    #HotWordColorMap
    ax1.imshow(ColorMap, cmap=plt.cm.binary, vmin=ColorMap.min(), vmax=ColorMap.max(),
               aspect='auto', origin='lower')
    ax1.yaxis.tick_right()
    ax1.set_yticks(np.arange(len(WordList)))
    ax1.set_yticklabels(WordList)
    ax1.set_xticks(np.arange(0, len(TimeList), 5))
    ax1.set_xticklabels(TimeList, rotation='vertical')
    ax1.grid(True, 'major', color='w', linestyle='-', linewidth=0.7)
    ax1.grid(True, 'minor', color='0.92', linestyle='-', linewidth=0.35)

    #TweetVolumeDistributionOverHotWords
    ax2.grid(True, 'major', color='w', linestyle='-', linewidth=0.7)
    ax2.grid(True, 'minor', color='0.92', linestyle='-', linewidth=0.35)
    ax2.set_axis_bgcolor('0.95')
    ax2.invert_xaxis()
    ax2.barh(np.arange(len(WordList)), TweetCountArray, align='center')

    #Add the numbers to the side of each bar
    PreviousValue = None
    for p, ch in zip(np.arange(len(WordList)), TweetCountArray):
        if ch != PreviousValue:
            ax2.annotate(str(ch), xy=(ch + 2.5, p - 0.25), va='center')
            PreviousValue = ch
        else:
            continue
    ax2.set_yticks(np.arange(len(WordList)))
    ax2.set_yticklabels(WordList)  #,rotation='horizontal')
    ax2.set_ylim(0, len(WordList) - 1 + 0.25)

    #Plot table with assisting information
    #1. Date : Day, Date, Year and TIME_START to TIME_END
    #2. TIME_START
    #3. TIME_END
    #4. TIME_WINDOW
    #5. No. of HotWords per TimeWindow
    #6. Total no. of unique HotWords found
    #7. Max #of tweets for a HotWord & HotWord
    #8. Min #of tweets for a HotWord & HotWord
    #9. Max #of tweets in a timeWindow & timeWindow
    #10. Min #of tweets in a timeWindow & timeWindow
    rowLabels = ['1. Date', '2. Start time', '3. End time', '4. Time Window (seconds)',
                 '5. No. of HotWords per TimeWindow', '6. No. of unique HotWords',
                 '7. Max #of tweets for HotWord', '8. Min #of tweets for HotWord',
                 '9. Max #of tweets in a time window', '10. Min #of tweets in a time window']

    DateStart = gmt_to_local(TIME_START, make_string=True, format='%a %d %b %Y')
    DateEnd = gmt_to_local(TIME_END, make_string=True, format='%a %d %b %Y')
    Date = DateStart if DateStart == DateEnd else DateStart + ' to ' + DateEnd
    start_time = gmt_to_local(TIME_START, make_string=True, format='%d %b %H:%M')
    end_time = gmt_to_local(TIME_END, make_string=True, format='%d %b %H:%M')

    cellText = [Date, start_time, end_time, scale, HotWordSize, len(set(WordList)),
                TweetCountArray.max(), TweetCountArray.min(), str(max(volume)), str(min(volume))]

    rowLabels.reverse()
    cellText.reverse()
    colLabels = ['Value']

    for y, label, text in zip(range(len(cellText)), rowLabels, cellText):
        ax3.text(0.05, (float(y) / 20) + 0.05, s='%s : %s' % (label, text), size=20)
    ax3.xaxis.set_visible(False)
    ax3.yaxis.set_visible(False)

    fig1.savefig('%s_to_%s.png' % (start_time, end_time), dpi=200, bbox_inches="tight")
    plt.close(fig1)
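#Sketch: render the vocabulary report for one day. The figure is saved as a PNG
#named after the local start/end times; db and the date strings are placeholders.
def _demo_vocabulary_report(db):
    print_vocabulary_report(db, scale=60*20,
                            TIME_START="15 Apr 00:00 EDT 2014",
                            TIME_END="15 Apr 23:59 EDT 2014",
                            HotWordSize=8)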
class NewsWorthyWords:

    def __init__(self, db, timeWindow=60*10, **kwargs):
        print "COLLECTING TWEETS...."
        self.TS = TweetSnap(db=db, timeWindow=timeWindow, Placename2Geocode=False)
        print "COLLECTION OVER...."

        #Variables
        self.SnapStack = []
        self.Candidates = {}
        self.Volume = []

        #Constants
        self.delta = 1.5
        self.enoughSamples = 15.0
        self.SnapLim = 6
        self.StopNewsWords = ['Boston', 'day', 'time', 'love', 'today', 'Boston-MA']

        #Set TIME_FRAME
        self.SetStart(kwargs.get("TIME_START", time.gmtime(0)))

        #Storage variables for analysis
        self.Storage = []
        self.StorageDict = pd.DataFrame(columns=['word', 'Poisson', 'LocalEntropy', 'GlobalEntropy', 'start_time', 'event'])
        self.ResultDict = pd.DataFrame(columns=['word', 'event_time', 'location', 'discovered_time', 'summary'])

        #Classifier (scaler, projection matrix and SVM trained offline)
        self.matrix_w, self.scaler, self.clf = cPickle.load(open('SVClassifier.Store'))

        #Verbosity - 1. Print all messages 2. Print fewer messages 3. .....
        self.VerboseLevel = kwargs.get('VerboseLevel', 1)

    def verbose(self, text, level=1):
        if level < self.VerboseLevel:
            return
        else:
            print text

    def SetStart(self, TIME_START):
        if isinstance(TIME_START, str):
            TIME_START = time.gmtime(time.mktime(time.strptime(TIME_START, "%d %b %H:%M %Z %Y")))
        TIME_DIFF = time.mktime(TIME_START) - time.mktime(self.TS.time_start)
        if TIME_DIFF > 0:
            self.TS.move_on(TIME_DIFF)

    def run(self):
        while not self.TS.end:
            #Update SnapStack (sliding window of the last SnapLim snapshots)
            if len(self.SnapStack) == self.SnapLim:
                self.SnapStack = self.SnapStack[1:]
                self.Volume = self.Volume[1:]
            self.SnapStack.append(self.TS.next())
            self.Volume.append(Count(self.SnapStack[-1]))

            #Update Candidates origin snap as timeWindow has shifted right
            for key, val in self.Candidates.items():
                if val == -self.SnapLim:
                    self.Candidates.pop(key)
                    self.verbose('This %s word has been removed because it never received enough samples' % key)
                else:
                    self.Candidates[key] = val - 1

            self.verbose('Latest timeWindow %s' % self.SnapStack[-1]['TimeWindow'], 2)

            #Algorithm
            self.verbose('Looking for new events which happened in this timeWindow', 2)
            self.FindNewEvent()
            self.verbose('Confirming old/new candidate events which have not been published')
            self.ConfirmEvent()

            if self.Candidates.keys() != []:
                self.verbose('EventCandidates: %s' % self.Candidates.keys(), 2)

    def TotalVolume(self, word, Volume):
        total = 0.0
        k = 0
        while k < len(Volume):
            if word in Volume[k].keys():
                total += Volume[k][word]
            k += 1
        return total if total != 0 else 1

    def FindNewEvent(self):
        for word, count in self.Volume[-1].items():
            #Is word count gaussian noise or signal ?
            wordHistory = [float(vol[word]) for vol in self.Volume[:-1] if word in vol.keys()]
            mean = np.mean(wordHistory) if len(wordHistory) > 0 else 1
            var = np.std(wordHistory) if len(wordHistory) >= 5 else 1
            std_score = (count - mean) / (2 * var)
            if std_score >= self.delta and (word not in self.StopNewsWords):
                self.verbose('This %s is not gaussian noise with standard_score = %f ' % (word, std_score))
                if word not in self.Candidates.keys() or (self.Volume[self.Candidates[word]][word] < count):
                    self.Candidates[word] = -1

    def ConfirmEvent(self):
        for word, no in self.Candidates.items():
            wordHistory = [float(vol.get(word, 0.0)) for vol in self.Volume[no:]]
            self.verbose('Confirming candidate Newsword : %s at time = %s with samples=%d and Snapno=%d'
                         % (word, self.SnapStack[no]['TimeWindow'][0], sum(wordHistory), no), 2)
            if sum(wordHistory) >= self.enoughSamples:
                self.verbose('This %s word has enough samples from tweets to calculate scores (Poisson,LocalEntropy,StandardDeviation)' % (word), 2)

                #Poisson
                Poisson = self.FitPoissonDistribution(word, no)
                #Global and Local Entropy
                GlobalEntropy, LocalEntropy = self.FitSpatialEntropy(word, no)

                #Classifier
                #Define feature vector
                X = np.array([Poisson, LocalEntropy, GlobalEntropy], dtype=np.float64)
                #Apply Scaler
                X_sc = self.scaler.transform(X)
                #Apply Orthogonality (project onto the stored components)
                X_tr = X_sc.dot(self.matrix_w)
                #Classify new transformed feature vector
                Flag = self.clf.predict(X_tr)[0]

                if Flag == 1:
                    start_time = self.SnapStack[no]['TimeWindow'][0]
                    confirmed_time = self.SnapStack[-1]['TimeWindow'][0]
                    SampleSet = self.ReportEventQueue(word, no)
                    print "Newsword (%s) at %s confirmed at %s\n" % (word, start_time, confirmed_time)
                    print "Summary : "
                    summary = []
                    for user, created_at, tweet, loc in SampleSet:
                        print "%s reported at time %s near %s: %s" % (user, created_at, GetPlaceName(loc[0], loc[1]), tweet)
                        summary.append([user, created_at, tweet, loc])
                    event = {'word': word,
                             'event_time': start_time,
                             'location': GetPlaceName(np.mean([item[3][0] for item in summary]),
                                                      np.mean([item[3][1] for item in summary])),
                             'discovered_time': confirmed_time,
                             'summary': '\n'.join(["%s reported at time %s near %s: %s"
                                                   % (item[0], item[1], GetPlaceName(item[3][0], item[3][1]), item[2])
                                                   for item in summary])}
                    print event
                    self.ResultDict = self.ResultDict.append(event, ignore_index=True)
                    self.Candidates.pop(word)
                else:
                    continue

                #Store Data for post-classification
                self.StorageDict = self.StorageDict.append({'word': word, 'Poisson': Poisson,
                                                            'LocalEntropy': LocalEntropy,
                                                            'GlobalEntropy': GlobalEntropy,
                                                            'start_time': start_time, 'event': event},
                                                           ignore_index=True)

                #Manual Classifier
                # if flag in ['1','y','yes']:
                #     print 'This %s word count resembles poisson distribution with lambda=%f'%(word,Lambda)
                #     self.ReportEventQueue(word,no)
                #     self.Candidates.pop(word)
                # else:
                #     print 'This %s word count does not resemble poisson distribution with lambda=%s'%(word,Lambda)

    def FitSpatialEntropy(self, word, no):
        k = no
        tokenize = T_Tokenizer().tokenize

        #Store locations
        ALLLOC = []
        WORDLOC = []
        while k < 0:
            ALLLOC += self.SnapStack[k]['LOC']
            for order, text in enumerate(self.SnapStack[k]['TEXT']):
                if word in tokenize(text):
                    WORDLOC.append(self.SnapStack[k]['LOC'][order])
            k += 1

        #Choose Cluster of max ALLLOC, C*
        MakeCluster = GMM_clustering()
        MakeCluster.Snap = {'LOC': ALLLOC}
        MakeCluster.build_clusters()
        WORDLABELS = Counter([MakeCluster.labels[ALLLOC.index(LOC)] for LOC in WORDLOC])

        #Global entropy
        GLOBAL_COUNTER = Counter(MakeCluster.labels)
        G_D_pq = 0.0
        for cl, number in WORDLABELS.items():
            G_D_pq += -1 * (number / float(GLOBAL_COUNTER[cl])) * np.log2(number / float(GLOBAL_COUNTER[cl]))
            #G_D_pq += -1*((number/sum(WORDLABELS))/float(GLOBAL_COUNTER[cl]/sum(GLOBAL_COUNTER)))*np.log2(number/float(GLOBAL_COUNTER[cl]))

        C_Star = WORDLABELS.most_common(1)[0][0]
        C_Star_LOC = [ALLLOC[No] for No, label in filter(lambda (enum, x): x == C_Star, enumerate(MakeCluster.labels))]
        C_Star_WORD_LOC = [LOC for LOC in filter(lambda x: x in C_Star_LOC, WORDLOC)]

        #Find D(p||q) of word inside C*
        del MakeCluster
        MakeLocalCluster = GMM_clustering(components=range(2, 8))
        MakeLocalCluster.Snap = {'LOC': C_Star_LOC}
        MakeLocalCluster.build_clusters()

        WORD_LOCAL_COUNTER = Counter([MakeLocalCluster.labels[C_Star_LOC.index(LOC)] for LOC in C_Star_WORD_LOC])
        LOCAL_ALL_COUNTER = Counter(MakeLocalCluster.labels)

        L_D_pq = 0.0
        for cl, number in WORD_LOCAL_COUNTER.items():
            L_D_pq += -1 * (number / float(LOCAL_ALL_COUNTER[cl])) * np.log2(number / float(LOCAL_ALL_COUNTER[cl]))
            #L_D_pq += -1*((number/sum(WORD_LOCAL_COUNTER.values()))/float(LOCAL_ALL_COUNTER[cl]/sum(LOCAL_ALL_COUNTER.values())))*np.log2(number/float(LOCAL_ALL_COUNTER[cl]))

        return [G_D_pq, L_D_pq]

    def FitStdDev(self, word, no):
        k = no
        tokenize = T_Tokenizer().tokenize
        #Store locations
        WORDLOC = []
        while k < 0:
            for order, text in enumerate(self.SnapStack[k]['TEXT']):
                if word in tokenize(text):
                    WORDLOC.append(self.SnapStack[k]['LOC'][order])
            k += 1
        return np.std(WORDLOC)

    def FitPoissonDistribution(self, word, no):
        tokenize = T_Tokenizer().tokenize
        k = no
        Times = []
        ApproxTimes = []
        wordHistory = [vol.get(word, 0) for vol in self.Volume[no:]]

        #Store all tweet_times with word in current snap and known history
        while k < 0:
            approx = time.mktime(time.strptime(self.SnapStack[k]['TimeWindow'][0] + '2014EDT', "%d%b%HHR%MMN%Y%Z"))
            count = self.Volume[k].get(word, 0)
            ApproxTimes += [approx] * count
            for order, text in enumerate(self.SnapStack[k]['TEXT']):
                if word in tokenize(text):
                    Times.append(time.mktime(time.strptime(self.SnapStack[k]['CREATED_AT'][order], "%d %b %H:%M:%S %Y")))
            k += 1

        #Calculate time-intervals
        TimeIntervals = [Time - min(Times) for Time in Times]
        ApproxTimeIntervals = sorted([approx - min(ApproxTimes) for approx in ApproxTimes])
        TimeIntervals.sort()

        self.verbose('Have a look at TimeIntervals(1) and ApproxTimeIntervals(2) and LogLikelihood(3)')
        self.verbose('(1) %s' % TimeIntervals)
        self.verbose('(2) %s' % ApproxTimeIntervals)

        ApproxTimeIntervals = Counter(ApproxTimeIntervals)

        #Calculate ML_Lmbda (maximum-likelihood rate of the exponential inter-arrival fit)
        _lmbda = float(len(TimeIntervals)) / sum(TimeIntervals)
        # if sum(ApproxTimeIntervals)!=0:
        #     _lmbda = float(len(ApproxTimeIntervals))/sum(ApproxTimeIntervals)
        # else:
        #     _lmbda = float(len(TimeIntervals))/sum(TimeIntervals)

        #Calculate Variance for given samples
        # _R2 = 1/_lmbda**2

        #Likelihood calculation and plotting (optional)
        # _LgLd = -1*sum([np.log(_lmbda*np.exp(-_lmbda*x)) for x in TimeIntervals])
        # print '(3)',_LgLd
        #
        # #Simulate an expon RV with fitted _lmbda
        # _rv = expon(scale=1/_lmbda)
        #
        # #Plot pdf of counts from _rv and known
        # fig = plt.figure()
        # ax = fig.add_subplot(111)
        # ax.plot(sorted(ApproxTimeIntervals.keys()),[_rv.cdf(x+600)-_rv.cdf(x) for x in sorted(ApproxTimeIntervals.keys())],'r-',label='fitted')
        # ax.plot(sorted(ApproxTimeIntervals.keys()),[float(ApproxTimeIntervals[key])/sum(wordHistory) for key in sorted(ApproxTimeIntervals.keys())],'b-',label='empirical estimate')
        # plt.legend()
        #
        # #save figure
        # fig.savefig('%s.png'%word)
        #
        # gmm = GMM_clustering(components=range(4,15))
        # gmm.Snap = self.SnapStack[no]
        # gmm.build_clusters()
        #
        # #flag = raw_input("Fitted curve for %s stored should flag=1 or not with lambda=%f and locality=%f"%(word,_lmbda,Locality(self.SnapStack[no],gmm.labels,word)))
        # plt.close(fig)

        return _lmbda

    def ReportEventQueue(self, word, no, SampleLim=3):
        #Find clusters at start point of event
        gmm = GMM_clustering(components=range(4, 15))
        gmm.Snap = self.SnapStack[no]
        gmm.build_clusters()

        Labels = []
        tokenize = T_Tokenizer().tokenize
        for k, text in enumerate(gmm.Snap['TEXT']):
            if word in tokenize(text):
                Labels.append(gmm.labels[k])
        Labels = Counter(Labels)

        #Find cluster where word was most common
        StarLabel = Labels.most_common(1)[0][0]

        SampleSet = []
        #Collect a few sample tweets from that cluster
        for k, text in enumerate(gmm.Snap['TEXT']):
            if gmm.labels[k] == StarLabel and word in tokenize(text):
                SampleSet.append((gmm.Snap['SCREEN_NAME'][k], gmm.Snap['CREATED_AT'][k], text, gmm.Snap['LOC'][k]))
            if len(SampleSet) >= SampleLim:
                break

        return SampleSet
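#Sketch of the intended end-to-end use of NewsWorthyWords: build the detector,
#stream through the collected snapshots, then inspect and persist the confirmed
#events. The db handle, the date string, and the output filenames are placeholder
#assumptions; SVClassifier.Store must exist because __init__ unpickles it.
def _demo_run_detector(db):
    detector = NewsWorthyWords(db, timeWindow=60*10,
                               TIME_START="15 Apr 10:00 EDT 2014",
                               VerboseLevel=2)
    detector.run()
    #Confirmed events (one row per event) and raw feature rows kept for later analysis
    print detector.ResultDict
    detector.ResultDict.to_csv('confirmed_events.csv', index=False)
    detector.StorageDict.to_csv('event_features.csv', index=False)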
#Queue-based variant of NewsWorthyWords (QueueStack/Vocabulary instead of
#SnapStack/Volume, message() instead of verbose(), optional precomputed feature
#table). It redefines the same class name, so whichever definition appears last
#in the module wins on import.
class NewsWorthyWords:

    def __init__(self, db, timeWindow=60*10, **kwargs):
        print "COLLECTING TWEETS...."
        self.TS = TweetSnap(db=db, timeWindow=timeWindow, Placename2Geocode=False)
        print "COLLECTION OVER...."

        #Variables
        self.QueueStack = []
        self.Candidates = {}
        self.Vocabulary = []

        #Constants
        self.delta = 3                #GaussianDistortion
        self.MinWordSamples = 15.0    #Has to be greater than 8; see SetFeatureTable method for this restriction
        self.QueueLim = 6             #MaximumQueueLimit
        self.StopNewsWords = ['Boston', 'day', 'time', 'love', 'today', 'Boston-MA']  #Default StopWordList

        #Set TIME_FRAME
        self.SetStart(kwargs.get("TIME_START", time.gmtime(0)))

        #Storage variables for analysis
        self.FeatureDict = pd.DataFrame(columns=['word', 'Poisson', 'LocalEntropy', 'GlobalEntropy', 'start_time', 'event'])
        self.ResultDict = pd.DataFrame(columns=['word', 'event_time', 'location', 'discovered_time', 'summary'])

        #Classifier
        self.matrix_w, self.scaler, self.clf = cPickle.load(open('SVClassifier.Store'))

        #Verbosity - print progress messages only when OnlyMessage is set
        self.OnlyMessage = kwargs.get('OnlyMessage', 0)

        #Feature-table mode: when 1, entropies and Poisson rates are precomputed in
        #SetFeatureTable. The original never initialised this attribute although
        #run() and FitSpatialEntropy() read it; it is assumed here to be an optional
        #kwarg that defaults to off.
        self.TableON = kwargs.get('TableON', 0)

    def message(self, text):
        if self.OnlyMessage:
            print text
        else:
            pass

    def SetStart(self, TIME_START):
        if isinstance(TIME_START, str):
            TIME_START = time.gmtime(time.mktime(time.strptime(TIME_START, "%d %b %H:%M %Z %Y")))
        TIME_DIFF = time.mktime(TIME_START) - time.mktime(self.TS.time_start)
        if TIME_DIFF > 0:
            self.TS.move_on(TIME_DIFF)

    def run(self):
        while not self.TS.end:
            #Update QueueStack (sliding window of the last QueueLim snapshots)
            if len(self.QueueStack) == self.QueueLim:
                self.QueueStack = self.QueueStack[1:]
                self.Vocabulary = self.Vocabulary[1:]
            self.QueueStack.append(self.TS.next())
            self.Vocabulary.append(Count(self.QueueStack[-1]))

            #Update Candidates origin snap as timeWindow has shifted right
            for key, val in self.Candidates.items():
                if val == -self.QueueLim:
                    self.Candidates.pop(key)
                    self.message('This %s word has been removed because it never received enough samples' % key)
                else:
                    self.Candidates[key] = val - 1

            print('Latest timeWindow %s' % self.QueueStack[-1]['TimeWindow'])

            #Algorithm
            #1. Add to candidates list
            self.FilterWords()
            #1.1 Precompute features for all candidates when table mode is on
            if self.TableON == 1 and len(self.Candidates.keys()) != 0:
                self.SetFeatureTable()
            #2. Find news-word in candidate list
            self.ConfirmEvent()

            #Status of candidate list
            if self.Candidates.keys() != []:
                self.message('EventCandidates: %s' % self.Candidates.keys())

    def FilterWords(self):
        for word, count in self.Vocabulary[-1].items():
            #Is word count gaussian noise or signal ?
            wordHistory = [float(vol[word]) for vol in self.Vocabulary[:-1] if word in vol.keys()]
            mean = np.mean(wordHistory) if len(wordHistory) > 0 else 1
            variance = np.std(wordHistory) if len(wordHistory) >= 5 else 1
            Z_score = (count - mean) / variance
            if Z_score >= self.delta and (word not in self.StopNewsWords):
                self.message('This %s is not gaussian noise with standard_score = %f ' % (word, Z_score))
                if word not in self.Candidates.keys() or (self.Vocabulary[self.Candidates[word]][word] < count):
                    self.Candidates[word] = -1

    def ConfirmEvent(self):
        for word, no in self.Candidates.items():
            wordHistory = [float(vol.get(word, 0.0)) for vol in self.Vocabulary[no:]]
            self.message('Confirming candidate Newsword : %s at time = %s with samples=%d and Queueno=%d'
                         % (word, self.QueueStack[no]['TimeWindow'][0], sum(wordHistory), no))
            if sum(wordHistory) >= self.MinWordSamples:
                self.message('This %s word has enough samples from tweets to calculate scores (Poisson,LocalEntropy,StandardDeviation)' % (word))

                #Poisson (the original called self.FitPoissonDistribution here, which this
                #variant does not define; GetPoissonRate is the corresponding method)
                Poisson = self.GetPoissonRate(word, no)
                #Global and Local Entropy
                GlobalEntropy, LocalEntropy = self.FitSpatialEntropy(word, no)
                #Poisson, GlobalEntropy, LocalEntropy = self.GetFeatures(word,no)

                #Classifier
                #Define feature vector
                X = np.array([Poisson, LocalEntropy, GlobalEntropy], dtype=np.float64)
                #Apply Scaler
                X_sc = self.scaler.transform(X)
                #Apply Orthogonality (project onto the stored components)
                X_tr = X_sc.dot(self.matrix_w)
                #Classify new transformed feature vector
                Flag = self.clf.predict(X_tr)[0]

                if Flag == 1:
                    start_time = self.QueueStack[no]['TimeWindow'][0]
                    confirmed_time = self.QueueStack[-1]['TimeWindow'][0]
                    SampleSet = self.ReportEventQueue(word, no)
                    print "Newsword (%s) at %s confirmed at %s\n" % (word, start_time, confirmed_time)
                    print "Summary : "
                    summary = []
                    for user, created_at, tweet, loc in SampleSet:
                        print "%s reported at time %s near %s: %s" % (user, created_at, loc, tweet)
                        summary.append([user, created_at, tweet, loc])
                    event = {'word': word,
                             'event_time': start_time,
                             'location': GetPlaceName(np.mean([item[3][0] for item in summary]),
                                                      np.mean([item[3][1] for item in summary])),
                             'discovered_time': confirmed_time,
                             'summary': '\n'.join(["%s reported at time %s near %s: %s"
                                                   % (item[0], item[1], item[3], item[2]) for item in summary])}
                    print event
                    self.ResultDict = self.ResultDict.append(event, ignore_index=True)
                    self.Candidates.pop(word)
                else:
                    continue

                #Store Data for post-classification
                self.FeatureDict = self.FeatureDict.append({'word': word, 'Poisson': Poisson,
                                                            'LocalEntropy': LocalEntropy,
                                                            'GlobalEntropy': GlobalEntropy,
                                                            'start_time': start_time, 'event': event},
                                                           ignore_index=True)

                #Manual Classifier
                # if flag in ['1','y','yes']:
                #     print 'This %s word count resembles poisson distribution with lambda=%f'%(word,Lambda)
                #     self.ReportEventQueue(word,no)
                #     self.Candidates.pop(word)
                # else:
                #     print 'This %s word count does not resemble poisson distribution with lambda=%s'%(word,Lambda)

    def SetFeatureTable(self):
        tokenize = T_Tokenizer().tokenize
        self.Feature = {}
        k = -len(self.QueueStack)

        #Store locations
        ALL_LOC = []
        WORD_LOC = {}
        C_Star_LOC = {}
        C_Star_Labels = {}

        #Get list of locations of all tweets collected : ALL_LOC
        #Get list of locations where "word" appears in tweets posted after it was
        #declared a candidate : WORD_LOC[word]
        while k < 0:
            ALL_LOC += self.QueueStack[k]['LOC']
            for order, text in enumerate(self.QueueStack[k]['TEXT']):
                for word, no in self.Candidates.items():
                    if word in tokenize(text) and order >= no:
                        WORD_LOC.setdefault(word, []).append(self.QueueStack[k]['LOC'][order])
            k += 1

        #Global Clustering
        MakeCluster = GMM_clustering(components=range(3, 8))
        MakeCluster.Snap = {'LOC': ALL_LOC}
        MakeCluster.build_clusters()
        #Input : ALL_LOC & Output : global labels for locations of tweets
        GLOBAL_LABELS = Counter(MakeCluster.labels)

        #Local clustering for each global cluster
        for C_Star in GLOBAL_LABELS.keys():
            #Input : C_Star_LOC ; all tweet locations within the C_Star cluster
            C_Star_LOC[C_Star] = [ALL_LOC[No] for No, label in filter(lambda (enum, x): x == C_Star, enumerate(MakeCluster.labels))]
            if len(C_Star_LOC[C_Star]) >= (self.MinWordSamples / 3.0):
                MakeLocalCluster = GMM_clustering(components=range(2, min(8, int(self.MinWordSamples / 3))))
                MakeLocalCluster.Snap = {'LOC': C_Star_LOC[C_Star]}
                MakeLocalCluster.build_clusters()
                #Output : C_Star_Labels ; labels for all tweet locations within C_Star
                C_Star_Labels[C_Star] = MakeLocalCluster.labels

        #Set GlobalEntropy and LocalEntropy for each candidate word
        for word, no in self.Candidates.items():
            #Global entropy
            #1. Initialize to 0
            G_D_pq = 0.0
            #2. Non-zero counts of global clusters where 'word' appears in a tweet
            WORD_LABELS = Counter([MakeCluster.labels[ALL_LOC.index(LOC)] for LOC in WORD_LOC[word]])
            #3. Calculate entropy by summing over all clusters
            for cl, number in WORD_LABELS.items():
                G_D_pq += -1 * (number / float(GLOBAL_LABELS[cl])) * np.log2(number / float(GLOBAL_LABELS[cl]))
                #G_D_pq += -1*((number/sum(WORDLABELS))/float(GLOBAL_COUNTER[cl]/sum(GLOBAL_COUNTER)))*np.log2(number/float(GLOBAL_COUNTER[cl]))

            #Local entropy
            #1. Most populated cluster containing 'word'
            C_Star = WORD_LABELS.most_common(1)[0][0]
            #2. Non-zero counts of local clusters where 'word' appears in a tweet
            WORD_LOCAL_LABELS = Counter([C_Star_Labels[C_Star][C_Star_LOC[C_Star].index(LOC)]
                                         for LOC in WORD_LOC[word] if LOC in C_Star_LOC[C_Star]])
            LOCAL_LABELS = Counter(C_Star_Labels[C_Star])
            #3. Calculate entropy by summing over all local clusters
            L_D_pq = 0.0
            for cl, number in WORD_LOCAL_LABELS.items():
                L_D_pq += -1 * (number / float(LOCAL_LABELS[cl])) * np.log2(number / float(LOCAL_LABELS[cl]))
                #L_D_pq += -1*((number/sum(WORD_LOCAL_COUNTER.values()))/float(LOCAL_ALL_COUNTER[cl]/sum(LOCAL_ALL_COUNTER.values())))*np.log2(number/float(LOCAL_ALL_COUNTER[cl]))

            self.Feature[word] = [G_D_pq, L_D_pq, self.GetPoissonRate(word, no)]

    def FitSpatialEntropy(self, word, no):
        if self.TableON:
            return [self.Feature[word][0], self.Feature[word][1]]

        k = no
        tokenize = T_Tokenizer().tokenize

        #Store locations
        ALLLOC = []
        WORDLOC = []
        while k < 0:
            ALLLOC += self.QueueStack[k]['LOC']
            for order, text in enumerate(self.QueueStack[k]['TEXT']):
                if word in tokenize(text):
                    WORDLOC.append(self.QueueStack[k]['LOC'][order])
            k += 1

        #Choose Cluster of max ALLLOC, C*
        MakeCluster = GMM_clustering()
        MakeCluster.Snap = {'LOC': ALLLOC}
        MakeCluster.build_clusters()
        WORDLABELS = Counter([MakeCluster.labels[ALLLOC.index(LOC)] for LOC in WORDLOC])

        #Global entropy
        GLOBAL_COUNTER = Counter(MakeCluster.labels)
        G_D_pq = 0.0
        for cl, number in WORDLABELS.items():
            G_D_pq += -1 * (number / float(GLOBAL_COUNTER[cl])) * np.log2(number / float(GLOBAL_COUNTER[cl]))
            #G_D_pq += -1*((number/sum(WORDLABELS))/float(GLOBAL_COUNTER[cl]/sum(GLOBAL_COUNTER)))*np.log2(number/float(GLOBAL_COUNTER[cl]))

        C_Star = WORDLABELS.most_common(1)[0][0]
        C_Star_LOC = [ALLLOC[No] for No, label in filter(lambda (enum, x): x == C_Star, enumerate(MakeCluster.labels))]
        C_Star_WORD_LOC = [LOC for LOC in filter(lambda x: x in C_Star_LOC, WORDLOC)]

        #Find D(p||q) of word inside C*
        del MakeCluster
        MakeLocalCluster = GMM_clustering(components=range(2, 8))
        MakeLocalCluster.Snap = {'LOC': C_Star_LOC}
        MakeLocalCluster.build_clusters()

        WORD_LOCAL_COUNTER = Counter([MakeLocalCluster.labels[C_Star_LOC.index(LOC)] for LOC in C_Star_WORD_LOC])
        LOCAL_ALL_COUNTER = Counter(MakeLocalCluster.labels)

        L_D_pq = 0.0
        for cl, number in WORD_LOCAL_COUNTER.items():
            L_D_pq += -1 * (number / float(LOCAL_ALL_COUNTER[cl])) * np.log2(number / float(LOCAL_ALL_COUNTER[cl]))
            #L_D_pq += -1*((number/sum(WORD_LOCAL_COUNTER.values()))/float(LOCAL_ALL_COUNTER[cl]/sum(LOCAL_ALL_COUNTER.values())))*np.log2(number/float(LOCAL_ALL_COUNTER[cl]))

        return [G_D_pq, L_D_pq]

    def GetPoissonRate(self, word, no):
        tokenize = T_Tokenizer().tokenize
        k = no
        Times = []
        ApproxTimes = []

        #Store all tweet_times with word in current snap and known history
        while k < 0:
            approx = time.mktime(time.strptime(self.QueueStack[k]['TimeWindow'][0] + '2014EDT', "%d%b%HHR%MMN%Y%Z"))
            count = self.Vocabulary[k].get(word, 0)
            ApproxTimes += [approx] * count
            for order, text in enumerate(self.QueueStack[k]['TEXT']):
                if word in tokenize(text):
                    Times.append(time.mktime(time.strptime(self.QueueStack[k]['CREATED_AT'][order], "%d %b %H:%M:%S %Y")))
            k += 1

        #Calculate time-intervals
        TimeIntervals = sorted([Time - min(Times) for Time in Times])
        ApproxTimeIntervals = sorted([approx - min(ApproxTimes) for approx in ApproxTimes])

        #Calculate ML_Lmbda
        if sum(ApproxTimeIntervals) != 0:
            _lmbda = float(len(ApproxTimeIntervals)) / sum(ApproxTimeIntervals)
        else:
            _lmbda = float(len(TimeIntervals)) / sum(TimeIntervals)

        return _lmbda

    def ReportEventQueue(self, word, no, SampleLim=3):
        #Find clusters at start point of event
        gmm = GMM_clustering(components=range(4, 15))
        gmm.Snap = self.QueueStack[no]
        gmm.build_clusters()

        Labels = []
        tokenize = T_Tokenizer().tokenize
        for k, text in enumerate(gmm.Snap['TEXT']):
            if word in tokenize(text):
                Labels.append(gmm.labels[k])
        Labels = Counter(Labels)

        #Find cluster where word was most common
        StarLabel = Labels.most_common(1)[0][0]

        SampleSet = []
        #Collect a few sample tweets from that cluster
        for k, text in enumerate(gmm.Snap['TEXT']):
            if gmm.labels[k] == StarLabel and word in tokenize(text):
                SampleSet.append((gmm.Snap['SCREEN_NAME'][k], gmm.Snap['CREATED_AT'][k], text, gmm.Snap['LOC'][k]))
            if len(SampleSet) >= SampleLim:
                break

        return SampleSet
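#Sketch of running the queue-based variant with the precomputed feature table
#switched on. TableON is the kwarg assumed in __init__ above (the original code
#read self.TableON without ever setting it), and db is a placeholder handle.
def _demo_run_queue_detector(db):
    detector = NewsWorthyWords(db, timeWindow=60*10,
                               TIME_START="15 Apr 10:00 EDT 2014",
                               OnlyMessage=1, TableON=1)
    detector.run()
    print detector.ResultDict
    print detector.FeatureDict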