def FitSpatialEntropy(self, word, no):
    if self.TableON:
        return [self.Feature[word][0], self.Feature[word][1]]
    k = no
    tokenize = T_Tokenizer().tokenize
    # Store locations of all tweets and of tweets containing the word
    ALLLOC = []
    WORDLOC = []
    while k < 0:
        ALLLOC += self.QueueStack[k]['LOC']
        for order, text in enumerate(self.QueueStack[k]['TEXT']):
            if word in tokenize(text):
                WORDLOC.append(self.QueueStack[k]['LOC'][order])
        k += 1

    # Cluster all locations and find C*, the global cluster where the word is most frequent
    MakeCluster = GMM_clustering()
    MakeCluster.Snap = {'LOC': ALLLOC}
    MakeCluster.build_clusters()
    WORDLABELS = Counter([MakeCluster.labels[ALLLOC.index(LOC)] for LOC in WORDLOC])

    # Global entropy
    GLOBAL_COUNTER = Counter(MakeCluster.labels)
    G_D_pq = 0.0
    for cl, number in WORDLABELS.items():
        G_D_pq += -1 * (number / float(GLOBAL_COUNTER[cl])) * np.log2(number / float(GLOBAL_COUNTER[cl]))
        # G_D_pq += -1*((number/sum(WORDLABELS))/float(GLOBAL_COUNTER[cl]/sum(GLOBAL_COUNTER)))*np.log2(number/float(GLOBAL_COUNTER[cl]))

    C_Star = WORDLABELS.most_common(1)[0][0]
    C_Star_LOC = [ALLLOC[No] for No, label in enumerate(MakeCluster.labels) if label == C_Star]
    C_Star_WORD_LOC = [LOC for LOC in WORDLOC if LOC in C_Star_LOC]

    # Find D(p||q) of the word inside C*
    del MakeCluster
    MakeLocalCluster = GMM_clustering(components=range(2, 8))
    MakeLocalCluster.Snap = {'LOC': C_Star_LOC}
    MakeLocalCluster.build_clusters()
    WORD_LOCAL_COUNTER = Counter([MakeLocalCluster.labels[C_Star_LOC.index(LOC)] for LOC in C_Star_WORD_LOC])
    LOCAL_ALL_COUNTER = Counter(MakeLocalCluster.labels)
    L_D_pq = 0.0
    for cl, number in WORD_LOCAL_COUNTER.items():
        L_D_pq += -1 * (number / float(LOCAL_ALL_COUNTER[cl])) * np.log2(number / float(LOCAL_ALL_COUNTER[cl]))
        # L_D_pq += -1*((number/sum(WORD_LOCAL_COUNTER.values()))/float(LOCAL_ALL_COUNTER[cl]/sum(LOCAL_ALL_COUNTER.values())))*np.log2(number/float(LOCAL_ALL_COUNTER[cl]))

    return [G_D_pq, L_D_pq]
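
# A minimal, self-contained sketch of the entropy term computed above, on toy
# cluster counts (the toy Counter values below are made up for illustration;
# they stand in for WORDLABELS and GLOBAL_COUNTER). Each cluster contributes
# -(n_w/n_all) * log2(n_w/n_all), where n_w is the number of word-bearing
# tweets in that cluster and n_all the total number of tweets in it.
def _example_spatial_entropy():
    from collections import Counter
    import numpy as np
    word_counts = Counter({0: 8, 1: 2})           # word-bearing tweets per cluster
    all_counts = Counter({0: 40, 1: 10, 2: 50})   # all tweets per cluster
    D_pq = 0.0
    for cl, number in word_counts.items():
        p = number / float(all_counts[cl])
        D_pq += -p * np.log2(p)
    # Both clusters give p = 0.2, so D_pq = 2 * (-0.2 * log2(0.2)) ~= 0.93
    return D_pq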
def ReportEventQueue(self, word, no, SampleLim=3):
    # Find clusters at the start point of the event
    gmm = GMM_clustering(components=range(4, 15))
    gmm.Snap = self.SnapStack[no]
    gmm.build_clusters()
    Labels = []
    tokenize = T_Tokenizer().tokenize
    for k, text in enumerate(gmm.Snap["TEXT"]):
        if word in tokenize(text):
            Labels.append(gmm.labels[k])
    Labels = Counter(Labels)

    # Find the cluster where the word was most common
    StarLabel = Labels.most_common(1)[0][0]

    # Collect up to SampleLim sample tweets from that cluster
    SampleSet = []
    for k, text in enumerate(gmm.Snap["TEXT"]):
        if gmm.labels[k] == StarLabel and word in tokenize(text):
            SampleSet.append((gmm.Snap["SCREEN_NAME"][k], gmm.Snap["CREATED_AT"][k], text, gmm.Snap["LOC"][k]))
        if len(SampleSet) >= SampleLim:
            break
    return SampleSet
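
# Hypothetical usage sketch for ReportEventQueue. `detector` stands for an
# instance of the surrounding class with SnapStack already populated, and the
# 'flood' / no=-1 arguments are made up for illustration; only the call
# signature and the shape of the returned tuples come from the method above.
def _example_report_event_queue(detector, word='flood', no=-1):
    samples = detector.ReportEventQueue(word, no, SampleLim=3)
    for screen_name, created_at, text, loc in samples:
        print screen_name, created_at, text, loc
    return samples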
def visualize_timeframe(db, timeWindow, **kwargs):
    print "COLLECTING TWEETS...."
    TS = TweetSnap(db=db, timeWindow=timeWindow, Placename2Geocode=False)
    print "COLLECTION OVER...."

    TIME_START = kwargs.get("TIME_START", time.gmtime(0))
    TIME_END = kwargs.get("TIME_END", time.gmtime(time.time()))
    VocabSize = kwargs.get("VocabSize", 500)

    if isinstance(TIME_START, str):
        TIME_START = time.gmtime(time.mktime(time.strptime(TIME_START, "%d %b %H:%M %Z %Y")))
    if isinstance(TIME_END, str):
        TIME_END = time.gmtime(time.mktime(time.strptime(TIME_END, "%d %b %H:%M %Z %Y")))

    TIME_DIFF = time.mktime(TIME_START) - time.mktime(TS.time_start)
    if TIME_DIFF > 0:
        TS.move_on(TIME_DIFF - timeWindow)

    # Create DataFrame
    df = pd.DataFrame(columns=['Virality', 'Locality', 'Volume', 'Words', 'TimeWindow'])

    while (TS.time_start < TIME_END and not TS.end):
        # Capture the next snap and initialize time_start of the next snap
        snap = TS.next()
        if len(snap['TEXT']) < 100:
            continue

        gmm = GMM_clustering()

        # 1. Virality
        Virality = te(snap)

        # 2. Locality
        gmm.Snap = snap
        gmm.build_clusters()
        Locality = GeographicalEntropy(snap, gmm.labels)

        # 3. Volume
        Volume = Count(snap)

        # HotWords = set(dict(Virality.most_common(HotWordSize)).keys())&set(dict(Broadcast.most_common(HotWordSize)).keys())&set(dict(Locality.most_common(HotWordSize)).keys())&set(dict(Prevalence.most_common(HotWordSize)).keys())&set(dict(Volume.most_common(HotWordSize)).keys())
        Words = list(set(dict(Virality.most_common(VocabSize)).keys()) &
                     set(dict(Locality.most_common(VocabSize)).keys()) &
                     set(dict(Volume.most_common(VocabSize)).keys()))
        if not Words:
            continue

        Virality = [Virality[key] if key in Virality.keys() else 0 for key in Words]
        # Broadcast = [Broadcast[key] if key in Broadcast.keys() else 0 for key in HotWords]
        Locality = [Locality[key] if key in Locality.keys() else 0 for key in Words]
        # Prevalence = [Prevalence[key] if key in Prevalence.keys() else 0 for key in HotWords]
        Volume = [Volume[key] if key in Volume.keys() else 0 for key in Words]

        # 4. TimeWindow
        TimeWindow = [snap['TimeWindow'][0]] * len(Words)

        # 5. Words: already built above

        # Append to DataFrame
        df = df.append({'Virality': Virality, 'Locality': Locality, 'Volume': Volume,
                        'Words': Words, 'TimeWindow': TimeWindow}, ignore_index=True)

    return df
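
# Hypothetical usage sketch for visualize_timeframe. The `db` handle, the
# 3600-second window and the date strings are assumptions made for
# illustration; the date strings must match the "%d %b %H:%M %Z %Y" pattern
# parsed above (and %Z parsing is platform-dependent).
def _example_visualize_timeframe(db):
    df = visualize_timeframe(db, 3600,
                             TIME_START="01 Jan 00:00 GMT 2014",
                             TIME_END="02 Jan 00:00 GMT 2014",
                             VocabSize=500)
    return df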
def newsworthy_words(db, timeWindow, **kwargs):
    print "COLLECTING TWEETS...."
    TS = TweetSnap(db=db, timeWindow=timeWindow, Placename2Geocode=False)
    print "COLLECTION OVER...."

    TIME_START = kwargs.get("TIME_START", time.gmtime(0))
    TIME_END = kwargs.get("TIME_END", time.gmtime(time.time()))
    HotWordSize = kwargs.get("HotWordSize", 25)

    if isinstance(TIME_START, str):
        TIME_START = time.gmtime(time.mktime(time.strptime(TIME_START, "%d %b %H:%M %Z %Y")))
    if isinstance(TIME_END, str):
        TIME_END = time.gmtime(time.mktime(time.strptime(TIME_END, "%d %b %H:%M %Z %Y")))

    TIME_DIFF = time.mktime(TIME_START) - time.mktime(TS.time_start)
    if TIME_DIFF > 0:
        TS.move_on(TIME_DIFF - timeWindow)

    Day = {}

    while (TS.time_start < TIME_END and not TS.end):
        # Capture the next snap and initialize time_start of the next snap
        snap = TS.next()
        if len(snap['TEXT']) < 100:
            continue

        gmm = GMM_clustering()

        # 1. Virality
        Virality = PoissonRate(snap)

        # 2. DeltaVolume
        # Broadcast = DeltaVolume(snap0, snap)

        # 3. Locality
        gmm.Snap = snap
        gmm.build_clusters()
        Locality = GeographicalEntropy(snap, gmm.labels)

        # 4. Prevalence
        # Prevalence = Ttest(snap)

        # 5. Count
        Volume = Count(snap)

        # Prepare DataFrame
        # Union
        # HotWords = list(set(dict(Virality.most_common(HotWordSize)).keys()+dict(Broadcast.most_common(HotWordSize)).keys()+dict(Locality.most_common(HotWordSize)).keys()+dict(Prevalence.most_common(HotWordSize)).keys()+dict(Volume.most_common(HotWordSize)).keys()))
        # Intersection
        # HotWords = set(dict(Virality.most_common(HotWordSize)).keys())&set(dict(Broadcast.most_common(HotWordSize)).keys())&set(dict(Locality.most_common(HotWordSize)).keys())&set(dict(Prevalence.most_common(HotWordSize)).keys())&set(dict(Volume.most_common(HotWordSize)).keys())
        HotWords = list(set(dict(Virality.most_common(HotWordSize)).keys()) &
                        set(dict(Locality.most_common(HotWordSize)).keys()) &
                        set(dict(Volume.most_common(HotWordSize)).keys()))
        if not HotWords:
            continue

        Virality = [Virality[key] if key in Virality.keys() else 0 for key in HotWords]
        # Broadcast = [Broadcast[key] if key in Broadcast.keys() else 0 for key in HotWords]
        Locality = [Locality[key] if key in Locality.keys() else 0 for key in HotWords]
        # Prevalence = [Prevalence[key] if key in Prevalence.keys() else 0 for key in HotWords]
        Volume = [Volume[key] if key in Volume.keys() else 0 for key in HotWords]

        # scaler = preprocessing.MinMaxScaler([0,100]).fit_transform
        # scaledVirality = list(scaler(np.array([Virality]).T).flatten())
        # scaledBroadcast = scaler(Broadcast)
        # scaledLocality = list(scaler(np.array([Locality]).T).flatten())
        # scaledPrevalence = scaler(Prevalence)
        # scaledVolume = list(scaler(np.array([Volume],dtype=np.float16).T).flatten())

        Score = [vi + lo + vo for vi, lo, vo in zip(Virality, Locality, Volume)]

        df = pd.DataFrame({'Words': HotWords, 'Virality': Virality, 'Locality': Locality,
                           'Volume': Volume, 'Score': Score})
        # df_scaled = pd.DataFrame({'Words':HotWords,'Virality':scaledVirality,'Locality':scaledLocality,'Volume':scaledVolume,'Score':Score})

        Day['to'.join(snap['TimeWindow'])] = df

    return Day
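
# Hypothetical usage sketch for newsworthy_words: rank each window's hot words
# by the combined Score column. The `db` handle and the 3600-second window are
# assumptions; the per-window DataFrame columns come from the function above,
# and sort_values assumes a pandas version that provides it (>= 0.17).
def _example_newsworthy_words(db):
    Day = newsworthy_words(db, 3600, HotWordSize=25)
    for window, df in Day.items():
        top = df.sort_values('Score', ascending=False).head(5)
        print window
        print top[['Words', 'Score']]
    return Day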
def SetFeatureTable(self):
    tokenize = T_Tokenizer().tokenize
    self.Feature = {}
    k = -len(self.QueueStack)

    # Store locations
    ALL_LOC = []
    WORD_LOC = {}
    C_Star_LOC = {}
    C_Star_Labels = {}

    # Build ALL_LOC, the list of locations of all collected tweets, and
    # WORD_LOC[word], the list of locations where "word" appears in tweets
    # posted after it was declared an event.
    while k < 0:
        ALL_LOC += self.QueueStack[k]['LOC']
        for order, text in enumerate(self.QueueStack[k]['TEXT']):
            for word, no in self.Candidates.items():
                if word in tokenize(text) and order >= no:
                    WORD_LOC.setdefault(word, []).append(self.QueueStack[k]['LOC'][order])
        k += 1

    # Global clustering
    # Input: ALL_LOC; Output: global labels for the locations of all tweets
    MakeCluster = GMM_clustering(components=range(3, 8))
    MakeCluster.Snap = {'LOC': ALL_LOC}
    MakeCluster.build_clusters()
    GLOBAL_LABELS = Counter(MakeCluster.labels)

    # Local clustering for each global cluster
    for C_Star in GLOBAL_LABELS.keys():
        # Input: C_Star_LOC; all tweet locations within the C_Star cluster
        C_Star_LOC[C_Star] = [ALL_LOC[No] for No, label in enumerate(MakeCluster.labels) if label == C_Star]
        if len(C_Star_LOC[C_Star]) >= (self.MinWordSamples / 3.0):
            MakeLocalCluster = GMM_clustering(components=range(2, min(8, int(self.MinWordSamples / 3))))
            MakeLocalCluster.Snap = {'LOC': C_Star_LOC[C_Star]}
            MakeLocalCluster.build_clusters()
            # Output: C_Star_Labels; labels for all tweet locations within the C_Star cluster
            C_Star_Labels[C_Star] = MakeLocalCluster.labels

    # Set GlobalEntropy and LocalEntropy for each candidate word
    for word, no in self.Candidates.items():
        # Global entropy
        # 1. Initialize to 0
        G_D_pq = 0.0
        # 2. Non-zero counts for the global clusters where 'word' appears in a tweet
        WORD_LABELS = Counter([MakeCluster.labels[ALL_LOC.index(LOC)] for LOC in WORD_LOC[word]])
        # 3. Calculate entropy by summing over all global clusters
        for cl, number in WORD_LABELS.items():
            G_D_pq += -1 * (number / float(GLOBAL_LABELS[cl])) * np.log2(number / float(GLOBAL_LABELS[cl]))
            # G_D_pq += -1*((number/sum(WORDLABELS))/float(GLOBAL_COUNTER[cl]/sum(GLOBAL_COUNTER)))*np.log2(number/float(GLOBAL_COUNTER[cl]))

        # Local entropy
        # 1. Most populated global cluster containing 'word'
        C_Star = WORD_LABELS.most_common(1)[0][0]
        # 2. Non-zero counts for the local clusters (inside C_Star) where 'word' appears in a tweet
        WORD_LOCAL_LABELS = Counter([C_Star_Labels[C_Star][C_Star_LOC[C_Star].index(LOC)]
                                     for LOC in WORD_LOC[word] if LOC in C_Star_LOC[C_Star]])
        LOCAL_LABELS = Counter(C_Star_Labels[C_Star])
        # 3. Calculate entropy by summing over all local clusters
        L_D_pq = 0.0
        for cl, number in WORD_LOCAL_LABELS.items():
            L_D_pq += -1 * (number / float(LOCAL_LABELS[cl])) * np.log2(number / float(LOCAL_LABELS[cl]))
            # L_D_pq += -1*((number/sum(WORD_LOCAL_COUNTER.values()))/float(LOCAL_ALL_COUNTER[cl]/sum(LOCAL_ALL_COUNTER.values())))*np.log2(number/float(LOCAL_ALL_COUNTER[cl]))

        self.Feature[word] = [G_D_pq, L_D_pq, self.GetPoissonRate(word, no)]
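
# Hedged sketch of how the cached table is meant to be consumed: after
# SetFeatureTable() runs, self.Feature maps each candidate word to
# [global entropy, local entropy, Poisson rate], and FitSpatialEntropy
# short-circuits to the first two entries once TableON is set. `detector`
# and the word argument are assumptions made for illustration.
def _example_feature_lookup(detector, word):
    detector.SetFeatureTable()
    detector.TableON = True
    g_entropy, l_entropy, poisson_rate = detector.Feature[word]
    # With TableON set, FitSpatialEntropy returns the cached entropies
    # regardless of its `no` argument.
    assert detector.FitSpatialEntropy(word, -1) == [g_entropy, l_entropy]
    return g_entropy, l_entropy, poisson_rate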