Example #1
File: NSWW.py  Project: shkr/tweet-event
  def FitSpatialEntropy(self,word,no):

    if self.TableON:
      return [self.Feature[word][0],self.Feature[word][1]]

    k = no
    tokenize  = T_Tokenizer().tokenize
    #Store locations
    ALLLOC = []
    WORDLOC = []

    while k<0:

      ALLLOC += self.QueueStack[k]['LOC']
      for order,text in enumerate(self.QueueStack[k]['TEXT']):
        if word in tokenize(text):
          WORDLOC.append(self.QueueStack[k]['LOC'][order])

      k+=1

    #Choose Cluster of max ALLLOC, C*
    MakeCluster 	 	= GMM_clustering()
    MakeCluster.Snap = {'LOC':ALLLOC}
    MakeCluster.build_clusters()
    WORDLABELS       = Counter([MakeCluster.labels[ALLLOC.index(LOC)] for LOC in WORDLOC])

    #Global entropy
    GLOBAL_COUNTER = Counter(MakeCluster.labels)
    G_D_pq		   = 0.0
    for cl,number in WORDLABELS.items():
        G_D_pq	+= -1*(number/float(GLOBAL_COUNTER[cl]))*np.log2(number/float(GLOBAL_COUNTER[cl]))
        #G_D_pq	+= -1*((number/sum(WORDLABELS))/float(GLOBAL_COUNTER[cl]/sum(GLOBAL_COUNTER)))*np.log2(number/float(GLOBAL_COUNTER[cl]))


    C_Star           = WORDLABELS.most_common(1)[0][0]
    C_Star_LOC       = [LOC for LOC, label in zip(ALLLOC, MakeCluster.labels) if label == C_Star]
    C_Star_WORD_LOC  = [LOC for LOC in WORDLOC if LOC in C_Star_LOC]

    #Find D(p||q) of word inside C*
    del MakeCluster
    MakeLocalCluster 	 	= GMM_clustering(components=range(2,8))
    MakeLocalCluster.Snap = {'LOC':C_Star_LOC}
    MakeLocalCluster.build_clusters()

    WORD_LOCAL_COUNTER    = Counter([MakeLocalCluster.labels[C_Star_LOC.index(LOC)] for LOC in C_Star_WORD_LOC])
    LOCAL_ALL_COUNTER		 = Counter( MakeLocalCluster.labels )
    L_D_pq		   = 0.0
    for cl,number in WORD_LOCAL_COUNTER.items():
        L_D_pq	+= -1*(number/float(LOCAL_ALL_COUNTER[cl]))*np.log2(number/float(LOCAL_ALL_COUNTER[cl]))
        #L_D_pq	+= -1*((number/sum(WORD_LOCAL_COUNTER.values()))/float(LOCAL_ALL_COUNTER[cl]/sum(LOCAL_ALL_COUNTER.values())))*np.log2(number/float(LOCAL_ALL_COUNTER[cl]))

    return [G_D_pq,L_D_pq]
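Note: the quantity accumulated in both loops above is, per cluster, -(n_cl/N_cl) * log2(n_cl/N_cl), where n_cl is the number of word-bearing tweets assigned to the cluster and N_cl is the cluster's total tweet count. The sketch below is a minimal, self-contained illustration of that computation; the helper name spread_entropy and the toy Counters are illustrative only and not part of the project.

import numpy as np
from collections import Counter

def spread_entropy(word_counts, cluster_sizes):
    # Sum of -(n/N) * log2(n/N) over the clusters where the word occurs.
    # word_counts   : Counter {cluster label: tweets containing the word}
    # cluster_sizes : Counter {cluster label: total tweets in the cluster}
    total = 0.0
    for cl, n in word_counts.items():
        p = n / float(cluster_sizes[cl])
        total += -p * np.log2(p)
    return total

# A word concentrated in one cluster yields a smaller value than a word
# spread evenly across clusters of the same size.
sizes = Counter({0: 50, 1: 50})
print(spread_entropy(Counter({0: 40}), sizes))         # ~0.26
print(spread_entropy(Counter({0: 20, 1: 20}), sizes))  # ~1.06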
Example #2
File: algo.py  Project: shkr/tweet-event
    def ReportEventQueue(self, word, no, SampleLim=3):

        # Find clusters at start point of event
        gmm = GMM_clustering(components=range(4, 15))
        gmm.Snap = self.SnapStack[no]
        gmm.build_clusters()
        Labels = []
        tokenize = T_Tokenizer().tokenize
        for k, text in enumerate(gmm.Snap["TEXT"]):
            if word in tokenize(text):
                Labels.append(gmm.labels[k])
        Labels = Counter(Labels)
        # Find cluster where word was most common
        StarLabel = Labels.most_common(1)[0][0]

        SampleSet = []
        # Print a tweet from that cluster
        for k, text in enumerate(gmm.Snap["TEXT"]):
            if gmm.labels[k] == StarLabel and word in tokenize(text):
                SampleSet.append((gmm.Snap["SCREEN_NAME"][k], gmm.Snap["CREATED_AT"][k], text, gmm.Snap["LOC"][k]))
            if len(SampleSet) >= SampleLim:
                break

        return SampleSet
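A hedged usage sketch for ReportEventQueue: `detector` stands in for an instance of the class that defines it (the class itself is not shown in this snippet), and the word and snap index are made up for illustration.

# Hypothetical call: sample up to 3 tweets from the cluster where the word
# was most common in the chosen snap.
samples = detector.ReportEventQueue('earthquake', no=-1, SampleLim=3)
for screen_name, created_at, text, loc in samples:
    print("%s | %s | %s" % (created_at, screen_name, loc))
    print(text)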
Example #3
def visualize_timeframe(db,timeWindow,**kwargs):

  print "COLLECTING TWEETS...."
  TS = TweetSnap(db=db,timeWindow = timeWindow,Placename2Geocode=False)
  print "COLLECTION OVER...."

  TIME_START = kwargs.get("TIME_START",time.gmtime(0))
  TIME_END   = kwargs.get("TIME_END",time.gmtime(time.time()))
  VocabSize   = kwargs.get("VocabSize",500)

  if isinstance(TIME_START,str):
    TIME_START  = time.gmtime(time.mktime(time.strptime(TIME_START,"%d %b %H:%M %Z %Y")))
  if isinstance(TIME_END,str):
    TIME_END    = time.gmtime(time.mktime(time.strptime(TIME_END,"%d %b %H:%M %Z %Y")))

  TIME_DIFF   = time.mktime(TIME_START)  - time.mktime(TS.time_start)

  if TIME_DIFF>0:
    TS.move_on(TIME_DIFF-timeWindow)

  #Create Dataframe
  df             = pd.DataFrame(columns=['Virality','Locality','Volume','Words','TimeWindow'])

  while (TS.time_start<TIME_END and not TS.end):

    #InitializeColumns
    Virality = {}
    Volume   = {}
    Locality = {}
    TimeWindow=[]

    #Capture nextSnap and initialize time_start of next snap
    snap = TS.next()
    if len(snap['TEXT'])<100:
      continue
    gmm  = GMM_clustering()
    #1. Virality
    Virality  = te(snap)
    #2. Locality
    gmm.Snap = snap
    gmm.build_clusters()
    Locality = GeographicalEntropy(snap,gmm.labels)
    #3. Volume
    Volume   = Count(snap)

    #HotWords= set(dict(Virality.most_common(HotWordSize)).keys())&set(dict(Broadcast.most_common(HotWordSize)).keys())&set(dict(Locality.most_common(HotWordSize)).keys())&set(dict(Prevalence.most_common(HotWordSize)).keys())&set(dict(Volume.most_common(HotWordSize)).keys())
    Words= list(set(dict(Virality.most_common(VocabSize)).keys())&set(dict(Locality.most_common(VocabSize)).keys())&set(dict(Volume.most_common(VocabSize)).keys()))
    if not Words:
      continue

    Virality = [Virality.get(key, 0) for key in Words]
    # Broadcast  = [Broadcast.get(key, 0) for key in HotWords]
    Locality = [Locality.get(key, 0) for key in Words]
    # Prevalence = [Prevalence.get(key, 0) for key in HotWords]
    Volume   = [Volume.get(key, 0) for key in Words]

    #4. TimeWindow
    TimeWindow= [snap['TimeWindow'][0]]*len(Words)
    #5. Words (the selected vocabulary list, used as-is)

    #Append to Dataframe
    df = df.append({'Virality':Virality,'Locality':Locality,'Volume':Volume,'Words':Words,'TimeWindow':TimeWindow},ignore_index=True)

  return df
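Each row of the returned DataFrame holds parallel lists (one entry per word) for a single time window. Below is a minimal flattening sketch, assuming only the columns built above, that turns it into one row per (word, window) pair; the helper name flatten_timeframe is illustrative, not part of the project.

import pandas as pd

def flatten_timeframe(df):
    # Expand the list-valued columns into one row per word and time window.
    rows = []
    for _, row in df.iterrows():
        for w, vi, lo, vo, tw in zip(row['Words'], row['Virality'],
                                     row['Locality'], row['Volume'],
                                     row['TimeWindow']):
            rows.append({'Words': w, 'Virality': vi, 'Locality': lo,
                         'Volume': vo, 'TimeWindow': tw})
    return pd.DataFrame(rows)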
Example #4
def newsworthy_words(db,timeWindow,**kwargs):

  print "COLLECTING TWEETS...."
  TS = TweetSnap(db=db,timeWindow = timeWindow,Placename2Geocode=False)
  print "COLLECTION OVER...."

  TIME_START = kwargs.get("TIME_START",time.gmtime(0))
  TIME_END   = kwargs.get("TIME_END",time.gmtime(time.time()))
  HotWordSize = kwargs.get("HotWordSize",25)


  if isinstance(TIME_START,str):
    TIME_START  = time.gmtime(time.mktime(time.strptime(TIME_START,"%d %b %H:%M %Z %Y")))
  if isinstance(TIME_END,str):
    TIME_END    = time.gmtime(time.mktime(time.strptime(TIME_END,"%d %b %H:%M %Z %Y")))

  TIME_DIFF   = time.mktime(TIME_START)  - time.mktime(TS.time_start)

  if TIME_DIFF>0:
    TS.move_on(TIME_DIFF-timeWindow)


  Day = {}

  while (TS.time_start<TIME_END and not TS.end):

    #Capture nextSnap and initialize time_start of next snap
    snap = TS.next()
    if len(snap['TEXT'])<100:
      continue
    gmm  = GMM_clustering()
    #1. Virality
    Virality  = PoissonRate(snap)
    #2. DeltaVolume
    #Broadcast = DeltaVolume(snap0,snap)
    #3. Locality
    gmm.Snap = snap
    gmm.build_clusters()
    Locality  = GeographicalEntropy(snap,gmm.labels)
    #4. Prevalence
    #Prevalence= Ttest(snap)
    #5. Count
    Volume     = Count(snap)

    #Prepare Dataframe
    #Union
    #HotWords= list(set(dict(Virality.most_common(HotWordSize)).keys()+dict(Broadcast.most_common(HotWordSize)).keys()+dict(Locality.most_common(HotWordSize)).keys()+dict(Prevalence.most_common(HotWordSize)).keys()+dict(Volume.most_common(HotWordSize)).keys()))
    #Intersection
    #print "Simmering words"
    #print 'Virality',set(dict(Virality.most_common(HotWordSize)).keys())
    #print 'Broadcast',set(dict(Broadcast.most_common(HotWordSize)).keys())
    #print 'Locality',set(dict(Locality.most_common(HotWordSize)).keys())
    #print set(dict(Prevalence.most_common(HotWordSize)).keys())
    #print 'Volume',set(dict(Volume.most_common(HotWordSize)).keys())
    #print "*"*5

    #HotWords= set(dict(Virality.most_common(HotWordSize)).keys())&set(dict(Broadcast.most_common(HotWordSize)).keys())&set(dict(Locality.most_common(HotWordSize)).keys())&set(dict(Prevalence.most_common(HotWordSize)).keys())&set(dict(Volume.most_common(HotWordSize)).keys())
    HotWords= list(set(dict(Virality.most_common(HotWordSize)).keys())&set(dict(Locality.most_common(HotWordSize)).keys())&set(dict(Volume.most_common(HotWordSize)).keys()))
    if not HotWords:
      continue

    Virality = [Virality.get(key, 0) for key in HotWords]
    # Broadcast  = [Broadcast.get(key, 0) for key in HotWords]
    Locality = [Locality.get(key, 0) for key in HotWords]
    # Prevalence = [Prevalence.get(key, 0) for key in HotWords]
    Volume   = [Volume.get(key, 0) for key in HotWords]

    #scaler           = preprocessing.MinMaxScaler([0,100]).fit_transform
    #scaledVirality   = list(scaler(np.array([Virality]).T).flatten())
    # scaledBroadcast  = scaler(Broadcast)
    #scaledLocality   = list(scaler(np.array([Locality]).T).flatten())
    # scaledPrevalence = scaler(Prevalence)
    #scaledVolume     = list(scaler(np.array([Volume],dtype=np.float16).T).flatten())
    Score            = [vi+lo+vo for vi,lo,vo in zip(Virality,Locality,Volume)]

    df             = pd.DataFrame({'Words':HotWords,'Virality':Virality,'Locality':Locality,'Volume':Volume,'Score':Score})
    #df_scaled      = pd.DataFrame({'Words':HotWords,'Virality':scaledVirality,'Locality':scaledLocality,'Volume':scaledVolume,'Score':Score})

    Day['to'.join(snap['TimeWindow'])]=df

  return Day
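A hedged usage sketch: newsworthy_words returns a dict keyed by time window, each value a DataFrame with a combined Score column, so the top candidates per window can be read off by sorting. The db handle, window size, and date strings below are placeholders, and sort_values assumes a reasonably recent pandas.

# Hypothetical usage: rank each window's hot words by the combined Score.
day = newsworthy_words(db, timeWindow=3600,
                       TIME_START="27 Apr 14:00 UTC 2013",
                       TIME_END="27 Apr 20:00 UTC 2013")
for window, frame in day.items():
    top = frame.sort_values('Score', ascending=False).head(5)
    print(window)
    print(top[['Words', 'Virality', 'Locality', 'Volume', 'Score']])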
Example #5
File: NSWW.py  Project: shkr/tweet-event
  def SetFeatureTable(self):

    tokenize  = T_Tokenizer().tokenize
    self.Feature = {}
    k = -len(self.QueueStack)

    #Store locations
    ALL_LOC  = []
    WORD_LOC = {}
    C_Star_LOC = {}
    C_Star_Labels = {}

    #Get List of locations of all tweets Collected : ALL_LOC
    #Get List of locations where "word" appears in tweets posted after it was declared as an event
    #    : WORD_LOC[word]
    while k<0:
       ALL_LOC += self.QueueStack[k]['LOC']
       for order,text in enumerate(self.QueueStack[k]['TEXT']):
         for word,no in self.Candidates.items():
           if word in tokenize(text) and order>=no:
             WORD_LOC.setdefault(word,[]).append(self.QueueStack[k]['LOC'][order])

       k+=1

    #Global Clustering
    MakeCluster 	 	= GMM_clustering(components=range(3,8))
    MakeCluster.Snap = {'LOC':ALL_LOC}
    MakeCluster.build_clusters()
    #Input : ALL_LOC & Output : Global labels for locations of tweets
    GLOBAL_LABELS    = Counter(MakeCluster.labels)

    #Local Clustering for each cluster in lists
    for C_Star in GLOBAL_LABELS.keys():

      #Input : C_Star_LOC ; all tweet locations within the C_Star cluster
      C_Star_LOC[C_Star]    = [LOC for LOC, label in zip(ALL_LOC, MakeCluster.labels) if label == C_Star]
      if len(C_Star_LOC[C_Star])>=(self.MinWordSamples/3.0):
        MakeLocalCluster 	 	= GMM_clustering(components=range(2,min(8,int(self.MinWordSamples/3))))
        MakeLocalCluster.Snap = {'LOC':C_Star_LOC[C_Star]}
        MakeLocalCluster.build_clusters()

        #Output : C_Star_Labels ; labels for all tweet locations within the C_Star cluster
        C_Star_Labels[C_Star] = MakeLocalCluster.labels

    #Set GlobalEntropy and LocalEntropy for each Candidate word
    for word,no in self.Candidates.items():

      #Global entropy
      #1. Initialize to 0
      G_D_pq 		   = 0.0
      #2. List of all non-zero counts for global clusters where 'word' appears in a tweet
      WORD_LABELS   = Counter([MakeCluster.labels[ALL_LOC.index(LOC)] for LOC in WORD_LOC[word]])
      #3. Calculate entropy by summing up over all clusters
      for cl,number in WORD_LABELS.items():
          G_D_pq	+= -1*(number/float(GLOBAL_LABELS[cl]))*np.log2(number/float(GLOBAL_LABELS[cl]))
          #G_D_pq	+= -1*((number/sum(WORDLABELS))/float(GLOBAL_COUNTER[cl]/sum(GLOBAL_COUNTER)))*np.log2(number/float(GLOBAL_COUNTER[cl]))

      #Local entropy
      #1. Most populated cluster with 'word'
      C_Star					 = WORD_LABELS.most_common(1)[0][0]
      #2. List of all non-zero counts for local clusters (within C_Star) where 'word' appears in a tweet
      WORD_LOCAL_LABELS     = Counter([C_Star_Labels[C_Star][C_Star_LOC[C_Star].index(LOC)] for LOC in WORD_LOC[word] if LOC in C_Star_LOC[C_Star]])
      LOCAL_LABELS 		     = Counter( C_Star_Labels[C_Star] )
      #3. Calculate entropy by summing up over all local clusters
      L_D_pq		   = 0.0
      for cl,number in WORD_LOCAL_LABELS.items():
          L_D_pq	+= -1*(number/float(LOCAL_LABELS[cl]))*np.log2(number/float(LOCAL_LABELS[cl]))
          #L_D_pq	+= -1*((number/sum(WORD_LOCAL_COUNTER.values()))/float(LOCAL_ALL_COUNTER[cl]/sum(LOCAL_ALL_COUNTER.values())))*np.log2(number/float(LOCAL_ALL_COUNTER[cl]))

      self.Feature[word] = [G_D_pq,L_D_pq,self.GetPoissonRate(word,no)]
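SetFeatureTable precomputes [G_D_pq, L_D_pq, PoissonRate] for every candidate word, which is what FitSpatialEntropy in Example #1 reads back from self.Feature when TableON is set. A hypothetical call order, with names as used in the snippets above:

# detector.Candidates = {'earthquake': 12}   # word -> order index of its event tweet
# detector.SetFeatureTable()                 # fills detector.Feature for all candidates
# detector.TableON = True
# g_entropy, l_entropy = detector.FitSpatialEntropy('earthquake', no=-1)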