Beispiel #1
0
    def buildEventModel(self, keywordsTh, seedURLs):
        """Build the event model from the documents behind ``seedURLs``.

        Side effects: sets ``self.toksTFDFDic``, fills ``self.entities`` with
        dicts under the keys ``'LOCATION'``, ``'DATE'`` and ``'Disaster'``,
        and sets ``self.scalars`` to one ``self.getScalar`` result per key of
        ``self.entities``. Debug output is printed to stdout.

        Args:
            keywordsTh: Forwarded as the first argument of
                ``corpus.getIndicativeSentences``.
            seedURLs: Forwarded to the ``Collection`` constructor.
        """
        corpus = Collection(seedURLs)

        # NoTFDF
        self.toksTFDFDic = dict(corpus.getIndicativeWords())

        sortedImptSents = corpus.getIndicativeSentences(
            keywordsTh, self.intersectionTh)
        # Get Event Model
        eventModelInstances = eventUtils.getEventModelInsts(sortedImptSents)

        entityTypes = ('LOCATION', 'DATE', 'Disaster')
        for name in entityTypes:
            self.entities[name] = []
        for inst in eventModelInstances:
            # Two independent checks: with `elif`, the dates of an instance
            # that also carries a location were silently discarded.
            if 'LOCATION' in inst:
                self.entities['LOCATION'].extend(inst['LOCATION'])
            if 'DATE' in inst:
                self.entities['DATE'].extend(inst['DATE'])
            self.entities['Disaster'].extend(inst['Disaster'])

        # Each value is consumed below as a sliceable sequence of
        # (entity, frequency) pairs.
        entitiesFreq = {}
        for name in entityTypes:
            entitiesFreq[name] = self.getEntitiesFreq(self.entities[name])

        # Keep only four-digit numbers (years) and month names.
        # NOTE(review): 'jun', 'jul' and 'sep' are not in the abbreviation
        # list - confirm whether that is intentional.
        months = (
            'jan', 'feb', 'mar', 'apr', 'aug', 'sept', 'oct', 'nov', 'dec',
            'january', 'february', 'march', 'april', 'may', 'june', 'july',
            'august', 'september', 'october', 'november', 'december',
        )
        entitiesFreq['DATE'] = [
            (d, v) for d, v in entitiesFreq['DATE']
            if (d.isdigit() and len(d) == 4) or d.lower() in months
        ]

        # Keep at most five locations and five dates.
        topLocations = entitiesFreq['LOCATION'][:5]
        print(topLocations)
        self.entities['LOCATION'] = dict(topLocations)

        topDates = entitiesFreq['DATE'][:5]
        self.entities['DATE'] = dict(topDates)
        print(topDates)

        # Tokens of *all* locations and filtered dates (not only the top
        # five) are excluded from the 'Disaster' terms.
        locDate = [k for k, _ in entitiesFreq['LOCATION']
                   ] + [m for m, _ in entitiesFreq['DATE']]
        locDate = eventUtils.getTokens(' '.join(locDate))

        topToks = [
            tok for tok, _ in entitiesFreq['Disaster'] if tok not in locDate
        ]
        topToks = topToks[:self.topK]

        # Weight each kept term by its score from getIndicativeWords().
        self.entities['Disaster'] = dict(
            (tok, self.toksTFDFDic[tok]) for tok in topToks)
        print(topToks)

        self.scalars = {}
        for name in self.entities:
            # NoTFDF
            weights = [1 + math.log(f) for f in self.entities[name].values()]
            self.scalars[name] = self.getScalar(weights)
Beispiel #2
0
    def buildEventModel_old(self, seedURLs):
        """Older event-model builder that keeps entities as sets.

        Side effects: fills ``self.entities`` under ``'LOCATION'``,
        ``'DATE'`` and ``'Disaster'`` and sets ``self.allEntities`` to the
        concatenation of every collection stored in ``self.entities``.
        Debug output is printed to stdout.

        Args:
            seedURLs: Forwarded to the ``Collection`` constructor.
        """
        corpus = Collection(seedURLs)
        sortedToksTFDF = corpus.getIndicativeWords()
        print(sortedToksTFDF)
        sortedImptSents = corpus.getIndicativeSentences(
            self.topK, self.intersectionTh)
        # Get Event Model
        eventModelInstances = eventUtils.getEventModelInsts(sortedImptSents)

        self.entities['LOCATION'] = []
        self.entities['DATE'] = []
        for inst in eventModelInstances:
            # Two independent checks: with `elif`, the dates of an instance
            # that also carries a location were silently discarded.
            if 'LOCATION' in inst:
                self.entities['LOCATION'].extend(inst['LOCATION'])
            if 'DATE' in inst:
                self.entities['DATE'].extend(inst['DATE'])

        for name in ('LOCATION', 'DATE'):
            freqs = eventUtils.getFreq(self.entities[name])
            ranked = eventUtils.getSorted(freqs.items(), 1)
            names = [entity for entity, _ in ranked]
            # NOTE(review): the list is cut to 3 entries only when it is
            # longer than self.topK (the original cut to topK is commented
            # out in history) - confirm this mix is intended.
            if self.topK < len(names):
                names = names[:3]
            self.entities[name] = set(names)

        for name in ('LOCATION', 'DATE'):
            self.entities[name] = self.getUniqueEntities(self.entities[name])

        # Tokens that belong to a location or date are excluded from the
        # 'Disaster' terms.
        locDate = list(self.entities['LOCATION']) + list(self.entities['DATE'])
        locDate = eventUtils.getTokens(' '.join(locDate))

        topToks = [tok for tok, _ in sortedToksTFDF if tok not in locDate]
        self.entities['Disaster'] = set(topToks[:self.topK])

        self.allEntities = []
        for name in self.entities:
            self.allEntities.extend(self.entities[name])

        print(self.allEntities)
Beispiel #3
0
			ntokEntsList.extend(ps)
		else:
			ntokEntsList.append(s)
	print ntokEntsList
	print '--------------'
	print toks
	for k in toksTFDF:
		if k not in ntokEntsList:
			filteredToksTFDF.append((k,toksTFDF[k]))
	'''
	
	# Get Indicative Sentences	
	sortedImptSents = utils.getIndicativeSents(sortedToksTFDF,topK,intersectionTh)
	
	# Get Event Model
	eventModelInstances = utils.getEventModelInsts(sortedImptSents)
	
	rs = "<tr>"
	re = "</tr>"
	outputs = "<td>"
	outpute = "</td>"
	wordsOutput = "<tr><td>Frequent Words (term Frequency)</td><td>Important Words (term Freq * Doc Freq)</td></tr>"
	for i in range(topK):
		wordsOutput += rs + outputs + str(sortedTokensFreqs[i]) + outpute + outputs + str(sortedToksTFDF[i]) + outpute + re
	
	sents_ents = "<tr><td>Important Sentences</td><td>Named Entities</td></tr>"
	for i in range(len(sortedImptSents)):
		sents_ents += rs + outputs + str(sortedImptSents[i]) + outpute + outputs + str(eventModelInstances[i]) + outpute + re

	print wordsOutput
	print "<br>============<br>"
Beispiel #4
0
 def buildEventModel_old(self,seedURLs):
     """Older event-model builder that keeps entities as sets.

     Side effects: fills ``self.entities`` under ``'LOCATION'``, ``'DATE'``
     and ``'Disaster'`` and sets ``self.allEntities`` to the concatenation
     of every collection stored in ``self.entities``. Debug output is
     printed to stdout.

     Args:
         seedURLs: Forwarded to the ``Collection`` constructor.
     """
     corpus = Collection(seedURLs)
     sortedToksTFDF = corpus.getIndicativeWords()
     print(sortedToksTFDF)
     sortedImptSents = corpus.getIndicativeSentences(self.topK, self.intersectionTh)
     # Get Event Model
     eventModelInstances = eventUtils.getEventModelInsts(sortedImptSents)

     self.entities['LOCATION'] = []
     self.entities['DATE'] = []
     for inst in eventModelInstances:
         # Two independent checks: with `elif`, the dates of an instance
         # that also carries a location were silently discarded.
         if 'LOCATION' in inst:
             self.entities['LOCATION'].extend(inst['LOCATION'])
         if 'DATE' in inst:
             self.entities['DATE'].extend(inst['DATE'])

     for name in ('LOCATION', 'DATE'):
         freqs = eventUtils.getFreq(self.entities[name])
         ranked = eventUtils.getSorted(freqs.items(), 1)
         names = [entity for entity, _ in ranked]
         # NOTE(review): the list is cut to 3 entries only when it is longer
         # than self.topK - confirm this mix is intended.
         if self.topK < len(names):
             names = names[:3]
         self.entities[name] = set(names)

     for name in ('LOCATION', 'DATE'):
         self.entities[name] = self.getUniqueEntities(self.entities[name])

     # Tokens that belong to a location or date are excluded from the
     # 'Disaster' terms.
     locDate = list(self.entities['LOCATION']) + list(self.entities['DATE'])
     locDate = eventUtils.getTokens(' '.join(locDate))

     topToks = [tok for tok, _ in sortedToksTFDF if tok not in locDate]
     self.entities['Disaster'] = set(topToks[:self.topK])

     self.allEntities = []
     for name in self.entities:
         self.allEntities.extend(self.entities[name])

     print(self.allEntities)
Beispiel #5
0
 def buildEventModel(self,keywordsTh, seedURLs):
     """Build the event model from the documents behind ``seedURLs``.

     Side effects: sets ``self.toksTFDFDic``, fills ``self.entities`` with
     dicts under ``'LOCATION'``, ``'DATE'`` and ``'Disaster'``, and sets
     ``self.scalars`` to one ``self.getScalar`` result per key of
     ``self.entities``. Debug output is printed to stdout.

     Args:
         keywordsTh: Forwarded as the first argument of
             ``corpus.getIndicativeSentences``.
         seedURLs: Forwarded to the ``Collection`` constructor.
     """
     corpus = Collection(seedURLs)

     # NoTFDF
     sortedToksTFDF = corpus.getIndicativeWords()
     self.toksTFDFDic = dict(sortedToksTFDF)
     print(sortedToksTFDF)

     sortedImptSents = corpus.getIndicativeSentences(keywordsTh, self.intersectionTh)
     # Get Event Model
     eventModelInstances = eventUtils.getEventModelInsts(sortedImptSents)

     entityTypes = ('LOCATION', 'DATE', 'Disaster')
     for name in entityTypes:
         self.entities[name] = []
     for inst in eventModelInstances:
         # Two independent checks: with `elif`, the dates of an instance
         # that also carries a location were silently discarded.
         if 'LOCATION' in inst:
             self.entities['LOCATION'].extend(inst['LOCATION'])
         if 'DATE' in inst:
             self.entities['DATE'].extend(inst['DATE'])
         self.entities['Disaster'].extend(inst['Disaster'])

     # Each value is consumed below as a sliceable sequence of
     # (entity, frequency) pairs.
     entitiesFreq = {}
     for name in entityTypes:
         entitiesFreq[name] = self.getEntitiesFreq(self.entities[name])

     # Keep only four-digit numbers (years) and full month names.
     months = ('january', 'february', 'march', 'april', 'may', 'june',
               'july', 'august', 'september', 'october', 'november',
               'december')
     entitiesFreq['DATE'] = [
         (d, v) for d, v in entitiesFreq['DATE']
         if (d.isdigit() and len(d) == 4) or d.lower() in months
     ]

     # Keep at most five locations and five dates.
     topLocations = entitiesFreq['LOCATION'][:5]
     print(topLocations)
     self.entities['LOCATION'] = dict(topLocations)

     topDates = entitiesFreq['DATE'][:5]
     self.entities['DATE'] = dict(topDates)
     print(topDates)

     # Only the first two locations, but every filtered date, contribute
     # tokens that are excluded from the 'Disaster' terms.
     locDate = [k for k, _ in entitiesFreq['LOCATION'][:2]] + [m for m, _ in entitiesFreq['DATE']]
     locDate = eventUtils.getTokens(' '.join(locDate))

     topToks = [tok for tok, _ in entitiesFreq['Disaster'] if tok not in locDate]
     topToks = topToks[:self.topK]

     # Weight each kept term by its score from getIndicativeWords().
     self.entities['Disaster'] = dict((tok, self.toksTFDFDic[tok]) for tok in topToks)
     print(topToks)

     self.scalars = {}
     for name in self.entities:
         # NoTFDF
         weights = [1 + math.log(f) for f in self.entities[name].values()]
         self.scalars[name] = self.getScalar(weights)
 def buildEventModel(self, seedURLs):
     """Build the event model from the documents behind ``seedURLs``.

     Side effects: sets ``self.toksDic``, fills ``self.entities`` with dicts
     under ``'LOCATION'``, ``'DATE'`` and ``'Topic'``, calls
     ``self.calculateWeights()``, then replaces every entity weight with 1
     and sets ``self.scalars`` to one ``self.getScalar`` result per key of
     ``self.entities``. Debug output is printed to stdout.

     Args:
         seedURLs: Forwarded to the ``Collection`` constructor.
     """
     corpus = Collection(seedURLs)

     # NoTFDF
     corpus.getIndicativeWords('TF')
     self.toksDic = dict(corpus.indicativeWords)

     sortedImptSents = corpus.getIndicativeSentences(3 * self.topK, self.intersectionTh)
     for sent in sortedImptSents[:self.topK]:
         print(sent)
     # Get Event Model
     eventModelInstances = eventUtils.getEventModelInsts(sortedImptSents)

     entityTypes = ('LOCATION', 'DATE', 'Topic')
     for name in entityTypes:
         self.entities[name] = []
     for inst in eventModelInstances:
         if 'LOCATION' in inst:
             self.entities['LOCATION'].extend(inst['LOCATION'])
         if 'DATE' in inst:
             self.entities['DATE'].extend(inst['DATE'])
         self.entities['Topic'].extend(inst['Topic'])

     # Each value is consumed below as a sliceable sequence of
     # (entity, frequency) pairs.
     entitiesFreq = {}
     for name in entityTypes:
         entitiesFreq[name] = self.getEntitiesFreq(self.entities[name])

     # Keep only four-digit numbers (years) and month names.
     # NOTE(review): 'jun', 'jul' and 'sep' are not in the abbreviation
     # list - confirm whether that is intentional.
     months = ('jan', 'feb', 'mar', 'apr', 'aug', 'sept', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december')
     entitiesFreq['DATE'] = [
         (d, v) for d, v in entitiesFreq['DATE']
         if (d.isdigit() and len(d) == 4) or d.lower() in months
     ]

     topLocations = entitiesFreq['LOCATION'][:self.topK]
     print(topLocations)
     self.entities['LOCATION'] = dict(topLocations)

     # Tokens of the kept locations and of every filtered date are excluded
     # from the 'Topic' terms. list(...) instead of `.keys() + [...]` so the
     # concatenation also works where keys() is a view.
     locDate = list(self.entities['LOCATION']) + [m for m, _ in entitiesFreq['DATE']]
     locDate = eventUtils.getTokens(' '.join(locDate))

     topDates = entitiesFreq['DATE'][:self.topK]
     self.entities['DATE'] = dict(topDates)
     print(topDates)

     topToks = [tok for tok, _ in entitiesFreq['Topic'] if tok not in locDate]
     topToks = topToks[:self.topK]

     topToksDic = dict((tok, self.toksDic[tok]) for tok in topToks)
     self.entities['Topic'] = topToksDic
     print(topToksDic)

     # Calculate weights
     self.calculateWeights()

     # Replace every entity weight with 1. The original reused the loop
     # variable `k` inside a list comprehension; on Python 2 that variable
     # leaks, so the result was stored under the last entity string instead
     # of the entity type and the weights were never actually reset.
     unitWeights = {}
     for name in self.entities:
         unitWeights[name] = dict.fromkeys(self.entities[name], 1)
     for name in unitWeights:
         self.entities[name] = unitWeights[name]

     self.scalars = {}
     for name in self.entities:
         # NoTFDF
         weights = [1 + math.log(f) for f in self.entities[name].values()]
         self.scalars[name] = self.getScalar(weights)
    def buildEventModel(self, keywordsTh, seedURLs):
        """Build the event model from the documents behind ``seedURLs``.

        Side effects: sets ``self.toksTFDFDic``, fills ``self.entities`` with
        dicts under the keys ``"LOCATION"``, ``"DATE"`` and ``"Disaster"``,
        and sets ``self.scalars`` to one ``self.getScalar`` result per key of
        ``self.entities``. Debug output is printed to stdout.

        Args:
            keywordsTh: Forwarded as the first argument of
                ``corpus.getIndicativeSentences``.
            seedURLs: Forwarded to the ``Collection`` constructor.
        """
        corpus = Collection(seedURLs)

        # NoTFDF
        self.toksTFDFDic = dict(corpus.getIndicativeWords())

        sortedImptSents = corpus.getIndicativeSentences(keywordsTh, self.intersectionTh)
        # Get Event Model
        eventModelInstances = eventUtils.getEventModelInsts(sortedImptSents)

        entityTypes = ("LOCATION", "DATE", "Disaster")
        for name in entityTypes:
            self.entities[name] = []
        for inst in eventModelInstances:
            # Two independent checks: with `elif`, the dates of an instance
            # that also carries a location were silently discarded.
            if "LOCATION" in inst:
                self.entities["LOCATION"].extend(inst["LOCATION"])
            if "DATE" in inst:
                self.entities["DATE"].extend(inst["DATE"])
            self.entities["Disaster"].extend(inst["Disaster"])

        # Each value is consumed below as a sliceable sequence of
        # (entity, frequency) pairs.
        entitiesFreq = {}
        for name in entityTypes:
            entitiesFreq[name] = self.getEntitiesFreq(self.entities[name])

        # Keep only four-digit numbers (years) and month names.
        # NOTE(review): "jun", "jul" and "sep" are not in the abbreviation
        # list - confirm whether that is intentional.
        months = (
            "jan",
            "feb",
            "mar",
            "apr",
            "aug",
            "sept",
            "oct",
            "nov",
            "dec",
            "january",
            "february",
            "march",
            "april",
            "may",
            "june",
            "july",
            "august",
            "september",
            "october",
            "november",
            "december",
        )
        entitiesFreq["DATE"] = [
            (d, v)
            for d, v in entitiesFreq["DATE"]
            if (d.isdigit() and len(d) == 4) or d.lower() in months
        ]

        # Keep at most five locations and five dates.
        topLocations = entitiesFreq["LOCATION"][:5]
        print(topLocations)
        self.entities["LOCATION"] = dict(topLocations)

        topDates = entitiesFreq["DATE"][:5]
        self.entities["DATE"] = dict(topDates)
        print(topDates)

        # Tokens of *all* locations and filtered dates (not only the top
        # five) are excluded from the "Disaster" terms.
        locDate = [k for k, _ in entitiesFreq["LOCATION"]] + [m for m, _ in entitiesFreq["DATE"]]
        locDate = eventUtils.getTokens(" ".join(locDate))

        topToks = [tok for tok, _ in entitiesFreq["Disaster"] if tok not in locDate]
        topToks = topToks[: self.topK]

        # Weight each kept term by its score from getIndicativeWords().
        self.entities["Disaster"] = dict((tok, self.toksTFDFDic[tok]) for tok in topToks)
        print(topToks)

        self.scalars = {}
        for name in self.entities:
            # NoTFDF
            weights = [1 + math.log(f) for f in self.entities[name].values()]
            self.scalars[name] = self.getScalar(weights)