Example No. 1
 def calculate_similarity_equalWeights_duplicate(self, doc):
     # Score a document against the event model: compare the document's
     # log-weighted term frequencies with the model's Topic, LOCATION, and
     # DATE entity frequencies, then average the three cosine-style scores.
     eDisDic = self.entities['Topic']

     # Re-key LOCATION and DATE entity frequencies by their stemmed forms
     # so they can be matched against the stemmed document tokens.
     locToks = eventUtils.getStemmedWords(self.entities['LOCATION'].keys())
     locDic = dict(zip(locToks, self.entities['LOCATION'].values()))

     dToks = eventUtils.getStemmedWords(self.entities['DATE'].keys())
     dDic = dict(zip(dToks, self.entities['DATE'].values()))

     tokens = eventUtils.getTokens(doc)
     tokensDic = eventUtils.getFreq(tokens)
     # Document vector norm under the 1 + log(tf) weighting scheme.
     wv = [1 + math.log(e) for e in tokensDic.values()]
     wvScalar = self.getScalar(wv)
     scores = []

     # Topic score: dot product of log-weighted frequencies, normalized by
     # both vector norms (cosine similarity).
     ksd = 0
     for i in tokensDic:
         if i in eDisDic:
             ksd += (1 + math.log(eDisDic[i])) * (1 + math.log(tokensDic[i]))
     if ksd > 0:
         ksd = float(ksd) / (self.scalars['Topic'] * wvScalar)
     else:
         # No topic overlap at all: reject the document outright.
         return 0
     scores.append(ksd)

     # LOCATION score.
     ksl = 0
     for i in tokensDic:
         if i in locDic:
             ksl += (1 + math.log(locDic[i])) * (1 + math.log(tokensDic[i]))
     if ksl > 0:
         ksl = float(ksl) / (self.scalars['LOCATION'] * wvScalar)
     scores.append(ksl)

     # DATE score.
     ks = 0
     for i in tokensDic:
         if i in dDic:
             ks += (1 + math.log(dDic[i])) * (1 + math.log(tokensDic[i]))
     if ks > 0:
         ks = float(ks) / (self.scalars['DATE'] * wvScalar)
     scores.append(ks)

     # Equal weights: simple average of the three entity scores.
     score = sum(scores) / 3.0
     return score
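At its core, the method above computes a cosine similarity under the 1 + log(tf) weighting scheme, once per entity type, and averages the three results. A minimal self-contained sketch of that core (the function name is hypothetical; it assumes self.getScalar is the Euclidean vector norm and eventUtils.getFreq a plain frequency count, which matches how they are used above):

import math

def log_tf_cosine(model_freq, doc_tokens):
    # Term frequencies of the document.
    doc_freq = {}
    for tok in doc_tokens:
        doc_freq[tok] = doc_freq.get(tok, 0) + 1
    # Dot product of the two vectors under 1 + log(tf) weighting.
    dot = 0.0
    for tok, tf in doc_freq.items():
        if tok in model_freq:
            dot += (1 + math.log(model_freq[tok])) * (1 + math.log(tf))
    if dot == 0:
        return 0.0
    # Normalize by both Euclidean norms (cosine similarity).
    model_norm = math.sqrt(sum((1 + math.log(f)) ** 2 for f in model_freq.values()))
    doc_norm = math.sqrt(sum((1 + math.log(f)) ** 2 for f in doc_freq.values()))
    return dot / (model_norm * doc_norm)

# A document sharing "earthquake" with the model scores above zero.
print(log_tf_cosine({'earthquake': 5, 'damage': 2}, 'earthquake hit the city'.split()))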
Example No. 2
    def calculate_similarity(self, doc):
        # Same scoring scheme as Example No. 1, but the topic terms come
        # from the "Disaster" entity list.
        eDisDic = self.entities["Disaster"]

        locToks = self.entities["LOCATION"].keys()
        locToks = eventUtils.getStemmedWords(locToks)
        locDic = dict(zip(locToks, self.entities["LOCATION"].values()))

        dToks = self.entities["DATE"].keys()
        dToks = eventUtils.getStemmedWords(dToks)
        dDic = dict(zip(dToks, self.entities["DATE"].values()))

        tokens = eventUtils.getTokens(doc)
        tokensDic = eventUtils.getFreq(tokens)
        wv = [1 + math.log(e) for e in tokensDic.values()]
        wvScalar = self.getScalar(wv)
        scores = []

        ksd = 0
        for i in tokensDic:
            if i in eDisDic:
                ksd += (1 + math.log(eDisDic[i])) * (1 + math.log(tokensDic[i]))
        if ksd > 0:
            ksd = float(ksd) / (self.scalars["Disaster"] * wvScalar)
        else:
            # No overlap with the disaster terms: reject the document.
            return 0
        scores.append(ksd)
        ksl = 0
        for i in tokensDic:
            if i in locDic:
                ksl += (1 + math.log(locDic[i])) * (1 + math.log(tokensDic[i]))
        if ksl > 0:
            ksl = float(ksl) / (self.scalars["LOCATION"] * wvScalar)
        scores.append(ksl)

        ks = 0
        for i in tokensDic:
            if i in dDic:
                ks += (1 + math.log(dDic[i])) * (1 + math.log(tokensDic[i]))
        if ks > 0:
            ks = float(ks) / (self.scalars["DATE"] * wvScalar)
        scores.append(ks)

        score = sum(scores) / 3.0
        return score
Example No. 3
 def buildProbEventModel(self, urlsList, topK):
     # Build a probabilistic event model from a list of URLs: fetch each
     # page, extract tokens, locations, and dates, and turn their
     # collection-level frequencies into smoothed probabilities.
     docsList = eventUtils.getWebpageText(urlsList)
     docsEntities = []
     docsEntitiesFreq = []
     entitiesProb = {}

     # Convert each doc into token, location, and date lists with their
     # corresponding frequency distributions.
     for doc in docsList:
         # Reset t on every iteration; carrying it over would reprocess the
         # previous document's text whenever a page has no 'text' field.
         t = ''
         if 'text' in doc:
             t = doc['text']
             if 'title' in doc:
                 t = doc['title'] + " " + t
         if t:
             ents = eventUtils.getEntities(t)[0]
             docEnt = {}
             docEnt['LOCATION'] = {}
             if 'LOCATION' in ents:
                 docEnt['LOCATION'] = ents['LOCATION']
             docEnt['DATE'] = {}
             if 'DATE' in ents:
                 docEnt['DATE'] = ents['DATE']
             toks = eventUtils.getTokens(t)
             docEnt['Topic'] = toks
             docsEntities.append(docEnt)
             
             docEntFreq = {}
             for k in docEnt:
                 docEntFreq[k] = eventUtils.getFreq(docEnt[k])
             docsEntitiesFreq.append(docEntFreq)
     
     # Aggregate per-document frequencies into collection-level frequencies
     # for each entity type (tokens, locations, dates).
     entitiesProb['LOCATION'] = {}
     entitiesProb['DATE'] = {}
     entitiesProb['Topic'] = {}
     
     for docEntFreq in docsEntitiesFreq:
         for entity in docEntFreq:
             for val in docEntFreq[entity]:
                 if val in entitiesProb[entity]:
                     entitiesProb[entity][val] += docEntFreq[entity][val]
                 else:
                     entitiesProb[entity][val] = docEntFreq[entity][val]
     
     # Convert frequencies to add-one (Laplace) smoothed probabilities:
     # p(item) = (freq + 1) / (vocabulary size + total frequency).
     for ent in entitiesProb:
         allvalsFreq = sum(entitiesProb[ent].values())
         denom = len(entitiesProb[ent]) + allvalsFreq
         for k in entitiesProb[ent]:
             entitiesProb[ent][k] = (1.0 + entitiesProb[ent][k]) / denom

     # Keep only the topK most probable items of each entity type.
     mle = self.getMLEEventEntities(entitiesProb, topK)
     for k in mle:
         print k, mle[k]
         
     
     self.probEvtModel = {}
     for k in mle:
         self.probEvtModel[k] = dict(mle[k])

     self.eDisDic = self.probEvtModel['Topic']

     # Re-key LOCATION and DATE probabilities by their stemmed forms for
     # matching against stemmed document tokens.
     locToks = eventUtils.getStemmedWords(self.probEvtModel['LOCATION'].keys())
     self.locDic = dict(zip(locToks, self.probEvtModel['LOCATION'].values()))

     dToks = eventUtils.getStemmedWords(self.probEvtModel['DATE'].keys())
     self.dDic = dict(zip(dToks, self.probEvtModel['DATE'].values()))

     return docsEntities, entitiesProb
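The probability estimate in the loop above is standard add-one (Laplace) smoothing: p(item) = (freq + 1) / (V + N), where V is the number of distinct items and N their total frequency. A standalone sketch of the same arithmetic (the function name is hypothetical):

def laplace_probs(freqs):
    # freqs maps item -> collection-level frequency.
    total = sum(freqs.values())   # N
    denom = len(freqs) + total    # V + N
    return dict((k, (1.0 + v) / denom) for k, v in freqs.items())

probs = laplace_probs({'earthquake': 6, 'flood': 3, 'rescue': 1})
# denom = 3 + 10 = 13, so p('earthquake') = 7/13, p('flood') = 4/13,
# p('rescue') = 2/13; the seen items sum to exactly 1.

Example No. 5 below additionally assigns unseen items a default of 1/denom, trading strict normalization for robustness to out-of-vocabulary terms.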
Example No. 4
 def calculate_similarity(self, doc):
     # Weighted variant of the scorer: each entity score is multiplied by
     # its weight from self.weights instead of taking a plain average, and
     # a zero Topic score no longer rejects the document outright.
     topicDic = self.entities['Topic']

     locToks = eventUtils.getStemmedWords(self.entities['LOCATION'].keys())
     locDic = dict(zip(locToks, self.entities['LOCATION'].values()))

     dToks = eventUtils.getStemmedWords(self.entities['DATE'].keys())
     dDic = dict(zip(dToks, self.entities['DATE'].values()))

     tokens = eventUtils.getTokens(doc)
     tokensDic = eventUtils.getFreq(tokens)
     wv = [1 + math.log(e) for e in tokensDic.values()]
     wvScalar = self.getScalar(wv)
     scores = []

     # Topic score.
     ksd = 0
     for i in tokensDic:
         if i in topicDic:
             ksd += (1 + math.log(topicDic[i])) * (1 + math.log(tokensDic[i]))
     if ksd != 0:
         ksd = float(ksd) / (self.scalars['Topic'] * wvScalar)
     scores.append(ksd * self.weights['Topic'])

     # LOCATION score.
     ksl = 0
     for i in tokensDic:
         if i in locDic:
             ksl += (1 + math.log(locDic[i])) * (1 + math.log(tokensDic[i]))
     if ksl != 0:
         ksl = float(ksl) / (self.scalars['LOCATION'] * wvScalar)
     scores.append(ksl * self.weights['LOCATION'])

     # DATE score.
     ks = 0
     for i in tokensDic:
         if i in dDic:
             ks += (1 + math.log(dDic[i])) * (1 + math.log(tokensDic[i]))
     if ks != 0:
         ks = float(ks) / (self.scalars['DATE'] * wvScalar)
     scores.append(ks * self.weights['DATE'])

     # Weighted sum rather than the equal-weights average.
     score = sum(scores)
     return score
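The only difference from the equal-weights variant is the combination step: a weighted sum over the per-entity scores instead of a plain average. A small sketch isolating that step (names and values are illustrative):

def combine_scores(scores, weights=None):
    # scores: per-entity cosine scores, e.g. {'Topic': 0.4, ...}.
    if weights is None:
        # Equal-weights variant (Examples No. 1 and 2): plain average.
        return sum(scores.values()) / float(len(scores))
    # Weighted variant (this example): weighted sum.
    return sum(scores[k] * weights[k] for k in scores)

s = {'Topic': 0.4, 'LOCATION': 0.1, 'DATE': 0.05}
print(combine_scores(s))   # (0.4 + 0.1 + 0.05) / 3 = 0.1833...
print(combine_scores(s, {'Topic': 0.7, 'LOCATION': 0.2, 'DATE': 0.1}))   # 0.305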
Example No. 5
    def buildProbEventModel(self, urlsList, topK):
        # Variant of buildProbEventModel backed by defaultdicts, so unseen
        # items fall back to a smoothed default probability.
        docsList = eventUtils.getWebpageText_NoURLs(urlsList)
        # Skip any page whose text could not be retrieved.
        docsList = [d for d in docsList if 'text' in d]
        docsEntities = []
        docsEntitiesFreq = []
        entitiesFreq = {}

        # Convert each doc into token, location, and date lists with their
        # corresponding frequency distributions.
        for doc in docsList:
            t = doc['text']
            ents = eventUtils.getEntities(t)[0]
            docEnt = {}
            docEnt['LOCATION'] = {}
            if 'LOCATION' in ents:
                docEnt['LOCATION'] = ents['LOCATION']
            docEnt['DATE'] = {}
            if 'DATE' in ents:
                docEnt['DATE'] = ents['DATE']
            toks = eventUtils.getTokens(t)
            docEnt['Topic'] = toks
            docsEntities.append(docEnt)

            docEntFreq = {}
            for k in docEnt:
                docEntFreq[k] = eventUtils.getFreq(docEnt[k])
            docsEntitiesFreq.append(docEntFreq)

        # Aggregate per-document frequencies into collection-level
        # frequencies for each entity type (tokens, locations, dates).
        entitiesFreq['LOCATION'] = defaultdict(float)
        entitiesFreq['DATE'] = defaultdict(float)
        entitiesFreq['Topic'] = defaultdict(float)

        for docEntFreq in docsEntitiesFreq:
            for entity in docEntFreq:
                for val in docEntFreq[entity]:
                    entitiesFreq[entity][val] += docEntFreq[entity][val]
        # Add-one (Laplace) smoothed probabilities; unseen items fall back
        # to a default of 1 / denom.
        self.defaultProb = {}
        entitiesProb = {}
        for ent in entitiesFreq:
            allvalsFreq = sum(entitiesFreq[ent].values())
            denom = len(entitiesFreq[ent]) + allvalsFreq
            self.defaultProb[ent] = 1.0 / denom
            # denom is bound as a default argument: a bare lambda would
            # close over the loop variable, leaving every entity type with
            # the last iteration's denom.
            entitiesProb[ent] = defaultdict(lambda denom=denom: 1.0 / denom)
            for k in entitiesFreq[ent]:
                # Smooth the raw collection frequency for this item.
                entitiesProb[ent][k] = (1.0 + entitiesFreq[ent][k]) / denom

        # Keep the topK most probable items of each entity type; lookups of
        # unseen items return that entity's default probability.
        mle = self.getMLEEventEntities(entitiesProb, topK)
        for k in mle:
            print k, mle[k]

        self.probEvtModel = {}
        for k in mle:
            # k is likewise bound as a default argument to avoid late
            # binding.
            self.probEvtModel[k] = defaultdict(lambda k=k: self.defaultProb[k])
            for e, v in mle[k]:
                self.probEvtModel[k][e] = v

        # Re-key LOCATION and DATE probabilities by their stemmed forms,
        # keeping the smoothed default for unseen items.
        locToks = eventUtils.getStemmedWords(self.probEvtModel['LOCATION'].keys())
        locDic = defaultdict(lambda: self.defaultProb['LOCATION'])
        for k, v in zip(locToks, self.probEvtModel['LOCATION'].values()):
            locDic[k] = v
        self.probEvtModel['LOCATION'] = locDic

        dToks = eventUtils.getStemmedWords(self.probEvtModel['DATE'].keys())
        dDic = defaultdict(lambda: self.defaultProb['DATE'])
        # The DATE dict must be keyed by the stemmed DATE tokens; zipping
        # the LOCATION tokens here would mis-pair keys and values.
        for k, v in zip(dToks, self.probEvtModel['DATE'].values()):
            dDic[k] = v
        self.probEvtModel['DATE'] = dDic

        return docsEntities, entitiesProb
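The defaultdict factories are the subtle part of this variant: a bare lambda: 1.0 / denom closes over the loop variable, so once the loop finishes, every entity type's factory would see the final denom (and likewise lambda: self.defaultProb[k] would see the final k). Binding the variable as a default argument, as done above, freezes its value per iteration. A minimal demonstration of the pitfall:

from collections import defaultdict

denoms = {'Topic': 20, 'LOCATION': 10}
bad, good = {}, {}
for ent, denom in denoms.items():
    bad[ent] = defaultdict(lambda: 1.0 / denom)               # late binding
    good[ent] = defaultdict(lambda denom=denom: 1.0 / denom)  # value frozen

# Both "bad" factories use whichever denom the loop saw last.
assert bad['Topic']['unseen'] == bad['LOCATION']['unseen']
# The "good" factories each keep their own denom.
assert good['Topic']['unseen'] == 1.0 / 20
assert good['LOCATION']['unseen'] == 1.0 / 10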