Example #1
 def getWordsFrequencies(self):
     for d in self.documents:
         w = d.getWords()
         self.words.extend(w)
     f = utils.getFreq(self.words)
     tokensFreqs = f.items()
     self.wordsFrequencies = utils.getSorted(tokensFreqs,1)
     return self.wordsFrequencies
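The `utils` module never appears in these examples. A minimal sketch of how `getFreq` and `getSorted` are assumed to behave, inferred from their use above (a token-to-count map, and a sort of (token, count) pairs on the count column, most frequent first):

# Hypothetical stand-ins for the unshown utils helpers, inferred from usage.
from collections import Counter

def getFreq(tokens):
    # token -> number of occurrences
    return dict(Counter(tokens))

def getSorted(pairs, keyIndex):
    # sort (token, count) pairs on the element at keyIndex, descending
    return sorted(pairs, key=lambda p: p[keyIndex], reverse=True)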
Example #3
 def calculate_similarity_equalWeights_duplicate(self,doc):
     eDisDic = self.entities['Topic']
     
     locToks = self.entities['LOCATION'].keys()
     locToks = eventUtils.getStemmedWords(locToks)
     locDic = dict(zip(locToks,self.entities['LOCATION'].values()))
     
     dToks = self.entities['DATE'].keys()
     dToks = eventUtils.getStemmedWords(dToks)
     dDic = dict(zip(dToks,self.entities['DATE'].values()))
     
     tokens = eventUtils.getTokens(doc)
     tokensDic = eventUtils.getFreq(tokens)
     wv = [1+math.log(e) for e in tokensDic.values()]
     wvScalar = self.getScalar(wv)
     scores = []
     
     ksd = 0    
     #interst = 0
     for i in tokensDic:
         if i in eDisDic:
             ksd += (1+math.log(eDisDic[i]))* (1+math.log(tokensDic[i]))
             #interst +=1
     if ksd > 0:
         ksd = float(ksd)/(self.scalars['Topic'] * wvScalar)
     else:
         ksd = 0
     if ksd == 0:
         return 0
     #if interst < 2:
         #return 0
     scores.append(ksd)
     ksl = 0    
     for i in tokensDic:
         if i in locDic:
             ksl += (1+math.log(locDic[i]))* (1+math.log(tokensDic[i]))
     if ksl > 0:
         ksl = float(ksl)/(self.scalars['LOCATION'] * wvScalar)
         
     else:
         ksl = 0
     scores.append(ksl)
     
     ks = 0    
     for i in tokensDic:
         if i in dDic:
             ks += (1+math.log(dDic[i]))* (1+math.log(tokensDic[i]))
     if ks > 0:
         ks = float(ks)/(self.scalars['DATE'] * wvScalar)
         
     else:
         ks = 0
     scores.append(ks)
     
     score = sum(scores) / 3.0
     return score
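The three loops above repeat one pattern: a cosine between (1 + log f) weight vectors, restricted to shared tokens. A sketch of that pattern, assuming `getScalar` (defined elsewhere on the class) is the Euclidean norm:

import math

def getScalar(wv):
    # Assumed: Euclidean norm of a weight vector.
    return math.sqrt(sum(w * w for w in wv))

def logCosine(modelDic, tokensDic, modelScalar, docScalar):
    # Dot product of (1 + log f) weights over shared tokens,
    # normalized by the two precomputed vector norms.
    dot = sum((1 + math.log(modelDic[t])) * (1 + math.log(tokensDic[t]))
              for t in tokensDic if t in modelDic)
    return dot / (modelScalar * docScalar) if dot > 0 else 0.0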
Example #4
 def calculate_score(self, doc):
     #sims=[]
     docWords = getTokens(doc)
     docTF = getFreq(docWords)
     sim = self.cosSim( docTF)
     
     if sim >= self.relevanceth:
         return [1,sim]
     else:
         return [0,sim]
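`cosSim` is not shown in this example. A plausible sketch, assuming the classifier holds a single topic term-frequency vector (`self.topicTF` is a hypothetical name, not from the original):

import math

def cosSim(self, docTF):
    # Hypothetical: cosine similarity between the document's term
    # frequencies and a stored topic vector self.topicTF.
    dot = sum(docTF[w] * self.topicTF[w] for w in docTF if w in self.topicTF)
    docNorm = math.sqrt(sum(v * v for v in docTF.values()))
    topicNorm = math.sqrt(sum(v * v for v in self.topicTF.values()))
    if docNorm == 0 or topicNorm == 0:
        return 0.0
    return dot / (docNorm * topicNorm)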
Example #5
 def buildIndex(self):
     self.docs_bow = [eventUtils.getFreq(d.words) for d in self.coll.documents if d.text]
     for doc_bow in self.docs_bow:
         for w in doc_bow:
             if w in self.index:
                 self.index[w]['docs'].append(doc_bow[w])
             else:
                 self.index[w] = {}
                 self.index[w]['docs'] = [doc_bow[w]]
     for w in self.index:
         self.index[w]['docFreq'] = len(self.index[w]['docs'])
         self.index[w]['collFreq'] = sum(self.index[w]['docs'])
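`buildIndex` leaves `self.index` mapping each word to its per-document counts plus two aggregates. A toy illustration:

# Toy illustration of the structure buildIndex produces:
# docs_bow = [{'fire': 2, 'la': 1}, {'fire': 1}]
# index['fire'] == {'docs': [2, 1], 'docFreq': 2, 'collFreq': 3}
# index['la']   == {'docs': [1], 'docFreq': 1, 'collFreq': 1}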
Example #6
 def buildVSMClassifier(self,posFile,vsmClassifierFileName,th,topK):
     
     try:
         classifierFile = open(vsmClassifierFileName,"rb")
         self.classifier = pickle.load(classifierFile)
         classifierFile.close()
     except:
         docs = []
         f = open(posFile,'r')
         for url in f:
             url = url.strip()
             d = Document(url)
             if d and d.text:
                 docs.append(d)
         f.close()
         '''
         docsTF = []
         for d in docs:
             wordsFreq = getFreq(d.getWords())
             docsTF.append(wordsFreq)
         self.classifier = VSMClassifier(docsTF,th)
         '''
         docsTF = []
         vocabTFDic = {}
         for d in docs:
             wordsFreq = getFreq(d.getWords())
             #docsTF.append(wordsFreq)
             for w in wordsFreq:
                 if w in vocabTFDic:
                     vocabTFDic[w] += wordsFreq[w]
                 else:
                     vocabTFDic[w] = wordsFreq[w]
         
         vocabSorted = getSorted(vocabTFDic.items(), 1)
         topVocabDic = dict(vocabSorted[:topK])
         #topVocabDic = vocabTFDic
         
         ndocsTF = []
         '''
         for d in docsTF:
             ndocTF = {}
             for k in topVocabDic:
                 if k in d:
                     ndocTF[k] = d[k]
                 else: 
                     ndocTF[k] = 1/math.e
             ndocsTF.append(ndocTF)
          '''   
         
         self.classifier = VSMClassifier(topVocabDic,ndocsTF,th)
         classifierFile = open(vsmClassifierFileName,"wb")
         pickle.dump(self.classifier,classifierFile)
         classifierFile.close()
Example #7
def extractDatesLocs(urls):
    webpagesTxt = eventUtils.getWebpageText_NoURLs(urls)
    txts = [
        webpageTxt['text'] for webpageTxt in webpagesTxt
        if 'text' in webpageTxt
    ]
    webpageEnts = eventUtils.getEntities(txts)
    #webpageEnts = eventUtils.getEntities(webpageTxt[0]['text'])
    #print webpageEnts[0]['LOCATION']
    #print webpageEnts[0]['DATE']

    locs = []
    dates = []

    for wbE in webpageEnts:
        #print wbE['LOCATION']
        #print wbE['DATE']
        #print '-----------------------'
        if 'LOCATION' in wbE:
            locs.extend(wbE['LOCATION'])
        if 'DATE' in wbE:
            dates.extend(wbE['DATE'])

    freqLocs = eventUtils.getFreq(locs)
    freqDates = eventUtils.getFreq(dates)
    '''
    freqDates_norm = normalizeDates(freqDates)
    sortedDates = eventUtils.getSorted(freqDates_norm.iteritems(),1)
    print sortedDates
    print "Most Frequent Date (i.e. most probably event's date) is: ", sortedDates[0]
    print '________________________________'
    #print freqDates_norm
    '''
    freqLocs_norm = normalizeLocs(freqLocs)
    sortedLocs = eventUtils.getSorted(freqLocs_norm.iteritems(), 1)
    print sortedLocs
    print "Most Frequent Location (i.e. most probably event's location) is: ", sortedLocs[
        0]
    #print freqLocs_norm
    return
Example #8
    def calculate_similarity(self, doc):
        eDisDic = self.entities["Disaster"]

        locToks = self.entities["LOCATION"].keys()
        locToks = eventUtils.getStemmedWords(locToks)
        locDic = dict(zip(locToks, self.entities["LOCATION"].values()))

        dToks = self.entities["DATE"].keys()
        dToks = eventUtils.getStemmedWords(dToks)
        dDic = dict(zip(dToks, self.entities["DATE"].values()))

        tokens = eventUtils.getTokens(doc)
        tokensDic = eventUtils.getFreq(tokens)
        wv = [1 + math.log(e) for e in tokensDic.values()]
        wvScalar = self.getScalar(wv)
        scores = []

        ksd = 0
        for i in tokensDic:
            if i in eDisDic:
                ksd += (1 + math.log(eDisDic[i])) * (1 + math.log(tokensDic[i]))
        if ksd > 0:
            ksd = float(ksd) / (self.scalars["Disaster"] * wvScalar)
        else:
            ksd = 0
        if ksd == 0:
            return 0
        scores.append(ksd)
        ksl = 0
        for i in tokensDic:
            if i in locDic:
                ksl += (1 + math.log(locDic[i])) * (1 + math.log(tokensDic[i]))
        if ksl > 0:
            ksl = float(ksl) / (self.scalars["LOCATION"] * wvScalar)

        else:
            ksl = 0
        scores.append(ksl)

        ks = 0
        for i in tokensDic:
            if i in dDic:
                ks += (1 + math.log(dDic[i])) * (1 + math.log(tokensDic[i]))
        if ks > 0:
            ks = float(ks) / (self.scalars["DATE"] * wvScalar)

        else:
            ks = 0
        scores.append(ks)

        score = sum(scores) / 3.0
        return score
Example #9
 def getEntitiesFreq(self,entityList):
     el = [e.lower() for e in entityList]
     
     entitiesWords = []
     for w in el:
         p = w.split()
         if len(p)>1:
             entitiesWords.extend(p)
         else:
             entitiesWords.append(w)
     s = eventUtils.getFreq(entitiesWords)
     s = eventUtils.getSorted(s.items(), 1)
     return s
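A small usage example: multi-word entities are split into words before counting, so words repeated across entities accumulate:

# getEntitiesFreq(['New York', 'Paris', 'new york'])
# -> [('new', 2), ('york', 2), ('paris', 1)]  (ties in arbitrary order)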
Example #12
 def buildVSMClassifier_OneTargetTopicVector(self,posFile,vsmClassifierFileName,th,topK):
     
     try:
         classifierFile = open(vsmClassifierFileName,"rb")
         self.classifier = pickle.load(classifierFile)
         classifierFile.close()
     except:
         docs = []
         f = open(posFile,'r')
         for url in f:
             url = url.strip()
             d = Document(url)
             if d and d.text:
                 docs.append(d)
         f.close()
         '''
         docsTF = []
         for d in docs:
             wordsFreq = getFreq(d.getWords())
             docsTF.append(wordsFreq)
         self.classifier = VSMClassifier(docsTF,th)
         '''
         docsTF = []
         vocabTFDic = {}
         n = len(docs)
         for d in docs:
             wordsFreq = getFreq(d.getWords())
             #docsTF.append(wordsFreq)
             for w in wordsFreq:
                 if w in vocabTFDic:
                     #vocabTFDic[w] += wordsFreq[w]
                     vocabTFDic[w].append( wordsFreq[w])
                 else:
                     vocabTFDic[w] = [wordsFreq[w]]
         #vocTF_IDF = [(w,sum(vocabTFDic[w])*math.log(n*1.0/len(vocabTFDic[w]))) for w in vocabTFDic]
         idf = 1.0
         vocTF_IDF = [(w,sum([1+math.log(vtf) for vtf in vocabTFDic[w]])*idf) for w in vocabTFDic]
          
         #vocabSorted = getSorted(vocabTFDic.items(), 1)
         vocabSorted = getSorted(vocTF_IDF, 1)
         print vocabSorted[:topK]
         topVocabDic = dict(vocabSorted[:topK])
         #topVocabDic = vocabTFDic
          
         
         self.classifier = VSMClassifier(topVocabDic,th)
         classifierFile = open(vsmClassifierFileName,"wb")
         pickle.dump(self.classifier,classifierFile)
         classifierFile.close()
Example #13
 def calculate_score_AllDocs(self, doc):
     sims=[]
     docWords = getTokens(doc)
     docTF = getFreq(docWords)
     ndocTF = dict.fromkeys(self.topVocabDic)
     for k in ndocTF:
         if k in docTF:
             ndocTF[k] = docTF[k]
         else:
             ndocTF[k] = 1/math.e
     for dTF in self.docsTF:
         s = self.cosSim(ndocTF, dTF)
         sims.append(s)
     sim = max(sims)
     if sim >= self.relevanceth:
         return [1,sim]
     else:
         return [0,sim]
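The `1/math.e` filler for vocabulary terms absent from a document looks arbitrary but is not: if `cosSim` applies the same `1 + log(tf)` weighting used in the similarity methods above (an assumption, since `cosSim` is not shown), it maps missing terms to weight exactly zero:

import math
# 1 + log(1/e) == 1 - 1 == 0, so filled-in terms add nothing to a
# log-weighted dot product.
assert abs(1 + math.log(1 / math.e)) < 1e-12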
Example #14
 def buildVSMClassifier(self,posFile,vsmClassifierFileName,th,leastK):
     
     try:
         classifierFile = open(vsmClassifierFileName,"rb")
         self.classifier = pickle.load(classifierFile)
         classifierFile.close()
     except:
         docs = []
         
         f = open(posFile,'r')
         for url in f:
             url = url.strip()
             d = Document(url)
             if d and d.text:
                 docs.append(d)
         f.close()
        
         #docsBOW = []
          vocabTFDic = defaultdict(list)
         #n = len(docs)
         for d in docs:
             wordsFreq = getFreq(d.getWords())
             #docsBOW.append(wordsFreq)
             for w in wordsFreq:
                 vocabTFDic[w].append( wordsFreq[w])
         
         #idf = 1.0
         #vocTF_IDF = [(w,sum([1+math.log(vtf) for vtf in vocabTFDic[w]])*idf) for w in vocabTFDic]
         voc_CollFreq = [(w,sum(vocabTFDic[w])) for w in vocabTFDic]
          vocab_filtered = [(w, f) for w, f in voc_CollFreq if f >= leastK]
         vocab_filtered_dict = dict(vocab_filtered)
         #vocabSorted = getSorted(voc_CollFreq, 1)
         '''
         print vocabSorted[:topK]
         topVocabDic = dict(vocabSorted[:topK])
         '''
         
         self.classifier = VSMClassifier(vocab_filtered_dict,th)
         classifierFile = open(vsmClassifierFileName,"wb")
         pickle.dump(self.classifier,classifierFile)
         classifierFile.close()
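A toy illustration of the collection-frequency filter in this variant:

# vocabTFDic = {'fire': [2, 1], 'la': [1]}   (per-document counts)
# voc_CollFreq -> [('fire', 3), ('la', 1)]
# with leastK = 3: vocab_filtered_dict -> {'fire': 3}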
Example #15
def getShortURLsFreqDic(shortURLs):
    shortURLsFreqDic = eventUtils.getFreq(shortURLs)
    return shortURLsFreqDic
Example #16
def buildProbEventModel(docsList):
    docsTotalFreqs=[]
    docsEntities=[]
    docsEntitiesFreq = []
    entitiesProb = {}
    
    # Convert each doc to tokens, locations, dates lists and their corresponding frequency distributions
    # Also produces the total frequency for each document of each list (tokens, locations, and dates)
    for doc in docsList:
        t = ''
        if 'text' in doc:
            t = doc['text']
            if 'title' in doc:
                t = doc['title'] + " " + t
        if t:
            print 'Reading ' + t[:100]
            ents = eventUtils.getEntities(t)[0]
            docEnt = {}
            docEnt['LOCATION']={}
            if 'LOCATION' in ents:
                docEnt['LOCATION'] =  ents['LOCATION']
            docEnt['DATE']={}
            if 'DATE' in ents:
                docEnt['DATE'] = ents['DATE']
            toks = eventUtils.getTokens(t)
            docEnt['Topic'] = toks
            docsEntities.append(docEnt)
            
            docEntFreq = {}
            #docTotals = {}
            for k in docEnt:
                docEntFreq[k] = eventUtils.getFreq(docEnt[k])
                #totalFreq = sum([v for _,v in docEntFreq[k].items()])
                
                #docTotals[k] = totalFreq
            docsEntitiesFreq.append(docEntFreq)
            #docsTotalFreqs.append(docTotals)
    
    # Collection-level frequency for each entity(tokens, locations, dates)
    
    # Total Frequency of keywords, locations, and dates in all documents
    '''
    allDocsTotal = {}
    allDocsTotal['LOCATION'] = 0
    allDocsTotal['DATE']=0
    allDocsTotal['Topic'] = 0
    
    for docTotFreq in docsTotalFreqs:
        for k in docTotFreq:
            allDocsTotal[k]+= docTotFreq[k]
    '''
    
    #Calculating prob for each item in each entity lists (tokens, locations, and dates) as 
    # freq of item in all docs / total freq of all terms in that list
    entitiesProb['LOCATION']={}
    entitiesProb['DATE']={}
    entitiesProb['Topic']={}
    
    for docEntFreq in docsEntitiesFreq:
        for entity in docEntFreq:
            for val in docEntFreq[entity]:
                if val in entitiesProb[entity]:
                    entitiesProb[entity][val] += docEntFreq[entity][val]
                else:
                    entitiesProb[entity][val] = docEntFreq[entity][val]
    
    for ent in entitiesProb:
        allvalsFreq = sum([v for _,v in entitiesProb[ent].items()])
        for k in entitiesProb[ent]:
            #entitiesProb[ent][k] = (1.0 + (entitiesProb[ent][k] *1.0)) / (docsTotalFreqs[ent] + allDocsTotal[ent])
            
            entitiesProb[ent][k] = (1.0 + (entitiesProb[ent][k] *1.0)) / (len(entitiesProb[ent]) + allvalsFreq)
            
        
            
    return docsEntities, entitiesProb
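The final loop is add-one (Laplace) smoothing: each value's probability is (1 + its collection frequency) over (vocabulary size + total frequency) for its entity type. A worked example:

# freq = {'paris': 3, 'lyon': 1}  ->  vocabulary size 2, total frequency 4
# P('paris') = (1 + 3) / (2 + 4) = 4/6 ~= 0.667
# P('lyon')  = (1 + 1) / (2 + 4) = 2/6 ~= 0.333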
Example #17
def readGraphFile(graphFile):
    with open(graphFile) as f:
        lines = f.readlines()
    lines = [l.strip() for l in lines]
    #graph = [(int(l.split(",")[0])+1,int(l.split(',')[1])+1) for l in lines ]
    graph = [(l.split(",")[0],l.split(',')[1]) for l in lines ]
    return graph
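# Example: a log file whose lines are "0,1" and "0,2" yields
# [('0', '1'), ('0', '2')]; node ids stay strings here, while the
# commented-out variant above casts them to int and shifts by one.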

# draw example
urlsFile = 'base-Output-URLs.txt'
urls = eu.readFileLines(urlsFile)

doms = [eu.getDomain(url) for url in urls]

uniqueDomsFreqDic = eu.getFreq(doms)
uDoms = uniqueDomsFreqDic.keys()
numDoms = len(uDoms)
uc=[random.random() for i in range(numDoms)]
uniqDomsColorsDic = dict(zip(uDoms,uc))
#c = [uniqDomsColorsDic[d] for d in doms]
#c = c[5:]

domsTuples = enumerate(doms)
domsDic = dict(domsTuples)
#domsDic = defaultdict(list)
#for i,d in domsTuples:
#    domsDic[d].append(i)
#print domsDic
graphFile = 'Output-CharlestonShooting/base-webpages/base-logData.txt'
graph = readGraphFile(graphFile)
Example #18
    def buildEventModel_old(self, seedURLs):

        corpus = Collection(seedURLs)
        #sortedTokensFreqs = corpus.getWordsFrequencies()
        sortedToksTFDF = corpus.getIndicativeWords()
        print sortedToksTFDF
        sortedImptSents = corpus.getIndicativeSentences(
            self.topK, self.intersectionTh)
        # Get Event Model
        eventModelInstances = eventUtils.getEventModelInsts(sortedImptSents)
        #topToks = [k for k,_ in sortedToksTFDF]
        #if self.topK < len(topToks):
        #    topToks =  topToks[:self.topK]
        #self.entities['Disaster'] = set(topToks)

        self.entities['LOCATION'] = []
        self.entities['DATE'] = []
        for e in eventModelInstances:
            if 'LOCATION' in e:
                self.entities['LOCATION'].extend(e['LOCATION'])
            if 'DATE' in e:
                # collect dates even when the instance also has a location
                self.entities['DATE'].extend(e['DATE'])

        entitiesFreq = {}
        entitiesFreq['LOCATION'] = eventUtils.getFreq(
            self.entities['LOCATION'])
        entitiesFreq['LOCATION'] = eventUtils.getSorted(
            entitiesFreq['LOCATION'].items(), 1)
        entitiesFreq['DATE'] = eventUtils.getFreq(self.entities['DATE'])
        entitiesFreq['DATE'] = eventUtils.getSorted(
            entitiesFreq['DATE'].items(), 1)

        l = [k for k, _ in entitiesFreq['LOCATION']]
        if self.topK < len(l):
            #l = l[:self.topK]
            l = l[:3]
        self.entities['LOCATION'] = set(l)

        d = [k for k, _ in entitiesFreq['DATE']]
        if self.topK < len(d):
            #d = d[:self.topK]
            d = d[:3]
        self.entities['DATE'] = set(d)
        '''
        locList = self.entities['LOCATION']
        locSet = set(locList)
        self.entities['LOCATION'] = [l for l in locSet]
        '''
        self.entities['LOCATION'] = self.getUniqueEntities(
            self.entities['LOCATION'])
        '''
        dateList = self.entities['DATE']
        dateSet = set(dateList)
        self.entities['DATE'] = [d for d in dateSet]
        '''
        self.entities['DATE'] = self.getUniqueEntities(self.entities['DATE'])

        locDate = list(self.entities['LOCATION']) + list(self.entities['DATE'])
        locDate = eventUtils.getTokens(' '.join(locDate))

        ntopToks = []
        topToks = [k for k, _ in sortedToksTFDF]
        for tok in topToks:
            if tok not in locDate:
                ntopToks.append(tok)
        topToks = ntopToks
        if self.topK < len(topToks):
            topToks = topToks[:self.topK]
        self.entities['Disaster'] = set(topToks)

        self.allEntities = []
        for k in self.entities:
            self.allEntities.extend(self.entities[k])

        print self.allEntities
Example #20
 def calculate_similarity(self,doc):
     #weigths ={'Topic':0.0,'LOCATION':0.0, 'DATE':0.0}
     '''
     entFreq = {}
     for k in self.entities:
         entFreq[k]= sum(self.entities[k].values())
     totFreq = sum(entFreq.values())
     
     for k in weigths:
         weigths[k] = entFreq[k]*1.0 / totFreq
     '''
     topicDic = self.entities['Topic']
     
     locToks = self.entities['LOCATION'].keys()
     locToks = eventUtils.getStemmedWords(locToks)
     locDic = dict(zip(locToks,self.entities['LOCATION'].values()))
     
     dToks = self.entities['DATE'].keys()
     dToks = eventUtils.getStemmedWords(dToks)
     dDic = dict(zip(dToks,self.entities['DATE'].values()))
     
     tokens = eventUtils.getTokens(doc)
     tokensDic = eventUtils.getFreq(tokens)
     wv = [1+math.log(e) for e in tokensDic.values()]
     wvScalar = self.getScalar(wv)
     scores = []
     
     ksd = 0    
     #interst = 0
     for i in tokensDic:
         if i in topicDic:
             ksd += (1+math.log(topicDic[i]))* (1+math.log(tokensDic[i]))
             #interst +=1
     if ksd != 0:
         ksd = float(ksd)/(self.scalars['Topic'] * wvScalar)
     #else:
     #    ksd = 0
     #if ksd == 0:
     #    return 0
     #if interst < 2:
         #return 0
     scores.append(ksd*self.weights['Topic'])
     ksl = 0    
     for i in tokensDic:
         if i in locDic:
             ksl += (1+math.log(locDic[i]))* (1+math.log(tokensDic[i]))
     if ksl != 0:
         ksl = float(ksl)/(self.scalars['LOCATION'] * wvScalar) 
     #else:
     #    ksl = 0
     scores.append(ksl*self.weights['LOCATION'])
     
     ks = 0    
     for i in tokensDic:
         if i in dDic:
             ks += (1+math.log(dDic[i]))* (1+math.log(tokensDic[i]))
     if ks != 0:
         ks = float(ks)/(self.scalars['DATE'] * wvScalar)    
     #else:
     #    ks = 0
     scores.append(ks*self.weights['DATE'])
     
     #score = sum(scores) / 3.0
     score = sum(scores)
     return score
Example #21
def evaluate(collFolder,k):
    evalres = []
    for j in range(k):
        
        fn = collFolder+str(j)+'.txt'
        f = codecs.open(fn, encoding='utf-8')
        ftext = f.read()
        text =  ftext.split()#getTokens(ftext)
        text = [t.lower() for t in text]
        te = []
        for t in text:
            if t.endswith('.'):
                t = t[:-1]
            te.append(t)
        text = te
        textFreq = getFreq(text)
        '''
        if 'shoot' in text or 'shooter' in text or 'shooting' in text:
            if 'fsu' in text:
                evalres.append(1)
            elif 'florida' in text and 'state' in text :#and 'university' in text:
                evalres.append(1)
            else:
                evalres.append(0)
        else:
            evalres.append(0)
        '''
        '''
        if 'hagupit' in text or 'ruby' in text:
            if 'typhoon' in text:
                evalres.append(1)
            elif 'philippin' in text or 'philippines' in text:
                evalres.append(1)
            else:
                evalres.append(0)
            #evalres.append(1)
        else:
            evalres.append(0)
        '''
        '''
        if 'fire' in text:
            if 'la' in text:
                #if 'downtown' in text:
                evalres.append(1)
                #else:
                #    evalres.append(0)
            elif 'los' in text and 'angeles' in text:
                #if 'downtown' in text:
                evalres.append(1)
                #else:
                #    evalres.append(0)
            else:
                evalres.append(0)
        else:
            evalres.append(0)
        '''
        '''
        if 'charlie' in text and 'hebdo' in text or 'paris' in text:
            if 'shooting' in text or 'shoot' in text:
                evalres.append(1)
            elif 'attack' in text:
                evalres.append(1)
            else:
                evalres.append(0)
        else:
            evalres.append(0)
        '''
        '''
        if 'airasia' in text or 'qz8501' in text:
            if 'flight' in text and 'missing' in text:
                evalres.append(1)
            elif 'plane' in text and 'missing' in text:
                evalres.append(1)
            else:
                evalres.append(0)
            #evalres.append(1)
        else:
            evalres.append(0)
        '''
        th = 2
        if textFreq.get('qz8501',0)>th:
            evalres.append(1)
        elif textFreq.get('airasia',0)>th:
            if textFreq.get('flight',0) or textFreq.get('plane',0):
                if textFreq.get('missing',0) or textFreq.get('crash',0):
                    evalres.append(1)
                elif textFreq.get('8501',0) or textFreq.get('qz8501',0):
                    evalres.append(1)
                else:
                    evalres.append(0)
            else:
                evalres.append(0)
            #evalres.append(1)
        else:
            evalres.append(0)
        f.close()
    return evalres
Example #24
def evaluate(collFolder,k):
    evalres = []
    for j in range(k):
        
        fn = collFolder+str(j)+'.txt'
        f = codecs.open(fn, encoding='utf-8')
        ftext = f.read()
        text =  ftext.split()#getTokens(ftext)
        text = [t.lower() for t in text]
        te = []
        for t in text:
            if t.endswith('.'):
                t = t[:-1]
            te.append(t)
        text = te
        textFreq = getFreq(text)
        '''
        if 'shoot' in text or 'shooter' in text or 'shooting' in text:
            if 'fsu' in text:
                evalres.append(1)
            elif 'florida' in text and 'state' in text :#and 'university' in text:
                evalres.append(1)
            else:
                evalres.append(0)
        else:
            evalres.append(0)
        '''
        
        '''
        if 'typhoon' in text:
            if 'hagupit' in text or 'ruby' in text:
                evalres.append(1)
            #elif 'philippin' in text or 'philippines' in text:
            #    evalres.append(1)
            else:
                evalres.append(0)
            #evalres.append(1)
        elif 'hagupit' in text:
            evalres.append(1)
        else:
            evalres.append(0)
        '''
        '''
        if 'fire' in text:
            if 'la' in text:
                #if 'downtown' in text:
                evalres.append(1)
                #else:
                #    evalres.append(0)
            elif 'los' in text and 'angeles' in text:
                #if 'downtown' in text:
                evalres.append(1)
                #else:
                #    evalres.append(0)
            else:
                evalres.append(0)
        else:
            evalres.append(0)
        '''
        '''
        if 'charlie' in text and 'hebdo' in text or 'paris' in text:
            if 'shooting' in text or 'shoot' in text:
                evalres.append(1)
            elif 'attack' in text:
                evalres.append(1)
            else:
                evalres.append(0)
        else:
            evalres.append(0)
        '''
        if 'qz8501' in text:
            evalres.append(1)
        elif 'airasia' in text:
            if 'flight' in text and 'missing' in text:
                evalres.append(1)
            elif 'plane' in text:
                if 'crash' in text or 'missing' in text:
                    evalres.append(1)
                else:
                    evalres.append(0)
            else:
                evalres.append(0)
            #evalres.append(1)
        else:
            evalres.append(0)
        
        '''
        th = 2
        if textFreq.get('qz8501',0)>th:
            evalres.append(1)
        elif textFreq.get('airasia',0)>th:
            if textFreq.get('flight',0) or textFreq.get('plane',0):
                if textFreq.get('missing',0) or textFreq.get('crash',0):
                    evalres.append(1)
                elif textFreq.get('8501',0) or textFreq.get('qz8501',0):
                    evalres.append(1)
                else:
                    evalres.append(0)
            else:
                evalres.append(0)
            #evalres.append(1)
        else:
            evalres.append(0)
        '''
        f.close()
    return evalres
Example #25
 def buildProbEventModel(self,urlsList,topK):
     
     docsList = eventUtils.getWebpageText(urlsList) #self.getCollectionDocs(urlsList)
     #docsTotalFreqs=[]
     docsEntities=[]
     docsEntitiesFreq = []
     entitiesProb = {}
     
     # Convert each doc to tokens, locations, dates lists and their corresponding frequency distributions
     # Also produces the total frequency for each document of each list (tokens, locations, and dates)
      for doc in docsList:
          t = ''
          if 'text' in doc:
              t = doc['text']
              if 'title' in doc:
                  t = doc['title'] + " " + t
         if t:
             #print 'Reading ' + t[:100]
             ents = eventUtils.getEntities(t)[0]
             docEnt = {}
             docEnt['LOCATION']={}
             if 'LOCATION' in ents:
                 docEnt['LOCATION'] =  ents['LOCATION']
             docEnt['DATE']={}
             if 'DATE' in ents:
                 docEnt['DATE'] = ents['DATE']
             toks = eventUtils.getTokens(t)
             docEnt['Topic'] = toks
             docsEntities.append(docEnt)
             
             docEntFreq = {}
             #docTotals = {}
             for k in docEnt:
                 docEntFreq[k] = eventUtils.getFreq(docEnt[k])
                 #totalFreq = sum([v for _,v in docEntFreq[k].items()])
                 
                 #docTotals[k] = totalFreq
             docsEntitiesFreq.append(docEntFreq)
             #docsTotalFreqs.append(docTotals)
     
     # Collection-level frequency for each entity(tokens, locations, dates)
     
     #Calculating prob for each item in each entity lists (tokens, locations, and dates) as 
     # freq of item in all docs / total freq of all terms in that list
     entitiesProb['LOCATION']={}
     entitiesProb['DATE']={}
     entitiesProb['Topic']={}
     
     for docEntFreq in docsEntitiesFreq:
         for entity in docEntFreq:
             for val in docEntFreq[entity]:
                 if val in entitiesProb[entity]:
                     entitiesProb[entity][val] += docEntFreq[entity][val]
                 else:
                     entitiesProb[entity][val] = docEntFreq[entity][val]
     
     for ent in entitiesProb:
         allvalsFreq = sum([v for _,v in entitiesProb[ent].items()])
         for k in entitiesProb[ent]:
             #entitiesProb[ent][k] = (1.0 + (entitiesProb[ent][k] *1.0)) / (docsTotalFreqs[ent] + allDocsTotal[ent])
             
             entitiesProb[ent][k] = (1.0 + (entitiesProb[ent][k] *1.0)) / (len(entitiesProb[ent]) + allvalsFreq)
             
         
     #self.probEvtModel = entitiesProb
     
     mle =  self.getMLEEventEntities(entitiesProb,10)
     for k in mle:
         print k, mle[k]
         
     
     self.probEvtModel = {}
     for k in mle:
         self.probEvtModel[k] = dict(mle[k])#entitiesProb[k][:topK]
     
     self.eDisDic = self.probEvtModel['Topic']
     
     
     locToks = self.probEvtModel['LOCATION'].keys()
     locToks = eventUtils.getStemmedWords(locToks)
     self.locDic = dict(zip(locToks,self.probEvtModel['LOCATION'].values()))
     
 
     dToks = self.probEvtModel['DATE'].keys()
     dToks = eventUtils.getStemmedWords(dToks)
     self.dDic = dict(zip(dToks,self.probEvtModel['DATE'].values()))
     
     
     
     return docsEntities, entitiesProb
Example #26
    def buildProbEventModel(self, urlsList, topK):

        docsList = eventUtils.getWebpageText_NoURLs(urlsList)  #getWebpageText
        docsList = [d for d in docsList if 'text' in d]
        t = ''
        #docsTotalFreqs=[]
        docsEntities = []
        docsEntitiesFreq = []
        entitiesFreq = {}

        # Convert each doc to tokens, locations, dates lists and their corresponding frequency distributions
        # Also produces the total frequency for each document of each list (tokens, locations, and dates)
        for doc in docsList:
            #t = ""
            #if doc.has_key('text'):
            t = doc['text']
            #if doc.has_key('title'):
            #    t =doc['title']+ " "+t
            #if t:
            #print 'Reading ' + t[:100]
            ents = eventUtils.getEntities(t)[0]
            docEnt = {}
            docEnt['LOCATION'] = {}
            if 'LOCATION' in ents:
                docEnt['LOCATION'] = ents['LOCATION']
            docEnt['DATE'] = {}
            if 'DATE' in ents:
                docEnt['DATE'] = ents['DATE']
            toks = eventUtils.getTokens(t)
            docEnt['Topic'] = toks
            docsEntities.append(docEnt)

            docEntFreq = {}
            #docTotals = {}
            for k in docEnt:
                docEntFreq[k] = eventUtils.getFreq(docEnt[k])
                #totalFreq = sum([v for _,v in docEntFreq[k].items()])

                #docTotals[k] = totalFreq
            docsEntitiesFreq.append(docEntFreq)
            #docsTotalFreqs.append(docTotals)

        # Collection-level frequency for each entity(tokens, locations, dates)

        #Calculating prob for each item in each entity lists (tokens, locations, and dates) as
        # freq of item in all docs / total freq of all terms in that list
        entitiesFreq['LOCATION'] = defaultdict(float)  #{}
        entitiesFreq['DATE'] = defaultdict(float)  #{}
        entitiesFreq['Topic'] = defaultdict(float)  #{}

        for docEntFreq in docsEntitiesFreq:
            for entity in docEntFreq:
                for val in docEntFreq[entity]:
                    #if val in entitiesProb[entity]:
                    entitiesFreq[entity][val] += docEntFreq[entity][val]
                    #else:
                    #    entitiesProb[entity][val] = docEntFreq[entity][val]
        self.defaultProb = {}
        entitiesProb = {}
        for ent in entitiesFreq:
            allvalsFreq = sum([v for _, v in entitiesFreq[ent].items()])
            l = len(entitiesFreq[ent])
            denom = l + allvalsFreq
            self.defaultProb[ent] = 1.0 / denom
            # bind denom now; a bare lambda would see only the last loop value
            entitiesProb[ent] = defaultdict(lambda denom=denom: 1.0 / denom)
            for k in entitiesFreq[ent]:
                #entitiesProb[ent][k] = (1.0 + (entitiesProb[ent][k] *1.0)) / (docsTotalFreqs[ent] + allDocsTotal[ent])

                entitiesProb[ent][k] = (1.0 + entitiesFreq[ent][k]) / denom

        #self.probEvtModel = entitiesProb

        mle = self.getMLEEventEntities(entitiesProb, 10)
        for k in mle:
            print k, mle[k]

        self.probEvtModel = {}
        for k in mle:
            #self.probEvtModel[k] = dict(mle[k])#entitiesProb[k][:topK]
            # bind k now; a bare lambda would see only the last loop value
            self.probEvtModel[k] = defaultdict(lambda k=k: self.defaultProb[k])
            for e, v in mle[k]:
                self.probEvtModel[k][e] = v

        #self.eDisDic = self.probEvtModel['Topic']

        locToks = self.probEvtModel['LOCATION'].keys()
        locToks = eventUtils.getStemmedWords(locToks)
        #self.locDic = dict(zip(locToks,self.probEvtModel['LOCATION'].values()))
        locDic = defaultdict(lambda: self.defaultProb['LOCATION'])
        for k, v in zip(locToks, self.probEvtModel['LOCATION'].values()):
            locDic[k] = v
        self.probEvtModel['LOCATION'] = locDic

        dToks = self.probEvtModel['DATE'].keys()
        dToks = eventUtils.getStemmedWords(dToks)
        #self.dDic = dict(zip(dToks,self.probEvtModel['DATE'].values()))
        dDic = defaultdict(lambda: self.defaultProb['DATE'])
        for k, v in zip(dToks, self.probEvtModel['DATE'].values()):
            dDic[k] = v
        self.probEvtModel['DATE'] = dDic

        return docsEntities, entitiesProb
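One caveat with the `defaultdict(lambda: ...)` pattern used throughout this variant: lambda factories cannot be pickled, so a model built this way cannot be cached with `pickle` the way the classifiers in the earlier examples are. A minimal demonstration, with `dict.get` as the safer alternative:

from collections import defaultdict
import pickle

d = defaultdict(lambda: 0.5)
try:
    pickle.dumps(d)
except Exception:
    # lambda factories are not picklable; a plain dict plus an explicit
    # default avoids the problem
    plain = dict(d)
    value = plain.get('missing-token', 0.5)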