def getWordsFrequencies(self):
    # Aggregate all words from the collection's documents, then compute
    # and cache a frequency-sorted (word, count) list.
    for d in self.documents:
        w = d.getWords()
        self.words.extend(w)
    f = utils.getFreq(self.words)
    tokensFreqs = f.items()
    self.wordsFrequencies = utils.getSorted(tokensFreqs, 1)
    return self.wordsFrequencies
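# The utils/eventUtils helpers are not shown in this file. A minimal sketch
# of what getFreq and getSorted are assumed to do, inferred from how they are
# called above (hypothetical implementations, not the project's actual code):
from collections import Counter
from operator import itemgetter

def getFreq(tokens):
    # Map each token to its occurrence count.
    return dict(Counter(tokens))

def getSorted(pairs, keyIndex):
    # Sort (item, value) pairs in descending order of the column keyIndex.
    return sorted(pairs, key=itemgetter(keyIndex), reverse=True)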
def calculate_similarity_equalWeights_duplicate(self, doc):
    # Score a document against the event model using three equally weighted
    # log-scaled cosine components: topic terms, locations, and dates.
    eDisDic = self.entities['Topic']
    locToks = self.entities['LOCATION'].keys()
    locToks = eventUtils.getStemmedWords(locToks)
    locDic = dict(zip(locToks, self.entities['LOCATION'].values()))
    dToks = self.entities['DATE'].keys()
    dToks = eventUtils.getStemmedWords(dToks)
    dDic = dict(zip(dToks, self.entities['DATE'].values()))
    tokens = eventUtils.getTokens(doc)
    tokensDic = eventUtils.getFreq(tokens)
    wv = [1 + math.log(e) for e in tokensDic.values()]
    wvScalar = self.getScalar(wv)
    scores = []
    # Topic component; a document with no topic overlap scores 0 overall.
    ksd = 0
    for i in tokensDic:
        if i in eDisDic:
            ksd += (1 + math.log(eDisDic[i])) * (1 + math.log(tokensDic[i]))
    if ksd > 0:
        ksd = float(ksd) / (self.scalars['Topic'] * wvScalar)
    else:
        return 0
    scores.append(ksd)
    # Location component.
    ksl = 0
    for i in tokensDic:
        if i in locDic:
            ksl += (1 + math.log(locDic[i])) * (1 + math.log(tokensDic[i]))
    if ksl > 0:
        ksl = float(ksl) / (self.scalars['LOCATION'] * wvScalar)
    scores.append(ksl)
    # Date component.
    ks = 0
    for i in tokensDic:
        if i in dDic:
            ks += (1 + math.log(dDic[i])) * (1 + math.log(tokensDic[i]))
    if ks > 0:
        ks = float(ks) / (self.scalars['DATE'] * wvScalar)
    scores.append(ks)
    # Equal weights: average the three components.
    score = sum(scores) / 3.0
    return score
def calculate_score(self, doc):
    # Cosine similarity between the document's term frequencies and the
    # classifier's stored vector; returns [isRelevant, similarity].
    docWords = getTokens(doc)
    docTF = getFreq(docWords)
    sim = self.cosSim(docTF)
    if sim >= self.relevanceth:
        return [1, sim]
    else:
        return [0, sim]
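# self.cosSim is defined elsewhere. A minimal sketch of a cosine similarity
# between the classifier's stored vocabulary vector and a document's
# term-frequency dict (hypothetical helper; the name and signature are
# assumptions, not the project's actual method):
import math

def cosSimSketch(modelTF, docTF):
    # Dot product over shared terms divided by the product of the norms.
    dot = sum(modelTF[w] * docTF[w] for w in docTF if w in modelTF)
    normModel = math.sqrt(sum(v * v for v in modelTF.values()))
    normDoc = math.sqrt(sum(v * v for v in docTF.values()))
    if normModel == 0 or normDoc == 0:
        return 0.0
    return dot / (normModel * normDoc)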
def buildIndex(self):
    # Build an inverted index over the collection: for each word, the list
    # of its per-document frequencies, its document frequency, and its
    # total collection frequency.
    self.docs_bow = [eventUtils.getFreq(d.words) for d in self.coll.documents if d.text]
    for doc_bow in self.docs_bow:
        for w in doc_bow:
            if w in self.index:
                self.index[w]['docs'].append(doc_bow[w])
            else:
                self.index[w] = {'docs': [doc_bow[w]]}
    for w in self.index:
        self.index[w]['docFreq'] = len(self.index[w]['docs'])
        self.index[w]['collFreq'] = sum(self.index[w]['docs'])
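# A hypothetical example of how such an index might be used to rank terms,
# e.g. by a TF-DF style weight (assumed here for illustration; the actual
# getIndicativeWords used later in this code is not shown):
def rankByTFDF(index):
    # Weight each word by collection frequency times document frequency,
    # most indicative first.
    weights = [(w, index[w]['collFreq'] * index[w]['docFreq']) for w in index]
    return sorted(weights, key=lambda p: p[1], reverse=True)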
def buildVSMClassifier(self, posFile, vsmClassifierFileName, th, topK):
    # Load a cached classifier if one exists; otherwise build it from the
    # positive-example URLs and cache it with pickle.
    try:
        classifierFile = open(vsmClassifierFileName, "rb")
        self.classifier = pickle.load(classifierFile)
        classifierFile.close()
    except (IOError, EOFError, pickle.PickleError):
        docs = []
        f = open(posFile, 'r')
        for url in f:
            url = url.strip()
            d = Document(url)
            if d and d.text:
                docs.append(d)
        f.close()
        # Aggregate term frequencies over all positive documents into a
        # single vocabulary vector, then keep the topK most frequent terms.
        vocabTFDic = {}
        for d in docs:
            wordsFreq = getFreq(d.getWords())
            for w in wordsFreq:
                if w in vocabTFDic:
                    vocabTFDic[w] += wordsFreq[w]
                else:
                    vocabTFDic[w] = wordsFreq[w]
        vocabSorted = getSorted(vocabTFDic.items(), 1)
        topVocabDic = dict(vocabSorted[:topK])
        ndocsTF = []  # per-document TF path is disabled; only the aggregate vector is used
        self.classifier = VSMClassifier(topVocabDic, ndocsTF, th)
        classifierFile = open(vsmClassifierFileName, "wb")
        pickle.dump(self.classifier, classifierFile)
        classifierFile.close()
def calculate_similarity(self, doc):
    # Score a document against the event model: three log-scaled cosine
    # components (Disaster topic terms, locations, dates), equally weighted.
    eDisDic = self.entities["Disaster"]
    locToks = self.entities["LOCATION"].keys()
    locToks = eventUtils.getStemmedWords(locToks)
    locDic = dict(zip(locToks, self.entities["LOCATION"].values()))
    dToks = self.entities["DATE"].keys()
    dToks = eventUtils.getStemmedWords(dToks)
    dDic = dict(zip(dToks, self.entities["DATE"].values()))
    tokens = eventUtils.getTokens(doc)
    tokensDic = eventUtils.getFreq(tokens)
    wv = [1 + math.log(e) for e in tokensDic.values()]
    wvScalar = self.getScalar(wv)
    scores = []
    # Disaster-term component; no overlap means the document scores 0.
    ksd = 0
    for i in tokensDic:
        if i in eDisDic:
            ksd += (1 + math.log(eDisDic[i])) * (1 + math.log(tokensDic[i]))
    if ksd > 0:
        ksd = float(ksd) / (self.scalars["Disaster"] * wvScalar)
    else:
        return 0
    scores.append(ksd)
    # Location component.
    ksl = 0
    for i in tokensDic:
        if i in locDic:
            ksl += (1 + math.log(locDic[i])) * (1 + math.log(tokensDic[i]))
    if ksl > 0:
        ksl = float(ksl) / (self.scalars["LOCATION"] * wvScalar)
    scores.append(ksl)
    # Date component.
    ks = 0
    for i in tokensDic:
        if i in dDic:
            ks += (1 + math.log(dDic[i])) * (1 + math.log(tokensDic[i]))
    if ks > 0:
        ks = float(ks) / (self.scalars["DATE"] * wvScalar)
    scores.append(ks)
    # Equal weights: average the three components.
    score = sum(scores) / 3.0
    return score
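# Each per-entity component above is an lnc-style cosine between (1 + ln tf)
# weighted vectors. A standalone restatement (hypothetical helper), assuming
# self.scalars stores each model vector's Euclidean norm and getScalar
# returns the document vector's norm:
import math

def logCosComponent(modelDic, docDic, modelScalar, docScalar):
    # Dot product of log-scaled weights over shared terms, normalized by
    # the two vectors' norms; 0 when there is no overlap.
    dot = sum((1 + math.log(modelDic[t])) * (1 + math.log(docDic[t]))
              for t in docDic if t in modelDic)
    return dot / (modelScalar * docScalar) if dot > 0 else 0.0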
def getEntitiesFreq(self, entityList):
    el = [e.lower() for e in entityList]
    entitiesWords = []
    for w in el:
        p = w.split()
        if len(p) > 1:
            entitiesWords.extend(p)
        else:
            entitiesWords.append(w)
    s = eventUtils.getFreq(entitiesWords)
    s = eventUtils.getSorted(s.items(), 1)
    return s
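# Example of what getEntitiesFreq computes: multi-word entities are split
# into single words before counting, so shared words aggregate across
# entities (illustrative input/output, tie order may vary):
#   getEntitiesFreq(['New York', 'New Jersey', 'Paris'])
#   -> [('new', 2), ('york', 1), ('jersey', 1), ('paris', 1)]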
def extractDatesLocs(urls):
    # Fetch page text, run entity extraction, and report the most frequent
    # location; the analogous dates path is currently disabled.
    webpagesTxt = eventUtils.getWebpageText_NoURLs(urls)
    txts = [webpageTxt['text'] for webpageTxt in webpagesTxt if 'text' in webpageTxt]
    webpageEnts = eventUtils.getEntities(txts)
    locs = []
    dates = []
    for wbE in webpageEnts:
        if 'LOCATION' in wbE:
            locs.extend(wbE['LOCATION'])
        if 'DATE' in wbE:
            dates.extend(wbE['DATE'])
    freqLocs = eventUtils.getFreq(locs)
    freqDates = eventUtils.getFreq(dates)
    '''
    freqDates_norm = normalizeDates(freqDates)
    sortedDates = eventUtils.getSorted(freqDates_norm.iteritems(), 1)
    print sortedDates
    print "Most Frequent Date (i.e. the most probable event date) is: ", sortedDates[0]
    '''
    freqLocs_norm = normalizeLocs(freqLocs)
    sortedLocs = eventUtils.getSorted(freqLocs_norm.iteritems(), 1)
    print sortedLocs
    print "Most Frequent Location (i.e. the most probable event location) is: ", sortedLocs[0]
    return
def buildVSMClassifier_OneTargetTopicVector(self, posFile, vsmClassifierFileName, th, topK):
    # Load a cached classifier if possible; otherwise build a single
    # log-scaled topic vector from the positive documents and cache it.
    try:
        classifierFile = open(vsmClassifierFileName, "rb")
        self.classifier = pickle.load(classifierFile)
        classifierFile.close()
    except (IOError, EOFError, pickle.PickleError):
        docs = []
        f = open(posFile, 'r')
        for url in f:
            url = url.strip()
            d = Document(url)
            if d and d.text:
                docs.append(d)
        f.close()
        # Collect the list of per-document term frequencies for each word.
        vocabTFDic = {}
        for d in docs:
            wordsFreq = getFreq(d.getWords())
            for w in wordsFreq:
                if w in vocabTFDic:
                    vocabTFDic[w].append(wordsFreq[w])
                else:
                    vocabTFDic[w] = [wordsFreq[w]]
        # Weight each word by the sum of its log-scaled term frequencies;
        # idf is stubbed to 1.0 here (see the tf-idf sketch below).
        idf = 1.0
        vocTF_IDF = [(w, sum([1 + math.log(vtf) for vtf in vocabTFDic[w]]) * idf)
                     for w in vocabTFDic]
        vocabSorted = getSorted(vocTF_IDF, 1)
        print vocabSorted[:topK]
        topVocabDic = dict(vocabSorted[:topK])
        self.classifier = VSMClassifier(topVocabDic, th)
        classifierFile = open(vsmClassifierFileName, "wb")
        pickle.dump(self.classifier, classifierFile)
        classifierFile.close()
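# The commented-out weighting in the original source multiplied each word's
# summed log-tf by log(n / df). A sketch of that tf-idf variant, assuming
# vocabTFDic maps word -> list of per-document term frequencies (helper name
# is hypothetical):
import math

def tfidfWeights(vocabTFDic, nDocs):
    # Sum of log-scaled tfs times inverse document frequency; a word that
    # appears in every document gets idf = 0.
    return [(w, sum(1 + math.log(tf) for tf in tfs) * math.log(nDocs * 1.0 / len(tfs)))
            for w, tfs in vocabTFDic.items()]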
def calculate_score_AllDocs(self, doc):
    # Compare the document against every stored document vector, restricted
    # to the top vocabulary. Vocabulary terms absent from the document get
    # weight 1/e so that 1 + log(1/e) = 0, i.e. they contribute nothing
    # after log scaling.
    sims = []
    docWords = getTokens(doc)
    docTF = getFreq(docWords)
    ndocTF = dict.fromkeys(self.topVocabDic)
    for k in ndocTF:
        if k in docTF:
            ndocTF[k] = docTF[k]
        else:
            ndocTF[k] = 1 / math.e
    for dTF in self.docsTF:
        s = self.cosSim(ndocTF, dTF)
        sims.append(s)
    # Nearest-neighbor decision: the best match determines relevance.
    sim = max(sims)
    if sim >= self.relevanceth:
        return [1, sim]
    else:
        return [0, sim]
def buildVSMClassifier(self, posFile, vsmClassifierFileName, th, leastK):
    # Variant that keeps every word whose collection frequency is at least
    # leastK instead of taking the topK words.
    # Requires: from collections import defaultdict
    try:
        classifierFile = open(vsmClassifierFileName, "rb")
        self.classifier = pickle.load(classifierFile)
        classifierFile.close()
    except (IOError, EOFError, pickle.PickleError):
        docs = []
        f = open(posFile, 'r')
        for url in f:
            url = url.strip()
            d = Document(url)
            if d and d.text:
                docs.append(d)
        f.close()
        vocabTFDic = defaultdict(list)  # was defaultdict([]), which raises TypeError
        for d in docs:
            wordsFreq = getFreq(d.getWords())
            for w in wordsFreq:
                vocabTFDic[w].append(wordsFreq[w])
        voc_CollFreq = [(w, sum(vocabTFDic[w])) for w in vocabTFDic]
        # Unpack each (word, count) pair; the original iterated only over w,
        # leaving the count unbound in the filter.
        vocab_filtered = [(w, c) for w, c in voc_CollFreq if c >= leastK]
        vocab_filtered_dict = dict(vocab_filtered)
        self.classifier = VSMClassifier(vocab_filtered_dict, th)
        classifierFile = open(vsmClassifierFileName, "wb")
        pickle.dump(self.classifier, classifierFile)
        classifierFile.close()
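# A compact restatement of the vocabulary-filtering loop above using
# collections.Counter (equivalent logic, not the project's actual code);
# docsWords is assumed to be a list of token lists, one per document:
from collections import Counter

def collectionVocab(docsWords, leastK):
    total = Counter()
    for words in docsWords:
        total.update(words)
    # Keep words whose total collection frequency reaches leastK.
    return {w: c for w, c in total.items() if c >= leastK}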
def getShortURLsFreqDic(shortURLs):
    shortURLsFreqDic = eventUtils.getFreq(shortURLs)
    return shortURLsFreqDic
def readGraphFile(graphFile):
    # Each line of the crawl log is "src,dst"; return the edge list as
    # string pairs.
    with open(graphFile) as f:
        lines = f.readlines()
    lines = [l.strip() for l in lines]
    graph = [(l.split(",")[0], l.split(',')[1]) for l in lines]
    return graph


# Draw example: assign each URL's domain a random color for plotting.
urlsFile = 'base-Output-URLs.txt'
urls = eu.readFileLines(urlsFile)
doms = [eu.getDomain(url) for url in urls]
uniqueDomsFreqDic = eu.getFreq(doms)
uDoms = uniqueDomsFreqDic.keys()
numDoms = len(uDoms)
uc = [random.random() for i in range(numDoms)]
uniqDomsColorsDic = dict(zip(uDoms, uc))
domsDic = dict(enumerate(doms))  # URL index -> domain
graphFile = 'Output-CharlestonShooting/base-webpages/base-logData.txt'
graph = readGraphFile(graphFile)
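# The drawing step itself is not shown. A minimal sketch with networkx and
# matplotlib (assumed library choice), coloring nodes by the per-domain
# random colors computed above and assuming node ids in the log are URL
# indices:
import networkx as nx
import matplotlib.pyplot as plt

def drawGraph(graph, domsDic, uniqDomsColorsDic):
    G = nx.DiGraph()
    G.add_edges_from(graph)
    # Color each node by its URL's domain; unknown nodes get a neutral 0.5.
    colors = []
    for n in G.nodes():
        try:
            dom = domsDic.get(int(n), '')
        except ValueError:
            dom = ''
        colors.append(uniqDomsColorsDic.get(dom, 0.5))
    nx.draw(G, node_color=colors, with_labels=False, node_size=50)
    plt.show()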
def calculate_similarity(self, doc):
    # Weighted variant: each component score is scaled by a per-entity
    # weight (self.weights) instead of averaged, and a zero topic score no
    # longer short-circuits the whole document to 0. The commented-out
    # block below sketches deriving the weights from entity frequency mass
    # (see the helper after this function).
    '''
    entFreq = {}
    for k in self.entities:
        entFreq[k] = sum(self.entities[k].values())
    totFreq = sum(entFreq.values())
    for k in weights:
        weights[k] = entFreq[k] * 1.0 / totFreq
    '''
    topicDic = self.entities['Topic']
    locToks = self.entities['LOCATION'].keys()
    locToks = eventUtils.getStemmedWords(locToks)
    locDic = dict(zip(locToks, self.entities['LOCATION'].values()))
    dToks = self.entities['DATE'].keys()
    dToks = eventUtils.getStemmedWords(dToks)
    dDic = dict(zip(dToks, self.entities['DATE'].values()))
    tokens = eventUtils.getTokens(doc)
    tokensDic = eventUtils.getFreq(tokens)
    wv = [1 + math.log(e) for e in tokensDic.values()]
    wvScalar = self.getScalar(wv)
    scores = []
    # Topic component.
    ksd = 0
    for i in tokensDic:
        if i in topicDic:
            ksd += (1 + math.log(topicDic[i])) * (1 + math.log(tokensDic[i]))
    if ksd != 0:
        ksd = float(ksd) / (self.scalars['Topic'] * wvScalar)
    scores.append(ksd * self.weights['Topic'])
    # Location component.
    ksl = 0
    for i in tokensDic:
        if i in locDic:
            ksl += (1 + math.log(locDic[i])) * (1 + math.log(tokensDic[i]))
    if ksl != 0:
        ksl = float(ksl) / (self.scalars['LOCATION'] * wvScalar)
    scores.append(ksl * self.weights['LOCATION'])
    # Date component.
    ks = 0
    for i in tokensDic:
        if i in dDic:
            ks += (1 + math.log(dDic[i])) * (1 + math.log(tokensDic[i]))
    if ks != 0:
        ks = float(ks) / (self.scalars['DATE'] * wvScalar)
    scores.append(ks * self.weights['DATE'])
    # Weighted sum of the three components.
    score = sum(scores)
    return score
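# A standalone sketch of the weight derivation hinted at in the commented
# block above: each entity type is weighted by its share of the model's
# total frequency mass (hypothetical helper, not the project's actual code):
def entityWeights(entities):
    # entities maps entity type -> {value: frequency}.
    entFreq = dict((k, sum(entities[k].values())) for k in entities)
    totFreq = float(sum(entFreq.values()))
    return dict((k, entFreq[k] / totFreq) for k in entFreq)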
def evaluate(collFolder, k):
    # Judge each of the k saved pages as relevant (1) or not (0) for the
    # AirAsia QZ8501 event, using frequency thresholds rather than simple
    # membership tests.
    evalres = []
    for j in range(k):
        fn = collFolder + str(j) + '.txt'
        f = codecs.open(fn, encoding='utf-8')
        ftext = f.read()
        text = [t.lower() for t in ftext.split()]
        # Strip trailing periods so sentence-final words still match.
        text = [t[:-1] if t.endswith('.') else t for t in text]
        textFreq = getFreq(text)
        th = 2
        if textFreq.get('qz8501', 0) > th:
            evalres.append(1)
        elif textFreq.get('airasia', 0) > th:
            if textFreq.get('flight', 0) or textFreq.get('plane', 0):
                if textFreq.get('missing', 0) or textFreq.get('crash', 0):
                    evalres.append(1)
                elif textFreq.get('8501', 0) or textFreq.get('qz8501', 0):
                    evalres.append(1)
                else:
                    evalres.append(0)
            else:
                evalres.append(0)
        else:
            evalres.append(0)
        f.close()
    return evalres
def buildProbEventModel(docsList):
    # Build a smoothed unigram model over three entity types (Topic tokens,
    # LOCATION, DATE) from a list of fetched documents.
    docsEntities = []
    docsEntitiesFreq = []
    entitiesProb = {}
    # Convert each doc into token, location, and date lists and their
    # corresponding frequency distributions.
    for doc in docsList:
        t = ''  # reset per document so an empty page cannot reuse the previous text
        if 'text' in doc:
            t = doc['text']
        if 'title' in doc:
            t = doc['title'] + " " + t
        if t:
            print 'Reading ' + t[:100]
            ents = eventUtils.getEntities(t)[0]
            docEnt = {}
            docEnt['LOCATION'] = ents.get('LOCATION', {})
            docEnt['DATE'] = ents.get('DATE', {})
            docEnt['Topic'] = eventUtils.getTokens(t)
            docsEntities.append(docEnt)
            docEntFreq = {}
            for k in docEnt:
                docEntFreq[k] = eventUtils.getFreq(docEnt[k])
            docsEntitiesFreq.append(docEntFreq)
    # Collection-level frequency for each entity type, then the probability
    # of each value as its smoothed share of that type's total frequency.
    entitiesProb['LOCATION'] = {}
    entitiesProb['DATE'] = {}
    entitiesProb['Topic'] = {}
    for docEntFreq in docsEntitiesFreq:
        for entity in docEntFreq:
            for val in docEntFreq[entity]:
                if val in entitiesProb[entity]:
                    entitiesProb[entity][val] += docEntFreq[entity][val]
                else:
                    entitiesProb[entity][val] = docEntFreq[entity][val]
    for ent in entitiesProb:
        allvalsFreq = sum([v for _, v in entitiesProb[ent].items()])
        for k in entitiesProb[ent]:
            entitiesProb[ent][k] = (1.0 + entitiesProb[ent][k]) / (len(entitiesProb[ent]) + allvalsFreq)
    return docsEntities, entitiesProb
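# The final loop above is add-one (Laplace) smoothing: with V distinct
# values and N total occurrences for an entity type,
#   P(val) = (1 + count(val)) / (V + N),
# so unseen values implicitly receive 1 / (V + N). A standalone restatement
# (hypothetical helper for clarity):
def laplaceProb(count, vocabSize, totalCount):
    return (1.0 + count) / (vocabSize + totalCount)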
def buildEventModel_old(self, seedURLs):
    # Older event-model builder: derive topic keywords, locations, and
    # dates from the seed pages' indicative words and sentences.
    corpus = Collection(seedURLs)
    sortedToksTFDF = corpus.getIndicativeWords()
    print sortedToksTFDF
    sortedImptSents = corpus.getIndicativeSentences(self.topK, self.intersectionTh)
    # Extract event-model instances from the important sentences.
    eventModelInstances = eventUtils.getEventModelInsts(sortedImptSents)
    self.entities['LOCATION'] = []
    self.entities['DATE'] = []
    for e in eventModelInstances:
        if 'LOCATION' in e:
            self.entities['LOCATION'].extend(e['LOCATION'])
        elif 'DATE' in e:
            self.entities['DATE'].extend(e['DATE'])
    entitiesFreq = {}
    entitiesFreq['LOCATION'] = eventUtils.getFreq(self.entities['LOCATION'])
    entitiesFreq['LOCATION'] = eventUtils.getSorted(entitiesFreq['LOCATION'].items(), 1)
    entitiesFreq['DATE'] = eventUtils.getFreq(self.entities['DATE'])
    entitiesFreq['DATE'] = eventUtils.getSorted(entitiesFreq['DATE'].items(), 1)
    # Keep only the three most frequent locations and dates.
    l = [k for k, _ in entitiesFreq['LOCATION']]
    if self.topK < len(l):
        l = l[:3]
    self.entities['LOCATION'] = set(l)
    d = [k for k, _ in entitiesFreq['DATE']]
    if self.topK < len(d):
        d = d[:3]
    self.entities['DATE'] = set(d)
    self.entities['LOCATION'] = self.getUniqueEntities(self.entities['LOCATION'])
    self.entities['DATE'] = self.getUniqueEntities(self.entities['DATE'])
    # Exclude location and date words from the topic keywords.
    locDate = list(self.entities['LOCATION']) + list(self.entities['DATE'])
    locDate = eventUtils.getTokens(' '.join(locDate))
    topToks = [k for k, _ in sortedToksTFDF if k not in locDate]
    if self.topK < len(topToks):
        topToks = topToks[:self.topK]
    self.entities['Disaster'] = set(topToks)
    self.allEntities = []
    for k in self.entities:
        self.allEntities.extend(self.entities[k])
    print self.allEntities
def evaluate(collFolder, k):
    # Membership-based variant of the AirAsia QZ8501 judge: a page is
    # relevant if it mentions the flight number, or mentions AirAsia
    # together with a missing or crashed flight/plane.
    evalres = []
    for j in range(k):
        fn = collFolder + str(j) + '.txt'
        f = codecs.open(fn, encoding='utf-8')
        ftext = f.read()
        text = [t.lower() for t in ftext.split()]
        # Strip trailing periods so sentence-final words still match.
        text = [t[:-1] if t.endswith('.') else t for t in text]
        if 'qz8501' in text:
            evalres.append(1)
        elif 'airasia' in text:
            if 'flight' in text and 'missing' in text:
                evalres.append(1)
            elif 'plane' in text:
                if 'crash' in text or 'missing' in text:
                    evalres.append(1)
                else:
                    evalres.append(0)
            else:
                evalres.append(0)
        else:
            evalres.append(0)
        f.close()
    return evalres
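# A hypothetical driver for either evaluate() variant: the fraction of
# pages judged relevant is the crawl's precision over the first k pages.
# The folder name and page count below are assumptions for illustration.
if __name__ == '__main__':
    evalres = evaluate('Output-AirAsia/webpages/', 100)
    precision = sum(evalres) / float(len(evalres))
    print('Precision@%d = %.3f' % (len(evalres), precision))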
def buildProbEventModel(self, urlsList, topK):
    # Fetch the seed pages and build the smoothed event model; keep the
    # most likely values per entity type and index locations and dates by
    # their stemmed forms for matching against document tokens.
    docsList = eventUtils.getWebpageText(urlsList)
    docsEntities = []
    docsEntitiesFreq = []
    entitiesProb = {}
    # Convert each doc into token, location, and date lists and their
    # corresponding frequency distributions.
    for doc in docsList:
        t = ''
        if 'text' in doc:
            t = doc['text']
        if 'title' in doc:
            t = doc['title'] + " " + t
        if t:
            ents = eventUtils.getEntities(t)[0]
            docEnt = {}
            docEnt['LOCATION'] = ents.get('LOCATION', {})
            docEnt['DATE'] = ents.get('DATE', {})
            docEnt['Topic'] = eventUtils.getTokens(t)
            docsEntities.append(docEnt)
            docEntFreq = {}
            for k in docEnt:
                docEntFreq[k] = eventUtils.getFreq(docEnt[k])
            docsEntitiesFreq.append(docEntFreq)
    # Collection-level frequencies, then add-one smoothing as in
    # buildProbEventModel above.
    entitiesProb['LOCATION'] = {}
    entitiesProb['DATE'] = {}
    entitiesProb['Topic'] = {}
    for docEntFreq in docsEntitiesFreq:
        for entity in docEntFreq:
            for val in docEntFreq[entity]:
                if val in entitiesProb[entity]:
                    entitiesProb[entity][val] += docEntFreq[entity][val]
                else:
                    entitiesProb[entity][val] = docEntFreq[entity][val]
    for ent in entitiesProb:
        allvalsFreq = sum([v for _, v in entitiesProb[ent].items()])
        for k in entitiesProb[ent]:
            entitiesProb[ent][k] = (1.0 + entitiesProb[ent][k]) / (len(entitiesProb[ent]) + allvalsFreq)
    # Keep the maximum-likelihood values per entity type (the cut is
    # currently fixed at 10; topK is unused here).
    mle = self.getMLEEventEntities(entitiesProb, 10)
    for k in mle:
        print k, mle[k]
    self.probEvtModel = {}
    for k in mle:
        self.probEvtModel[k] = dict(mle[k])
    self.eDisDic = self.probEvtModel['Topic']
    locToks = eventUtils.getStemmedWords(self.probEvtModel['LOCATION'].keys())
    self.locDic = dict(zip(locToks, self.probEvtModel['LOCATION'].values()))
    dToks = eventUtils.getStemmedWords(self.probEvtModel['DATE'].keys())
    self.dDic = dict(zip(dToks, self.probEvtModel['DATE'].values()))
    return docsEntities, entitiesProb
def buildProbEventModel(self, urlsList, topK):
    # defaultdict-based rewrite: unseen values fall back to the smoothed
    # default probability for their entity type instead of being missing.
    docsList = eventUtils.getWebpageText_NoURLs(urlsList)
    docsList = [d for d in docsList if 'text' in d]
    docsEntities = []
    docsEntitiesFreq = []
    entitiesFreq = {}
    # Convert each doc into token, location, and date lists and their
    # corresponding frequency distributions.
    for doc in docsList:
        t = doc['text']
        ents = eventUtils.getEntities(t)[0]
        docEnt = {}
        docEnt['LOCATION'] = ents.get('LOCATION', {})
        docEnt['DATE'] = ents.get('DATE', {})
        docEnt['Topic'] = eventUtils.getTokens(t)
        docsEntities.append(docEnt)
        docEntFreq = {}
        for k in docEnt:
            docEntFreq[k] = eventUtils.getFreq(docEnt[k])
        docsEntitiesFreq.append(docEntFreq)
    # Collection-level frequency per entity type.
    entitiesFreq['LOCATION'] = defaultdict(float)
    entitiesFreq['DATE'] = defaultdict(float)
    entitiesFreq['Topic'] = defaultdict(float)
    for docEntFreq in docsEntitiesFreq:
        for entity in docEntFreq:
            for val in docEntFreq[entity]:
                entitiesFreq[entity][val] += docEntFreq[entity][val]
    # Add-one smoothing with a stored default for unseen values. The lambda
    # default arguments pin each iteration's denom/key; a bare lambda would
    # close over the loop variable and use only its final value.
    self.defaultProb = {}
    entitiesProb = {}
    for ent in entitiesFreq:
        allvalsFreq = sum([v for _, v in entitiesFreq[ent].items()])
        denom = len(entitiesFreq[ent]) + allvalsFreq
        self.defaultProb[ent] = 1.0 / denom
        entitiesProb[ent] = defaultdict(lambda denom=denom: 1.0 / denom)
        for k in entitiesFreq[ent]:
            # Smooth the raw frequency (not the fresh defaultdict's default).
            entitiesProb[ent][k] = (1.0 + entitiesFreq[ent][k]) / denom
    mle = self.getMLEEventEntities(entitiesProb, 10)
    for k in mle:
        print k, mle[k]
    self.probEvtModel = {}
    for k in mle:
        self.probEvtModel[k] = defaultdict(lambda k=k: self.defaultProb[k])
        for e, v in mle[k]:
            self.probEvtModel[k][e] = v
    # Re-key locations and dates by their stemmed forms, keeping the
    # smoothed default for unseen values.
    locToks = eventUtils.getStemmedWords(self.probEvtModel['LOCATION'].keys())
    locDic = defaultdict(lambda: self.defaultProb['LOCATION'])
    for k, v in zip(locToks, self.probEvtModel['LOCATION'].values()):
        locDic[k] = v
    self.probEvtModel['LOCATION'] = locDic
    dToks = eventUtils.getStemmedWords(self.probEvtModel['DATE'].keys())
    dDic = defaultdict(lambda: self.defaultProb['DATE'])
    for k, v in zip(dToks, self.probEvtModel['DATE'].values()):  # was zip(locToks, ...)
        dDic[k] = v
    self.probEvtModel['DATE'] = dDic
    return docsEntities, entitiesProb
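# A hypothetical sketch of scoring a document under the smoothed model:
# sum the log probabilities of its tokens under each entity distribution,
# using the stored default for unseen values. Assumes tokens have been
# preprocessed (e.g. stemmed) consistently with the model's keys; this is
# not the project's actual scoring method.
import math

def logProbScore(probEvtModel, defaultProb, tokens):
    score = 0.0
    for ent in ('Topic', 'LOCATION', 'DATE'):
        dist = probEvtModel[ent]
        for t in tokens:
            # dict.get on a defaultdict does not trigger the factory, so the
            # explicit default keeps unseen tokens from inserting entries.
            score += math.log(dist.get(t, defaultProb[ent]))
    return score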