def compareDomains(files):
    # Build one {domain: frequency} dict per input file; each line is "<domain> <count>".
    domains = []
    for bf in files:
        doms = {}
        f = open(bf)
        for l in f:
            l = l.strip()
            p = l.split(" ")
            if int(p[1]) > 1:  # keep only domains seen more than once
                doms[p[0]] = int(p[1])
        domains.append(doms)
        f.close()
    # Pairwise overlap between the domain sets of every pair of files.
    sets = [set(d.keys()) for d in domains]
    comps = []
    for i in range(len(sets)):
        for j in range(i + 1, len(sets)):
            comps.append(len(getIntersection(sets[i], sets[j])))
    print comps
    ks = [len(s) for s in sets]
    print ks
    return comps
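# Every snippet in this file calls a getIntersection helper (imported from
# utils / eventUtils elsewhere in the project) that is not shown here.
# A minimal sketch consistent with how it is used -- callers only take len()
# of the result or wrap it in list() -- is a plain set intersection; this is
# an assumption for illustration, not the project's actual code.
def getIntersection(a, b):
    return set(a) & set(b)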
def getIndicativeSentences(self, topK, intersectionTh):
    if len(self.indicativeSentences) > 0:
        return self.indicativeSentences
    else:
        # Top-K indicative words, then every sentence from every document.
        topToksTuples = self.indicativeWords[:topK]
        topToks = [k for k, _ in topToksTuples]
        for d in self.documents:
            sents = d.getSentences()
            if sents and len(sents) > 0:
                self.sentences.extend(sents)
        # Keep sentences sharing at least intersectionTh tokens with the top words.
        impSents = {}
        for sent in self.sentences:
            if sent not in impSents:
                sentToks = utils.getTokens(sent)
                if len(sentToks) > 100:  # skip overly long (likely boilerplate) sentences
                    continue
                intersect = utils.getIntersection(topToks, sentToks)
                if len(intersect) >= intersectionTh:
                    impSents[sent] = intersect
        if impSents:
            # Rank by how many indicative words each sentence contains.
            self.indicativeSentences = sorted(impSents.items(), key=lambda x: len(x[1]), reverse=True)
        return self.indicativeSentences
def getIndicativeSentences(self, topK, intersectionTh):
    if len(self.indicativeSentences) > 0:
        return self.indicativeSentences
    else:
        topToksTuples = self.indicativeWords[:topK]
        topToks = [k for k, _ in topToksTuples]
        for d in self.documents:
            sents = d.getSentences()
            self.sentences.extend(sents)
        impSents = {}
        for sent in self.sentences:
            if sent not in impSents:
                sentToks = utils.getTokens(sent)
                if len(sentToks) > 100:
                    continue
                intersect = utils.getIntersection(topToks, sentToks)
                if len(intersect) > intersectionTh:
                    # This variant stores the overlap count rather than the tokens.
                    impSents[sent] = len(intersect)
        self.indicativeSentences = utils.getSorted(impSents.items(), 1)
        return self.indicativeSentences
def getIndicativeSentences(self, topK, intersectionTh):
    if len(self.indicativeSentences) > 0:
        return self.indicativeSentences
    else:
        topToksTuples = self.indicativeWords[:topK]
        topToks = [k for k, _ in topToksTuples]
        for d in self.documents:
            sents = d.getSentences()
            if sents and len(sents) > 0:
                self.sentences.extend(sents)
        impSents = {}
        for sent in self.sentences:
            if sent not in impSents:
                sentToks = utils.getTokens(sent)
                if len(sentToks) > 100:
                    continue
                intersect = utils.getIntersection(topToks, sentToks)
                if len(intersect) > intersectionTh:  # strictly greater than the threshold here
                    impSents[sent] = intersect
        if impSents:
            self.indicativeSentences = sorted(impSents.items(), key=lambda x: len(x[1]), reverse=True)
        return self.indicativeSentences
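# Minimal self-contained sketch of the sentence-selection step performed by the
# getIndicativeSentences variants above. The naive split() tokenizer and the
# sample data are stand-ins for utils.getTokens and the real documents;
# utils.getSorted is replaced here by sorted() on the overlap size.
topToks = ['flood', 'storm', 'evacuation', 'damage']
sentences = ['The flood caused severe damage downtown.',
             'Tickets go on sale next week.',
             'Storm warnings forced an evacuation of the coast.']
intersectionTh = 2

impSents = {}
for sent in sentences:
    sentToks = sent.lower().replace('.', '').split()
    common = set(topToks) & set(sentToks)
    if len(common) >= intersectionTh:
        impSents[sent] = common
ranked = sorted(impSents.items(), key=lambda x: len(x[1]), reverse=True)
print ranked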
def getDocsVecs(self):
    # Build a sparse TF-IDF vector ({word: weight}) for every bag-of-words document.
    n = len(self.docs_bow)
    indexWordsSet = self.index.keys()
    self.docsVecs = []
    for doc_bow in self.docs_bow:
        docVec = {}
        docWordsSet = set(doc_bow.keys())
        commWords = eventUtils.getIntersection(indexWordsSet, docWordsSet)
        for w in commWords:
            # float(n) prevents Python 2 integer division from truncating the ratio before log().
            idf = math.log(float(n) / self.index[w]['docFreq'])
            tf_idf = doc_bow[w] * idf
            docVec[w] = tf_idf
        self.docsVecs.append(docVec)
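# Toy illustration of the TF-IDF weighting computed by getDocsVecs, using
# made-up data in the shapes the method assumes: docs_bow is a list of
# {word: count} dicts and index maps each word to its document frequency.
import math

docs_bow = [{'flood': 3, 'river': 1}, {'flood': 1, 'storm': 2}]
index = {'flood': {'docFreq': 2}, 'river': {'docFreq': 1}, 'storm': {'docFreq': 1}}

n = len(docs_bow)
docsVecs = []
for doc_bow in docs_bow:
    docVec = {}
    for w in set(doc_bow) & set(index):
        idf = math.log(float(n) / index[w]['docFreq'])
        docVec[w] = doc_bow[w] * idf  # tf * idf
    docsVecs.append(docVec)
print docsVecs  # 'flood' gets weight 0.0 because it occurs in every document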
def webpageEntities(self, docText=""):
    # Return (sentence, entities) pairs for sentences that mention enough known
    # disaster terms and also carry a LOCATION or DATE entity.
    disasters = set(self.entities["Disaster"].keys())
    sentences = eventUtils.getSentences(docText)
    webpageEnts = []
    for sent in sentences:
        sentToks = eventUtils.getTokens(sent)
        if len(sentToks) > 100:
            continue
        intersect = eventUtils.getIntersection(disasters, sentToks)
        if len(intersect) > self.intersectionTh:
            sentEnts = eventUtils.getEntities(sent)[0]
            if 'LOCATION' in sentEnts or 'DATE' in sentEnts:
                sentEnts['Disaster'] = intersect
                webpageEnts.append((sent, sentEnts))
    return webpageEnts
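# The shape of eventUtils.getEntities is assumed from how webpageEntities
# indexes it: element [0] of the result behaves like a dict from entity type
# to extracted values. The stand-in below exists only to illustrate that
# contract; it is not the real extractor.
def fakeGetEntities(sent):
    return [{'LOCATION': ['Houston'], 'DATE': ['Monday']}]

sentEnts = fakeGetEntities('Flooding hit Houston on Monday.')[0]
if 'LOCATION' in sentEnts or 'DATE' in sentEnts:
    sentEnts['Disaster'] = set(['flooding'])
print sentEnts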
def getEM_Sents(wps):
    docsEntities = []
    docsEntitiesFreq = []
    entitiesProb = {}
    # Split every webpage into sentences (the title is appended to the body text).
    collSents = []
    for wp in wps:
        if 'text' not in wp:
            continue
        wpContent = wp['text'] + wp['title']
        wpSplit = wpContent.split('\n')
        wpFiltered = filter(None, wpSplit)  # drop empty lines
        wpContentf = '\n'.join(wpFiltered)
        sents = eventUtils.getSentences(wpContentf)
        collSents.append(sents)
    # Most frequent tokens across the whole collection.
    allSents = []
    for sents in collSents:
        allSents.extend(sents)
    fw = eventUtils.getFreqTokens(allSents)
    fw = [w[0] for w in fw]
    # Build an event-model instance (TOPIC / LOCATION / DATE) for every sentence
    # that shares at least two of the frequent tokens.
    collEventModelInsts = []
    for sents in collSents:
        filtEvtModelInsts = []
        for s in sents:
            sentToks = eventUtils.getTokens(s)
            cw = eventUtils.getIntersection(fw, sentToks)
            if len(cw) >= 2:
                emi = {}
                emi['TOPIC'] = list(cw)
                ents = eventUtils.getEntities(s)[0]
                if 'LOCATION' in ents:
                    emi['LOCATION'] = ents['LOCATION']
                if 'DATE' in ents:
                    emi['DATE'] = ents['DATE']
                filtEvtModelInsts.append(emi)
        collEventModelInsts.append(filtEvtModelInsts)
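# Self-contained sketch of the frequent-token filter that getEM_Sents applies:
# a sentence is kept when it shares at least two of the collection's most
# frequent tokens. Counter stands in for eventUtils.getFreqTokens (which
# presumably also drops stopwords) and split() for eventUtils.getTokens.
from collections import Counter

allSents = ['Hurricane Sandy hit the coast on Monday.',
            'Sandy caused flooding along the coast.',
            'Local elections were held last week.']
tokens = [w for s in allSents for w in s.lower().rstrip('.').split()]
fw = [w for w, _ in Counter(tokens).most_common(3)]

eventModelInsts = []
for s in allSents:
    cw = set(fw) & set(s.lower().rstrip('.').split())
    if len(cw) >= 2:
        eventModelInsts.append({'TOPIC': list(cw)})  # LOCATION/DATE would be added by NER
print eventModelInsts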
def webpageEntities_old(self, docText=""):
    # Earlier version of webpageEntities; self.entities["Disaster"] is used directly
    # rather than converted to a set of keys first.
    disasters = self.entities["Disaster"]
    sentences = eventUtils.getSentences(docText)
    webpageEnts = []
    for sent in sentences:
        sentToks = eventUtils.getTokens(sent)
        if len(sentToks) > 100:
            continue
        intersect = eventUtils.getIntersection(disasters, sentToks)
        if len(intersect) > self.intersectionTh:
            sentEnts = eventUtils.getEntities(sent)[0]
            if 'LOCATION' in sentEnts or 'DATE' in sentEnts:
                sentEnts['Disaster'] = intersect
                webpageEnts.append((sent, sentEnts))
    return webpageEnts