Example #1
0
def compareDomains(files):
    """Count the domains shared by every pair of domain-frequency files.

    Each file is read line by line. A line is split on single spaces; the
    first field is used as the domain and the second as an integer count.
    Only domains whose count is greater than 1 are kept, and a later line
    for the same domain overwrites an earlier one.

    Args:
        files: sequence of paths of the files to compare.

    Returns:
        A list holding, for every pair of files (i, j) with i < j, the size
        of the intersection of their kept domains. The list is also printed,
        followed by the number of kept domains per file.
    """
    domains = []
    for bf in files:
        doms = {}
        # The context manager closes the file even when a line fails to parse.
        with open(bf) as f:
            for line in f:
                parts = line.strip().split(" ")
                # Blank or one-field lines carry no count; skip them instead
                # of failing with an IndexError.
                if len(parts) < 2:
                    continue
                count = int(parts[1])
                if count > 1:
                    doms[parts[0]] = count
        domains.append(doms)

    sets = [set(d) for d in domains]
    comps = []
    for i, first in enumerate(sets):
        for second in sets[i + 1:]:
            comps.append(len(getIntersection(first, second)))

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(comps)
    ks = [len(s) for s in sets]
    print(ks)
    return comps
Example #2
0
    def getIndicativeSentences(self, topK, intersectionTh):
        """Return sentences ranked by their overlap with the top indicative words.

        A previously computed, non-empty ranking in self.indicativeSentences
        is returned as is. Otherwise the sentences of every document are
        appended to self.sentences, and each distinct sentence of at most
        100 tokens is kept when its intersection with the first topK
        indicative words has at least intersectionTh elements.

        Returns:
            A list of (sentence, intersection) pairs sorted by intersection
            size, largest first, or the unchanged self.indicativeSentences
            when no sentence qualifies.
        """
        # Serve the cached ranking when there is one.
        if len(self.indicativeSentences) > 0:
            return self.indicativeSentences

        keywords = [word for word, _ in self.indicativeWords[:topK]]

        for doc in self.documents:
            docSents = doc.getSentences()
            # Documents without sentences contribute nothing.
            if not docSents or len(docSents) <= 0:
                continue
            self.sentences.extend(docSents)

        matches = {}
        for sentence in self.sentences:
            if sentence in matches:
                continue
            tokens = utils.getTokens(sentence)
            if len(tokens) > 100:
                continue
            shared = utils.getIntersection(keywords, tokens)
            if len(shared) >= intersectionTh:
                matches[sentence] = shared

        if matches:
            ranked = list(matches.items())
            ranked.sort(key=lambda item: len(item[1]), reverse=True)
            self.indicativeSentences = ranked
        return self.indicativeSentences
Example #3
0
    def getIndicativeSentences(self, topK, intersectionTh):
        """Return sentences ranked by how many top indicative words they contain.

        A previously computed, non-empty ranking in self.indicativeSentences
        is returned as is. Otherwise the sentences of every document are
        appended to self.sentences, and each distinct sentence of at most
        100 tokens is scored with the size of its intersection with the
        first topK indicative words. Sentences scoring strictly more than
        intersectionTh are kept.

        Returns:
            The result of utils.getSorted applied to the
            (sentence, overlap size) pairs; it is also stored in
            self.indicativeSentences.
        """
        if len(self.indicativeSentences) > 0:
            return self.indicativeSentences

        topToks = [k for k, _ in self.indicativeWords[:topK]]

        for d in self.documents:
            sents = d.getSentences()
            # A document may yield no sentences (None or empty); extending
            # with None would raise TypeError, so skip such documents.
            if sents:
                self.sentences.extend(sents)

        impSents = {}
        for sent in self.sentences:
            if sent in impSents:
                continue
            sentToks = utils.getTokens(sent)
            if len(sentToks) > 100:
                continue
            intersect = utils.getIntersection(topToks, sentToks)
            if len(intersect) > intersectionTh:
                impSents[sent] = len(intersect)

        self.indicativeSentences = utils.getSorted(impSents.items(), 1)
        return self.indicativeSentences
 def getIndicativeSentences(self,topK,intersectionTh):
     """Rank sentences by their overlap with the top indicative words.

     When self.indicativeSentences is already non-empty it is returned
     untouched. Otherwise every document's sentences are appended to
     self.sentences, and each distinct sentence of at most 100 tokens is
     kept when its intersection with the first topK indicative words has
     strictly more than intersectionTh elements.

     Returns:
         A list of (sentence, intersection) pairs ordered by intersection
         size, largest first, or the unchanged self.indicativeSentences
         when nothing qualifies.
     """
     # Reuse the earlier result when one exists.
     if len(self.indicativeSentences) > 0:
         return self.indicativeSentences

     leadingWords = [tok for tok, _ in self.indicativeWords[:topK]]

     for document in self.documents:
         found = document.getSentences()
         # Skip documents that produced no sentences.
         if not found or len(found) <= 0:
             continue
         self.sentences.extend(found)

     selected = {}
     for candidate in self.sentences:
         if candidate in selected:
             continue
         candidateToks = utils.getTokens(candidate)
         if len(candidateToks) > 100:
             continue
         common = utils.getIntersection(leadingWords, candidateToks)
         if len(common) > intersectionTh:
             selected[candidate] = common

     if selected:
         ordered = list(selected.items())
         ordered.sort(key=lambda entry: len(entry[1]), reverse=True)
         self.indicativeSentences = ordered
     return self.indicativeSentences
Example #5
0
 def getIndicativeSentences(self,topK,intersectionTh):
     """Return sentences ranked by how many top indicative words they contain.

     A non-empty self.indicativeSentences is treated as a cached result and
     returned directly. Otherwise the sentences of every document are
     appended to self.sentences, and each distinct sentence of at most 100
     tokens is scored with the size of its intersection with the first
     topK indicative words. Only scores strictly greater than
     intersectionTh are kept.

     Returns:
         The result of utils.getSorted applied to the
         (sentence, overlap size) pairs; it is also stored in
         self.indicativeSentences.
     """
     if len(self.indicativeSentences) > 0:
         return self.indicativeSentences

     topToks = [k for k,_ in self.indicativeWords[:topK]]

     for d in self.documents:
         sents = d.getSentences()
         # Extending with None would raise TypeError; documents that yield
         # no sentences are skipped instead.
         if sents:
             self.sentences.extend(sents)

     impSents = {}
     for sent in self.sentences:
         if sent in impSents:
             continue
         sentToks = utils.getTokens(sent)
         if len(sentToks) > 100:
             continue
         intersect = utils.getIntersection(topToks, sentToks)
         if len(intersect) > intersectionTh:
             impSents[sent] = len(intersect)

     self.indicativeSentences = utils.getSorted(impSents.items(),1)
     return self.indicativeSentences
Example #6
0
 def getDocsVecs(self):
     """Build one TF-IDF vector per document and store them in self.docsVecs.

     For every bag of words in self.docs_bow, each word that also appears
     in self.index gets the weight
     doc_bow[word] * log(n / self.index[word]['docFreq']), where n is the
     number of documents in self.docs_bow. self.docsVecs is replaced by
     the list of resulting {word: weight} dicts; nothing is returned.
     """
     # Use a float so the ratio is not truncated by integer division on
     # Python 2 (e.g. 3/2 would give 1 and log(1) == 0).
     n = float(len(self.docs_bow))
     indexWordsSet = self.index.keys()
     self.docsVecs = []
     for doc_bow in self.docs_bow:
         docWordsSet = set(doc_bow.keys())
         commWords = eventUtils.getIntersection(indexWordsSet, docWordsSet)
         docVec = {}
         for w in commWords:
             idf = math.log(n / self.index[w]['docFreq'])
             docVec[w] = doc_bow[w] * idf
         self.docsVecs.append(docVec)
Example #7
0
    def webpageEntities(self, docText=""):
        """Extract entities from sentences of docText that mention disasters.

        The text is split into sentences with eventUtils.getSentences.
        Sentences longer than 100 tokens are ignored. A sentence is kept
        when its tokens share more than self.intersectionTh elements with
        the keys of self.entities["Disaster"] and its extracted entities
        contain a 'LOCATION' or a 'DATE' entry.

        Args:
            docText: text of the web page to analyse.

        Returns:
            A list of (sentence, entities) tuples; each entities mapping
            also receives a 'Disaster' entry holding the matched terms.
        """
        disasters = set(self.entities["Disaster"].keys())

        webpageEnts = []
        for sent in eventUtils.getSentences(docText):
            sentToks = eventUtils.getTokens(sent)
            if len(sentToks) > 100:
                continue
            intersect = eventUtils.getIntersection(disasters, sentToks)
            if len(intersect) > self.intersectionTh:
                sentEnts = eventUtils.getEntities(sent)[0]
                # `in` replaces dict.has_key(), which no longer exists in
                # Python 3 and is deprecated in Python 2.
                if 'LOCATION' in sentEnts or 'DATE' in sentEnts:
                    sentEnts['Disaster'] = intersect
                    webpageEnts.append((sent, sentEnts))

        return webpageEnts
Example #8
0
 def webpageEntities(self,docText=""):
     """Collect the entities of disaster-related sentences found in docText.

     docText is split with eventUtils.getSentences; sentences of more than
     100 tokens are skipped. A sentence qualifies when its tokens share
     more than self.intersectionTh elements with the keys of
     self.entities["Disaster"] and its extracted entities include a
     'LOCATION' or a 'DATE' entry.

     Args:
         docText: text of the web page to analyse.

     Returns:
         A list of (sentence, entities) tuples; the matched disaster terms
         are stored under the 'Disaster' key of each entities mapping.
     """
     disasters = set(self.entities["Disaster"].keys())

     webpageEnts = []
     for sent in eventUtils.getSentences(docText):
         sentToks = eventUtils.getTokens(sent)
         if len(sentToks) > 100:
             continue
         intersect = eventUtils.getIntersection(disasters, sentToks)
         if len(intersect) > self.intersectionTh:
             sentEnts = eventUtils.getEntities(sent)[0]
             # Membership test instead of dict.has_key(), which was removed
             # in Python 3.
             if 'LOCATION' in sentEnts or 'DATE' in sentEnts:
                 sentEnts['Disaster'] = intersect
                 webpageEnts.append((sent,sentEnts))

     return webpageEnts
Example #9
0
def getEM_Sents(wps):
    """Build event-model instances (TOPIC / LOCATION / DATE) per web page.

    wps is iterated as a collection of mappings; entries without a 'text'
    key are skipped, and the remaining ones must also provide 'title'.

    NOTE(review): only part of this function is visible here - no return
    statement is shown and several locals below are never used in the
    visible portion. Confirm against the complete source.
    """
    # Not used in the visible part of the function.
    docsEntities=[]
    docsEntitiesFreq = []
    entitiesProb = {}
    
    
    # One list of sentences per web page that has a 'text' entry.
    collSents = []
    #for i,wp in enumerate(wps):
    for wp in wps:
        if 'text' not in wp:
            continue
        # Text and title are concatenated without any separator.
        wpContent = wp['text']+wp['title']
        # Drop empty lines before sentence splitting.
        wpSplit = wpContent.split('\n')
        wpFiltered = filter(None,wpSplit)
        wpContentf = '\n'.join(wpFiltered)
        sents = eventUtils.getSentences(wpContentf)
        collSents.append(sents)
    # Flatten all pages' sentences to compute collection-wide frequent tokens.
    allSents = []
    for sents in collSents:
        allSents.extend(sents)
    fw = eventUtils.getFreqTokens(allSents)
    # Keep only the first element of each entry
    # (presumably (token, frequency) pairs - TODO confirm).
    fw = [w[0] for w in fw]
    
    #collFilteredSents = []
    # One list of event-model instances per page, in the order of collSents.
    collEventModelInsts=[]
    for sents in collSents:
        filtEvtModelInsts = []
        for s in sents:
            sentToks = eventUtils.getTokens(s)
            cw = eventUtils.getIntersection(fw, sentToks)
            # Only sentences sharing at least two frequent tokens yield an instance.
            if len(cw) >= 2:
                emi = {}
                emi['TOPIC'] = list(cw)
                ents = eventUtils.getEntities(s)[0]
                if ents.has_key('LOCATION'):
                    emi['LOCATION'] = ents['LOCATION']
                    #filtEvtModelInsts.append(emi)
                if ents.has_key('DATE'):
                        #emi['TOPIC'] = cw
                    emi['DATE']=ents['DATE']
                # The instance is recorded even when neither LOCATION nor DATE was found.
                filtEvtModelInsts.append(emi)
        collEventModelInsts.append(filtEvtModelInsts)
    '''
Example #10
0
    def webpageEntities_old(self, docText=""):
        """Extract entities from sentences of docText that mention disasters.

        Unlike a set-based lookup, self.entities["Disaster"] is passed to
        eventUtils.getIntersection unchanged. Sentences longer than 100
        tokens are ignored. A sentence is kept when the intersection with
        its tokens has more than self.intersectionTh elements and its
        extracted entities contain a 'LOCATION' or a 'DATE' entry.

        Args:
            docText: text of the web page to analyse.

        Returns:
            A list of (sentence, entities) tuples; each entities mapping
            also receives a 'Disaster' entry holding the intersection.
        """
        disasters = self.entities["Disaster"]

        webpageEnts = []
        for sent in eventUtils.getSentences(docText):
            sentToks = eventUtils.getTokens(sent)
            if len(sentToks) > 100:
                continue
            intersect = eventUtils.getIntersection(disasters, sentToks)
            if len(intersect) > self.intersectionTh:
                sentEnts = eventUtils.getEntities(sent)[0]
                # `in` replaces dict.has_key(), which no longer exists in
                # Python 3 and is deprecated in Python 2.
                if 'LOCATION' in sentEnts or 'DATE' in sentEnts:
                    sentEnts['Disaster'] = intersect
                    webpageEnts.append((sent, sentEnts))

        return webpageEnts
Example #11
0
 def webpageEntities_old(self,docText=""):
     """Collect the entities of disaster-related sentences found in docText.

     self.entities["Disaster"] is handed to eventUtils.getIntersection as
     is. Sentences of more than 100 tokens are skipped. A sentence
     qualifies when the intersection with its tokens has more than
     self.intersectionTh elements and its extracted entities include a
     'LOCATION' or a 'DATE' entry.

     Args:
         docText: text of the web page to analyse.

     Returns:
         A list of (sentence, entities) tuples; the intersection is stored
         under the 'Disaster' key of each entities mapping.
     """
     disasters = self.entities["Disaster"]

     webpageEnts = []
     for sent in eventUtils.getSentences(docText):
         sentToks = eventUtils.getTokens(sent)
         if len(sentToks) > 100:
             continue
         intersect = eventUtils.getIntersection(disasters, sentToks)
         if len(intersect) > self.intersectionTh:
             sentEnts = eventUtils.getEntities(sent)[0]
             # Membership test instead of dict.has_key(), which was removed
             # in Python 3.
             if 'LOCATION' in sentEnts or 'DATE' in sentEnts:
                 sentEnts['Disaster'] = intersect
                 webpageEnts.append((sent,sentEnts))

     return webpageEnts