Example #1
0
def getCollectionDocs(filename):
    f = open(filename)
    ls = f.readlines()
    f.close()
    ls = [l.strip() for l in ls]
    docsL = eventUtils.getWebpageText(ls)
    return docsL
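A minimal usage sketch for the snippet above (Python 2, to match the examples): it assumes the project's eventUtils module is importable and that 'seeds.txt' is a hypothetical file with one URL per line; getWebpageText is taken to return a list of dicts carrying 'text' and 'title' keys, as the later examples rely on.

import eventUtils

urls = [l.strip() for l in open('seeds.txt').readlines()]  # hypothetical seeds file, one URL per line
docs = eventUtils.getWebpageText(urls)                     # list of dicts with 'text'/'title' entries
for d in docs:
    if d.has_key('text'):
        print d.get('title', ''), len(d['text'])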
Example #2
0
def getCollectionDocs(filename):
    f = open(filename)
    ls = f.readlines()
    f.close()
    ls = [l.strip() for l in ls]
    docsL = eventUtils.getWebpageText(ls)
    return docsL
 def getCollectionDocs(self,inputURLs):
     
     #f = open(inputURLs)
     #ls = f.readlines()
     #f.close()
     #ls = [l.strip() for l in ls]
     ls = inputURLs
     docsL = eventUtils.getWebpageText(ls)
     return docsL
Example #4
0
 def getText(self):
     if self.text != '':
         return self.text
     else:
         r = utils.getWebpageText(self.URL)[0]
         if r:
             self.text = r['text']
             self.title = r['title']
         return self.text
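getText above fetches the page lazily: the first call goes through utils.getWebpageText, which evidently returns a list even for a single URL (hence the [0]), and the result's text and title are cached on the object so later calls skip the fetch. A short hedged call sketch, assuming the project's utils module is importable; the URL is a placeholder, not from the original:

import utils

r = utils.getWebpageText('http://www.example.com/')[0]  # placeholder URL; [0] because a list comes back
if r:
    print r['title']
    print len(r['text'])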
Example #5
0
 def getText(self):
     if self.text != '':
         return self.text
     else:
         r = utils.getWebpageText(self.URL)[0]
         if r:
             self.text = r['text']
             self.title = r['title']
         return self.text
    def getCollectionDocs(self, inputURLs):

        #f = open(inputURLs)
        #ls = f.readlines()
        #f.close()
        #ls = [l.strip() for l in ls]
        ls = inputURLs
        docsL = eventUtils.getWebpageText(ls)
        return docsL
Example #7
0
 def buildModel(self,seedURLs,num):
     #docs = downloadRawDocs(seedURLs)
     td = getWebpageText(seedURLs)
     docs = [t['text'] + " " + t.get('title', '') for t in td if t.has_key('text')]  # t.get guards against pages with no title
     
     docs = getTokenizedDocs(docs)
     self.n = len(docs)
     docs_bow = [self.doc2bow(doc) for doc in docs]
     #vocab = self.buildVocab(docs_bow)
     self.buildVocabIndex(docs_bow)
     selected = self.selectImportantWords_tf(num)
     print selected
     wordsList = self.index.keys()
     selected_words = [wordsList[k[1]] for k in selected]
     print selected_words
     self.model = (selected,selected_words)
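buildModel above downloads the seed pages, tokenizes them, builds bag-of-words vectors and a vocabulary index, then keeps the num highest term-frequency words as the model. selectImportantWords_tf itself is not shown in this excerpt; purely as an illustration of term-frequency selection (not the original implementation), the idea can be sketched with the standard library:

from collections import Counter

def top_tf_words(tokenized_docs, num):
    # count every token across the collection and keep the num most frequent
    counts = Counter()
    for doc in tokenized_docs:
        counts.update(doc)
    return counts.most_common(num)

print top_tf_words([['ebola', 'outbreak', 'ebola'], ['outbreak', 'quarantine']], 2)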
 def __init__(self,url,pageId):
     self.pageUrl = url
     self.pageId = pageId
     self.text = ""
     self.title = "" 
     self.outgoingUrls=[]
     self.soup = None
     res = utils.getWebpageText(url[1])[0]
     
     if res:
         self.text = res['text']
         self.title = res['title']
         self.soup = BeautifulSoup(res['html'])
     
     '''
     try:
         
         headers = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 5.1; rv:10.0.1) Gecko/20100101 Firefox/10.0.1'}
         req = urllib2.Request(url[1], None, headers)
         page = urllib2.urlopen(req).read()        
     except urllib2.HTTPError:
         print sys.exc_info()[0]
         self.text = "Error"
         return
     except urllib2.URLError:
         print sys.exc_info()[0]           
         self.text = "Error"
         return
     except ValueError:
         print sys.exc_info()[0]
         self.text = "Error"
         return
     except :
         print sys.exc_info()[0]
         self.text = "Error"
         return
     '''
 def buildProbEventModel(self,urlsList,topK):
     
     docsList = eventUtils.getWebpageText(urlsList) #self.getCollectionDocs(urlsList)
     t = ''
     #docsTotalFreqs=[]
     docsEntities=[]
     docsEntitiesFreq = []
     entitiesProb = {}
     
     # Convert each doc to tokens, locations, dates lists and their corresponding frequency distributions
     # Also produces the total frequency for each document of each list (tokens, locations, and dates)
     for doc in docsList:
         t = ''  # reset per document; otherwise a page missing 'text' would reuse the previous page's text
         if doc.has_key('text'):
             t = doc['text']
             if doc.has_key('title'):
                 t =doc['title']+ " "+t
         if t:
             #print 'Reading ' + t[:100]
             ents = eventUtils.getEntities(t)[0]
             docEnt = {}
             docEnt['LOCATION']={}
             if 'LOCATION' in ents:
                 docEnt['LOCATION'] =  ents['LOCATION']
             docEnt['DATE']={}
             if 'DATE' in ents:
                 docEnt['DATE'] = ents['DATE']
             toks = eventUtils.getTokens(t)
             docEnt['Topic'] = toks
             docsEntities.append(docEnt)
             
             docEntFreq = {}
             #docTotals = {}
             for k in docEnt:
                 docEntFreq[k] = eventUtils.getFreq(docEnt[k])
                 #totalFreq = sum([v for _,v in docEntFreq[k].items()])
                 
                 #docTotals[k] = totalFreq
             docsEntitiesFreq.append(docEntFreq)
             #docsTotalFreqs.append(docTotals)
     
     # Collection-level frequency for each entity type (tokens, locations, dates).
     # The probability of each item in an entity list is computed as
     # (freq of item across all docs) / (total freq of all terms in that list), with add-one smoothing below.
     entitiesProb['LOCATION']={}
     entitiesProb['DATE']={}
     entitiesProb['Topic']={}
     
     for docEntFreq in docsEntitiesFreq:
         for entity in docEntFreq:
             for val in docEntFreq[entity]:
                 if val in entitiesProb[entity]:
                     entitiesProb[entity][val] += docEntFreq[entity][val]
                 else:
                     entitiesProb[entity][val] = docEntFreq[entity][val]
     
     for ent in entitiesProb:
         allvalsFreq = sum([v for _,v in entitiesProb[ent].items()])
         for k in entitiesProb[ent]:
             #entitiesProb[ent][k] = (1.0 + (entitiesProb[ent][k] *1.0)) / (docsTotalFreqs[ent] + allDocsTotal[ent])
             
             entitiesProb[ent][k] = (1.0 + (entitiesProb[ent][k] *1.0)) / (len(entitiesProb[ent]) + allvalsFreq)
             
         
     #self.probEvtModel = entitiesProb
     
     mle =  self.getMLEEventEntities(entitiesProb,10)
     for k in mle:
         print k, mle[k]
         
     
     self.probEvtModel = {}
     for k in mle:
         self.probEvtModel[k] = dict(mle[k])#entitiesProb[k][:topK]
     
     self.eDisDic = self.probEvtModel['Topic']
     
     
     locToks = self.probEvtModel['LOCATION'].keys()
     locToks = eventUtils.getStemmedWords(locToks)
     self.locDic = dict(zip(locToks,self.probEvtModel['LOCATION'].values()))
     
 
     dToks = self.probEvtModel['DATE'].keys()
     dToks = eventUtils.getStemmedWords(dToks)
     self.dDic = dict(zip(dToks,self.probEvtModel['DATE'].values()))
     
     
     
     return docsEntities, entitiesProb
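The final loop of buildProbEventModel applies add-one (Laplace-style) smoothing: each value's probability is (1 + its collection frequency) divided by (the number of distinct values for that entity type + the total frequency of all its values). A small worked sketch of that formula on made-up counts; the names and numbers are illustrative only:

freqs = {'monrovia': 4, 'dallas': 1}    # illustrative frequencies for one entity type, e.g. LOCATION
total = sum(freqs.values())             # 5
vocab = len(freqs)                      # 2 distinct values
probs = {}
for val in freqs:
    probs[val] = (1.0 + freqs[val]) / (vocab + total)
print probs                             # monrovia: 5/7 = 0.714..., dallas: 2/7 = 0.285...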
Example #10
0
		f.close()
		print "file saved"
		#warcFile = warc.file.name
		#print warcFile
		#texts = utils.expandWarcFile(warcFile)
		texts = utils.expandWarcFile(warcFile)
	else:
		texts = utils.expandWarcFile(warcFile)			

	urls = form.getvalue('urls')
	#if not urls: 
	#	urls = 'http://www.nbcnews.com/storyline/ebola-virus-outbreak/why-its-not-enough-just-eradicate-ebola-n243891\nhttp://www.npr.org/blogs/thetwo-way/2014/11/09/362770821/maine-nurse-to-move-out-of-state-following-ebola-quarantine-row'
	
	if urls:
		webpagesURLs = urls.split('\n')
		webpagesText = utils.getWebpageText(webpagesURLs)
		texts = [t['text'] for t in webpagesText if t.has_key('text') and len(t['text'])>0]
	
	print texts
	
	#Get LDA Topics
	ldaTopics = utils.getLDATopics(texts)
	
	#Get Frequent Tokens
	sortedTokensFreqs = utils.getFreqTokens(texts)

	#Get Indicative tokens
	sortedToksTFDF = utils.getIndicativeWords(texts)
	'''
	filteredToksTFDF = []
	toks = " ".join([])
Example #11
0
from eventUtils import getWebpageText
f = open('seedsURLs_z_543-noTweets.txt','r')
urls = f.readlines()
f.close()
urls = [u.strip() for u in urls]
texts = getWebpageText(urls)
i = 0
for p in texts:
    
    if p.has_key('text'):
        ftext = open("webpages/"+str(i) + ".txt", "w")
        ftext.write(p['text'].encode("utf-8"))
        ftext.close()
    i+=1
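One practical caveat for the script above: the open("webpages/" + str(i) + ".txt", "w") call raises an IOError if the webpages directory does not already exist, so creating it up front is a reasonable guard (standard-library sketch):

import os

if not os.path.isdir('webpages'):
    os.makedirs('webpages')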