def getCollectionDocs(filename):
    # Read one URL per line from the given file and fetch each page's text.
    f = open(filename)
    ls = f.readlines()
    f.close()
    ls = [l.strip() for l in ls]
    docsL = eventUtils.getWebpageText(ls)
    return docsL
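# A minimal usage sketch, assuming eventUtils is importable and that a
# plain-text file with one URL per line exists at the hypothetical path
# 'seedsURLs.txt':
if __name__ == '__main__':
    docs = getCollectionDocs('seedsURLs.txt')
    texts = [d['text'] for d in docs if d.has_key('text')]
    print len(texts), "pages fetched"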
def getCollectionDocs(self, inputURLs):
    # The URLs are passed in directly as a list rather than read from a file.
    ls = inputURLs
    docsL = eventUtils.getWebpageText(ls)
    return docsL
def getText(self):
    # Return the cached page text, fetching it on first access.
    if self.text != '':
        return self.text
    else:
        r = utils.getWebpageText(self.URL)[0]
        if r:
            self.text = r['text']
            self.title = r['title']
        return self.text
def getCollectionDocs(self, inputURLs):
    # The URLs are passed in directly as a list rather than read from a file.
    ls = inputURLs
    docsL = eventUtils.getWebpageText(ls)
    return docsL
def buildModel(self, seedURLs, num):
    # Download the seed pages and keep title + body text for each one.
    td = getWebpageText(seedURLs)
    docs = [t['text'] + " " + t['title'] for t in td if t.has_key('text')]
    docs = getTokenizedDocs(docs)
    self.n = len(docs)
    # Build bag-of-words representations and index the vocabulary.
    docs_bow = [self.doc2bow(doc) for doc in docs]
    self.buildVocabIndex(docs_bow)
    # Keep the top-num words by term frequency as the event model.
    selected = self.selectImportantWords_tf(num)
    print selected
    wordsList = self.index.keys()
    selected_words = [wordsList[k[1]] for k in selected]
    print selected_words
    self.model = (selected, selected_words)
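# A small, self-contained sketch of the term-frequency selection idea that
# buildModel delegates to selectImportantWords_tf; this is an illustration
# only, not the class's actual index-based implementation.
from collections import Counter

def select_important_words_tf(tokenized_docs, num):
    # Count every token across all documents and keep the num most frequent.
    counts = Counter()
    for doc in tokenized_docs:
        counts.update(doc)
    return counts.most_common(num)

# Example:
#   select_important_words_tf([['ebola', 'virus', 'ebola'],
#                              ['virus', 'outbreak', 'virus']], 2)
#   -> [('virus', 3), ('ebola', 2)]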
def __init__(self, url, pageId):
    # url is a pair whose second element is the actual address to fetch.
    self.pageUrl = url
    self.pageId = pageId
    self.text = ""
    self.title = ""
    self.outgoingUrls = []
    self.soup = None
    # Download the page once and keep its text, title, and parsed HTML.
    res = utils.getWebpageText(url[1])[0]
    if res:
        self.text = res['text']
        self.title = res['title']
        self.soup = BeautifulSoup(res['html'])
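# A minimal sketch (an assumption, not shown in the original constructor) of
# how the outgoingUrls list could be filled from the stored soup; findAll and
# get are available in both BeautifulSoup 3 and 4.
def collectOutgoingUrls(self):
    if self.soup is None:
        return []
    self.outgoingUrls = [a.get('href') for a in self.soup.findAll('a') if a.get('href')]
    return self.outgoingUrls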
def buildProbEventModel(self, urlsList, topK):
    docsList = eventUtils.getWebpageText(urlsList)
    t = ''
    docsEntities = []
    docsEntitiesFreq = []
    entitiesProb = {}
    # Convert each doc into token, location, and date lists together with
    # their corresponding frequency distributions.
    for doc in docsList:
        if doc.has_key('text'):
            t = doc['text']
            if doc.has_key('title'):
                t = doc['title'] + " " + t
        if t:
            ents = eventUtils.getEntities(t)[0]
            docEnt = {}
            docEnt['LOCATION'] = {}
            if 'LOCATION' in ents:
                docEnt['LOCATION'] = ents['LOCATION']
            docEnt['DATE'] = {}
            if 'DATE' in ents:
                docEnt['DATE'] = ents['DATE']
            toks = eventUtils.getTokens(t)
            docEnt['Topic'] = toks
            docsEntities.append(docEnt)
            docEntFreq = {}
            for k in docEnt:
                docEntFreq[k] = eventUtils.getFreq(docEnt[k])
            docsEntitiesFreq.append(docEntFreq)
    # Collection-level frequency for each entity type (tokens, locations, dates).
    # Each item's probability is its smoothed frequency across all docs divided
    # by the total frequency of all items of that type.
    entitiesProb['LOCATION'] = {}
    entitiesProb['DATE'] = {}
    entitiesProb['Topic'] = {}
    for docEntFreq in docsEntitiesFreq:
        for entity in docEntFreq:
            for val in docEntFreq[entity]:
                if val in entitiesProb[entity]:
                    entitiesProb[entity][val] += docEntFreq[entity][val]
                else:
                    entitiesProb[entity][val] = docEntFreq[entity][val]
    for ent in entitiesProb:
        allvalsFreq = sum([v for _, v in entitiesProb[ent].items()])
        for k in entitiesProb[ent]:
            # Add-one smoothing over this entity type's vocabulary.
            entitiesProb[ent][k] = (1.0 + entitiesProb[ent][k]) / (len(entitiesProb[ent]) + allvalsFreq)
    # Keep only the highest-probability entries per entity type (note: the
    # hard-coded 10 below, not topK, controls how many are kept).
    mle = self.getMLEEventEntities(entitiesProb, 10)
    for k in mle:
        print k, mle[k]
    self.probEvtModel = {}
    for k in mle:
        self.probEvtModel[k] = dict(mle[k])
    self.eDisDic = self.probEvtModel['Topic']
    # Stem the location and date tokens so they can be matched against stemmed text.
    locToks = self.probEvtModel['LOCATION'].keys()
    locToks = eventUtils.getStemmedWords(locToks)
    self.locDic = dict(zip(locToks, self.probEvtModel['LOCATION'].values()))
    dToks = self.probEvtModel['DATE'].keys()
    dToks = eventUtils.getStemmedWords(dToks)
    self.dDic = dict(zip(dToks, self.probEvtModel['DATE'].values()))
    return docsEntities, entitiesProb
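# A small, self-contained sketch of the add-one (Laplace) smoothing applied
# above to each entity type: P(v) = (1 + freq(v)) / (|vocabulary| + total frequency).
# Illustration only, with made-up counts.
def laplace_probs(freqs):
    total = sum(freqs.values())
    vocab_size = len(freqs)
    return dict((v, (1.0 + f) / (vocab_size + total)) for v, f in freqs.items())

# Example: laplace_probs({'monrovia': 3, 'liberia': 1})
# -> {'monrovia': 4/6 ~ 0.667, 'liberia': 2/6 ~ 0.333}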
    f.close()
    print "file saved"
    texts = utils.expandWarcFile(warcFile)
else:
    texts = utils.expandWarcFile(warcFile)

urls = form.getvalue('urls')
if urls:
    webpagesURLs = urls.split('\n')
    webpagesText = utils.getWebpageText(webpagesURLs)
    texts = [t['text'] for t in webpagesText if t.has_key('text') and len(t['text']) > 0]
    print texts

# Get LDA topics
ldaTopics = utils.getLDATopics(texts)
# Get frequent tokens
sortedTokensFreqs = utils.getFreqTokens(texts)
# Get indicative tokens
sortedToksTFDF = utils.getIndicativeWords(texts)
'''
filteredToksTFDF = []
toks = " ".join([])
from eventUtils import getWebpageText

# Read the seed URLs (one per line), download each page, and save its text
# under webpages/<index>.txt.
f = open('seedsURLs_z_543-noTweets.txt', 'r')
urls = f.readlines()
f.close()
urls = [u.strip() for u in urls]

texts = getWebpageText(urls)
i = 0
for p in texts:
    if p.has_key('text'):
        ftext = open("webpages/" + str(i) + ".txt", "w")
        ftext.write(p['text'].encode("utf-8"))
        ftext.close()
        i += 1