def crawl(self):
    self.harvestRatioData = []
    self.relevantPages = []
    while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
        # pop the highest-priority (most promising) URL
        work_url = self.priorityQueue.pop()
        self.visited.append(work_url)
        page = Webpage(work_url, self.pagesCount)
        if page.text == '':
            continue
        page_score = 0.0
        if self.combineScore:
            if len(page.text) > 0:
                page_score = self.scorer.calculate_score(page.text, 'W')
            else:
                continue
        page.estimatedScore = page_score
        if self.restricted:
            # in restricted mode, skip pages below the relevance threshold
            if page_score < self.pageScoreThreshold:
                continue
        #print -1 * work_url[0], ",", str(page_score), ",", work_url[1], ",", work_url[3]
        print -1 * work_url[0], ",", work_url[1], ",", work_url[3]
        self.pagesCount += 1
        page.getUrls()
        self.relevantPages.append(page)
        for link in page.outgoingUrls:
            url = link.address
            if url != None and url != '':
                url = url.strip()
                # normalize: drop query strings and fragments
                if url.find('?') != -1:
                    url = url.split('?')[0]
                if url.find('#') != -1:
                    url = url.split('#')[0]
                # skip media, share/login pages and other non-article URLs
                if url.endswith(("comment", "comment/", "feed", "comments", "feed/",
                                 "comments/", ".rss", "video", "video/", "link", "gif",
                                 "jpeg", "mp4", "wav", "jpg", "mp3", "png", "share.php",
                                 "sharer.php", "login.php", "print", "print/", "button/",
                                 "share", "email", "submit", "post", ".pdf")):
                    continue
                if not self.exists(url, 1):
                    #tot_score = 0.0
                    if url.startswith('http') and not self.exists(url, 2):
                        if self.mode == 1:
                            url_score = self.scorer.calculate_score(link.getAllText(), 'U')
                            if self.combineScore:
                                tot_score = 0.5 * page_score + 0.5 * url_score
                            else:
                                tot_score = url_score
                            #if tot_score >= self.urlScoreThreshold:
                            self.priorityQueue.push(((-1 * tot_score), url, page.pageId, link.getAllText()))
                            #else:
                            #    self.priorityQueue.push(((-1 * page_score), url, page.pageId, link.getAllText()))
        #else:
        #    self.pages.append((page, 0))
    print self.priorityQueue.isempty()
def crawl(self):
    # start crawling
    #myopener = MyOpener()
    self.harvestRatioData = []
    self.relevantPages = []
    while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
        work_url = self.priorityQueue.pop()
        self.visited.append(work_url[1])
        #print ("%s, %s") % (-1 * work_url[0], work_url[1])
        #page = urllib2.urlopen(work_url)
        '''page = myopener.open(work_url)
        self.pagesCount += 1
        soup = BeautifulSoup(page)
        links = soup.find_all('a')'''
        page = Webpage(work_url, self.pagesCount)
        if len(page.text) > 0:
            page_score = self.scorer.calculate_score(page.text)
        else:
            page_score = 0
        self.pagesCount += 1
        if page_score > self.pageScoreThreshold:
            page.getUrls()
            self.relevantPagesCount += 1
            self.relevantPages.append(page)
            self.harvestRatioData.append((self.relevantPagesCount, self.pagesCount))
            print "%s, %s, %s" % (-1 * work_url[0], page_score, work_url[1])
            for link in page.outgoingUrls:
                url = link.address
                if url != None and url != '':
                    # normalize: drop query strings and fragments
                    if url.find('?') != -1:
                        url = url.split('?')[0]
                    if url.find('#') != -1:
                        url = url.split('#')[0]
                    # if url.startswith('http') == False:
                    #     parts = page.pageUrl[1].split("://")
                    #     baseUrl = parts[1].split("/")[0]
                    #     baseUrl = parts[0] + "://" + baseUrl
                    #     url = baseUrl + url
                    #if not self.existsInVisited(url, self.visited):
                    if url not in self.visited:
                        #if url.startswith('http:') and url.find('#') == -1 and not self.exists(url, self.priorityQueue.queue):
                        if url.startswith('http') and not self.exists(url, self.priorityQueue.queue):
                            url_score = self.scorer.calculate_score(link.getAllText())
                            self.totalPagesCount += 1
                            #tot_score = (page_score + url_score) / 2.0
                            #tot_score = page_score + url_score
                            tot_score = url_score
                            if tot_score > self.urlScoreThreshold:
                                #self.priorityQueue.push(((-1 * url_score), url))
                                self.priorityQueue.push(((-1 * tot_score), url, page.pageId))
def crawl(self):
    # start crawling
    #myopener = MyOpener()
    self.harvestRatioData = []
    self.relevantPages = []
    while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
        work_url = self.priorityQueue.pop()
        self.visited.append(work_url)
        page = Webpage(work_url, self.pagesCount)
        if len(page.text) > 0:
            page_score = self.scorer.calculate_score(page.text)
        else:
            page_score = 0
        self.pagesCount += 1
        if page_score > self.pageScoreThreshold:
            page.getUrls()
            self.relevantPagesCount += 1
            self.relevantPages.append(page)
            self.harvestRatioData.append((self.relevantPagesCount, self.pagesCount))
            print "%s, %s" % (-1 * work_url[0], work_url[1])
            for link in page.outgoingUrls:
                url = link.address
                if url != None and url != '':
                    #if url.find('?') != -1:
                    #    url = url.split('?')[0]
                    # resolve root-relative links against the page's host
                    if url.startswith("/"):
                        base = page.pageUrl[1][7:].split("/")[0]
                        url = "http://" + base + url
                    if not self.exists(url, self.visited):
                        if url.startswith('http') and url.find('#') == -1 and not self.priorityQueue.exists(url):  #self.exists(url, self.priorityQueue.queue):
                            url_score = self.scorer.calculate_score(link.getAllText())
                            self.totalPagesCount += 1
                            #tot_score = (page_score + url_score) / 2.0
                            #tot_score = page_score + url_score
                            tot_score = url_score
                            if tot_score > self.urlScoreThreshold:
                                #self.priorityQueue.push(((-1 * url_score), url))
                                self.priorityQueue.push(((-1 * tot_score), url, page.pageId))
                            #self.relevantPagesCount += 1
        self.priorityQueue.next()
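
# The crawl() variants above assume a priority queue ordered by the negated
# relevance score pushed with each URL, so the smallest key pops first, i.e.
# the highest-scoring URL.  A minimal heapq-based sketch is given below for
# illustration only; the method names (push, pop, isempty, exists) mirror the
# calls made above, but the repository's actual PriorityQueue (including
# helpers such as next()) may be implemented differently.
import heapq

class PriorityQueue(object):
    def __init__(self):
        self.queue = []      # heap of (negated_score, url, ...) tuples
        self._urls = set()   # membership index used by exists()

    def push(self, item):
        # item[0] is the negated score, item[1] the URL
        heapq.heappush(self.queue, item)
        self._urls.add(item[1])

    def pop(self):
        item = heapq.heappop(self.queue)
        self._urls.discard(item[1])
        return item

    def isempty(self):
        return len(self.queue) == 0

    def exists(self, url):
        return url in self._urls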
def enhanced_crawl(self):
    # start crawling
    # assumes module-level imports: import urllib2; from bs4 import BeautifulSoup
    #myopener = MyOpener()
    self.harvestRatioData = []
    self.relevantPages = []
    while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
        work_url = self.priorityQueue.pop()
        self.visited.append(work_url[1])
        #page = urllib2.urlopen(work_url)
        '''page = myopener.open(work_url)
        self.pagesCount += 1
        soup = BeautifulSoup(page)
        links = soup.find_all('a')'''
        #print work_url[1]
        try:
            req = urllib2.Request(work_url[1])  # create a request object
            handle = urllib2.urlopen(req)       # and open it to return a handle on the url
        except urllib2.URLError, e:
            # ignore error, URL timed out
            pass
        else:
            html = handle.read()
            soup = BeautifulSoup(html)
            paras = soup.findAll('p')
            #print paras
            # concatenate the text of all paragraph tags
            text = ""
            for para in paras:
                text = text + " " + para.text
            page = Webpage(work_url, self.pagesCount)
            if len(page.text) > 0:
                page_score = self.scorer.calculate_smart_score(text, work_url[1])
            else:
                page_score = 0
            self.pagesCount += 1
            if page_score > self.pageScoreThreshold:
                page.getUrls()
                self.relevantPagesCount += 1
                self.relevantPages.append(page)
                self.harvestRatioData.append((self.relevantPagesCount, self.pagesCount))
                print "%s|%s|%s" % (-1.0 * work_url[0], page_score, work_url[1])
                for link in page.outgoingUrls:
                    url = link.address
                    if url != None and url != '':
                        # normalize: drop query strings and fragments
                        if url.find('?') != -1:
                            url = url.split('?')[0]
                        if url.find('#') != -1:
                            url = url.split('#')[0]
                        # if url.startswith('http') == False:
                        #     parts = page.pageUrl[1].split("://")
                        #     baseUrl = parts[1].split("/")[0]
                        #     baseUrl = parts[0] + "://" + baseUrl
                        #     url = baseUrl + url
                        #if not self.existsInVisited(url, self.visited):
                        if url not in self.visited:
                            #if url.startswith('http:') and url.find('#') == -1 and not self.exists(url, self.priorityQueue.queue):
                            if url.startswith('http') and not self.exists(url, self.priorityQueue.queue):
                                url_score = self.url_scorer.calculate_score(link.getAllText())
                                self.totalPagesCount += 1
                                #tot_score = (page_score + url_score) / 2.0
                                #tot_score = page_score + url_score
                                tot_score = url_score
                                if tot_score > self.urlScoreThreshold:
                                    #self.priorityQueue.push(((-1 * url_score), url))
                                    self.priorityQueue.push(((-1 * tot_score), url, page.pageId))
def crawl(self):
    self.harvestRatioData = []
    self.relevantPages = []
    webpages = []
    count = 0
    ftext = open(self.pagesDir + "webpagesTxt.txt", "w")
    webpageLabel = 0  # 0 for Non-relevant and 1 for Relevant
    while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
        work_url = self.priorityQueue.pop()
        url = work_url[1]
        #if self.exists(url, 1):
        #    continue
        if url in self.visited:
            continue
        #self.visited.append(url)  #work_url[1])
        self.visited[url] = 1
        page = Webpage(work_url, self.pagesCount)
        if page.text == '':
            continue
        page.estimatedScore = 0
        page_score = 0
        if self.combineScore:
            if len(page.text) > 0:
                #page_score = self.scorer.calculate_score(page.text, 'W')[1]
                page_score = self.scorer.calculate_score(page, 'W')[1]
                if page_score == -1:
                    continue
            else:
                print 'page text is empty'
                continue
            page.estimatedScore = page_score
        if self.restricted:
            if page_score < self.pageScoreThreshold:
                #self.pagesCount += 1
                continue
        # update the source-importance counts for the page's domain
        pageDom = getDomain(url)
        if page_score >= self.pageScoreThreshold:
            self.sourcesImp[pageDom][0] += 1
            webpageLabel = 1
        else:
            self.sourcesImp[pageDom][1] += 1
            #self.sourcesImp[pageDom][0] = self.sourcesImp[pageDom][1]
            webpageLabel = 0
        if self.combineScore:
            print page.pageId, ": ", str(page_score), ",", -1 * work_url[0], ",", work_url[1]  #, ",", work_url[3]
        else:
            print -1 * work_url[0], ",", work_url[1]  #, ",", work_url[3]
        self.pagesCount += 1
        #self.relevantPages.append((page.pageId, page.pageUrl, page.estimatedScore))
        self.relevantPages.append((page.pageId, (page.pageUrl[1], page.pageUrl[2]), page.estimatedScore))
        wbsStr = page.text.replace('\n', '. ').replace('\t', ' ')
        webpages.append(wbsStr)
        count += 1
        # save webpage's text to disk instead of adding to list
        # this will lead to change in evaluation
        if count % self.bufferLen == 0:
            strToWrite = '\n'.join(webpages).encode("utf-8")
            ftext.write(strToWrite)
            webpages = []
        #ftext = open(self.pagesDir + str(page.pageId) + ".txt", "w")
        #ftext.write(page.text.encode("utf-8"))
        #ftext.close()
        #-------
        if page_score < 0.1:
            continue
        page.getUrls()
        for link in page.outgoingUrls:
            url = link.address
            #if url != None and url != '':
            if url:
                url = url.strip()
                # skip known non-article links
                if url.find('report-a-typo') != -1:
                    continue
                if url.find('m.tempo.co/') != -1:
                    continue
                if url.find('?') != -1:
                    # keep the query string only when it carries an id=/v=/tid= parameter
                    furl = url.split('?')[1]
                    if not (furl.startswith('id=') or furl.startswith('v=') or furl.startswith('tid=')):
                        url = url.split('?')[0]
                if url.find('#') != -1:
                    url = url.split('#')[0]
                if url.endswith('/'):
                    url = url[:-1]
                #if url.endswith(("comment", "comment/", "feed", "comments", "feed/", "comments/", ".rss", "video", "video/", "link", "gif", "jpeg", "mp4", "wav", "jpg", "mp3", "png", "share.php", "sharer.php", "login.php", "print", "print/", "button/", "share", "email", "submit", "post", ".pdf")):
                if url.endswith(("comment", "feed", "comments", ".rss", "video", "link",
                                 "gif", "jpeg", "mp4", "wav", "jpg", "mp3", "png",
                                 "share.php", "sharer.php", "login.php", "print", "button",
                                 "share", "email", "submit", "post", ".pdf")):
                    continue
                #if not self.exists(url, 1):
                if url in self.visited:
                    continue
                #tot_score = 0.0
                if url.startswith('http'):  #and not self.exists(url, 2):
                    linkText = link.getAllText()
                    #if self.mode == 1:
                    #url_score = self.scorer.calculate_score(linkText, 'U')
                    url_score = self.scorer.calculate_score(link, 'U')
                    tot_score = url_score
                    if self.combineScore:
                        #tot_score = 0.4 * page_score + 0.6 * url_score
                        tot_score = page_score * url_score
                    if tot_score < self.urlScoreThreshold:
                        continue
                    # blend in the source-importance score for the link's domain
                    urlDom = getDomain(url)
                    si_score = float(self.sourcesImp[urlDom][0]) / self.sourcesImp[urlDom][1]
                    if self.siScoreCombineMethod == 1:
                        if webpageLabel:
                            tot_score = tot_score * si_score
                    elif self.siScoreCombineMethod == 2:
                        tot_score = self.topicWeight * tot_score + self.siWeight * si_score
                        #tot_score = tot_score * si_score
                    #else:
                    #    tot_score = url_score
                    #if tot_score >= self.urlScoreThreshold:
                    #print tot_score, '-', url, linkText
                    if self.restricted:
                        if tot_score < self.urlScoreThreshold:
                            continue
                    if tot_score >= self.urlScoreThreshold:
                        self.priorityQueue.push(((-1 * tot_score), url, page.pageId))  #, linkText))
                    #else:
                    #    self.priorityQueue.push(((-1 * page_score), url, page.pageId, link.getAllText()))
        #else:
        #    self.pages.append((page, 0))
    print self.priorityQueue.isempty()
    # flush any buffered page text before closing
    if webpages:
        strToWrite = '\n'.join(webpages).encode("utf-8")
        ftext.write(strToWrite)
    ftext.close()
    return self.priorityQueue.queue
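
# getDomain(), used by the source-importance bookkeeping above, is assumed to
# map a URL to its host name.  A minimal Python 2 sketch (the repository's own
# helper may normalize domains differently):
from urlparse import urlparse

def getDomain(url):
    return urlparse(url).netloc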