Example #1
    def crawl(self):
        #start crawling
        #myopener = MyOpener()
        self.harvestRatioData = []
        self.relevantPages = []
        while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
            work_url = self.priorityQueue.pop()
            self.visited.append(work_url[1])
            #print ("%s, %s") % (-1 * work_url[0], work_url[1])
            #page = urllib2.urlopen(work_url)
            '''page = myopener.open(work_url)
            self.pagesCount += 1
            soup = BeautifulSoup(page)
            links = soup.find_all('a')'''
            page = Webpage(work_url, self.pagesCount)
            if len(page.text) > 0:
                page_score = self.scorer.calculate_score(page.text)
            else:
                page_score = 0

            self.pagesCount += 1
            if (page_score > self.pageScoreThreshold):
                page.getUrls()
                self.relevantPagesCount += 1
                self.relevantPages.append(page)
                self.harvestRatioData.append(
                    (self.relevantPagesCount, self.pagesCount))
                print("%s," + str(page_score) + ", %s") % (-1 * work_url[0],
                                                           work_url[1])
                for link in page.outgoingUrls:
                    url = link.address
                    if url != None and url != '':
                        if url.find('?') != -1:
                            url = url.split('?')[0]
                        if url.find('#') != -1:
                            url = url.split('#')[0]

#                         if url.startswith('http') == False:
#                             parts = page.pageUrl[1].split("://")
#                             baseUrl = parts[1].split("/")[0]
#                             baseUrl = parts[0] +"://" + baseUrl
#                             url = baseUrl + url

#if not self.existsInVisited(url,self.visited):
                        if url not in self.visited:
                            #if url.startswith('http:') and url.find('#') == -1 and not self.exists(url,self.priorityQueue.queue):
                            if url.startswith('http') and not self.exists(url, self.priorityQueue.queue):
                                url_score = self.scorer.calculate_score(link.getAllText())
                                self.totalPagesCount += 1
                                #tot_score = (page_score + url_score)/2.0
                                #tot_score = page_score + url_score
                                tot_score = url_score
                                if tot_score > self.urlScoreThreshold:
                                    #self.priorityQueue.push(((-1 * url_score),url))
                                    self.priorityQueue.push(((-1 * tot_score), url, page.pageId))
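The snippets on this page rely on a PriorityQueue with push, pop, isempty and a queue attribute that is not shown here. Below is a minimal sketch of that interface, assuming a heapq-backed implementation (an assumption, not the project's actual class): pushing (-1 * score, url, page_id) turns Python's min-heap into a max-priority frontier, so the highest-scoring URL is popped first.

import heapq

class PriorityQueue(object):
    """Minimal frontier matching the push/pop/isempty/queue usage in
    crawl(); interface inferred from these examples, not the original."""

    def __init__(self):
        self.queue = []                        # heap of (-score, url, page_id) tuples

    def push(self, item):
        heapq.heappush(self.queue, item)       # smallest tuple = highest score

    def pop(self):
        return heapq.heappop(self.queue)

    def isempty(self):
        return len(self.queue) == 0

pq = PriorityQueue()
pq.push((-0.9, 'http://example.org/a', 0))
pq.push((-0.2, 'http://example.org/b', 0))
print(pq.pop())   # (-0.9, 'http://example.org/a', 0) -- best-scored URL first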
Example #2
 def crawl(self):
     self.harvestRatioData = []
     self.relevantPages = []
     
     while self.pagesCount <  self.pagesLimit and not self.priorityQueue.isempty():
     
         work_url = self.priorityQueue.pop()
         self.visited.append(work_url)
         page = Webpage(work_url,self.pagesCount)
         if page.text == '':
             continue
         page_score = 0.0
         if self.combineScore:
             if len(page.text) > 0:
                 page_score = self.scorer.calculate_score(page.text,'W')
             else:
                 continue
             page.estimatedScore = page_score
             if self.restricted:
                 if page_score < self.pageScoreThreshold:
                     continue
             
         #print -1 * work_url[0],",", str(page_score),",",work_url[1],",", work_url[3]
         print -1 * work_url[0],",",work_url[1],",", work_url[3]
         self.pagesCount += 1
         
         page.getUrls()
         self.relevantPages.append(page)
         for link in page.outgoingUrls:
             url = link.address
             if url != None and url != '':
                 url = url.strip()
                 if url.find('?')!= -1:                            
                     url = url.split('?')[0]
                 if url.find('#') != -1:
                     url = url.split('#')[0]
                 
                 if url.endswith(("comment","comment/","feed","comments","feed/","comments/",".rss","video","video/","link","gif","jpeg","mp4","wav","jpg","mp3","png","share.php","sharer.php","login.php","print","print/","button/","share","email","submit","post",".pdf") ):    
                     continue
                 if not self.exists(url,1):
                     #tot_score = 0.0
                     if url.startswith('http') and not self.exists(url,2):                            
                         if self.mode == 1:
                             url_score = self.scorer.calculate_score(link.getAllText(),'U')
                             if self.combineScore:
                                 tot_score= 0.5 *page_score + 0.5 *url_score
                             else:
                                 tot_score = url_score
                             #if tot_score >= self.urlScoreThreshold:
                             self.priorityQueue.push(((-1 * tot_score),url,page.pageId,link.getAllText()))
                         #else:
                         #    self.priorityQueue.push(((-1 * page_score),url,page.pageId,link.getAllText()))
         #else:
         #    self.pages.append((page,0))
                                 
     print self.priorityQueue.isempty()
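Example #2 normalizes each outgoing link inline: strip whitespace, drop the query string and fragment, then skip media, share and comment URLs by suffix. The same checks expressed as a standalone helper; clean_url is a hypothetical name, and the suffix list is a shortened version of the one used above.

SKIP_SUFFIXES = ("comment", "comments", "feed", ".rss", ".gif", ".jpeg",
                 ".jpg", ".png", ".mp3", ".mp4", ".wav", ".pdf",
                 "share.php", "sharer.php", "login.php", "print", "share",
                 "email", "submit", "post")

def clean_url(url):
    """Return a normalized URL, or None if the link should be skipped."""
    if not url:
        return None
    url = url.strip()
    url = url.split('?')[0].split('#')[0]       # drop query string and fragment
    if url.endswith(SKIP_SUFFIXES) or url.rstrip('/').endswith(SKIP_SUFFIXES):
        return None
    return url

print(clean_url(' http://example.org/article/42?utm=x#top '))  # http://example.org/article/42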
Example #3
    def crawl(self):
        #start crawling
        #myopener = MyOpener()
        self.harvestRatioData = []
        self.relevantPages = []
        while self.pagesCount <  self.pagesLimit and not self.priorityQueue.isempty():
            work_url = self.priorityQueue.pop()
            self.visited.append(work_url[1])
            #print ("%s, %s") % (-1 * work_url[0], work_url[1])
            #page = urllib2.urlopen(work_url)
            '''page = myopener.open(work_url)
            self.pagesCount += 1
            soup = BeautifulSoup(page)
            links = soup.find_all('a')'''
            page = Webpage(work_url,self.pagesCount)
            if len(page.text) > 0:
                page_score = self.scorer.calculate_score(page.text)
            else:
                page_score = 0
            
            self.pagesCount += 1
            if (page_score > self.pageScoreThreshold):
                page.getUrls()
                self.relevantPagesCount += 1
                self.relevantPages.append(page)
                self.harvestRatioData.append((self.relevantPagesCount,self.pagesCount))
                print ("%s,"+ str(page_score)+", %s") % (-1 * work_url[0], work_url[1])
                for link in page.outgoingUrls:
                    url = link.address
                    if url != None and url != '':
                        if url.find('?')!= -1:
                            url = url.split('?')[0]
                        if url.find('#') != -1:
                            url = url.split('#')[0]
                        
#                         if url.startswith('http') == False:
#                             parts = page.pageUrl[1].split("://")
#                             baseUrl = parts[1].split("/")[0]
#                             baseUrl = parts[0] +"://" + baseUrl
#                             url = baseUrl + url
                        
                        #if not self.existsInVisited(url,self.visited): 
                        if url not in self.visited:
                            #if url.startswith('http:') and url.find('#') == -1 and not self.exists(url,self.priorityQueue.queue):                            
                            if url.startswith('http') and not self.exists(url,self.priorityQueue.queue):
                                url_score = self.scorer.calculate_score(link.getAllText())
                                self.totalPagesCount +=1
                                #tot_score = (page_score + url_score)/2.0
                                #tot_score = page_score + url_score
                                tot_score = url_score
                                if tot_score > self.urlScoreThreshold:
                                    #self.priorityQueue.push(((-1 * url_score),url))
                                    self.priorityQueue.push(((-1 * tot_score),url,page.pageId))
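harvestRatioData collects (relevantPagesCount, pagesCount) pairs as the crawl progresses; the harvest ratio of a focused crawler is the fraction of fetched pages judged relevant. How the pairs are consumed is not shown on this page, so the conversion below is only an illustrative sketch.

def harvest_ratios(harvest_ratio_data):
    """Turn the (relevant, total) pairs from crawl() into harvest-ratio values."""
    return [float(relevant) / total for relevant, total in harvest_ratio_data]

# e.g. 3 relevant pages among the first 10 fetched -> harvest ratio 0.3
print(harvest_ratios([(1, 2), (2, 5), (3, 10)]))   # [0.5, 0.4, 0.3]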
Example #4
    def crawl(self):
        #start crawling
        #myopener = MyOpener()
        self.harvestRatioData = []
        self.relevantPages = []
        while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
            work_url = self.priorityQueue.pop()
            self.visited.append(work_url)

            page = Webpage(work_url, self.pagesCount)
            if len(page.text) > 0:
                page_score = self.scorer.calculate_score(page.text)
            else:
                page_score = 0

            self.pagesCount += 1
            if (page_score > self.pageScoreThreshold):
                page.getUrls()
                self.relevantPagesCount += 1
                self.relevantPages.append(page)
                self.harvestRatioData.append(
                    (self.relevantPagesCount, self.pagesCount))
                print("%s, %s") % (-1 * work_url[0], work_url[1])
                for link in page.outgoingUrls:
                    url = link.address
                    if url != None and url != '':
                        #if url.find('?')!= -1:
                        #    url = url.split('?')[0]
                        if url.startswith("/"):
                            base = page.pageUrl[1][7:].split("/")[0]
                            url = "http://" + base + url
                        if not self.exists(url, self.visited):
                            if url.startswith('http') and url.find('#') == -1 and not self.priorityQueue.exists(url):  #self.exists(url,self.priorityQueue.queue):
                                url_score = self.scorer.calculate_score(
                                    link.getAllText())
                                self.totalPagesCount += 1
                                #tot_score = (page_score + url_score)/2.0
                                #tot_score = page_score + url_score
                                tot_score = url_score
                                if tot_score > self.urlScoreThreshold:
                                    #self.priorityQueue.push(((-1 * url_score),url))
                                    self.priorityQueue.push(
                                        ((-1 * tot_score), url, page.pageId))
                                    #self.relevantPagesCount += 1

                self.priorityQueue.next()
Example #5
 def crawl(self):
     #start crawling
     #myopener = MyOpener()
     self.harvestRatioData = []
     self.relevantPages = []
     while self.pagesCount <  self.pagesLimit and not self.priorityQueue.isempty():            
         work_url = self.priorityQueue.pop()
         self.visited.append(work_url)
         
         page = Webpage(work_url,self.pagesCount)
         if len(page.text) > 0:
             page_score = self.scorer.calculate_score(page.text)
         else:
             page_score = 0
             
         self.pagesCount += 1
         if (page_score > self.pageScoreThreshold):
             page.getUrls()
             self.relevantPagesCount += 1
             self.relevantPages.append(page)
             self.harvestRatioData.append((self.relevantPagesCount,self.pagesCount))
             print ("%s, %s") % (-1 * work_url[0], work_url[1])
             for link in page.outgoingUrls:
                 url = link.address                    
                 if url != None and url != '':
                     #if url.find('?')!= -1:
                     #    url = url.split('?')[0]
                     if url.startswith("/"):                            
                         base = page.pageUrl[1][7:].split("/")[0]
                         url = "http://" + base + url
                     if not self.exists(url,self.visited):
                         if url.startswith('http') and url.find('#') == -1 and not self.priorityQueue.exists(url):#self.exists(url,self.priorityQueue.queue):                            
                             url_score = self.scorer.calculate_score(link.getAllText())
                             self.totalPagesCount +=1
                             #tot_score = (page_score + url_score)/2.0
                             #tot_score = page_score + url_score
                             tot_score = url_score
                             if tot_score > self.urlScoreThreshold:
                                 #self.priorityQueue.push(((-1 * url_score),url))
                                 self.priorityQueue.push(((-1 * tot_score),url,page.pageId))
                                 #self.relevantPagesCount += 1                            
                             
             self.priorityQueue.next()
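Examples #4 and #5 rebuild absolute URLs by hand: page.pageUrl[1][7:] slices off a literal "http://" prefix and prepends the bare host, which breaks for https pages and for relative (non-root) paths. A hedged alternative using the standard library's urljoin; the import path differs between Python 2, which these snippets target, and Python 3.

try:
    from urllib.parse import urljoin      # Python 3
except ImportError:
    from urlparse import urljoin          # Python 2, as used by these snippets

def absolutize(base_url, link):
    """Resolve a possibly relative link against the page URL."""
    return urljoin(base_url, link)

print(absolutize('https://example.org/news/story.html', '/about'))      # https://example.org/about
print(absolutize('https://example.org/news/story.html', 'photo.html'))  # https://example.org/news/photo.html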
Example #6
    def enhanced_crawl(self):
            #start crawling
            #myopener = MyOpener()
            self.harvestRatioData = []
            self.relevantPages = []
            while self.pagesCount <  self.pagesLimit and not self.priorityQueue.isempty():
                work_url = self.priorityQueue.pop()
                self.visited.append(work_url[1])
                #print ("%s, %s") % (-1 * work_url[0], work_url[1])
                #page = urllib2.urlopen(work_url)
                '''page = myopener.open(work_url)
                self.pagesCount += 1
                soup = BeautifulSoup(page)
                links = soup.find_all('a')'''
                #print work_url[1]
                try:
                    req = urllib2.Request(work_url[1])
                    # create a request object

                    handle = urllib2.urlopen(req)
                    # and open it to return a handle on the url
                except urllib2.URLError:
                    # ignore error, URL timed out
                    pass

                else:
                    html = handle.read()
                    soup = BeautifulSoup(html)
                    paras = soup.findAll('p')
                    #print paras
                    text = ""
                    for para in paras:
                            text = text + " " + para.text
                                    
                    page = Webpage(work_url,self.pagesCount)
                    if len(page.text) > 0:
                        page_score = self.scorer.calculate_smart_score(text, work_url[1])
                    else:
                        page_score = 0
                        
                    self.pagesCount += 1
                    if (page_score > self.pageScoreThreshold):
                        page.getUrls()
                        self.relevantPagesCount += 1
                        self.relevantPages.append(page)
                        self.harvestRatioData.append((self.relevantPagesCount,self.pagesCount))
                        print ("%s|"+ str(page_score)+"|%s") % (-1.0 * work_url[0], work_url[1])
                        for link in page.outgoingUrls:
                            url = link.address
                            if url != None and url != '':
                                if url.find('?')!= -1:
                                    url = url.split('?')[0]
                                if url.find('#') != -1:
                                    url = url.split('#')[0]
                                    
            #                         if url.startswith('http') == False:
            #                             parts = page.pageUrl[1].split("://")
            #                             baseUrl = parts[1].split("/")[0]
            #                             baseUrl = parts[0] +"://" + baseUrl
            #                             url = baseUrl + url
                                    
                                    #if not self.existsInVisited(url,self.visited): 
                                if url not in self.visited:
                                        #if url.startswith('http:') and url.find('#') == -1 and not self.exists(url,self.priorityQueue.queue):                            
                                    if url.startswith('http') and not self.exists(url,self.priorityQueue.queue):
                                        url_score = self.url_scorer.calculate_score(link.getAllText())
                                        self.totalPagesCount +=1
                                        #tot_score = (page_score + url_score)/2.0
                                        #tot_score = page_score + url_score
                                        tot_score = url_score
                                        if tot_score > self.urlScoreThreshold:
                                            #self.priorityQueue.push(((-1 * url_score),url))
                                            self.priorityQueue.push(((-1 * tot_score),url,page.pageId))
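enhanced_crawl() fetches the page itself with urllib2, ignores URLs that fail to open, and concatenates the text of all <p> elements before scoring. The same fetch-and-extract step as a standalone function; this sketch is written to import on Python 2 or 3 and passes an explicit 'html.parser', whereas the original relies on BeautifulSoup's default parser.

from bs4 import BeautifulSoup

try:
    from urllib.request import urlopen            # Python 3
    from urllib.error import URLError
except ImportError:
    from urllib2 import urlopen, URLError         # Python 2, as in the original

def fetch_paragraph_text(url, timeout=10):
    """Download a page and return the concatenated <p> text, or '' on error."""
    try:
        html = urlopen(url, timeout=timeout).read()
    except URLError:
        return ''                                  # skip unreachable URLs
    soup = BeautifulSoup(html, 'html.parser')
    return ' '.join(p.get_text() for p in soup.find_all('p'))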
Example #7
    def crawl(self):
        self.harvestRatioData = []
        self.relevantPages = []
        webpages = []
        count = 0
        ftext = open(self.pagesDir + "webpagesTxt.txt", "w")
        webpageLabel = 0  # 0 for Non-relevant and 1 for Relevant
        while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():

            work_url = self.priorityQueue.pop()
            url = work_url[1]
            #if self.exists(url,1):
            #    continue
            if url in self.visited:
                continue
            #self.visited.append(url)#work_url[1])
            self.visited[url] = 1
            page = Webpage(work_url, self.pagesCount)
            if page.text == '':
                continue

            page_score = 0  # default, since it is also read below when combineScore is off
            page.estimatedScore = 0
            if self.combineScore:
                if len(page.text) > 0:
                    #page_score = self.scorer.calculate_score(page.text,'W')[1]
                    page_score = self.scorer.calculate_score(page, 'W')[1]
                    if page_score == -1:
                        continue
                else:
                    print 'page text is empty'
                    continue

                page.estimatedScore = page_score

                if self.restricted:
                    if page_score < self.pageScoreThreshold:
                        #self.pagesCount += 1
                        continue

                pageDom = getDomain(url)
                if page_score >= self.pageScoreThreshold:
                    self.sourcesImp[pageDom][0] += 1
                    webpageLabel = 1
                else:
                    self.sourcesImp[pageDom][1] += 1
                    #self.sourcesImp[pageDom][0] = self.sourcesImp[pageDom][1]
                    webpageLabel = 0
            if self.combineScore:
                print page.pageId, ": ", str(page_score), ",", -1 * work_url[0], ",", work_url[1]  #,",", work_url[3]
            else:
                print -1 * work_url[0], ",", work_url[1]  #,",", work_url[3]
            self.pagesCount += 1
            #self.relevantPages.append((page.pageId,page.pageUrl,page.estimatedScore))
            self.relevantPages.append((page.pageId, (page.pageUrl[1], page.pageUrl[2]), page.estimatedScore))

            wbsStr = page.text.replace('\n', '. ').replace('\t', ' ')

            webpages.append(wbsStr)
            count += 1
            #save webpage's text to disk instead of adding to list
            # this will lead to change in evaluation
            if count % self.bufferLen == 0:
                strToWrite = '\n'.join(webpages).encode("utf-8")
                ftext.write(strToWrite)
                webpages = []
            #ftext = open(self.pagesDir+str(page.pageId) + ".txt", "w")
            #ftext.write(page.text.encode("utf-8"))
            #ftext.close()
            #-------

            if page_score < 0.1:
                continue
            page.getUrls()

            for link in page.outgoingUrls:
                url = link.address

                #if url != None and url != '':
                if url:
                    url = url.strip()
                    if url.find('report-a-typo') != -1:
                        continue
                    if url.find('m.tempo.co/') != -1:
                        continue
                    if url.find('?') != -1:
                        furl = url.split('?')[1]
                        # keep the query string only for id=, v= and tid= parameters
                        if not furl.startswith(('id=', 'v=', 'tid=')):
                            url = url.split('?')[0]
                    if url.find('#') != -1:
                        url = url.split('#')[0]

                    if url.endswith('/'):
                        url = url[:-1]
                    #if url.endswith(("comment","comment/","feed","comments","feed/","comments/",".rss","video","video/","link","gif","jpeg","mp4","wav","jpg","mp3","png","share.php","sharer.php","login.php","print","print/","button/","share","email","submit","post",".pdf") ):
                    if url.endswith(
                        ("comment", "feed", "comments", ".rss", "video",
                         "link", "gif", "jpeg", "mp4", "wav", "jpg", "mp3",
                         "png", "share.php", "sharer.php", "login.php",
                         "print", "button", "share", "email", "submit", "post",
                         ".pdf")):
                        continue

                    #if not self.exists(url,1):
                    if url in self.visited:
                        continue
                    #tot_score = 0.0
                    if url.startswith('http'):  #and not self.exists(url,2):
                        linkText = link.getAllText()
                        #if self.mode == 1:
                        #url_score = self.scorer.calculate_score(linkText,'U')
                        url_score = self.scorer.calculate_score(link, 'U')
                        tot_score = url_score
                        if self.combineScore:
                            #tot_score= 0.4 *page_score + 0.6 *url_score

                            tot_score = page_score * url_score
                        if tot_score < self.urlScoreThreshold:
                            continue
                        urlDom = getDomain(url)

                        si_score = self.sourcesImp[urlDom][0] / self.sourcesImp[urlDom][1]
                        if self.siScoreCombineMethod == 1:
                            if webpageLabel:
                                tot_score = tot_score * si_score
                        elif self.siScoreCombineMethod == 2:
                            tot_score = self.topicWeight * tot_score + self.siWeight * si_score
                        #tot_score = tot_score * si_score
                        #else:
                        #    tot_score = url_score
                        #if tot_score >= self.urlScoreThreshold:
                        #print tot_score, '-', url, linkText
                        if self.restricted:
                            if tot_score < self.urlScoreThreshold:
                                continue
                        if tot_score >= self.urlScoreThreshold:
                            self.priorityQueue.push(((-1 * tot_score), url, page.pageId))  #,linkText))
                        #else:
                        #    self.priorityQueue.push(((-1 * page_score),url,page.pageId,link.getAllText()))
            #else:
            #    self.pages.append((page,0))

        print self.priorityQueue.isempty()

        if webpages:
            strToWrite = '\n'.join(webpages).encode("utf-8")
            ftext.write(strToWrite)
        ftext.close()

        return self.priorityQueue.queue
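Example #7 additionally keeps per-domain counts in sourcesImp (relevant hits at index 0, non-relevant at index 1) and folds that source-importance score into the link score either multiplicatively (siScoreCombineMethod == 1) or as a weighted sum (== 2). A condensed sketch of that combination; get_domain and the defaultdict initialization are assumptions (getDomain and the counter setup are not shown above), and the division is guarded, unlike the raw counts divided directly in the original.

from collections import defaultdict

try:
    from urllib.parse import urlparse     # Python 3
except ImportError:
    from urlparse import urlparse         # Python 2

sources_imp = defaultdict(lambda: [0, 0])     # domain -> [relevant, non-relevant]

def get_domain(url):
    """Assumed behaviour of getDomain(): the host part of the URL."""
    return urlparse(url).netloc

def combined_score(tot_score, url, method, topic_weight=0.5, si_weight=0.5):
    """Fold the source-importance score into the link score, as in Example #7."""
    relevant, non_relevant = sources_imp[get_domain(url)]
    si_score = float(relevant) / max(non_relevant, 1)     # guard the division
    if method == 1:
        return tot_score * si_score
    if method == 2:
        return topic_weight * tot_score + si_weight * si_score
    return tot_score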