Code Example #1
File: crawler.py Project: nyimbi/ProjFocusedCrawler
    def crawl(self):
        self.harvestRatioData = []
        self.relevantPages = []

        while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():

            # Scores are stored negated, so the min-heap pops the highest-scoring URL first.
            work_url = self.priorityQueue.pop()
            self.visited.append(work_url)
            page = Webpage(work_url, self.pagesCount)
            if page.text == '':
                continue
            page_score = 0.0
            if self.combineScore:
                if len(page.text) > 0:
                    page_score = self.scorer.calculate_score(page.text, 'W')
                else:
                    continue
                page.estimatedScore = page_score
                # In restricted mode, drop pages below the relevance threshold.
                if self.restricted and page_score < self.pageScoreThreshold:
                    continue

            print -1 * work_url[0], ",", work_url[1], ",", work_url[3]
            self.pagesCount += 1

            page.getUrls()
            self.relevantPages.append(page)
            for link in page.outgoingUrls:
                url = link.address
                if url:
                    url = url.strip()
                    # Canonicalize: drop the query string and the fragment.
                    if '?' in url:
                        url = url.split('?')[0]
                    if '#' in url:
                        url = url.split('#')[0]

                    # Skip comment/feed/media/share endpoints that carry no topical text.
                    if url.endswith(("comment", "comment/", "feed", "comments", "feed/",
                                     "comments/", ".rss", "video", "video/", "link", "gif",
                                     "jpeg", "mp4", "wav", "jpg", "mp3", "png", "share.php",
                                     "sharer.php", "login.php", "print", "print/", "button/",
                                     "share", "email", "submit", "post", ".pdf")):
                        continue
                    if not self.exists(url, 1):
                        if url.startswith('http') and not self.exists(url, 2):
                            if self.mode == 1:
                                url_score = self.scorer.calculate_score(link.getAllText(), 'U')
                                if self.combineScore:
                                    # Blend the parent page's score with the anchor-text score.
                                    tot_score = 0.5 * page_score + 0.5 * url_score
                                else:
                                    tot_score = url_score
                                self.priorityQueue.push(((-1 * tot_score), url, page.pageId, link.getAllText()))

        print self.priorityQueue.isempty()
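
Both examples assume a best-first frontier: push takes a tuple whose first element is the negated score, so a min-heap always yields the most promising URL next (which is also why the score is printed as -1 * work_url[0]). The project's own PriorityQueue class is not shown on this page; a minimal sketch of the contract the loop relies on, assuming it wraps the standard heapq module, might look like:

    import heapq

    class PriorityQueue(object):
        # Minimal sketch of the crawl frontier: a min-heap over tuples whose
        # first element is the negated relevance score, so pop() returns the
        # highest-scoring URL first.
        def __init__(self):
            self.queue = []

        def push(self, item):
            # item = (-score, url, parent_page_id, ...) as pushed by crawl()
            heapq.heappush(self.queue, item)

        def pop(self):
            return heapq.heappop(self.queue)

        def isempty(self):
            return len(self.queue) == 0

On equal scores, tuple comparison falls back to the URL string, which is harmless for ordering purposes. The .queue attribute matches the return self.priorityQueue.queue at the end of example #2.
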
Code Example #2
    def crawl(self):
        self.harvestRatioData = []
        self.relevantPages = []
        webpages = []
        count = 0
        ftext = open(self.pagesDir + "webpagesTxt.txt", "w")
        webpageLabel = 0  # 0 for Non-relevant and 1 for Relevant
        while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():

            work_url = self.priorityQueue.pop()
            url = work_url[1]
            if url in self.visited:
                continue
            self.visited[url] = 1  # visited is a dict here, unlike the list in example #1
            page = Webpage(work_url, self.pagesCount)
            if page.text == '':
                continue

            page.estimatedScore = 0
            page_score = 0  # defined unconditionally so the score cutoff below never hits an unbound name
            if self.combineScore:
                if len(page.text) > 0:
                    page_score = self.scorer.calculate_score(page, 'W')[1]
                    if page_score == -1:
                        continue
                else:
                    print 'page text is empty'
                    continue

                page.estimatedScore = page_score

                if self.restricted and page_score < self.pageScoreThreshold:
                    continue

                # Track per-domain counts of relevant vs. non-relevant pages for
                # the source-importance (si) score used when ranking links below.
                pageDom = getDomain(url)
                if page_score >= self.pageScoreThreshold:
                    self.sourcesImp[pageDom][0] += 1
                    webpageLabel = 1
                else:
                    self.sourcesImp[pageDom][1] += 1
                    webpageLabel = 0
            if self.combineScore:
                print page.pageId, ": ", str(page_score), ",", -1 * work_url[0], ",", work_url[1]
            else:
                print -1 * work_url[0], ",", work_url[1]
            self.pagesCount += 1
            self.relevantPages.append(
                (page.pageId, (page.pageUrl[1], page.pageUrl[2]), page.estimatedScore))

            # Buffer page texts (newlines flattened) and flush them to disk in
            # batches of bufferLen instead of keeping every page in memory.
            wbsStr = page.text.replace('\n', '. ').replace('\t', ' ')
            webpages.append(wbsStr)
            count += 1
            if count % self.bufferLen == 0:
                strToWrite = '\n'.join(webpages).encode("utf-8")
                ftext.write(strToWrite + '\n')  # trailing newline keeps batches one page per line
                webpages = []

            # Only expand links from pages with a minimally relevant score.
            if self.combineScore and page_score < 0.1:
                continue
            page.getUrls()

            for link in page.outgoingUrls:
                url = link.address

                if url:
                    url = url.strip()
                    if 'report-a-typo' in url or 'm.tempo.co/' in url:
                        continue
                    # Drop the query string unless it carries a content id (id=, v= or tid=).
                    if '?' in url:
                        furl = url.split('?')[1]
                        if not furl.startswith(('id=', 'v=', 'tid=')):
                            url = url.split('?')[0]
                    if '#' in url:
                        url = url.split('#')[0]

                    if url.endswith('/'):
                        url = url[:-1]
                    # Skip comment/feed/media/share endpoints that carry no topical text.
                    if url.endswith(("comment", "feed", "comments", ".rss", "video", "link",
                                     "gif", "jpeg", "mp4", "wav", "jpg", "mp3", "png",
                                     "share.php", "sharer.php", "login.php", "print", "button",
                                     "share", "email", "submit", "post", ".pdf")):
                        continue

                    if url in self.visited:
                        continue
                    if url.startswith('http'):
                        url_score = self.scorer.calculate_score(link, 'U')
                        tot_score = url_score
                        if self.combineScore:
                            tot_score = page_score * url_score
                        if tot_score < self.urlScoreThreshold:
                            continue
                        # Source importance: relevant-to-non-relevant ratio for the link's
                        # domain (float() avoids Python 2 integer division truncating to 0).
                        urlDom = getDomain(url)
                        si_score = float(self.sourcesImp[urlDom][0]) / self.sourcesImp[urlDom][1]
                        if self.siScoreCombineMethod == 1:
                            # Method 1: multiply in source importance, but only for
                            # links found on pages labeled relevant.
                            if webpageLabel:
                                tot_score = tot_score * si_score
                        elif self.siScoreCombineMethod == 2:
                            # Method 2: weighted sum of topical score and source importance.
                            tot_score = self.topicWeight * tot_score + self.siWeight * si_score
                        if self.restricted and tot_score < self.urlScoreThreshold:
                            continue
                        if tot_score >= self.urlScoreThreshold:
                            self.priorityQueue.push(((-1 * tot_score), url, page.pageId))

        print self.priorityQueue.isempty()

        # Flush whatever page texts are still buffered.
        if webpages:
            strToWrite = '\n'.join(webpages).encode("utf-8")
            ftext.write(strToWrite + '\n')
        ftext.close()

        return self.priorityQueue.queue
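
Example #2 ranks a link by three signals: the parent page's topical score, the anchor/URL score, and the per-domain source-importance ratio. A standalone rendering of that combination is sketched below; the weight defaults and the zero-denominator guard are purely illustrative (the method's own topicWeight and siWeight come from the crawler's configuration, which is not shown here):

    def combine_scores(page_score, url_score, sources_imp, url_dom,
                       webpage_label, si_method, topic_weight=0.7, si_weight=0.3):
        # combineScore path: product of parent-page and anchor scores.
        tot_score = page_score * url_score
        relevant, non_relevant = sources_imp[url_dom]
        # Illustrative guard; the crawl method above assumes sourcesImp is
        # pre-seeded so the denominator is never zero.
        si_score = float(relevant) / non_relevant if non_relevant else float(relevant)
        if si_method == 1:
            # Boost by source importance only for links from relevant parents.
            if webpage_label:
                tot_score *= si_score
        elif si_method == 2:
            tot_score = topic_weight * tot_score + si_weight * si_score
        return tot_score

For example, a link with page_score 0.8 and url_score 0.5 from a domain with counts [6, 2] starts at tot_score 0.4 (si_score 3.0) and is boosted to 1.2 under method 1 when the parent page was labeled relevant.
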