Ejemplo n.º 1
0
    def parseDomains( self ):
        """Extract the unique domain names from self.urls into self.domains.

        Assumes every URL starts with 'http://' (the first 7 characters
        are stripped).  If bit 2 of self.removeSelfLinks is set, the
        page's own domain is removed from the result.  Sets
        self.parseComplete to True when done.
        """
        # Build the URL list first if it has not been parsed yet.
        if not self.urls:
            self.parseUrls()

        domains = []
        for url in self.urls:
            # Drop the 'http://' scheme, then keep only the host part
            # (everything before the first '/', or the whole string if
            # there is no path).
            host = url[7:]
            index = host.find('/')
            if index != -1:
                host = host[:index]
            domains.append(host)

        # Derive this page's own domain the same way.
        selfDomain = self.pageUrl[7:]
        index = selfDomain.find('/')
        if index != -1:
            # BUG FIX: the original sliced self.pageUrl (scheme included)
            # with an index computed against the scheme-stripped string,
            # producing a garbled value that could never match a domain.
            selfDomain = selfDomain[:index]

        domains = uniqify_nop(domains)

        # Bit 2 of removeSelfLinks controls dropping our own domain.
        if self.removeSelfLinks & 2:
            if selfDomain in domains:
                domains.remove(selfDomain)

        self.domains = domains
        self.parseComplete = True
        return
Ejemplo n.º 2
0
    def parseUrls( self ):
        """Parse all <a href=...> targets out of self.htmlCode into self.urls.

        Absolute 'http://' links are kept as-is; relative links (no
        scheme anywhere in the string, and not the bare root '/') are
        prefixed with self.domainName.  If bit 1 of self.removeSelfLinks
        is set, links pointing back to self.pageUrl are dropped.
        """
        soup = BeautifulSoup( self.htmlCode )
        a_tags = soup.findAll('a')

        # Collect every href attribute value.  NOTE(review): this
        # iterates tag.attrs as (name, value) pairs, which is the old
        # BeautifulSoup 3 API — confirm the installed version.
        links = []
        for tag in a_tags:
            for name, value in tag.attrs:
                if name == 'href':
                    links.append(value)

        # Absolute links: those that begin with the http scheme.
        # (List comprehensions replace filter() here: the original code
        # assumed filter() returns a list, which breaks on Python 3, and
        # its lambdas shadowed the builtin 'str'.)
        urls = [link for link in links if link.startswith('http://')]

        # Relative links: no scheme anywhere; skip the bare root '/'.
        relLinks = [self.domainName + link
                    for link in links
                    if 'http://' not in link and link != '/']

        urls += relLinks

        urls = uniqify_nop(urls)

        # Bit 1 of removeSelfLinks controls dropping links to this page.
        if self.removeSelfLinks & 1:
            if self.pageUrl in urls:
                urls.remove(self.pageUrl)

        self.urls = urls
        return