Code example #1
0
    def canVisit(self, url):
        """Return True if the cached robots.txt rules for *url*'s top domain
        permit visiting it, False otherwise.

        On a cache miss the robots.txt file is downloaded and parsed first;
        if that parse fails, the URL is conservatively reported unvisitable.

        NOTE(review): `firstCh.groups(1)[0]` yields the first capture group,
        falling back to the default `1` if the group did not participate in
        the match — presumably intentional bucketing by first letter; confirm
        against `firstLetterCompile`'s pattern.
        """
        topDomain = utils.getTopDomain(url)
        if self.__rulesDict__.get(topDomain, None) is None:  # Cache miss
            robotsUrl = utils.robotsTxt(url)
            roboFileBuf = utils.dlAndDecode(robotsUrl)
            if not self.parseRobotFile(topDomain, roboFileBuf):
                # Could not obtain/parse rules: refuse the visit.
                return False
            # (Fixed: a dead `retr = self.__rulesDict__[topDomain]` store
            # previously sat here; the value is fetched below when needed.)

        # Path portion of the URL after the domain, empty fragments dropped.
        sp = tuple(filter(lambda a: a, url.split(topDomain)))
        if sp:
            firstCh = firstLetterCompile.search(sp[0])
            if firstCh:
                # Probe the 'disallow' rules, which are bucketed by the
                # path's first letter to limit how many patterns we try.
                fCh = firstCh.groups(1)[0]
                disallowRules = self.__rulesDict__[topDomain]['disallow']
                compList = disallowRules.get(fCh, None)
                if compList:
                    for comp in compList:
                        if comp.search(sp[0]):
                            return False
        return True
Code example #2
0
File: RobotParser.py  Project: odeke-em/crawlers
    def canVisit(self, url):
        """Check *url* against the robots.txt 'disallow' rules of its top
        domain; return False when a disallow pattern matches, True otherwise.

        A cache miss triggers a download + parse of the domain's robots.txt;
        a failed parse makes the URL unvisitable (returns False).
        """
        topDomain = utils.getTopDomain(url)
        if self.__rulesDict__.get(topDomain, None) is None:  # Cache miss
            robotsUrl = utils.robotsTxt(url)
            roboFileBuf = utils.dlAndDecode(robotsUrl)
            if not self.parseRobotFile(topDomain, roboFileBuf):
                return False
            # Removed a dead local assignment (`retr = ...`) that was never
            # read; the rules dict is looked up below when actually needed.

        # Everything after the domain in the URL, with empty pieces removed.
        sp = tuple(filter(lambda a: a, url.split(topDomain)))
        if sp:
            firstCh = firstLetterCompile.search(sp[0])
            if firstCh:
                # Disallow patterns are keyed by the path's first letter;
                # only that bucket needs probing.
                fCh = firstCh.groups(1)[0]
                bucket = self.__rulesDict__[topDomain]['disallow'].get(fCh, None)
                if bucket:
                    for comp in bucket:
                        if comp.search(sp[0]):
                            return False
        return True
Code example #3
0
 def addRobotRule(self, url):
     """Resolve the robots.txt location for *url*'s top domain.

     NOTE(review): this snippet appears truncated — `robotPath` is assigned
     but never used in the visible lines; confirm behavior against the full
     RobotParser.py source.
     """
     topDomain = utils.getTopDomain(url)
     if topDomain:
         # Location of the robots.txt file for this domain.
         robotPath = utils.robotsTxt(topDomain)
Code example #4
0
File: RobotParser.py  Project: odeke-em/crawlers
 def addRobotRule(self, url): 
     """Compute the robots.txt path for *url*'s top domain, if one exists.

     NOTE(review): visible lines end with `robotPath` unused — the snippet
     is presumably truncated; verify against the complete file.
     """
     topDomain = utils.getTopDomain(url)
     if topDomain:
         # robots.txt URL derived from the bare top domain.
         robotPath = utils.robotsTxt(topDomain)