Example no. 1
class PythonRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from six.moves.urllib_robotparser import RobotFileParser
        self.spider = spider
        try:
            robotstxt_body = to_native_str(robotstxt_body)
        except UnicodeDecodeError:
            # If we found garbage or robots.txt in an encoding other than UTF-8, disregard it.
            # Switch to 'allow all' state.
            logger.warning("Failure while parsing robots.txt using %(parser)s."
                           " File either contains garbage or is in an encoding other than UTF-8, treating it as an empty file.",
                           {'parser': "RobotFileParser"},
                           exc_info=sys.exc_info(),
                           extra={'spider': self.spider})
            robotstxt_body = ''
        self.rp = RobotFileParser()
        self.rp.parse(robotstxt_body.splitlines())

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_native_str(user_agent)
        url = to_native_str(url)
        return self.rp.can_fetch(user_agent, url)
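
A minimal usage sketch for this parser (not from the original source; the robots.txt body, URLs, and user agent below are made up for illustration):

body = b"User-agent: *\nDisallow: /private/\n"
parser = PythonRobotParser(body, spider=None)
parser.allowed("https://example.com/private/page", "mybot")  # False: /private/ is disallowed
parser.allowed("https://example.com/index.html", "mybot")    # True
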
Example no. 2
 def __init__(self, robotstxt_body, spider):
     from six.moves.urllib_robotparser import RobotFileParser
     self.spider = spider
     # Decode the raw robots.txt body to a native string; decoding problems
     # are handled inside Scrapy's decode_robotstxt() helper.
     robotstxt_body = decode_robotstxt(
         robotstxt_body, spider, to_native_str_type=True)
     self.rp = RobotFileParser()
     self.rp.parse(robotstxt_body.splitlines())
Example no. 3
def get_robotstxt_parser(url, session=None):
    """Get a RobotFileParser for the given robots.txt URL."""
    rp = RobotFileParser()
    try:
        req = urlopen(url, session, max_content_bytes=MaxContentBytes,
                      raise_for_status=False)
    except Exception:
        # connect or timeout errors are treated as an absent robots.txt
        rp.allow_all = True
    else:
        if req.status_code >= 400:
            rp.allow_all = True
        elif req.status_code == 200:
            rp.parse(req.text.splitlines())
    return rp
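
A hedged usage sketch for this helper (the URL and user agent are placeholders; urlopen and MaxContentBytes here are the surrounding project's own helpers, not the standard library's):

rp = get_robotstxt_parser("https://example.com/robots.txt")
if rp.can_fetch("mybot", "https://example.com/some/page"):
    pass  # robots.txt allows fetching this page (or was absent/unreadable)
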
Example no. 4
 def __init__(self, robotstxt_body, spider):
     from six.moves.urllib_robotparser import RobotFileParser
     self.spider = spider
     try:
         robotstxt_body = to_native_str(robotstxt_body)
     except UnicodeDecodeError:
         # If we found garbage or robots.txt in an encoding other than UTF-8, disregard it.
         # Switch to 'allow all' state.
         logger.warning("Failure while parsing robots.txt using %(parser)s."
                        " File either contains garbage or is in an encoding other than UTF-8, treating it as an empty file.",
                        {'parser': "RobotFileParser"},
                        exc_info=sys.exc_info(),
                        extra={'spider': self.spider})
         robotstxt_body = ''
     self.rp = RobotFileParser()
     self.rp.parse(robotstxt_body.splitlines())
Example no. 5
class PythonRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from six.moves.urllib_robotparser import RobotFileParser
        self.spider = spider
        robotstxt_body = decode_robotstxt(
            robotstxt_body, spider, to_native_str_type=True)
        self.rp = RobotFileParser()
        self.rp.parse(robotstxt_body.splitlines())

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_native_str(user_agent)
        url = to_native_str(url)
        return self.rp.can_fetch(user_agent, url)
Example no. 6
def get_robotstxt_parser(url, session=None):
    """Get a RobotFileParser for the given robots.txt URL."""
    rp = RobotFileParser()
    try:
        req = urlopen(url, session, max_content_bytes=MaxContentBytes,
                      allow_errors=range(600))
    except Exception:
        # connect or timeout errors are treated as an absent robots.txt
        rp.allow_all = True
    else:
        if req.status_code >= 400:
            rp.allow_all = True
        elif req.status_code == 200:
            rp.parse(req.text.splitlines())
    return rp
Example no. 7
 def __init__(self, user_agent, url):
     # Remember the default user agent and let RobotFileParser handle the URL.
     self.url = url
     self.user_agent = user_agent
     RobotFileParser.__init__(self, self.url)
Example no. 8
 def can_fetch(self, url, useragent=None):
     # Delegate to the base class, defaulting to this instance's user agent.
     return RobotFileParser.can_fetch(self,
                                      useragent=useragent or self.user_agent,
                                      url=url)
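
Examples no. 7 and no. 8 look like two methods of the same RobotFileParser subclass. A self-contained sketch of that pattern, assuming the standard library parser and a hypothetical class name:

from urllib.robotparser import RobotFileParser

class DefaultAgentRobotParser(RobotFileParser):
    """Hypothetical subclass combining the two methods shown above."""

    def __init__(self, user_agent, url):
        self.url = url
        self.user_agent = user_agent
        RobotFileParser.__init__(self, self.url)

    def can_fetch(self, url, useragent=None):
        # Fall back to the default user agent stored on the instance.
        return RobotFileParser.can_fetch(
            self, useragent or self.user_agent, url)

rp = DefaultAgentRobotParser("mybot", "https://example.com/robots.txt")
rp.read()  # download and parse the robots.txt at rp.url
rp.can_fetch("https://example.com/page")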