Code Example #1
def get_robotstxt_parser(url, session=None):
    """Get a RobotFileParser for the given robots.txt URL."""
    rp = RobotFileParser()
    try:
        req = urlopen(url, session, max_content_bytes=MaxContentBytes,
                      allow_errors=range(600))
    except Exception:
        # connect or timeout errors are treated as an absent robots.txt
        rp.allow_all = True
    else:
        if req.status_code >= 400:
            rp.allow_all = True
        elif req.status_code == 200:
            rp.parse(req.text.splitlines())
    return rp
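
The helper above returns a standard urllib.robotparser.RobotFileParser, so callers can query it with can_fetch. A minimal usage sketch follows; the robots.txt URL and user agent are illustrative, and session is left at its default:

# Illustrative call; get_robotstxt_parser is the helper defined above.
rp = get_robotstxt_parser("https://example.com/robots.txt")
if rp.can_fetch("MyCrawler/1.0", "https://example.com/some/page"):
    print("allowed to crawl this page")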
Code Example #2
File: db_classes.py  Project: jgombac/crawler
 def get_robots(self):
     rp = RobotFileParser()
     if self.robots_content:
         # parse() expects an iterable of lines, so split the stored text first
         rp.parse(self.robots_content.splitlines())
     else:
         rp.allow_all = True
     return rp
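
For context, here is a minimal standard-library sketch of parsing a stored robots.txt body and querying it; the robots text and URLs are made up:

from urllib.robotparser import RobotFileParser

robots_content = "User-agent: *\nDisallow: /private/"  # e.g. loaded from the database
rp = RobotFileParser()
rp.parse(robots_content.splitlines())
print(rp.can_fetch("*", "https://example.com/private/page"))  # False
print(rp.can_fetch("*", "https://example.com/public/page"))   # True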
Code Example #3
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser


async def parse_robots(session, base):
    """Fetches and parses the robots.txt file from a given base URL. Returns an instance of
    RobotFileParser."""

    url = urljoin(base, "robots.txt")
    async with session.get(url) as response:
        status = response.status
        text = await response.text()
    robot_parser = RobotFileParser()
    if status == 200:
        robot_parser.parse(text.splitlines())
    else:
        robot_parser.allow_all = True
    return robot_parser
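
The response interface (response.status, await response.text()) matches aiohttp, so a usage sketch could look like the following; the assumption that session is an aiohttp.ClientSession, plus the base URL and user agent, are illustrative:

import asyncio
import aiohttp

async def main():
    # Assumes session is an aiohttp.ClientSession (not stated in the snippet above).
    async with aiohttp.ClientSession() as session:
        rp = await parse_robots(session, "https://example.com/")
        print(rp.can_fetch("MyCrawler/1.0", "https://example.com/some/page"))

asyncio.run(main())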
Code Example #4
File: store.py  Project: snowwym/proj_news_viz
 def get_robots_parser(self, url: str):
     rp = RobotFileParser()
     if self.store.exists(url, 'txt'):
         body = self.store.load_url(url, 'txt')
     else:
         page, status_code = download_page(url, 'Robot')
         body = page.body
         if status_code in [401, 403]:
             body = self.DISALLOW_ALL
         elif 400 <= status_code < 500:  # including status_code 404
             body = self.ALLOW_ALL
         self.store.save_url(url, body, 'txt')
     if body.strip() == self.ALLOW_ALL:
         rp.allow_all = True
     elif body.strip() == self.DISALLOW_ALL:
         rp.disallow_all = True
     else:
         rp.parse(body.decode('utf-8').splitlines())
     return rp
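
The ALLOW_ALL and DISALLOW_ALL sentinels are not shown in the snippet; since they are compared against body.strip() and body appears to be bytes, they could plausibly be defined as byte-string robots.txt bodies like the hypothetical values below:

# Hypothetical class-level sentinels (not part of the original snippet).
ALLOW_ALL = b"User-agent: *\nAllow: /"
DISALLOW_ALL = b"User-agent: *\nDisallow: /"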