Example #1
class GenericSiteCrawler(BaseSiteCrawler):
    def __init__(self, hostname, dbconn, siteid, https=False):
        super(GenericSiteCrawler, self).__init__(hostname,
                                                 dbconn,
                                                 siteid,
                                                 https=https)

    def init_crawl(self):
        # Load robots.txt
        self.robots = RobotsParser("http://%s/robots.txt" % self.hostname)

        # We need to seed the crawler with every URL we've already seen, since
        # we don't recrawl the contents if they haven't changed.
        allpages = self.scantimes.keys()

        # Figure out if there are any excludes to deal with (beyond the
        # robots.txt ones)
        curs = self.dbconn.cursor()
        curs.execute("SELECT suburlre FROM site_excludes WHERE site=%(site)s",
                     {
                         'site': self.siteid,
                     })
        self.extra_excludes = [re.compile(x) for x, in curs.fetchall()]

        # We *always* crawl the root page, of course
        self.queue.put(("/", 0.5, False))

        # Now do all the other pages
        for x in allpages:
            self.queue.put((x, 0.5, False))

    def exclude_url(self, url):
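        # Check robots.txt first, then the per-site exclude regexps loaded
        # in init_crawl.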
        if self.robots and self.robots.block_url(url):
            return True
        for r in self.extra_excludes:
            if r.search(url):
                return True
        return False

    def queue_url(self, url):
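        # Add the URL to the crawl queue at the default priority.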
        self.queue.put((url.strip(), 0.5, False))

    def post_process_page(self, url):
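        # Queue every link found on the page that we haven't already
        # crawled and that isn't excluded.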
        for l in self.resolve_links(self.page.links, url):
            if l in self.pages_crawled or l + "/" in self.pages_crawled:
                continue
            if self.exclude_url(l):
                continue
            self.queue_url(l)
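
The exclusion logic in exclude_url can be exercised on its own. Below is a minimal, self-contained sketch (Python 3) of the same idea; it uses the standard library's urllib.robotparser as a stand-in for the RobotsParser class and a hard-coded list of patterns in place of the suburlre values read from the site_excludes table, since neither is part of this excerpt:

import re
from urllib import robotparser

# Stand-in for the RobotsParser class, which is not shown in this excerpt.
robots = robotparser.RobotFileParser("http://www.example.org/robots.txt")
robots.read()

# Stand-in for the suburlre patterns loaded from the site_excludes table.
extra_excludes = [re.compile(p) for p in (r"^/private/", r"\.pdf$")]


def exclude_url(url):
    # Same order as GenericSiteCrawler.exclude_url: robots.txt first,
    # then the per-site regexps.
    if not robots.can_fetch("*", url):
        return True
    return any(r.search(url) for r in extra_excludes)


for candidate in ("/", "/docs/", "/private/x.html", "/files/report.pdf"):
    print(candidate, "excluded" if exclude_url(candidate) else "queued")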
Example #2
    def init_crawl(self):
        # Load robots.txt
        self.robots = RobotsParser("http://%s/robots.txt" % self.hostname)

        # We need to seed the crawler with every URL we've already seen, since
        # we don't recrawl the contents if they haven't changed.
        allpages = self.scantimes.keys()

        # Figure out if there are any excludes to deal with (beyond the
        # robots.txt ones)
        curs = self.dbconn.cursor()
        curs.execute("SELECT suburlre FROM site_excludes WHERE site=%(site)s",
                     {
                         'site': self.siteid,
                     })
        self.extra_excludes = [re.compile(x) for x, in curs.fetchall()]

        # We *always* crawl the root page, of course
        self.queue.put(("/", 0.5, False))

        # Now do all the other pages
        for x in allpages:
            self.queue.put((x, 0.5, False))
Example #3
class GenericSiteCrawler(BaseSiteCrawler):
	def __init__(self, hostname, dbconn, siteid):
		super(GenericSiteCrawler, self).__init__(hostname, dbconn, siteid)

	def init_crawl(self):
		# Load robots.txt
		self.robots = RobotsParser("http://%s/robots.txt" % self.hostname)

		# We need to seed the crawler with every URL we've already seen, since
		# we don't recrawl the contents if they haven't changed.
		allpages = self.scantimes.keys()

		# Figure out if there are any excludes to deal with (beyond the
		# robots.txt ones)
		curs = self.dbconn.cursor()
		curs.execute("SELECT suburlre FROM site_excludes WHERE site=%(site)s", {
				'site': self.siteid,
				})
		self.extra_excludes = [re.compile(x) for x, in curs.fetchall()]

		# We *always* crawl the root page, of course
		self.queue.put(("/", 0.5))

		# Now do all the other pages
		for x in allpages:
			self.queue.put((x, 0.5))

	def exclude_url(self, url):
		if self.robots and self.robots.block_url(url):
			return True
		for r in self.extra_excludes:
			if r.search(url):
				return True
		return False

	def queue_url(self, url):
		self.queue.put((url.strip(), 0.5))

	def post_process_page(self, url):
		for l in self.resolve_links(self.page.links, url):
			if l in self.pages_crawled or l+"/" in self.pages_crawled:
				continue
			if self.exclude_url(l):
				continue
			self.queue_url(l)
Example #4
	def init_crawl(self):
		# Load robots.txt
		self.robots = RobotsParser("http://%s/robots.txt" % self.hostname)

		# We need to seed the crawler with every URL we've already seen, since
		# we don't recrawl the contents if they haven't changed.
		allpages = self.scantimes.keys()

		# Figure out if there are any excludes to deal with (beyond the
		# robots.txt ones)
		curs = self.dbconn.cursor()
		curs.execute("SELECT suburlre FROM site_excludes WHERE site=%(site)s", {
				'site': self.siteid,
				})
		self.extra_excludes = [re.compile(x) for x, in curs.fetchall()]

		# We *always* crawl the root page, of course
		self.queue.put(("/", 0.5))

		# Now do all the other pages
		for x in allpages:
			self.queue.put((x, 0.5))
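
In Examples #3 and #4 the queue entries are (url, priority) pairs rather than the (url, priority, flag) triples used in Examples #1 and #2; the queue itself is created by BaseSiteCrawler and is not shown here. As a rough illustration of the seeding pattern in init_crawl, here is a small sketch that assumes a plain queue.Queue and a dictionary standing in for self.scantimes (the actual types in BaseSiteCrawler may differ):

import queue

# Assumed stand-ins for attributes normally set up by BaseSiteCrawler.
scantimes = {"/about/": None, "/docs/": None}   # URLs seen on earlier crawls
crawl_queue = queue.Queue()

# Always seed the root page, then every previously seen URL, mirroring
# init_crawl above; 0.5 is the default priority used in the examples.
crawl_queue.put(("/", 0.5))
for url in scantimes.keys():
    crawl_queue.put((url, 0.5))

while not crawl_queue.empty():
    url, prio = crawl_queue.get()
    print("would crawl %s at priority %.1f" % (url, prio))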