Example #1
import re
import threading
import time
import urllib.parse
from queue import Queue

# RobotStorage, PageStorage, Watcher, Fetcher, Parser and __user_agent__
# are project-specific and assumed to be provided elsewhere in the
# crawler's own modules; they are not part of the standard library.

class Crawler:
    """
    A simple Web Crawler
    """
    def __init__(self, seeds, store_loc, num_threads=1, threshold=0, verbose=0):
        self.verbose = verbose # Verbosity level: 0 = quiet, 1 = progress, 2 = debug
        self.threshold = int(threshold) # Max. Number of Pages to Crawl
        self.robotstorage = RobotStorage(__user_agent__)
        self.pagestorage = PageStorage({'store_location': store_loc})
        self.urllist = [] # List of URLs crawled or in queue
        self.frontier = Queue() # Crawler's Request Queue
        self.watcher = Watcher() # Watch for Keyboard Interrupt
        self._pagesstored = 0 # Number of pages stored
        self._lock = threading.Lock()
        for n in range(num_threads): # Pool of Threads
            worker = threading.Thread(target=self.crawl, args=(n,), daemon=True)
            worker.start()

        for link in seeds:
            self.queue_url(link)
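        # Block until every queued URL has been processed; the daemon
        # workers are then discarded when the main thread exits.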
        self.frontier.join()

    def crawl(self, tid):
        """Worker loop: pull URLs from the frontier, fetch and store them"""
        while True:
            # Stop crawling once the page threshold is reached
            # (a threshold of 0 means unlimited).
            if self.threshold > 0 and self._pagesstored >= self.threshold:
                print("### Number of URLs crawled:", self._pagesstored)
                self.watcher.kill()
                break

            # Take a URL before entering the try block, so the finally
            # clause below calls task_done() exactly once per item
            # actually removed from the queue.
            url = self.frontier.get()
            try:
                if self.verbose:
                    print("  Crawler #%d: %s" % (tid, url))

                robot = self.robotstorage.get_robot(url)
                if robot.is_allowed(url):
                    # Delay processing
                    d_time = robot.delay_remaining(time.time())
                    if d_time > 0:
                        if self.verbose > 1:
                            print("\n###*** Delaying for %f seconds" % d_time)
                        time.sleep(d_time)

                    # Update the last request time
                    robot.update_last_request(time.time())

                    # Download the Page
                    page = Fetcher(url, verbose=self.verbose)
                    doc = page.get_content()
                    if doc:
                        self.pagestorage.store(url, doc)
                        with self._lock:
                            self._pagesstored += 1

                        # Parse Page for Links
                        p = Parser(url, doc)
                        links = p.get_links()
                        if self.verbose > 1:
                            print("    # links on %s: %d" % (url, len(links)))
                        for link in links:
                            self.queue_url(link)
                else:
                    if self.verbose > 1:
                        print("*** URL not allowed:", url)
            finally:
                self.frontier.task_done()

    def queue_url(self, url):
        """Add url to the queue for crawling, skipping duplicates"""
        # The membership check and the append must happen atomically,
        # or two workers could enqueue the same URL at the same time.
        with self._lock:
            if url not in self.urllist and self.validate_url(url):
                self.urllist.append(url)
                self.frontier.put(url)

    def validate_url(self, url):
        """Validate given url - skip images/videos/archives and the like"""
        u = urllib.parse.urlparse(url)
        if u.scheme in ('http', 'https') and u.netloc and not u.fragment:
            # Require a literal dot before the extension so that paths
            # such as "/unzip" or "/winamp3" are not rejected by accident.
            return not re.search(r'\.(jpg|jpeg|gif|png|exe|msi|dmg|gz|zip|tar|mov|mpg|mp3|mp4)$', u.path, re.I)
        return False
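
The example does not show the crawler being driven. Assuming the supporting classes (RobotStorage, PageStorage, Watcher, Fetcher, Parser) are importable, usage might look like the following sketch; the seed URL, store location, thread count and threshold are made up for illustration:

if __name__ == '__main__':
    # Hypothetical seeds and store location; adjust to taste.
    seeds = ['https://example.com/']
    crawler = Crawler(seeds, '/tmp/crawled_pages',
                      num_threads=4, threshold=100, verbose=1)

Note the design choice: the constructor itself runs the whole crawl, blocking on frontier.join() until the queue drains, so there is nothing left to call on the returned object.

The robot objects handed out by RobotStorage are also project-specific, but their is_allowed check maps naturally onto the standard library. A minimal sketch using urllib.robotparser (the host and user agent string are placeholders):

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url('https://example.com/robots.txt')  # placeholder host
rp.read()                                     # download and parse robots.txt
print(rp.can_fetch('MyCrawler/1.0', 'https://example.com/private/'))
print(rp.crawl_delay('MyCrawler/1.0'))        # None if no Crawl-delay rule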