Example #1
import re
import threading
import time
import urllib.parse
from queue import Queue

# RobotStorage, PageStorage, Watcher, Fetcher, Parser and __user_agent__
# are project-specific and assumed to be provided elsewhere in the
# crawler's own modules; they are not part of the standard library.

class Crawler:
    """
    A simple Web Crawler
    """
    def __init__(self, seeds, store_loc, num_threads=1, threshold=0, verbose=0):
        self.verbose = verbose # Verbosity level: 0 = quiet, 1 = progress, 2 = debug
        self.threshold = int(threshold) # Max. Number of Pages to Crawl
        self.robotstorage = RobotStorage(__user_agent__)
        self.pagestorage = PageStorage({'store_location': store_loc})
        self.urllist = [] # List of URLs crawled or in queue
        self.frontier = Queue() # Crawler's Request Queue
        self.watcher = Watcher() # Watch for Keyboard Interrupt
        self._pagesstored = 0 # Number of pages stored
        self._lock = threading.Lock()
        for n in range(num_threads): # Pool of Threads
            worker = threading.Thread(target=self.crawl, args=(n,), daemon=True)
            worker.start()

        for link in seeds:
            self.queue_url(link)
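        # Block until every queued URL has been processed; the daemon
        # workers are then discarded when the main thread exits.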
        self.frontier.join()

    def crawl(self, tid):
        """Worker loop: pull URLs from the frontier, fetch and store them"""
        while True:
            # Stop crawling once the page threshold is reached
            # (a threshold of 0 means unlimited).
            if self.threshold > 0 and self._pagesstored >= self.threshold:
                print("### Number of URLs crawled:", self._pagesstored)
                self.watcher.kill()
                break

            # Take a URL before entering the try block, so the finally
            # clause below calls task_done() exactly once per item
            # actually removed from the queue.
            url = self.frontier.get()
            try:
                if self.verbose:
                    print("  Crawler #%d: %s" % (tid, url))

                robot = self.robotstorage.get_robot(url)
                if robot.is_allowed(url):
                    # Delay processing
                    d_time = robot.delay_remaining(time.time())
                    if d_time > 0:
                        if self.verbose > 1:
                            print("\n###*** Delaying for %f seconds" % d_time)
                        time.sleep(d_time)

                    # Update the last request time
                    robot.update_last_request(time.time())

                    # Download the Page
                    page = Fetcher(url, verbose=self.verbose)
                    doc = page.get_content()
                    if doc:
                        self.pagestorage.store(url, doc)
                        with self._lock:
                            self._pagesstored += 1

                        # Parse Page for Links
                        p = Parser(url, doc)
                        links = p.get_links()
                        if self.verbose > 1:
                            print("    # links on %s: %d" % (url, len(links)))
                        for link in links:
                            self.queue_url(link)
                else:
                    if self.verbose > 1:
                        print("*** URL not allowed:", url)
            finally:
                self.frontier.task_done()

    def queue_url(self, url):
        """Add url to the queue for crawling, skipping duplicates"""
        # The membership check and the append must happen atomically,
        # or two workers could enqueue the same URL at the same time.
        with self._lock:
            if url not in self.urllist and self.validate_url(url):
                self.urllist.append(url)
                self.frontier.put(url)

    def validate_url(self, url):
        """Validate given url - skip images/videos/archives and the like"""
        u = urllib.parse.urlparse(url)
        if u.scheme in ('http', 'https') and u.netloc and not u.fragment:
            # Require a literal dot before the extension so that paths
            # such as "/unzip" or "/winamp3" are not rejected by accident.
            return not re.search(r'\.(jpg|jpeg|gif|png|exe|msi|dmg|gz|zip|tar|mov|mpg|mp3|mp4)$', u.path, re.I)
        return False
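
The example does not show the crawler being driven. Assuming the supporting classes (RobotStorage, PageStorage, Watcher, Fetcher, Parser) are importable, usage might look like the following sketch; the seed URL, store location, thread count and threshold are made up for illustration:

if __name__ == '__main__':
    # Hypothetical seeds and store location; adjust to taste.
    seeds = ['https://example.com/']
    crawler = Crawler(seeds, '/tmp/crawled_pages',
                      num_threads=4, threshold=100, verbose=1)

Note the design choice: the constructor itself runs the whole crawl, blocking on frontier.join() until the queue drains, so there is nothing left to call on the returned object.

The robot objects handed out by RobotStorage are also project-specific, but their is_allowed check maps naturally onto the standard library. A minimal sketch using urllib.robotparser (the host and user agent string are placeholders):

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url('https://example.com/robots.txt')  # placeholder host
rp.read()                                     # download and parse robots.txt
print(rp.can_fetch('MyCrawler/1.0', 'https://example.com/private/'))
print(rp.crawl_delay('MyCrawler/1.0'))        # None if no Crawl-delay rule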