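# `SetQueue` is used below as the crawler's tabu list but is not defined in
# this module. A minimal sketch of what it might look like, assuming FIFO
# eviction once the memory limit is exceeded (the real implementation may
# differ):
class SetQueue(object):
    """Bounded 'seen' set: remembers at most `maxlen` items, oldest out first."""

    def __init__(self, maxlen):
        self.maxlen = maxlen
        self._items = {}  # dicts preserve insertion order in Python 3.7+

    def add(self, item):
        self._items.setdefault(item, None)
        if len(self._items) > self.maxlen:
            del self._items[next(iter(self._items))]  # evict the oldest entry

    def __contains__(self, item):
        return item in self._items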
import abc
from functools import wraps

import requests as req
from lxml import html


class Crawler(metaclass=abc.ABCMeta):

    def __init__(self, RESTART_LIMIT=100, VISIT_MEMORY=100):
        """
        RESTART_LIMIT: how many pages to visit before doing a random restart.
        VISIT_MEMORY: max pages to keep in a tabu list, i.e. a list of pages
            not to visit again. Prevents cycles.
        """
        self.current_url = None
        self.current_html = None
        self.pages_crawled = 0
        self.next_urls = None
        self.visited = SetQueue(VISIT_MEMORY)
        self.RESTART_LIMIT = RESTART_LIMIT

    def next_page(self):
        """Advance to a new page that is not in the tabu list."""
        if self.current_url is not None and self.next_urls is None:
            self._fetch_next_urls_from_current_page()
        found_page = False
        while not found_page:
            # Restart from a random page if there is no current page, the
            # restart limit has been hit, or the current page has no
            # candidate links left; otherwise take the next candidate link.
            if (self.current_url is None
                    or self._restart_required()
                    or self._no_new_pages_available_from_current()):
                new_page = self._random_restart()
            else:
                new_page = self.next_urls.pop()
            if new_page not in self.visited:
                found_page = True
        self.current_url = new_page
        self.current_html = None
        self.next_urls = None
        self.visited.add(new_page)
        self.pages_crawled += 1

    @abc.abstractmethod
    def _fetch_next_urls_from_current_page(self):
        """Scrape the current page for possible URLs to crawl next.

        Should build a list of candidate URLs from the HTML tree of the
        current page and assign it to self.next_urls. Shuffle the list as
        well, if randomness is desired.

        Returns:
            None
        """
        return

    @staticmethod
    def _fetch_page_content(f):
        """Decorator: ensure the HTML tree is fetched before the function body runs."""
        @wraps(f)
        def wrapped(inst, *args, **kwargs):
            if inst.current_html is None:
                page = req.get(inst.current_url)
                inst.current_html = html.fromstring(page.content)
            return f(inst, *args, **kwargs)
        return wrapped

    @abc.abstractmethod
    def _random_restart(self):
        """Return a link to a 'random' page.

        Returns:
            Link to a new page to be crawled.
        """
        return

    @abc.abstractmethod
    def scrape_page(self):
        """Scrape data from the current page.

        Note: maybe shouldn't require this implementation?

        Returns:
            Data from the current page, in whatever format.
        """
        return

    def _restart_required(self):
        # pages_crawled starts at 0, so this is also True before the first
        # page; harmless, since there is no current page to continue from yet.
        return self.pages_crawled % self.RESTART_LIMIT == 0

    def _no_new_pages_available_from_current(self):
        return self.next_urls == []
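# A hypothetical concrete subclass, to illustrate how the three abstract
# methods and the _fetch_page_content decorator fit together. The use of
# Wikipedia's Special:Random page, the XPath query, and the class itself are
# illustrative assumptions, not part of the base class.
import random


class RandomWikiCrawler(Crawler):

    RANDOM_PAGE = 'https://en.wikipedia.org/wiki/Special:Random'  # assumed restart target

    @Crawler._fetch_page_content
    def _fetch_next_urls_from_current_page(self):
        # Build the candidate list from the current HTML tree and shuffle it,
        # as the base class docstring suggests.
        links = self.current_html.xpath('//a/@href')
        self.next_urls = [link for link in links if link.startswith('http')]
        random.shuffle(self.next_urls)

    def _random_restart(self):
        # Special:Random redirects to a random article; return the resolved
        # URL so repeated restarts don't collide in the visited set.
        return req.get(self.RANDOM_PAGE).url

    @Crawler._fetch_page_content
    def scrape_page(self):
        # Return the page title as a stand-in for whatever data a real
        # subclass would extract.
        return self.current_html.findtext('.//title')


# Usage sketch:
#     crawler = RandomWikiCrawler()
#     crawler.next_page()
#     print(crawler.scrape_page())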