def enqueue(self, job_id):
    """
    Enqueue URLs for the spider to crawl.

    Arguments:
        job_id: integer job id.

    Returns:
        None
    """
    urls = data.redis.smembers('job' + str(job_id))
    self._queue.extend(urls)
    # Only start a new deployment if one is not already running and the job
    # has not been aborted; otherwise the URLs simply wait in the queue.
    if not self._active and not data.job_is_aborted(job_id):
        self._deploy(job_id)
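# Illustrative usage sketch only (not part of DeploymentManager): how a caller
# might seed the per-job Redis set that enqueue() reads. The job id, URLs, and
# the 'manager' argument below are hypothetical; only the 'job<id>' key scheme
# and the data.redis client come from the code above.
def _example_seed_and_enqueue(manager, job_id=42):
    seed_urls = ['https://example.com/', 'https://example.org/']
    data.redis.sadd('job' + str(job_id), *seed_urls)
    manager.enqueue(job_id)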
def _fetch_and_parse(self, job_id, url, depth):
    """
    Fetch a webpage and parse it for links and images.

    Arguments:
        job_id: integer job id.
        url: string URL.
        depth: integer current depth.

    Returns:
        None
    """
    html_parser = MyHtmlParser(url)
    request_headers = {'User-Agent': self.user_agent}
    request = urllib_Request(url, headers=request_headers)
    try:
        webpage = urlopen(request).read().decode()
    except Exception:
        data.redis.set(url, 'failed')
        return
    try:
        html_parser.feed(webpage)
    except HTMLParseError:
        data.redis.set(url, 'failed')
        return
    data.add_webpages(url, html_parser.hyperlinks, depth)
    data.redis.set(url, 'complete')
    data.complete_crawl(url)
    # If depth remains, the spider is still active, and the job has not been
    # aborted, push the newly found links onto the job's set and notify the
    # deployment channel so they are crawled in turn.
    if 0 < depth and self._active and not data.job_is_aborted(job_id):
        if html_parser.hyperlinks:
            data.redis.sadd('job' + str(job_id), *html_parser.hyperlinks)
            data.redis.publish('deploy', pickle.dumps(job_id))
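# Minimal sketch of the parser interface that _fetch_and_parse() relies on
# (construct with the page URL, call feed(), read .hyperlinks). This is an
# assumption drawn from how MyHtmlParser is used above, not the project's
# actual implementation; the real class also parses images.
from html.parser import HTMLParser
from urllib.parse import urljoin

class _SketchHtmlParser(HTMLParser):
    def __init__(self, base_url):
        super().__init__()
        self.base_url = base_url
        self.hyperlinks = []

    def handle_starttag(self, tag, attrs):
        # Resolve every <a href="..."> against the page URL so relative
        # links become absolute before they are queued for crawling.
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.hyperlinks.append(urljoin(self.base_url, value))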
def _deploy(self, job_id):
    """
    Deploy a spider to crawl the web.

    Use the DeploymentManager's enqueue method to specify which URLs to
    crawl. Depth should be assigned to each submitted URL prior to
    deployment.

    Arguments:
        job_id: integer job id.

    Returns:
        None
    """
    if data.job_is_aborted(job_id):
        self._active = False
        self._queue = []
        return
    self._active = True
    # Iterate over a copy so URLs can be removed from the live queue while
    # new ones are appended to it during the crawl.
    queue_copy = self._queue[:]
    for index, url in enumerate(queue_copy):
        if data.job_is_aborted(job_id):
            break
        self._queue.remove(url)
        validated_url = validate_url(url)
        url = validated_url['url']
        webpage_info = data.get_webpage_info(url)
        if not claim(url):
            continue
        if not validated_url['valid']:
            continue
        # Ignore webpages crawled less than 15 minutes ago.
        if self._less_than_15_min_ago(webpage_info['completion_datetime']):
            continue
        # Database latency means depth is occasionally still unavailable.
        if not webpage_info['depth']:
            # Child URLs with no job_id and no depth have been deleted.
            if bool(data.redis.llen('reg:' + url)):
                data.redis.set(url, 'ready')
                self._queue.append(url)
            continue
        depth = webpage_info['depth'] - 1
        self._set_job_status(job_id, depth, index, len(queue_copy))
        self._fetch_and_parse(job_id, url, depth)
        time.sleep(self.delay)
    if data.job_is_aborted(job_id):
        self._active = False
        self._queue = []
    else:
        if len(self._queue):
            # New URLs arrived while crawling; wait, then deploy again.
            time.sleep(self.delay)
            self._deploy(job_id)
        else:
            self._set_job_status(job_id, -1, -1, 0, 'Complete')
            self._active = False
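# Hedged sketch of the subscriber side of the 'deploy' channel that
# _fetch_and_parse() publishes to. The loop below is an assumption about how
# the manager might consume those messages; only the channel name and the
# pickled job id payload come from the code above. Assumes the module-level
# 'data' and 'pickle' imports used by the methods in this file.
def _example_deploy_listener(manager):
    pubsub = data.redis.pubsub()
    pubsub.subscribe('deploy')
    for message in pubsub.listen():
        if message['type'] == 'message':
            job_id = pickle.loads(message['data'])
            # Re-read the job's URL set and deploy if the spider is idle.
            manager.enqueue(job_id)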