def get(self, url, user_agent=None, cookies=None, respect_robots=False, cache=False):
    """Fetch a URL and return a picklable LiteResponse, optionally served from / written to the cache."""
    logger.log_info('GET %s' % url)
    # Lazy init: only open the cache backend when caching is requested.
    if cache and not self.cache_db:
        self.cache_connect()
    if config.respect_robots and respect_robots and not robots.can_fetch(url):
        # Blocked by robots.txt
        return False
    if cache:
        cache_result = self.cache_db.get(str(url))
        if cache_result:
            return pickle.loads(cache_result)
        else:
            logger.log_info('URL[%s] not found in cache' % url)
    headers = dict()
    if config.user_agent:
        headers['User-Agent'] = config.user_agent
    if user_agent:
        headers['User-Agent'] = user_agent
    res = requests.get(url, headers=headers, cookies=cookies or {})
    if res:
        # Don't reuse the connection.
        res.connection.close()
        logger.log_info('GET[%s] HTTP(%s)' % (url, res.status_code))
        lite_res = LiteResponse()
        lite_res.content = res.content
        lite_res.cookies = requests.utils.dict_from_cookiejar(res.cookies)
        lite_res.headers = res.headers
        lite_res.status_code = res.status_code
        pickled = pickle.dumps(lite_res)
        if self.cache_db:
            self.cache_db.put(str(url), pickled)
            logger.log_info('Put content at URL %s into cache' % url)
        return lite_res
    else:
        logger.log_info('Request for URL %s failed' % url)
        return None
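# get() pickles a LiteResponse rather than the raw requests.Response, which is not
# reliably picklable. Below is a minimal sketch of what that container could look
# like, based only on the attributes assigned above -- an illustrative assumption,
# not the project's actual definition; the real class may carry more fields.
class LiteResponse(object):
    """Picklable snapshot of the parts of a response the crawler needs."""

    def __init__(self):
        self.content = None      # raw response body (bytes)
        self.cookies = {}        # cookies as a plain dict, not a CookieJar
        self.headers = {}        # response headers mapping
        self.status_code = None  # HTTP status code

# Usage mirrors the cache path in get(): the object round-trips through
# pickle.dumps()/pickle.loads() before being stored in or read from cache_db.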
def run(self):
    with self.get_job(start_offset=self.crawl_delay) as job:
        # Track the URL separately so it can be reported when logging an exception.
        url = None
        try:
            if not job:
                return
            url = job.get_url()
            # TODO: make respecting robots configurable per strategy
            resp_args = {
                'cache': self.cache_requests,
                'respect_robots': config.respect_robots if config.respect_robots else True,
            }
            cookies = job.get_cookies()
            if cookies:
                resp_args['cookies'] = cookies
            resp = job.get(url, **resp_args)
            if not resp:
                logger.log_info('Failed to get url: ' + url)
                return StrategyResult.PASSED
            if resp.status_code != 200:
                logger.log_info('[WARNING] Job with URL %s failed with code(%s)' %
                                (url, resp.status_code))
                return StrategyResult.PASSED
            if self.can_scrape(url, job):
                # Maybe scraping and emitting items can be folded into one step.
                items = self.get_scraped_items(resp, job)
                for item in items:
                    self.on_item_scraped(item, job)
            content_type = resp.headers.get('content-type')
            sitemap_links = None
            # Treat the response as a sitemap when it looks like XML and actually parses as one.
            if (content_type in ('text/xml', 'application/xml') or url.endswith('.xml')) \
                    and sitemap.is_sitemap(resp.content):
                sitemap_links = sitemap.get_links(resp.content)
            follow_filter = job.get_follow_filter()
            if sitemap_links:
                links = [link.loc for link in sitemap_links]
            elif follow_filter:
                links = linkparser.parse_links(resp.content, url,
                                               restrict_to_host=True,
                                               selector_filter=follow_filter)
            else:
                links = linkparser.parse_links(resp.content, url, restrict_to_host=True)
            if not follow_filter:
                links = filter(lambda link: linkparser.can_follow(link, job.get_allowed_links()),
                               links)
            for link in links:
                job.add_url(link)
            if resp.cookies and len(resp.cookies):
                cookies = resp.cookies
                if isinstance(resp.cookies, RequestsCookieJar):
                    cookies = requests.utils.dict_from_cookiejar(resp.cookies)
                job.set_cookies(cookies)
        except Exception as e:
            logger.log_error('[%s] Exception caught %s' % (url, e))
            return StrategyResult.PASSED
        return StrategyResult.SUCCESS
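# run() signals its outcome through StrategyResult. Only PASSED (job skipped or
# failed softly) and SUCCESS appear in this excerpt; the sketch below assumes a
# simple enum and is illustrative only -- the real type may define more states.
from enum import Enum


class StrategyResult(Enum):
    SUCCESS = 1  # job fetched, scraped, and follow-up links queued
    PASSED = 2   # job skipped: fetch failed, non-200 status, or an exception was raised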