class HcfMiddleware(object):
    """Scrapy spider middleware integrating the HubStorage Crawl Frontier (HCF).

    GET requests flagged with ``meta['use_hcf']`` are diverted into the
    frontier instead of being scheduled locally; when the spider goes idle,
    new batches of links are read back from the frontier to keep it running.
    Falls back to ``start_requests`` when the frontier is empty.
    """

    def __init__(self, crawler):
        self.crawler = crawler
        hs_endpoint = self._get_config(crawler, "HS_ENDPOINT")
        hs_auth = self._get_config(crawler, "HS_AUTH")
        self.hs_projectid = self._get_config(crawler, "HS_PROJECTID")
        # Project-wide defaults; a spider may override them via its
        # `frontier`/`slot` attributes (see process_start_requests).
        self.hs_project_frontier = self._get_config(crawler, "HS_FRONTIER")
        self.hs_project_slot = self._get_config(crawler, "HS_SLOT")
        # Max number of batches to read from the HCF within a single run.
        try:
            self.hs_max_batches = int(crawler.settings.get("HS_MAX_BATCHES", DEFAULT_MAX_BATCHES))
        except ValueError:
            # Non-numeric setting value: silently fall back to the default.
            self.hs_max_batches = DEFAULT_MAX_BATCHES
        self.hs_start_job_on_reason = crawler.settings.get("HS_START_JOB_ON_REASON", [])
        self.hsclient = HubstorageClient(auth=hs_auth, endpoint=hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier
        # New links extracted during this run, grouped by destination slot.
        self.new_links = defaultdict(list)
        # Ids of the frontier batches consumed in this run; deleted once the
        # run finishes cleanly so they are not re-processed.
        self.batch_ids = []
        crawler.signals.connect(self.idle_spider, signals.spider_idle)
        crawler.signals.connect(self.close_spider, signals.spider_closed)

    def _get_config(self, crawler, key):
        """Return the required setting `key` or raise NotConfigured."""
        value = crawler.settings.get(key)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg):
        """Log a message tagged with the (HCF) prefix."""
        log.msg('(HCF) %s' % msg)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):
        """Seed the spider from the frontier, or from start_requests if empty."""
        # Per-spider overrides of the project-wide frontier/slot.
        self.hs_frontier = getattr(spider, 'frontier', self.hs_project_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)
        self.hs_slot = getattr(spider, 'slot', self.hs_project_slot)
        self._msg('Using HS_SLOT=%s' % self.hs_slot)
        has_new_requests = False
        for req in self._get_new_requests():
            has_new_requests = True
            yield req
        # if there are no links in the hcf, use the start_requests
        if not has_new_requests:
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        """Divert GET requests flagged with meta['use_hcf'] into the frontier.

        Diverted requests are NOT yielded downstream; everything else passes
        through unchanged.
        """
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                if (request.method == 'GET' and  # XXX: Only GET support for now.
                        request.meta.get('use_hcf', False)):
                    slot = slot_callback(request)
                    hcf_params = request.meta.get('hcf_params')
                    fp = {'fp': request.url}
                    if hcf_params:
                        fp.update(hcf_params)
                    # Queue the fingerprint locally; flushed to the HCF later.
                    self.new_links[slot].append(fp)
                else:
                    yield item
            else:
                yield item

    def idle_spider(self, spider):
        """On idle: flush pending links to the HCF, then refill the spider."""
        self._save_new_links()
        self.fclient.flush()
        self._delete_processed_ids()
        has_new_requests = False
        for request in self._get_new_requests():
            self.crawler.engine.schedule(request, spider)
            has_new_requests = True
        # Keep the spider alive as long as the frontier keeps supplying work.
        if has_new_requests:
            raise DontCloseSpider

    def close_spider(self, spider, reason):
        # Only store the results if the spider finished normally, if it
        # didn't finish properly there is no way to know whether all the url
        # batches were processed and it is better not to delete them from the
        # frontier (so they will be picked by another process).
        if reason == 'finished':
            self._save_new_links()
            self._delete_processed_ids()
        # If the reason is defined in the hs_start_job_on_reason list then start
        # a new job right after this spider is finished. The idea is to limit
        # every spider runtime (either via itemcount, pagecount or timeout) and
        # then have the old spider start a new one to take its place in the slot.
        if reason in self.hs_start_job_on_reason:
            # FIX: was `"Starting new job" + spider.name`, which rendered as
            # e.g. "Starting new jobmyspider" (missing separator).
            self._msg("Starting new job for: %s" % spider.name)
            job = self.hsclient.start_job(projectid=self.hs_projectid, spider=spider.name)
            self._msg("New job started: %s" % job)
        self.fclient.close()
        self.hsclient.close()

    def _get_new_requests(self):
        """ Get a new batch of links from the HCF."""
        num_batches = 0
        num_links = 0
        for num_batches, batch in enumerate(self.fclient.read(self.hs_frontier, self.hs_slot), 1):
            for r in batch['requests']:
                num_links += 1
                # r[0] is the stored fingerprint (here: the request url).
                yield Request(r[0])
            self.batch_ids.append(batch['id'])
            if num_batches >= self.hs_max_batches:
                break
        self._msg('Read %d new batches from slot(%s)' % (num_batches, self.hs_slot))
        self._msg('Read %d new links from slot(%s)' % (num_links, self.hs_slot))

    def _save_new_links(self):
        """ Save the new extracted links into the HCF."""
        for slot, fps in self.new_links.items():
            self.fclient.add(self.hs_frontier, slot, fps)
            self._msg('Stored %d new links in slot(%s)' % (len(fps), slot))
        self.new_links = defaultdict(list)

    def _delete_processed_ids(self):
        """ Delete in the HCF the ids of the processed batches."""
        self.fclient.delete(self.hs_frontier, self.hs_slot, self.batch_ids)
        self._msg('Deleted %d processed batches in slot(%s)' % (len(self.batch_ids), self.hs_slot))
        self.batch_ids = []

    def _get_slot(self, request):
        """ Determine to which slot should be saved the request."""
        return '0'
class HcfMiddleware(object):
    """Scrapy spider middleware integrating the HubStorage Crawl Frontier (HCF).

    GET requests flagged with ``meta['use_hcf']`` are diverted into the
    frontier instead of being scheduled locally; when the spider goes idle,
    new batches of links are read back from the frontier to keep it running.
    Falls back to ``start_requests`` when the frontier is empty.
    """

    def __init__(self, crawler):
        self.crawler = crawler
        hs_endpoint = self._get_config(crawler, "HS_ENDPOINT")
        hs_auth = self._get_config(crawler, "HS_AUTH")
        self.hs_projectid = self._get_config(crawler, "HS_PROJECTID")
        # Project-wide defaults; a spider may override them via its
        # `frontier`/`slot` attributes (see process_start_requests).
        self.hs_project_frontier = self._get_config(crawler, "HS_FRONTIER")
        self.hs_project_slot = self._get_config(crawler, "HS_SLOT")
        # Max number of batches to read from the HCF within a single run.
        try:
            self.hs_max_batches = int(crawler.settings.get("HS_MAX_BATCHES", DEFAULT_MAX_BATCHES))
        except ValueError:
            # Non-numeric setting value: silently fall back to the default.
            self.hs_max_batches = DEFAULT_MAX_BATCHES
        self.hs_start_job_on_reason = crawler.settings.get("HS_START_JOB_ON_REASON", [])
        self.hsclient = HubstorageClient(auth=hs_auth, endpoint=hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier
        # New links extracted during this run, grouped by destination slot.
        self.new_links = defaultdict(list)
        # Ids of the frontier batches consumed in this run; deleted once the
        # run finishes cleanly so they are not re-processed.
        self.batch_ids = []
        crawler.signals.connect(self.idle_spider, signals.spider_idle)
        crawler.signals.connect(self.close_spider, signals.spider_closed)

    def _get_config(self, crawler, key):
        """Return the required setting `key` or raise NotConfigured."""
        value = crawler.settings.get(key)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg):
        """Log a message tagged with the (HCF) prefix."""
        log.msg('(HCF) %s' % msg)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):
        """Seed the spider from the frontier, or from start_requests if empty."""
        # Per-spider overrides of the project-wide frontier/slot.
        self.hs_frontier = getattr(spider, 'frontier', self.hs_project_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)
        self.hs_slot = getattr(spider, 'slot', self.hs_project_slot)
        self._msg('Using HS_SLOT=%s' % self.hs_slot)
        has_new_requests = False
        for req in self._get_new_requests():
            has_new_requests = True
            yield req
        # if there are no links in the hcf, use the start_requests
        if not has_new_requests:
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        """Divert GET requests flagged with meta['use_hcf'] into the frontier.

        Diverted requests are NOT yielded downstream; everything else passes
        through unchanged.
        """
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                if (request.method == 'GET' and  # XXX: Only GET support for now.
                        request.meta.get('use_hcf', False)):
                    slot = slot_callback(request)
                    hcf_params = request.meta.get('hcf_params')
                    fp = {'fp': request.url}
                    if hcf_params:
                        fp.update(hcf_params)
                    # Queue the fingerprint locally; flushed to the HCF later.
                    self.new_links[slot].append(fp)
                else:
                    yield item
            else:
                yield item

    def idle_spider(self, spider):
        """On idle: flush pending links to the HCF, then refill the spider."""
        self._save_new_links()
        self.fclient.flush()
        self._delete_processed_ids()
        has_new_requests = False
        for request in self._get_new_requests():
            self.crawler.engine.schedule(request, spider)
            has_new_requests = True
        # Keep the spider alive as long as the frontier keeps supplying work.
        if has_new_requests:
            raise DontCloseSpider

    def close_spider(self, spider, reason):
        # Only store the results if the spider finished normally, if it
        # didn't finish properly there is no way to know whether all the url
        # batches were processed and it is better not to delete them from the
        # frontier (so they will be picked by another process).
        if reason == 'finished':
            self._save_new_links()
            self._delete_processed_ids()
        # If the reason is defined in the hs_start_job_on_reason list then start
        # a new job right after this spider is finished. The idea is to limit
        # every spider runtime (either via itemcount, pagecount or timeout) and
        # then have the old spider start a new one to take its place in the slot.
        if reason in self.hs_start_job_on_reason:
            # FIX: was `"Starting new job" + spider.name`, which rendered as
            # e.g. "Starting new jobmyspider" (missing separator).
            self._msg("Starting new job for: %s" % spider.name)
            job = self.hsclient.start_job(projectid=self.hs_projectid, spider=spider.name)
            self._msg("New job started: %s" % job)
        self.fclient.close()
        self.hsclient.close()

    def _get_new_requests(self):
        """ Get a new batch of links from the HCF."""
        num_batches = 0
        num_links = 0
        for num_batches, batch in enumerate(self.fclient.read(self.hs_frontier, self.hs_slot), 1):
            for r in batch['requests']:
                num_links += 1
                # r[0] is the stored fingerprint (here: the request url).
                yield Request(r[0])
            self.batch_ids.append(batch['id'])
            if num_batches >= self.hs_max_batches:
                break
        self._msg('Read %d new batches from slot(%s)' % (num_batches, self.hs_slot))
        self._msg('Read %d new links from slot(%s)' % (num_links, self.hs_slot))

    def _save_new_links(self):
        """ Save the new extracted links into the HCF."""
        for slot, fps in self.new_links.items():
            self.fclient.add(self.hs_frontier, slot, fps)
            self._msg('Stored %d new links in slot(%s)' % (len(fps), slot))
        self.new_links = defaultdict(list)

    def _delete_processed_ids(self):
        """ Delete in the HCF the ids of the processed batches."""
        self.fclient.delete(self.hs_frontier, self.hs_slot, self.batch_ids)
        self._msg('Deleted %d processed batches in slot(%s)' % (len(self.batch_ids), self.hs_slot))
        self.batch_ids = []

    def _get_slot(self, request):
        """ Determine to which slot should be saved the request."""
        return '0'
class HcfMiddleware(object):
    """Scrapy spider middleware integrating the HubStorage Crawl Frontier (HCF).

    GET requests flagged with ``meta['use_hcf']`` are uploaded immediately to
    the frontier (via the hubstorage batch uploader) instead of being yielded
    downstream; new links are read back from the consume-from slot to seed the
    spider. Optionally schedules a follow-up job when the spider closes.
    """

    def __init__(self, crawler):
        self.crawler = crawler
        self.hs_endpoint = crawler.settings.get("HS_ENDPOINT")
        self.hs_auth = self._get_config(crawler, "HS_AUTH")
        self.hs_projectid = self._get_config(crawler, "HS_PROJECTID")
        self.hs_frontier = self._get_config(crawler, "HS_FRONTIER")
        self.hs_consume_from_slot = self._get_config(crawler, "HS_CONSUME_FROM_SLOT")
        # Number of slots outgoing links are sharded into (see _get_slot).
        try:
            self.hs_number_of_slots = int(crawler.settings.get("HS_NUMBER_OF_SLOTS", DEFAULT_HS_NUMBER_OF_SLOTS))
        except ValueError:
            self.hs_number_of_slots = DEFAULT_HS_NUMBER_OF_SLOTS
        # Max number of links to read from the HCF within a single run.
        try:
            self.hs_max_links = int(crawler.settings.get("HS_MAX_LINKS", DEFAULT_MAX_LINKS))
        except ValueError:
            self.hs_max_links = DEFAULT_MAX_LINKS
        self.hs_start_job_enabled = crawler.settings.get("HS_START_JOB_ENABLED", False)
        self.hs_start_job_on_reason = crawler.settings.get("HS_START_JOB_ON_REASON", ['finished'])
        self.hs_start_job_new_panel = crawler.settings.get("HS_START_JOB_NEW_PANEL", False)
        if not self.hs_start_job_new_panel:
            # Old panel: follow-up jobs are scheduled through the Connection
            # API instead of hubstorage's start_job (see _start_job).
            conn = Connection(self.hs_auth)
            self.oldpanel_project = conn[self.hs_projectid]
        self.hsclient = HubstorageClient(auth=self.hs_auth, endpoint=self.hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier
        # Per-slot counters of links uploaded during this run (for reporting).
        self.new_links_count = defaultdict(int)
        # Ids of the frontier batches consumed in this run.
        self.batch_ids = []
        crawler.signals.connect(self.close_spider, signals.spider_closed)
        # Make sure the logger for hubstorage.batchuploader is configured
        logging.basicConfig()

    def _get_config(self, crawler, key):
        """Return the required setting `key` or raise NotConfigured."""
        value = crawler.settings.get(key)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg, level=None):
        """Log `msg` with the (HCF) prefix; default level is log.INFO.

        The default is resolved at call time rather than at class-definition
        time, so defining this class does not require `log` to be importable.
        """
        log.msg('(HCF) %s' % msg, log.INFO if level is None else level)

    def _start_job(self, spider):
        """Schedule a follow-up job for `spider` to take over this slot."""
        self._msg("Starting new job for: %s" % spider.name)
        if self.hs_start_job_new_panel:
            jobid = self.hsclient.start_job(projectid=self.hs_projectid, spider=spider.name)
        else:
            # NOTE(review): `dummy` presumably makes each scheduled job unique
            # (and marks it as a non-first job, see process_start_requests) —
            # confirm against the panel API.
            jobid = self.oldpanel_project.schedule(spider.name, slot=self.hs_consume_from_slot, dummy=datetime.now())
        self._msg("New job started: %s" % jobid)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):
        """Seed the spider from the frontier; use start_requests only when the
        frontier is empty AND this is the first job (follow-up jobs carry a
        `dummy` attribute)."""
        # Per-spider overrides of the configured frontier/slot.
        self.hs_frontier = getattr(spider, 'hs_frontier', self.hs_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)
        self.hs_consume_from_slot = getattr(spider, 'hs_consume_from_slot', self.hs_consume_from_slot)
        self._msg('Using HS_CONSUME_FROM_SLOT=%s' % self.hs_consume_from_slot)
        self.has_new_requests = False
        for req in self._get_new_requests():
            self.has_new_requests = True
            yield req
        # if there are no links in the hcf, use the start_requests
        # unless this is not the first job.
        if not self.has_new_requests and not getattr(spider, 'dummy', None):
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        """Upload GET requests flagged with meta['use_hcf'] to the frontier.

        Uploaded requests are NOT yielded downstream. Non-GET requests with
        the flag are logged as errors and passed through unchanged.
        """
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                if request.meta.get('use_hcf', False):
                    if request.method == 'GET':  # XXX: Only GET support for now.
                        slot = slot_callback(request)
                        hcf_params = request.meta.get('hcf_params')
                        fp = {'fp': request.url}
                        if hcf_params:
                            fp.update(hcf_params)
                        # Save the new links as soon as possible using
                        # the batch uploader
                        self.fclient.add(self.hs_frontier, slot, [fp])
                        self.new_links_count[slot] += 1
                    else:
                        self._msg("'use_hcf' meta key is not supported for non GET requests (%s)" % request.url, log.ERROR)
                        yield request
                else:
                    yield request
            else:
                yield item

    def close_spider(self, spider, reason):
        # Only store the results if the spider finished normally, if it
        # didn't finish properly there is no way to know whether all the url
        # batches were processed and it is better not to delete them from the
        # frontier (so they will be picked by another process).
        if reason == 'finished':
            self._save_new_links_count()
            self._delete_processed_ids()
        # Close the frontier client in order to make sure that all the new links
        # are stored.
        self.fclient.close()
        self.hsclient.close()
        # If the reason is defined in the hs_start_job_on_reason list then start
        # a new job right after this spider is finished.
        if self.hs_start_job_enabled and reason in self.hs_start_job_on_reason:
            # Start the new job if this job had requests from the HCF or it
            # was the first job.
            if self.has_new_requests or not getattr(spider, 'dummy', None):
                self._start_job(spider)

    def _get_new_requests(self):
        """ Get a new batch of links from the HCF."""
        num_batches = 0
        num_links = 0
        for num_batches, batch in enumerate(self.fclient.read(self.hs_frontier, self.hs_consume_from_slot), 1):
            for fingerprint, data in batch['requests']:
                num_links += 1
                # The stored fingerprint is the url; qdata carries the extra
                # payload saved alongside it.
                yield Request(url=fingerprint, meta={'hcf_params': {'qdata': data}})
            self.batch_ids.append(batch['id'])
            if num_links >= self.hs_max_links:
                break
        self._msg('Read %d new batches from slot(%s)' % (num_batches, self.hs_consume_from_slot))
        self._msg('Read %d new links from slot(%s)' % (num_links, self.hs_consume_from_slot))

    def _save_new_links_count(self):
        """ Report how many links were stored per slot during this run.

        (The links themselves are uploaded incrementally by the batch
        uploader in process_spider_output.)
        """
        for slot, link_count in self.new_links_count.items():
            self._msg('Stored %d new links in slot(%s)' % (link_count, slot))
        # FIX: reset with an int factory; the original used defaultdict(list),
        # which would break subsequent `+= 1` updates with a TypeError.
        self.new_links_count = defaultdict(int)

    def _delete_processed_ids(self):
        """ Delete in the HCF the ids of the processed batches."""
        self.fclient.delete(self.hs_frontier, self.hs_consume_from_slot, self.batch_ids)
        self._msg('Deleted %d processed batches in slot(%s)' % (len(self.batch_ids), self.hs_consume_from_slot))
        self.batch_ids = []

    def _get_slot(self, request):
        """ Determine to which slot should be saved the request."""
        md5 = hashlib.md5()
        # FIX: hash the utf-8 bytes of the url — md5.update() requires bytes
        # on Python 3 (identical digest for the ascii urls used on Python 2).
        md5.update(request.url.encode('utf-8'))
        digest = md5.hexdigest()
        return str(int(digest, 16) % self.hs_number_of_slots)
class SystemTest(HSTestCase):
    """End-to-end exercise of the hubstorage job lifecycle using three client
    roles: panel (project auth only), runner (global auth) and scraper (job
    level auth).

    Python 2 code: uses ``iterator.next()`` and ``xrange``.
    """

    # Number of items/samples/requests written per job; each iteration also
    # writes one log line per level (debug/info/warn/error), i.e. MAGICN * 4.
    MAGICN = 1211

    def setUp(self):
        # NOTE(review): super(HSTestCase, self) skips HSTestCase.setUp and
        # calls the grandparent's — looks like it should be
        # super(SystemTest, self); confirm before changing.
        super(HSTestCase, self).setUp()
        endpoint = self.hsclient.endpoint
        # Panel - no client auth, only project auth using user auth token
        self.panelclient = HubstorageClient(endpoint=endpoint)
        self.panelproject = self.panelclient.get_project(self.projectid, auth=self.auth)
        # Runner - client uses global auth to poll jobq
        self.runnerclient = HubstorageClient(endpoint=endpoint, auth=self.auth)
        # Scraper - uses job level auth, no global or project auth available
        self.scraperclient = HubstorageClient(endpoint=endpoint)

    def test_succeed_with_close_reason(self):
        """Job pushed from the panel finishes with an explicit close_reason."""
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason='all-good')
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'all-good')
        # One item/request per iteration; four log lines per iteration.
        self.assertEqual(job.items.stats()['totals']['input_values'], self.MAGICN)
        self.assertEqual(job.logs.stats()['totals']['input_values'], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'], self.MAGICN)

    def test_succeed_without_close_reason(self):
        """Without an explicit reason the backend records 'no_reason'."""
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason=None)
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'no_reason')
        self.assertEqual(job.items.stats()['totals']['input_values'], self.MAGICN)
        self.assertEqual(job.logs.stats()['totals']['input_values'], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'], self.MAGICN)

    def test_scraper_failure(self):
        """A scraper exception marks the job 'failed' and logs one extra line."""
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason=IOError('no more resources, ha!'))
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'failed')
        # MAGICN per log level messages plus one of last failure
        stats = job.logs.stats()
        self.assertTrue(stats)
        self.assertEqual(stats['totals']['input_values'], self.MAGICN * 4 + 1)

    def _run_runner(self, pushed, close_reason):
        """Play the runner role: start the pushed job, drive the scraper, and
        record the final job state (finished or failed)."""
        job = self.runnerclient.start_job(self.projectid)
        self.assertFalse(job.metadata.get('stop_requested'))
        job.metadata.update(host='localhost', slot=1)
        self.assertEqual(job.metadata.get('state'), 'running')
        # run scraper
        try:
            self._run_scraper(job.key, job.jobauth, close_reason=close_reason)
        except Exception as exc:
            job.failed(message=str(exc))
            # logging from runner must append and never remove messages logged
            # by scraper
            self.assertTrue(job.logs.batch_append)
        else:
            job.finished()
        self.runnerclient.close()

    def _run_scraper(self, jobkey, jobauth, close_reason=None):
        """Play the scraper role: write MAGICN items/logs/samples/requests,
        then close with `close_reason` (re-raising it if it is an Exception)."""
        httpmethods = 'GET PUT POST DELETE HEAD OPTIONS TRACE CONNECT'.split()
        # job level auth, no global or project auth available
        job = self.scraperclient.get_job(jobkey, auth=jobauth)
        for idx in xrange(self.MAGICN):
            iid = job.items.write({'uuid': idx})
            job.logs.debug('log debug %s' % idx, idx=idx)
            job.logs.info('log info %s' % idx, idx=idx)
            job.logs.warn('log warn %s' % idx, idx=idx)
            job.logs.error('log error %s' % idx, idx=idx)
            sid = job.samples.write([idx, idx, idx])
            rid = job.requests.add(
                url='http://test.com/%d' % idx,
                status=random.randint(100, 1000),
                method=random.choice(httpmethods),
                rs=random.randint(0, 100000),
                duration=random.randint(0, 1000),
                # the first few requests have no parent
                parent=random.randrange(0, idx + 1) if idx > 10 else None,
                ts=millitime() + random.randint(100, 100000),
            )
            # ids are assigned sequentially within each collection
            self.assertEqual(iid, idx)
            self.assertEqual(sid, idx)
            self.assertEqual(rid, idx)
        if isinstance(close_reason, Exception):
            self.scraperclient.close()
            raise close_reason
        if close_reason:
            job.metadata['close_reason'] = close_reason
            job.metadata.save()
        self.scraperclient.close()
        # NOTE(review): presumably drops the job-auth client so it cannot be
        # reused/double-closed in teardown — confirm.
        del self.scraperclient
class HcfMiddleware(object):
    """Scrapy spider middleware integrating the HubStorage Crawl Frontier (HCF).

    GET requests flagged with ``meta['use_hcf']`` are uploaded immediately to
    the frontier (via the hubstorage batch uploader) instead of being yielded
    downstream, with per-run de-duplication of urls; new links are read back
    from the consume-from slot to seed the spider. Optionally schedules a
    follow-up job when the spider closes.
    """

    def __init__(self, crawler):
        settings = crawler.settings
        self.hs_endpoint = settings.get("HS_ENDPOINT")
        self.hs_auth = self._get_config(settings, "HS_AUTH")
        self.hs_projectid = self._get_config(settings, "HS_PROJECTID")
        self.hs_frontier = self._get_config(settings, "HS_FRONTIER")
        self.hs_consume_from_slot = self._get_config(settings, "HS_CONSUME_FROM_SLOT")
        # Number of slots outgoing links are sharded into (see _get_slot).
        self.hs_number_of_slots = settings.getint("HS_NUMBER_OF_SLOTS", DEFAULT_HS_NUMBER_OF_SLOTS)
        # Max number of links to read from the HCF within a single run.
        self.hs_max_links = settings.getint("HS_MAX_LINKS", DEFAULT_MAX_LINKS)
        self.hs_start_job_enabled = settings.getbool("HS_START_JOB_ENABLED", False)
        self.hs_start_job_on_reason = settings.getlist("HS_START_JOB_ON_REASON", ['finished'])
        self.hs_start_job_new_panel = settings.getbool("HS_START_JOB_NEW_PANEL", False)
        if not self.hs_start_job_new_panel:
            # Old panel: follow-up jobs are scheduled through the Connection
            # API instead of hubstorage's start_job (see _start_job).
            conn = Connection(self.hs_auth)
            self.oldpanel_project = conn[self.hs_projectid]
        self.hsclient = HubstorageClient(auth=self.hs_auth, endpoint=self.hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier
        # Urls already uploaded in this run, per slot (used to de-duplicate).
        self.new_links = defaultdict(set)
        # Ids of the frontier batches consumed in this run.
        self.batch_ids = []
        crawler.signals.connect(self.close_spider, signals.spider_closed)
        # Make sure the logger for hubstorage.batchuploader is configured
        logging.basicConfig()

    def _get_config(self, settings, key):
        """Return the required setting `key` or raise NotConfigured."""
        value = settings.get(key)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg, level=None):
        """Log `msg` with the (HCF) prefix; default level is log.INFO.

        The default is resolved at call time rather than at class-definition
        time, so defining this class does not require `log` to be importable.
        """
        log.msg('(HCF) %s' % msg, log.INFO if level is None else level)

    def _start_job(self, spider):
        """Schedule a follow-up job for `spider` to take over this slot."""
        self._msg("Starting new job for: %s" % spider.name)
        if self.hs_start_job_new_panel:
            jobid = self.hsclient.start_job(projectid=self.hs_projectid, spider=spider.name)
        else:
            # NOTE(review): `dummy` presumably makes each scheduled job unique
            # (and marks it as a non-first job, see process_start_requests) —
            # confirm against the panel API.
            jobid = self.oldpanel_project.schedule(spider.name, slot=self.hs_consume_from_slot, dummy=datetime.now())
        self._msg("New job started: %s" % jobid)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):
        """Seed the spider from the frontier; use start_requests only when the
        frontier is empty AND this is the first job (follow-up jobs carry a
        `dummy` attribute)."""
        # Per-spider overrides of the configured frontier/slot.
        self.hs_frontier = getattr(spider, 'hs_frontier', self.hs_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)
        self.hs_consume_from_slot = getattr(spider, 'hs_consume_from_slot', self.hs_consume_from_slot)
        self._msg('Using HS_CONSUME_FROM_SLOT=%s' % self.hs_consume_from_slot)
        self.has_new_requests = False
        for req in self._get_new_requests():
            self.has_new_requests = True
            yield req
        # if there are no links in the hcf, use the start_requests
        # unless this is not the first job.
        if not self.has_new_requests and not getattr(spider, 'dummy', None):
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        """Upload GET requests flagged with meta['use_hcf'] to the frontier.

        Uploaded requests are NOT yielded downstream; each url is uploaded at
        most once per slot per run. Non-GET requests with the flag are logged
        as errors and passed through unchanged.
        """
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                if request.meta.get('use_hcf', False):
                    if request.method == 'GET':  # XXX: Only GET support for now.
                        slot = slot_callback(request)
                        # FIX: the original tested `request.url in self.new_links`,
                        # i.e. membership among the slot KEYS, so duplicate urls
                        # were re-uploaded every time; check the per-slot set.
                        if request.url not in self.new_links[slot]:
                            hcf_params = request.meta.get('hcf_params')
                            fp = {'fp': request.url}
                            if hcf_params:
                                fp.update(hcf_params)
                            # Save the new links as soon as possible using
                            # the batch uploader
                            self.fclient.add(self.hs_frontier, slot, [fp])
                            self.new_links[slot].add(request.url)
                    else:
                        self._msg("'use_hcf' meta key is not supported for non GET requests (%s)" % request.url, log.ERROR)
                        yield request
                else:
                    yield request
            else:
                yield item

    def close_spider(self, spider, reason):
        # Only store the results if the spider finished normally, if it
        # didn't finish properly there is no way to know whether all the url
        # batches were processed and it is better not to delete them from the
        # frontier (so they will be picked by another process).
        if reason == 'finished':
            self._save_new_links_count()
            self._delete_processed_ids()
        # Close the frontier client in order to make sure that all the new links
        # are stored.
        self.fclient.close()
        self.hsclient.close()
        # If the reason is defined in the hs_start_job_on_reason list then start
        # a new job right after this spider is finished.
        if self.hs_start_job_enabled and reason in self.hs_start_job_on_reason:
            # Start the new job if this job had requests from the HCF or it
            # was the first job.
            if self.has_new_requests or not getattr(spider, 'dummy', None):
                self._start_job(spider)

    def _get_new_requests(self):
        """ Get a new batch of links from the HCF."""
        num_batches = 0
        num_links = 0
        for num_batches, batch in enumerate(self.fclient.read(self.hs_frontier, self.hs_consume_from_slot), 1):
            for fingerprint, data in batch['requests']:
                num_links += 1
                # The stored fingerprint is the url; qdata carries the extra
                # payload saved alongside it.
                yield Request(url=fingerprint, meta={'hcf_params': {'qdata': data}})
            self.batch_ids.append(batch['id'])
            if num_links >= self.hs_max_links:
                break
        self._msg('Read %d new batches from slot(%s)' % (num_batches, self.hs_consume_from_slot))
        self._msg('Read %d new links from slot(%s)' % (num_links, self.hs_consume_from_slot))

    def _save_new_links_count(self):
        """ Report how many distinct links were stored per slot during this run.

        (The links themselves are uploaded incrementally by the batch
        uploader in process_spider_output.)
        """
        for slot, new_links in self.new_links.items():
            self._msg('Stored %d new links in slot(%s)' % (len(new_links), slot))
        self.new_links = defaultdict(set)

    def _delete_processed_ids(self):
        """ Delete in the HCF the ids of the processed batches."""
        self.fclient.delete(self.hs_frontier, self.hs_consume_from_slot, self.batch_ids)
        self._msg('Deleted %d processed batches in slot(%s)' % (len(self.batch_ids), self.hs_consume_from_slot))
        self.batch_ids = []

    def _get_slot(self, request):
        """ Determine to which slot should be saved the request."""
        md5 = hashlib.md5()
        # FIX: hash the utf-8 bytes of the url — md5.update() requires bytes
        # on Python 3 (identical digest for the ascii urls used on Python 2).
        md5.update(request.url.encode('utf-8'))
        digest = md5.hexdigest()
        return str(int(digest, 16) % self.hs_number_of_slots)
class SystemTest(HSTestCase):
    """End-to-end exercise of the hubstorage job lifecycle using three client
    roles: panel (project auth only), runner (global auth) and scraper (job
    level auth).

    Python 2 code: uses ``iterator.next()`` and ``xrange``.
    """

    # Number of items/samples/requests written per job; each iteration also
    # writes one log line per level (debug/info/warn/error), i.e. MAGICN * 4.
    MAGICN = 1211

    def setUp(self):
        # NOTE(review): super(HSTestCase, self) skips HSTestCase.setUp and
        # calls the grandparent's — looks like it should be
        # super(SystemTest, self); confirm before changing.
        super(HSTestCase, self).setUp()
        endpoint = self.hsclient.endpoint
        # Panel - no client auth, only project auth using user auth token
        self.panelclient = HubstorageClient(endpoint=endpoint)
        self.panelproject = self.panelclient.get_project(self.projectid, auth=self.auth)
        # Runner - client uses global auth to poll jobq
        self.runnerclient = HubstorageClient(endpoint=endpoint, auth=self.auth)
        # Scraper - uses job level auth, no global or project auth available
        self.scraperclient = HubstorageClient(endpoint=endpoint)

    def test_succeed_with_close_reason(self):
        """Job pushed from the panel finishes with an explicit close_reason."""
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason='all-good')
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'all-good')
        # One item/request per iteration; four log lines per iteration.
        self.assertEqual(job.items.stats()['totals']['input_values'], self.MAGICN)
        self.assertEqual(job.logs.stats()['totals']['input_values'], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'], self.MAGICN)

    def test_succeed_without_close_reason(self):
        """Without an explicit reason the backend records 'no_reason'."""
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason=None)
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'no_reason')
        self.assertEqual(job.items.stats()['totals']['input_values'], self.MAGICN)
        self.assertEqual(job.logs.stats()['totals']['input_values'], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'], self.MAGICN)

    def test_scraper_failure(self):
        """A scraper exception marks the job 'failed' and logs one extra line."""
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason=IOError('no more resources, ha!'))
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'failed')
        # MAGICN per log level messages plus one of last failure
        stats = job.logs.stats()
        self.assertTrue(stats)
        self.assertEqual(stats['totals']['input_values'], self.MAGICN * 4 + 1)

    def _run_runner(self, pushed, close_reason):
        """Play the runner role: start the pushed job, drive the scraper, and
        record the final job state (finished or failed)."""
        job = self.runnerclient.start_job(self.projectid)
        self.assertFalse(job.metadata.get('stop_requested'))
        job.metadata.update(host='localhost', slot=1)
        self.assertEqual(job.metadata.get('state'), 'running')
        # run scraper
        try:
            self._run_scraper(job.key, job.jobauth, close_reason=close_reason)
        except Exception as exc:
            job.failed(message=str(exc))
            # logging from runner must append and never remove messages logged
            # by scraper
            self.assertTrue(job.logs.batch_append)
        else:
            job.finished()
        self.runnerclient.close()

    def _run_scraper(self, jobkey, jobauth, close_reason=None):
        """Play the scraper role: write MAGICN items/logs/samples/requests,
        then close with `close_reason` (re-raising it if it is an Exception)."""
        httpmethods = 'GET PUT POST DELETE HEAD OPTIONS TRACE CONNECT'.split()
        # job level auth, no global or project auth available
        job = self.scraperclient.get_job(jobkey, auth=jobauth)
        for idx in xrange(self.MAGICN):
            iid = job.items.write({'uuid': idx})
            job.logs.debug('log debug %s' % idx, idx=idx)
            job.logs.info('log info %s' % idx, idx=idx)
            job.logs.warn('log warn %s' % idx, idx=idx)
            job.logs.error('log error %s' % idx, idx=idx)
            sid = job.samples.write([idx, idx, idx])
            rid = job.requests.add(
                url='http://test.com/%d' % idx,
                status=random.randint(100, 1000),
                method=random.choice(httpmethods),
                rs=random.randint(0, 100000),
                duration=random.randint(0, 1000),
                # the first few requests have no parent
                parent=random.randrange(0, idx + 1) if idx > 10 else None,
                ts=millitime() + random.randint(100, 100000),
            )
            # ids are assigned sequentially within each collection
            self.assertEqual(iid, idx)
            self.assertEqual(sid, idx)
            self.assertEqual(rid, idx)
        if isinstance(close_reason, Exception):
            self.scraperclient.close()
            raise close_reason
        if close_reason:
            job.metadata['close_reason'] = close_reason
            job.metadata.save()
        self.scraperclient.close()
        # NOTE(review): presumably drops the job-auth client so it cannot be
        # reused/double-closed in teardown — confirm.
        del self.scraperclient