class HcfMiddleware(object):
    """Scrapy spider middleware integrating the Hubstorage Crawl Frontier (HCF).

    On start-up it reads batches of URLs from an HCF slot and feeds them to
    the spider; during the crawl it pushes newly discovered links (requests
    flagged with ``meta['use_hcf']``) back into the frontier.  When the
    spider closes with reason ``finished`` the consumed batches are deleted,
    and a follow-up job can optionally be scheduled.
    """

    def __init__(self, crawler):
        settings = crawler.settings
        self.hs_endpoint = settings.get("HS_ENDPOINT")
        self.hs_auth = self._get_config(settings, "HS_AUTH")
        self.hs_projectid = self._get_config(settings, "HS_PROJECTID",
                                             os.environ.get('SCRAPY_PROJECT_ID'))
        self.hs_frontier = self._get_config(settings, "HS_FRONTIER")
        self.hs_consume_from_slot = self._get_config(settings, "HS_CONSUME_FROM_SLOT")
        self.hs_number_of_slots = settings.getint("HS_NUMBER_OF_SLOTS",
                                                  DEFAULT_HS_NUMBER_OF_SLOTS)
        self.hs_max_links = settings.getint("HS_MAX_LINKS", DEFAULT_MAX_LINKS)
        self.hs_start_job_enabled = settings.getbool("HS_START_JOB_ENABLED", False)
        self.hs_start_job_on_reason = settings.getlist("HS_START_JOB_ON_REASON",
                                                       ['finished'])

        conn = Connection(self.hs_auth)
        self.panel_project = conn[self.hs_projectid]

        self.hsclient = HubstorageClient(auth=self.hs_auth, endpoint=self.hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        self.new_links = defaultdict(set)  # slot -> urls already queued this run
        self.batch_ids = []                # HCF batch ids consumed in this run
        # BUGFIX: close_spider() reads this flag, but it used to be created
        # only inside the process_start_requests() generator; a spider closed
        # before its start requests were consumed raised AttributeError.
        self.has_new_requests = False

        crawler.signals.connect(self.close_spider, signals.spider_closed)

        # Make sure the logger for hubstorage.batchuploader is configured
        logging.basicConfig()

    def _get_config(self, settings, key, default=None):
        """Return a required setting (falling back to *default*), raising
        NotConfigured when the resulting value is empty."""
        value = settings.get(key, default)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg, level=log.INFO):
        # All middleware log lines share the "(HCF)" prefix.
        log.msg('(HCF) %s' % msg, level)

    def start_job(self, spider):
        """Schedule a follow-up job for *spider* through the panel API and
        return its job id.  ``dummy`` marks the new job as a non-first run
        (see process_start_requests)."""
        self._msg("Starting new job for: %s" % spider.name)
        jobid = self.panel_project.schedule(
            spider.name,
            hs_consume_from_slot=self.hs_consume_from_slot,
            dummy=datetime.now()
        )
        self._msg("New job started: %s" % jobid)
        return jobid

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):
        # The spider may override the frontier/slot with instance attributes.
        self.hs_frontier = getattr(spider, 'hs_frontier', self.hs_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)

        self.hs_consume_from_slot = getattr(spider, 'hs_consume_from_slot',
                                            self.hs_consume_from_slot)
        self._msg('Using HS_CONSUME_FROM_SLOT=%s' % self.hs_consume_from_slot)

        self.has_new_requests = False
        for req in self._get_new_requests():
            self.has_new_requests = True
            yield req

        # if there are no links in the hcf, use the start_requests
        # unless this is not the first job.
        if not self.has_new_requests and not getattr(spider, 'dummy', None):
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        # A spider can provide its own slot-assignment function.
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                if request.meta.get('use_hcf', False):
                    if request.method == 'GET':  # XXX: Only GET support for now.
                        slot = slot_callback(request)
                        if request.url not in self.new_links[slot]:
                            hcf_params = request.meta.get('hcf_params')
                            fp = {'fp': request.url}
                            if hcf_params:
                                fp.update(hcf_params)
                            # Save the new links as soon as possible using
                            # the batch uploader
                            self.fclient.add(self.hs_frontier, slot, [fp])
                            self.new_links[slot].add(request.url)
                    else:
                        self._msg("'use_hcf' meta key is not supported for non GET requests (%s)" % request.url,
                                  log.ERROR)
                        yield request
                else:
                    yield request
            else:
                yield item

    def close_spider(self, spider, reason):
        # Only store the results if the spider finished normally: if it
        # didn't finish properly there is no way to know whether all the url
        # batches were processed, and it is better not to delete them from
        # the frontier (so they will be picked up by another process).
        if reason == 'finished':
            self._save_new_links_count()
            self._delete_processed_ids()

        # Close the frontier client in order to make sure that all the new
        # links are stored.
        self.fclient.close()
        self.hsclient.close()

        # If the reason is defined in the hs_start_job_on_reason list then
        # start a new job right after this spider is finished.
        if self.hs_start_job_enabled and reason in self.hs_start_job_on_reason:
            # Start the new job if this job had requests from the HCF or it
            # was the first job.
            if self.has_new_requests or not getattr(spider, 'dummy', None):
                self.start_job(spider)

    def _get_new_requests(self):
        """ Get a new batch of links from the HCF."""
        num_batches = 0
        num_links = 0
        for num_batches, batch in enumerate(self.fclient.read(self.hs_frontier, self.hs_consume_from_slot), 1):
            for fingerprint, data in batch['requests']:
                num_links += 1
                yield Request(url=fingerprint, meta={'hcf_params': {'qdata': data}})
            self.batch_ids.append(batch['id'])
            if num_links >= self.hs_max_links:
                break
        self._msg('Read %d new batches from slot(%s)' % (num_batches, self.hs_consume_from_slot))
        self._msg('Read %d new links from slot(%s)' % (num_links, self.hs_consume_from_slot))

    def _save_new_links_count(self):
        """Report the number of links stored per slot and reset the cache.

        The links themselves were already uploaded incrementally in
        process_spider_output(); this only logs the totals.
        """
        for slot, new_links in self.new_links.items():
            self._msg('Stored %d new links in slot(%s)' % (len(new_links), slot))
        self.new_links = defaultdict(set)

    def _delete_processed_ids(self):
        """ Delete in the HCF the ids of the processed batches."""
        self.fclient.delete(self.hs_frontier, self.hs_consume_from_slot, self.batch_ids)
        self._msg('Deleted %d processed batches in slot(%s)' % (len(self.batch_ids), self.hs_consume_from_slot))
        self.batch_ids = []

    def _get_slot(self, request):
        """Determine to which slot the request should be saved: md5 of the
        url modulo the configured number of slots."""
        md5 = hashlib.md5()
        md5.update(request.url)
        digest = md5.hexdigest()
        return str(int(digest, 16) % self.hs_number_of_slots)
class SystemTest(HSTestCase):
    """End-to-end jobq test: push a job from the panel, drive it through the
    runner and scraper roles, and verify states, close reasons and stats."""

    # Number of items/logs/requests/samples each simulated scraper run writes.
    MAGICN = 1211

    def setUp(self):
        # NOTE(review): super(HSTestCase, self) skips HSTestCase.setUp and
        # calls its parent's setUp instead -- confirm this is intentional.
        super(HSTestCase, self).setUp()
        self.endpoint = self.hsclient.endpoint
        # Panel - no client auth, only project auth using user auth token
        self.panelclient = HubstorageClient(endpoint=self.endpoint)
        self.panelproject = self.panelclient.get_project(self.projectid, auth=self.auth)

    def tearDown(self):
        super(HSTestCase, self).tearDown()
        self.panelclient.close()

    def test_succeed_with_close_reason(self):
        self._do_test_success("all-good", "all-good")

    def test_succeed_without_close_reason(self):
        # A missing close reason must be normalized to "no_reason".
        self._do_test_success(None, "no_reason")

    def test_scraper_failure(self):
        job = self._do_test_job(IOError("no more resources, ha!"), "failed")
        # MAGICN per log level messages plus one of last failure
        stats = job.logs.stats()
        self.assertTrue(stats)
        self.assertEqual(stats["totals"]["input_values"], self.MAGICN * 4 + 1)

    def _do_test_success(self, job_close_reason, expected_close_reason):
        """Run a job to completion and verify item/log/request totals."""
        job = self._do_test_job(job_close_reason, expected_close_reason)
        self.assertEqual(job.items.stats()['totals']['input_values'], self.MAGICN)
        # Four log lines (debug/info/warn/error) are written per iteration.
        self.assertEqual(job.logs.stats()['totals']['input_values'], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'], self.MAGICN)

    def _do_test_job(self, job_close_reason, expected_close_reason):
        """Push a job, run it with *job_close_reason*, and assert it ends in
        state "finished" with *expected_close_reason*; return the job."""
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_job(pushed["key"])
        self.assertEqual(job.metadata.get("state"), "pending")
        # consume msg from runner
        self._run_runner(pushed, close_reason=job_close_reason)
        # query again from panel
        job = p.get_job(pushed["key"])
        self.assertEqual(job.metadata.get("state"), "finished")
        self.assertEqual(job.metadata.get("close_reason"), expected_close_reason)
        return job

    def _run_runner(self, pushed, close_reason):
        """Play the runner role: start the job, invoke the scraper, and
        finish the job with the appropriate close reason."""
        client = HubstorageClient(endpoint=self.endpoint, auth=self.auth)
        with closing(client) as runnerclient:
            # NOTE(review): the job comes from self.start_job(), so
            # runnerclient itself appears unused here -- confirm.
            job = self.start_job()
            self.assertFalse(job.metadata.get("stop_requested"))
            job.metadata.update(host="localhost", slot=1)
            self.assertEqual(job.metadata.get("state"), "running")
            # run scraper
            try:
                self._run_scraper(job.key, job.jobauth, close_reason=close_reason)
            except Exception as exc:
                job.logs.error(message=str(exc), appendmode=True)
                job.close_writers()
                job.jobq.finish(job, close_reason="failed")
                # logging from runner must append and never remove messages logged
                # by scraper
                self.assertTrue(job.logs.batch_append)
            else:
                job.jobq.finish(job, close_reason=close_reason or "no_reason")

    def _run_scraper(self, jobkey, jobauth, close_reason=None):
        """Play the scraper role: write MAGICN items/logs/samples/requests,
        then raise *close_reason* if it is an exception, or record it in the
        job metadata otherwise."""
        httpmethods = "GET PUT POST DELETE HEAD OPTIONS TRACE CONNECT".split()
        # Scraper - uses job level auth, no global or project auth available
        client = HubstorageClient(endpoint=self.endpoint)
        with closing(client) as scraperclient:
            job = scraperclient.get_job(jobkey, auth=jobauth)
            for idx in xrange(self.MAGICN):
                iid = job.items.write({"uuid": idx})
                job.logs.debug("log debug %s" % idx, idx=idx)
                job.logs.info("log info %s" % idx, idx=idx)
                job.logs.warn("log warn %s" % idx, idx=idx)
                job.logs.error("log error %s" % idx, idx=idx)
                sid = job.samples.write([idx, idx, idx])
                rid = job.requests.add(
                    url="http://test.com/%d" % idx,
                    status=random.randint(100, 1000),
                    method=random.choice(httpmethods),
                    rs=random.randint(0, 100000),
                    duration=random.randint(0, 1000),
                    parent=random.randrange(0, idx + 1) if idx > 10 else None,
                    ts=millitime() + random.randint(100, 100000),
                )
                # writers hand out sequential record ids
                self.assertEqual(iid, idx)
                self.assertEqual(sid, idx)
                self.assertEqual(rid, idx)
            if isinstance(close_reason, Exception):
                raise close_reason
            if close_reason:
                job.metadata["close_reason"] = close_reason
                job.metadata.save()
class HcfMiddleware(object):
    """Scrapy spider middleware integrating the Hubstorage Crawl Frontier (HCF).

    Reads batches of links from an HCF slot to seed the spider, collects the
    new links the spider produces (requests flagged with ``meta['use_hcf']``)
    and writes them back to the frontier.  While the spider is idle it keeps
    pulling fresh batches instead of letting the spider close.
    """

    def __init__(self, crawler):
        self.crawler = crawler
        hs_endpoint = self._get_config(crawler, "HS_ENDPOINT")
        hs_auth = self._get_config(crawler, "HS_AUTH")
        self.hs_projectid = self._get_config(crawler, "HS_PROJECTID")
        self.hs_project_frontier = self._get_config(crawler, "HS_FRONTIER")
        self.hs_project_slot = self._get_config(crawler, "HS_SLOT")
        # Max number of batches to read from the HCF within a single run.
        # (attribute renamed from the 'hs_max_baches' typo)
        try:
            self.hs_max_batches = int(crawler.settings.get("HS_MAX_BATCHES",
                                                           DEFAULT_MAX_BATCHES))
        except ValueError:
            self.hs_max_batches = DEFAULT_MAX_BATCHES
        self.hs_start_job_on_reason = crawler.settings.get("HS_START_JOB_ON_REASON", [])

        self.hsclient = HubstorageClient(auth=hs_auth, endpoint=hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        self.new_links = defaultdict(list)  # slot -> fingerprint dicts to upload
        self.batch_ids = []                 # ids of batches consumed from the HCF

        crawler.signals.connect(self.idle_spider, signals.spider_idle)
        crawler.signals.connect(self.close_spider, signals.spider_closed)

    def _get_config(self, crawler, key):
        """Return a required setting or raise NotConfigured if missing/empty."""
        value = crawler.settings.get(key)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg):
        # All middleware log lines share the "(HCF)" prefix.
        log.msg('(HCF) %s' % msg)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):
        # The spider may override the project-wide frontier/slot via its
        # 'frontier' and 'slot' attributes.
        self.hs_frontier = getattr(spider, 'frontier', self.hs_project_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)

        self.hs_slot = getattr(spider, 'slot', self.hs_project_slot)
        self._msg('Using HS_SLOT=%s' % self.hs_slot)

        has_new_requests = False
        for req in self._get_new_requests():
            has_new_requests = True
            yield req

        # if there are no links in the hcf, use the start_requests
        if not has_new_requests:
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                if (request.method == 'GET' and  # XXX: Only GET support for now.
                        request.meta.get('use_hcf', False)):
                    # Route the link to the frontier instead of yielding it
                    # for download in this run.
                    slot = slot_callback(request)
                    hcf_params = request.meta.get('hcf_params')
                    fp = {'fp': request.url}
                    if hcf_params:
                        fp.update(hcf_params)
                    self.new_links[slot].append(fp)
                else:
                    yield item
            else:
                yield item

    def idle_spider(self, spider):
        """spider_idle handler: flush pending links, then keep the spider
        alive while the frontier still yields new batches."""
        self._save_new_links()
        self.fclient.flush()
        self._delete_processed_ids()
        has_new_requests = False
        for request in self._get_new_requests():
            self.crawler.engine.schedule(request, spider)
            has_new_requests = True
        if has_new_requests:
            raise DontCloseSpider

    def close_spider(self, spider, reason):
        # Only store the results if the spider finished normally: if it did
        # not finish properly there is no way to know whether all the url
        # batches were processed, and it is better not to delete them from
        # the frontier (so they will be picked up by another process).
        if reason == 'finished':
            self._save_new_links()
            self._delete_processed_ids()

        # If the reason is defined in the hs_start_job_on_reason list then start
        # a new job right after this spider is finished. The idea is to limit
        # every spider runtime (either via itemcount, pagecount or timeout) and
        # then have the old spider start a new one to take its place in the slot.
        if reason in self.hs_start_job_on_reason:
            # BUGFIX: the message used to be "Starting new job" + spider.name,
            # which ran the words together (e.g. "Starting new jobmyspider").
            self._msg("Starting new job for: %s" % spider.name)
            job = self.hsclient.start_job(projectid=self.hs_projectid,
                                          spider=spider.name)
            self._msg("New job started: %s" % job)

        self.fclient.close()
        self.hsclient.close()

    def _get_new_requests(self):
        """ Get a new batch of links from the HCF."""
        num_batches = 0
        num_links = 0
        for num_batches, batch in enumerate(self.fclient.read(self.hs_frontier, self.hs_slot), 1):
            for r in batch['requests']:
                num_links += 1
                # r is a (fingerprint, queue data) pair; the fingerprint is
                # the url to request.
                yield Request(r[0])
            self.batch_ids.append(batch['id'])
            if num_batches >= self.hs_max_batches:
                break
        self._msg('Read %d new batches from slot(%s)' % (num_batches, self.hs_slot))
        self._msg('Read %d new links from slot(%s)' % (num_links, self.hs_slot))

    def _save_new_links(self):
        """ Save the new extracted links into the HCF."""
        for slot, fps in self.new_links.items():
            self.fclient.add(self.hs_frontier, slot, fps)
            self._msg('Stored %d new links in slot(%s)' % (len(fps), slot))
        self.new_links = defaultdict(list)

    def _delete_processed_ids(self):
        """ Delete in the HCF the ids of the processed batches."""
        self.fclient.delete(self.hs_frontier, self.hs_slot, self.batch_ids)
        self._msg('Deleted %d processed batches in slot(%s)' % (len(self.batch_ids), self.hs_slot))
        self.batch_ids = []

    def _get_slot(self, request):
        """Default slot assignment: everything goes to slot '0'.  Spiders may
        override via a 'slot_callback' attribute."""
        return '0'
class SystemTest(HSTestCase):
    """End-to-end jobq test: push a job from the panel, run it through the
    runner and scraper roles, then check states, close reasons and stats."""

    # Number of items/logs/requests/samples each simulated scraper run writes.
    MAGICN = 1211

    def setUp(self):
        # NOTE(review): super(HSTestCase, self) bypasses HSTestCase.setUp and
        # invokes its parent's setUp instead -- confirm this is intentional.
        super(HSTestCase, self).setUp()
        self.endpoint = self.hsclient.endpoint
        # Panel - no client auth, only project auth using user auth token
        self.panelclient = HubstorageClient(endpoint=self.endpoint)
        self.panelproject = self.panelclient.get_project(self.projectid, auth=self.auth)

    def tearDown(self):
        super(HSTestCase, self).tearDown()
        self.panelclient.close()

    def test_succeed_with_close_reason(self):
        self._do_test_success('all-good', 'all-good')

    def test_succeed_without_close_reason(self):
        # A missing close reason must be normalized to 'no_reason'.
        self._do_test_success(None, 'no_reason')

    def test_scraper_failure(self):
        job = self._do_test_job(IOError('no more resources, ha!'), 'failed')
        # MAGICN per log level messages plus one of last failure
        stats = job.logs.stats()
        self.assertTrue(stats)
        self.assertEqual(stats['totals']['input_values'], self.MAGICN * 4 + 1)

    def _do_test_success(self, job_close_reason, expected_close_reason):
        """Run a job to completion and check item/log/request totals."""
        job = self._do_test_job(job_close_reason, expected_close_reason)
        self.assertEqual(job.items.stats()['totals']['input_values'], self.MAGICN)
        # Four log lines (debug/info/warn/error) are written per iteration.
        self.assertEqual(job.logs.stats()['totals']['input_values'], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'], self.MAGICN)

    def _do_test_job(self, job_close_reason, expected_close_reason):
        """Push a job, run it with *job_close_reason* and assert it ends up
        'finished' with *expected_close_reason*; return the job."""
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_job(pushed['key'])
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason=job_close_reason)
        # query again from panel
        job = p.get_job(pushed['key'])
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), expected_close_reason)
        return job

    def _run_runner(self, pushed, close_reason):
        """Play the runner role: start the job, call the scraper, and finish
        the job with the appropriate close reason."""
        client = HubstorageClient(endpoint=self.endpoint, auth=self.auth)
        with closing(client) as runnerclient:
            # NOTE(review): the job is obtained via self.start_job(), so
            # runnerclient itself appears unused here -- confirm.
            job = self.start_job()
            self.assertFalse(job.metadata.get('stop_requested'))
            job.metadata.update(host='localhost', slot=1)
            self.assertEqual(job.metadata.get('state'), 'running')
            # run scraper
            try:
                self._run_scraper(job.key, job.jobauth, close_reason=close_reason)
            except Exception as exc:
                job.logs.error(message=str(exc), appendmode=True)
                job.close_writers()
                job.jobq.finish(job, close_reason='failed')
                # logging from runner must append and never remove messages logged
                # by scraper
                self.assertTrue(job.logs.batch_append)
            else:
                job.jobq.finish(job, close_reason=close_reason or 'no_reason')

    def _run_scraper(self, jobkey, jobauth, close_reason=None):
        """Play the scraper role: write MAGICN items/logs/samples/requests,
        then raise *close_reason* if it is an exception, or store it in the
        job metadata otherwise."""
        httpmethods = 'GET PUT POST DELETE HEAD OPTIONS TRACE CONNECT'.split()
        # Scraper - uses job level auth, no global or project auth available
        client = HubstorageClient(endpoint=self.endpoint)
        with closing(client) as scraperclient:
            job = scraperclient.get_job(jobkey, auth=jobauth)
            for idx in xrange(self.MAGICN):
                iid = job.items.write({'uuid': idx})
                job.logs.debug('log debug %s' % idx, idx=idx)
                job.logs.info('log info %s' % idx, idx=idx)
                job.logs.warn('log warn %s' % idx, idx=idx)
                job.logs.error('log error %s' % idx, idx=idx)
                sid = job.samples.write([idx, idx, idx])
                rid = job.requests.add(
                    url='http://test.com/%d' % idx,
                    status=random.randint(100, 1000),
                    method=random.choice(httpmethods),
                    rs=random.randint(0, 100000),
                    duration=random.randint(0, 1000),
                    parent=random.randrange(0, idx + 1) if idx > 10 else None,
                    ts=millitime() + random.randint(100, 100000),
                )
                # writers hand out sequential record ids
                self.assertEqual(iid, idx)
                self.assertEqual(sid, idx)
                self.assertEqual(rid, idx)
            if isinstance(close_reason, Exception):
                raise close_reason
            if close_reason:
                job.metadata['close_reason'] = close_reason
                job.metadata.save()
class HcfMiddleware(object):
    """Scrapy spider middleware integrating the Hubstorage Crawl Frontier
    (HCF): seeds the spider with links read from a frontier slot, collects
    newly discovered links (requests flagged with ``meta['use_hcf']``) and
    writes them back, pulling more batches while the spider is idle."""

    def __init__(self, crawler):
        self.crawler = crawler
        hs_endpoint = self._get_config(crawler, "HS_ENDPOINT")
        hs_auth = self._get_config(crawler, "HS_AUTH")
        self.hs_projectid = self._get_config(crawler, "HS_PROJECTID")
        self.hs_project_frontier = self._get_config(crawler, "HS_FRONTIER")
        self.hs_project_slot = self._get_config(crawler, "HS_SLOT")
        # Max number of batches to read from the HCF within a single run.
        # NOTE(review): 'hs_max_baches' looks like a typo for 'hs_max_batches'
        # -- kept as-is since it is used consistently within this class.
        try:
            self.hs_max_baches = int(
                crawler.settings.get("HS_MAX_BATCHES", DEFAULT_MAX_BATCHES))
        except ValueError:
            self.hs_max_baches = DEFAULT_MAX_BATCHES
        self.hs_start_job_on_reason = crawler.settings.get(
            "HS_START_JOB_ON_REASON", [])

        self.hsclient = HubstorageClient(auth=hs_auth, endpoint=hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        # slot -> list of fingerprint dicts waiting to be uploaded
        self.new_links = defaultdict(list)
        # ids of the HCF batches consumed during this run
        self.batch_ids = []

        crawler.signals.connect(self.idle_spider, signals.spider_idle)
        crawler.signals.connect(self.close_spider, signals.spider_closed)

    def _get_config(self, crawler, key):
        """Return a required setting or raise NotConfigured if missing/empty."""
        value = crawler.settings.get(key)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg):
        # All middleware log lines share the "(HCF)" prefix.
        log.msg('(HCF) %s' % msg)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):
        # The spider may override the project-wide frontier/slot via its
        # 'frontier' and 'slot' attributes.
        self.hs_frontier = getattr(spider, 'frontier', self.hs_project_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)

        self.hs_slot = getattr(spider, 'slot', self.hs_project_slot)
        self._msg('Using HS_SLOT=%s' % self.hs_slot)

        has_new_requests = False
        for req in self._get_new_requests():
            has_new_requests = True
            yield req

        # if there are no links in the hcf, use the start_requests
        if not has_new_requests:
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                if (request.method == 'GET' and  # XXX: Only GET support for now.
                        request.meta.get('use_hcf', False)):
                    # Route the link to the frontier instead of yielding it
                    # for download in this run.
                    slot = slot_callback(request)
                    hcf_params = request.meta.get('hcf_params')
                    fp = {'fp': request.url}
                    if hcf_params:
                        fp.update(hcf_params)
                    self.new_links[slot].append(fp)
                else:
                    yield item
            else:
                yield item

    def idle_spider(self, spider):
        """spider_idle handler: flush pending links and keep the spider
        alive while the frontier still yields new batches."""
        self._save_new_links()
        self.fclient.flush()
        self._delete_processed_ids()
        has_new_requests = False
        for request in self._get_new_requests():
            self.crawler.engine.schedule(request, spider)
            has_new_requests = True
        if has_new_requests:
            raise DontCloseSpider

    def close_spider(self, spider, reason):
        # Only store the results if the spider finished normally: if it did
        # not finish properly there is no way to know whether all the url
        # batches were processed, and it is better not to delete them from
        # the frontier (so they will be picked up by another process).
        if reason == 'finished':
            self._save_new_links()
            self._delete_processed_ids()

        # If the reason is defined in the hs_start_job_on_reason list then start
        # a new job right after this spider is finished. The idea is to limit
        # every spider runtime (either via itemcount, pagecount or timeout) and
        # then have the old spider start a new one to take its place in the slot.
        if reason in self.hs_start_job_on_reason:
            # NOTE(review): this message concatenates without a separator and
            # produces e.g. "Starting new jobmyspider".
            self._msg("Starting new job" + spider.name)
            job = self.hsclient.start_job(projectid=self.hs_projectid,
                                          spider=spider.name)
            self._msg("New job started: %s" % job)

        self.fclient.close()
        self.hsclient.close()

    def _get_new_requests(self):
        """ Get a new batch of links from the HCF."""
        num_batches = 0
        num_links = 0
        for num_batches, batch in enumerate(
                self.fclient.read(self.hs_frontier, self.hs_slot), 1):
            for r in batch['requests']:
                num_links += 1
                # r is a (fingerprint, queue data) pair; the fingerprint is
                # the url to request.
                yield Request(r[0])
            self.batch_ids.append(batch['id'])
            if num_batches >= self.hs_max_baches:
                break
        self._msg('Read %d new batches from slot(%s)' % (num_batches, self.hs_slot))
        self._msg('Read %d new links from slot(%s)' % (num_links, self.hs_slot))

    def _save_new_links(self):
        """ Save the new extracted links into the HCF."""
        for slot, fps in self.new_links.items():
            self.fclient.add(self.hs_frontier, slot, fps)
            self._msg('Stored %d new links in slot(%s)' % (len(fps), slot))
        self.new_links = defaultdict(list)

    def _delete_processed_ids(self):
        """ Delete in the HCF the ids of the processed batches."""
        self.fclient.delete(self.hs_frontier, self.hs_slot, self.batch_ids)
        self._msg('Deleted %d processed batches in slot(%s)' % (len(self.batch_ids), self.hs_slot))
        self.batch_ids = []

    def _get_slot(self, request):
        """Default slot assignment: everything goes to slot '0'.  Spiders
        may override via a 'slot_callback' attribute."""
        return '0'
class SystemTest(HSTestCase):
    """End-to-end jobq test exercising the panel, runner and scraper roles."""

    # Number of items/logs/requests/samples each simulated scraper run writes.
    MAGICN = 1211

    def setUp(self):
        # NOTE(review): super(HSTestCase, self) bypasses HSTestCase.setUp and
        # invokes its parent's setUp instead -- confirm this is intentional.
        super(HSTestCase, self).setUp()
        endpoint = self.hsclient.endpoint
        # Panel - no client auth, only project auth using user auth token
        self.panelclient = HubstorageClient(endpoint=endpoint)
        self.panelproject = self.panelclient.get_project(self.projectid, auth=self.auth)
        # Runner - client uses global auth to poll jobq
        self.runnerclient = HubstorageClient(endpoint=endpoint, auth=self.auth)
        # Scraper - uses job level auth, no global or project auth available
        self.scraperclient = HubstorageClient(endpoint=endpoint)

    # --- helpers extracted from the three tests, which previously repeated
    # the push / assert-pending / assert-finished boilerplate verbatim ---

    def _push_pending_job(self):
        """Push a job for the test spider, assert it shows up as pending and
        return the jobq push message."""
        pushed = self.panelproject.jobq.push(self.spidername)
        job = self.panelproject.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        return pushed

    def _get_finished_job(self, expected_close_reason):
        """Fetch the job from the panel and assert it finished with
        *expected_close_reason*; return the job."""
        job = self.panelproject.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), expected_close_reason)
        return job

    def _assert_full_run_stats(self, job):
        """Assert the totals written by a complete scraper run."""
        self.assertEqual(job.items.stats()['totals']['input_values'], self.MAGICN)
        # Four log lines (debug/info/warn/error) are written per iteration.
        self.assertEqual(job.logs.stats()['totals']['input_values'], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'], self.MAGICN)

    def test_succeed_with_close_reason(self):
        pushed = self._push_pending_job()
        self._run_runner(pushed, close_reason='all-good')
        job = self._get_finished_job('all-good')
        self._assert_full_run_stats(job)

    def test_succeed_without_close_reason(self):
        pushed = self._push_pending_job()
        self._run_runner(pushed, close_reason=None)
        # A missing close reason is normalized to 'no_reason'.
        job = self._get_finished_job('no_reason')
        self._assert_full_run_stats(job)

    def test_scraper_failure(self):
        pushed = self._push_pending_job()
        self._run_runner(pushed, close_reason=IOError('no more resources, ha!'))
        job = self._get_finished_job('failed')
        # MAGICN per log level messages plus one of last failure
        stats = job.logs.stats()
        self.assertTrue(stats)
        self.assertEqual(stats['totals']['input_values'], self.MAGICN * 4 + 1)

    def _run_runner(self, pushed, close_reason):
        """Play the runner role: start the pushed job, invoke the scraper and
        mark the job finished or failed."""
        job = self.runnerclient.start_job(self.projectid)
        self.assertFalse(job.metadata.get('stop_requested'))
        job.metadata.update(host='localhost', slot=1)
        self.assertEqual(job.metadata.get('state'), 'running')
        # run scraper
        try:
            self._run_scraper(job.key, job.jobauth, close_reason=close_reason)
        except Exception as exc:
            job.failed(message=str(exc))
            # logging from runner must append and never remove messages logged
            # by scraper
            self.assertTrue(job.logs.batch_append)
        else:
            job.finished()
        self.runnerclient.close()

    def _run_scraper(self, jobkey, jobauth, close_reason=None):
        """Play the scraper role: write MAGICN items/logs/samples/requests,
        then raise *close_reason* if it is an exception, or store it in the
        job metadata otherwise."""
        httpmethods = 'GET PUT POST DELETE HEAD OPTIONS TRACE CONNECT'.split()
        job = self.scraperclient.get_job(jobkey, auth=jobauth)
        for idx in xrange(self.MAGICN):
            iid = job.items.write({'uuid': idx})
            job.logs.debug('log debug %s' % idx, idx=idx)
            job.logs.info('log info %s' % idx, idx=idx)
            job.logs.warn('log warn %s' % idx, idx=idx)
            job.logs.error('log error %s' % idx, idx=idx)
            sid = job.samples.write([idx, idx, idx])
            rid = job.requests.add(
                url='http://test.com/%d' % idx,
                status=random.randint(100, 1000),
                method=random.choice(httpmethods),
                rs=random.randint(0, 100000),
                duration=random.randint(0, 1000),
                parent=random.randrange(0, idx + 1) if idx > 10 else None,
                ts=millitime() + random.randint(100, 100000),
            )
            # writers hand out sequential record ids
            self.assertEqual(iid, idx)
            self.assertEqual(sid, idx)
            self.assertEqual(rid, idx)
        if isinstance(close_reason, Exception):
            self.scraperclient.close()
            raise close_reason
        if close_reason:
            job.metadata['close_reason'] = close_reason
            job.metadata.save()
        self.scraperclient.close()
        del self.scraperclient
class HCFStates(MemoryStates):
    """States backend persisting fingerprint states in a Scrapinghub
    collection named '<colname>_states'."""

    def __init__(self, auth, project_id, colname, cache_size_limit, cleanup_on_start):
        super(HCFStates, self).__init__(cache_size_limit)
        self._hs_client = HubstorageClient(auth=auth)
        self.projectid = project_id
        project = self._hs_client.get_project(self.projectid)
        self._collections = project.collections
        self._colname = colname + "_states"
        self.logger = logging.getLogger("hcf.states")
        if cleanup_on_start:
            self._cleanup()

    def _cleanup(self):
        """Delete all documents in the states collection, following the
        server's 'nextstart' pagination cursor until exhausted."""
        # BUGFIX: 'nextstart' used to be reset to None at the top of every
        # loop iteration, so the pagination cursor was never sent with the
        # request and each DELETE restarted from the beginning.
        nextstart = None
        while True:
            params = {'method': 'DELETE',
                      'url': 'https://storage.scrapinghub.com/collections/%d/s/%s' % (self.projectid, self._colname),
                      'auth': self._hs_client.auth}
            if nextstart:
                params['prefix'] = nextstart
            response = self._hs_client.session.request(**params)
            if response.status_code != 200:
                self.logger.error("%d %s", response.status_code, response.content)
                self.logger.info(params)
            try:
                r = loads(response.content.decode('utf-8'))
                self.logger.debug("Removed %d, scanned %d", r["deleted"], r["scanned"])
                nextstart = r.get('nextstart')
            except ValueError as ve:
                self.logger.debug(ve)
                self.logger.debug("content: %s (%d)" % (response.content, len(response.content)))
                # Unparseable response: stop paginating (matches the original
                # behavior of breaking out of the loop on a parse error).
                nextstart = None
            if not nextstart:
                break

    def frontier_start(self):
        self._store = self._collections.new_store(self._colname)

    def frontier_stop(self):
        self.logger.debug("Got frontier stop.")
        self.flush()
        self._hs_client.close()

    def _hcf_fetch(self, to_fetch):
        """Yield state documents for *to_fetch* keys, requesting them from
        the collection in chunks of 32 keys per HTTP GET."""
        finished = False
        i = iter(to_fetch)
        while True:
            prepared_keys = []
            while True:
                try:
                    prepared_keys.append("key=%s" % next(i))
                    if len(prepared_keys) >= 32:
                        break
                except StopIteration:
                    finished = True
                    break
            if not prepared_keys:
                break
            # Ask the server to include each document's key in the response.
            prepared_keys.append("meta=_key")
            params = {'method': 'GET',
                      'url': 'https://storage.scrapinghub.com/collections/%d/s/%s' % (self.projectid, self._colname),
                      'params': str('&').join(prepared_keys),
                      'auth': self._hs_client.auth}
            start = time()
            response = self._hs_client.session.request(**params)
            self.logger.debug("Fetch request time %f ms", (time() - start) * 1000)
            if response.status_code != 200:
                self.logger.error("%d %s", response.status_code, response.content)
                self.logger.info(params)
            # The response body is newline-delimited JSON documents.
            for line in response.content.decode('utf-8').split('\n'):
                if not line:
                    continue
                try:
                    yield loads(line)
                except ValueError as ve:
                    self.logger.debug(ve)
                    self.logger.debug("content: %s (%d)" % (line, len(line)))
            if finished:
                break

    def fetch(self, fingerprints):
        """Populate the in-memory cache with the states of *fingerprints*
        that are not cached yet."""
        to_fetch = [f for f in fingerprints if f not in self._cache]
        self.logger.debug("cache size %s" % len(self._cache))
        self.logger.debug("to fetch %d from %d" % (len(to_fetch), len(fingerprints)))
        if not to_fetch:
            return
        count = 0
        for o in self._hcf_fetch(to_fetch):
            self._cache[o['_key']] = o['value']
            count += 1
        self.logger.debug("Fetched %d items" % count)

    def flush(self, force_clear=False):
        """Write the whole state cache to the collection store in chunks of
        ~1024 documents, then delegate to MemoryStates.flush."""
        pending = []  # renamed from 'buffer' to avoid shadowing the builtin
        count = 0
        start = time()
        try:
            for fprint, state_val in six.iteritems(self._cache):
                pending.append({'_key': fprint, 'value': state_val})
                if len(pending) > 1024:
                    count += len(pending)
                    self._store.set(pending)
                    pending = []
        finally:
            # Flush the remainder even if iteration failed part-way through.
            count += len(pending)
            self._store.set(pending)
            self.logger.debug("Send time %f ms", (time() - start) * 1000)
            self.logger.debug("State cache has been flushed: %d items" % count)
        super(HCFStates, self).flush(force_clear)
class HCFClientWrapper(object):
    """Thin wrapper over the Hubstorage frontier API that tracks how many
    links were written per slot and retries reads/deletes on network errors."""

    def __init__(self, auth, project_id, frontier, batch_size=0, flush_interval=30):
        self._hs_client = HubstorageClient(auth=auth)
        self._hcf = self._hs_client.get_project(project_id).frontier
        self._hcf.batch_size = batch_size
        self._hcf.batch_interval = flush_interval
        self._frontier = frontier
        # Per-slot counters: total links added, and links not yet flushed.
        self._links_count = defaultdict(int)
        self._links_to_flush_count = defaultdict(int)
        self._hcf_retries = 10
        self.logger = logging.getLogger("hubstorage-wrapper")

    def add_request(self, slot, request):
        """Queue *request* into *slot* and bump both counters.  Returns 0."""
        self._hcf.add(self._frontier, slot, [request])
        self._links_count[slot] += 1
        self._links_to_flush_count[slot] += 1
        return 0

    def flush(self, slot=None):
        """Flush buffered links (all slots when *slot* is None) and return
        how many links were pending before the flush."""
        pending = self.get_number_of_links_to_flush(slot)
        if not pending:
            return pending
        if slot is None:
            self._hcf.flush()
            for key in list(self._links_to_flush_count.keys()):
                self._links_to_flush_count[key] = 0
        else:
            self._hcf._get_writer(self._frontier, slot).flush()
            self._links_to_flush_count[slot] = 0
        return pending

    def read(self, slot, mincount=None):
        """Read a batch of requests from *slot*, retrying with a growing
        back-off on network errors; returns [] if every attempt fails."""
        retries = self._hcf_retries
        for attempt in range(retries):
            try:
                return self._hcf.read(self._frontier, slot, mincount)
            except requests_lib.exceptions.ReadTimeout:
                self.logger.error("Could not read from {0}/{1} try {2}/{3}".format(
                    self._frontier, slot, attempt + 1, retries))
            except requests_lib.exceptions.ConnectionError:
                self.logger.error("Connection error while reading from {0}/{1} try {2}/{3}".format(
                    self._frontier, slot, attempt + 1, retries))
            except requests_lib.exceptions.RequestException:
                self.logger.error("Error while reading from {0}/{1} try {2}/{3}".format(
                    self._frontier, slot, attempt + 1, retries))
            sleep(60 * (attempt + 1))
        return []

    def delete(self, slot, ids):
        """Delete batch *ids* from *slot*, retrying with a growing back-off
        on network errors."""
        retries = self._hcf_retries
        for attempt in range(retries):
            try:
                self._hcf.delete(self._frontier, slot, ids)
                break
            except requests_lib.exceptions.ReadTimeout:
                self.logger.error("Could not delete ids from {0}/{1} try {2}/{3}".format(
                    self._frontier, slot, attempt + 1, retries))
            except requests_lib.exceptions.ConnectionError:
                self.logger.error("Connection error while deleting ids from {0}/{1} try {2}/{3}".format(
                    self._frontier, slot, attempt + 1, retries))
            except requests_lib.exceptions.RequestException:
                self.logger.error("Error deleting ids from {0}/{1} try {2}/{3}".format(
                    self._frontier, slot, attempt + 1, retries))
            sleep(60 * (attempt + 1))

    def delete_slot(self, slot):
        self._hcf.delete_slot(self._frontier, slot)

    def close(self):
        self._hcf.close()
        self._hs_client.close()

    def get_number_of_links(self, slot=None):
        """Total links added in this session (for one slot, or all)."""
        counts = self._links_count
        return sum(counts.values()) if slot is None else counts[slot]

    def get_number_of_links_to_flush(self, slot=None):
        """Links still buffered and unflushed (for one slot, or all)."""
        counts = self._links_to_flush_count
        return sum(counts.values()) if slot is None else counts[slot]
class HCFStates(MemoryStates):
    """Per-fingerprint request state storage backed by a Scrapinghub
    (Hubstorage) collection named ``<colname>_states``.

    Extends the in-memory ``MemoryStates`` cache with persistent
    fetch/flush against the project's collections HTTP endpoint.
    """

    def __init__(self, auth, project_id, colname, cache_size_limit, cleanup_on_start):
        super(HCFStates, self).__init__(cache_size_limit)
        self._hs_client = HubstorageClient(auth=auth)
        self.projectid = project_id
        project = self._hs_client.get_project(self.projectid)
        self._collections = project.collections
        self._colname = colname + "_states"
        self.logger = logging.getLogger("hcf.states")
        if cleanup_on_start:
            self._cleanup()

    def _cleanup(self):
        """Delete every record in the states collection, following the
        server-provided ``nextstart`` continuation token until done."""
        # BUG FIX: `nextstart` used to be re-initialized to None at the top
        # of every loop iteration, so the token returned by the server was
        # never sent back and the `prefix` branch was dead code.  Initialize
        # it once, before the loop.
        nextstart = None
        while True:
            params = {
                'method': 'DELETE',
                'url': 'https://storage.scrapinghub.com/collections/%d/s/%s' % (
                    self.projectid, self._colname),
                'auth': self._hs_client.auth,
            }
            if nextstart:
                params['prefix'] = nextstart
            response = self._hs_client.session.request(**params)
            if response.status_code != 200:
                self.logger.error("%d %s", response.status_code, response.content)
                self.logger.info(params)
            try:
                r = loads(response.content.decode('utf-8'))
                self.logger.debug("Removed %d, scanned %d", r["deleted"], r["scanned"])
                nextstart = r.get('nextstart')
            except ValueError as ve:
                self.logger.debug(ve)
                self.logger.debug("content: %s (%d)" % (response.content, len(response.content)))
                # Non-JSON response: clear the token so we stop (matches the
                # original's termination behavior).
                nextstart = None
            if not nextstart:
                break

    def frontier_start(self):
        """Open the writable store for this crawl."""
        self._store = self._collections.new_store(self._colname)

    def frontier_stop(self):
        """Flush pending state and release the Hubstorage client."""
        self.logger.debug("Got frontier stop.")
        self.flush()
        self._hs_client.close()

    def _hcf_fetch(self, to_fetch):
        """Yield decoded state records for the given fingerprint keys.

        Queries the collection in batches of at most 32 ``key=`` params per
        GET (plus ``meta=_key`` so each record carries its own key) and
        yields each JSON line of the response.
        """
        finished = False
        keys_iter = iter(to_fetch)
        while True:
            prepared_keys = []
            while True:
                try:
                    prepared_keys.append("key=%s" % next(keys_iter))
                    if len(prepared_keys) >= 32:
                        break
                except StopIteration:
                    finished = True
                    break
            if not prepared_keys:
                break
            prepared_keys.append("meta=_key")
            params = {
                'method': 'GET',
                'url': 'https://storage.scrapinghub.com/collections/%d/s/%s' % (
                    self.projectid, self._colname),
                # str() keeps a native str under `unicode_literals`.
                'params': str('&').join(prepared_keys),
                'auth': self._hs_client.auth,
            }
            start = time()
            response = self._hs_client.session.request(**params)
            self.logger.debug("Fetch request time %f ms", (time() - start) * 1000)
            if response.status_code != 200:
                self.logger.error("%d %s", response.status_code, response.content)
                self.logger.info(params)
            for line in response.content.decode('utf-8').split('\n'):
                if not line:
                    continue
                try:
                    yield loads(line)
                except ValueError as ve:
                    self.logger.debug(ve)
                    self.logger.debug("content: %s (%d)" % (line, len(line)))
            if finished:
                break

    def fetch(self, fingerprints):
        """Load states for *fingerprints* that are not yet in the local cache."""
        to_fetch = [f for f in fingerprints if f not in self._cache]
        self.logger.debug("cache size %s" % len(self._cache))
        self.logger.debug("to fetch %d from %d" % (len(to_fetch), len(fingerprints)))
        if not to_fetch:
            return
        count = 0
        for o in self._hcf_fetch(to_fetch):
            self._cache[o['_key']] = o['value']
            count += 1
        self.logger.debug("Fetched %d items" % count)

    def flush(self, force_clear=False):
        """Persist the whole in-memory cache to the store in batches of 1024
        records, then delegate to ``MemoryStates.flush``."""
        # Renamed from `buffer` (shadowed a Python 2 builtin).
        batch = []
        count = 0
        start = time()
        try:
            for fprint, state_val in six.iteritems(self._cache):
                batch.append({'_key': fprint, 'value': state_val})
                # Was `> 1024`, which actually sent 1025-item batches.
                if len(batch) >= 1024:
                    count += len(batch)
                    self._store.set(batch)
                    batch = []
        finally:
            # Always push the tail batch, even if set() raised mid-loop.
            count += len(batch)
            self._store.set(batch)
            self.logger.debug("Send time %f ms", (time() - start) * 1000)
            self.logger.debug("State cache has been flushed: %d items" % count)
        super(HCFStates, self).flush(force_clear)
class HCFClientWrapper(object):
    """Wrapper over the Hubstorage Crawl Frontier (HCF) API with per-slot
    link counters and retry logic for transient ``requests`` errors.

    NOTE(review): this appears to be a duplicate of an identical
    ``HCFClientWrapper`` defined earlier in this module; the later
    definition wins at import time — consider removing one copy.
    """

    def __init__(self, auth, project_id, frontier, batch_size=0, flush_interval=30):
        self._hs_client = HubstorageClient(auth=auth)
        self._hcf = self._hs_client.get_project(project_id).frontier
        # Configure the frontier's internal batch writer.
        self._hcf.batch_size = batch_size
        self._hcf.batch_interval = flush_interval
        self._frontier = frontier
        # Per-slot counters: total links added / links not yet flushed.
        self._links_count = defaultdict(int)
        self._links_to_flush_count = defaultdict(int)
        self._hcf_retries = 10
        self.logger = logging.getLogger("hubstorage-wrapper")

    def add_request(self, slot, request):
        """Queue *request* into *slot* and bump both per-slot counters."""
        self._hcf.add(self._frontier, slot, [request])
        self._links_count[slot] += 1
        self._links_to_flush_count[slot] += 1
        return 0

    def flush(self, slot=None):
        """Flush buffered links for *slot* (or for every slot when None);
        return the number of links that were pending."""
        n_links_to_flush = self.get_number_of_links_to_flush(slot)
        if n_links_to_flush:
            if slot is None:
                self._hcf.flush()
                # Reuses the name `slot` as loop variable; harmless since the
                # parameter is not read again afterwards.
                for slot in self._links_to_flush_count.keys():
                    self._links_to_flush_count[slot] = 0
            else:
                writer = self._hcf._get_writer(self._frontier, slot)
                writer.flush()
                self._links_to_flush_count[slot] = 0
        return n_links_to_flush

    def read(self, slot, mincount=None):
        """Read request batches from *slot*, retrying transient errors with a
        linear back-off (60s, 120s, ...); return [] when retries run out."""
        for i in range(self._hcf_retries):
            try:
                return self._hcf.read(self._frontier, slot, mincount)
            except requests_lib.exceptions.ReadTimeout:
                self.logger.error(
                    "Could not read from {0}/{1} try {2}/{3}".format(
                        self._frontier, slot, i + 1, self._hcf_retries))
            except requests_lib.exceptions.ConnectionError:
                self.logger.error(
                    "Connection error while reading from {0}/{1} try {2}/{3}".
                    format(self._frontier, slot, i + 1, self._hcf_retries))
            except requests_lib.exceptions.RequestException:
                self.logger.error(
                    "Error while reading from {0}/{1} try {2}/{3}".format(
                        self._frontier, slot, i + 1, self._hcf_retries))
            sleep(60 * (i + 1))
        return []

    def delete(self, slot, ids):
        """Delete processed batch *ids* from *slot*, retrying transient
        errors with the same back-off as :meth:`read`."""
        for i in range(self._hcf_retries):
            try:
                self._hcf.delete(self._frontier, slot, ids)
                break
            except requests_lib.exceptions.ReadTimeout:
                self.logger.error(
                    "Could not delete ids from {0}/{1} try {2}/{3}".format(
                        self._frontier, slot, i + 1, self._hcf_retries))
            except requests_lib.exceptions.ConnectionError:
                self.logger.error(
                    "Connection error while deleting ids from {0}/{1} try {2}/{3}"
                    .format(self._frontier, slot, i + 1, self._hcf_retries))
            except requests_lib.exceptions.RequestException:
                self.logger.error(
                    "Error deleting ids from {0}/{1} try {2}/{3}".format(
                        self._frontier, slot, i + 1, self._hcf_retries))
            sleep(60 * (i + 1))

    def delete_slot(self, slot):
        """Drop the whole *slot* from the frontier."""
        self._hcf.delete_slot(self._frontier, slot)

    def close(self):
        """Close the frontier writers and the underlying Hubstorage client."""
        self._hcf.close()
        self._hs_client.close()

    def get_number_of_links(self, slot=None):
        """Total links added so far (for *slot*, or summed over all slots)."""
        if slot is None:
            return sum(self._links_count.values())
        else:
            return self._links_count[slot]

    def get_number_of_links_to_flush(self, slot=None):
        """Links still buffered (for *slot*, or summed over all slots)."""
        if slot is None:
            return sum(self._links_to_flush_count.values())
        else:
            return self._links_to_flush_count[slot]