def test_collection_store_and_delete_are_retried(self):
    """Collection set and delete requests are both retried until success."""
    retrying_client = HubstorageClient(auth=self.auth,
                                       endpoint=self.endpoint,
                                       max_retries=3,
                                       max_retry_time=1)
    # Each callback fails twice and then succeeds: three attempts expected.
    post_cb, post_attempts = self.make_request_callback(2, [])
    delete_cb, delete_attempts = self.make_request_callback(2, [])
    self.mock_api(method=POST, callback=delete_cb, url_match='/.*/deleted')
    # /!\ the default regexp matches every path, so it must be added last
    self.mock_api(method=POST, callback=post_cb)

    collection = retrying_client.get_project(self.projectid).collections.new_store('foo')
    collection.set({'_key': 'bar', 'content': 'value'})
    collection.delete('baz')

    self.assertEqual(post_attempts[0], 3)
    self.assertEqual(delete_attempts[0], 3)
def test_delete_on_hubstorage_api_does_not_404(self):
    """Confirm the API assumption that deletes of missing resources are no-ops.

    NOTE: The current Hubstorage API does not raise 404 errors on deleting
    resources that do not exist, so the retry policy does not need to catch
    404 errors when retrying deletes (simplifies the implementation A LOT).
    This test checks that this assumption holds.
    """
    client = HubstorageClient(auth=self.auth, endpoint=self.endpoint, max_retries=0)
    project = client.get_project(projectid=self.projectid)

    # Frontier: delete a slot that was never created.
    project.frontier.delete_slot('frontier_non_existing', 'slot_non_existing')

    # Metadata: adding then removing a key triggers an API delete for it.
    job = client.push_job(self.projectid, self.spidername)
    job.metadata['foo'] = 'bar'
    del job.metadata['foo']
    job.metadata.save()

    # Collections: delete a key that was never stored.
    store = project.collections.new_store('foo')
    store.set({'_key': 'foo'})
    store.delete('bar')

    self.assertTrue(True, "No error have been triggered by calling a delete on resources that do not exist")
def get_scraped_data(dir,items_job, key, spider): # establish a connection with scrapyhub and get a items generator hc = HubstorageClient(auth=key) empty, totalItems, keptItems = 0, 0, 0 for job in hc.get_project(items_job).jobq.list(spider=spider): for item in hc.get_job(job['key']).items.list(): totalItems += 1 item = pd.Series(item) if item['title'] != '' and item['article'] != '' and \ item['title'] != ' ' and item['article'] != ' ': item['spider'] = spider item = item.drop('category') item = item.replace(["page1", "page2", "page3", "scrape_time", "", "basic"], [np.nan, np.nan, np.nan, np.nan, np.nan, "reutersbasic"]) item = item.replace({'<.*?>': '', '\[.*?\]': '', '\(.*?\)': ''}, regex=True) #add article hash code as the id of the article item['id'] = hash(item['article']) #write item(as records) to a json file file = dir + 'raw/' + str(item['id']) + '.json' item.to_json(file) keptItems += 1 else: empty += 1 print '#' * 50 print 'Fetched: ', totalItems, ' from spider: ', item['spider'] print keptItems, ' were written to the folder' print '-' * 50, '\n\n'
def open_spider(self, spider):
    """Open one Hubstorage collection store per known product name."""
    hs_project = HubstorageClient(auth=settings.SHUB_KEY).get_project(settings.SHUB_PROJ_ID)
    make_store = hs_project.collections.new_store
    self.data_stores = {name: make_store(name) for name in get_product_names()}
def test_auth(self):
    """Unauthenticated requests must fail with 401; project-level auth must work."""
    # client without global auth set
    hsc = HubstorageClient(endpoint=self.hsclient.endpoint)
    self.assertEqual(hsc.auth, None)

    # check no-auth access
    # BUG FIX: these checks used assertTrue(status_code, 401), which treats
    # 401 as the failure *message* and passes for any truthy status code;
    # assertEqual actually verifies the status.
    try:
        hsc.push_job(self.projectid, self.spidername)
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    try:
        hsc.get_project(self.projectid).push_job(self.spidername)
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    try:
        hsc.get_job((self.projectid, 1, 1)).items.list()
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    try:
        hsc.get_project(self.projectid).get_job(
            (self.projectid, 1, 1)).items.list()
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    # create project with auth
    auth = self.hsclient.auth
    project = hsc.get_project(self.projectid, auth)
    self.assertEqual(project.auth, auth)
    job = project.push_job(self.spidername)
    samejob = project.get_job(job.key)
    self.assertEqual(samejob.key, job.key)
def test_auth(self):
    """Unauthenticated requests must fail with 401; project-level auth must work."""
    # client without global auth set
    hsc = HubstorageClient(endpoint=self.hsclient.endpoint)
    self.assertEqual(hsc.auth, None)

    # check no-auth access
    # BUG FIX: these checks used assertTrue(status_code, 401), which treats
    # 401 as the failure *message* and passes for any truthy status code;
    # assertEqual actually verifies the status.
    try:
        hsc.push_job(self.projectid, self.spidername)
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    try:
        hsc.get_project(self.projectid).push_job(self.spidername)
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    try:
        hsc.get_job((self.projectid, 1, 1)).items.list()
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    try:
        hsc.get_project(self.projectid).get_job((self.projectid, 1, 1)).items.list()
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    # create project with auth
    auth = self.hsclient.auth
    project = hsc.get_project(self.projectid, auth)
    self.assertEqual(project.auth, auth)
    job = project.push_job(self.spidername)
    samejob = project.get_job(job.key)
    self.assertEqual(samejob.key, job.key)
def fetch_and_save_items():
    """Download the items of every configured spider's last jobs and persist them."""
    client = HubstorageClient(auth=API_KEY)
    hs_project = client.get_project(SH_PROJECT)
    for spider_cfg in SPIDERS:
        spider_name = spider_cfg['spider_name']
        print("\nworking on spider {}".format(spider_name))
        numeric_id = hs_project.ids.spider(spider_name)
        # One summary entry per recent job of this spider.
        for last_job in hs_project.spiders.lastjobsummary(spiderid=numeric_id):
            job_key = last_job['key']
            print(job_key)
            item_stream = client.get_job(job_key).items.iter_values()
            save_items(item_stream, spider_cfg['institution_name'])
class BaseSpider(Spider):
    """Spider base class wiring crawl-window configuration and a Hubstorage
    collection used to remember the latest scraped date per username."""

    def __init__(self, *args, **kwargs):
        super(BaseSpider, self).__init__(*args, **kwargs)
        if 'crawl_days' in kwargs:
            # Scrapy copies kwargs onto the instance; normalise to int here.
            self.crawl_type = 'full'
            self.crawl_days = int(self.crawl_days)
            assert self.crawl_days
        elif 'crawl_days' not in kwargs and 'crawl_type' not in kwargs:
            # No explicit window requested: default to a 14-day full crawl.
            self.crawl_type = 'full'
            self.crawl_days = 14

    def start_requests(self):
        # assumes self.input_url is provided by the subclass/job arguments
        yield Request(self.input_url, callback=self.parse)

    def initialize_hubstorage_collection(self):
        """Open (or create) the per-spider Hubstorage collection store."""
        self.hs_client = HubstorageClient(self.settings.get('HS_AUTH'))
        # Prefer the project id injected by the Scrapy Cloud environment.
        self.hs_projectid = os.environ.get('SCRAPY_PROJECT_ID')
        if self.hs_projectid is None:
            self.hs_projectid = self.settings.get('HS_PROJECTID')
        collections = self.hs_client.get_project(self.hs_projectid).collections
        self.hs_collection = collections.new_store(self.name)

    def set_min_post_date(self):
        """Compute the oldest post date this crawl should consider."""
        if getattr(self, 'crawl_days', None):
            self.min_post_date = datetime.now() - timedelta(
                days=self.crawl_days)
        else:
            self.min_post_date = datetime.strptime(
                self.settings['AVC_MIN_POST_DATE'],
                self.settings['AVC_DATE_FORMAT'],
            )
        self.logger.info('Setting min_post_date as %s' % self.min_post_date)

    def get_latest_scraped_date(self, username):
        """Return the stored latest-scraped datetime for *username*, or None.

        BUG FIX: the bare ``except:`` also swallowed SystemExit and
        KeyboardInterrupt; narrowed to ``except Exception`` — a missing
        key, a bad date format or a transient API error still yields None.
        """
        try:
            entry = self.hs_collection.get(username)
            return datetime.strptime(entry['value'],
                                     self.settings['AVC_DATE_TIME_FORMAT'])
        except Exception:
            return None

    def set_latest_scraped_date(self, username, latest_scraped_date):
        """Persist *latest_scraped_date* for *username* in the collection."""
        entry = {
            '_key': username,
            'value': latest_scraped_date.strftime(
                self.settings['AVC_DATE_TIME_FORMAT']),
        }
        self.hs_collection.set(entry)
def run_job(project, timeout, auth, **kwargs):
    """Push run_pipeline.py as a Scrapinghub job and wait until it stops.

    project -- numeric project id
    timeout -- maximum seconds to wait before giving up on the job
    auth    -- API key for the Hubstorage client
    kwargs  -- forwarded to push_job (job arguments)
    """
    import time  # local import keeps this block self-contained

    hc = HubstorageClient(auth=auth)
    hs_project = hc.get_project(project)
    key = hs_project.push_job('py:run_pipeline.py', **kwargs).key
    running = True
    stop_at = datetime.now() + timedelta(seconds=timeout)
    while running:
        running = hs_project.get_job(key).metadata['state'] in ('pending', 'running')
        print('Still running')
        if datetime.now() > stop_at:
            print('Timeout exceeded')
            running = False
        elif running:
            # BUG FIX: the original polled in a tight loop with no delay,
            # hammering the API and flooding stdout; back off between polls.
            time.sleep(5)
    print('Finished')
def test_collection_store_and_delete_are_retried(self):
    """Failed collection set/delete requests are retried until they succeed."""
    client = HubstorageClient(auth=self.auth, endpoint=self.endpoint,
                              max_retries=3)
    # Each callback fails twice before succeeding -> three attempts each.
    store_cb, store_attempts = self.make_request_callback(2, [])
    del_cb, del_attempts = self.make_request_callback(2, [])
    self.mock_api(method=POST, callback=del_cb, url_match='/.*/deleted')
    # /!\ default regexp matches all paths, has to be added last
    self.mock_api(method=POST, callback=store_cb)

    store = client.get_project(self.projectid).collections.new_store('foo')
    store.set({'_key': 'bar', 'content': 'value'})
    store.delete('baz')

    self.assertEqual(store_attempts[0], 3)
    self.assertEqual(del_attempts[0], 3)
class ClientTest(HSTestCase):
    """Integration tests for top-level HubstorageClient behaviour."""

    def test_push_job(self):
        """push_job stores state/priority/extra metadata; delete preserves it."""
        c = self.hsclient
        job = c.push_job(self.projectid, self.spidername,
                         state='running',
                         priority=self.project.jobq.PRIO_LOW,
                         foo='baz')
        m = job.metadata
        self.assertEqual(m.get('state'), u'running', c.auth)
        self.assertEqual(m.get('foo'), u'baz')
        self.project.jobq.delete(job)
        # Expire the cached metadata so the next reads hit the API again.
        m.expire()
        self.assertEqual(m.get('state'), u'deleted')
        self.assertEqual(m.get('foo'), u'baz')

    def test_botgroup(self):
        """start_job only hands out jobs matching the project's botgroup."""
        self.project.settings.update(botgroups=['foo'], created=millitime())
        self.project.settings.save()
        c = self.hsclient
        q1 = c.push_job(self.project.projectid, self.spidername)
        j1 = c.start_job()
        self.assertEqual(j1, None, 'got %s, pushed job was %s' % (j1, q1))
        j2 = c.start_job(botgroup='bar')
        self.assertEqual(j2, None, 'got %s, pushed job was %s' % (j2, q1))
        j3 = c.start_job(botgroup='foo')
        self.assertEqual(j3.key, q1.key)

    def test_debug_queries(self):
        """debug=True records one entry per HTTP request in client.queries."""
        self.hsclient = HubstorageClient(auth=self.auth,
                                         endpoint=self.endpoint,
                                         debug=True)
        self.assertEqual(self.hsclient.queries, [])
        self.project = self.hsclient.get_project(self.projectid)
        list(self.project.get_jobs(self.spiderid))
        self.assertEqual(len(self.hsclient.queries), 1)
        q = self.hsclient.queries[0]
        self.assertEqual(q['method'], 'GET')
        # BUG FIX: assert_ is a deprecated unittest alias (removed in
        # Python 3.12); use assertTrue instead.
        self.assertTrue(q['time'] > 0)
        self.assertTrue('url' in q)
def test_delete_on_hubstorage_api_does_not_404(self):
    """Deletes of missing resources must not raise.

    NOTE: The current Hubstorage API does not raise 404 errors on deleting
    resources that do not exist, so the retry policy does not catch 404
    errors when retrying deletes (simplifies the implementation A LOT).
    This test checks that this assumption holds.
    """
    no_retry_client = HubstorageClient(auth=self.auth,
                                       endpoint=self.endpoint,
                                       max_retries=0)
    project = no_retry_client.get_project(projectid=self.projectid)

    # Frontier slot that was never created.
    project.frontier.delete_slot('frontier_non_existing', 'slot_non_existing')

    # Metadata: add then delete a key; this triggers an API delete for 'foo'.
    job = no_retry_client.push_job(self.projectid, self.spidername)
    job.metadata['foo'] = 'bar'
    del job.metadata['foo']
    job.metadata.save()

    # Collection key that was never stored.
    collection = project.collections.new_store('foo')
    collection.set({'_key': 'foo'})
    collection.delete('bar')

    self.assertTrue(True, "No error have been triggered by calling a delete on resources that do not exist")
def main(argv): apikey = '' project = '' try: opts, args = getopt.getopt(argv, "hi:o", ["apikey=","project="]) except getopt.GetoptError: print 'alljobs.py -k <API Key> -p <ProjectID>' sys.exit(2) for opt, arg in opts: if opt == '-h': print 'alljobs.py -k <API Key> -p <ProjectID>' sys.exit() elif opt in("-k", "--apikey"): apikey = arg elif opt in("-p", "--project"): project = arg hc = HubstorageClient(auth=apikey) project = hc.get_project(project) jobs_metadata = project.jobq.list() jobids = [j['key'] for j in jobs_metadata] jobidsUtf = [x.encode('UTF8') for x in jobids] print jobidsUtf
class SystemTest(HSTestCase):
    """End-to-end test: push a job from the 'panel', run it as a 'runner'
    and 'scraper' using job-level auth, then verify counts and close
    reasons from the panel side."""

    # Number of items/samples/requests written by the fake scraper; each
    # item also produces 4 log lines (debug/info/warn/error).
    MAGICN = 1211

    def setUp(self):
        # NOTE(review): super(HSTestCase, self) skips HSTestCase.setUp
        # itself and calls its parent's — looks intentional, but confirm.
        super(HSTestCase, self).setUp()
        self.endpoint = self.hsclient.endpoint
        # Panel - no client auth, only project auth using user auth token
        self.panelclient = HubstorageClient(endpoint=self.endpoint)
        self.panelproject = self.panelclient.get_project(self.projectid, auth=self.auth)

    def tearDown(self):
        super(HSTestCase, self).tearDown()
        self.panelclient.close()

    def test_succeed_with_close_reason(self):
        self._do_test_success('all-good', 'all-good')

    def test_succeed_without_close_reason(self):
        # A missing close_reason must be recorded as 'no_reason'.
        self._do_test_success(None, 'no_reason')

    def test_scraper_failure(self):
        job = self._do_test_job(IOError('no more resources, ha!'), 'failed')
        # MAGICN per log level messages plus one of last failure
        stats = job.logs.stats()
        self.assertTrue(stats)
        self.assertEqual(stats['totals']['input_values'], self.MAGICN * 4 + 1)

    def _do_test_success(self, job_close_reason, expected_close_reason):
        """Run a full job and assert the expected item/log/request counts."""
        job = self._do_test_job(job_close_reason, expected_close_reason)
        self.assertEqual(job.items.stats()['totals']['input_values'], self.MAGICN)
        self.assertEqual(job.logs.stats()['totals']['input_values'], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'], self.MAGICN)

    def _do_test_job(self, job_close_reason, expected_close_reason):
        """Push a job, run it through the runner, and verify its final state."""
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_job(pushed['key'])
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason=job_close_reason)
        # query again from panel
        job = p.get_job(pushed['key'])
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), expected_close_reason)
        return job

    def _run_runner(self, pushed, close_reason):
        """Simulate the job runner: start the job, drive the scraper, finish it."""
        client = HubstorageClient(endpoint=self.endpoint, auth=self.auth)
        # NOTE(review): runnerclient is only used for its closing() cleanup;
        # the job is started via self.start_job() — verify this is intended.
        with closing(client) as runnerclient:
            job = self.start_job()
            self.assertFalse(job.metadata.get('stop_requested'))
            job.metadata.update(host='localhost', slot=1)
            self.assertEqual(job.metadata.get('state'), 'running')
            # run scraper
            try:
                self._run_scraper(job.key, job.jobauth, close_reason=close_reason)
            except Exception as exc:
                job.logs.error(message=str(exc), appendmode=True)
                job.close_writers()
                job.jobq.finish(job, close_reason='failed')
                # logging from runner must append and never remove messages logged
                # by scraper
                self.assertTrue(job.logs.batch_append)
            else:
                job.jobq.finish(job, close_reason=close_reason or 'no_reason')

    def _run_scraper(self, jobkey, jobauth, close_reason=None):
        """Simulate a scraper: write MAGICN items/logs/samples/requests,
        then optionally raise or record a close_reason."""
        httpmethods = 'GET PUT POST DELETE HEAD OPTIONS TRACE CONNECT'.split()
        # Scraper - uses job level auth, no global or project auth available
        client = HubstorageClient(endpoint=self.endpoint)
        with closing(client) as scraperclient:
            job = scraperclient.get_job(jobkey, auth=jobauth)
            for idx in xrange(self.MAGICN):
                iid = job.items.write({'uuid': idx})
                job.logs.debug('log debug %s' % idx, idx=idx)
                job.logs.info('log info %s' % idx, idx=idx)
                job.logs.warn('log warn %s' % idx, idx=idx)
                job.logs.error('log error %s' % idx, idx=idx)
                sid = job.samples.write([idx, idx, idx])
                rid = job.requests.add(
                    url='http://test.com/%d' % idx,
                    status=random.randint(100, 1000),
                    method=random.choice(httpmethods),
                    rs=random.randint(0, 100000),
                    duration=random.randint(0, 1000),
                    # parent only valid once some requests exist
                    parent=random.randrange(0, idx + 1) if idx > 10 else None,
                    ts=millitime() + random.randint(100, 100000),
                )
                # writers return sequential indices starting at 0
                self.assertEqual(iid, idx)
                self.assertEqual(sid, idx)
                self.assertEqual(rid, idx)
            if isinstance(close_reason, Exception):
                raise close_reason
            if close_reason:
                job.metadata['close_reason'] = close_reason
                job.metadata.save()
class HCFStates(MemoryStates):
    """Frontera states backend persisting the state cache in a Scrapinghub
    collection named '<colname>_states'."""

    def __init__(self, auth, project_id, colname, cache_size_limit, cleanup_on_start):
        super(HCFStates, self).__init__(cache_size_limit)
        self._hs_client = HubstorageClient(auth=auth)
        self.projectid = project_id
        project = self._hs_client.get_project(self.projectid)
        self._collections = project.collections
        self._colname = colname + "_states"
        self.logger = logging.getLogger("hcf.states")
        if cleanup_on_start:
            self._cleanup()

    def _cleanup(self):
        """Delete every record in the backing collection, page by page.

        BUG FIX: ``nextstart`` was reset to None at the top of every loop
        iteration, so the resume token returned by the API was never sent
        back and each pass rescanned from the start.  It is now initialised
        once, outside the loop, and forwarded as a query parameter (it was
        previously placed directly into the kwargs of ``session.request``,
        which would have raised TypeError had it ever been set).
        """
        nextstart = None
        while True:
            params = {
                'method': 'DELETE',
                'url': 'https://storage.scrapinghub.com/collections/%d/s/%s' % (self.projectid, self._colname),
                'auth': self._hs_client.auth
            }
            if nextstart:
                # NOTE(review): assumes the resume token travels as the
                # 'prefix' query parameter, matching the original intent —
                # confirm against the collections API.
                params['params'] = {'prefix': nextstart}
            response = self._hs_client.session.request(**params)
            if response.status_code != 200:
                self.logger.error("%d %s", response.status_code, response.content)
                self.logger.info(params)
            # Reset before parsing so a malformed body terminates the loop.
            nextstart = None
            try:
                r = loads(response.content.decode('utf-8'))
                self.logger.debug("Removed %d, scanned %d", r["deleted"], r["scanned"])
                nextstart = r.get('nextstart')
            except ValueError as ve:
                self.logger.debug(ve)
                self.logger.debug("content: %s (%d)" % (response.content, len(response.content)))
            if not nextstart:
                break

    def frontier_start(self):
        self._store = self._collections.new_store(self._colname)

    def frontier_stop(self):
        self.logger.debug("Got frontier stop.")
        self.flush()
        self._hs_client.close()

    def _hcf_fetch(self, to_fetch):
        """Yield stored state records for the given keys, 32 keys per request."""
        finished = False
        i = iter(to_fetch)
        while True:
            prepared_keys = []
            while True:
                try:
                    prepared_keys.append("key=%s" % next(i))
                    if len(prepared_keys) >= 32:
                        break
                except StopIteration:
                    finished = True
                    break
            if not prepared_keys:
                break
            prepared_keys.append("meta=_key")
            params = {
                'method': 'GET',
                'url': 'https://storage.scrapinghub.com/collections/%d/s/%s' % (self.projectid, self._colname),
                'params': str('&').join(prepared_keys),
                'auth': self._hs_client.auth
            }
            start = time()
            response = self._hs_client.session.request(**params)
            self.logger.debug("Fetch request time %f ms", (time() - start) * 1000)
            if response.status_code != 200:
                self.logger.error("%d %s", response.status_code, response.content)
                self.logger.info(params)
            # Response is one JSON document per line.
            for line in response.content.decode('utf-8').split('\n'):
                if not line:
                    continue
                try:
                    yield loads(line)
                except ValueError as ve:
                    self.logger.debug(ve)
                    self.logger.debug("content: %s (%d)" % (line, len(line)))
            if finished:
                break

    def fetch(self, fingerprints):
        """Populate the in-memory cache with states for uncached fingerprints."""
        to_fetch = [f for f in fingerprints if f not in self._cache]
        self.logger.debug("cache size %s" % len(self._cache))
        self.logger.debug("to fetch %d from %d" % (len(to_fetch), len(fingerprints)))
        if not to_fetch:
            return
        count = 0
        for o in self._hcf_fetch(to_fetch):
            self._cache[o['_key']] = o['value']
            count += 1
        self.logger.debug("Fetched %d items" % count)

    def flush(self, force_clear=False):
        """Write the whole state cache to the collection in 1024-record batches."""
        buffer = []
        count = 0
        start = time()
        try:
            for fprint, state_val in six.iteritems(self._cache):
                buffer.append({'_key': fprint, 'value': state_val})
                if len(buffer) > 1024:
                    count += len(buffer)
                    self._store.set(buffer)
                    buffer = []
        finally:
            count += len(buffer)
            self._store.set(buffer)
            self.logger.debug("Send time %f ms", (time() - start) * 1000)
            self.logger.debug("State cache has been flushed: %d items" % count)
        super(HCFStates, self).flush(force_clear)
class HcfMiddleware(object):
    """Scrapy spider middleware that feeds requests marked with the
    'use_hcf' meta key into the HubStorage Crawl Frontier (HCF) and reads
    new request batches back from a configured slot."""

    def __init__(self, crawler):
        settings = crawler.settings
        self.hs_endpoint = settings.get("HS_ENDPOINT")
        self.hs_auth = self._get_config(settings, "HS_AUTH")
        # HS_PROJECTID falls back to the id injected by Scrapy Cloud.
        self.hs_projectid = self._get_config(settings, "HS_PROJECTID", os.environ.get('SCRAPY_PROJECT_ID'))
        self.hs_frontier = self._get_config(settings, "HS_FRONTIER")
        self.hs_consume_from_slot = self._get_config(settings, "HS_CONSUME_FROM_SLOT")
        self.hs_number_of_slots = settings.getint("HS_NUMBER_OF_SLOTS", DEFAULT_HS_NUMBER_OF_SLOTS)
        self.hs_max_links = settings.getint("HS_MAX_LINKS", DEFAULT_MAX_LINKS)
        self.hs_start_job_enabled = settings.getbool("HS_START_JOB_ENABLED", False)
        self.hs_start_job_on_reason = settings.getlist("HS_START_JOB_ON_REASON", ['finished'])
        # Dash connection used only to schedule follow-up jobs.
        conn = Connection(self.hs_auth)
        self.panel_project = conn[self.hs_projectid]
        self.hsclient = HubstorageClient(auth=self.hs_auth, endpoint=self.hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier
        # URLs already pushed to the frontier in this run, per slot.
        self.new_links = defaultdict(set)
        # Ids of frontier batches consumed by this run.
        self.batch_ids = []
        crawler.signals.connect(self.close_spider, signals.spider_closed)
        # Make sure the logger for hubstorage.batchuploader is configured
        logging.basicConfig()

    def _get_config(self, settings, key, default=None):
        """Read a required setting; raise NotConfigured if it is missing/empty."""
        value = settings.get(key, default)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg, level=log.INFO):
        log.msg('(HCF) %s' % msg, level)

    def start_job(self, spider):
        """Schedule a follow-up job for *spider* through the Dash API."""
        self._msg("Starting new job for: %s" % spider.name)
        jobid = self.panel_project.schedule(
            spider.name,
            hs_consume_from_slot=self.hs_consume_from_slot,
            # 'dummy' marks the scheduled job as a non-first job.
            dummy=datetime.now()
        )
        self._msg("New job started: %s" % jobid)
        return jobid

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):
        # Spider attributes may override the configured frontier/slot.
        self.hs_frontier = getattr(spider, 'hs_frontier', self.hs_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)
        self.hs_consume_from_slot = getattr(spider,
                                            'hs_consume_from_slot',
                                            self.hs_consume_from_slot)
        self._msg('Using HS_CONSUME_FROM_SLOT=%s' % self.hs_consume_from_slot)
        self.has_new_requests = False
        for req in self._get_new_requests():
            self.has_new_requests = True
            yield req
        # if there are no links in the hcf, use the start_requests
        # unless this is not the first job.
        if not self.has_new_requests and not getattr(spider, 'dummy', None):
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        """Divert GET requests marked with 'use_hcf' into the frontier;
        pass everything else through unchanged."""
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                if request.meta.get('use_hcf', False):
                    if request.method == 'GET':  # XXX: Only GET support for now.
                        slot = slot_callback(request)
                        if not request.url in self.new_links[slot]:
                            hcf_params = request.meta.get('hcf_params')
                            fp = {'fp': request.url}
                            if hcf_params:
                                fp.update(hcf_params)
                            # Save the new links as soon as possible using
                            # the batch uploader
                            self.fclient.add(self.hs_frontier, slot, [fp])
                            self.new_links[slot].add(request.url)
                    else:
                        self._msg("'use_hcf' meta key is not supported for non GET requests (%s)" % request.url, log.ERROR)
                        yield request
                else:
                    yield request
            else:
                yield item

    def close_spider(self, spider, reason):
        # Only store the results if the spider finished normally, if it
        # didn't finished properly there is not way to know whether all the url batches
        # were processed and it is better not to delete them from the frontier
        # (so they will be picked by another process).
        if reason == 'finished':
            self._save_new_links_count()
            self._delete_processed_ids()
        # Close the frontier client in order to make sure that all the new links
        # are stored.
        self.fclient.close()
        self.hsclient.close()
        # If the reason is defined in the hs_start_job_on_reason list then start
        # a new job right after this spider is finished.
        if self.hs_start_job_enabled and reason in self.hs_start_job_on_reason:
            # Start the new job if this job had requests from the HCF or it
            # was the first job.
            # NOTE(review): self.has_new_requests is only set in
            # process_start_requests — AttributeError here if that hook
            # never ran; confirm the engine always calls it first.
            if self.has_new_requests or not getattr(spider, 'dummy', None):
                self.start_job(spider)

    def _get_new_requests(self):
        """ Get a new batch of links from the HCF."""
        num_batches = 0
        num_links = 0
        for num_batches, batch in enumerate(self.fclient.read(self.hs_frontier, self.hs_consume_from_slot), 1):
            for fingerprint, data in batch['requests']:
                num_links += 1
                yield Request(url=fingerprint, meta={'hcf_params': {'qdata': data}})
            self.batch_ids.append(batch['id'])
            if num_links >= self.hs_max_links:
                break
        self._msg('Read %d new batches from slot(%s)' % (num_batches, self.hs_consume_from_slot))
        self._msg('Read %d new links from slot(%s)' % (num_links, self.hs_consume_from_slot))

    def _save_new_links_count(self):
        """ Save the new extracted links into the HCF."""
        # Links were already pushed by the batch uploader in
        # process_spider_output; this only logs per-slot counts and resets.
        for slot, new_links in self.new_links.items():
            self._msg('Stored %d new links in slot(%s)' % (len(new_links), slot))
        self.new_links = defaultdict(set)

    def _delete_processed_ids(self):
        """ Delete in the HCF the ids of the processed batches."""
        self.fclient.delete(self.hs_frontier, self.hs_consume_from_slot, self.batch_ids)
        self._msg('Deleted %d processed batches in slot(%s)' % (len(self.batch_ids), self.hs_consume_from_slot))
        self.batch_ids = []

    def _get_slot(self, request):
        """ Determine to which slot should be saved the request."""
        md5 = hashlib.md5()
        md5.update(request.url)
        digest = md5.hexdigest()
        return str(int(digest, 16) % self.hs_number_of_slots)
class HCFClientWrapper(object):
    """Thin wrapper over the Hubstorage frontier client adding per-slot
    link counters and retry-with-backoff around read/delete calls."""

    def __init__(self, auth, project_id, frontier, batch_size=0, flush_interval=30):
        self._hs_client = HubstorageClient(auth=auth)
        self._hcf = self._hs_client.get_project(project_id).frontier
        self._hcf.batch_size = batch_size
        self._hcf.batch_interval = flush_interval
        self._frontier = frontier
        # Total links added per slot, and links added since the last flush.
        self._links_count = defaultdict(int)
        self._links_to_flush_count = defaultdict(int)
        self._hcf_retries = 10
        self.logger = logging.getLogger("hubstorage-wrapper")

    def add_request(self, slot, request):
        """Queue *request* for *slot* and bump the counters; returns 0."""
        self._hcf.add(self._frontier, slot, [request])
        self._links_count[slot] += 1
        self._links_to_flush_count[slot] += 1
        return 0

    def flush(self, slot=None):
        """Flush pending writes for one slot (or all when slot is None).

        Returns the number of links that were pending before the flush.
        """
        n_links_to_flush = self.get_number_of_links_to_flush(slot)
        if n_links_to_flush:
            if slot is None:
                self._hcf.flush()
                for slot in self._links_to_flush_count.keys():
                    self._links_to_flush_count[slot] = 0
            else:
                # Reaches into the client's private writer to flush one slot.
                writer = self._hcf._get_writer(self._frontier, slot)
                writer.flush()
                self._links_to_flush_count[slot] = 0
        return n_links_to_flush

    def read(self, slot, mincount=None):
        """Read batches from *slot*, retrying on network errors.

        Backoff grows linearly (60s, 120s, ...); returns [] after the
        retries are exhausted.  Note the last failure also sleeps before
        giving up.
        """
        for i in range(self._hcf_retries):
            try:
                return self._hcf.read(self._frontier, slot, mincount)
            except requests_lib.exceptions.ReadTimeout:
                self.logger.error("Could not read from {0}/{1} try {2}/{3}".format(self._frontier, slot, i+1,
                                                                                   self._hcf_retries))
            except requests_lib.exceptions.ConnectionError:
                self.logger.error("Connection error while reading from {0}/{1} try {2}/{3}".format(self._frontier, slot,
                                                                                                   i+1, self._hcf_retries))
            except requests_lib.exceptions.RequestException:
                self.logger.error("Error while reading from {0}/{1} try {2}/{3}".format(self._frontier, slot, i+1,
                                                                                        self._hcf_retries))
            sleep(60 * (i + 1))
        return []

    def delete(self, slot, ids):
        """Delete processed batch *ids* from *slot*, retrying on network errors."""
        for i in range(self._hcf_retries):
            try:
                self._hcf.delete(self._frontier, slot, ids)
                break
            except requests_lib.exceptions.ReadTimeout:
                self.logger.error("Could not delete ids from {0}/{1} try {2}/{3}".format(self._frontier, slot, i+1,
                                                                                         self._hcf_retries))
            except requests_lib.exceptions.ConnectionError:
                self.logger.error("Connection error while deleting ids from {0}/{1} try {2}/{3}".format(self._frontier, slot,
                                                                                                        i+1, self._hcf_retries))
            except requests_lib.exceptions.RequestException:
                self.logger.error("Error deleting ids from {0}/{1} try {2}/{3}".format(self._frontier, slot, i+1,
                                                                                       self._hcf_retries))
            sleep(60 * (i + 1))

    def delete_slot(self, slot):
        self._hcf.delete_slot(self._frontier, slot)

    def close(self):
        self._hcf.close()
        self._hs_client.close()

    def get_number_of_links(self, slot=None):
        """Total links added (for one slot, or across all slots)."""
        if slot is None:
            return sum(self._links_count.values())
        else:
            return self._links_count[slot]

    def get_number_of_links_to_flush(self, slot=None):
        """Links added since the last flush (for one slot, or all slots)."""
        if slot is None:
            return sum(self._links_to_flush_count.values())
        else:
            return self._links_to_flush_count[slot]
class HcfMiddleware(object):
    """Scrapy spider middleware that buffers 'use_hcf' GET requests into the
    HubStorage Crawl Frontier and schedules new request batches from a slot,
    keeping the spider alive via the spider_idle signal while batches remain."""

    def __init__(self, crawler):
        self.crawler = crawler
        hs_endpoint = self._get_config(crawler, "HS_ENDPOINT")
        hs_auth = self._get_config(crawler, "HS_AUTH")
        self.hs_projectid = self._get_config(crawler, "HS_PROJECTID")
        self.hs_project_frontier = self._get_config(crawler, "HS_FRONTIER")
        self.hs_project_slot = self._get_config(crawler, "HS_SLOT")
        # Max number of batches to read from the HCF within a single run.
        try:
            self.hs_max_baches = int(crawler.settings.get("HS_MAX_BATCHES", DEFAULT_MAX_BATCHES))
        except ValueError:
            self.hs_max_baches = DEFAULT_MAX_BATCHES
        self.hs_start_job_on_reason = crawler.settings.get("HS_START_JOB_ON_REASON", [])
        self.hsclient = HubstorageClient(auth=hs_auth, endpoint=hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier
        # Fingerprints collected during the run, per slot, pushed on idle/close.
        self.new_links = defaultdict(list)
        # Ids of frontier batches consumed by this run.
        self.batch_ids = []
        crawler.signals.connect(self.idle_spider, signals.spider_idle)
        crawler.signals.connect(self.close_spider, signals.spider_closed)

    def _get_config(self, crawler, key):
        """Read a required setting; raise NotConfigured if missing/empty."""
        value = crawler.settings.get(key)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg):
        log.msg('(HCF) %s' % msg)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):
        # Spider attributes may override the configured frontier/slot.
        self.hs_frontier = getattr(spider, 'frontier', self.hs_project_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)
        self.hs_slot = getattr(spider, 'slot', self.hs_project_slot)
        self._msg('Using HS_SLOT=%s' % self.hs_slot)
        has_new_requests = False
        for req in self._get_new_requests():
            has_new_requests = True
            yield req
        # if there are no links in the hcf, use the start_requests
        if not has_new_requests:
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        """Buffer GET requests marked 'use_hcf' into new_links instead of
        yielding them; everything else passes through unchanged."""
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                if (request.method == 'GET' and  # XXX: Only GET support for now.
                        request.meta.get('use_hcf', False)):
                    slot = slot_callback(request)
                    hcf_params = request.meta.get('hcf_params')
                    fp = {'fp': request.url}
                    if hcf_params:
                        fp.update(hcf_params)
                    self.new_links[slot].append(fp)
                else:
                    yield item
            else:
                yield item

    def idle_spider(self, spider):
        """On idle: push buffered links, drop processed batches, then keep
        the spider alive if the frontier still has requests for us."""
        self._save_new_links()
        self.fclient.flush()
        self._delete_processed_ids()
        has_new_requests = False
        for request in self._get_new_requests():
            self.crawler.engine.schedule(request, spider)
            has_new_requests = True
        if has_new_requests:
            raise DontCloseSpider

    def close_spider(self, spider, reason):
        # Only store the results if the spider finished normally, if it
        # didn't finished properly there is not way to know whether all the url batches
        # were processed and it is better not to delete them from the frontier
        # (so they will be picked by anothe process).
        if reason == 'finished':
            self._save_new_links()
            self._delete_processed_ids()
        # If the reason is defined in the hs_start_job_on_reason list then start
        # a new job right after this spider is finished. The idea is to limit
        # every spider runtime (either via itemcount, pagecount or timeout) and
        # then have the old spider start a new one to take its place in the slot.
        if reason in self.hs_start_job_on_reason:
            # NOTE(review): message lacks a separator between the text and
            # the spider name ("Starting new jobmyspider").
            self._msg("Starting new job" + spider.name)
            job = self.hsclient.start_job(projectid=self.hs_projectid,
                                          spider=spider.name)
            self._msg("New job started: %s" % job)
        self.fclient.close()
        self.hsclient.close()

    def _get_new_requests(self):
        """ Get a new batch of links from the HCF."""
        num_batches = 0
        num_links = 0
        for num_batches, batch in enumerate(self.fclient.read(self.hs_frontier, self.hs_slot), 1):
            for r in batch['requests']:
                num_links += 1
                # r is a (fingerprint, data) pair; only the URL is used here.
                yield Request(r[0])
            self.batch_ids.append(batch['id'])
            if num_batches >= self.hs_max_baches:
                break
        self._msg('Read %d new batches from slot(%s)' % (num_batches, self.hs_slot))
        self._msg('Read %d new links from slot(%s)' % (num_links, self.hs_slot))

    def _save_new_links(self):
        """ Save the new extracted links into the HCF."""
        for slot, fps in self.new_links.items():
            self.fclient.add(self.hs_frontier, slot, fps)
            self._msg('Stored %d new links in slot(%s)' % (len(fps), slot))
        self.new_links = defaultdict(list)

    def _delete_processed_ids(self):
        """ Delete in the HCF the ids of the processed batches."""
        self.fclient.delete(self.hs_frontier, self.hs_slot, self.batch_ids)
        self._msg('Deleted %d processed batches in slot(%s)' % (len(self.batch_ids), self.hs_slot))
        self.batch_ids = []

    def _get_slot(self, request):
        """ Determine to which slot should be saved the request."""
        return '0'
class HCFStates(MemoryStates):
    """Frontera states backend persisted in a Hubstorage collection.

    States live in the in-memory cache provided by :class:`MemoryStates`
    and are flushed in batches to a project collection named
    ``<colname>_states``.
    """

    def __init__(self, auth, project_id, colname, cache_size_limit,
                 cleanup_on_start):
        super(HCFStates, self).__init__(cache_size_limit)
        self._hs_client = HubstorageClient(auth=auth)
        self.projectid = project_id
        project = self._hs_client.get_project(self.projectid)
        self._collections = project.collections
        self._colname = colname + "_states"
        self.logger = logging.getLogger("hcf.states")
        if cleanup_on_start:
            self._cleanup()

    def _cleanup(self):
        """Delete every item in the backing collection, page by page.

        FIX: ``nextstart`` was previously reset to ``None`` at the top of
        each loop iteration, so the ``prefix`` continuation parameter was
        never sent and every DELETE request re-scanned the collection from
        the beginning. It is now initialized once, before the loop.
        """
        nextstart = None
        while True:
            params = {'method': 'DELETE',
                      'url': 'https://storage.scrapinghub.com/collections/%d/s/%s'
                             % (self.projectid, self._colname),
                      'auth': self._hs_client.auth}
            if nextstart:
                params['prefix'] = nextstart
            response = self._hs_client.session.request(**params)
            if response.status_code != 200:
                self.logger.error("%d %s", response.status_code, response.content)
                self.logger.info(params)
            try:
                r = loads(response.content.decode('utf-8'))
                self.logger.debug("Removed %d, scanned %d", r["deleted"], r["scanned"])
                nextstart = r.get('nextstart')
            except ValueError as ve:
                # Unparseable response: clear the continuation token so the
                # loop terminates instead of retrying the same page forever.
                nextstart = None
                self.logger.debug(ve)
                self.logger.debug("content: %s (%d)" % (response.content, len(response.content)))
            if not nextstart:
                break

    def frontier_start(self):
        """Open the collection writer used by flush()."""
        self._store = self._collections.new_store(self._colname)

    def frontier_stop(self):
        """Flush the cache and release the Hubstorage client."""
        self.logger.debug("Got frontier stop.")
        self.flush()
        self._hs_client.close()

    def _hcf_fetch(self, to_fetch):
        """Yield stored state items for the given keys, 32 keys per request."""
        finished = False
        i = iter(to_fetch)
        while True:
            prepared_keys = []
            while True:
                try:
                    prepared_keys.append("key=%s" % next(i))
                    if len(prepared_keys) >= 32:
                        break
                except StopIteration:
                    finished = True
                    break
            if not prepared_keys:
                break
            # Ask the API to include each item's key in the response.
            prepared_keys.append("meta=_key")
            params = {'method': 'GET',
                      'url': 'https://storage.scrapinghub.com/collections/%d/s/%s'
                             % (self.projectid, self._colname),
                      'params': str('&').join(prepared_keys),
                      'auth': self._hs_client.auth}
            start = time()
            response = self._hs_client.session.request(**params)
            self.logger.debug("Fetch request time %f ms", (time() - start) * 1000)
            if response.status_code != 200:
                self.logger.error("%d %s", response.status_code, response.content)
                self.logger.info(params)
            # Response is newline-delimited JSON, one item per line.
            for line in response.content.decode('utf-8').split('\n'):
                if not line:
                    continue
                try:
                    yield loads(line)
                except ValueError as ve:
                    self.logger.debug(ve)
                    self.logger.debug("content: %s (%d)" % (line, len(line)))
            if finished:
                break

    def fetch(self, fingerprints):
        """Populate the in-memory cache with states for *fingerprints*."""
        to_fetch = [f for f in fingerprints if f not in self._cache]
        self.logger.debug("cache size %s" % len(self._cache))
        self.logger.debug("to fetch %d from %d" % (len(to_fetch), len(fingerprints)))
        if not to_fetch:
            return
        count = 0
        for o in self._hcf_fetch(to_fetch):
            self._cache[o['_key']] = o['value']
            count += 1
        self.logger.debug("Fetched %d items" % count)

    def flush(self, force_clear=False):
        """Write the cached states to the collection in batches of ~1024."""
        buffer = []
        count = 0
        start = time()
        try:
            for fprint, state_val in six.iteritems(self._cache):
                buffer.append({'_key': fprint, 'value': state_val})
                if len(buffer) > 1024:
                    count += len(buffer)
                    self._store.set(buffer)
                    buffer = []
        finally:
            # Always push the partial batch, even if iteration failed midway.
            count += len(buffer)
            self._store.set(buffer)
            self.logger.debug("Send time %f ms", (time() - start) * 1000)
            self.logger.debug("State cache has been flushed: %d items" % count)
        super(HCFStates, self).flush(force_clear)
class SystemTest(HSTestCase):
    """End-to-end test driving the panel/runner/scraper roles through a
    full job lifecycle against a live Hubstorage endpoint."""

    # Number of items/samples/requests written per run; log lines are 4x this.
    MAGICN = 1211

    def setUp(self):
        # NOTE(review): super(HSTestCase, self) deliberately skips
        # HSTestCase.setUp and calls its parent's — confirm this is intended.
        super(HSTestCase, self).setUp()
        endpoint = self.hsclient.endpoint
        # Panel - no client auth, only project auth using user auth token
        self.panelclient = HubstorageClient(endpoint=endpoint)
        self.panelproject = self.panelclient.get_project(self.projectid, auth=self.auth)
        # Runner - client uses global auth to poll jobq
        self.runnerclient = HubstorageClient(endpoint=endpoint, auth=self.auth)
        # Scraper - uses job level auth, no global or project auth available
        self.scraperclient = HubstorageClient(endpoint=endpoint)

    def test_succeed_with_close_reason(self):
        """Job finished with an explicit close_reason is reported as such."""
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason='all-good')
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'all-good')
        self.assertEqual(job.items.stats()['totals']['input_values'], self.MAGICN)
        self.assertEqual(job.logs.stats()['totals']['input_values'], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'], self.MAGICN)

    def test_succeed_without_close_reason(self):
        """Job finished without close_reason defaults to 'no_reason'."""
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason=None)
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'no_reason')
        self.assertEqual(job.items.stats()['totals']['input_values'], self.MAGICN)
        self.assertEqual(job.logs.stats()['totals']['input_values'], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'], self.MAGICN)

    def test_scraper_failure(self):
        """A scraper exception marks the job failed and logs one extra line."""
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason=IOError('no more resources, ha!'))
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'failed')
        # MAGICN per log level messages plus one of last failure
        stats = job.logs.stats()
        self.assertTrue(stats)
        self.assertEqual(stats['totals']['input_values'], self.MAGICN * 4 + 1)

    def _run_runner(self, pushed, close_reason):
        """Play the runner role: start the pushed job, run the scraper, and
        mark the job finished or failed depending on the outcome."""
        job = self.runnerclient.start_job(self.projectid)
        self.assertFalse(job.metadata.get('stop_requested'))
        job.metadata.update(host='localhost', slot=1)
        self.assertEqual(job.metadata.get('state'), 'running')
        # run scraper
        try:
            self._run_scraper(job.key, job.jobauth, close_reason=close_reason)
        except Exception as exc:
            job.failed(message=str(exc))
            # logging from runner must append and never remove messages logged
            # by scraper
            self.assertTrue(job.logs.batch_append)
        else:
            job.finished()
        self.runnerclient.close()

    def _run_scraper(self, jobkey, jobauth, close_reason=None):
        """Play the scraper role: write MAGICN items/samples/requests and 4x
        log lines; raise close_reason if it is an Exception instance."""
        httpmethods = 'GET PUT POST DELETE HEAD OPTIONS TRACE CONNECT'.split()
        job = self.scraperclient.get_job(jobkey, auth=jobauth)
        for idx in xrange(self.MAGICN):
            iid = job.items.write({'uuid': idx})
            job.logs.debug('log debug %s' % idx, idx=idx)
            job.logs.info('log info %s' % idx, idx=idx)
            job.logs.warn('log warn %s' % idx, idx=idx)
            job.logs.error('log error %s' % idx, idx=idx)
            sid = job.samples.write([idx, idx, idx])
            rid = job.requests.add(
                url='http://test.com/%d' % idx,
                status=random.randint(100, 1000),
                method=random.choice(httpmethods),
                rs=random.randint(0, 100000),
                duration=random.randint(0, 1000),
                parent=random.randrange(0, idx + 1) if idx > 10 else None,
                ts=millitime() + random.randint(100, 100000),
            )
            # Writers return sequential ids starting at 0.
            self.assertEqual(iid, idx)
            self.assertEqual(sid, idx)
            self.assertEqual(rid, idx)
        if isinstance(close_reason, Exception):
            # Simulated crash: close the client, then propagate to the runner.
            self.scraperclient.close()
            raise close_reason
        if close_reason:
            job.metadata['close_reason'] = close_reason
            job.metadata.save()
        self.scraperclient.close()
        del self.scraperclient
class SystemTest(HSTestCase):
    """End-to-end test of the job lifecycle (refactored variant using
    per-role clients managed with closing())."""

    # Number of items/samples/requests written per run; log lines are 4x this.
    MAGICN = 1211

    def setUp(self):
        # NOTE(review): super(HSTestCase, self) deliberately skips
        # HSTestCase.setUp and calls its parent's — confirm this is intended.
        super(HSTestCase, self).setUp()
        self.endpoint = self.hsclient.endpoint
        # Panel - no client auth, only project auth using user auth token
        self.panelclient = HubstorageClient(endpoint=self.endpoint)
        self.panelproject = self.panelclient.get_project(self.projectid, auth=self.auth)

    def tearDown(self):
        super(HSTestCase, self).tearDown()
        self.panelclient.close()

    def test_succeed_with_close_reason(self):
        self._do_test_success("all-good", "all-good")

    def test_succeed_without_close_reason(self):
        # No explicit reason must be reported as "no_reason".
        self._do_test_success(None, "no_reason")

    def test_scraper_failure(self):
        job = self._do_test_job(IOError("no more resources, ha!"), "failed")
        # MAGICN per log level messages plus one of last failure
        stats = job.logs.stats()
        self.assertTrue(stats)
        self.assertEqual(stats["totals"]["input_values"], self.MAGICN * 4 + 1)

    def _do_test_success(self, job_close_reason, expected_close_reason):
        """Run a full successful job and verify the per-writer totals."""
        job = self._do_test_job(job_close_reason, expected_close_reason)
        self.assertEqual(job.items.stats()["totals"]["input_values"], self.MAGICN)
        self.assertEqual(job.logs.stats()["totals"]["input_values"], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()["totals"]["input_values"], self.MAGICN)

    def _do_test_job(self, job_close_reason, expected_close_reason):
        """Push a job, run it through the runner, and assert its final state."""
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_job(pushed["key"])
        self.assertEqual(job.metadata.get("state"), "pending")
        # consume msg from runner
        self._run_runner(pushed, close_reason=job_close_reason)
        # query again from panel
        job = p.get_job(pushed["key"])
        self.assertEqual(job.metadata.get("state"), "finished")
        self.assertEqual(job.metadata.get("close_reason"), expected_close_reason)
        return job

    def _run_runner(self, pushed, close_reason):
        """Play the runner role: start the job, run the scraper, then finish
        the job with the appropriate close_reason."""
        # Runner - client uses global auth to poll jobq
        client = HubstorageClient(endpoint=self.endpoint, auth=self.auth)
        with closing(client) as runnerclient:
            # NOTE(review): start_job is presumably provided by HSTestCase
            # (runnerclient is held only for the closing() scope) — confirm.
            job = self.start_job()
            self.assertFalse(job.metadata.get("stop_requested"))
            job.metadata.update(host="localhost", slot=1)
            self.assertEqual(job.metadata.get("state"), "running")
            # run scraper
            try:
                self._run_scraper(job.key, job.jobauth, close_reason=close_reason)
            except Exception as exc:
                job.logs.error(message=str(exc), appendmode=True)
                job.close_writers()
                job.jobq.finish(job, close_reason="failed")
                # logging from runner must append and never remove messages logged
                # by scraper
                self.assertTrue(job.logs.batch_append)
            else:
                job.jobq.finish(job, close_reason=close_reason or "no_reason")

    def _run_scraper(self, jobkey, jobauth, close_reason=None):
        """Play the scraper role: write MAGICN items/samples/requests and 4x
        log lines; raise close_reason if it is an Exception instance."""
        httpmethods = "GET PUT POST DELETE HEAD OPTIONS TRACE CONNECT".split()
        # Scraper - uses job level auth, no global or project auth available
        client = HubstorageClient(endpoint=self.endpoint)
        with closing(client) as scraperclient:
            job = scraperclient.get_job(jobkey, auth=jobauth)
            for idx in xrange(self.MAGICN):
                iid = job.items.write({"uuid": idx})
                job.logs.debug("log debug %s" % idx, idx=idx)
                job.logs.info("log info %s" % idx, idx=idx)
                job.logs.warn("log warn %s" % idx, idx=idx)
                job.logs.error("log error %s" % idx, idx=idx)
                sid = job.samples.write([idx, idx, idx])
                rid = job.requests.add(
                    url="http://test.com/%d" % idx,
                    status=random.randint(100, 1000),
                    method=random.choice(httpmethods),
                    rs=random.randint(0, 100000),
                    duration=random.randint(0, 1000),
                    parent=random.randrange(0, idx + 1) if idx > 10 else None,
                    ts=millitime() + random.randint(100, 100000),
                )
                # Writers return sequential ids starting at 0.
                self.assertEqual(iid, idx)
                self.assertEqual(sid, idx)
                self.assertEqual(rid, idx)
            if isinstance(close_reason, Exception):
                raise close_reason
            if close_reason:
                job.metadata["close_reason"] = close_reason
                job.metadata.save()
#coding=UTF-8 from hubstorage import HubstorageClient hc = HubstorageClient(auth='bc2aa25cc40f4ed4b03988e8e0b9e89e') project = hc.get_project('53883') itemslist = hc.get_job('53883/1/5').items.list() itemslist_size = itemslist.__sizeof__() for element in itemslist: element.__delitem__('_type') element.__delitem__('_cached_page_id') element.__delitem__('_template') elementIterator = element.iteritems() for fields in elementIterator: fieldIterator = fields.__iter__() for values in fieldIterator: if isinstance(values, basestring): print values else: print values.pop()
def test_allows_msgpack(monkeypatch, msgpack_available, path, expected_result):
    """_allows_mpack honours both the runtime msgpack flag and the path."""
    monkeypatch.setattr('hubstorage.collectionsrt.MSGPACK_AVAILABLE',
                        msgpack_available)
    project = HubstorageClient().get_project(2222000)
    expected = msgpack_available and expected_result
    assert project.collections._allows_mpack(path) is expected
class HcfMiddleware(object):
    """Scrapy spider middleware that routes GET requests marked with
    ``meta['use_hcf']`` through the Hubstorage Crawl Frontier (HCF)
    instead of the local scheduler, and feeds the spider with request
    batches read back from the frontier."""

    def __init__(self, crawler):
        self.crawler = crawler
        hs_endpoint = self._get_config(crawler, "HS_ENDPOINT")
        hs_auth = self._get_config(crawler, "HS_AUTH")
        self.hs_projectid = self._get_config(crawler, "HS_PROJECTID")
        self.hs_project_frontier = self._get_config(crawler, "HS_FRONTIER")
        self.hs_project_slot = self._get_config(crawler, "HS_SLOT")
        # Max number of batches to read from the HCF within a single run.
        # NOTE: attribute keeps the historical "baches" spelling so any
        # external code touching it keeps working.
        try:
            self.hs_max_baches = int(
                crawler.settings.get("HS_MAX_BATCHES", DEFAULT_MAX_BATCHES))
        except ValueError:
            self.hs_max_baches = DEFAULT_MAX_BATCHES
        self.hs_start_job_on_reason = crawler.settings.get(
            "HS_START_JOB_ON_REASON", [])
        self.hsclient = HubstorageClient(auth=hs_auth, endpoint=hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier
        # Per-slot buffers of fingerprints waiting to be uploaded, and the
        # ids of frontier batches consumed but not yet acknowledged.
        self.new_links = defaultdict(list)
        self.batch_ids = []
        crawler.signals.connect(self.idle_spider, signals.spider_idle)
        crawler.signals.connect(self.close_spider, signals.spider_closed)

    def _get_config(self, crawler, key):
        """Fetch a required setting, raising NotConfigured when missing."""
        value = crawler.settings.get(key)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg):
        log.msg('(HCF) %s' % msg)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):
        """Yield requests from the HCF, falling back to start_requests when
        the frontier has no pending links."""
        # Spider attributes override the project-level frontier/slot settings.
        self.hs_frontier = getattr(spider, 'frontier', self.hs_project_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)
        self.hs_slot = getattr(spider, 'slot', self.hs_project_slot)
        self._msg('Using HS_SLOT=%s' % self.hs_slot)
        has_new_requests = False
        for req in self._get_new_requests():
            has_new_requests = True
            yield req
        # if there are no links in the hcf, use the start_requests
        if not has_new_requests:
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        """Divert eligible GET requests to the HCF; pass everything else on."""
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                if (request.method == 'GET' and  # XXX: Only GET support for now.
                        request.meta.get('use_hcf', False)):
                    slot = slot_callback(request)
                    hcf_params = request.meta.get('hcf_params')
                    fp = {'fp': request.url}
                    if hcf_params:
                        fp.update(hcf_params)
                    # Queued for the frontier; intentionally not yielded.
                    self.new_links[slot].append(fp)
                else:
                    yield item
            else:
                yield item

    def idle_spider(self, spider):
        """spider_idle handler: flush state and keep the spider alive while
        the frontier still supplies links."""
        self._save_new_links()
        self.fclient.flush()
        self._delete_processed_ids()
        has_new_requests = False
        for request in self._get_new_requests():
            self.crawler.engine.schedule(request, spider)
            has_new_requests = True
        if has_new_requests:
            raise DontCloseSpider

    def close_spider(self, spider, reason):
        """spider_closed handler: persist state and optionally chain a job."""
        # Only store the results if the spider finished normally, if it
        # didn't finished properly there is not way to know whether all the url batches
        # were processed and it is better not to delete them from the frontier
        # (so they will be picked by anothe process).
        if reason == 'finished':
            self._save_new_links()
            self._delete_processed_ids()
        # If the reason is defined in the hs_start_job_on_reason list then start
        # a new job right after this spider is finished. The idea is to limit
        # every spider runtime (either via itemcount, pagecount or timeout) and
        # then have the old spider start a new one to take its place in the slot.
        if reason in self.hs_start_job_on_reason:
            # FIX: the original concatenated "Starting new job" directly with
            # the spider name, producing e.g. "Starting new jobmyspider".
            self._msg("Starting new job: %s" % spider.name)
            job = self.hsclient.start_job(projectid=self.hs_projectid,
                                          spider=spider.name)
            self._msg("New job started: %s" % job)
        self.fclient.close()
        self.hsclient.close()

    def _get_new_requests(self):
        """ Get a new batch of links from the HCF."""
        num_batches = 0
        num_links = 0
        for num_batches, batch in enumerate(
                self.fclient.read(self.hs_frontier, self.hs_slot), 1):
            for r in batch['requests']:
                num_links += 1
                yield Request(r[0])
            # Remember the batch id so it can be acknowledged later.
            self.batch_ids.append(batch['id'])
            if num_batches >= self.hs_max_baches:
                break
        self._msg('Read %d new batches from slot(%s)' % (num_batches, self.hs_slot))
        self._msg('Read %d new links from slot(%s)' % (num_links, self.hs_slot))

    def _save_new_links(self):
        """ Save the new extracted links into the HCF."""
        for slot, fps in self.new_links.items():
            self.fclient.add(self.hs_frontier, slot, fps)
            self._msg('Stored %d new links in slot(%s)' % (len(fps), slot))
        self.new_links = defaultdict(list)

    def _delete_processed_ids(self):
        """ Delete in the HCF the ids of the processed batches."""
        self.fclient.delete(self.hs_frontier, self.hs_slot, self.batch_ids)
        self._msg('Deleted %d processed batches in slot(%s)' %
                  (len(self.batch_ids), self.hs_slot))
        self.batch_ids = []

    def _get_slot(self, request):
        """ Determine to which slot should be saved the request."""
        return '0'
class HCFClientWrapper(object):
    """Thin wrapper around the Hubstorage frontier client that adds retry
    logic for reads/deletes and per-slot link counters."""

    def __init__(self, auth, project_id, frontier, batch_size=0,
                 flush_interval=30):
        self._hs_client = HubstorageClient(auth=auth)
        self._hcf = self._hs_client.get_project(project_id).frontier
        self._hcf.batch_size = batch_size
        self._hcf.batch_interval = flush_interval
        self._frontier = frontier
        # Totals per slot: all links added, and links not yet flushed.
        self._links_count = defaultdict(int)
        self._links_to_flush_count = defaultdict(int)
        self._hcf_retries = 10
        self.logger = logging.getLogger("hubstorage-wrapper")

    def add_request(self, slot, request):
        """Queue a single request for *slot* and update the counters."""
        self._hcf.add(self._frontier, slot, [request])
        self._links_count[slot] += 1
        self._links_to_flush_count[slot] += 1
        return 0

    def flush(self, slot=None):
        """Flush buffered links for one slot (or all slots when slot is None)
        and return how many links were pending before the flush."""
        n_links_to_flush = self.get_number_of_links_to_flush(slot)
        if n_links_to_flush:
            if slot is None:
                self._hcf.flush()
                for slot in self._links_to_flush_count.keys():
                    self._links_to_flush_count[slot] = 0
            else:
                writer = self._hcf._get_writer(self._frontier, slot)
                writer.flush()
                self._links_to_flush_count[slot] = 0
        return n_links_to_flush

    def read(self, slot, mincount=None):
        """Read batches from *slot*, retrying on request errors with a
        linearly growing backoff (60s, 120s, ...); returns [] on exhaustion."""
        for i in range(self._hcf_retries):
            try:
                return self._hcf.read(self._frontier, slot, mincount)
            except requests_lib.exceptions.ReadTimeout:
                self.logger.error("Could not read from {0}/{1} try {2}/{3}".format(
                    self._frontier, slot, i + 1, self._hcf_retries))
            except requests_lib.exceptions.ConnectionError:
                self.logger.error("Connection error while reading from {0}/{1} try {2}/{3}".format(
                    self._frontier, slot, i + 1, self._hcf_retries))
            except requests_lib.exceptions.RequestException:
                self.logger.error("Error while reading from {0}/{1} try {2}/{3}".format(
                    self._frontier, slot, i + 1, self._hcf_retries))
            # Only reached after a failed attempt (success returns above).
            sleep(60 * (i + 1))
        return []

    def delete(self, slot, ids):
        """Acknowledge (delete) processed batch *ids* in *slot*, retrying on
        request errors with the same backoff policy as read()."""
        for i in range(self._hcf_retries):
            try:
                self._hcf.delete(self._frontier, slot, ids)
                break
            except requests_lib.exceptions.ReadTimeout:
                self.logger.error("Could not delete ids from {0}/{1} try {2}/{3}".format(
                    self._frontier, slot, i + 1, self._hcf_retries))
            except requests_lib.exceptions.ConnectionError:
                self.logger.error("Connection error while deleting ids from {0}/{1} try {2}/{3}".format(
                    self._frontier, slot, i + 1, self._hcf_retries))
            except requests_lib.exceptions.RequestException:
                self.logger.error("Error deleting ids from {0}/{1} try {2}/{3}".format(
                    self._frontier, slot, i + 1, self._hcf_retries))
            # Only reached after a failed attempt (success breaks above).
            sleep(60 * (i + 1))

    def delete_slot(self, slot):
        """Drop the whole slot from the frontier (no retries)."""
        self._hcf.delete_slot(self._frontier, slot)

    def close(self):
        """Flush and release the frontier and the underlying client."""
        self._hcf.close()
        self._hs_client.close()

    def get_number_of_links(self, slot=None):
        # Total links added across all slots, or for one slot.
        if slot is None:
            return sum(self._links_count.values())
        else:
            return self._links_count[slot]

    def get_number_of_links_to_flush(self, slot=None):
        # Pending (unflushed) links across all slots, or for one slot.
        if slot is None:
            return sum(self._links_to_flush_count.values())
        else:
            return self._links_to_flush_count[slot]
def __init__(self, project: str, spider: str):
    """Attach to the first job listed for *spider* in *project*."""
    client = HubstorageClient(auth=shub_cfg.get('apikey'))
    jobs = client.get_project(project).jobq.list(spider=spider)
    first = next(jobs)
    self.job = client.get_job(first.get('key'))