def test_delete_requests_are_retried(self):
    """DELETE calls (idempotent by default) are retried until they succeed."""
    # Prepare
    retrying_client = HubstorageClient(auth=self.auth, endpoint=self.endpoint,
                                       max_retries=3)
    pending_job = {
        'project': self.projectid,
        'spider': self.spidername,
        'state': 'pending',
    }
    getpost_cb, getpost_attempts = self.make_request_callback(0, pending_job)
    delete_cb, delete_attempts = self.make_request_callback(2, pending_job)
    self.mock_api(method=GET, callback=getpost_cb)
    self.mock_api(method=POST, callback=getpost_cb)
    self.mock_api(method=DELETE, callback=delete_cb)

    # Act
    job = retrying_client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))
    job.metadata['foo'] = 'bar'
    del job.metadata['foo']
    job.metadata.save()

    # Assert: two induced failures plus the final success = 3 DELETE attempts
    self.assertEqual(delete_attempts[0], 3)
def test_metadata_save_does_retry(self):
    """metadata.save() POSTs are retried up to the configured limit."""
    # Prepare
    client = HubstorageClient(auth=self.auth, endpoint=self.endpoint,
                              max_retries=3, max_retry_time=1)
    pending_job = {
        'project': self.projectid,
        'spider': self.spidername,
        'state': 'pending',
    }
    get_cb, get_attempts = self.make_request_callback(0, pending_job)
    post_cb, post_attempts = self.make_request_callback(2, pending_job)
    self.mock_api(method=GET, callback=get_cb)
    self.mock_api(method=POST, callback=post_cb)

    # Act
    job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))
    job.metadata['foo'] = 'bar'
    job.metadata.save()

    # Assert: 2 failing POST attempts + 1 success
    self.assertEqual(post_attempts[0], 3)
def get_scraped_data(dir,items_job, key, spider): # establish a connection with scrapyhub and get a items generator hc = HubstorageClient(auth=key) empty, totalItems, keptItems = 0, 0, 0 for job in hc.get_project(items_job).jobq.list(spider=spider): for item in hc.get_job(job['key']).items.list(): totalItems += 1 item = pd.Series(item) if item['title'] != '' and item['article'] != '' and \ item['title'] != ' ' and item['article'] != ' ': item['spider'] = spider item = item.drop('category') item = item.replace(["page1", "page2", "page3", "scrape_time", "", "basic"], [np.nan, np.nan, np.nan, np.nan, np.nan, "reutersbasic"]) item = item.replace({'<.*?>': '', '\[.*?\]': '', '\(.*?\)': ''}, regex=True) #add article hash code as the id of the article item['id'] = hash(item['article']) #write item(as records) to a json file file = dir + 'raw/' + str(item['id']) + '.json' item.to_json(file) keptItems += 1 else: empty += 1 print '#' * 50 print 'Fetched: ', totalItems, ' from spider: ', item['spider'] print keptItems, ' were written to the folder' print '-' * 50, '\n\n'
def test_retrier_does_not_catch_unwanted_exception(self):
    """Non-retryable HTTP errors (403) surface immediately, with no retry."""
    # Prepare
    client = HubstorageClient(auth=self.auth, endpoint=self.endpoint,
                              max_retries=2, max_retry_time=1)
    pending_job = {
        'project': self.projectid,
        'spider': self.spidername,
        'state': 'pending',
    }
    callback, attempts = self.make_request_callback(3, pending_job,
                                                    http_error_status=403)
    self.mock_api(callback=callback)

    # Act
    job = metadata = caught = None
    try:
        job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))
        metadata = dict(job.metadata)
    except HTTPError as e:
        caught = e

    # Assert: the 403 propagated after a single attempt
    self.assertIsNone(metadata)
    self.assertIsNotNone(caught)
    self.assertEqual(caught.response.status_code, 403)
    self.assertEqual(attempts[0], 1)
def get_job(job):
    """Resolve *job* to a Hubstorage job, raising NotFoundException if absent."""
    jobid, apikey = get_job_specs(job)
    hs_job = HubstorageClient(auth=apikey).get_job(jobid)
    # a job with no metadata does not exist server-side
    if not hs_job.metadata:
        raise NotFoundException('Job {} does not exist'.format(jobid))
    return hs_job
def test_retrier_catches_badstatusline_and_429(self):
    """Dropped connections and HTTP 429 responses both trigger a retry."""
    # Prepare
    client = HubstorageClient(auth=self.auth, endpoint=self.endpoint,
                              max_retries=3)
    job_metadata = {
        'project': self.projectid,
        'spider': self.spidername,
        'state': 'pending',
    }
    attempts = [0]  # list gives the closure nonlocal-style mutability

    def flaky_endpoint(request):
        attempts[0] += 1
        if attempts[0] <= 2:
            # simulate a connection dropped mid-response
            raise ConnectionError("Connection aborted.", BadStatusLine("''"))
        if attempts[0] == 3:
            return (429, {}, {})
        return (200, {}, json.dumps(dict(job_metadata)))

    self.mock_api(callback=flaky_endpoint)

    # Act
    job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))

    # Assert: 2 aborted connections + 1 throttle + 1 success = 4 attempts
    self.assertEqual(dict(job_metadata), dict(job.metadata))
    self.assertEqual(attempts[0], 4)
def get_job(job):
    """Return the Hubstorage job referenced by *job*.

    Raises NotFoundException when no such job exists.
    """
    jobid, apikey = get_job_specs(job)
    client = HubstorageClient(auth=apikey)
    found = client.get_job(jobid)
    # empty metadata means the job does not exist server-side
    if not found.metadata:
        raise NotFoundException("Job {} does not exist".format(jobid))
    return found
def test_get_job_does_fails_if_no_retries(self):
    """With max_retries=0 the first 504 is raised; exactly one attempt is made."""
    # Prepare
    client = HubstorageClient(auth=self.auth, endpoint=self.endpoint,
                              max_retries=0)
    pending_job = {
        'project': self.projectid,
        'spider': self.spidername,
        'state': 'pending',
    }
    callback, attempts = self.make_request_callback(2, pending_job)
    self.mock_api(callback=callback)

    # Act
    job = metadata = caught = None
    try:
        job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))
        metadata = dict(job.metadata)
    except HTTPError as e:
        caught = e

    # Assert: no retry happened, the 504 escaped on the first attempt
    self.assertIsNone(metadata)
    self.assertIsNotNone(caught)
    self.assertEqual(caught.response.status_code, 504)
    self.assertEqual(attempts[0], 1)
def test_api_delete_can_be_set_to_non_idempotent(self):
    """apidelete(..., is_idempotent=False) must not be retried on failure."""
    # Prepare
    client = HubstorageClient(auth=self.auth, endpoint=self.endpoint,
                              max_retries=3, max_retry_time=1)
    pending_job = {
        'project': self.projectid,
        'spider': self.spidername,
        'state': 'pending',
    }
    delete_cb, delete_attempts = self.make_request_callback(2, pending_job)
    self.mock_api(method=DELETE, callback=delete_cb)

    # Act
    job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))
    caught = None
    try:
        job.metadata.apidelete('/my/non/idempotent/delete/',
                               is_idempotent=False)
    except HTTPError as e:
        caught = e

    # Assert: one failed attempt, no retries, error propagated
    self.assertEqual(delete_attempts[0], 1)
    self.assertIsNotNone(caught)
def test_retrier_catches_badstatusline_and_429(self):
    """A retrying client recovers from aborted connections and 429 throttling."""
    # Prepare
    client = HubstorageClient(auth=self.auth, endpoint=self.endpoint,
                              max_retries=3)
    job_metadata = {
        'project': self.projectid,
        'spider': self.spidername,
        'state': 'pending',
    }
    attempt_counter = [0]  # mutable cell: the closure below needs write access

    def request_callback(request):
        attempt_counter[0] += 1
        if attempt_counter[0] <= 2:
            raise ConnectionError("Connection aborted.", BadStatusLine("''"))
        elif attempt_counter[0] == 3:
            return (429, {}, {})
        return (200, {}, json.dumps(dict(job_metadata)))

    self.mock_api(callback=request_callback)

    # Act
    job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))

    # Assert: metadata finally fetched after 4 attempts total
    self.assertEqual(dict(job_metadata), dict(job.metadata))
    self.assertEqual(attempt_counter[0], 4)
def test_allows_msgpack(monkeypatch, msgpack_available, path, expected_result):
    """Only items/logs/samples may use msgpack, and only when it is available."""
    monkeypatch.setattr("hubstorage.resourcetype.MSGPACK_AVAILABLE",
                        msgpack_available)
    job = HubstorageClient().get_job("2222000/1/1")
    mpack_ok = msgpack_available and expected_result
    for resource in (job.items, job.logs, job.samples):
        assert resource._allows_mpack(path) is mpack_ok
    # these resources never negotiate msgpack, regardless of availability
    for resource in (job.requests, job.metadata, job.jobq):
        assert resource._allows_mpack(path) is False
def test_auth(self):
    """Unauthenticated access must 401 on every path; project auth must work.

    BUG FIX: the original used ``assertTrue(exc.response.status_code, 401)``,
    where 401 is merely the failure *message* -- the assertion only checked
    that the status code was truthy. ``assertEqual`` actually verifies it.
    """
    # client without global auth set
    hsc = HubstorageClient(endpoint=self.hsclient.endpoint)
    self.assertEqual(hsc.auth, None)

    # check no-auth access
    try:
        hsc.push_job(self.projectid, self.spidername)
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    try:
        hsc.get_project(self.projectid).push_job(self.spidername)
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    try:
        hsc.get_job((self.projectid, 1, 1)).items.list()
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    try:
        hsc.get_project(self.projectid).get_job(
            (self.projectid, 1, 1)).items.list()
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    # create project with auth
    auth = self.hsclient.auth
    project = hsc.get_project(self.projectid, auth)
    self.assertEqual(project.auth, auth)
    job = project.push_job(self.spidername)
    samejob = project.get_job(job.key)
    self.assertEqual(samejob.key, job.key)
def test_auth(self):
    """Every unauthenticated access path raises 401; project-level auth works.

    BUG FIX: ``assertTrue(exc.response.status_code, 401)`` never compared the
    status code to 401 (the second argument of assertTrue is the assertion
    message); replaced with ``assertEqual`` so the code is really checked.
    """
    # client without global auth set
    hsc = HubstorageClient(endpoint=self.hsclient.endpoint)
    self.assertEqual(hsc.auth, None)

    # check no-auth access
    try:
        hsc.push_job(self.projectid, self.spidername)
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    try:
        hsc.get_project(self.projectid).push_job(self.spidername)
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    try:
        hsc.get_job((self.projectid, 1, 1)).items.list()
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    try:
        hsc.get_project(self.projectid).get_job((self.projectid, 1, 1)).items.list()
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    # create project with auth
    auth = self.hsclient.auth
    project = hsc.get_project(self.projectid, auth)
    self.assertEqual(project.auth, auth)
    job = project.push_job(self.spidername)
    samejob = project.get_job(job.key)
    self.assertEqual(samejob.key, job.key)
def fetch_and_save_items():
    """Download the latest job's items for every configured spider and save them."""
    client = HubstorageClient(auth=API_KEY)
    project = client.get_project(SH_PROJECT)
    for spider in SPIDERS:
        print("\nworking on spider {}".format(spider['spider_name']))
        spider_id = project.ids.spider(spider['spider_name'])
        # one summary entry per recent job of this spider
        for summary in project.spiders.lastjobsummary(spiderid=spider_id):
            print(summary['key'])
            job = client.get_job(summary['key'])
            save_items(job.items.iter_values(), spider['institution_name'])
def test_get_job_does_retry(self):
    """Transient failures while fetching job metadata are retried to success."""
    # Prepare
    client = HubstorageClient(auth=self.auth, endpoint=self.endpoint,
                              max_retries=3)
    expected_metadata = {
        'project': self.projectid,
        'spider': self.spidername,
        'state': 'pending',
    }
    callback, attempts = self.make_request_callback(2, expected_metadata)
    self.mock_api(callback=callback)

    # Act
    job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))

    # Assert: 2 failures + 1 success = 3 attempts, metadata intact
    self.assertEqual(dict(expected_metadata), dict(job.metadata))
    self.assertEqual(attempts[0], 3)
def test_metadata_save_does_retry(self):
    """Saving job metadata retries the POST until it goes through."""
    # Prepare
    client = HubstorageClient(auth=self.auth, endpoint=self.endpoint,
                              max_retries=3)
    metadata_fixture = {
        'project': self.projectid,
        'spider': self.spidername,
        'state': 'pending',
    }
    get_callback, get_attempts = self.make_request_callback(0, metadata_fixture)
    post_callback, post_attempts = self.make_request_callback(2, metadata_fixture)
    self.mock_api(method=GET, callback=get_callback)
    self.mock_api(method=POST, callback=post_callback)

    # Act
    job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))
    job.metadata['foo'] = 'bar'
    job.metadata.save()

    # Assert: two induced failures + one success
    self.assertEqual(post_attempts[0], 3)
def test_get_job_does_retry(self):
    """get_job keeps retrying past transient errors until metadata is served."""
    # Prepare
    client = HubstorageClient(auth=self.auth, endpoint=self.endpoint,
                              max_retries=3)
    fixture = {
        'project': self.projectid,
        'spider': self.spidername,
        'state': 'pending',
    }
    callback, attempt_count = self.make_request_callback(2, fixture)
    self.mock_api(callback=callback)

    # Act
    job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))

    # Assert
    self.assertEqual(dict(fixture), dict(job.metadata))
    self.assertEqual(attempt_count[0], 3)
def test_api_delete_can_be_set_to_non_idempotent(self):
    """A non-idempotent apidelete fails fast: the error is not retried."""
    # Prepare
    client = HubstorageClient(auth=self.auth, endpoint=self.endpoint,
                              max_retries=3)
    fixture = {
        'project': self.projectid,
        'spider': self.spidername,
        'state': 'pending',
    }
    delete_callback, delete_attempts = self.make_request_callback(2, fixture)
    self.mock_api(method=DELETE, callback=delete_callback)

    # Act
    job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))
    caught = None
    try:
        job.metadata.apidelete('/my/non/idempotent/delete/', is_idempotent=False)
    except HTTPError as e:
        caught = e

    # Assert: single attempt, error surfaced to the caller
    self.assertEqual(delete_attempts[0], 1)
    self.assertIsNotNone(caught)
def test_retrier_does_not_catch_unwanted_exception(self):
    """A 403 is not a retryable condition: it escapes after one attempt."""
    # Prepare
    client = HubstorageClient(auth=self.auth, endpoint=self.endpoint,
                              max_retries=2)
    fixture = {
        'project': self.projectid,
        'spider': self.spidername,
        'state': 'pending',
    }
    callback, attempt_count = self.make_request_callback(
        3, fixture, http_error_status=403)
    self.mock_api(callback=callback)

    # Act
    job = metadata = caught = None
    try:
        job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))
        metadata = dict(job.metadata)
    except HTTPError as e:
        caught = e

    # Assert
    self.assertIsNone(metadata)
    self.assertIsNotNone(caught)
    self.assertEqual(caught.response.status_code, 403)
    self.assertEqual(attempt_count[0], 1)
def test_get_job_does_fails_on_too_many_retries(self):
    """When failures outlast max_retries, the last 504 propagates to the caller."""
    # Prepare
    client = HubstorageClient(auth=self.auth, endpoint=self.endpoint,
                              max_retries=2, max_retry_time=1)
    fixture = {
        'project': self.projectid,
        'spider': self.spidername,
        'state': 'pending',
    }
    callback, attempt_count = self.make_request_callback(3, fixture)
    self.mock_api(callback=callback)

    # Act
    job = metadata = caught = None
    try:
        job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))
        metadata = dict(job.metadata)
    except HTTPError as e:
        caught = e

    # Assert: initial attempt + 2 retries = 3 attempts, then the 504 escapes
    self.assertIsNone(metadata)
    self.assertIsNotNone(caught)
    self.assertEqual(caught.response.status_code, 504)
    self.assertEqual(attempt_count[0], 3)
def main(argv): apikey = '' spider = '' try: opts, args = getopt.getopt(argv, "hi:o", ["apikey=","spider="]) except getopt.GetoptError: print 'alljobs.py -k <API Key> -s <ProjectID\Spider>' sys.exit(2) for opt, arg in opts: if opt == '-h': print 'alljobs.py -k <API Key> -s <ProjectID\Spider>' sys.exit() elif opt in("-k", "--apikey"): apikey = arg elif opt in("-s", "--spider"): spider = arg hc = HubstorageClient(auth=apikey) itemslist = hc.get_job(spider).items.list() itemslistIterator = itemslist.__iter__() for items in itemslistIterator: print json.dumps(items)
def __init__(self, project: str, spider: str):
    """Bind this instance to the most recent job of *spider* in *project*."""
    client = HubstorageClient(auth=shub_cfg.get('apikey'))
    # jobq.list yields newest jobs first; take the first one
    latest = next(client.get_project(project).jobq.list(spider=spider))
    self.job = client.get_job(latest.get('key'))
class SystemTest(HSTestCase):
    """End-to-end exercise of the panel/runner/scraper roles against Hubstorage.

    Three differently-authenticated clients cooperate on one job lifecycle:
    the panel pushes jobs, the runner starts/finishes them, and the scraper
    writes items/logs/samples/requests using job-level auth only.
    """

    # number of items/samples/requests written per scraper run
    MAGICN = 1211

    def setUp(self):
        super(HSTestCase, self).setUp()
        endpoint = self.hsclient.endpoint
        # Panel - no client auth, only project auth using user auth token
        self.panelclient = HubstorageClient(endpoint=endpoint)
        self.panelproject = self.panelclient.get_project(self.projectid, auth=self.auth)
        # Runner - client uses global auth to poll jobq
        self.runnerclient = HubstorageClient(endpoint=endpoint, auth=self.auth)
        # Scraper - uses job level auth, no global or project auth available
        self.scraperclient = HubstorageClient(endpoint=endpoint)

    def test_succeed_with_close_reason(self):
        # A job finished with an explicit close_reason keeps that reason.
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason='all-good')
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'all-good')
        self.assertEqual(job.items.stats()['totals']['input_values'], self.MAGICN)
        self.assertEqual(job.logs.stats()['totals']['input_values'], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'], self.MAGICN)

    def test_succeed_without_close_reason(self):
        # Without a close_reason the server records the default 'no_reason'.
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason=None)
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'no_reason')
        self.assertEqual(job.items.stats()['totals']['input_values'], self.MAGICN)
        self.assertEqual(job.logs.stats()['totals']['input_values'], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'], self.MAGICN)

    def test_scraper_failure(self):
        # A scraper that raises is recorded as finished with reason 'failed'.
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason=IOError('no more resources, ha!'))
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'failed')
        # MAGICN per log level messages plus one of last failure
        stats = job.logs.stats()
        self.assertTrue(stats)
        self.assertEqual(stats['totals']['input_values'], self.MAGICN * 4 + 1)

    def _run_runner(self, pushed, close_reason):
        # Plays the runner role: start the job, delegate to the scraper,
        # then mark the job failed or finished depending on the outcome.
        job = self.runnerclient.start_job(self.projectid)
        self.assertFalse(job.metadata.get('stop_requested'))
        job.metadata.update(host='localhost', slot=1)
        self.assertEqual(job.metadata.get('state'), 'running')
        # run scraper
        try:
            self._run_scraper(job.key, job.jobauth, close_reason=close_reason)
        except Exception as exc:
            job.failed(message=str(exc))
            # logging from runner must append and never remove messages logged
            # by scraper
            self.assertTrue(job.logs.batch_append)
        else:
            job.finished()
        self.runnerclient.close()

    def _run_scraper(self, jobkey, jobauth, close_reason=None):
        # Plays the scraper role: writes MAGICN items, 4*MAGICN log lines,
        # MAGICN samples and MAGICN requests using job-level auth only.
        # If close_reason is an Exception it is raised to simulate a crash.
        httpmethods = 'GET PUT POST DELETE HEAD OPTIONS TRACE CONNECT'.split()
        job = self.scraperclient.get_job(jobkey, auth=jobauth)
        for idx in xrange(self.MAGICN):
            iid = job.items.write({'uuid': idx})
            job.logs.debug('log debug %s' % idx, idx=idx)
            job.logs.info('log info %s' % idx, idx=idx)
            job.logs.warn('log warn %s' % idx, idx=idx)
            job.logs.error('log error %s' % idx, idx=idx)
            sid = job.samples.write([idx, idx, idx])
            rid = job.requests.add(
                url='http://test.com/%d' % idx,
                status=random.randint(100, 1000),
                method=random.choice(httpmethods),
                rs=random.randint(0, 100000),
                duration=random.randint(0, 1000),
                parent=random.randrange(0, idx + 1) if idx > 10 else None,
                ts=millitime() + random.randint(100, 100000),
            )
            # writes return the sequential record index
            self.assertEqual(iid, idx)
            self.assertEqual(sid, idx)
            self.assertEqual(rid, idx)
        if isinstance(close_reason, Exception):
            self.scraperclient.close()
            raise close_reason
        if close_reason:
            job.metadata['close_reason'] = close_reason
            job.metadata.save()
        self.scraperclient.close()
        del self.scraperclient
#coding=UTF-8 from hubstorage import HubstorageClient hc = HubstorageClient(auth='bc2aa25cc40f4ed4b03988e8e0b9e89e') project = hc.get_project('53883') itemslist = hc.get_job('53883/1/5').items.list() itemslist_size = itemslist.__sizeof__() for element in itemslist: element.__delitem__('_type') element.__delitem__('_cached_page_id') element.__delitem__('_template') elementIterator = element.iteritems() for fields in elementIterator: fieldIterator = fields.__iter__() for values in fieldIterator: if isinstance(values, basestring): print values else: print values.pop()