def get_job(job):
    jobid, apikey = get_job_specs(job)
    hsc = HubstorageClient(auth=apikey)
    job = hsc.get_job(jobid)
    if not job.metadata:
        raise NotFoundException('Job {} does not exist'.format(jobid))
    return job
class _HubstorageRef(object):

    def __init__(self):
        self.enabled = 'SHUB_JOBKEY' in os.environ
        self._client = None
        self._project = None
        self._job = None
        if self.enabled:
            self.jobkey = os.environ['SHUB_JOBKEY']
            job_id = [int(id) for id in self.jobkey.split('/')]
            self._projectid, self._spiderid, self._jobcounter = job_id
        else:
            self._projectid = None
            self._spiderid = None
            self._jobcounter = None

    @property
    def auth(self):
        return to_native_str(decode(os.environ['SHUB_JOBAUTH'], 'hex_codec'))

    @property
    def endpoint(self):
        return os.environ.get('SHUB_STORAGE')

    @property
    def projectid(self):
        return self._projectid

    @property
    def spiderid(self):
        return self._spiderid

    @property
    def jobid(self):
        return self._jobcounter

    @property
    def client(self):
        from scrapinghub import HubstorageClient
        if self._client is None:
            user_agent = os.environ.get('SHUB_HS_USER_AGENT')
            self._client = HubstorageClient(endpoint=self.endpoint,
                                            auth=self.auth,
                                            user_agent=user_agent)
        return self._client

    @property
    def project(self):
        if self._project is None:
            self._project = self.client.get_project(str(self.projectid))
        return self._project

    @property
    def job(self):
        if self._job is None:
            self._job = self.project.get_job((self.spiderid, self.jobid))
        return self._job

    def close(self):
        if self._client is not None:
            self._client.close()
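# Illustrative usage sketch for the class above (not part of the original
# module): the ref is typically created once and its client/project/job are
# built lazily on first access. The variable name and the 'state' metadata
# key below are assumptions for illustration only.
hsref = _HubstorageRef()
if hsref.enabled:
    # first access to .job builds the HubstorageClient, project and job
    print(hsref.job.metadata.get('state'))
    hsref.close()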
class _Hubstorage(object):

    def __init__(self):
        self.available = "SHUB_JOBKEY" in os.environ and HubstorageClient is not None
        self._client = None
        self._project = None
        self._job = None
        if self.available:
            self.job_key = os.environ["SHUB_JOBKEY"]
            self._project_id, self._spider_id, self._job_id = map(
                int, self.job_key.split("/")
            )
        else:
            self._project_id = None
            self._spider_id = None
            self._job_id = None

    @property
    def auth(self):
        if six.PY2:
            return os.environ["SHUB_JOBAUTH"].decode("hex")
        else:
            return decode(os.environ["SHUB_JOBAUTH"], "hex_codec").decode("utf-8")

    @property
    def endpoint(self):
        return os.environ.get("SHUB_STORAGE")

    @property
    def project_id(self):
        return self._project_id

    @property
    def spider_id(self):
        return self._spider_id

    @property
    def job_id(self):
        return self._job_id

    @property
    def client(self):
        if self._client is None:
            self._client = HubstorageClient(endpoint=self.endpoint, auth=self.auth)
        return self._client

    @property
    def project(self):
        if self._project is None:
            self._project = self.client.get_project(str(self.project_id))
        return self._project

    @property
    def job(self):
        if self._job is None:
            self._job = self.project.get_job((self.spider_id, self.job_id))
        return self._job

    def close(self):
        if self._client is not None:
            self._client.close()
def client(self):
    from scrapinghub import HubstorageClient
    if self._client is None:
        user_agent = os.environ.get('SHUB_HS_USER_AGENT')
        self._client = HubstorageClient(endpoint=self.endpoint,
                                        auth=self.auth,
                                        user_agent=user_agent)
    return self._client
def test_allows_msgpack(monkeypatch, msgpack_available, path, expected_result):
    monkeypatch.setattr(
        'scrapinghub.hubstorage.resourcetype.MSGPACK_AVAILABLE', msgpack_available)
    hsclient = HubstorageClient()
    job = hsclient.get_job('2222000/1/1')
    for resource in [job.items, job.logs, job.samples]:
        assert resource._allows_mpack(path) is (msgpack_available and expected_result)
    assert job.requests._allows_mpack(path) is False
    assert job.metadata._allows_mpack(path) is False
    assert job.jobq._allows_mpack(path) is False
def _run_scraper(jobkey, jobauth, close_reason=None):
    httpmethods = 'GET PUT POST DELETE HEAD OPTIONS TRACE CONNECT'.split()
    # Scraper - uses job level auth, no global or project auth available
    client = HubstorageClient(endpoint=TEST_ENDPOINT)
    # use some fixed timestamp to represent current time
    now_ts = 1476803148638
    with closing(client) as scraperclient:
        job = scraperclient.get_job(jobkey, auth=jobauth)
        for idx in range(MAGICN):
            iid = job.items.write({'uuid': idx})
            job.logs.debug('log debug %s' % idx, idx=idx)
            job.logs.info('log info %s' % idx, idx=idx)
            job.logs.warn('log warn %s' % idx, idx=idx)
            job.logs.error('log error %s' % idx, idx=idx)
            sid = job.samples.write([idx, idx, idx])
            rid = job.requests.add(
                url='http://test.com/%d' % idx,
                status=random.randint(100, 1000),
                method=random.choice(httpmethods),
                rs=random.randint(0, 100000),
                duration=random.randint(0, 1000),
                parent=random.randrange(0, idx + 1) if idx > 10 else None,
                ts=now_ts + 100 + idx,
            )
            assert iid == idx
            assert sid == idx
            assert rid == idx

        if isinstance(close_reason, Exception):
            raise close_reason

        if close_reason:
            job.metadata['close_reason'] = close_reason

        job.metadata.save()
def hsclient_with_retries(max_retries=3, max_retry_time=1):
    return HubstorageClient(
        auth=TEST_AUTH, endpoint=TEST_ENDPOINT,
        max_retries=max_retries, max_retry_time=max_retry_time,
    )
def test_auth(hsclient, json_and_msgpack):
    # client without global auth set
    hsc = HubstorageClient(endpoint=hsclient.endpoint,
                           use_msgpack=hsclient.use_msgpack)
    assert hsc.auth is None

    # check no-auth access
    try:
        hsc.push_job(TEST_PROJECT_ID, TEST_SPIDER_NAME)
    except HTTPError as exc:
        assert exc.response.status_code == 401
    else:
        raise AssertionError('401 not raised')

    try:
        hsc.get_project(TEST_PROJECT_ID).push_job(TEST_SPIDER_NAME)
    except HTTPError as exc:
        assert exc.response.status_code == 401
    else:
        raise AssertionError('401 not raised')

    try:
        hsc.get_job((TEST_PROJECT_ID, 1, 1)).items.list()
    except HTTPError as exc:
        assert exc.response.status_code == 401
    else:
        raise AssertionError('401 not raised')

    try:
        hsc.get_project(TEST_PROJECT_ID).get_job(
            (TEST_PROJECT_ID, 1, 1)).items.list()
    except HTTPError as exc:
        assert exc.response.status_code == 401
    else:
        raise AssertionError('401 not raised')

    # create project with auth
    auth = hsclient.auth
    project = hsc.get_project(TEST_PROJECT_ID, auth)
    assert project.auth == auth
    job = project.push_job(TEST_SPIDER_NAME)
    samejob = project.get_job(job.key)
    assert samejob.key == job.key
class DiscoveryProcessorMixin(object):

    def get_previous_job(self, attr):
        if not hasattr(self, attr):
            raise AttributeError(
                'You should specify a {attr} argument to the job'.format(
                    attr=attr
                )
            )
        job_id = getattr(self, attr)
        auth = self.crawler.settings.get('SCRAPINGHUB_APIKEY')
        hc = HubstorageClient(auth=auth)
        return hc.get_job(job_id)
def _run_runner(hsproject, pushed, close_reason):
    client = HubstorageClient(endpoint=TEST_ENDPOINT, auth=TEST_AUTH)
    with closing(client) as runnerclient:
        job = start_job(hsproject)
        assert not job.metadata.get('stop_requested')
        job.metadata.update(host='localhost', slot=1)
        assert job.metadata.get('state') == 'running'
        # run scraper
        try:
            _run_scraper(job.key, job.jobauth, close_reason=close_reason)
        except Exception as exc:
            job.logs.error(message=str(exc), appendmode=True)
            job.close_writers()
            job.jobq.finish(job, close_reason='failed')
            # logging from runner must append and never remove messages logged
            # by scraper
            assert job.logs.batch_append
        else:
            job.jobq.finish(job, close_reason=close_reason or 'no_reason')
def client(self):
    if self._client is None:
        self._client = HubstorageClient(endpoint=self.endpoint, auth=self.auth)
    return self._client
def setUpClass(cls):
    cls.endpoint = HS_ENDPOINT
    cls.auth = HS_AUTH
    cls.hsclient = HubstorageClient(auth=cls.auth, endpoint=cls.endpoint)
    cls.project = cls.hsclient.get_project(cls.projectid)
    cls.fclient = cls.project.frontier
def client(self):
    from scrapinghub import HubstorageClient
    if self._client is None:
        self._client = HubstorageClient(endpoint=self.endpoint, auth=self.auth)
    return self._client
def hsclient():
    return HubstorageClient(auth=TEST_AUTH, endpoint=TEST_ENDPOINT)
class HcfMiddleware(object):

    def __init__(self, crawler):
        settings = crawler.settings
        self.hs_endpoint = settings.get("HS_ENDPOINT")
        self.hs_auth = self._get_config(settings, "HS_AUTH")
        self.hs_projectid = self._get_config(settings, "HS_PROJECTID",
                                              os.environ.get('SCRAPY_PROJECT_ID'))
        self.hs_frontier = self._get_config(settings, "HS_FRONTIER")
        self.hs_consume_from_slot = self._get_config(settings, "HS_CONSUME_FROM_SLOT")
        self.hs_number_of_slots = settings.getint("HS_NUMBER_OF_SLOTS",
                                                  DEFAULT_HS_NUMBER_OF_SLOTS)
        self.hs_max_links = settings.getint("HS_MAX_LINKS", DEFAULT_MAX_LINKS)
        self.hs_start_job_enabled = settings.getbool("HS_START_JOB_ENABLED", False)
        self.hs_start_job_on_reason = settings.getlist("HS_START_JOB_ON_REASON",
                                                       ['finished'])

        conn = Connection(self.hs_auth)
        self.panel_project = conn[self.hs_projectid]

        self.hsclient = HubstorageClient(auth=self.hs_auth, endpoint=self.hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        self.new_links = defaultdict(set)
        self.batch_ids = []

        crawler.signals.connect(self.close_spider, signals.spider_closed)

        # Make sure the logger for hubstorage.batchuploader is configured
        logging.basicConfig()
        self.logger = logging.getLogger("HCF")

    def _get_config(self, settings, key, default=None):
        value = settings.get(key, default)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg, level=logging.INFO):
        self.logger.log(level, msg)

    def start_job(self, spider):
        self._msg("Starting new job for: %s" % spider.name)
        jobid = self.panel_project.schedule(
            spider.name,
            hs_consume_from_slot=self.hs_consume_from_slot,
            dummy=datetime.now()
        )
        self._msg("New job started: %s" % jobid)
        return jobid

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):
        self.hs_frontier = getattr(spider, 'hs_frontier', self.hs_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)

        self.hs_consume_from_slot = getattr(spider, 'hs_consume_from_slot',
                                            self.hs_consume_from_slot)
        self._msg('Using HS_CONSUME_FROM_SLOT=%s' % self.hs_consume_from_slot)

        self.has_new_requests = False
        for req in self._get_new_requests():
            self.has_new_requests = True
            yield req

        # if there are no links in the hcf, use the start_requests
        # unless this is not the first job.
        if not self.has_new_requests and not getattr(spider, 'dummy', None):
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                if request.meta.get('use_hcf', False):
                    if request.method == 'GET':  # XXX: Only GET support for now.
                        slot = slot_callback(request)
                        if request.url not in self.new_links[slot]:
                            hcf_params = request.meta.get('hcf_params')
                            fp = {'fp': request.url}
                            if hcf_params:
                                fp.update(hcf_params)
                            # Save the new links as soon as possible using
                            # the batch uploader
                            self.fclient.add(self.hs_frontier, slot, [fp])
                            self.new_links[slot].add(request.url)
                    else:
                        self._msg("'use_hcf' meta key is not supported for non GET "
                                  "requests (%s)" % request.url, logging.ERROR)
                        yield request
                else:
                    yield request
            else:
                yield item

    def close_spider(self, spider, reason):
        # Only store the results if the spider finished normally; if it didn't
        # finish properly there is no way to know whether all the url batches
        # were processed, and it is better not to delete them from the frontier
        # (so they will be picked up by another process).
        if reason == 'finished':
            self._save_new_links_count()
            self._delete_processed_ids()

        # Close the frontier client in order to make sure that all the new links
        # are stored.
        self.fclient.close()
        self.hsclient.close()

        # If the reason is defined in the hs_start_job_on_reason list then start
        # a new job right after this spider is finished.
        if self.hs_start_job_enabled and reason in self.hs_start_job_on_reason:
            # Start the new job if this job had requests from the HCF or it
            # was the first job.
            if self.has_new_requests or not getattr(spider, 'dummy', None):
                self.start_job(spider)

    def _get_new_requests(self):
        """ Get a new batch of links from the HCF."""
        num_batches = 0
        num_links = 0
        for num_batches, batch in enumerate(
                self.fclient.read(self.hs_frontier, self.hs_consume_from_slot), 1):
            for fingerprint, data in batch['requests']:
                num_links += 1
                yield Request(url=fingerprint, meta={'hcf_params': {'qdata': data}})
            self.batch_ids.append(batch['id'])
            if num_links >= self.hs_max_links:
                break
        self._msg('Read %d new batches from slot(%s)' % (num_batches,
                                                         self.hs_consume_from_slot))
        self._msg('Read %d new links from slot(%s)' % (num_links,
                                                       self.hs_consume_from_slot))

    def _save_new_links_count(self):
        """ Save the new extracted links into the HCF."""
        for slot, new_links in self.new_links.items():
            self._msg('Stored %d new links in slot(%s)' % (len(new_links), slot))
        self.new_links = defaultdict(set)

    def _delete_processed_ids(self):
        """ Delete in the HCF the ids of the processed batches."""
        self.fclient.delete(self.hs_frontier, self.hs_consume_from_slot, self.batch_ids)
        self._msg('Deleted %d processed batches in slot(%s)' % (len(self.batch_ids),
                                                                self.hs_consume_from_slot))
        self.batch_ids = []

    def _get_slot(self, request):
        """ Determine to which slot the request should be saved."""
        md5 = hashlib.md5()
        md5.update(request.url)
        digest = md5.hexdigest()
        return str(int(digest, 16) % self.hs_number_of_slots)
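# Hedged configuration sketch for the HcfMiddleware above: these are the
# settings keys its __init__ reads; the values and the middleware import path
# are placeholders (assumptions), not taken from the original source.
SPIDER_MIDDLEWARES = {
    'myproject.middlewares.HcfMiddleware': 543,   # placeholder path and order
}
HS_AUTH = '<apikey>'                    # required (NotConfigured if missing)
HS_PROJECTID = '<project id>'           # required unless SCRAPY_PROJECT_ID env var is set
HS_FRONTIER = '<frontier name>'         # required
HS_CONSUME_FROM_SLOT = '0'              # required: slot to read batches from
HS_ENDPOINT = None                      # optional Hubstorage endpoint
HS_NUMBER_OF_SLOTS = 8                  # optional; falls back to DEFAULT_HS_NUMBER_OF_SLOTS
HS_MAX_LINKS = 100                      # optional; falls back to DEFAULT_MAX_LINKS
HS_START_JOB_ENABLED = False            # optional
HS_START_JOB_ON_REASON = ['finished']   # optional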
def test_allows_msgpack(monkeypatch, msgpack_available, path, expected_result):
    monkeypatch.setattr(
        'scrapinghub.hubstorage.collectionsrt.MSGPACK_AVAILABLE', msgpack_available)
    hsclient = HubstorageClient()
    collections = hsclient.get_project(2222000).collections
    assert collections._allows_mpack(path) is (msgpack_available and expected_result)
def test_custom_ua():
    client = HubstorageClient(auth=TEST_AUTH,
                              endpoint=TEST_ENDPOINT,
                              user_agent='testUA')
    assert client.user_agent == 'testUA'
def panelclient():
    # Panel - no client auth, only project auth using user auth token
    return HubstorageClient(endpoint=TEST_ENDPOINT)