Example no. 1
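The listing omits its module preamble; a plausible set of imports for this middleware, assuming Scrapy's old-style log module and the python-hubstorage client (the DEFAULT_MAX_BATCHES value below is an assumption, the original defines it elsewhere in the module), would be:

# Assumed module context for this example -- not part of the original listing.
from collections import defaultdict

from scrapy import signals, log
from scrapy.exceptions import NotConfigured, DontCloseSpider
from scrapy.http import Request

from hubstorage import HubstorageClient

# Module-level default; the value here is an assumption.
DEFAULT_MAX_BATCHES = 10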
class HcfMiddleware(object):

    def __init__(self, crawler):

        self.crawler = crawler
        hs_endpoint = self._get_config(crawler, "HS_ENDPOINT")
        hs_auth = self._get_config(crawler, "HS_AUTH")
        self.hs_projectid = self._get_config(crawler, "HS_PROJECTID")
        self.hs_project_frontier = self._get_config(crawler, "HS_FRONTIER")
        self.hs_project_slot = self._get_config(crawler, "HS_SLOT")
        # Max number of batches to read from the HCF within a single run.
        try:
            self.hs_max_batches = int(crawler.settings.get("HS_MAX_BATCHES", DEFAULT_MAX_BATCHES))
        except ValueError:
            self.hs_max_batches = DEFAULT_MAX_BATCHES
        self.hs_start_job_on_reason = crawler.settings.get("HS_START_JOB_ON_REASON", [])

        self.hsclient = HubstorageClient(auth=hs_auth, endpoint=hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        self.new_links = defaultdict(list)
        self.batch_ids = []

        crawler.signals.connect(self.idle_spider, signals.spider_idle)
        crawler.signals.connect(self.close_spider, signals.spider_closed)

    def _get_config(self, crawler, key):
        value = crawler.settings.get(key)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg):
        log.msg('(HCF) %s' % msg)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):

        self.hs_frontier = getattr(spider, 'frontier', self.hs_project_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)

        self.hs_slot = getattr(spider, 'slot', self.hs_project_slot)
        self._msg('Using HS_SLOT=%s' % self.hs_slot)

        has_new_requests = False
        for req in self._get_new_requests():
            has_new_requests = True
            yield req

        # if there are no links in the hcf, use the start_requests
        if not has_new_requests:
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                if (request.method == 'GET' and  # XXX: Only GET support for now.
                    request.meta.get('use_hcf', False)):
                    slot = slot_callback(request)
                    hcf_params = request.meta.get('hcf_params')
                    fp = {'fp': request.url}
                    if hcf_params:
                        fp.update(hcf_params)
                    self.new_links[slot].append(fp)
                else:
                    yield item
            else:
                yield item

    def idle_spider(self, spider):
        self._save_new_links()
        self.fclient.flush()
        self._delete_processed_ids()
        has_new_requests = False
        for request in self._get_new_requests():
            self.crawler.engine.schedule(request, spider)
            has_new_requests = True
        if has_new_requests:
            raise DontCloseSpider

    def close_spider(self, spider, reason):
        # Only store the results if the spider finished normally. If it
        # didn't finish properly there is no way to know whether all the url batches
        # were processed, so it is better not to delete them from the frontier
        # (they will be picked up by another process).
        if reason == 'finished':
            self._save_new_links()
            self._delete_processed_ids()

        # If the reason is defined in the hs_start_job_on_reason list then start
        # a new job right after this spider is finished. The idea is to limit
        # each spider's runtime (either via itemcount, pagecount or timeout) and
        # then have the old spider start a new one to take its place in the slot.
        if reason in self.hs_start_job_on_reason:
            self._msg("Starting new job for: %s" % spider.name)
            job = self.hsclient.start_job(projectid=self.hs_projectid,
                                          spider=spider.name)
            self._msg("New job started: %s" % job)
        self.fclient.close()
        self.hsclient.close()

    def _get_new_requests(self):
        """ Get a new batch of links from the HCF."""
        num_batches = 0
        num_links = 0
        for num_batches, batch in enumerate(self.fclient.read(self.hs_frontier, self.hs_slot), 1):
            for r in batch['requests']:
                num_links += 1
                yield Request(r[0])
            self.batch_ids.append(batch['id'])
            if num_batches >= self.hs_max_batches:
                break
        self._msg('Read %d new batches from slot(%s)' % (num_batches, self.hs_slot))
        self._msg('Read %d new links from slot(%s)' % (num_links, self.hs_slot))

    def _save_new_links(self):
        """ Save the new extracted links into the HCF."""
        for slot, fps in self.new_links.items():
            self.fclient.add(self.hs_frontier, slot, fps)
            self._msg('Stored %d new links in slot(%s)' % (len(fps), slot))
        self.new_links = defaultdict(list)

    def _delete_processed_ids(self):
        """ Delete in the HCF the ids of the processed batches."""
        self.fclient.delete(self.hs_frontier, self.hs_slot, self.batch_ids)
        self._msg('Deleted %d processed batches in slot(%s)' % (len(self.batch_ids),
                                                                self.hs_slot))
        self.batch_ids = []

    def _get_slot(self, request):
        """ Determine to which slot should be saved the request."""
        return '0'
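A minimal usage sketch for the middleware above, assuming it is installed as a Scrapy spider middleware; the module path, priority and setting values are placeholders, while the HS_* settings, the per-spider frontier/slot attributes and the use_hcf / hcf_params meta keys all come from the code shown:

# settings.py (sketch)
SPIDER_MIDDLEWARES = {
    'myproject.middlewares.HcfMiddleware': 543,   # assumed module path/priority
}
HS_ENDPOINT = 'http://localhost:8003'             # placeholder values
HS_AUTH = 'apikey'
HS_PROJECTID = '1111'
HS_FRONTIER = 'test'
HS_SLOT = '0'
HS_MAX_BATCHES = 10

# spider (sketch) -- BaseSpider matches the old Scrapy API this code targets.
from scrapy.spider import BaseSpider
from scrapy.http import Request

class ExampleSpider(BaseSpider):
    name = 'example'
    frontier = 'test'   # optional per-spider override read by process_start_requests
    slot = '0'          # optional per-spider override

    def parse(self, response):
        # Requests flagged with 'use_hcf' are stored in the frontier instead of
        # being scheduled locally; extra fields travel via 'hcf_params'.
        yield Request('http://www.example.com/other',
                      meta={'use_hcf': True,
                            'hcf_params': {'qdata': {'depth': 1}}})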
Example no. 2
class HcfMiddleware(object):
    def __init__(self, crawler):

        self.crawler = crawler
        hs_endpoint = self._get_config(crawler, "HS_ENDPOINT")
        hs_auth = self._get_config(crawler, "HS_AUTH")
        self.hs_projectid = self._get_config(crawler, "HS_PROJECTID")
        self.hs_project_frontier = self._get_config(crawler, "HS_FRONTIER")
        self.hs_project_slot = self._get_config(crawler, "HS_SLOT")
        # Max number of batches to read from the HCF within a single run.
        try:
            self.hs_max_batches = int(
                crawler.settings.get("HS_MAX_BATCHES", DEFAULT_MAX_BATCHES))
        except ValueError:
            self.hs_max_batches = DEFAULT_MAX_BATCHES
        self.hs_start_job_on_reason = crawler.settings.get(
            "HS_START_JOB_ON_REASON", [])

        self.hsclient = HubstorageClient(auth=hs_auth, endpoint=hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        self.new_links = defaultdict(list)
        self.batch_ids = []

        crawler.signals.connect(self.idle_spider, signals.spider_idle)
        crawler.signals.connect(self.close_spider, signals.spider_closed)

    def _get_config(self, crawler, key):
        value = crawler.settings.get(key)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg):
        log.msg('(HCF) %s' % msg)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):

        self.hs_frontier = getattr(spider, 'frontier',
                                   self.hs_project_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)

        self.hs_slot = getattr(spider, 'slot', self.hs_project_slot)
        self._msg('Using HS_SLOT=%s' % self.hs_slot)

        has_new_requests = False
        for req in self._get_new_requests():
            has_new_requests = True
            yield req

        # if there are no links in the hcf, use the start_requests
        if not has_new_requests:
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                if (request.method == 'GET'  # XXX: Only GET support for now.
                        and request.meta.get('use_hcf', False)):
                    slot = slot_callback(request)
                    hcf_params = request.meta.get('hcf_params')
                    fp = {'fp': request.url}
                    if hcf_params:
                        fp.update(hcf_params)
                    self.new_links[slot].append(fp)
                else:
                    yield item
            else:
                yield item

    def idle_spider(self, spider):
        self._save_new_links()
        self.fclient.flush()
        self._delete_processed_ids()
        has_new_requests = False
        for request in self._get_new_requests():
            self.crawler.engine.schedule(request, spider)
            has_new_requests = True
        if has_new_requests:
            raise DontCloseSpider

    def close_spider(self, spider, reason):
        # Only store the results if the spider finished normally. If it
        # didn't finish properly there is no way to know whether all the url batches
        # were processed, so it is better not to delete them from the frontier
        # (they will be picked up by another process).
        if reason == 'finished':
            self._save_new_links()
            self._delete_processed_ids()

        # If the reason is defined in the hs_start_job_on_reason list then start
        # a new job right after this spider is finished. The idea is to limit
        # each spider's runtime (either via itemcount, pagecount or timeout) and
        # then have the old spider start a new one to take its place in the slot.
        if reason in self.hs_start_job_on_reason:
            self._msg("Starting new job for: %s" % spider.name)
            job = self.hsclient.start_job(projectid=self.hs_projectid,
                                          spider=spider.name)
            self._msg("New job started: %s" % job)
        self.fclient.close()
        self.hsclient.close()

    def _get_new_requests(self):
        """ Get a new batch of links from the HCF."""
        num_batches = 0
        num_links = 0
        for num_batches, batch in enumerate(
                self.fclient.read(self.hs_frontier, self.hs_slot), 1):
            for r in batch['requests']:
                num_links += 1
                yield Request(r[0])
            self.batch_ids.append(batch['id'])
            if num_batches >= self.hs_max_batches:
                break
        self._msg('Read %d new batches from slot(%s)' %
                  (num_batches, self.hs_slot))
        self._msg('Read %d new links from slot(%s)' %
                  (num_links, self.hs_slot))

    def _save_new_links(self):
        """ Save the new extracted links into the HCF."""
        for slot, fps in self.new_links.items():
            self.fclient.add(self.hs_frontier, slot, fps)
            self._msg('Stored %d new links in slot(%s)' % (len(fps), slot))
        self.new_links = defaultdict(list)

    def _delete_processed_ids(self):
        """ Delete in the HCF the ids of the processed batches."""
        self.fclient.delete(self.hs_frontier, self.hs_slot, self.batch_ids)
        self._msg('Deleted %d processed batches in slot(%s)' %
                  (len(self.batch_ids), self.hs_slot))
        self.batch_ids = []

    def _get_slot(self, request):
        """ Determine to which slot should be saved the request."""
        return '0'
Example no. 3
class HcfMiddleware(object):
    def __init__(self, crawler):

        self.crawler = crawler
        self.hs_endpoint = crawler.settings.get("HS_ENDPOINT")
        self.hs_auth = self._get_config(crawler, "HS_AUTH")
        self.hs_projectid = self._get_config(crawler, "HS_PROJECTID")
        self.hs_frontier = self._get_config(crawler, "HS_FRONTIER")
        self.hs_consume_from_slot = self._get_config(crawler,
                                                     "HS_CONSUME_FROM_SLOT")
        try:
            self.hs_number_of_slots = int(
                crawler.settings.get("HS_NUMBER_OF_SLOTS",
                                     DEFAULT_HS_NUMBER_OF_SLOTS))
        except ValueError:
            self.hs_number_of_slots = DEFAULT_HS_NUMBER_OF_SLOTS
        try:
            self.hs_max_links = int(
                crawler.settings.get("HS_MAX_LINKS", DEFAULT_MAX_LINKS))
        except ValueError:
            self.hs_max_links = DEFAULT_MAX_LINKS
        self.hs_start_job_enabled = crawler.settings.get(
            "HS_START_JOB_ENABLED", False)
        self.hs_start_job_on_reason = crawler.settings.get(
            "HS_START_JOB_ON_REASON", ['finished'])
        self.hs_start_job_new_panel = crawler.settings.get(
            "HS_START_JOB_NEW_PANEL", False)

        if not self.hs_start_job_new_panel:
            conn = Connection(self.hs_auth)
            self.oldpanel_project = conn[self.hs_projectid]

        self.hsclient = HubstorageClient(auth=self.hs_auth,
                                         endpoint=self.hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        self.new_links_count = defaultdict(int)
        self.batch_ids = []

        crawler.signals.connect(self.close_spider, signals.spider_closed)

        # Make sure the logger for hubstorage.batchuploader is configured
        logging.basicConfig()

    def _get_config(self, crawler, key):
        value = crawler.settings.get(key)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg, level=log.INFO):
        log.msg('(HCF) %s' % msg, level)

    def _start_job(self, spider):
        self._msg("Starting new job for: %s" % spider.name)
        if self.hs_start_job_new_panel:
            jobid = self.hsclient.start_job(projectid=self.hs_projectid,
                                            spider=spider.name)
        else:
            jobid = self.oldpanel_project.schedule(
                spider.name,
                slot=self.hs_consume_from_slot,
                dummy=datetime.now())
        self._msg("New job started: %s" % jobid)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):

        self.hs_frontier = getattr(spider, 'hs_frontier', self.hs_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)

        self.hs_consume_from_slot = getattr(spider, 'hs_consume_from_slot',
                                            self.hs_consume_from_slot)
        self._msg('Using HS_CONSUME_FROM_SLOT=%s' % self.hs_consume_from_slot)

        self.has_new_requests = False
        for req in self._get_new_requests():
            self.has_new_requests = True
            yield req

        # If there are no links in the hcf, use the start_requests, but only
        # on the first job (subsequent jobs are scheduled with a 'dummy' argument).
        if not self.has_new_requests and not getattr(spider, 'dummy', None):
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                if request.meta.get('use_hcf', False):
                    if request.method == 'GET':  # XXX: Only GET support for now.
                        slot = slot_callback(request)
                        hcf_params = request.meta.get('hcf_params')
                        fp = {'fp': request.url}
                        if hcf_params:
                            fp.update(hcf_params)
                        # Save the new links as soon as possible using
                        # the batch uploader
                        self.fclient.add(self.hs_frontier, slot, [fp])
                        self.new_links_count[slot] += 1
                    else:
                        self._msg(
                            "'use_hcf' meta key is not supported for non GET requests (%s)"
                            % request.url, log.ERROR)
                        yield request
                else:
                    yield request
            else:
                yield item

    def close_spider(self, spider, reason):
        # Only store the results if the spider finished normally. If it
        # didn't finish properly there is no way to know whether all the url batches
        # were processed, so it is better not to delete them from the frontier
        # (they will be picked up by another process).
        if reason == 'finished':
            self._save_new_links_count()
            self._delete_processed_ids()

        # Close the frontier client in order to make sure that all the new links
        # are stored.
        self.fclient.close()
        self.hsclient.close()

        # If the reason is defined in the hs_start_job_on_reason list then start
        # a new job right after this spider is finished.
        if self.hs_start_job_enabled and reason in self.hs_start_job_on_reason:

            # Start the new job if this job had requests from the HCF or it
            # was the first job.
            if self.has_new_requests or not getattr(spider, 'dummy', None):
                self._start_job(spider)

    def _get_new_requests(self):
        """ Get a new batch of links from the HCF."""
        num_batches = 0
        num_links = 0
        for num_batches, batch in enumerate(
                self.fclient.read(self.hs_frontier, self.hs_consume_from_slot),
                1):
            for fingerprint, data in batch['requests']:
                num_links += 1
                yield Request(url=fingerprint,
                              meta={'hcf_params': {
                                  'qdata': data
                              }})
            self.batch_ids.append(batch['id'])
            if num_links >= self.hs_max_links:
                break
        self._msg('Read %d new batches from slot(%s)' %
                  (num_batches, self.hs_consume_from_slot))
        self._msg('Read %d new links from slot(%s)' %
                  (num_links, self.hs_consume_from_slot))

    def _save_new_links_count(self):
        """ Save the new extracted links into the HCF."""
        for slot, link_count in self.new_links_count.items():
            self._msg('Stored %d new links in slot(%s)' % (link_count, slot))
        self.new_links_count = defaultdict(list)

    def _delete_processed_ids(self):
        """ Delete in the HCF the ids of the processed batches."""
        self.fclient.delete(self.hs_frontier, self.hs_consume_from_slot,
                            self.batch_ids)
        self._msg('Deleted %d processed batches in slot(%s)' %
                  (len(self.batch_ids), self.hs_consume_from_slot))
        self.batch_ids = []

    def _get_slot(self, request):
        """ Determine to which slot should be saved the request."""
        md5 = hashlib.md5()
        md5.update(request.url)
        digest = md5.hexdigest()
        return str(int(digest, 16) % self.hs_number_of_slots)
Example no. 4
class SystemTest(HSTestCase):

    MAGICN = 1211

    def setUp(self):
        super(HSTestCase, self).setUp()
        endpoint = self.hsclient.endpoint
        # Panel - no client auth, only project auth using user auth token
        self.panelclient = HubstorageClient(endpoint=endpoint)
        self.panelproject = self.panelclient.get_project(self.projectid,
                                                         auth=self.auth)
        # Runner - client uses global auth to poll jobq
        self.runnerclient = HubstorageClient(endpoint=endpoint, auth=self.auth)
        # Scraper - uses job level auth, no global or project auth available
        self.scraperclient = HubstorageClient(endpoint=endpoint)

    def test_succeed_with_close_reason(self):
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason='all-good')
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'all-good')
        self.assertEqual(job.items.stats()['totals']['input_values'],
                         self.MAGICN)
        self.assertEqual(job.logs.stats()['totals']['input_values'],
                         self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'],
                         self.MAGICN)

    def test_succeed_without_close_reason(self):
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason=None)
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'no_reason')
        self.assertEqual(job.items.stats()['totals']['input_values'],
                         self.MAGICN)
        self.assertEqual(job.logs.stats()['totals']['input_values'],
                         self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'],
                         self.MAGICN)

    def test_scraper_failure(self):
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed,
                         close_reason=IOError('no more resources, ha!'))
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'failed')
        # MAGICN messages per log level, plus one for the final failure
        stats = job.logs.stats()
        self.assertTrue(stats)
        self.assertEqual(stats['totals']['input_values'], self.MAGICN * 4 + 1)

    def _run_runner(self, pushed, close_reason):
        job = self.runnerclient.start_job(self.projectid)
        self.assertFalse(job.metadata.get('stop_requested'))
        job.metadata.update(host='localhost', slot=1)
        self.assertEqual(job.metadata.get('state'), 'running')
        # run scraper
        try:
            self._run_scraper(job.key, job.jobauth, close_reason=close_reason)
        except Exception as exc:
            job.failed(message=str(exc))
            # logging from runner must append and never remove messages logged
            # by scraper
            self.assertTrue(job.logs.batch_append)
        else:
            job.finished()

        self.runnerclient.close()

    def _run_scraper(self, jobkey, jobauth, close_reason=None):
        httpmethods = 'GET PUT POST DELETE HEAD OPTIONS TRACE CONNECT'.split()
        job = self.scraperclient.get_job(jobkey, auth=jobauth)
        for idx in xrange(self.MAGICN):
            iid = job.items.write({'uuid': idx})
            job.logs.debug('log debug %s' % idx, idx=idx)
            job.logs.info('log info %s' % idx, idx=idx)
            job.logs.warn('log warn %s' % idx, idx=idx)
            job.logs.error('log error %s' % idx, idx=idx)
            sid = job.samples.write([idx, idx, idx])
            rid = job.requests.add(
                url='http://test.com/%d' % idx,
                status=random.randint(100, 1000),
                method=random.choice(httpmethods),
                rs=random.randint(0, 100000),
                duration=random.randint(0, 1000),
                parent=random.randrange(0, idx + 1) if idx > 10 else None,
                ts=millitime() + random.randint(100, 100000),
            )
            self.assertEqual(iid, idx)
            self.assertEqual(sid, idx)
            self.assertEqual(rid, idx)

        if isinstance(close_reason, Exception):
            self.scraperclient.close()
            raise close_reason

        if close_reason:
            job.metadata['close_reason'] = close_reason

        job.metadata.save()
        self.scraperclient.close()
        del self.scraperclient
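Stripped of assertions, the job lifecycle the test above exercises with its three differently-authenticated clients looks roughly like this; endpoint, auth token, project id and spider name are placeholders, and every call used here appears in the test itself:

from hubstorage import HubstorageClient

ENDPOINT, AUTH, PROJECTID = 'http://localhost:8003', 'apikey', '1111'  # placeholders

# Panel: no client-level auth, project-level auth only; enqueue a job.
panel = HubstorageClient(endpoint=ENDPOINT)
project = panel.get_project(PROJECTID, auth=AUTH)
project.jobq.push('myspider')

# Runner: global auth; pop the pending job and record where it runs.
runner = HubstorageClient(endpoint=ENDPOINT, auth=AUTH)
job = runner.start_job(PROJECTID)
job.metadata.update(host='localhost', slot=1)

# Scraper: job-level auth only; write items/logs and set a close reason.
scraper = HubstorageClient(endpoint=ENDPOINT)
scraper_job = scraper.get_job(job.key, auth=job.jobauth)
scraper_job.items.write({'uuid': 0})
scraper_job.logs.info('scraped one item')
scraper_job.metadata['close_reason'] = 'all-good'
scraper_job.metadata.save()
scraper.close()

# Runner marks the job finished (or calls job.failed(message=...) on error).
job.finished()
runner.close()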
Example no. 5
class HcfMiddleware(object):

    def __init__(self, crawler):
        settings = crawler.settings
        self.hs_endpoint = settings.get("HS_ENDPOINT")
        self.hs_auth = self._get_config(settings, "HS_AUTH")
        self.hs_projectid = self._get_config(settings, "HS_PROJECTID")
        self.hs_frontier = self._get_config(settings, "HS_FRONTIER")
        self.hs_consume_from_slot = self._get_config(settings, "HS_CONSUME_FROM_SLOT")
        self.hs_number_of_slots = settings.getint("HS_NUMBER_OF_SLOTS", DEFAULT_HS_NUMBER_OF_SLOTS)
        self.hs_max_links = settings.getint("HS_MAX_LINKS", DEFAULT_MAX_LINKS)
        self.hs_start_job_enabled = settings.getbool("HS_START_JOB_ENABLED", False)
        self.hs_start_job_on_reason = settings.getlist("HS_START_JOB_ON_REASON", ['finished'])
        self.hs_start_job_new_panel = settings.getbool("HS_START_JOB_NEW_PANEL", False)

        if not self.hs_start_job_new_panel:
            conn = Connection(self.hs_auth)
            self.oldpanel_project = conn[self.hs_projectid]

        self.hsclient = HubstorageClient(auth=self.hs_auth, endpoint=self.hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        self.new_links = defaultdict(set)
        self.batch_ids = []

        crawler.signals.connect(self.close_spider, signals.spider_closed)

        # Make sure the logger for hubstorage.batchuploader is configured
        logging.basicConfig()

    def _get_config(self, settings, key):
        value = settings.get(key)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg, level=log.INFO):
        log.msg('(HCF) %s' % msg, level)

    def _start_job(self, spider):
        self._msg("Starting new job for: %s" % spider.name)
        if self.hs_start_job_new_panel:
            jobid = self.hsclient.start_job(projectid=self.hs_projectid,
                                          spider=spider.name)
        else:
            jobid = self.oldpanel_project.schedule(spider.name, slot=self.hs_consume_from_slot,
                                                   dummy=datetime.now())
        self._msg("New job started: %s" % jobid)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):

        self.hs_frontier = getattr(spider, 'hs_frontier', self.hs_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)

        self.hs_consume_from_slot = getattr(spider, 'hs_consume_from_slot', self.hs_consume_from_slot)
        self._msg('Using HS_CONSUME_FROM_SLOT=%s' % self.hs_consume_from_slot)

        self.has_new_requests = False
        for req in self._get_new_requests():
            self.has_new_requests = True
            yield req

        # If there are no links in the hcf, use the start_requests, but only
        # on the first job (subsequent jobs are scheduled with a 'dummy' argument).
        if not self.has_new_requests and not getattr(spider, 'dummy', None):
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                if request.meta.get('use_hcf', False):
                    if request.method == 'GET':  # XXX: Only GET support for now.
                        slot = slot_callback(request)
                        if request.url not in self.new_links[slot]:
                            hcf_params = request.meta.get('hcf_params')
                            fp = {'fp': request.url}
                            if hcf_params:
                                fp.update(hcf_params)
                            # Save the new links as soon as possible using
                            # the batch uploader
                            self.fclient.add(self.hs_frontier, slot, [fp])
                            self.new_links[slot].add(request.url)
                    else:
                        self._msg("'use_hcf' meta key is not supported for non GET requests (%s)" % request.url,
                                  log.ERROR)
                        yield request
                else:
                    yield request
            else:
                yield item

    def close_spider(self, spider, reason):
        # Only store the results if the spider finished normally. If it
        # didn't finish properly there is no way to know whether all the url batches
        # were processed, so it is better not to delete them from the frontier
        # (they will be picked up by another process).
        if reason == 'finished':
            self._save_new_links_count()
            self._delete_processed_ids()

        # Close the frontier client in order to make sure that all the new links
        # are stored.
        self.fclient.close()
        self.hsclient.close()

        # If the reason is defined in the hs_start_job_on_reason list then start
        # a new job right after this spider is finished.
        if self.hs_start_job_enabled and reason in self.hs_start_job_on_reason:

            # Start the new job if this job had requests from the HCF or it
            # was the first job.
            if self.has_new_requests or not getattr(spider, 'dummy', None):
                self._start_job(spider)

    def _get_new_requests(self):
        """ Get a new batch of links from the HCF."""
        num_batches = 0
        num_links = 0
        for num_batches, batch in enumerate(self.fclient.read(self.hs_frontier, self.hs_consume_from_slot), 1):
            for fingerprint, data in batch['requests']:
                num_links += 1
                yield Request(url=fingerprint, meta={'hcf_params': {'qdata': data}})
            self.batch_ids.append(batch['id'])
            if num_links >= self.hs_max_links:
                break
        self._msg('Read %d new batches from slot(%s)' % (num_batches, self.hs_consume_from_slot))
        self._msg('Read %d new links from slot(%s)' % (num_links, self.hs_consume_from_slot))

    def _save_new_links_count(self):
        """ Save the new extracted links into the HCF."""
        for slot, new_links in self.new_links.items():
            self._msg('Stored %d new links in slot(%s)' % (len(new_links), slot))
        self.new_links = defaultdict(set)

    def _delete_processed_ids(self):
        """ Delete in the HCF the ids of the processed batches."""
        self.fclient.delete(self.hs_frontier, self.hs_consume_from_slot, self.batch_ids)
        self._msg('Deleted %d processed batches in slot(%s)' % (len(self.batch_ids),
                                                                self.hs_consume_from_slot))
        self.batch_ids = []

    def _get_slot(self, request):
        """ Determine to which slot should be saved the request."""
        md5 = hashlib.md5()
        md5.update(request.url)
        digest = md5.hexdigest()
        return str(int(digest, 16) % self.hs_number_of_slots)
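For the HS_CONSUME_FROM_SLOT variant above, a spider can override the frontier and slot per run and provide its own slot_callback; a sketch, where the spider base class, attribute values and routing rule are assumptions:

from scrapy.spider import BaseSpider
from scrapy.http import Request

class ShardedSpider(BaseSpider):
    name = 'sharded'
    hs_frontier = 'products'     # per-run override read by process_start_requests
    hs_consume_from_slot = '3'   # per-run override of the slot to read from

    def slot_callback(self, request):
        # Replaces the default md5-based _get_slot(); must return a slot id string.
        return '0' if 'category' in request.url else '1'

    def parse(self, response):
        yield Request('http://www.example.com/item/1',
                      meta={'use_hcf': True,
                            'hcf_params': {'qdata': {'parent': response.url}}})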
Example no. 6
class SystemTest(HSTestCase):

    MAGICN = 1211

    def setUp(self):
        super(HSTestCase, self).setUp()
        endpoint = self.hsclient.endpoint
        # Panel - no client auth, only project auth using user auth token
        self.panelclient = HubstorageClient(endpoint=endpoint)
        self.panelproject = self.panelclient.get_project(self.projectid, auth=self.auth)
        # Runner - client uses global auth to poll jobq
        self.runnerclient = HubstorageClient(endpoint=endpoint, auth=self.auth)
        # Scraper - uses job level auth, no global or project auth available
        self.scraperclient = HubstorageClient(endpoint=endpoint)

    def test_succeed_with_close_reason(self):
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason='all-good')
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'all-good')
        self.assertEqual(job.items.stats()['totals']['input_values'], self.MAGICN)
        self.assertEqual(job.logs.stats()['totals']['input_values'], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'], self.MAGICN)

    def test_succeed_without_close_reason(self):
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason=None)
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'no_reason')
        self.assertEqual(job.items.stats()['totals']['input_values'], self.MAGICN)
        self.assertEqual(job.logs.stats()['totals']['input_values'], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'], self.MAGICN)

    def test_scraper_failure(self):
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason=IOError('no more resources, ha!'))
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'failed')
        # MAGICN messages per log level, plus one for the final failure
        stats = job.logs.stats()
        self.assertTrue(stats)
        self.assertEqual(stats['totals']['input_values'], self.MAGICN * 4 + 1)

    def _run_runner(self, pushed, close_reason):
        job = self.runnerclient.start_job(self.projectid)
        self.assertFalse(job.metadata.get('stop_requested'))
        job.metadata.update(host='localhost', slot=1)
        self.assertEqual(job.metadata.get('state'), 'running')
        # run scraper
        try:
            self._run_scraper(job.key, job.jobauth, close_reason=close_reason)
        except Exception as exc:
            job.failed(message=str(exc))
            # logging from runner must append and never remove messages logged
            # by scraper
            self.assertTrue(job.logs.batch_append)
        else:
            job.finished()

        self.runnerclient.close()

    def _run_scraper(self, jobkey, jobauth, close_reason=None):
        httpmethods = 'GET PUT POST DELETE HEAD OPTIONS TRACE CONNECT'.split()
        job = self.scraperclient.get_job(jobkey, auth=jobauth)
        for idx in xrange(self.MAGICN):
            iid = job.items.write({'uuid': idx})
            job.logs.debug('log debug %s' % idx, idx=idx)
            job.logs.info('log info %s' % idx, idx=idx)
            job.logs.warn('log warn %s' % idx, idx=idx)
            job.logs.error('log error %s' % idx, idx=idx)
            sid = job.samples.write([idx, idx, idx])
            rid = job.requests.add(
                url='http://test.com/%d' % idx,
                status=random.randint(100, 1000),
                method=random.choice(httpmethods),
                rs=random.randint(0, 100000),
                duration=random.randint(0, 1000),
                parent=random.randrange(0, idx + 1) if idx > 10 else None,
                ts=millitime() + random.randint(100, 100000),
            )
            self.assertEqual(iid, idx)
            self.assertEqual(sid, idx)
            self.assertEqual(rid, idx)

        if isinstance(close_reason, Exception):
            self.scraperclient.close()
            raise close_reason

        if close_reason:
            job.metadata['close_reason'] = close_reason

        job.metadata.save()
        self.scraperclient.close()
        del self.scraperclient