# NOTE(review): `scope` is bound before this fragment begins (presumably by an
# enclosing Sentry scope block that is not visible here) -- re-indent the
# statements below to match that block when applying this change.

# Record the crawl configuration as tags on the Sentry scope.
scope.set_tag('JS_INSTRUMENT_MODULES', JS_INSTRUMENT)
scope.set_tag('SAVE_CONTENT', SAVE_CONTENT)
scope.set_tag('DWELL_TIME', DWELL_TIME)
scope.set_tag('TIMEOUT', TIMEOUT)
scope.set_tag('CRAWL_REFERENCE', '%s/%s' % (S3_BUCKET, CRAWL_DIRECTORY))
# context adds additional information that may be of interest
scope.set_context("crawl_config", {
    'REDIS_QUEUE_NAME': REDIS_QUEUE_NAME,
})

# Send a sentry error message (temporarily - to easily be able
# to compare error frequencies to crawl worker instance count)
sentry_sdk.capture_message("Crawl worker started")

# Connect to job queue
job_queue = rediswq.RedisWQ(name=REDIS_QUEUE_NAME, host=REDIS_HOST)
manager.logger.info("Worker with sessionID: %s" % job_queue.sessionID())
manager.logger.info("Initial queue state: empty=%s" % job_queue.empty())

# Crawl sites specified in job queue until empty
while not job_queue.empty():
    # NOTE(review): the lease length is a fixed 120 while the visit duration
    # is configured via TIMEOUT / DWELL_TIME -- confirm the lease is long
    # enough to cover a full visit.
    job = job_queue.lease(lease_secs=120, block=True, timeout=5)
    if job is None:
        # No job could be leased within the timeout; back off and re-check.
        manager.logger.info("Waiting for work")
        time.sleep(5)
    else:
        # A job is a UTF-8 encoded "<rank>,<site>" record. Split on the
        # first comma only: a site URL may itself contain commas, and an
        # unbounded split would then break the two-name unpacking.
        site_rank, site = job.decode("utf-8").split(',', 1)
        # Sites given without a scheme are visited over plain HTTP.
        if "://" not in site:
            site = "http://" + site
        manager.logger.info("Visiting %s..." % site)
        command_sequence = CommandSequence.CommandSequence(site, reset=True)
# NOTE(review): `scope` is bound before this fragment begins (presumably by an
# enclosing Sentry scope block that is not visible here) -- re-indent the
# statements below to match that block when applying this change.

# Tag the Sentry scope with the crawl settings.
scope.set_tag('TIMEOUT', TIMEOUT)
scope.set_tag('MAX_JOB_RETRIES', MAX_JOB_RETRIES)
crawl_reference = '%s/%s' % (S3_BUCKET, CRAWL_DIRECTORY)
scope.set_tag('CRAWL_REFERENCE', crawl_reference)

# Contexts carry additional information that may be of interest.
scope.set_context("PREFS", PREFS)
queue_context = {'REDIS_QUEUE_NAME': REDIS_QUEUE_NAME}
scope.set_context("crawl_config", queue_context)

# Emit a Sentry message on start-up (temporary: makes it easy to compare
# error frequencies against the number of crawl worker instances).
sentry_sdk.capture_message("Crawl worker started")

# Open the Redis-backed job queue.
job_queue = rediswq.RedisWQ(
    name=REDIS_QUEUE_NAME,
    host=REDIS_HOST,
    max_retries=MAX_JOB_RETRIES,
)
session_id = job_queue.sessionID()
manager.logger.info("Worker with sessionID: %s" % session_id)
initially_empty = job_queue.empty()
manager.logger.info("Initial queue state: empty=%s" % initially_empty)

# Keep crawling the sites listed in the job queue until it reports empty.
while True:
    if job_queue.empty():
        break
    job_queue.check_expired_leases()
    # Lease length is derived from the configured timeout and dwell time,
    # plus 30 extra.
    lease_duration = TIMEOUT + DWELL_TIME + 30
    job = job_queue.lease(lease_secs=lease_duration, block=True, timeout=5)
    if job is None:
        # Nothing could be leased within the timeout; wait and try again.
        manager.logger.info("Waiting for work")
        time.sleep(5)
        continue