Example #1
0
        scope.set_tag('JS_INSTRUMENT_MODULES', JS_INSTRUMENT)
        scope.set_tag('SAVE_CONTENT', SAVE_CONTENT)
        scope.set_tag('DWELL_TIME', DWELL_TIME)
        scope.set_tag('TIMEOUT', TIMEOUT)
        scope.set_tag('CRAWL_REFERENCE',
                      '%s/%s' % (S3_BUCKET, CRAWL_DIRECTORY))
        # context adds additional information that may be of interest
        scope.set_context("crawl_config", {
            'REDIS_QUEUE_NAME': REDIS_QUEUE_NAME,
        })
    # Send a sentry error message (temporarily - to easily be able
    # to compare error frequencies to crawl worker instance count)
    sentry_sdk.capture_message("Crawl worker started")

# Connect to the job queue named REDIS_QUEUE_NAME on REDIS_HOST, then log
# this worker's session ID and whether the queue reports itself empty at
# start-up.
job_queue = rediswq.RedisWQ(name=REDIS_QUEUE_NAME, host=REDIS_HOST)
manager.logger.info("Worker with sessionID: %s" % job_queue.sessionID())
manager.logger.info("Initial queue state: empty=%s" % job_queue.empty())

# Crawl sites specified in job queue until empty
while not job_queue.empty():
    # Lease one job for a fixed 120 seconds. The result may be None, which
    # is treated below as "nothing available right now".
    # NOTE(review): the lease length is hard-coded and independent of
    # TIMEOUT / DWELL_TIME - confirm it covers a full site visit.
    job = job_queue.lease(lease_secs=120, block=True, timeout=5)
    if job is None:
        manager.logger.info("Waiting for work")
        # Back off before polling the queue again.
        time.sleep(5)
    else:
        # A job is decoded as UTF-8 text and unpacked as "<rank>,<site>".
        # NOTE(review): split(',') yields more than two parts when the
        # site itself contains a comma, which makes this unpacking raise
        # ValueError; split(',', 1) would avoid that - confirm job format.
        site_rank, site = job.decode("utf-8").split(',')
        # Entries without an explicit scheme are visited over plain HTTP.
        if "://" not in site:
            site = "http://" + site
        manager.logger.info("Visiting %s..." % site)
        command_sequence = CommandSequence.CommandSequence(site, reset=True)
Example #2
0
        scope.set_tag('TIMEOUT', TIMEOUT)
        scope.set_tag('MAX_JOB_RETRIES', MAX_JOB_RETRIES)
        scope.set_tag('CRAWL_REFERENCE',
                      '%s/%s' % (S3_BUCKET, CRAWL_DIRECTORY))
        # context adds additional information that may be of interest
        scope.set_context("PREFS", PREFS)
        scope.set_context("crawl_config", {
            'REDIS_QUEUE_NAME': REDIS_QUEUE_NAME,
        })
    # Send a sentry error message (temporarily - to easily be able
    # to compare error frequencies to crawl worker instance count)
    sentry_sdk.capture_message("Crawl worker started")

# Connect to the job queue named REDIS_QUEUE_NAME on REDIS_HOST, passing
# MAX_JOB_RETRIES as the queue's max_retries setting, then log this
# worker's session ID and whether the queue reports itself empty at
# start-up.
job_queue = rediswq.RedisWQ(name=REDIS_QUEUE_NAME,
                            host=REDIS_HOST,
                            max_retries=MAX_JOB_RETRIES)
manager.logger.info("Worker with sessionID: %s" % job_queue.sessionID())
manager.logger.info("Initial queue state: empty=%s" % job_queue.empty())

# Crawl sites specified in job queue until empty
while not job_queue.empty():
    # Runs once per iteration, before a new job is leased.
    # NOTE(review): presumably this makes jobs whose lease has lapsed
    # available again (subject to max_retries) - verify against rediswq.
    job_queue.check_expired_leases()
    # The lease length is TIMEOUT + DWELL_TIME plus 30 extra seconds.
    # NOTE(review): presumably sized so the lease outlives one full site
    # visit - confirm the 30 s margin is sufficient.
    job = job_queue.lease(lease_secs=TIMEOUT + DWELL_TIME + 30,
                          block=True,
                          timeout=5)

    if job is None:
        # Nothing could be leased this round: wait, then re-check the
        # queue from the top of the loop.
        manager.logger.info("Waiting for work")
        time.sleep(5)
        continue