コード例 #1
0
class TestZookeeperWatcher(TestCase):
    '''
    Online tests for ZookeeperWatcher.

    The tests talk to the Zookeeper ensemble given by ``hosts``. setUp creates
    a data node and a pointer node (whose content is the data node's path),
    tearDown removes both again.
    '''

    def __init__(self, name, hosts):
        '''
        @param name: the name of the test method to run
        @param hosts: the Zookeeper hosts string handed to the clients
        '''
        self.hosts = hosts
        self.file_path = '/test_path'
        self.file_data = 'test_data'
        self.pointer_path = '/test_pointer'
        # the pointer node stores the path of the real data node
        self.pointer_data = self.file_path
        TestCase.__init__(self, name)

    def setUp(self):
        self.zoo_client = KazooClient(hosts=self.hosts)
        self.zoo_client.start()
        # prepare data
        self.zoo_client.ensure_path(self.file_path)
        self.zoo_client.set(self.file_path, self.file_data.encode('utf-8'))
        self.zoo_client.ensure_path(self.pointer_path)
        self.zoo_client.set(self.pointer_path,
                            self.pointer_data.encode('utf-8'))

        self.zoo_watcher = ZookeeperWatcher(hosts=self.hosts,
                                            filepath=self.file_path,
                                            pointer=False,
                                            ensure=False,
                                            valid_init=True)

    def test_ping(self):
        self.assertTrue(self.zoo_watcher.ping())

    def test_get_file_contents(self):
        pointer_zoo_watcher = ZookeeperWatcher(hosts=self.hosts,
                                               filepath=self.pointer_path,
                                               pointer=True,
                                               ensure=False,
                                               valid_init=True)

        # close the extra watcher even when an assertion fails, otherwise
        # its connection outlives the test
        try:
            self.assertEqual(self.zoo_watcher.get_file_contents(),
                             self.file_data)
            self.assertEqual(pointer_zoo_watcher.get_file_contents(),
                             self.file_data)
            self.assertEqual(pointer_zoo_watcher.get_file_contents(True),
                             self.pointer_data)
        finally:
            pointer_zoo_watcher.close()

    def tearDown(self):
        # always release the client connection, even if removing the test
        # nodes (or closing the watcher) raises
        try:
            self.zoo_watcher.close()

            self.zoo_client.ensure_path(self.file_path)
            self.zoo_client.delete(self.file_path)
            self.zoo_client.ensure_path(self.pointer_path)
            self.zoo_client.delete(self.pointer_path)
        finally:
            self.zoo_client.stop()
            self.zoo_client.close()
コード例 #2
0
class DistributedScheduler(object):
    '''
    Scrapy request scheduler that utilizes Redis Throttled Priority Queues
    to moderate different domain scrape requests within a distributed scrapy
    cluster
    '''
    redis_conn = None # the redis connection (the `server` constructor argument)
    queue_dict = None # {redis queue key: [RedisThrottledQueue, last-used time]}
    spider = None # the spider using this scheduler (set in open())
    queue_keys = None # redis keys matching '<spider name>:*:queue', refreshed by create_queues()
    queue_class = None # the class to use for the queue; NOTE(review): not referenced by any method of this class
    dupefilter = None # the redis dupefilter (RFPDupeFilter built in open())
    update_time = 0 # the last time (time.time()) the queues were updated
    update_ip_time = 0 # the last time (time.time()) the ip was updated
    update_interval = 0 # how often to update the queues, compared against time.time() deltas
    extract = None # the tld extractor (a tldextract.TLDExtract instance)
    hits = 0 # default number of hits for a queue
    window = 0 # default window to calculate number of hits
    my_ip = None # the ip address of the scheduler; update_ipaddress() falls back to '127.0.0.1'
    old_ip = None # the previous value of my_ip, kept for logging ip changes
    ip_update_interval = 0 # the interval to update the ip address
    add_type = None # add spider type (the spider name) to redis throttle queue key
    add_ip = None # add spider public ip to redis throttle queue key
    item_retries = 0 # the number of extra passes find_item() makes over the queues
    my_uuid = None # the generated UUID for the particular scrapy process
    # Zookeeper Dynamic Config Vars
    domain_config = {}  # {domain: config dict holding 'window' and 'hits', optionally 'scale'}
    my_id = None  # The id used to read the throttle config (appended to assign_path)
    config_flag = False  # Set by load_domain_config(), consumed by check_config() to force a queue rebuild
    assign_path = None  # The base assigned configuration path to read
    zoo_client = None  # The KazooClient to manage the config
    my_assignment = None  # Zookeeper path to read actual yml config
    black_domains = [] # the domains to ignore thanks to zookeeper config (its 'blacklist' entry)

    def __init__(self, server, persist, update_int, timeout, retries, logger,
                 hits, window, mod, ip_refresh, add_type, add_ip, ip_regex,
                 backlog_blacklist, queue_timeout):
        '''
        Initialize the scheduler

        @param server: the redis connection
        @param persist: when False, close() clears the dupefilter and queues
        @param update_int: how often next_request() refreshes the queues
        @param timeout: the timeout handed to the RFPDupeFilter in open()
        @param retries: the number of extra passes find_item() makes
        @param logger: the logger used by the scheduler and the spider
        @param hits: default number of hits for a queue
        @param window: default window to calculate number of hits
        @param mod: the moderated flag handed to each RedisThrottledQueue
        @param ip_refresh: how often next_request() refreshes the public ip
        @param add_type: add the spider name to the throttle queue key
        @param add_ip: add the public ip to the throttle queue key
        @param ip_regex: pattern used to pull the ip out of the ip service
            response
        @param backlog_blacklist: when False, requests for domains on the
            zookeeper blacklist are not enqueued
        @param queue_timeout: idle time after which expire_queues() drops an
            in-memory queue
        '''
        self.redis_conn = server
        self.persist = persist
        self.queue_dict = {}
        self.update_interval = update_int
        self.hits = hits
        self.window = window
        self.moderated = mod
        self.rfp_timeout = timeout
        self.ip_update_interval = ip_refresh
        self.add_type = add_type
        self.add_ip = add_ip
        # find_item() reads `item_retries`; the attribute used to be stored
        # under a misspelled name, so the configured value was never used
        self.item_retries = retries
        self.logger = logger
        self.ip_regex = re.compile(ip_regex)
        self.backlog_blacklist = backlog_blacklist
        self.queue_timeout = queue_timeout

        # set up tldextract
        self.extract = tldextract.TLDExtract()

        self.update_ipaddress()

        # if we need better uuid's mod this line
        self.my_uuid = str(uuid.uuid4()).split('-')[4]

    def setup_zookeeper(self):
        '''
        Connects a ZookeeperWatcher to this scheduler's config node
        (ZOOKEEPER_ASSIGN_PATH + ZOOKEEPER_ID) and wires up the config and
        error handlers. Exits the process when Zookeeper cannot be reached
        or does not answer a ping.
        '''
        self.assign_path = settings.get('ZOOKEEPER_ASSIGN_PATH', "")
        self.my_id = settings.get('ZOOKEEPER_ID', 'all')
        self.logger.debug("Trying to establish Zookeeper connection")

        try:
            zoo_hosts = settings.get('ZOOKEEPER_HOSTS')
            config_node = self.assign_path + self.my_id
            self.zoo_watcher = ZookeeperWatcher(hosts=zoo_hosts,
                                                filepath=config_node,
                                                config_handler=self.change_config,
                                                error_handler=self.error_config,
                                                pointer=False,
                                                ensure=True,
                                                valid_init=True)
        except KazooTimeoutError:
            self.logger.error("Could not connect to Zookeeper")
            sys.exit(1)

        # a watcher that cannot ping is of no use, bail out
        if not self.zoo_watcher.ping():
            self.logger.error("Could not ping Zookeeper")
            sys.exit(1)

        self.logger.debug("Successfully set up Zookeeper connection")

    def change_config(self, config_string):
        if config_string and len(config_string) > 0:
            loaded_config = yaml.safe_load(config_string)
            self.logger.info("Zookeeper config changed", extra=loaded_config)
            self.load_domain_config(loaded_config)
            self.update_domain_queues()
        elif config_string is None or len(config_string) == 0:
            self.error_config("Zookeeper config wiped")

        self.create_queues()

    def load_domain_config(self, loaded_config):
        '''
        Loads the domain_config and sets up queue_dict
        @param loaded_config: the yaml loaded config dict from zookeeper
        '''
        self.domain_config = {}
        # vetting process to ensure correct configs
        if loaded_config:
            if 'domains' in loaded_config:
                for domain in loaded_config['domains']:
                    item = loaded_config['domains'][domain]
                    # check valid
                    if 'window' in item and 'hits' in item:
                        self.logger.debug("Added domain {dom} to loaded config"
                                          .format(dom=domain))
                        self.domain_config[domain] = item
            if 'blacklist' in loaded_config:
                self.black_domains = loaded_config['blacklist']

        self.config_flag = True

    def update_domain_queues(self):
        '''
        Check to update existing queues already in memory
        new queues are created elsewhere
        '''
        for key in self.domain_config:
            final_key = "{name}:{domain}:queue".format(
                    name=self.spider.name,
                    domain=key)
            # we already have a throttled queue for this domain, update it to new settings
            if final_key in self.queue_dict:
                self.queue_dict[final_key][0].window = float(self.domain_config[key]['window'])
                self.logger.debug("Updated queue {q} with new config"
                                  .format(q=final_key))
                # if scale is applied, scale back; otherwise use updated hits
                if 'scale' in self.domain_config[key]:
                    # round to int
                    hits = int(self.domain_config[key]['hits'] * self.fit_scale(
                               self.domain_config[key]['scale']))
                    self.queue_dict[final_key][0].limit = float(hits)
                else:
                    self.queue_dict[final_key][0].limit = float(self.domain_config[key]['hits'])

    def error_config(self, message):
        extras = {}
        extras['message'] = message
        extras['revert_window'] = self.window
        extras['revert_hits'] = self.hits
        extras['spiderid'] = self.spider.name
        self.logger.info("Lost config from Zookeeper", extra=extras)
        # lost connection to zookeeper, reverting back to defaults
        for key in self.domain_config:
            final_key = "{name}:{domain}:queue".format(
                    name=self.spider.name,
                    domain=key)
            self.queue_dict[final_key][0].window = self.window
            self.queue_dict[final_key][0].limit = self.hits

        self.domain_config = {}

    def fit_scale(self, scale):
        '''
        Clamps a scale factor into the range [0, 1]
        @param scale: the requested scale
        @return: 1.0 for anything >= 1, 0.0 for anything <= 0, otherwise
            the scale itself
        '''
        if scale >= 1:
            return 1.0
        if scale <= 0:
            return 0.0
        return scale

    def create_queues(self):
        '''
        Updates the in memory list of the redis queues
        Creates new throttled queue instances if it does not have them
        '''
        # new config could have loaded between scrapes
        newConf = self.check_config()

        self.queue_keys = self.redis_conn.keys(self.spider.name + ":*:queue")

        for key in self.queue_keys:
            # build final queue key, depending on type and ip bools
            throttle_key = ""

            if self.add_type:
                throttle_key = self.spider.name + ":"
            if self.add_ip:
                throttle_key = throttle_key + self.my_ip + ":"

            # add the tld from the key `type:tld:queue`
            the_domain = re.split(':', key)[1]
            throttle_key = throttle_key + the_domain

            if key not in self.queue_dict or newConf:
                self.logger.debug("Added new Throttled Queue {q}"
                                  .format(q=key))
                q = RedisPriorityQueue(self.redis_conn, key, encoding=ujson)

                # use default window and hits
                if the_domain not in self.domain_config:
                    # this is now a tuple, all access needs to use [0] to get
                    # the object, use [1] to get the time
                    self.queue_dict[key] = [RedisThrottledQueue(self.redis_conn,
                    q, self.window, self.hits, self.moderated, throttle_key,
                    throttle_key, True), time.time()]
                # use custom window and hits
                else:
                    window = self.domain_config[the_domain]['window']
                    hits = self.domain_config[the_domain]['hits']

                    # adjust the crawl rate based on the scale if exists
                    if 'scale' in self.domain_config[the_domain]:
                        hits = int(hits * self.fit_scale(self.domain_config[the_domain]['scale']))

                    self.queue_dict[key] = [RedisThrottledQueue(self.redis_conn,
                    q, window, hits, self.moderated, throttle_key,
                    throttle_key, True), time.time()]

    def expire_queues(self):
        '''
        Expires old queue_dict keys that have not been used in a long time.
        Prevents slow memory build up when crawling lots of different domains
        '''
        curr_time = time.time()
        for key in list(self.queue_dict):
            diff = curr_time - self.queue_dict[key][1]
            if diff > self.queue_timeout:
                self.logger.debug("Expiring domain queue key " + key)
                del self.queue_dict[key]
                if key in self.queue_keys:
                    self.queue_keys.remove(key)

    def check_config(self):
        '''
        Reports whether a new configuration was loaded since the last call
        and clears the flag again
        @return: True if there is a new configuration
        '''
        if not self.config_flag:
            return False

        # consume the flag so the next call reports no change
        self.config_flag = False
        return True

    def update_ipaddress(self):
        '''
        Updates the scheduler so it knows its own ip address. The address is
        read from the PUBLIC_IP_URL service and extracted with ip_regex;
        when that fails the ip falls back to 127.0.0.1.
        '''
        # assign local ip in case of exception
        self.old_ip = self.my_ip
        self.my_ip = '127.0.0.1'
        try:
            obj = urllib.request.urlopen(settings.get('PUBLIC_IP_URL',
                                  'http://ip.42.pl/raw'))
            # release the connection no matter how reading goes
            try:
                body = obj.read()
            finally:
                obj.close()

            # the response body is bytes on python 3, a text pattern can
            # only be matched against text
            if isinstance(body, bytes) and \
                    not isinstance(self.ip_regex.pattern, bytes):
                body = body.decode('utf-8', 'ignore')

            results = self.ip_regex.findall(body)
            if not results:
                raise IOError("Could not get valid IP Address")
            self.my_ip = results[0]
            self.logger.debug("Current public ip: {ip}".format(ip=self.my_ip))
        except IOError:
            # best effort only, keep the local fallback address
            self.logger.error("Could not reach out to get public ip")

        if self.old_ip != self.my_ip:
            self.logger.info("Changed Public IP: {old} -> {new}".format(
                             old=self.old_ip, new=self.my_ip))

    def report_self(self):
        '''
        Reports the crawler uuid to redis
        '''
        self.logger.debug("Reporting self id", extra={'uuid':self.my_uuid})
        key = "stats:crawler:{m}:{s}:{u}".format(
            m=socket.gethostname(),
            s=self.spider.name,
            u=self.my_uuid)
        self.redis_conn.set(key, time.time())
        self.redis_conn.expire(key, self.ip_update_interval * 2)

    @classmethod
    def from_settings(cls, settings):
        '''
        Builds a scheduler, its redis connection and its logger from the
        crawler settings
        @param settings: the settings object, read through settings.get()
        @return: the new scheduler instance
        '''
        server = redis.Redis(host=settings.get('REDIS_HOST'),
                             port=settings.get('REDIS_PORT'),
                             db=settings.get('REDIS_DB'))
        persist = settings.get('SCHEDULER_PERSIST', True)
        up_int = settings.get('SCHEDULER_QUEUE_REFRESH', 10)
        hits = settings.get('QUEUE_HITS', 10)
        window = settings.get('QUEUE_WINDOW', 60)
        mod = settings.get('QUEUE_MODERATED', False)
        timeout = settings.get('DUPEFILTER_TIMEOUT', 600)
        ip_refresh = settings.get('SCHEDULER_IP_REFRESH', 60)
        add_type = settings.get('SCHEDULER_TYPE_ENABLED', False)
        add_ip = settings.get('SCHEDULER_IP_ENABLED', False)
        # the setting used to be read under a misspelled name only; prefer
        # the correct spelling and keep the old one as a fallback so that
        # existing settings files continue to work
        retries = settings.get('SCHEDULER_ITEM_RETRIES',
                               settings.get('SCHEUDLER_ITEM_RETRIES', 3))
        ip_regex = settings.get('IP_ADDR_REGEX', '.*')
        backlog_blacklist = settings.get('SCHEDULER_BACKLOG_BLACKLIST', True)
        queue_timeout = settings.get('SCHEDULER_QUEUE_TIMEOUT', 3600)

        # logger configuration
        my_level = settings.get('SC_LOG_LEVEL', 'INFO')
        my_name = settings.get('SC_LOGGER_NAME', 'sc-logger')
        my_output = settings.get('SC_LOG_STDOUT', True)
        my_json = settings.get('SC_LOG_JSON', False)
        my_dir = settings.get('SC_LOG_DIR', 'logs')
        my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
        my_file = settings.get('SC_LOG_FILE', 'main.log')
        my_backups = settings.get('SC_LOG_BACKUPS', 5)

        logger = LogFactory.get_instance(json=my_json,
                                         name=my_name,
                                         stdout=my_output,
                                         level=my_level,
                                         dir=my_dir,
                                         file=my_file,
                                         bytes=my_bytes,
                                         backups=my_backups)

        return cls(server, persist, up_int, timeout, retries, logger, hits,
                   window, mod, ip_refresh, add_type, add_ip, ip_regex,
                   backlog_blacklist, queue_timeout)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def open(self, spider):
        '''
        Attaches the scheduler to the spider: shares the logger, loads the
        queues, connects to Zookeeper and sets up the redis dupefilter
        @param spider: the spider that will use this scheduler
        '''
        self.spider = spider
        spider.set_logger(self.logger)

        # the queues are created before the Zookeeper watcher is started
        self.create_queues()
        self.setup_zookeeper()

        dupefilter_key = self.spider.name + ':dupefilter'
        self.dupefilter = RFPDupeFilter(self.redis_conn, dupefilter_key,
                                        self.rfp_timeout)

    def close(self, reason):
        self.logger.info("Closing Spider", {'spiderid':self.spider.name})
        if not self.persist:
            self.logger.warning("Clearing crawl queues")
            self.dupefilter.clear()
            for key in self.queue_keys:
                self.queue_dict[key][0].clear()

    def is_blacklisted(self, appid, crawlid):
        '''
        Checks the redis blacklist for crawls that should not be propagated
        either from expiring or stopped
        @return: True if the appid crawlid combo is blacklisted
        '''
        key_check = '{appid}||{crawlid}'.format(appid=appid,
                                                crawlid=crawlid)
        redis_key = self.spider.name + ":blacklist"
        return self.redis_conn.sismember(redis_key, key_check)

    def enqueue_request(self, request):
        '''
        Pushes a request from the spider into the proper throttled queue
        '''
        if not request.dont_filter and self.dupefilter.request_seen(request):
            self.logger.debug("Request not added back to redis")
            return
        req_dict = self.request_to_dict(request)

        if not self.is_blacklisted(req_dict['meta']['appid'],
                                   req_dict['meta']['crawlid']):
            # grab the tld of the request
            ex_res = self.extract(req_dict['url'])
            key = "{sid}:{dom}.{suf}:queue".format(
                sid=req_dict['meta']['spiderid'],
                dom=ex_res.domain,
                suf=ex_res.suffix)

            curr_time = time.time()

            domain = "{d}.{s}".format(d=ex_res.domain, s=ex_res.suffix)

            # allow only if we want all requests or we want
            # everything but blacklisted domains
            # insert if crawl never expires (0) or time < expires
            if (self.backlog_blacklist or
                    (not self.backlog_blacklist and
                    domain not in self.black_domains)) and \
                    (req_dict['meta']['expires'] == 0 or
                    curr_time < req_dict['meta']['expires']):
                # we may already have the queue in memory
                if key in self.queue_keys:
                    self.queue_dict[key][0].push(req_dict,
                                              req_dict['meta']['priority'])
                else:
                    # shoving into a new redis queue, negative b/c of sorted sets
                    # this will populate ourself and other schedulers when
                    # they call create_queues
                    self.redis_conn.zadd(key, ujson.dumps(req_dict),
                                        -req_dict['meta']['priority'])
                self.logger.debug("Crawlid: '{id}' Appid: '{appid}' added to queue"
                    .format(appid=req_dict['meta']['appid'],
                            id=req_dict['meta']['crawlid']))
            else:
                self.logger.debug("Crawlid: '{id}' Appid: '{appid}' expired"
                                  .format(appid=req_dict['meta']['appid'],
                                          id=req_dict['meta']['crawlid']))
        else:
            self.logger.debug("Crawlid: '{id}' Appid: '{appid}' blacklisted"
                              .format(appid=req_dict['meta']['appid'],
                                      id=req_dict['meta']['crawlid']))

    def request_to_dict(self, request):
        '''
        Turns a Request object into a plain dict; callback and errback are
        stored by method name only.
        modified from scrapy.utils.reqser
        '''
        # callback/errback are assumed to be a bound instance of the spider
        callback_name = None
        if request.callback is not None:
            callback_name = request.callback.__name__

        errback_name = None
        if request.errback is not None:
            errback_name = request.errback.__name__

        return {
            # urls should be safe (safe_string_url)
            'url': to_unicode(request.url),
            'method': request.method,
            'headers': dict(request.headers),
            'body': request.body,
            'cookies': request.cookies,
            'meta': request.meta,
            '_encoding': request._encoding,
            'priority': request.priority,
            'dont_filter': request.dont_filter,
            'callback': callback_name,
            'errback': errback_name,
        }

    def find_item(self):
        '''
        Finds an item from the throttled queues
        '''
        random.shuffle(self.queue_keys)
        count = 0

        while count <= self.item_retries:
            for key in self.queue_keys:
                # skip if the whole domain has been blacklisted in zookeeper
                if key.split(':')[1] in self.black_domains:
                    continue
                # the throttled queue only returns an item if it is allowed
                item = self.queue_dict[key][0].pop()

                if item:
                    # update timeout and return
                    self.queue_dict[key][1] = time.time()
                    return item

            count = count + 1

        return None

    def next_request(self):
        '''
        Logic to handle getting a new url request, from a bunch of
        different queues
        '''
        t = time.time()
        # update the redis queues every so often
        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()
            self.expire_queues()

        # update the ip address every so often
        if t - self.update_ip_time > self.ip_update_interval:
            self.update_ip_time = t
            self.update_ipaddress()
            self.report_self()

        item = self.find_item()
        if item:
            self.logger.debug("Found url to crawl {url}" \
                    .format(url=item['url']))
            try:
                req = Request(item['url'])
            except ValueError:
                # need absolute url
                # need better url validation here
                req = Request('http://' + item['url'])

            try:
                if 'callback' in item and item['callback'] is not None:
                    req.callback = getattr(self.spider, item['callback'])
            except AttributeError:
                self.logger.warn("Unable to find callback method")

            try:
                if 'errback' in item and item['errback'] is not None:
                    req.errback = getattr(self.spider, item['errback'])
            except AttributeError:
                self.logger.warn("Unable to find errback method")

            if 'meta' in item:
                item = item['meta']

            # defaults not in schema
            if 'curdepth' not in item:
                item['curdepth'] = 0
            if "retry_times" not in item:
                item['retry_times'] = 0

            for key in list(item.keys()):
                req.meta[key] = item[key]

            # extra check to add items to request
            if 'useragent' in item and item['useragent'] is not None:
                req.headers['User-Agent'] = item['useragent']
            if 'cookie' in item and item['cookie'] is not None:
                if isinstance(item['cookie'], dict):
                    req.cookies = item['cookie']
                elif isinstance(item['cookie'], basestring):
                    req.cookies = self.parse_cookie(item['cookie'])

            return req

        return None

    def parse_cookie(self, string):
        '''
        Parses a cookie string like returned in a Set-Cookie header
        @param string: The cookie string, `name=value` pairs separated by `;`
        @return: the cookie dict; for repeated names the last value wins
        '''
        # raw string, `\;` and `\s` are regex escapes and not valid string
        # escapes
        pairs = re.findall(r'([^=]+)=([^\;]+);?\s?', string)
        return dict(pairs)

    def has_pending_requests(self):
        '''
        Always reports that nothing is pending, no matter what the queues
        hold; scrapy sometimes hangs when this returns True.
        @return: False
        '''
        return False
コード例 #3
0
class TestZookeeperWatcher(TestCase):
    '''
    Offline tests for ZookeeperWatcher; the KazooClient is replaced by a
    MagicMock so no Zookeeper server is needed.
    '''

    def setUp(self):
        # every read of a node returns the same (data, stat) pair
        zoo_client = MagicMock()
        zoo_client.get = MagicMock(return_value=('data', 'blah'))

        with patch('scutils.zookeeper_watcher.KazooClient') as k:
            k.return_value = zoo_client
            self.zoo_watcher = ZookeeperWatcher(
                                hosts='localhost',
                                filepath='/mypath',
                                pointer=False, ensure=True,
                                valid_init=True)

    def test_ping(self):
        self.zoo_watcher.zoo_client.server_version = MagicMock()
        self.assertTrue(self.zoo_watcher.ping())
        # a failing server_version call must turn into a failed ping
        self.zoo_watcher.zoo_client.server_version = MagicMock(side_effect=KazooException)
        self.assertFalse(self.zoo_watcher.ping())

    def test_get_file_contents(self):
        self.zoo_watcher.old_pointed = 'old_pointed'
        self.zoo_watcher.old_data = 'old_data'

        # assertEqual, the assertEquals alias no longer exists in newer
        # python versions
        self.zoo_watcher.pointer = False
        self.assertEqual(self.zoo_watcher.get_file_contents(), 'old_data')

        self.zoo_watcher.pointer = True
        self.assertEqual(self.zoo_watcher.get_file_contents(), 'old_data')

        self.zoo_watcher.pointer = True
        self.assertEqual(self.zoo_watcher.get_file_contents(True), 'old_pointed')

    def test_compare_pointer(self):
        self.zoo_watcher.old_pointed = '/path1'

        self.assertTrue(self.zoo_watcher.compare_pointer('/path2'))

        self.zoo_watcher.old_pointed = '/path1'

        self.assertFalse(self.zoo_watcher.compare_pointer('/path1'))

    def test_compare_data(self):
        self.zoo_watcher.old_data = 'old_data'

        self.assertTrue(self.zoo_watcher.compare_data('new_data'))

        self.zoo_watcher.old_data = 'same_data'
        self.assertFalse(self.zoo_watcher.compare_data('same_data'))

    def test_set_valid(self):
        self.zoo_watcher.is_valid = MagicMock(return_value=True)
        self.zoo_watcher.valid_handler = MagicMock()
        self.zoo_watcher.set_valid(False)

        self.zoo_watcher.valid_handler.assert_called_once_with(True)

    def test_call_valid(self):
        # the handler flips the flag, proving that it was called
        self.the_bool = False
        def the_set(state):
            self.the_bool = True

        self.zoo_watcher.valid_handler = the_set
        self.zoo_watcher.call_valid(True)

        self.assertTrue(self.the_bool)

    def test_call_config(self):
        self.the_bool = False
        def the_set(state):
            self.the_bool = True

        self.zoo_watcher.config_handler = the_set
        self.zoo_watcher.call_config(True)

        self.assertTrue(self.the_bool)

    def test_call_error(self):
        self.the_bool = False
        def the_set(state):
            self.the_bool = True

        self.zoo_watcher.error_handler = the_set
        self.zoo_watcher.call_error(True)

        self.assertTrue(self.the_bool)
コード例 #4
0
class DistributedScheduler(object):
    '''
    Scrapy request scheduler that utilizes Redis Throttled Priority Queues
    to moderate different domain scrape requests within a distributed scrapy
    cluster

    The attributes below are class-level defaults; __init__ and open()
    assign per-instance values for most of them.
    '''
    redis_conn = None  # the redis connection
    queue_dict = None  # the dict of throttled queues
    spider = None  # the spider using this scheduler
    queue_keys = None  # the list of current queues
    queue_class = None  # the class to use for the queue
    dupefilter = None  # the redis dupefilter
    update_time = 0  # the last time the queues were updated
    update_ip_time = 0  # the last time the ip was updated
    update_interval = 0  # how often to update the queues
    extract = None  # the tld extractor
    hits = 0  # default number of hits for a queue
    window = 0  # default window to calculate number of hits
    my_ip = None  # the ip address of the scheduler (if needed)
    old_ip = None  # the old ip for logging
    ip_update_interval = 0  # the interval to update the ip address
    add_type = None  # add spider type to redis throttle queue key
    add_ip = None  # add spider public ip to redis throttle queue key
    item_retries = 0  # the number of extra tries to get an item
    my_uuid = None  # the generated UUID for the particular scrapy process
    # Zookeeper Dynamic Config Vars
    # NOTE: this dict object is shared by every instance until
    # load_domain_config() or error_config() rebinds it on the instance
    domain_config = {}  # The list of domains and their configs
    my_id = None  # The id used to read the throttle config
    config_flag = False  # Flag to reload queues if settings are wiped too
    assign_path = None  # The base assigned configuration path to read
    zoo_client = None  # The KazooClient to manage the config
    my_assignment = None  # Zookeeper path to read actual yml config

    def __init__(self, server, persist, update_int, timeout, retries, logger,
                 hits, window, mod, ip_refresh, add_type, add_ip, ip_regex):
        '''
        Store the scheduler configuration and run the one-time setup

        @param server: the redis connection
        @param persist: when False, close() clears the dupefilter and queues
        @param update_int: seconds between queue refreshes in next_request
        @param timeout: timeout handed to the RFPDupeFilter in open()
        @param retries: extra passes find_item makes over the queues
        @param logger: logger used by the scheduler and handed to the spider
        @param hits: default number of hits for a queue
        @param window: default window to calculate number of hits
        @param mod: flag passed through to every RedisThrottledQueue
        @param ip_refresh: the interval to update the ip address
        @param add_type: add spider type to redis throttle queue key
        @param add_ip: add spider public ip to redis throttle queue key
        @param ip_regex: pattern used to pull the ip out of the lookup reply
        '''
        # connection and queue bookkeeping
        self.redis_conn = server
        self.queue_dict = {}
        self.persist = persist
        self.rfp_timeout = timeout
        self.item_retries = retries

        # throttle defaults
        self.hits = hits
        self.window = window
        self.moderated = mod

        # refresh intervals
        self.update_interval = update_int
        self.ip_update_interval = ip_refresh

        # throttle key composition
        self.add_type = add_type
        self.add_ip = add_ip

        self.logger = logger
        self.ip_regex = re.compile(ip_regex)

        # set up tldextract
        self.extract = tldextract.TLDExtract()

        # reads self.logger and self.ip_regex, so it runs after they are set
        self.update_ipaddress()

        # last group of a uuid4 string; if we need better uuid's mod this line
        self.my_uuid = str(uuid.uuid4()).rsplit('-', 1)[-1]

        # replace next_request on the instance with its wrapped version
        wrap = next_request_method_wrapper(self)
        self.next_request = wrap(self.next_request)

    def setup_zookeeper(self):
        '''
        Create the ZookeeperWatcher for this scheduler's config path and
        exit the process when Zookeeper cannot be reached

        The watched path is ZOOKEEPER_ASSIGN_PATH + ZOOKEEPER_ID; config
        changes go to change_config and errors to error_config.
        '''
        self.assign_path = settings.get('ZOOKEEPER_ASSIGN_PATH', "")
        self.my_id = settings.get('ZOOKEEPER_ID', 'all')
        self.logger.debug("Trying to establish Zookeeper connection")
        try:
            watcher = ZookeeperWatcher(
                hosts=settings.get('ZOOKEEPER_HOSTS'),
                filepath=self.assign_path + self.my_id,
                config_handler=self.change_config,
                error_handler=self.error_config,
                pointer=False,
                ensure=True,
                valid_init=True)
            self.zoo_watcher = watcher
        except KazooTimeoutError:
            self.logger.error("Could not connect to Zookeeper")
            sys.exit(1)

        if not self.zoo_watcher.ping():
            self.logger.error("Could not ping Zookeeper")
            sys.exit(1)
        else:
            self.logger.debug("Successfully set up Zookeeper connection")

    def change_config(self, config_string):
        if config_string and len(config_string) > 0:
            loaded_config = yaml.safe_load(config_string)
            self.logger.info("Zookeeper config changed", extra=loaded_config)
            self.load_domain_config(loaded_config)
            self.update_domain_queues()
        elif config_string is None or len(config_string) == 0:
            self.error_config("Zookeeper config wiped")

        self.create_queues()

    def load_domain_config(self, loaded_config):
        '''
        Loads the domain_config and sets up queue_dict
        @param loaded_config: the yaml loaded config dict from zookeeper
        '''
        self.domain_config = {}
        # vetting process to ensure correct configs
        if loaded_config and 'domains' in loaded_config:
            for domain in loaded_config['domains']:
                item = loaded_config['domains'][domain]
                # check valid
                if 'window' in item and 'hits' in item:
                    self.logger.debug(
                        "Added domain {dom} to loaded config".format(
                            dom=domain))
                    self.domain_config[domain] = item

        self.config_flag = True

    def update_domain_queues(self):
        '''
        Check to update existing queues already in memory
        new queues are created elsewhere
        '''
        for key in self.domain_config:
            final_key = "{name}:{domain}:queue".format(name=self.spider.name,
                                                       domain=key)
            # we already have a throttled queue for this domain, update it to new settings
            if final_key in self.queue_dict:
                self.queue_dict[final_key].window = float(
                    self.domain_config[key]['window'])
                self.logger.debug(
                    "Updated queue {q} with new config".format(q=final_key))
                # if scale is applied, scale back; otherwise use updated hits
                if 'scale' in self.domain_config[key]:
                    # round to int
                    hits = int(
                        self.domain_config[key]['hits'] *
                        self.fit_scale(self.domain_config[key]['scale']))
                    self.queue_dict[final_key].limit = float(hits)
                else:
                    self.queue_dict[final_key].limit = float(
                        self.domain_config[key]['hits'])

    def error_config(self, message):
        extras = {}
        extras['message'] = message
        extras['revert_window'] = self.window
        extras['revert_hits'] = self.hits
        extras['spiderid'] = self.spider.name
        self.logger.info("Lost config from Zookeeper", extra=extras)
        # lost connection to zookeeper, reverting back to defaults
        for key in self.domain_config:
            final_key = "{name}:{domain}:queue".format(name=self.spider.name,
                                                       domain=key)
            self.queue_dict[final_key].window = self.window
            self.queue_dict[final_key].limit = self.hits

        self.domain_config = {}

    def fit_scale(self, scale):
        '''
        @return: a scale >= 0 and <= 1
        '''
        if scale >= 1:
            return 1.0
        elif scale <= 0:
            return 0.0
        else:
            return scale

    def create_queues(self):
        '''
        Updates the in memory list of the redis queues
        Creates new throttled queue instances if it does not have them
        '''
        # new config could have loaded between scrapes
        newConf = self.check_config()

        self.queue_keys = self.redis_conn.keys(self.spider.name + ":*:queue")

        for key in self.queue_keys:
            # build final queue key, depending on type and ip bools
            throttle_key = ""

            if self.add_type:
                throttle_key = self.spider.name + ":"
            if self.add_ip:
                throttle_key = throttle_key + self.my_ip + ":"

            # add the tld from the key `type:tld:queue`
            the_domain = re.split(':', key)[1]
            throttle_key = throttle_key + the_domain

            if key not in self.queue_dict or newConf:
                self.logger.debug(
                    "Added new Throttled Queue {q}".format(q=key))
                q = RedisPriorityQueue(self.redis_conn, key)

                # use default window and hits
                if the_domain not in self.domain_config:
                    self.queue_dict[key] = RedisThrottledQueue(
                        self.redis_conn, q, self.window, self.hits,
                        self.moderated, throttle_key, throttle_key)
                # use custom window and hits
                else:
                    window = self.domain_config[the_domain]['window']
                    hits = self.domain_config[the_domain]['hits']

                    # adjust the crawl rate based on the scale if exists
                    if 'scale' in self.domain_config[the_domain]:
                        hits = int(hits * self.fit_scale(
                            self.domain_config[the_domain]['scale']))

                    self.queue_dict[key] = RedisThrottledQueue(
                        self.redis_conn, q, window, hits, self.moderated,
                        throttle_key, throttle_key)

    def check_config(self):
        '''
        Controls configuration for the scheduler
        @return: True if there is a new configuration
        '''
        if self.config_flag:
            self.config_flag = False
            return True

        return False

    def update_ipaddress(self):
        '''
        Updates the scheduler so it knows its own ip address

        my_ip is first set from get_raspberrypi_ip_address() and then
        replaced by the first ip_regex match found in the PUBLIC_IP_URL
        response when that lookup succeeds. A change of ip is logged.
        '''
        # assign local ip in case of exception
        self.old_ip = self.my_ip
        self.my_ip = get_raspberrypi_ip_address()
        try:
            obj = urllib2.urlopen(
                settings.get('PUBLIC_IP_URL', 'http://ip.42.pl/raw'))
            try:
                results = self.ip_regex.findall(obj.read())
            finally:
                # always release the response, even when reading fails
                obj.close()

            if not results:
                raise IOError("Could not get valid IP Address")

            self.my_ip = results[0]
            self.logger.debug("Current public ip: {ip}".format(ip=self.my_ip))
        except IOError:
            # best effort: keep the locally detected address
            self.logger.error("Could not reach out to get public ip")

        if self.old_ip != self.my_ip:
            self.logger.info("Changed Public IP: {old} -> {new}".format(
                old=self.old_ip, new=self.my_ip))

    def report_self(self):
        '''
        Reports the crawler uuid to redis
        '''
        self.logger.debug("Reporting self id", extra={'uuid': self.my_uuid})
        key = "stats:crawler:{m}:{s}:{u}".format(m=socket.gethostname(),
                                                 s=self.spider.name,
                                                 u=self.my_uuid)
        self.redis_conn.set(key, time.time())
        self.redis_conn.expire(key, self.ip_update_interval * 2)

    @classmethod
    def from_settings(cls, settings, spidername):
        '''
        Build a scheduler from a settings object

        Opens the redis connection, reads the throttle and logging settings
        and creates the logger through CustomLogFactory.
        @param settings: object exposing get(name, default)
        @param spidername: spider name, used in the logger and log file names
        @return: a new instance of cls
        '''
        server = redis.Redis(host=settings.get('REDIS_HOST'),
                             port=settings.get('REDIS_PORT'))
        persist = settings.get('SCHEDULER_PERSIST', True)
        up_int = settings.get('SCHEDULER_QUEUE_REFRESH', 10)
        hits = settings.get('QUEUE_HITS', 10)
        window = settings.get('QUEUE_WINDOW', 60)
        mod = settings.get('QUEUE_MODERATED', False)
        timeout = settings.get('DUPEFILTER_TIMEOUT', 600)
        ip_refresh = settings.get('SCHEDULER_IP_REFRESH', 60)
        add_type = settings.get('SCHEDULER_TYPE_ENABLED', False)
        add_ip = settings.get('SCHEDULER_IP_ENABLED', False)
        # 'SCHEUDLER_ITEM_RETRIES' is the historical, misspelled key; it is
        # still honoured, but the correctly spelled key wins when present
        retries = settings.get(
            'SCHEDULER_ITEM_RETRIES',
            settings.get('SCHEUDLER_ITEM_RETRIES', 3))
        ip_regex = settings.get('IP_ADDR_REGEX', '.*')

        # one lookup serves both the logger name and the log file name
        local_ip = get_raspberrypi_ip_address()

        my_level = settings.get('SC_LOG_LEVEL', 'DEBUG')
        my_name = "%s_%s" % (spidername, local_ip)
        my_output = settings.get('SC_LOG_STDOUT', False)
        my_json = settings.get('SC_LOG_JSON', True)
        my_dir = settings.get('SC_LOG_DIR', 'logs')
        my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
        my_file = "%s_%s.log" % (spidername, local_ip)
        my_backups = settings.get('SC_LOG_BACKUPS', 5)

        logger = CustomLogFactory.get_instance(json=my_json,
                                               name=my_name,
                                               stdout=my_output,
                                               level=my_level,
                                               dir=my_dir,
                                               file=my_file,
                                               bytes=my_bytes,
                                               backups=my_backups)

        return cls(server, persist, up_int, timeout, retries, logger, hits,
                   window, mod, ip_refresh, add_type, add_ip, ip_regex)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings, crawler.spider.name)

    def open(self, spider):
        '''
        Attach the spider and prepare the queues, the Zookeeper watcher and
        the dupefilter

        @param spider: the spider that will use this scheduler
        '''
        self.spider = spider
        spider.set_logger(self.logger)
        spider.set_redis(self.redis_conn)
        spider.setup_stats()

        self.create_queues()
        self.setup_zookeeper()

        dupefilter_key = spider.name + ':dupefilter'
        self.dupefilter = RFPDupeFilter(self.redis_conn,
                                        dupefilter_key,
                                        self.rfp_timeout)

    def close(self, reason):
        self.logger.info("Closing Spider", {'spiderid': self.spider.name})
        if not self.persist:
            self.logger.warning("Clearing crawl queues")
            self.dupefilter.clear()
            for key in self.queue_keys:
                self.queue_dict[key].clear()

    def is_blacklisted(self, appid, crawlid):
        '''
        Checks the redis blacklist for crawls that should not be propagated
        either from expiring or stopped
        @return: True if the appid crawlid combo is blacklisted
        '''
        # key_check = '{appid}||{crawlid}'.format(appid=appid,
        #                                         crawlid=crawlid)
        # redis_key = self.spider.name + ":blacklist"
        # return self.redis_conn.sismember(redis_key, key_check)
        # dont use the blacklist
        return False

    def enqueue_request(self, request):
        '''
        Pushes a request from the spider into the proper throttled queue
        '''
        if not request.dont_filter and self.dupefilter.request_seen(request):
            self.logger.debug("Request not added back to redis")
            return
        req_dict = self.request_to_dict(request)
        if not self.is_blacklisted(req_dict['meta']['appid'],
                                   req_dict['meta']['crawlid']):
            # grab the tld of the request
            ex_res = self.extract(req_dict['url'])
            key = "{sid}:{dom}.{suf}:queue".format(
                sid=req_dict['meta']['spiderid'],
                dom=ex_res.domain,
                suf=ex_res.suffix)

            curr_time = time.time()

            # insert if crawl never expires (0) or time < expires
            if req_dict['meta']['expires'] == 0 or \
                    curr_time < req_dict['meta']['expires']:
                # we may already have the queue in memory
                if key in self.queue_keys:
                    self.queue_dict[key].push(req_dict,
                                              req_dict['meta']['priority'])
                else:
                    # shoving into a new redis queue, negative b/c of sorted sets
                    # this will populate ourself and other schedulers when
                    # they call create_queues
                    self.redis_conn.zadd(key,
                                         pickle.dumps(req_dict, protocol=-1),
                                         -req_dict['meta']['priority'])

                self.logger.debug(
                    "Crawlid: '{id}' Appid: '{appid}' Url: '{url}' added to queue"
                    .format(appid=req_dict['meta']['appid'],
                            id=req_dict['meta']['crawlid'],
                            url=req_dict['meta']['url']))
            else:
                self.logger.debug(
                    "Crawlid: '{id}' Appid: '{appid}' expired".format(
                        appid=req_dict['meta']['appid'],
                        id=req_dict['meta']['crawlid']))

        else:
            self.logger.debug(
                "Crawlid: '{id}' Appid: '{appid}' blacklisted".format(
                    appid=req_dict['meta']['appid'],
                    id=req_dict['meta']['crawlid']))

    def request_to_dict(self, request):
        '''
        Convert Request object to a dict.
        modified from scrapy.utils.reqser
        '''
        req_dict = {
            # urls should be safe (safe_string_url)
            'url':
            request.url.decode('ascii'),
            'method':
            request.method,
            'headers':
            dict(request.headers),
            'body':
            request.body,
            'cookies':
            request.cookies,
            'meta':
            request.meta,
            '_encoding':
            request._encoding,
            'priority':
            request.priority,
            'dont_filter':
            request.dont_filter,
            #  callback/errback are assumed to be a bound instance of the spider
            'callback':
            None if request.callback is None else request.callback.func_name,
            'errback':
            None if request.errback is None else request.errback.func_name,
        }
        return req_dict

    def find_item(self):
        '''
        Finds an item from the throttled queues
        '''
        random.shuffle(self.queue_keys)
        count = 0
        while count <= self.item_retries:
            for key in self.queue_keys:
                # the throttled queue only returns an item if it is allowed
                item = self.queue_dict[key].pop()
                self.present_item = item
                #self.spider.log('key: %s ' % key)
                msgvalue = {
                    'queuename': key,
                    'lenthofqueue': (int(len(self.queue_dict[key])) or 0)
                }
                msg = "lenth of queue %s" % key
                self.logger.info('key: %s ' % key)
                self.logger.info('len(self.queue_dict[key]): %s ' %
                                 len(self.queue_dict[key]))
                self.logger.info(msg, msgvalue)

                if item:
                    return item
            # we want the spiders to get slightly out of sync
            # with each other for better performance
            time.sleep(random.random())
            count = count + 1

        return None

    def next_request(self):
        '''
        Logic to handle getting a new url request, from a bunch of
        different queues
        '''

        t = time.time()
        # update the redis queues every so often

        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()

        item = self.find_item()

        if item:
            self.logger.info(
                'distributed_scheduler.py::DistributedScheduler::next_request call find_item() result is : %s'
                % (item["meta"]["url"] if 'meta' in item else item["url"]))
            self.logger.debug("Found url to crawl {url}" \
                    .format(url=item['url']))
            try:
                req = Request(item['url'])
            except ValueError:
                # need absolute url
                # need better url validation here
                req = Request('http://' + item['url'])

            if 'callback' in item:
                cb = item['callback']
                if cb and self.spider:
                    cb = get_method(self.spider, cb)
                    req.callback = cb

            if 'errback' in item:
                eb = item['errback']
                if eb and self.spider:
                    eb = get_method(self.spider, eb)
                    req.errback = eb

            if 'meta' in item:
                item = item['meta']

            # defaults not in schema
            if 'curdepth' not in item:
                item['curdepth'] = 0
            if "retry_times" not in item:
                item['retry_times'] = 0

            for key in item.keys():
                req.meta[key] = item[key]

            # extra check to add items to request
            if 'useragent' in item and item['useragent'] is not None:
                req.headers['User-Agent'] = item['useragent']
            if 'cookie' in item and item['cookie'] is not None:
                if isinstance(item['cookie'], dict):
                    req.cookies = item['cookie']
                elif isinstance(item['cookie'], basestring):
                    req.cookies = self.parse_cookie(item['cookie'])
            return req

        return None

    def parse_cookie(self, string):
        '''
        Parses a cookie string like returned in a Set-Cookie header
        @param string: The cookie string
        @return: the cookie dict
        '''
        results = re.findall('([^=]+)=([^\;]+);?\s?', string)
        my_dict = {}
        for item in results:
            my_dict[item[0]] = item[1]

        return my_dict

    def has_pending_requests(self):
        '''
        We never want to say we have pending requests
        If this returns True scrapy sometimes hangs.
        '''
        return False
コード例 #5
0
class TestZookeeperWatcher(TestCase):
    # Offline tests: KazooClient is patched out while the watcher is built,
    # so no Zookeeper server is contacted.

    def setUp(self):

        zoo_client = MagicMock()
        zoo_client.get = MagicMock(return_value=('data', 'blah'))

        with patch('scutils.zookeeper_watcher.KazooClient') as k:
            k.return_value = zoo_client
            self.zoo_watcher = ZookeeperWatcher(hosts='localhost',
                                                filepath='/mypath',
                                                pointer=False,
                                                ensure=True,
                                                valid_init=True)

    def test_ping(self):
        # ping succeeds while server_version answers
        self.zoo_watcher.zoo_client.server_version = MagicMock()
        self.assertTrue(self.zoo_watcher.ping())
        # and fails once server_version raises
        self.zoo_watcher.zoo_client.server_version = MagicMock(
            side_effect=KazooException)
        self.assertFalse(self.zoo_watcher.ping())

    def test_get_file_contents(self):
        # assertEqual replaces the deprecated assertEquals alias
        self.zoo_watcher.old_pointed = 'old_pointed'
        self.zoo_watcher.old_data = 'old_data'

        self.zoo_watcher.pointer = False
        self.assertEqual(self.zoo_watcher.get_file_contents(), 'old_data')

        self.zoo_watcher.pointer = True
        self.assertEqual(self.zoo_watcher.get_file_contents(), 'old_data')

        self.zoo_watcher.pointer = True
        self.assertEqual(self.zoo_watcher.get_file_contents(True),
                         'old_pointed')

    def test_compare_pointer(self):
        self.zoo_watcher.old_pointed = '/path1'

        self.assertTrue(self.zoo_watcher.compare_pointer('/path2'))

        self.zoo_watcher.old_pointed = '/path1'

        self.assertFalse(self.zoo_watcher.compare_pointer('/path1'))

    def test_compare_data(self):
        self.zoo_watcher.old_data = 'old_data'

        self.assertTrue(self.zoo_watcher.compare_data('new_data'))

        self.zoo_watcher.old_data = 'same_data'
        self.assertFalse(self.zoo_watcher.compare_data('same_data'))

    def test_set_valid(self):
        self.zoo_watcher.is_valid = MagicMock(return_value=True)
        self.zoo_watcher.valid_handler = MagicMock()
        self.zoo_watcher.set_valid(False)

        self.zoo_watcher.valid_handler.assert_called_once_with(True)

    def test_call_valid(self):
        self.the_bool = False

        def the_set(state):
            self.the_bool = True

        self.zoo_watcher.valid_handler = the_set
        self.zoo_watcher.call_valid(True)

        self.assertTrue(self.the_bool)

    def test_call_config(self):
        self.the_bool = False

        def the_set(state):
            self.the_bool = True

        self.zoo_watcher.config_handler = the_set
        self.zoo_watcher.call_config(True)

        self.assertTrue(self.the_bool)

    def test_call_error(self):
        self.the_bool = False

        def the_set(state):
            self.the_bool = True

        self.zoo_watcher.error_handler = the_set
        self.zoo_watcher.call_error(True)

        self.assertTrue(self.the_bool)
コード例 #6
0
class DistributedScheduler(object):
    '''
    Scheduler that keeps throttled Redis queues per spider job and domain
    and reloads its throttle settings from a Zookeeper config

    The attributes below are class-level defaults.
    '''

    redis_conn = None  # the redis connection
    queue_dict = None  # the dict of throttled queues
    spider = None  # the spider using this scheduler
    queue_keys = None  # the list of current queues
    queue_class = None  # the class to use for the queue
    dupefilter = None  # the redis dupefilter
    update_time = 0  # the last time the queues were updated
    update_ip_time = 0  # the last time the ip was updated
    update_interval = 0  # how often to update the queues
    extract = None  # the tld extractor
    hits = 0  # default number of hits for a queue
    window = 0  # default window to calculate number of hits
    ip = '127.0.0.1'  # the IP address of this crawler node
    old_ip = None  # the old ip for logging
    ip_update_interval = 0  # the interval to update the ip address
    add_type = None  # add spider type to redis throttle queue key
    add_ip = None  # add spider public ip to redis throttle queue key
    item_retries = 0  # the number of extra tries to get an item
    my_uuid = None  # the generated UUID for the particular scrapy process
    # Zookeeper Dynamic Config Vars
    domain_config = {}  # The list of domains and their configs
    my_id = None  # The id used to read the throttle config
    config_flag = False  # Flag to reload queues if settings are wiped too
    assign_path = None  # The base assigned configuration path to read
    zoo_client = None  # The KazooClient to manage the config
    my_assignment = None  # Zookeeper path to read actual yml config
    black_domains = []  # the domains to ignore thanks to zookeeper config

    producer = None  # the producer for the Kafka message queue
    closed = False  # whether the Kafka connection has been closed

    def __init__(self, server, persist, update_int, timeout, retries, logger,
                 hits, window, mod, ip_refresh, add_type, add_ip, ip_regex,
                 backlog_blacklist, queue_timeout, chose):
        '''
        Store the scheduler configuration

        @param server: the redis connection
        @param persist: stored as self.persist
        @param update_int: how often to update the queues
        @param timeout: stored as self.rfp_timeout
        @param retries: the number of extra tries to get an item
        @param logger: the logger used by the scheduler
        @param hits: default number of hits for a queue
        @param window: default window to calculate number of hits
        @param mod: flag passed through to every RedisThrottledQueue
        @param ip_refresh: the interval to update the ip address
        @param add_type: add spider type to redis throttle queue key
        @param add_ip: add spider ip to redis throttle queue key
        @param ip_regex: pattern compiled into self.ip_regex
        @param backlog_blacklist: stored as self.backlog_blacklist
        @param queue_timeout: stored as self.queue_timeout
        @param chose: stored as self.chose
        '''
        self.redis_conn = server
        self.persist = persist
        self.queue_dict = {}
        self.update_interval = update_int
        self.hits = hits
        self.window = window
        self.moderated = mod
        self.rfp_timeout = timeout
        self.ip_update_interval = ip_refresh
        self.add_type = add_type
        self.add_ip = add_ip
        # the class attribute is `item_retries`; the previous spelling left
        # it at its default of 0
        self.item_retries = retries
        # misspelled alias kept for any existing reader of the old name
        self.item_retires = retries
        self.logger = logger
        self.ip_regex = re.compile(ip_regex)
        self.backlog_blacklist = backlog_blacklist
        self.queue_timeout = queue_timeout
        self.chose = chose
        self.extract = tldextract.TLDExtract()

        self.job_id = None  # identifies the crawler process
        self.paused = False  # whether the crawler is paused

    def setup_zookeeper(self):
        '''
        Create the ZookeeperWatcher for this scheduler's config path and
        exit the process when Zookeeper cannot be reached

        The watched path is ZOOKEEPER_ASSIGN_PATH + ZOOKEEPER_ID; config
        changes go to change_config and errors to error_config.
        '''
        self.assign_path = settings.get('ZOOKEEPER_ASSIGN_PATH', "")
        self.my_id = settings.get('ZOOKEEPER_ID', 'all')
        self.logger.debug("Trying to establish Zookeeper connection")
        try:
            watcher = ZookeeperWatcher(
                hosts=settings.get('ZOOKEEPER_HOSTS'),
                filepath=self.assign_path + self.my_id,
                config_handler=self.change_config,
                error_handler=self.error_config,
                pointer=False,
                ensure=True,
                valid_init=True)
            self.zoo_watcher = watcher
        except KazooTimeoutError:
            self.logger.error("Could not connect to Zookeeper")
            sys.exit(1)

        if not self.zoo_watcher.ping():
            self.logger.error("Could not ping Zookeeper")
            sys.exit(1)
        else:
            self.logger.debug("Successfully set up Zookeeper connection")

    def change_config(self, config_string):
        if config_string and len(config_string) > 0:
            loaded_config = yaml.safe_load(config_string)
            self.logger.info("Zookeeper config changed", extra=loaded_config)
            self.load_domain_config(loaded_config)
            self.update_domain_queues()
        elif config_string is None or len(config_string) == 0:
            self.error_config("Zookeeper config wiped")

        self.create_throttle_queues()

    def load_domain_config(self, loaded_config):
        '''
        Loads the domain_config and sets up queue_dict

        @param loaded_config: the yaml loaded config dict from zookeeper
        '''
        self.domain_config = {}
        # vetting process to ensure correct configs
        if loaded_config:
            if 'domains' in loaded_config:
                for domain in loaded_config['domains']:
                    item = loaded_config['domains'][domain]
                    # check valid
                    if 'window' in item and 'hits' in item:
                        self.logger.debug("Added domain {dom} to loaded config"
                                          .format(dom=domain))
                        self.domain_config[domain] = item
                        # domain_config = {'wikipedia.org': {'window': 60, 'scale': 0.5, 'hits': 30}}
            if 'blacklist' in loaded_config:
                self.black_domains = loaded_config['blacklist']
                # black_domains = ['domain3.com', 'www.baidu.com']

        self.config_flag = True

    def update_domain_queues(self):
        '''
        Check to update existing queues already in memory
        new queues are created elsewhere
        '''
        for key in self.domain_config:
            final_key = "{spider_type}:{job_id}:{domain}:queue".format(
                    spider_type=self.spider.name,
                    job_id=self.job_id,
                    domain=key)
            # we already have a throttled queue for this domain, update it to new settings
            if final_key in self.queue_dict:
                self.queue_dict[final_key][0].window = float(self.domain_config[key]['window'])
                self.logger.debug("Updated queue {q} with new config"
                                  .format(q=final_key))
                # if scale is applied, scale back; otherwise use updated hits
                if 'scale' in self.domain_config[key]:
                    # round to int
                    hits = int(self.domain_config[key]['hits'] * self.fit_scale(
                               self.domain_config[key]['scale']))
                    self.queue_dict[final_key][0].limit = float(hits)
                else:
                    self.queue_dict[final_key][0].limit = float(self.domain_config[key]['hits'])

    def error_config(self, message):
        extras = {}
        extras['message'] = message
        extras['revert_window'] = self.window
        extras['revert_hits'] = self.hits
        extras['spiderid'] = self.spider.name
        self.logger.info("Lost config from Zookeeper", extra=extras)
        # lost connection to zookeeper, reverting back to defaults
        for key in self.domain_config:
            final_key = "{name}:{domain}:queue".format(
                    name=self.spider.name,
                    domain=key)
            self.queue_dict[final_key][0].window = self.window
            self.queue_dict[final_key][0].limit = self.hits

        self.domain_config = {}

    def fit_scale(self, scale):
        '''
        Clamp a scale factor to the closed interval [0, 1].

        @param scale: the requested scale factor
        @return: 0.0 when scale <= 0, 1.0 when scale >= 1, otherwise the
            scale itself, unchanged
        '''
        if scale <= 0:
            return 0.0
        if scale >= 1:
            return 1.0
        return scale

    def create_throttle_queues(self):
        """
        Refresh the list of redis queues and build the missing throttled queues.

        Stores every redis key matching '<spider>:<job_id>:*:queue' in
        self.queue_keys. For each key not yet present in self.queue_dict
        (or for every key when check_config() reports a new configuration)
        a [RedisThrottledQueue, timestamp] pair is stored in self.queue_dict.
        Window and hits come from self.domain_config when the domain is
        configured there, otherwise from the scheduler defaults.

        :return: None
        """
        reload_all = self.check_config()
        pattern = '{spider_type}:{job_id}:*:queue'.format(spider_type=self.spider.name,
                                                          job_id=self.job_id)
        self.queue_keys = self.redis_conn.keys(pattern)
        for queue_key in self.queue_keys:
            # throttle key is [spider name:][ip:]domain, depending on the flags
            prefix = ""
            if self.add_type:
                prefix = self.spider.name + ":"
            if self.add_ip:
                prefix = prefix + self.ip + ":"

            # the domain is the third field of spider_type:job_id:domain:queue
            domain = re.split(':', queue_key)[2]
            throttle_key = prefix + domain

            if queue_key in self.queue_dict and not reload_all:
                continue

            self.logger.debug("Added new Throttled Queue {q}"
                              .format(q=queue_key))
            base_queue = RedisPriorityQueue(self.redis_conn, queue_key)
            if domain in self.domain_config:
                domain_conf = self.domain_config[domain]
                window = domain_conf['window']
                hits = domain_conf['hits']
                # scale the allowed hits down when a scale is configured
                if 'scale' in domain_conf:
                    hits = int(hits * self.fit_scale(domain_conf['scale']))
            else:
                window = self.window
                hits = self.hits

            # entry layout: [throttled queue, time the entry was created/used]
            self.queue_dict[queue_key] = [
                RedisThrottledQueue(self.redis_conn, base_queue, window, hits,
                                    self.moderated, throttle_key, throttle_key, True),
                time.time()]

    def expire_queues(self):
        '''
        Drop queue_dict entries whose timestamp is older than queue_timeout.

        Each expired key is logged, removed from queue_dict and, when
        present, from queue_keys as well. Keeps memory from growing when
        many different domains are crawled.
        '''
        now = time.time()
        stale_keys = [name for name in self.queue_dict
                      if now - self.queue_dict[name][1] > self.queue_timeout]
        for name in stale_keys:
            self.logger.debug("Expiring domain queue key " + name)
            del self.queue_dict[name]
            if name in self.queue_keys:
                self.queue_keys.remove(name)

    def check_config(self):
        '''
        Report whether a new configuration is pending, consuming the flag.

        @return: True if config_flag was set (it is cleared as a side
            effect), False otherwise
        '''
        if not self.config_flag:
            return False

        self.config_flag = False
        return True

    def report_self(self):
        '''
        Record this spider node in redis.

        Stores the current timestamp under the key
        'stats:spider:<local ip>:<job id>'.
        '''
        node_key = "stats:spider:{ip}:{job}".format(
            ip=DistributedScheduler.get_local_ip(),
            job=self.job_id
        )
        self.redis_conn.set(node_key, time.time())

    @staticmethod
    def get_local_ip(ifname='enp1s0'):
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        inet = fcntl.ioctl(s.fileno(), 0x8915, struct.pack('256s', ifname[:15]))
        ret = socket.inet_ntoa(inet[20:24])
        return ret

    @classmethod
    def from_settings(cls, settings):
        '''
        Build a scheduler from a settings object.

        Creates the redis connection, the logger and a ketama continuum
        seeded with a fixed list of spider ids; every tunable is read
        through settings.get() with the defaults shown below.

        @param settings: object exposing get(name, default)
        @return: a new instance of cls
        '''
        server = redis.Redis(host=settings.get('REDIS_HOST'),
                             port=settings.get('REDIS_PORT'),
                             db=settings.get('REDIS_DB'))
        persist = settings.get('SCHEDULER_PERSIST', True)
        up_int = settings.get('SCHEDULER_QUEUE_REFRESH', 10)
        hits = settings.get('QUEUE_HITS', 10)
        window = settings.get('QUEUE_WINDOW', 60)
        mod = settings.get('QUEUE_MODERATED', False)
        timeout = settings.get('DUPEFILTER_TIMEOUT', 600)
        ip_refresh = settings.get('SCHEDULER_IP_REFRESH', 60)
        add_type = settings.get('SCHEDULER_TYPE_ENABLED', True)
        add_ip = settings.get('SCHEDULER_IP_ENABLED', False)
        # 'SCHEUDLER_ITEM_RETRIES' is the historical, misspelled name; it is
        # still honoured, but the correctly spelled setting takes precedence
        retries = settings.get('SCHEDULER_ITEM_RETRIES',
                               settings.get('SCHEUDLER_ITEM_RETRIES', 3))
        ip_regex = settings.get('IP_ADDR_REGEX', '.*')
        backlog_blacklist = settings.get('SCHEDULER_BACKLOG_BLACKLIST', True)
        queue_timeout = settings.get('SCHEDULER_QUEUE_TIMEOUT', 3600)

        my_level = settings.get('SC_LOG_LEVEL', 'INFO')
        my_name = settings.get('SC_LOGGER_NAME', 'sc-logger')
        my_output = settings.get('SC_LOG_STDOUT', True)
        my_json = settings.get('SC_LOG_JSON', False)
        my_dir = settings.get('SC_LOG_DIR', 'logs')
        my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
        my_file = settings.get('SC_LOG_FILE', 'main.log')
        my_backups = settings.get('SC_LOG_BACKUPS', 5)

        logger = LogFactory.get_instance(json=my_json,
                                         name=my_name,
                                         stdout=my_output,
                                         level=my_level,
                                         dir=my_dir,
                                         file=my_file,
                                         bytes=my_bytes,
                                         backups=my_backups)

        # NOTE(review): the initial spider ids are hard-coded; the continuum
        # is rebuilt from redis in status_from_redis when the job is paused
        spider_ids = ['1', '2', '3']
        chose = ketama.Continuum(spider_ids)

        return cls(server, persist, up_int, timeout, retries, logger, hits,
                   window, mod, ip_refresh, add_type, add_ip, ip_regex,
                   backlog_blacklist, queue_timeout, chose)

    @classmethod
    def from_crawler(cls, crawler):
        '''
        Build the scheduler from a crawler by handing the crawler's
        settings to from_settings.

        @param crawler: object exposing a `settings` attribute
        @return: whatever cls.from_settings returns
        '''
        crawler_settings = crawler.settings
        return cls.from_settings(crawler_settings)

    def open(self, spider):
        '''
        Attach the scheduler to a spider and connect its backing services.

        Records the spider, the local ip and the job id (read from the
        spider's settings), builds the throttled queues, connects to
        Zookeeper and Kafka, registers this node in redis and creates the
        dupefilter.

        @param spider: the spider that will use this scheduler
        '''
        self.spider = spider
        self.ip = DistributedScheduler.get_local_ip()
        self.job_id = spider.settings['job_id']
        self.spider.set_logger(self.logger)

        self.create_throttle_queues()
        self.setup_zookeeper()  # connect to Zookeeper
        self.setup_kafka()  # connect to Kafka

        # register this node under stats:spider:<ip>:<job>
        node_key = "stats:spider:{ip}:{job}".format(
            ip=DistributedScheduler.get_local_ip(),
            job=self.job_id
        )
        self.redis_conn.set(node_key, time.time())
        self.dupefilter = RFPDupeFilter(self.redis_conn, self.spider.name)

    def close(self, reason):
        '''
        Shut the scheduler down.

        Clears the throttled queues unless the scheduler is persistent,
        removes this node's status keys from redis and closes the Kafka
        producer if one exists.

        @param reason: why the spider is closing (not used here)
        '''
        self.logger.info("Closing Spider", {'spiderid': self.spider.name})

        # empty the throttled queues that back the crawl queues
        if not self.persist:
            self.logger.warning("Clearing crawl queues")
            for queue_key in self.queue_keys:
                throttled_queue = self.queue_dict[queue_key][0]
                throttled_queue.clear()

        # remove the spider node state kept in redis
        node_key = "stats:spider:{ip}:{job}".format(
            ip=DistributedScheduler.get_local_ip(),
            job=self.job_id
        )
        self.redis_conn.delete(node_key)
        self.redis_conn.delete("{job}:status".format(job=self.job_id))

        # close the Kafka connection
        if self.producer is None:
            return
        self.logger.debug("Closing kafka producer")
        self.producer.close(timeout=10)

    def is_blacklisted(self, appid, crawlid):
        '''
        Look up an appid/crawlid pair in the spider's redis blacklist set.

        The set lives under '<spider name>:blacklist' and its members have
        the form '<appid>||<crawlid>'.

        @param appid: the application id of the crawl
        @param crawlid: the crawl id
        @return: the result of the redis SISMEMBER check
        '''
        member = '{appid}||{crawlid}'.format(appid=appid,
                                             crawlid=crawlid)
        return self.redis_conn.sismember(self.spider.name + ":blacklist",
                                         member)

    def enqueue_request(self, request):
        """
        Take a new Request produced by the spider and hand it to the Kafka
        message queue.

        A request already seen by the dupefilter is dropped unless it sets
        dont_filter. The serialized request is tagged with the spider type
        and with the job id that self.chose assigns to its target url.

        :param request: the request yielded by the spider
        :return: True if the request was sent to Kafka, False if it was
            dropped as a duplicate or the send failed
        """
        # honour the request's dont_filter flag; the previous constant
        # condition made this duplicate check unreachable
        if not request.dont_filter and self.dupefilter.request_seen(request):
            self.logger.debug("Request not added back to redis")
            return False
        req_dict = request_to_dict(request, self.spider)
        # for Splash requests the page being crawled is in the splash args
        real_url = req_dict['meta']['splash']['args']['url'] if 'splash' in req_dict['meta'] else req_dict['url']
        # important: attach spider_type and job_id
        req_dict['spider_type'] = self.spider.name
        req_dict['job_id'] = self.chose[real_url.encode('utf-8')]
        req_dict.update({'errback': 'parse_inform_index'})
        return self._feed_to_kafka(req_dict)

    def _feed_to_kafka(self, json_item):
        '''
        Send one item to the topic named by settings['KAFKA_PRODUCER_TOPIC']
        and flush the producer.

        Best effort: any exception is logged together with its traceback
        and reported through the return value instead of being raised.

        @param json_item: the object to publish
        @return: True if the item was sent and flushed, False on any error
        '''
        try:
            topic = settings['KAFKA_PRODUCER_TOPIC']
            self.logger.debug("Sending json to kafka at " + str(topic))
            self.producer.send(topic, json_item)
            self.producer.flush()
            return True
        except Exception:
            # keep the traceback: not every failure is a lost connection
            self.logger.error("Lost connection to Kafka",
                              {'ex': traceback.format_exc()})
            return False

    def find_item(self):
        '''
        Pop the next available item from the throttled queues.

        Shuffles queue_keys in place, then makes up to item_retries + 1
        passes over them, skipping queues whose domain is in black_domains.
        The first truthy item popped refreshes that queue's timestamp and
        is returned.

        @return: the popped item, or None if no pass produced one
        '''
        random.shuffle(self.queue_keys)
        attempt = 0

        while attempt <= self.item_retries:
            for queue_key in self.queue_keys:
                # third field of the key is the domain
                if queue_key.split(':')[2] not in self.black_domains:
                    entry = self.queue_dict[queue_key]
                    popped = entry[0].pop()
                    if popped:
                        entry[1] = time.time()
                        return popped
            attempt += 1

        return None

    def next_request(self):
        """
        Pop a serialized task from redis and rebuild it as a request for
        the engine.

        Two kinds of stored requests are handled:
            1. requests that already carry Splash metadata ('splash' in meta)
            2. plain requests, e.g. submitted by a front-end user

        :return: a SplashRequest, or None when the scheduler is paused or
            no queue yielded an item
        """
        if self.paused:
            return None

        item = self.find_item()
        if not item:
            return None

        if 'splash' in item['meta']:
            self.logger.debug("Crawl url: %s via %s" % (item['meta']['splash']['args']['url'], item['url']))
            req = SplashRequest(url=item['url'],
                                meta=item['meta'],
                                method=item['method'],
                                body=item['body'],
                                dont_send_headers=True
                                )
            # the callback is stored by name; a missing or None name leaves
            # req.callback untouched instead of crashing in getattr
            if item.get('callback'):
                req.callback = getattr(self.spider, item['callback'])
            req.headers['content-type'] = 'application/json'
            if 'headers' in req.meta['splash']['args']:
                req.meta['splash']['args']['headers'] = {}
                req.meta['splash']['args']['content-type'] = 'application/json'
            return req

        callback = item['callback']
        if callback is not None and not callable(callback):
            # stored by name: resolve it to the spider's method
            callback = getattr(self.spider, callback)

        # Request.body cannot be assigned after construction, so every
        # optional field is handed to the constructor instead
        optional = dict((name, item[name])
                        for name in ('method', 'headers', 'body',
                                     'cookies', 'priority')
                        if name in item)
        req = SplashRequest(url=item['url'],
                            callback=callback,
                            meta=item['meta'],
                            dont_send_headers=True,
                            **optional)
        self.logger.debug("Crawl url: %s" % item['url'])
        return req

    def status_from_redis(self):
        '''
        Refresh the queues and apply the job status stored in redis.

        Reads '<job_id>:status'. 'running' clears the paused flag. 'pause'
        sets it and rebuilds self.chose from the fourth ':'-separated field
        of every 'stats:spider:*:*' key. Any other value changes nothing.
        '''
        self.create_throttle_queues()
        self.expire_queues()

        state = self.redis_conn.get('{job}:status'.format(job=self.job_id))
        if state == 'running':
            self.paused = False
        elif state == 'pause':
            # pause the spider and reset the consistent distribution
            self.paused = True
            node_keys = self.redis_conn.keys('stats:spider:*:*')
            self.chose = ketama.Continuum(
                [node_key.split(':')[3] for node_key in node_keys])

    def parse_cookie(self, string):
        '''
        Parses a cookie string like returned in a Set-Cookie header
        @param string: The cookie string
        @return: the cookie dict; when a name repeats, the last value wins
        '''
        # raw string: '\;' and '\s' are invalid escapes in a normal literal
        results = re.findall(r'([^=]+)=([^\;]+);?\s?', string)
        # findall yields (name, value) tuples, which dict() consumes directly
        return dict(results)

    def has_pending_requests(self):
        '''
        Always report that nothing is pending.

        Deliberately constant: per the original author, returning True
        here sometimes makes scrapy hang.

        @return: False
        '''
        return False

    def setup_kafka(self):
        """
        Create the Kafka producer and store it in self.producer.

        _create_producer returns None when the scheduler is already closed;
        in that case no successful connection is reported.

        :return: None
        """
        self.producer = self._create_producer()
        if self.producer is None:
            self.logger.warning("Kafka producer was not created")
        else:
            self.logger.debug("Successfully connected to Kafka")

    @retry(wait_exponential_multiplier=500, wait_exponential_max=10000)
    def _create_producer(self):
        '''
        Create a KafkaProducer from the KAFKA_* settings.

        Failures are logged with their traceback and re-raised so that the
        retry decorator can invoke the method again.

        @return: the new KafkaProducer, or None if the scheduler is closed
        @raise KeyError: if a required setting is missing
        '''
        if self.closed:
            return None

        try:
            self.logger.debug("Creating new kafka producer using brokers: " +
                              str(settings['KAFKA_HOSTS']))

            return KafkaProducer(bootstrap_servers=settings['KAFKA_HOSTS'],
                                 value_serializer=lambda v: json.dumps(v).encode('utf-8'),
                                 retries=3,
                                 linger_ms=settings['KAFKA_PRODUCER_BATCH_LINGER_MS'],
                                 buffer_memory=settings['KAFKA_PRODUCER_BUFFER_BYTES'])
        except KeyError as e:
            self.logger.error('Missing setting named ' + str(e),
                              {'ex': traceback.format_exc()})
            # re-raise inside the handler: a bare raise after the except
            # block has no active exception on Python 3
            raise
        except Exception:
            self.logger.error("Couldn't initialize kafka producer.",
                              {'ex': traceback.format_exc()})
            raise
コード例 #7
0
class DistributedScheduler(object):
    '''
    Scrapy request scheduler that utilizes Redis Throttled Priority Queues
    to moderate different domain scrape requests within a distributed scrapy
    cluster
    '''
    redis_conn = None  # the redis connection
    queue_dict = None  # the dict of throttled queues
    spider = None  # the spider using this scheduler
    queue_keys = None  # the list of current queues
    queue_class = None  # the class to use for the queue
    dupefilter = None  # the redis dupefilter
    global_page_per_domain_filter = None  # the global redis page per domain filter, applied to all domains.
    domain_max_page_filter = None  # the individual domain's redis max page filter.
    update_time = 0  # the last time the queues were updated
    update_ip_time = 0  # the last time the ip was updated
    update_interval = 0  # how often to update the queues
    extract = None  # the tld extractor
    hits = 0  # default number of hits for a queue
    window = 0  # default window to calculate number of hits
    my_ip = None  # the ip address of the scheduler (if needed)
    old_ip = None  # the old ip for logging
    ip_update_interval = 0  # the interval to update the ip address
    add_type = None  # add spider type to redis throttle queue key
    add_ip = None  # add spider public ip to redis throttle queue key
    item_retries = 0  # the number of extra tries to get an item
    my_uuid = None  # the generated UUID for the particular scrapy process
    # Zookeeper Dynamic Config Vars
    domain_config = {}  # The list of domains and their configs
    my_id = None  # The id used to read the throttle config
    config_flag = False  # Flag to reload queues if settings are wiped too
    assign_path = None  # The base assigned configuration path to read
    zoo_client = None  # The KazooClient to manage the config
    my_assignment = None  # Zookeeper path to read actual yml config
    black_domains = []  # the domains to ignore thanks to zookeeper config

    def __init__(self, server, persist, update_int, timeout, retries, logger,
                 hits, window, mod, ip_refresh, add_type, add_ip, ip_regex,
                 backlog_blacklist, queue_timeout,
                 global_page_per_domain_limit,
                 global_page_per_domain_limit_timeout,
                 domain_max_page_timeout):
        '''
        Initialize the scheduler

        @param server: the redis connection
        @param persist: when False, close() clears the filters and queues
        @param update_int: how often to update the queues
        @param timeout: stored as rfp_timeout for the dupefilter
        @param retries: the number of extra tries to get an item
        @param logger: the logger used by the scheduler
        @param hits: default number of hits for a queue
        @param window: default window to calculate number of hits
        @param mod: stored as `moderated` and passed to every throttled queue
        @param ip_refresh: the interval to update the ip address
        @param add_type: add spider type to redis throttle queue key
        @param add_ip: add spider public ip to redis throttle queue key
        @param ip_regex: pattern used to pull the ip out of the lookup reply
        @param backlog_blacklist: stored as-is; consulted when enqueueing
        @param queue_timeout: age after which an unused queue entry expires
        @param global_page_per_domain_limit: stored for the global page filter
        @param global_page_per_domain_limit_timeout: timeout of that filter
        @param domain_max_page_timeout: timeout of the per-domain page filter
        '''
        # connection and logging
        self.redis_conn = server
        self.logger = logger

        # queue bookkeeping
        self.queue_dict = {}
        self.persist = persist
        self.update_interval = update_int
        self.queue_timeout = queue_timeout
        self.item_retries = retries
        self.backlog_blacklist = backlog_blacklist

        # default throttle values
        self.hits = hits
        self.window = window
        self.moderated = mod

        # throttle key composition and ip discovery
        self.add_type = add_type
        self.add_ip = add_ip
        self.ip_update_interval = ip_refresh
        self.ip_regex = re.compile(ip_regex)

        # filter settings
        self.rfp_timeout = timeout
        self.global_page_per_domain_limit = global_page_per_domain_limit
        self.global_page_per_domain_limit_timeout = global_page_per_domain_limit_timeout
        self.domain_max_page_timeout = domain_max_page_timeout

        # set up tldextract
        self.extract = tldextract.TLDExtract()

        # needs logger and ip_regex, both assigned above
        self.update_ipaddress()

        # last 12 hex digits of a random uuid;
        # if we need better uuid's mod this line
        self.my_uuid = uuid.uuid4().hex[-12:]

    def setup_zookeeper(self):
        '''
        Create the ZookeeperWatcher that feeds config changes to the
        scheduler.

        The watched path is ZOOKEEPER_ASSIGN_PATH + ZOOKEEPER_ID. The
        process exits with status 1 if the connection times out or the
        watcher cannot be pinged.
        '''
        self.assign_path = settings.get('ZOOKEEPER_ASSIGN_PATH', "")
        self.my_id = settings.get('ZOOKEEPER_ID', 'all')
        self.logger.debug("Trying to establish Zookeeper connection")

        watcher_options = dict(
            hosts=settings.get('ZOOKEEPER_HOSTS'),
            filepath=self.assign_path + self.my_id,
            config_handler=self.change_config,
            error_handler=self.error_config,
            pointer=False,
            ensure=True,
            valid_init=True)
        try:
            self.zoo_watcher = ZookeeperWatcher(**watcher_options)
        except KazooTimeoutError:
            self.logger.error("Could not connect to Zookeeper")
            sys.exit(1)

        if not self.zoo_watcher.ping():
            self.logger.error("Could not ping Zookeeper")
            sys.exit(1)

        self.logger.debug("Successfully set up Zookeeper connection")

    def change_config(self, config_string):
        '''
        Handle a new config payload.

        A non-empty payload is parsed as YAML, loaded into the domain
        config and applied to the existing queues; an empty or missing
        payload reverts to the defaults via error_config. The queues are
        rebuilt in either case.

        @param config_string: the raw config text, possibly None or empty
        '''
        if config_string:
            parsed = yaml.safe_load(config_string)
            self.logger.info("Zookeeper config changed", extra=parsed)
            self.load_domain_config(parsed)
            self.update_domain_queues()
        else:
            self.error_config("Zookeeper config wiped")

        self.create_queues()

    def load_domain_config(self, loaded_config):
        '''
        Loads the domain_config and the domain blacklist, then flags the
        new configuration so the queues get rebuilt.

        Only domain entries providing both 'window' and 'hits' are kept.
        The blacklist is reset on every load, so domains dropped from the
        config stop being ignored.

        @param loaded_config: the yaml loaded config dict from zookeeper
        '''
        self.domain_config = {}
        # start from an empty blacklist; otherwise entries removed from the
        # config would stay blacklisted forever
        self.black_domains = []
        # vetting process to ensure correct configs
        if loaded_config:
            if 'domains' in loaded_config:
                # an empty yaml section loads as None
                domains = loaded_config['domains'] or {}
                for domain in domains:
                    item = domains[domain]
                    # check valid
                    if item and 'window' in item and 'hits' in item:
                        self.logger.debug(
                            "Added domain {dom} to loaded config".format(
                                dom=domain))
                        self.domain_config[domain] = item
            if 'blacklist' in loaded_config:
                self.black_domains = loaded_config['blacklist'] or []

        self.config_flag = True

    def update_domain_queues(self):
        '''
        Push the current domain_config onto the queues already in memory.

        For every configured domain that has an entry in queue_dict, the
        queue's window and limit are overwritten (hits are scaled down when
        a 'scale' is configured). Queues that do not exist yet are left to
        be created elsewhere.
        '''
        for domain, conf in self.domain_config.items():
            queue_key = "{name}:{domain}:queue".format(name=self.spider.name,
                                                       domain=domain)
            # only queues we already hold can be updated
            if queue_key not in self.queue_dict:
                continue

            throttled = self.queue_dict[queue_key][0]
            throttled.window = float(conf['window'])
            self.logger.debug(
                "Updated queue {q} with new config".format(q=queue_key))

            if 'scale' in conf:
                # scaled hits are truncated to an int
                limit = int(conf['hits'] * self.fit_scale(conf['scale']))
            else:
                limit = conf['hits']
            throttled.limit = float(limit)

    def error_config(self, message):
        '''
        Revert the configured domain queues to the default throttle.

        Logs the loss of the Zookeeper config, resets window/limit on every
        in-memory queue that belongs to a configured domain, and finally
        empties domain_config.

        @param message: reason for the revert, attached to the log record
        '''
        extras = {}
        extras['message'] = message
        extras['revert_window'] = self.window
        extras['revert_hits'] = self.hits
        extras['spiderid'] = self.spider.name
        self.logger.info("Lost config from Zookeeper", extra=extras)
        # lost connection to zookeeper, reverting back to defaults
        for key in self.domain_config:
            final_key = "{name}:{domain}:queue".format(name=self.spider.name,
                                                       domain=key)
            # a configured domain may have no queue in memory (never
            # crawled, or already expired); skip it instead of raising
            if final_key in self.queue_dict:
                self.queue_dict[final_key][0].window = self.window
                self.queue_dict[final_key][0].limit = self.hits

        self.domain_config = {}

    def fit_scale(self, scale):
        '''
        Clamp a scale factor to the closed interval [0, 1].

        @param scale: the requested scale factor
        @return: 1.0 when scale >= 1, 0.0 when scale <= 0, otherwise the
            scale itself, unchanged
        '''
        if scale >= 1:
            return 1.0
        return 0.0 if scale <= 0 else scale

    def create_queues(self):
        '''
        Updates the in memory list of the redis queues
        Creates new throttled queue instances if it does not have them

        Every redis key matching '<spider name>:*:queue' is stored in
        queue_keys. A key gets a fresh [RedisThrottledQueue, timestamp]
        entry in queue_dict when it has none yet, or when check_config()
        reports that a new configuration was loaded.
        '''
        # new config could have loaded between scrapes
        reload_all = self.check_config()

        self.queue_keys = self.redis_conn.keys(self.spider.name + ":*:queue")

        for queue_key in self.queue_keys:
            # throttle key prefix depends on the type and ip bools
            prefix = ""
            if self.add_type:
                prefix = self.spider.name + ":"
            if self.add_ip:
                prefix = prefix + self.my_ip + ":"

            # the tld is the second field of the key `type:tld:queue`
            domain = re.split(':', queue_key)[1]
            throttle_key = prefix + domain

            if queue_key in self.queue_dict and not reload_all:
                continue

            self.logger.debug(
                "Added new Throttled Queue {q}".format(q=queue_key))
            base_queue = RedisPriorityQueue(self.redis_conn, queue_key,
                                            encoding=ujson)

            if domain in self.domain_config:
                # use custom window and hits
                domain_conf = self.domain_config[domain]
                window = domain_conf['window']
                hits = domain_conf['hits']

                # adjust the crawl rate based on the scale if exists
                if 'scale' in domain_conf:
                    hits = int(hits * self.fit_scale(domain_conf['scale']))
            else:
                # use default window and hits
                window = self.window
                hits = self.hits

            # each entry is a two item list: use [0] to get the queue
            # object, use [1] to get the time
            self.queue_dict[queue_key] = [
                RedisThrottledQueue(self.redis_conn, base_queue, window, hits,
                                    self.moderated, throttle_key,
                                    throttle_key, True),
                time.time()
            ]

    def expire_queues(self):
        '''
        Drop queue_dict entries whose timestamp is older than queue_timeout.

        Each expired key is logged, removed from queue_dict and, when
        present, from queue_keys as well. Keeps memory from growing when
        many different domains are crawled.
        '''
        now = time.time()
        expired = [name for name, entry in self.queue_dict.items()
                   if now - entry[1] > self.queue_timeout]
        for name in expired:
            self.logger.debug("Expiring domain queue key " + name)
            del self.queue_dict[name]
            if name in self.queue_keys:
                self.queue_keys.remove(name)

    def check_config(self):
        '''
        Report whether a new configuration is pending, consuming the flag.

        @return: True if config_flag was set (it is cleared as a side
            effect), False otherwise
        '''
        has_new_config = bool(self.config_flag)
        if has_new_config:
            self.config_flag = False
        return has_new_config

    def update_ipaddress(self):
        '''
        Updates the scheduler so it knows its own ip address

        Fetches PUBLIC_IP_URL and takes the first match of ip_regex in the
        reply as the new address. If the lookup fails or nothing matches,
        my_ip falls back to 127.0.0.1. A change of address is logged.
        '''
        # assign local ip in case of exception
        self.old_ip = self.my_ip
        self.my_ip = '127.0.0.1'
        try:
            url = settings.get('PUBLIC_IP_URL', 'http://ip.42.pl/raw')
            # NOTE(review): no timeout is passed, so a stalled lookup blocks
            # the caller - consider adding one
            # the with-block closes the response on every path, including
            # the failure raised below
            with urllib.request.urlopen(url) as obj:
                results = self.ip_regex.findall(obj.read().decode('utf-8'))
            if results:
                self.my_ip = results[0]
            else:
                raise IOError("Could not get valid IP Address")
            self.logger.debug("Current public ip: {ip}".format(ip=self.my_ip))
        except IOError:
            self.logger.error("Could not reach out to get public ip")

        if self.old_ip != self.my_ip:
            self.logger.info("Changed Public IP: {old} -> {new}".format(
                old=self.old_ip, new=self.my_ip))

    def report_self(self):
        '''
        Reports the crawler uuid to redis

        Stores the current time under
        'stats:crawler:<hostname>:<spider name>:<uuid>' and lets the key
        expire after twice the ip update interval.
        '''
        self.logger.debug("Reporting self id", extra={'uuid': self.my_uuid})
        stats_key = "stats:crawler:{m}:{s}:{u}".format(
            m=socket.gethostname(),
            s=self.spider.name,
            u=self.my_uuid)
        ttl = self.ip_update_interval * 2
        self.redis_conn.set(stats_key, time.time())
        self.redis_conn.expire(stats_key, ttl)

    @classmethod
    def from_settings(cls, settings):
        '''
        Build a scheduler from a settings object.

        Creates the redis connection and the logger; every tunable is read
        through settings.get() with the defaults shown below.

        @param settings: object exposing get(name, default)
        @return: a new instance of cls
        '''
        server = redis.Redis(
            host=settings.get('REDIS_HOST'),
            port=settings.get('REDIS_PORT'),
            db=settings.get('REDIS_DB'),
            password=settings.get('REDIS_PASSWORD'),
            decode_responses=True,
            socket_timeout=settings.get('REDIS_SOCKET_TIMEOUT'),
            socket_connect_timeout=settings.get('REDIS_SOCKET_TIMEOUT'))
        persist = settings.get('SCHEDULER_PERSIST', True)
        up_int = settings.get('SCHEDULER_QUEUE_REFRESH', 10)
        hits = settings.get('QUEUE_HITS', 10)
        window = settings.get('QUEUE_WINDOW', 60)
        mod = settings.get('QUEUE_MODERATED', False)
        timeout = settings.get('DUPEFILTER_TIMEOUT', 600)
        ip_refresh = settings.get('SCHEDULER_IP_REFRESH', 60)
        add_type = settings.get('SCHEDULER_TYPE_ENABLED', False)
        add_ip = settings.get('SCHEDULER_IP_ENABLED', False)
        # 'SCHEUDLER_ITEM_RETRIES' is the historical, misspelled name; it is
        # still honoured, but the correctly spelled setting takes precedence
        retries = settings.get('SCHEDULER_ITEM_RETRIES',
                               settings.get('SCHEUDLER_ITEM_RETRIES', 3))
        ip_regex = settings.get('IP_ADDR_REGEX', '.*')
        backlog_blacklist = settings.get('SCHEDULER_BACKLOG_BLACKLIST', True)
        queue_timeout = settings.get('SCHEDULER_QUEUE_TIMEOUT', 3600)

        my_level = settings.get('SC_LOG_LEVEL', 'INFO')
        my_name = settings.get('SC_LOGGER_NAME', 'sc-logger')
        my_output = settings.get('SC_LOG_STDOUT', True)
        my_json = settings.get('SC_LOG_JSON', False)
        my_dir = settings.get('SC_LOG_DIR', 'logs')
        my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
        my_file = settings.get('SC_LOG_FILE', 'main.log')
        my_backups = settings.get('SC_LOG_BACKUPS', 5)

        logger = LogFactory.get_instance(json=my_json,
                                         name=my_name,
                                         stdout=my_output,
                                         level=my_level,
                                         dir=my_dir,
                                         file=my_file,
                                         bytes=my_bytes,
                                         backups=my_backups)

        global_page_per_domain_limit = settings.get(
            'GLOBAL_PAGE_PER_DOMAIN_LIMIT', None)
        global_page_per_domain_limit_timeout = settings.get(
            'GLOBAL_PAGE_PER_DOMAIN_LIMIT_TIMEOUT', 600)
        domain_max_page_timeout = settings.get('DOMAIN_MAX_PAGE_TIMEOUT', 600)

        return cls(server, persist, up_int, timeout, retries, logger, hits,
                   window, mod, ip_refresh, add_type, add_ip, ip_regex,
                   backlog_blacklist, queue_timeout,
                   global_page_per_domain_limit,
                   global_page_per_domain_limit_timeout,
                   domain_max_page_timeout)

    @classmethod
    def from_crawler(cls, crawler):
        '''
        Build the scheduler from a crawler by handing the crawler's
        settings to from_settings.

        @param crawler: object exposing a `settings` attribute
        @return: whatever cls.from_settings returns
        '''
        crawler_settings = crawler.settings
        return cls.from_settings(crawler_settings)

    def open(self, spider):
        '''
        Attach the scheduler to a spider.

        Hands the logger to the spider, builds the throttled queues, sets
        up Zookeeper, and creates the three redis-backed filters, each
        keyed with the spider name as prefix.

        @param spider: the spider that will use this scheduler
        '''
        self.spider = spider
        self.spider.set_logger(self.logger)
        self.create_queues()
        self.setup_zookeeper()

        prefix = self.spider.name
        self.dupefilter = RFPDupeFilter(
            self.redis_conn, prefix + ':dupefilter', self.rfp_timeout)
        self.global_page_per_domain_filter = RFGlobalPagePerDomainFilter(
            self.redis_conn, prefix + ':global_page_count_filter',
            self.global_page_per_domain_limit,
            self.global_page_per_domain_limit_timeout)
        self.domain_max_page_filter = RFDomainMaxPageFilter(
            self.redis_conn, prefix + ':domain_max_page_filter',
            self.domain_max_page_timeout)

    def close(self, reason):
        '''
        Shut the scheduler down.

        When the scheduler is not persistent, the dupefilter, both page
        filters and every queue listed in queue_keys are cleared.

        @param reason: why the spider is closing (not used here)
        '''
        self.logger.info("Closing Spider", {'spiderid': self.spider.name})
        if self.persist:
            return

        self.logger.warning("Clearing crawl queues")
        for redis_filter in (self.dupefilter,
                             self.global_page_per_domain_filter,
                             self.domain_max_page_filter):
            redis_filter.clear()
        for queue_key in self.queue_keys:
            self.queue_dict[queue_key][0].clear()

    def is_blacklisted(self, appid, crawlid):
        '''
        Look up an appid/crawlid pair in the spider's redis blacklist set.

        The set lives under '<spider name>:blacklist' and its members have
        the form '<appid>||<crawlid>'.

        @param appid: the application id of the crawl
        @param crawlid: the crawl id
        @return: the result of the redis SISMEMBER check
        '''
        member = '{appid}||{crawlid}'.format(appid=appid, crawlid=crawlid)
        return self.redis_conn.sismember(self.spider.name + ":blacklist",
                                         member)

    def enqueue_request(self, request):
        '''
        Runs a spider request through the duplicate, page limit, blacklist
        and expiration checks and, when it passes all of them, pushes it
        into the throttled queue that belongs to its domain
        @param request: the request coming from the spider
        '''
        # ---- duplicate link filter ----
        if not request.dont_filter and self.dupefilter.request_seen(request):
            self.logger.debug("Request not added back to redis")
            return

        # serialized form of this single page request
        req_dict = request_to_dict(request, self.spider)
        meta = req_dict['meta']

        # ---- page limit filters ----
        # limit on the number of pages for an individual domain
        if meta['domain_max_pages'] and \
                self.domain_max_page_filter.request_page_limit_reached(
                    request=request, spider=self.spider):
            self.logger.debug(
                "Request {0} reached domain's page limit of {1}".format(
                    request.url, meta['domain_max_pages']))
            return

        # global - cluster wide - limit on pages per domain
        if self.global_page_per_domain_limit and \
                self.global_page_per_domain_filter.request_page_limit_reached(
                    request=request, spider=self.spider):
            self.logger.debug(
                "Request {0} reached global page limit of {1}".format(
                    request.url, self.global_page_per_domain_limit))
            return

        # ---- blacklist filter ----
        if self.is_blacklisted(meta['appid'], meta['crawlid']):
            self.logger.debug(
                "Crawlid: '{id}' Appid: '{appid}' blacklisted".format(
                    appid=meta['appid'],
                    id=meta['crawlid']))
            return

        # split the url so the queue can be keyed on domain + suffix
        ex_res = self.extract(req_dict['url'])
        key = "{sid}:{dom}.{suf}:queue".format(
            sid=meta['spiderid'],
            dom=ex_res.domain,
            suf=ex_res.suffix)

        curr_time = time.time()

        domain = "{d}.{s}".format(d=ex_res.domain, s=ex_res.suffix)

        # either every request is wanted (backlog_blacklist) or only the
        # ones whose domain is not blacklisted
        domain_allowed = self.backlog_blacklist or \
            domain not in self.black_domains

        # a crawl that never expires carries expires == 0
        if domain_allowed and (meta['expires'] == 0 or
                               curr_time < meta['expires']):
            if key in self.queue_keys:
                # the queue is already known in memory
                self.queue_dict[key][0].push(req_dict, meta['priority'])
            else:
                # shoving into a new redis queue, negative b/c of sorted sets
                # this will populate ourself and other schedulers when
                # they call create_queues
                self.redis_conn.zadd(key, ujson.dumps(req_dict),
                                     -meta['priority'])
            self.logger.debug(
                "Crawlid: '{id}' Appid: '{appid}' added to queue".format(
                    appid=meta['appid'],
                    id=meta['crawlid']))
            return

        self.logger.debug(
            "Crawlid: '{id}' Appid: '{appid}' expired".format(
                appid=meta['appid'],
                id=meta['crawlid']))

    def find_item(self):
        '''
        Walks the throttled queues in random order and pops the first
        item any of them is willing to hand out. The whole walk is
        repeated up to item_retries extra times when nothing was found.
        @return: the popped item, or None when every pass came up empty
        '''
        random.shuffle(self.queue_keys)

        attempt = 0
        while attempt <= self.item_retries:
            for queue_key in self.queue_keys:
                # keys look like `type:domain:queue`; skip the queue if the
                # whole domain has been blacklisted in zookeeper
                domain = queue_key.split(':')[1]
                if domain in self.black_domains:
                    continue

                # the throttled queue only returns an item if it is allowed
                entry = self.queue_dict[queue_key]
                popped = entry[0].pop()
                if not popped:
                    continue

                # remember when this queue last produced something
                entry[1] = time.time()
                return popped

            attempt += 1

        return None

    def next_request(self):
        '''
        Produces the next request to crawl. Before looking for an item the
        queue list and the ip address are refreshed whenever their
        respective intervals have elapsed.
        @return: the request to crawl, or None if no item was found
        '''
        now = time.time()

        # refresh the redis queues every so often
        if now - self.update_time > self.update_interval:
            self.update_time = now
            self.create_queues()
            self.expire_queues()

        # refresh the ip address every so often
        if now - self.update_ip_time > self.ip_update_interval:
            self.update_ip_time = now
            self.update_ipaddress()
            self.report_self()

        item = self.find_item()
        if not item:
            return None

        self.logger.debug(u"Found url to crawl {url}" \
                .format(url=item['url']))

        if 'meta' in item:
            # a serialized request, rebuild it as is
            req = request_from_dict(item, self.spider)
        else:
            # a feed from outside, it has to be parsed manually
            req = self.request_from_feed(item)

        # copy the optional user agent from the meta onto the request
        useragent = req.meta.get('useragent')
        if useragent is not None:
            req.headers['User-Agent'] = useragent

        # cookies come either as a ready dict or as a cookie string
        cookie = req.meta.get('cookie')
        if isinstance(cookie, dict):
            req.cookies = cookie
        elif isinstance(cookie, string_types):
            req.cookies = self.parse_cookie(cookie)

        return req

    def request_from_feed(self, item):
        '''
        Builds a Request out of a raw feed item that did not come in as a
        serialized request
        @param item: the feed dict, its 'url' key is required
        @return: the Request carrying every key of the item in its meta
        '''
        url = item['url']
        try:
            req = Request(url)
        except ValueError:
            # need absolute url
            # need better url validation here
            req = Request('http://' + url)

        # defaults not in schema
        item.setdefault('curdepth', 0)
        item.setdefault("retry_times", 0)

        # mirror the complete item into the request meta
        for field, value in list(item.items()):
            req.meta[field] = value

        # cookies come either as a ready dict or as a cookie string
        cookie = item.get('cookie')
        if isinstance(cookie, dict):
            req.cookies = cookie
        elif isinstance(cookie, string_types):
            req.cookies = self.parse_cookie(cookie)
        return req

    def parse_cookie(self, string):
        '''
        Parses a cookie string like returned in a Set-Cookie header
        @param string: The cookie string, e.g. "a=b; c=d"
        @return: the cookie dict mapping each name to its value; when a
            name occurs more than once the last value wins
        '''
        # raw string: `\;` and `\s` are regex escapes, not python string
        # escapes, and a non-raw literal triggers invalid escape warnings
        pairs = re.findall(r'([^=]+)=([^\;]+);?\s?', string)
        # findall yields (name, value) tuples, exactly what dict() accepts
        return dict(pairs)

    def has_pending_requests(self):
        '''
        Always reports that nothing is pending; answering True here
        sometimes makes scrapy hang.
        @return: False, unconditionally
        '''
        return False
コード例 #8
0
class DistributedScheduler(object):
    """
    Scrapy request scheduler that utilizes Redis Throttled Priority Queues
    to moderate different domain scrape requests within a distributed scrapy
    cluster
    """

    redis_conn = None  # the redis connection
    queue_dict = None  # the dict of throttled queues
    spider = None  # the spider using this scheduler
    queue_keys = None  # the list of current queues
    queue_class = None  # the class to use for the queue
    dupefilter = None  # the redis dupefilter
    update_time = 0  # the last time the queues were updated
    update_ip_time = 0  # the last time the ip was updated
    update_interval = 0  # how often to update the queues
    extract = None  # the tld extractor
    hits = 0  # default number of hits for a queue
    window = 0  # default window to calculate number of hits
    my_ip = None  # the ip address of the scheduler (if needed)
    old_ip = None  # the old ip for logging
    ip_update_interval = 0  # the interval to update the ip address
    add_type = None  # add spider type to redis throttle queue key
    add_ip = None  # add spider public ip to redis throttle queue key
    item_retries = 0  # the number of extra tries to get an item
    my_uuid = None  # the generated UUID for the particular scrapy process
    # Zookeeper Dynamic Config Vars
    domain_config = {}  # The list of domains and their configs
    my_id = None  # The id used to read the throttle config
    config_flag = False  # Flag to reload queues if settings are wiped too
    assign_path = None  # The base assigned configuration path to read
    zoo_client = None  # The KazooClient to manage the config
    my_assignment = None  # Zookeeper path to read actual yml config

    def __init__(
        self,
        server,
        persist,
        update_int,
        timeout,
        retries,
        logger,
        hits,
        window,
        mod,
        ip_refresh,
        add_type,
        add_ip,
        ip_regex,
    ):
        """
        Initialize the scheduler
        @param server: the redis connection
        @param persist: if falsy, the dupefilter and queues are cleared on
            close
        @param update_int: seconds between refreshes of the queue list
        @param timeout: the timeout handed to the dupefilter
        @param retries: the number of extra tries to get an item
        @param logger: the logger to use
        @param hits: default number of hits for a queue
        @param window: default window to calculate number of hits
        @param mod: the moderation flag handed to every throttled queue
        @param ip_refresh: seconds between refreshes of the ip address
        @param add_type: add the spider type to the throttle key
        @param add_ip: add the public ip to the throttle key
        @param ip_regex: pattern used to pull the ip out of the response
            of the public ip service
        """
        self.redis_conn = server
        self.persist = persist
        self.queue_dict = {}
        # per instance dict so nothing is ever shared through the class
        self.domain_config = {}
        self.update_interval = update_int
        self.hits = hits
        self.window = window
        self.moderated = mod
        self.rfp_timeout = timeout
        self.ip_update_interval = ip_refresh
        self.add_type = add_type
        self.add_ip = add_ip
        # this used to be stored as `item_retires`, which left find_item
        # reading the class default of 0 and ignoring the setting
        self.item_retries = retries
        self.logger = logger
        self.ip_regex = re.compile(ip_regex)

        # set up tldextract
        self.extract = tldextract.TLDExtract()

        self.update_ipaddress()

        # if we need better uuid's mod this line
        self.my_uuid = str(uuid.uuid4()).split("-")[4]

    def setup_zookeeper(self):
        """
        Creates the Zookeeper watcher for the throttle config of this
        scheduler. The process exits with status 1 when the connection
        times out or the watcher cannot be pinged.
        """
        self.assign_path = settings.get("ZOOKEEPER_ASSIGN_PATH", "")
        self.my_id = settings.get("ZOOKEEPER_ID", "all")
        self.logger.debug("Trying to establish Zookeeper connection")
        try:
            self.zoo_watcher = ZookeeperWatcher(
                hosts=settings.get("ZOOKEEPER_HOSTS"),
                filepath=self.assign_path + self.my_id,
                config_handler=self.change_config,
                error_handler=self.error_config,
                pointer=False,
                ensure=True,
                valid_init=True,
            )
        except KazooTimeoutError:
            self.logger.error("Could not connect to Zookeeper")
            sys.exit(1)

        if self.zoo_watcher.ping():
            self.logger.debug("Successfully set up Zookeeper connection")
        else:
            self.logger.error("Could not ping Zookeeper")
            sys.exit(1)

    def change_config(self, config_string):
        """
        Config handler given to the Zookeeper watcher. A non empty config
        is parsed as yaml and applied, an empty or missing one reverts the
        queues to the defaults. The queues are refreshed in both cases.
        @param config_string: the raw yaml config, may be None or empty
        """
        if config_string:
            loaded_config = yaml.safe_load(config_string)
            self.logger.info("Zookeeper config changed", extra=loaded_config)
            self.load_domain_config(loaded_config)
            self.update_domain_queues()
        else:
            self.error_config("Zookeeper config wiped")

        self.create_queues()

    def load_domain_config(self, loaded_config):
        """
        Loads the domain_config and sets up queue_dict
        @param loaded_config: the yaml loaded config dict from zookeeper
        """
        self.domain_config = {}
        # vetting process to ensure correct configs
        if loaded_config and "domains" in loaded_config:
            for domain in loaded_config["domains"]:
                item = loaded_config["domains"][domain]
                # only entries that define both window and hits are valid
                if "window" in item and "hits" in item:
                    self.logger.debug("Added domain {dom} to loaded config".format(dom=domain))
                    self.domain_config[domain] = item

        # tells create_queues to rebuild the queues with the new config
        self.config_flag = True

    def update_domain_queues(self):
        """
        Check to update existing queues already in memory
        new queues are created elsewhere
        """
        for key in self.domain_config:
            final_key = "{name}:{domain}:queue".format(name=self.spider.name, domain=key)
            # we already have a throttled queue for this domain, update it to new settings
            if final_key in self.queue_dict:
                conf = self.domain_config[key]
                self.queue_dict[final_key].window = float(conf["window"])
                self.logger.debug("Updated queue {q} with new config".format(q=final_key))
                # if scale is applied, scale back; otherwise use updated hits
                if "scale" in conf:
                    # round to int
                    hits = int(conf["hits"] * self.fit_scale(conf["scale"]))
                    self.queue_dict[final_key].limit = float(hits)
                else:
                    self.queue_dict[final_key].limit = float(conf["hits"])

    def error_config(self, message):
        """
        Error handler given to the Zookeeper watcher. Reverts every
        configured domain queue that exists in memory to the default
        window and hits and then drops the domain config.
        @param message: the reason the config was lost
        """
        extras = {}
        extras["message"] = message
        extras["revert_window"] = self.window
        extras["revert_hits"] = self.hits
        extras["spiderid"] = self.spider.name
        self.logger.info("Lost config from Zookeeper", extra=extras)
        # lost connection to zookeeper, reverting back to defaults
        for key in self.domain_config:
            final_key = "{name}:{domain}:queue".format(name=self.spider.name, domain=key)
            # a configured domain does not necessarily have a queue in
            # memory yet, indexing blindly would raise a KeyError
            if final_key in self.queue_dict:
                self.queue_dict[final_key].window = self.window
                self.queue_dict[final_key].limit = self.hits

        self.domain_config = {}

    def fit_scale(self, scale):
        """
        Clamps a scale into the closed range [0, 1]
        @param scale: the configured scale
        @return: a scale >= 0 and <= 1
        """
        if scale >= 1:
            return 1.0
        elif scale <= 0:
            return 0.0
        else:
            return scale

    def create_queues(self):
        """
        Updates the in memory list of the redis queues
        Creates new throttled queue instances if it does not have them
        """
        # new config could have loaded between scrapes
        new_conf = self.check_config()

        self.queue_keys = self.redis_conn.keys(self.spider.name + ":*:queue")

        # the part of the throttle key that depends on the type and ip
        # bools is the same for every queue, so build it once
        throttle_prefix = ""
        if self.add_type:
            throttle_prefix = self.spider.name + ":"
        if self.add_ip:
            throttle_prefix = throttle_prefix + self.my_ip + ":"

        for key in self.queue_keys:
            # only unknown queues are built, unless a new config forces
            # every queue to be rebuilt
            if key in self.queue_dict and not new_conf:
                continue

            # add the tld from the key `type:tld:queue`
            the_domain = key.split(":")[1]
            throttle_key = throttle_prefix + the_domain

            self.logger.debug("Added new Throttled Queue {q}".format(q=key))
            q = RedisPriorityQueue(self.redis_conn, key)

            if the_domain not in self.domain_config:
                # use default window and hits
                window = self.window
                hits = self.hits
            else:
                # use custom window and hits
                window = self.domain_config[the_domain]["window"]
                hits = self.domain_config[the_domain]["hits"]

                # adjust the crawl rate based on the scale if exists
                if "scale" in self.domain_config[the_domain]:
                    hits = int(hits * self.fit_scale(self.domain_config[the_domain]["scale"]))

            self.queue_dict[key] = RedisThrottledQueue(
                self.redis_conn, q, window, hits, self.moderated, throttle_key, throttle_key
            )

    def check_config(self):
        """
        Controls configuration for the scheduler. Reading the flag also
        resets it, so a new configuration is reported exactly once.
        @return: True if there is a new configuration
        """
        if self.config_flag:
            self.config_flag = False
            return True

        return False

    def update_ipaddress(self):
        """
        Updates the scheduler so it knows its own ip address. Falls back
        to 127.0.0.1 when the public ip service cannot be reached or its
        response holds no match for the ip regex.
        """
        # assign local ip in case of exception
        self.old_ip = self.my_ip
        self.my_ip = "127.0.0.1"
        try:
            obj = urllib2.urlopen(settings.get("PUBLIC_IP_URL", "http://ip.42.pl/raw"))
            try:
                results = self.ip_regex.findall(obj.read())
            finally:
                # always release the response, also when reading fails
                obj.close()
            if len(results) > 0:
                self.my_ip = results[0]
            else:
                raise IOError("Could not get valid IP Address")
            self.logger.debug("Current public ip: {ip}".format(ip=self.my_ip))
        except IOError:
            self.logger.error("Could not reach out to get public ip")

        if self.old_ip != self.my_ip:
            self.logger.info("Changed Public IP: {old} -> {new}".format(old=self.old_ip, new=self.my_ip))

    def report_self(self):
        """
        Reports the crawler uuid to redis. The key expires after twice
        the ip update interval.
        """
        self.logger.debug("Reporting self id", extra={"uuid": self.my_uuid})
        key = "stats:crawler:{m}:{s}:{u}".format(m=socket.gethostname(), s=self.spider.name, u=self.my_uuid)
        self.redis_conn.set(key, time.time())
        self.redis_conn.expire(key, self.ip_update_interval * 2)

    @classmethod
    def from_settings(cls, settings):
        """
        Builds a scheduler from the given settings
        @param settings: the settings object, read through its get()
        @return: the new scheduler instance
        """
        server = redis.Redis(host=settings.get("REDIS_HOST"), port=settings.get("REDIS_PORT"))
        persist = settings.get("SCHEDULER_PERSIST", True)
        up_int = settings.get("SCHEDULER_QUEUE_REFRESH", 10)
        hits = settings.get("QUEUE_HITS", 10)
        window = settings.get("QUEUE_WINDOW", 60)
        mod = settings.get("QUEUE_MODERATED", False)
        timeout = settings.get("DUPEFILTER_TIMEOUT", 600)
        ip_refresh = settings.get("SCHEDULER_IP_REFRESH", 60)
        add_type = settings.get("SCHEDULER_TYPE_ENABLED", False)
        add_ip = settings.get("SCHEDULER_IP_ENABLED", False)
        # the setting name used to be misspelled; prefer the correct
        # spelling and keep the legacy one as a fallback
        retries = settings.get("SCHEDULER_ITEM_RETRIES", settings.get("SCHEUDLER_ITEM_RETRIES", 3))
        ip_regex = settings.get("IP_ADDR_REGEX", ".*")

        my_level = settings.get("SC_LOG_LEVEL", "INFO")
        my_name = settings.get("SC_LOGGER_NAME", "sc-logger")
        my_output = settings.get("SC_LOG_STDOUT", True)
        my_json = settings.get("SC_LOG_JSON", False)
        my_dir = settings.get("SC_LOG_DIR", "logs")
        my_bytes = settings.get("SC_LOG_MAX_BYTES", "10MB")
        my_file = settings.get("SC_LOG_FILE", "main.log")
        my_backups = settings.get("SC_LOG_BACKUPS", 5)

        logger = LogFactory.get_instance(
            json=my_json,
            name=my_name,
            stdout=my_output,
            level=my_level,
            dir=my_dir,
            file=my_file,
            bytes=my_bytes,
            backups=my_backups,
        )

        return cls(
            server, persist, up_int, timeout, retries, logger, hits, window, mod, ip_refresh, add_type, add_ip, ip_regex
        )

    @classmethod
    def from_crawler(cls, crawler):
        """
        Builds a scheduler from the settings of the given crawler
        @param crawler: the crawler whose settings are used
        @return: the new scheduler instance
        """
        return cls.from_settings(crawler.settings)

    def open(self, spider):
        """
        Binds the scheduler to the spider, hands it the logger and the
        redis connection and prepares the queues, the Zookeeper watcher
        and the dupefilter
        @param spider: the spider using this scheduler
        """
        self.spider = spider
        self.spider.set_logger(self.logger)
        self.spider.set_redis(self.redis_conn)
        self.spider.setup_stats()
        self.create_queues()
        self.setup_zookeeper()
        self.dupefilter = RFPDupeFilter(self.redis_conn, self.spider.name + ":dupefilter", self.rfp_timeout)

    def close(self, reason):
        """
        Logs the shutdown and, unless the scheduler is persistent, clears
        the dupefilter and every known queue
        @param reason: the close reason handed in by the caller (not used)
        """
        self.logger.info("Closing Spider", {"spiderid": self.spider.name})
        if not self.persist:
            self.logger.warning("Clearing crawl queues")
            self.dupefilter.clear()
            for key in self.queue_keys:
                self.queue_dict[key].clear()

    def is_blacklisted(self, appid, crawlid):
        """
        Checks the redis blacklist for crawls that should not be propagated
        either from expiring or stopped
        @param appid: the application id of the crawl
        @param crawlid: the crawl id of the crawl
        @return: True if the appid crawlid combo is blacklisted
        """
        key_check = "{appid}||{crawlid}".format(appid=appid, crawlid=crawlid)
        redis_key = self.spider.name + ":blacklist"
        return self.redis_conn.sismember(redis_key, key_check)

    def enqueue_request(self, request):
        """
        Pushes a request from the spider into the proper throttled queue,
        unless it is a duplicate, blacklisted or expired
        @param request: the request coming from the spider
        """
        if not request.dont_filter and self.dupefilter.request_seen(request):
            self.logger.debug("Request not added back to redis")
            return
        req_dict = self.request_to_dict(request)

        if not self.is_blacklisted(req_dict["meta"]["appid"], req_dict["meta"]["crawlid"]):
            # grab the tld of the request
            ex_res = self.extract(req_dict["url"])
            key = "{sid}:{dom}.{suf}:queue".format(
                sid=req_dict["meta"]["spiderid"], dom=ex_res.domain, suf=ex_res.suffix
            )

            curr_time = time.time()

            # insert if crawl never expires (0) or time < expires
            if req_dict["meta"]["expires"] == 0 or curr_time < req_dict["meta"]["expires"]:
                # we may already have the queue in memory
                if key in self.queue_keys:
                    self.queue_dict[key].push(req_dict, req_dict["meta"]["priority"])
                else:
                    # shoving into a new redis queue, negative b/c of sorted sets
                    # this will populate ourself and other schedulers when
                    # they call create_queues
                    self.redis_conn.zadd(key, pickle.dumps(req_dict, protocol=-1), -req_dict["meta"]["priority"])
                self.logger.debug(
                    "Crawlid: '{id}' Appid: '{appid}' added to queue".format(
                        appid=req_dict["meta"]["appid"], id=req_dict["meta"]["crawlid"]
                    )
                )
            else:
                self.logger.debug(
                    "Crawlid: '{id}' Appid: '{appid}' expired".format(
                        appid=req_dict["meta"]["appid"], id=req_dict["meta"]["crawlid"]
                    )
                )
        else:
            self.logger.debug(
                "Crawlid: '{id}' Appid: '{appid}' blacklisted".format(
                    appid=req_dict["meta"]["appid"], id=req_dict["meta"]["crawlid"]
                )
            )

    def request_to_dict(self, request):
        """
        Convert Request object to a dict.
        modified from scrapy.utils.reqser
        @param request: the request to convert
        @return: the dict form of the request
        """
        # NOTE(review): `url.decode` and `func_name` are Python 2 idioms,
        # this method needs porting before it can run on Python 3
        req_dict = {
            # urls should be safe (safe_string_url)
            "url": request.url.decode("ascii"),
            "method": request.method,
            "headers": dict(request.headers),
            "body": request.body,
            "cookies": request.cookies,
            "meta": request.meta,
            "_encoding": request._encoding,
            "priority": request.priority,
            "dont_filter": request.dont_filter,
            #  callback/errback are assumed to be a bound instance of the spider
            "callback": None if request.callback is None else request.callback.func_name,
            "errback": None if request.errback is None else request.errback.func_name,
        }
        return req_dict

    def find_item(self):
        """
        Finds an item from the throttled queues. Every queue is tried in
        random order and the whole pass is repeated up to item_retries
        extra times, sleeping a random fraction of a second after each
        empty pass.
        @return: the popped item, or None when every pass came up empty
        """
        random.shuffle(self.queue_keys)
        count = 0

        while count <= self.item_retries:
            for key in self.queue_keys:
                # the throttled queue only returns an item if it is allowed
                item = self.queue_dict[key].pop()

                if item:
                    return item
            # we want the spiders to get slightly out of sync
            # with each other for better performance
            time.sleep(random.random())
            count = count + 1

        return None

    def next_request(self):
        """
        Logic to handle getting a new url request, from a bunch of
        different queues
        @return: the request to crawl, or None if no item was found
        """
        t = time.time()
        # update the redis queues every so often
        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()

        # update the ip address every so often
        if t - self.update_ip_time > self.ip_update_interval:
            self.update_ip_time = t
            self.update_ipaddress()
            self.report_self()

        item = self.find_item()
        if not item:
            return None

        self.logger.debug("Found url to crawl {url}".format(url=item["url"]))
        try:
            req = Request(item["url"])
        except ValueError:
            # need absolute url
            # need better url validation here
            req = Request("http://" + item["url"])

        # a serialized request keeps its crawl fields under 'meta'
        if "meta" in item:
            item = item["meta"]

        # defaults not in schema
        if "curdepth" not in item:
            item["curdepth"] = 0
        if "retry_times" not in item:
            item["retry_times"] = 0

        # mirror the complete item into the request meta
        for key in list(item.keys()):
            req.meta[key] = item[key]

        # extra check to add items to request
        if "useragent" in item and item["useragent"] is not None:
            req.headers["User-Agent"] = item["useragent"]
        if "cookie" in item and item["cookie"] is not None:
            if isinstance(item["cookie"], dict):
                req.cookies = item["cookie"]
            elif isinstance(item["cookie"], basestring):
                req.cookies = self.parse_cookie(item["cookie"])

        return req

    def parse_cookie(self, string):
        """
        Parses a cookie string like returned in a Set-Cookie header
        @param string: The cookie string
        @return: the cookie dict mapping each name to its value; when a
            name occurs more than once the last value wins
        """
        # raw string: `\;` and `\s` are regex escapes, not python string
        # escapes, and a non-raw literal triggers invalid escape warnings
        results = re.findall(r"([^=]+)=([^\;]+);?\s?", string)
        return dict(results)

    def has_pending_requests(self):
        """
        We never want to say we have pending requests
        If this returns True scrapy sometimes hangs.
        @return: False, unconditionally
        """
        return False