class TestZookeeperWatcher(TestCase):
    '''
    Integration tests for ZookeeperWatcher that talk to a real Zookeeper
    ensemble reachable at the `hosts` string given to the constructor
    '''

    def __init__(self, name, hosts):
        '''
        @param name: the test method name handed to unittest's TestCase
        @param hosts: the Zookeeper hosts string used by every connection
        '''
        self.hosts = hosts
        self.file_path = '/test_path'
        self.file_data = 'test_data'
        self.pointer_path = '/test_pointer'
        # the pointer node stores the path of the data node
        self.pointer_data = self.file_path
        TestCase.__init__(self, name)

    def setUp(self):
        '''
        Writes the data node and the pointer node, then builds the watcher
        under test against the data node
        '''
        self.zoo_client = KazooClient(hosts=self.hosts)
        self.zoo_client.start()

        # prepare data
        self.zoo_client.ensure_path(self.file_path)
        self.zoo_client.set(self.file_path, self.file_data.encode('utf-8'))
        self.zoo_client.ensure_path(self.pointer_path)
        self.zoo_client.set(self.pointer_path,
                            self.pointer_data.encode('utf-8'))

        self.zoo_watcher = ZookeeperWatcher(hosts=self.hosts,
                                            filepath=self.file_path,
                                            pointer=False,
                                            ensure=False,
                                            valid_init=True)

    def test_ping(self):
        self.assertTrue(self.zoo_watcher.ping())

    def test_get_file_contents(self):
        pointer_zoo_watcher = ZookeeperWatcher(hosts=self.hosts,
                                               filepath=self.pointer_path,
                                               pointer=True,
                                               ensure=False,
                                               valid_init=True)
        # always close the extra watcher, even when an assertion fails,
        # so a failing test does not leave a connection open
        try:
            self.assertEqual(self.zoo_watcher.get_file_contents(),
                             self.file_data)
            self.assertEqual(pointer_zoo_watcher.get_file_contents(),
                             self.file_data)
            self.assertEqual(pointer_zoo_watcher.get_file_contents(True),
                             self.pointer_data)
        finally:
            pointer_zoo_watcher.close()

    def tearDown(self):
        '''
        Closes the watcher, removes both test nodes and shuts the client down
        '''
        try:
            self.zoo_watcher.close()

            self.zoo_client.ensure_path(self.file_path)
            self.zoo_client.delete(self.file_path)
            self.zoo_client.ensure_path(self.pointer_path)
            self.zoo_client.delete(self.pointer_path)
        finally:
            # stop the client even if the cleanup above raised
            self.zoo_client.stop()
            self.zoo_client.close()
class DistributedScheduler(object):
    '''
    Scrapy request scheduler that utilizes Redis Throttled Priority Queues
    to moderate different domain scrape requests within a distributed scrapy
    cluster
    '''
    redis_conn = None  # the redis connection
    queue_dict = None  # the dict of throttled queues
    spider = None  # the spider using this scheduler
    queue_keys = None  # the list of current queues
    queue_class = None  # the class to use for the queue
    dupefilter = None  # the redis dupefilter
    update_time = 0  # the last time the queues were updated
    update_ip_time = 0  # the last time the ip was updated
    update_interval = 0  # how often to update the queues
    extract = None  # the tld extractor
    hits = 0  # default number of hits for a queue
    window = 0  # default window to calculate number of hits
    my_ip = None  # the ip address of the scheduler (if needed)
    old_ip = None  # the old ip for logging
    ip_update_interval = 0  # the interval to update the ip address
    add_type = None  # add spider type to redis throttle queue key
    add_ip = None  # add spider public ip to redis throttle queue key
    item_retries = 0  # the number of extra tries to get an item
    my_uuid = None  # the generated UUID for the particular scrapy process

    # Zookeeper Dynamic Config Vars
    domain_config = {}  # The list of domains and their configs
    my_id = None  # The id used to read the throttle config
    config_flag = False  # Flag to reload queues if settings are wiped too
    assign_path = None  # The base assigned configuration path to read
    zoo_client = None  # The KazooClient to manage the config
    my_assignment = None  # Zookeeper path to read actual yml config
    black_domains = []  # the domains to ignore thanks to zookeeper config

    def __init__(self, server, persist, update_int, timeout, retries, logger,
                 hits, window, mod, ip_refresh, add_type, add_ip, ip_regex,
                 backlog_blacklist, queue_timeout):
        '''
        Initialize the scheduler

        @param server: the redis connection
        @param persist: when False the queues are cleared on close()
        @param update_int: seconds between refreshes of the queue list
        @param timeout: timeout handed to the dupefilter
        @param retries: the number of extra passes find_item() makes
        @param logger: the logger instance to use
        @param hits: default number of hits for a queue
        @param window: default window to calculate number of hits
        @param mod: the moderation flag handed to every throttled queue
        @param ip_refresh: seconds between public ip refreshes
        @param add_type: add spider type to the throttle key
        @param add_ip: add the public ip to the throttle key
        @param ip_regex: regex used to pull the ip out of the ip service
            response
        @param backlog_blacklist: when True requests for blacklisted domains
            are still enqueued
        @param queue_timeout: seconds an unused in-memory queue is kept
        '''
        self.redis_conn = server
        self.persist = persist
        self.queue_dict = {}
        self.update_interval = update_int
        self.hits = hits
        self.window = window
        self.moderated = mod
        self.rfp_timeout = timeout
        self.ip_update_interval = ip_refresh
        self.add_type = add_type
        self.add_ip = add_ip
        # this was previously stored under the misspelt name `item_retires`,
        # which left find_item() reading the class default of 0
        self.item_retries = retries
        self.logger = logger
        self.ip_regex = re.compile(ip_regex)
        self.backlog_blacklist = backlog_blacklist
        self.queue_timeout = queue_timeout

        # per instance containers so nothing is shared through the class
        self.domain_config = {}
        self.black_domains = []

        # set up tldextract
        self.extract = tldextract.TLDExtract()

        self.update_ipaddress()

        # if we need better uuid's mod this line
        self.my_uuid = str(uuid.uuid4()).split('-')[4]

    def setup_zookeeper(self):
        '''
        Creates the ZookeeperWatcher for this scheduler's config file.
        Exits the process if Zookeeper cannot be reached or pinged
        '''
        self.assign_path = settings.get('ZOOKEEPER_ASSIGN_PATH', "")
        self.my_id = settings.get('ZOOKEEPER_ID', 'all')
        self.logger.debug("Trying to establish Zookeeper connection")
        try:
            self.zoo_watcher = ZookeeperWatcher(
                hosts=settings.get('ZOOKEEPER_HOSTS'),
                filepath=self.assign_path + self.my_id,
                config_handler=self.change_config,
                error_handler=self.error_config,
                pointer=False, ensure=True, valid_init=True)
        except KazooTimeoutError:
            self.logger.error("Could not connect to Zookeeper")
            sys.exit(1)

        if self.zoo_watcher.ping():
            self.logger.debug("Successfully set up Zookeeper connection")
        else:
            self.logger.error("Could not ping Zookeeper")
            sys.exit(1)

    def change_config(self, config_string):
        '''
        Handler given to the watcher as config_handler

        @param config_string: the yaml config text, None or empty when the
            config has been wiped
        '''
        if config_string:
            loaded_config = yaml.safe_load(config_string)
            self.logger.info("Zookeeper config changed", extra=loaded_config)
            self.load_domain_config(loaded_config)
            self.update_domain_queues()
        else:
            self.error_config("Zookeeper config wiped")

        self.create_queues()

    def load_domain_config(self, loaded_config):
        '''
        Loads the domain_config and sets up queue_dict
        @param loaded_config: the yaml loaded config dict from zookeeper
        '''
        self.domain_config = {}
        # start from an empty blacklist so domains dropped from the
        # zookeeper config do not stay blacklisted forever
        self.black_domains = []
        # vetting process to ensure correct configs
        if loaded_config:
            if 'domains' in loaded_config:
                for domain in loaded_config['domains']:
                    item = loaded_config['domains'][domain]
                    # check valid
                    if 'window' in item and 'hits' in item:
                        self.logger.debug("Added domain {dom} to loaded config"
                                          .format(dom=domain))
                        self.domain_config[domain] = item
            if 'blacklist' in loaded_config:
                # an empty `blacklist:` yaml key loads as None
                self.black_domains = loaded_config['blacklist'] or []

        self.config_flag = True

    def update_domain_queues(self):
        '''
        Check to update existing queues already in memory
        new queues are created elsewhere
        '''
        for key in self.domain_config:
            final_key = "{name}:{domain}:queue".format(
                name=self.spider.name,
                domain=key)
            # we already have a throttled queue for this domain, update it to new settings
            if final_key in self.queue_dict:
                conf = self.domain_config[key]
                throttled = self.queue_dict[final_key][0]
                throttled.window = float(conf['window'])
                self.logger.debug("Updated queue {q} with new config"
                                  .format(q=final_key))
                # if scale is applied, scale back; otherwise use updated hits
                if 'scale' in conf:
                    # round to int
                    hits = int(conf['hits'] * self.fit_scale(conf['scale']))
                    throttled.limit = float(hits)
                else:
                    throttled.limit = float(conf['hits'])

    def error_config(self, message):
        '''
        Handler given to the watcher as error_handler; reverts every
        configured domain queue to the default window and hits

        @param message: the reason the config was lost
        '''
        extras = {}
        extras['message'] = message
        extras['revert_window'] = self.window
        extras['revert_hits'] = self.hits
        extras['spiderid'] = self.spider.name
        self.logger.info("Lost config from Zookeeper", extra=extras)
        # lost connection to zookeeper, reverting back to defaults
        for key in self.domain_config:
            final_key = "{name}:{domain}:queue".format(
                name=self.spider.name,
                domain=key)
            # a configured domain may not have an in memory queue yet
            if final_key in self.queue_dict:
                self.queue_dict[final_key][0].window = self.window
                self.queue_dict[final_key][0].limit = self.hits

        self.domain_config = {}

    def fit_scale(self, scale):
        '''
        @return: a scale >= 0 and <= 1
        '''
        if scale >= 1:
            return 1.0
        elif scale <= 0:
            return 0.0
        else:
            return scale

    def create_queues(self):
        '''
        Updates the in memory list of the redis queues
        Creates new throttled queue instances if it does not have them
        '''
        # new config could have loaded between scrapes
        newConf = self.check_config()

        self.queue_keys = self.redis_conn.keys(self.spider.name + ":*:queue")

        for key in self.queue_keys:
            # build final queue key, depending on type and ip bools
            throttle_key = ""

            if self.add_type:
                throttle_key = self.spider.name + ":"
            if self.add_ip:
                throttle_key = throttle_key + self.my_ip + ":"

            # add the tld from the key `type:tld:queue`
            the_domain = re.split(':', key)[1]
            throttle_key = throttle_key + the_domain

            if key not in self.queue_dict or newConf:
                self.logger.debug("Added new Throttled Queue {q}"
                                  .format(q=key))
                q = RedisPriorityQueue(self.redis_conn, key, encoding=ujson)

                if the_domain not in self.domain_config:
                    # use default window and hits
                    window = self.window
                    hits = self.hits
                else:
                    # use custom window and hits
                    window = self.domain_config[the_domain]['window']
                    hits = self.domain_config[the_domain]['hits']

                    # adjust the crawl rate based on the scale if exists
                    if 'scale' in self.domain_config[the_domain]:
                        hits = int(hits * self.fit_scale(
                            self.domain_config[the_domain]['scale']))

                # every entry is a two item list, use [0] to get the
                # queue object and [1] to get the last used time
                self.queue_dict[key] = [
                    RedisThrottledQueue(self.redis_conn, q, window, hits,
                                        self.moderated, throttle_key,
                                        throttle_key, True),
                    time.time()]

    def expire_queues(self):
        '''
        Expires old queue_dict keys that have not been used in a long time.
        Prevents slow memory build up when crawling lots of different domains
        '''
        curr_time = time.time()
        # iterate over a copy of the keys since entries are deleted
        for key in list(self.queue_dict):
            diff = curr_time - self.queue_dict[key][1]
            if diff > self.queue_timeout:
                self.logger.debug("Expiring domain queue key " + key)
                del self.queue_dict[key]
                if key in self.queue_keys:
                    self.queue_keys.remove(key)

    def check_config(self):
        '''
        Controls configuration for the scheduler

        @return: True if there is a new configuration
        '''
        if self.config_flag:
            self.config_flag = False
            return True

        return False

    def update_ipaddress(self):
        '''
        Updates the scheduler so it knows its own ip address
        '''
        # assign local ip in case of exception
        self.old_ip = self.my_ip
        self.my_ip = '127.0.0.1'
        try:
            obj = urllib.request.urlopen(settings.get('PUBLIC_IP_URL',
                                         'http://ip.42.pl/raw'))
            try:
                raw = obj.read()
            finally:
                # release the connection even when the read fails
                obj.close()
            # the response body is bytes on Python 3 while the regex is
            # compiled from a str pattern, so decode before matching
            if isinstance(raw, bytes):
                raw = raw.decode('utf-8', 'ignore')
            results = self.ip_regex.findall(raw)
            if len(results) > 0:
                self.my_ip = results[0]
            else:
                raise IOError("Could not get valid IP Address")
            self.logger.debug("Current public ip: {ip}".format(ip=self.my_ip))
        except IOError:
            # best effort, the local ip assigned above stays in place
            self.logger.error("Could not reach out to get public ip")

        if self.old_ip != self.my_ip:
            self.logger.info("Changed Public IP: {old} -> {new}".format(
                             old=self.old_ip, new=self.my_ip))

    def report_self(self):
        '''
        Reports the crawler uuid to redis
        '''
        self.logger.debug("Reporting self id", extra={'uuid':self.my_uuid})
        key = "stats:crawler:{m}:{s}:{u}".format(
            m=socket.gethostname(),
            s=self.spider.name,
            u=self.my_uuid)
        self.redis_conn.set(key, time.time())
        self.redis_conn.expire(key, self.ip_update_interval * 2)

    @classmethod
    def from_settings(cls, settings):
        '''
        Builds a scheduler from a settings object

        @param settings: object exposing get(name, default)
        @return: a new scheduler instance
        '''
        server = redis.Redis(host=settings.get('REDIS_HOST'),
                             port=settings.get('REDIS_PORT'),
                             db=settings.get('REDIS_DB'))
        persist = settings.get('SCHEDULER_PERSIST', True)
        up_int = settings.get('SCHEDULER_QUEUE_REFRESH', 10)
        hits = settings.get('QUEUE_HITS', 10)
        window = settings.get('QUEUE_WINDOW', 60)
        mod = settings.get('QUEUE_MODERATED', False)
        timeout = settings.get('DUPEFILTER_TIMEOUT', 600)
        ip_refresh = settings.get('SCHEDULER_IP_REFRESH', 60)
        add_type = settings.get('SCHEDULER_TYPE_ENABLED', False)
        add_ip = settings.get('SCHEDULER_IP_ENABLED', False)
        # prefer the correctly spelt key, fall back to the historical
        # misspelling so existing settings files keep working
        retries = settings.get('SCHEDULER_ITEM_RETRIES',
                               settings.get('SCHEUDLER_ITEM_RETRIES', 3))
        ip_regex = settings.get('IP_ADDR_REGEX', '.*')
        backlog_blacklist = settings.get('SCHEDULER_BACKLOG_BLACKLIST', True)
        queue_timeout = settings.get('SCHEDULER_QUEUE_TIMEOUT', 3600)

        my_level = settings.get('SC_LOG_LEVEL', 'INFO')
        my_name = settings.get('SC_LOGGER_NAME', 'sc-logger')
        my_output = settings.get('SC_LOG_STDOUT', True)
        my_json = settings.get('SC_LOG_JSON', False)
        my_dir = settings.get('SC_LOG_DIR', 'logs')
        my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
        my_file = settings.get('SC_LOG_FILE', 'main.log')
        my_backups = settings.get('SC_LOG_BACKUPS', 5)

        logger = LogFactory.get_instance(json=my_json,
                                         name=my_name,
                                         stdout=my_output,
                                         level=my_level,
                                         dir=my_dir,
                                         file=my_file,
                                         bytes=my_bytes,
                                         backups=my_backups)

        return cls(server, persist, up_int, timeout, retries, logger, hits,
                   window, mod, ip_refresh, add_type, add_ip, ip_regex,
                   backlog_blacklist, queue_timeout)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def open(self, spider):
        '''
        Binds the spider, then sets up the queues, the Zookeeper watcher
        and the dupefilter
        '''
        self.spider = spider
        self.spider.set_logger(self.logger)
        self.create_queues()
        self.setup_zookeeper()
        self.dupefilter = RFPDupeFilter(self.redis_conn,
                                        self.spider.name + ':dupefilter',
                                        self.rfp_timeout)

    def close(self, reason):
        '''
        Clears the dupefilter and every known queue unless persist is set
        '''
        self.logger.info("Closing Spider", {'spiderid':self.spider.name})
        if not self.persist:
            self.logger.warning("Clearing crawl queues")
            self.dupefilter.clear()
            for key in self.queue_keys:
                self.queue_dict[key][0].clear()

    def is_blacklisted(self, appid, crawlid):
        '''
        Checks the redis blacklist for crawls that should not be propagated
        either from expiring or stopped
        @return: True if the appid crawlid combo is blacklisted
        '''
        key_check = '{appid}||{crawlid}'.format(appid=appid,
                                                crawlid=crawlid)
        redis_key = self.spider.name + ":blacklist"
        return self.redis_conn.sismember(redis_key, key_check)

    def enqueue_request(self, request):
        '''
        Pushes a request from the spider into the proper throttled queue
        '''
        if not request.dont_filter and self.dupefilter.request_seen(request):
            self.logger.debug("Request not added back to redis")
            return
        req_dict = self.request_to_dict(request)
        meta = req_dict['meta']

        if not self.is_blacklisted(meta['appid'], meta['crawlid']):
            # grab the tld of the request
            ex_res = self.extract(req_dict['url'])
            key = "{sid}:{dom}.{suf}:queue".format(
                sid=meta['spiderid'],
                dom=ex_res.domain,
                suf=ex_res.suffix)

            curr_time = time.time()

            domain = "{d}.{s}".format(d=ex_res.domain, s=ex_res.suffix)

            # allow only if we want all requests or we want
            # everything but blacklisted domains
            domain_allowed = (self.backlog_blacklist or
                              domain not in self.black_domains)
            # insert if crawl never expires (0) or time < expires
            not_expired = (meta['expires'] == 0 or
                           curr_time < meta['expires'])

            if domain_allowed and not_expired:
                # we may already have the queue in memory
                if key in self.queue_keys:
                    self.queue_dict[key][0].push(req_dict, meta['priority'])
                else:
                    # shoving into a new redis queue, negative b/c of sorted sets
                    # this will populate ourself and other schedulers when
                    # they call create_queues
                    self.redis_conn.zadd(key, ujson.dumps(req_dict),
                                         -meta['priority'])
                self.logger.debug("Crawlid: '{id}' Appid: '{appid}' added to queue"
                                  .format(appid=meta['appid'],
                                          id=meta['crawlid']))
            else:
                self.logger.debug("Crawlid: '{id}' Appid: '{appid}' expired"
                                  .format(appid=meta['appid'],
                                          id=meta['crawlid']))
        else:
            self.logger.debug("Crawlid: '{id}' Appid: '{appid}' blacklisted"
                              .format(appid=meta['appid'],
                                      id=meta['crawlid']))

    def request_to_dict(self, request):
        '''
        Convert Request object to a dict.
        modified from scrapy.utils.reqser
        '''
        req_dict = {
            # urls should be safe (safe_string_url)
            'url': to_unicode(request.url),
            'method': request.method,
            'headers': dict(request.headers),
            'body': request.body,
            'cookies': request.cookies,
            'meta': request.meta,
            '_encoding': request._encoding,
            'priority': request.priority,
            'dont_filter': request.dont_filter,
            # callback/errback are assumed to be a bound instance of the spider
            'callback': None if request.callback is None else request.callback.__name__,
            'errback': None if request.errback is None else request.errback.__name__,
        }
        return req_dict

    def find_item(self):
        '''
        Finds an item from the throttled queues

        @return: the popped item, or None when no queue yields one within
            item_retries + 1 passes
        '''
        random.shuffle(self.queue_keys)
        count = 0

        while count <= self.item_retries:
            for key in self.queue_keys:
                # skip if the whole domain has been blacklisted in zookeeper
                if key.split(':')[1] in self.black_domains:
                    continue
                # the throttled queue only returns an item if it is allowed
                item = self.queue_dict[key][0].pop()

                if item:
                    # update timeout and return
                    self.queue_dict[key][1] = time.time()
                    return item

            count = count + 1

        return None

    def next_request(self):
        '''
        Logic to handle getting a new url request, from a bunch of
        different queues

        @return: a Request built from the popped item, or None
        '''
        t = time.time()
        # update the redis queues every so often
        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()
            self.expire_queues()

        # update the ip address every so often
        if t - self.update_ip_time > self.ip_update_interval:
            self.update_ip_time = t
            self.update_ipaddress()
            self.report_self()

        item = self.find_item()
        if not item:
            return None

        self.logger.debug("Found url to crawl {url}" \
                          .format(url=item['url']))
        try:
            req = Request(item['url'])
        except ValueError:
            # need absolute url
            # need better url validation here
            req = Request('http://' + item['url'])

        try:
            if 'callback' in item and item['callback'] is not None:
                req.callback = getattr(self.spider, item['callback'])
        except AttributeError:
            self.logger.warn("Unable to find callback method")

        try:
            if 'errback' in item and item['errback'] is not None:
                req.errback = getattr(self.spider, item['errback'])
        except AttributeError:
            self.logger.warn("Unable to find errback method")

        if 'meta' in item:
            item = item['meta']

        # defaults not in schema
        if 'curdepth' not in item:
            item['curdepth'] = 0
        if "retry_times" not in item:
            item['retry_times'] = 0

        for key in list(item.keys()):
            req.meta[key] = item[key]

        # extra check to add items to request
        if 'useragent' in item and item['useragent'] is not None:
            req.headers['User-Agent'] = item['useragent']
        if 'cookie' in item and item['cookie'] is not None:
            if isinstance(item['cookie'], dict):
                req.cookies = item['cookie']
            elif isinstance(item['cookie'], basestring):
                req.cookies = self.parse_cookie(item['cookie'])

        return req

    def parse_cookie(self, string):
        '''
        Parses a cookie string like returned in a Set-Cookie header
        @param string: The cookie string
        @return: the cookie dict
        '''
        # raw string keeps the same pattern without invalid escape warnings
        results = re.findall(r'([^=]+)=([^\;]+);?\s?', string)
        my_dict = {}
        for item in results:
            my_dict[item[0]] = item[1]

        return my_dict

    def has_pending_requests(self):
        '''
        We never want to say we have pending requests
        If this returns True scrapy sometimes hangs.
        '''
        return False
class TestZookeeperWatcher(TestCase):
    '''
    Unit tests for ZookeeperWatcher, run against a MagicMock that stands in
    for the KazooClient so no Zookeeper server is needed
    '''

    def setUp(self):
        zoo_client = MagicMock()
        zoo_client.get = MagicMock(return_value=('data', 'blah'))
        # the watcher has to be built while the patch is active so it
        # receives the mocked client
        with patch('scutils.zookeeper_watcher.KazooClient') as k:
            k.return_value = zoo_client
            self.zoo_watcher = ZookeeperWatcher(
                hosts='localhost',
                filepath='/mypath',
                pointer=False,
                ensure=True,
                valid_init=True)

    def test_ping(self):
        self.zoo_watcher.zoo_client.server_version = MagicMock()
        self.assertTrue(self.zoo_watcher.ping())
        self.zoo_watcher.zoo_client.server_version = MagicMock(
            side_effect=KazooException)
        self.assertFalse(self.zoo_watcher.ping())

    def test_get_file_contents(self):
        self.zoo_watcher.old_pointed = 'old_pointed'
        self.zoo_watcher.old_data = 'old_data'

        # assertEqual replaces the deprecated assertEquals alias, which
        # no longer exists on Python 3.12+
        self.zoo_watcher.pointer = False
        self.assertEqual(self.zoo_watcher.get_file_contents(), 'old_data')

        self.zoo_watcher.pointer = True
        self.assertEqual(self.zoo_watcher.get_file_contents(), 'old_data')

        self.zoo_watcher.pointer = True
        self.assertEqual(self.zoo_watcher.get_file_contents(True),
                         'old_pointed')

    def test_compare_pointer(self):
        self.zoo_watcher.old_pointed = '/path1'
        self.assertTrue(self.zoo_watcher.compare_pointer('/path2'))

        self.zoo_watcher.old_pointed = '/path1'
        self.assertFalse(self.zoo_watcher.compare_pointer('/path1'))

    def test_compare_data(self):
        self.zoo_watcher.old_data = 'old_data'
        self.assertTrue(self.zoo_watcher.compare_data('new_data'))

        self.zoo_watcher.old_data = 'same_data'
        self.assertFalse(self.zoo_watcher.compare_data('same_data'))

    def test_set_valid(self):
        self.zoo_watcher.is_valid = MagicMock(return_value=True)
        self.zoo_watcher.valid_handler = MagicMock()
        self.zoo_watcher.set_valid(False)
        self.zoo_watcher.valid_handler.assert_called_once_with(True)

    def test_call_valid(self):
        self.the_bool = False

        def the_set(state):
            self.the_bool = True

        self.zoo_watcher.valid_handler = the_set
        self.zoo_watcher.call_valid(True)
        self.assertTrue(self.the_bool)

    def test_call_config(self):
        self.the_bool = False

        def the_set(state):
            self.the_bool = True

        self.zoo_watcher.config_handler = the_set
        self.zoo_watcher.call_config(True)
        self.assertTrue(self.the_bool)

    def test_call_error(self):
        self.the_bool = False

        def the_set(state):
            self.the_bool = True

        self.zoo_watcher.error_handler = the_set
        self.zoo_watcher.call_error(True)
        self.assertTrue(self.the_bool)
class DistributedScheduler(object):
    '''
    Scrapy request scheduler that utilizes Redis Throttled Priority Queues
    to moderate different domain scrape requests within a distributed scrapy
    cluster
    '''
    redis_conn = None  # the redis connection
    queue_dict = None  # the dict of throttled queues
    spider = None  # the spider using this scheduler
    queue_keys = None  # the list of current queues
    queue_class = None  # the class to use for the queue
    dupefilter = None  # the redis dupefilter
    update_time = 0  # the last time the queues were updated
    update_ip_time = 0  # the last time the ip was updated
    update_interval = 0  # how often to update the queues
    extract = None  # the tld extractor
    hits = 0  # default number of hits for a queue
    window = 0  # default window to calculate number of hits
    my_ip = None  # the ip address of the scheduler (if needed)
    old_ip = None  # the old ip for logging
    ip_update_interval = 0  # the interval to update the ip address
    add_type = None  # add spider type to redis throttle queue key
    add_ip = None  # add spider public ip to redis throttle queue key
    item_retries = 0  # the number of extra tries to get an item
    my_uuid = None  # the generated UUID for the particular scrapy process

    # Zookeeper Dynamic Config Vars
    domain_config = {}  # The list of domains and their configs
    my_id = None  # The id used to read the throttle config
    config_flag = False  # Flag to reload queues if settings are wiped too
    assign_path = None  # The base assigned configuration path to read
    zoo_client = None  # The KazooClient to manage the config
    my_assignment = None  # Zookeeper path to read actual yml config

    def __init__(self, server, persist, update_int, timeout, retries, logger,
                 hits, window, mod, ip_refresh, add_type, add_ip, ip_regex):
        '''
        Initialize the scheduler

        @param server: the redis connection
        @param persist: when False the queues are cleared on close()
        @param update_int: seconds between refreshes of the queue list
        @param timeout: timeout handed to the dupefilter
        @param retries: the number of extra passes find_item() makes
        @param logger: the logger instance to use
        @param hits: default number of hits for a queue
        @param window: default window to calculate number of hits
        @param mod: the moderation flag handed to every throttled queue
        @param ip_refresh: the ip refresh interval, also used for the
            expiry of the key written by report_self()
        @param add_type: add spider type to the throttle key
        @param add_ip: add the public ip to the throttle key
        @param ip_regex: regex used to pull the ip out of the ip service
            response
        '''
        self.redis_conn = server
        self.persist = persist
        self.queue_dict = {}
        self.update_interval = update_int
        self.hits = hits
        self.window = window
        self.moderated = mod
        self.rfp_timeout = timeout
        self.ip_update_interval = ip_refresh
        self.add_type = add_type
        self.add_ip = add_ip
        self.item_retries = retries
        self.logger = logger
        self.ip_regex = re.compile(ip_regex)

        # per instance container so nothing is shared through the class
        self.domain_config = {}

        # set up tldextract
        self.extract = tldextract.TLDExtract()

        self.update_ipaddress()

        # if we need better uuid's mod this line
        self.my_uuid = str(uuid.uuid4()).split('-')[4]

        # wrapper next_request
        self.next_request = next_request_method_wrapper(self)(
            self.next_request)

    def setup_zookeeper(self):
        '''
        Creates the ZookeeperWatcher for this scheduler's config file.
        Exits the process if Zookeeper cannot be reached or pinged
        '''
        self.assign_path = settings.get('ZOOKEEPER_ASSIGN_PATH', "")
        self.my_id = settings.get('ZOOKEEPER_ID', 'all')
        self.logger.debug("Trying to establish Zookeeper connection")
        try:
            self.zoo_watcher = ZookeeperWatcher(
                hosts=settings.get('ZOOKEEPER_HOSTS'),
                filepath=self.assign_path + self.my_id,
                config_handler=self.change_config,
                error_handler=self.error_config,
                pointer=False, ensure=True, valid_init=True)
        except KazooTimeoutError:
            self.logger.error("Could not connect to Zookeeper")
            sys.exit(1)

        if self.zoo_watcher.ping():
            self.logger.debug("Successfully set up Zookeeper connection")
        else:
            self.logger.error("Could not ping Zookeeper")
            sys.exit(1)

    def change_config(self, config_string):
        '''
        Handler given to the watcher as config_handler

        @param config_string: the yaml config text, None or empty when the
            config has been wiped
        '''
        if config_string:
            loaded_config = yaml.safe_load(config_string)
            self.logger.info("Zookeeper config changed", extra=loaded_config)
            self.load_domain_config(loaded_config)
            self.update_domain_queues()
        else:
            self.error_config("Zookeeper config wiped")

        self.create_queues()

    def load_domain_config(self, loaded_config):
        '''
        Loads the domain_config and sets up queue_dict
        @param loaded_config: the yaml loaded config dict from zookeeper
        '''
        self.domain_config = {}
        # vetting process to ensure correct configs
        if loaded_config and 'domains' in loaded_config:
            for domain in loaded_config['domains']:
                item = loaded_config['domains'][domain]
                # check valid
                if 'window' in item and 'hits' in item:
                    self.logger.debug(
                        "Added domain {dom} to loaded config".format(
                            dom=domain))
                    self.domain_config[domain] = item

        self.config_flag = True

    def update_domain_queues(self):
        '''
        Check to update existing queues already in memory
        new queues are created elsewhere
        '''
        for key in self.domain_config:
            final_key = "{name}:{domain}:queue".format(name=self.spider.name,
                                                       domain=key)
            # we already have a throttled queue for this domain, update it to new settings
            if final_key in self.queue_dict:
                conf = self.domain_config[key]
                throttled = self.queue_dict[final_key]
                throttled.window = float(conf['window'])
                self.logger.debug(
                    "Updated queue {q} with new config".format(q=final_key))
                # if scale is applied, scale back; otherwise use updated hits
                if 'scale' in conf:
                    # round to int
                    hits = int(conf['hits'] * self.fit_scale(conf['scale']))
                    throttled.limit = float(hits)
                else:
                    throttled.limit = float(conf['hits'])

    def error_config(self, message):
        '''
        Handler given to the watcher as error_handler; reverts every
        configured domain queue to the default window and hits

        @param message: the reason the config was lost
        '''
        extras = {}
        extras['message'] = message
        extras['revert_window'] = self.window
        extras['revert_hits'] = self.hits
        extras['spiderid'] = self.spider.name
        self.logger.info("Lost config from Zookeeper", extra=extras)
        # lost connection to zookeeper, reverting back to defaults
        for key in self.domain_config:
            final_key = "{name}:{domain}:queue".format(name=self.spider.name,
                                                       domain=key)
            # a configured domain may not have an in memory queue yet
            if final_key in self.queue_dict:
                self.queue_dict[final_key].window = self.window
                self.queue_dict[final_key].limit = self.hits

        self.domain_config = {}

    def fit_scale(self, scale):
        '''
        @return: a scale >= 0 and <= 1
        '''
        if scale >= 1:
            return 1.0
        elif scale <= 0:
            return 0.0
        else:
            return scale

    def create_queues(self):
        '''
        Updates the in memory list of the redis queues
        Creates new throttled queue instances if it does not have them
        '''
        # new config could have loaded between scrapes
        newConf = self.check_config()

        self.queue_keys = self.redis_conn.keys(self.spider.name + ":*:queue")

        for key in self.queue_keys:
            # build final queue key, depending on type and ip bools
            throttle_key = ""

            if self.add_type:
                throttle_key = self.spider.name + ":"
            if self.add_ip:
                throttle_key = throttle_key + self.my_ip + ":"

            # add the tld from the key `type:tld:queue`
            the_domain = re.split(':', key)[1]
            throttle_key = throttle_key + the_domain

            if key not in self.queue_dict or newConf:
                self.logger.debug(
                    "Added new Throttled Queue {q}".format(q=key))
                q = RedisPriorityQueue(self.redis_conn, key)

                if the_domain not in self.domain_config:
                    # use default window and hits
                    window = self.window
                    hits = self.hits
                else:
                    # use custom window and hits
                    window = self.domain_config[the_domain]['window']
                    hits = self.domain_config[the_domain]['hits']

                    # adjust the crawl rate based on the scale if exists
                    if 'scale' in self.domain_config[the_domain]:
                        hits = int(hits * self.fit_scale(
                            self.domain_config[the_domain]['scale']))

                self.queue_dict[key] = RedisThrottledQueue(
                    self.redis_conn, q, window, hits, self.moderated,
                    throttle_key, throttle_key)

    def check_config(self):
        '''
        Controls configuration for the scheduler

        @return: True if there is a new configuration
        '''
        if self.config_flag:
            self.config_flag = False
            return True

        return False

    def update_ipaddress(self):
        '''
        Updates the scheduler so it knows its own ip address
        '''
        # assign local ip in case of exception
        self.old_ip = self.my_ip
        self.my_ip = get_raspberrypi_ip_address()
        try:
            obj = urllib2.urlopen(
                settings.get('PUBLIC_IP_URL', 'http://ip.42.pl/raw'))
            try:
                raw = obj.read()
            finally:
                # release the connection even when the read fails
                obj.close()
            results = self.ip_regex.findall(raw)
            if len(results) > 0:
                self.my_ip = results[0]
            else:
                raise IOError("Could not get valid IP Address")
            self.logger.debug("Current public ip: {ip}".format(ip=self.my_ip))
        except IOError:
            # best effort, the local ip assigned above stays in place
            self.logger.error("Could not reach out to get public ip")

        if self.old_ip != self.my_ip:
            self.logger.info("Changed Public IP: {old} -> {new}".format(
                old=self.old_ip, new=self.my_ip))

    def report_self(self):
        '''
        Reports the crawler uuid to redis
        '''
        self.logger.debug("Reporting self id", extra={'uuid': self.my_uuid})
        key = "stats:crawler:{m}:{s}:{u}".format(m=socket.gethostname(),
                                                 s=self.spider.name,
                                                 u=self.my_uuid)
        self.redis_conn.set(key, time.time())
        self.redis_conn.expire(key, self.ip_update_interval * 2)

    @classmethod
    def from_settings(cls, settings, spidername):
        '''
        Builds a scheduler from a settings object

        @param settings: object exposing get(name, default)
        @param spidername: the spider name, used to name the logger and
            the log file
        @return: a new scheduler instance
        '''
        server = redis.Redis(host=settings.get('REDIS_HOST'),
                             port=settings.get('REDIS_PORT'))
        persist = settings.get('SCHEDULER_PERSIST', True)
        up_int = settings.get('SCHEDULER_QUEUE_REFRESH', 10)
        hits = settings.get('QUEUE_HITS', 10)
        window = settings.get('QUEUE_WINDOW', 60)
        mod = settings.get('QUEUE_MODERATED', False)
        timeout = settings.get('DUPEFILTER_TIMEOUT', 600)
        ip_refresh = settings.get('SCHEDULER_IP_REFRESH', 60)
        add_type = settings.get('SCHEDULER_TYPE_ENABLED', False)
        add_ip = settings.get('SCHEDULER_IP_ENABLED', False)
        # prefer the correctly spelt key, fall back to the historical
        # misspelling so existing settings files keep working
        retries = settings.get('SCHEDULER_ITEM_RETRIES',
                               settings.get('SCHEUDLER_ITEM_RETRIES', 3))
        ip_regex = settings.get('IP_ADDR_REGEX', '.*')

        # look the address up once so the logger name and the log file
        # name always agree
        local_ip = get_raspberrypi_ip_address()

        my_level = settings.get('SC_LOG_LEVEL', 'DEBUG')
        my_name = "%s_%s" % (spidername, local_ip)
        my_output = settings.get('SC_LOG_STDOUT', False)
        my_json = settings.get('SC_LOG_JSON', True)
        my_dir = settings.get('SC_LOG_DIR', 'logs')
        my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
        my_file = "%s_%s.log" % (spidername, local_ip)
        my_backups = settings.get('SC_LOG_BACKUPS', 5)

        logger = CustomLogFactory.get_instance(json=my_json,
                                               name=my_name,
                                               stdout=my_output,
                                               level=my_level,
                                               dir=my_dir,
                                               file=my_file,
                                               bytes=my_bytes,
                                               backups=my_backups)

        return cls(server, persist, up_int, timeout, retries, logger, hits,
                   window, mod, ip_refresh, add_type, add_ip, ip_regex)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings, crawler.spider.name)

    def open(self, spider):
        '''
        Binds the spider, hands it the logger and redis connection, then
        sets up the queues, the Zookeeper watcher and the dupefilter
        '''
        self.spider = spider
        self.spider.set_logger(self.logger)
        self.spider.set_redis(self.redis_conn)
        self.spider.setup_stats()
        self.create_queues()
        self.setup_zookeeper()
        self.dupefilter = RFPDupeFilter(self.redis_conn,
                                        self.spider.name + ':dupefilter',
                                        self.rfp_timeout)

    def close(self, reason):
        '''
        Clears the dupefilter and every known queue unless persist is set
        '''
        self.logger.info("Closing Spider", {'spiderid': self.spider.name})
        if not self.persist:
            self.logger.warning("Clearing crawl queues")
            self.dupefilter.clear()
            for key in self.queue_keys:
                self.queue_dict[key].clear()

    def is_blacklisted(self, appid, crawlid):
        '''
        Blacklist check for an appid crawlid combo. The redis blacklist
        lookup is disabled in this scheduler, so nothing is ever reported
        as blacklisted

        @return: always False
        '''
        # dont use the blacklist
        return False

    def enqueue_request(self, request):
        '''
        Pushes a request from the spider into the proper throttled queue
        '''
        if not request.dont_filter and self.dupefilter.request_seen(request):
            self.logger.debug("Request not added back to redis")
            return
        req_dict = self.request_to_dict(request)
        meta = req_dict['meta']

        if not self.is_blacklisted(meta['appid'], meta['crawlid']):
            # grab the tld of the request
            ex_res = self.extract(req_dict['url'])
            key = "{sid}:{dom}.{suf}:queue".format(
                sid=meta['spiderid'],
                dom=ex_res.domain,
                suf=ex_res.suffix)

            curr_time = time.time()

            # insert if crawl never expires (0) or time < expires
            if meta['expires'] == 0 or curr_time < meta['expires']:
                # we may already have the queue in memory
                if key in self.queue_keys:
                    self.queue_dict[key].push(req_dict, meta['priority'])
                else:
                    # shoving into a new redis queue, negative b/c of sorted sets
                    # this will populate ourself and other schedulers when
                    # they call create_queues
                    self.redis_conn.zadd(key,
                                         pickle.dumps(req_dict, protocol=-1),
                                         -meta['priority'])
                # fall back to the request url so a meta dict without a
                # `url` entry cannot break logging after the push
                self.logger.debug(
                    "Crawlid: '{id}' Appid: '{appid}' Url: '{url}' added to queue"
                    .format(appid=meta['appid'],
                            id=meta['crawlid'],
                            url=meta.get('url', req_dict['url'])))
            else:
                self.logger.debug(
                    "Crawlid: '{id}' Appid: '{appid}' expired".format(
                        appid=meta['appid'],
                        id=meta['crawlid']))
        else:
            self.logger.debug(
                "Crawlid: '{id}' Appid: '{appid}' blacklisted".format(
                    appid=meta['appid'],
                    id=meta['crawlid']))

    def request_to_dict(self, request):
        '''
        Convert Request object to a dict.
        modified from scrapy.utils.reqser
        '''
        req_dict = {
            # urls should be safe (safe_string_url)
            'url': request.url.decode('ascii'),
            'method': request.method,
            'headers': dict(request.headers),
            'body': request.body,
            'cookies': request.cookies,
            'meta': request.meta,
            '_encoding': request._encoding,
            'priority': request.priority,
            'dont_filter': request.dont_filter,
            # callback/errback are assumed to be a bound instance of the spider
            'callback': None if request.callback is None else request.callback.func_name,
            'errback': None if request.errback is None else request.errback.func_name,
        }
        return req_dict

    def find_item(self):
        '''
        Finds an item from the throttled queues

        @return: the popped item, or None when no queue yields one within
            item_retries + 1 passes
        '''
        random.shuffle(self.queue_keys)
        count = 0

        while count <= self.item_retries:
            for key in self.queue_keys:
                # the throttled queue only returns an item if it is allowed
                item = self.queue_dict[key].pop()
                self.present_item = item

                # measure the queue once and reuse the value for every
                # log line below
                queue_len = len(self.queue_dict[key])
                msgvalue = {
                    'queuename': key,
                    'lenthofqueue': queue_len
                }
                msg = "lenth of queue %s" % key
                self.logger.info('key: %s ' % key)
                self.logger.info('len(self.queue_dict[key]): %s ' % queue_len)
                self.logger.info(msg, msgvalue)

                if item:
                    return item
                # we want the spiders to get slightly out of sync
                # with each other for better performance
                time.sleep(random.random())

            count = count + 1

        return None

    def next_request(self):
        '''
        Logic to handle getting a new url request, from a bunch of
        different queues

        @return: a Request built from the popped item, or None
        '''
        t = time.time()
        # update the redis queues every so often
        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()

        item = self.find_item()
        if not item:
            return None

        # prefer the url kept in meta, fall back to the item url when
        # meta is missing or carries no `url` entry
        found_url = item['url']
        if 'meta' in item:
            found_url = item['meta'].get('url', item['url'])
        self.logger.info(
            'distributed_scheduler.py::DistributedScheduler::next_request call find_item() result is : %s'
            % found_url)
        self.logger.debug("Found url to crawl {url}" \
                          .format(url=item['url']))
        try:
            req = Request(item['url'])
        except ValueError:
            # need absolute url
            # need better url validation here
            req = Request('http://' + item['url'])

        if 'callback' in item:
            cb = item['callback']
            if cb and self.spider:
                cb = get_method(self.spider, cb)
                req.callback = cb

        if 'errback' in item:
            eb = item['errback']
            if eb and self.spider:
                eb = get_method(self.spider, eb)
                req.errback = eb

        if 'meta' in item:
            item = item['meta']

        # defaults not in schema
        if 'curdepth' not in item:
            item['curdepth'] = 0
        if "retry_times" not in item:
            item['retry_times'] = 0

        for key in item.keys():
            req.meta[key] = item[key]

        # extra check to add items to request
        if 'useragent' in item and item['useragent'] is not None:
            req.headers['User-Agent'] = item['useragent']
        if 'cookie' in item and item['cookie'] is not None:
            if isinstance(item['cookie'], dict):
                req.cookies = item['cookie']
            elif isinstance(item['cookie'], basestring):
                req.cookies = self.parse_cookie(item['cookie'])

        return req

    def parse_cookie(self, string):
        '''
        Parses a cookie string like returned in a Set-Cookie header
        @param string: The cookie string
        @return: the cookie dict
        '''
        # raw string keeps the same pattern without invalid escape warnings
        results = re.findall(r'([^=]+)=([^\;]+);?\s?', string)
        my_dict = {}
        for item in results:
            my_dict[item[0]] = item[1]

        return my_dict

    def has_pending_requests(self):
        '''
        We never want to say we have pending requests
        If this returns True scrapy sometimes hangs.
        '''
        return False
class TestZookeeperWatcher(TestCase):
    '''
    Offline unit tests for ZookeeperWatcher.

    KazooClient is patched with a MagicMock while the watcher is built, so
    no live Zookeeper server is needed.
    '''

    def setUp(self):
        '''Build a watcher whose Kazoo client is a mock returning fixed data.'''
        zoo_client = MagicMock()
        zoo_client.get = MagicMock(return_value=('data', 'blah'))
        with patch('scutils.zookeeper_watcher.KazooClient') as k:
            k.return_value = zoo_client
            self.zoo_watcher = ZookeeperWatcher(
                hosts='localhost', filepath='/mypath', pointer=False,
                ensure=True, valid_init=True)

    def test_ping(self):
        '''ping() is True when server_version succeeds, False when it raises.'''
        self.zoo_watcher.zoo_client.server_version = MagicMock()
        self.assertTrue(self.zoo_watcher.ping())

        self.zoo_watcher.zoo_client.server_version = MagicMock(
            side_effect=KazooException)
        self.assertFalse(self.zoo_watcher.ping())

    def test_get_file_contents(self):
        '''get_file_contents() returns the cached data or the cached pointer.'''
        self.zoo_watcher.old_pointed = 'old_pointed'
        self.zoo_watcher.old_data = 'old_data'

        # assertEqual is used instead of the assertEquals alias, which was
        # deprecated and then removed in Python 3.12.
        self.zoo_watcher.pointer = False
        self.assertEqual(self.zoo_watcher.get_file_contents(), 'old_data')

        self.zoo_watcher.pointer = True
        self.assertEqual(self.zoo_watcher.get_file_contents(), 'old_data')

        self.zoo_watcher.pointer = True
        self.assertEqual(self.zoo_watcher.get_file_contents(True),
                         'old_pointed')

    def test_compare_pointer(self):
        '''compare_pointer() is True only when the pointed path differs.'''
        self.zoo_watcher.old_pointed = '/path1'
        self.assertTrue(self.zoo_watcher.compare_pointer('/path2'))

        self.zoo_watcher.old_pointed = '/path1'
        self.assertFalse(self.zoo_watcher.compare_pointer('/path1'))

    def test_compare_data(self):
        '''compare_data() is True only when the data differs.'''
        self.zoo_watcher.old_data = 'old_data'
        self.assertTrue(self.zoo_watcher.compare_data('new_data'))

        self.zoo_watcher.old_data = 'same_data'
        self.assertFalse(self.zoo_watcher.compare_data('same_data'))

    def test_set_valid(self):
        '''set_valid() invokes valid_handler with the value of is_valid().'''
        self.zoo_watcher.is_valid = MagicMock(return_value=True)
        self.zoo_watcher.valid_handler = MagicMock()
        self.zoo_watcher.set_valid(False)
        self.zoo_watcher.valid_handler.assert_called_once_with(True)

    def test_call_valid(self):
        '''call_valid() forwards to the registered valid_handler.'''
        self.the_bool = False

        def the_set(state):
            self.the_bool = True

        self.zoo_watcher.valid_handler = the_set
        self.zoo_watcher.call_valid(True)
        self.assertTrue(self.the_bool)

    def test_call_config(self):
        '''call_config() forwards to the registered config_handler.'''
        self.the_bool = False

        def the_set(state):
            self.the_bool = True

        self.zoo_watcher.config_handler = the_set
        self.zoo_watcher.call_config(True)
        self.assertTrue(self.the_bool)

    def test_call_error(self):
        '''call_error() forwards to the registered error_handler.'''
        self.the_bool = False

        def the_set(state):
            self.the_bool = True

        self.zoo_watcher.error_handler = the_set
        self.zoo_watcher.call_error(True)
        self.assertTrue(self.the_bool)
class DistributedScheduler(object):
    '''
    Scrapy scheduler that pulls crawl requests from per-domain Redis
    throttled priority queues and hands requests produced by the spider
    to a Kafka topic.
    '''

    redis_conn = None  # the redis connection
    queue_dict = None  # the dict of throttled queues
    spider = None  # the spider using this scheduler
    queue_keys = None  # the list of current queues
    queue_class = None  # the class to use for the queue
    dupefilter = None  # the redis dupefilter
    update_time = 0  # the last time the queues were updated
    update_ip_time = 0  # the last time the ip was updated
    update_interval = 0  # how often to update the queues
    extract = None  # the tld extractor
    hits = 0  # default number of hits for a queue
    window = 0  # default window to calculate number of hits
    ip = '127.0.0.1'  # the IP address of this crawler node
    old_ip = None  # the old ip for logging
    ip_update_interval = 0  # the interval to update the ip address
    add_type = None  # add spider type to redis throttle queue key
    add_ip = None  # add spider public ip to redis throttle queue key
    item_retries = 0  # the number of extra tries to get an item
    my_uuid = None  # the generated UUID for the particular scrapy process

    # Zookeeper Dynamic Config Vars
    domain_config = {}  # The list of domains and their configs
    my_id = None  # The id used to read the throttle config
    config_flag = False  # Flag to reload queues if settings are wiped too
    assign_path = None  # The base assigned configuration path to read
    zoo_client = None  # The KazooClient to manage the config
    my_assignment = None  # Zookeeper path to read actual yml config
    black_domains = []  # the domains to ignore thanks to zookeeper config

    producer = None  # the Kafka producer used to publish requests
    closed = False  # whether the Kafka connection has been closed

    def __init__(self, server, persist, update_int, timeout, retries, logger,
                 hits, window, mod, ip_refresh, add_type, add_ip, ip_regex,
                 backlog_blacklist, queue_timeout, chose):
        '''
        Initialize the scheduler

        @param server: the redis connection
        @param persist: keep the crawl queues when the spider closes
        @param update_int: how often to update the queues
        @param timeout: the dupefilter timeout
        @param retries: the number of extra tries to get an item
        @param logger: the logger to use
        @param hits: default number of hits for a queue
        @param window: default window to calculate number of hits
        @param mod: passed to every RedisThrottledQueue as its moderation flag
        @param ip_refresh: the interval to update the ip address
        @param add_type: add spider type to redis throttle queue key
        @param add_ip: add the node ip to redis throttle queue key
        @param ip_regex: regular expression, compiled and kept as ip_regex
        @param backlog_blacklist: stored as-is; not read in this class
        @param queue_timeout: idle seconds before an in-memory queue expires
        @param chose: object indexed by url bytes to pick a target job id
        '''
        self.redis_conn = server
        self.persist = persist
        self.queue_dict = {}
        self.update_interval = update_int
        self.hits = hits
        self.window = window
        self.moderated = mod
        self.rfp_timeout = timeout
        self.ip_update_interval = ip_refresh
        self.add_type = add_type
        self.add_ip = add_ip
        # Stored under the name find_item() reads; a misspelled attribute
        # here would leave the class default of 0 in effect.
        self.item_retries = retries
        self.logger = logger
        self.ip_regex = re.compile(ip_regex)
        self.backlog_blacklist = backlog_blacklist
        self.queue_timeout = queue_timeout
        self.chose = chose
        # per-instance containers so instances never share the class-level ones
        self.domain_config = {}
        self.black_domains = []

        self.extract = tldextract.TLDExtract()

        self.job_id = None  # identifies the crawler process
        self.paused = False  # whether the crawler is currently paused

    def setup_zookeeper(self):
        '''
        Creates the ZookeeperWatcher for the throttle config and exits the
        process if the connection cannot be established or pinged
        '''
        self.assign_path = settings.get('ZOOKEEPER_ASSIGN_PATH', "")
        self.my_id = settings.get('ZOOKEEPER_ID', 'all')
        self.logger.debug("Trying to establish Zookeeper connection")
        try:
            self.zoo_watcher = ZookeeperWatcher(
                hosts=settings.get('ZOOKEEPER_HOSTS'),
                filepath=self.assign_path + self.my_id,
                config_handler=self.change_config,
                error_handler=self.error_config,
                pointer=False, ensure=True, valid_init=True)
        except KazooTimeoutError:
            self.logger.error("Could not connect to Zookeeper")
            sys.exit(1)

        if self.zoo_watcher.ping():
            self.logger.debug("Successfully set up Zookeeper connection")
        else:
            self.logger.error("Could not ping Zookeeper")
            sys.exit(1)

    def change_config(self, config_string):
        '''
        Handler for a changed Zookeeper config

        @param config_string: the raw yaml config, or None/empty when wiped
        '''
        if config_string and len(config_string) > 0:
            loaded_config = yaml.safe_load(config_string)
            self.logger.info("Zookeeper config changed", extra=loaded_config)
            self.load_domain_config(loaded_config)
            self.update_domain_queues()
        elif config_string is None or len(config_string) == 0:
            self.error_config("Zookeeper config wiped")

        self.create_throttle_queues()

    def load_domain_config(self, loaded_config):
        '''
        Loads the domain_config and sets up queue_dict

        @param loaded_config: the yaml loaded config dict from zookeeper
        '''
        self.domain_config = {}
        # vetting process to ensure correct configs
        if loaded_config:
            if 'domains' in loaded_config:
                for domain in loaded_config['domains']:
                    item = loaded_config['domains'][domain]
                    # only entries carrying both settings are accepted
                    if 'window' in item and 'hits' in item:
                        self.logger.debug("Added domain {dom} to loaded config"
                                          .format(dom=domain))
                        self.domain_config[domain] = item
            if 'blacklist' in loaded_config:
                self.black_domains = loaded_config['blacklist']

        self.config_flag = True

    def _queue_key(self, domain):
        '''Builds the queue_dict key used for a domain by this spider/job.'''
        return "{spider_type}:{job_id}:{domain}:queue".format(
            spider_type=self.spider.name,
            job_id=self.job_id,
            domain=domain)

    def update_domain_queues(self):
        '''
        Check to update existing queues already in memory
        new queues are created elsewhere
        '''
        for key in self.domain_config:
            final_key = self._queue_key(key)
            # only queues that already exist in memory are updated
            if final_key not in self.queue_dict:
                continue

            config = self.domain_config[key]
            queue = self.queue_dict[final_key][0]
            queue.window = float(config['window'])
            self.logger.debug("Updated queue {q} with new config"
                              .format(q=final_key))
            # if scale is applied, scale back; otherwise use updated hits
            if 'scale' in config:
                # round to int
                hits = int(config['hits'] * self.fit_scale(config['scale']))
                queue.limit = float(hits)
            else:
                queue.limit = float(config['hits'])

    def error_config(self, message):
        '''
        Reverts every configured domain queue to the default window/hits
        and forgets the domain config

        @param message: the reason, attached to the log entry
        '''
        extras = {}
        extras['message'] = message
        extras['revert_window'] = self.window
        extras['revert_hits'] = self.hits
        extras['spiderid'] = self.spider.name
        self.logger.info("Lost config from Zookeeper", extra=extras)
        # lost connection to zookeeper, reverting back to defaults
        for key in self.domain_config:
            # Same key layout as update_domain_queues(); a configured domain
            # without an in-memory queue is skipped instead of raising.
            final_key = self._queue_key(key)
            if final_key in self.queue_dict:
                self.queue_dict[final_key][0].window = self.window
                self.queue_dict[final_key][0].limit = self.hits

        self.domain_config = {}

    def fit_scale(self, scale):
        '''
        @return: a scale >= 0 and <= 1
        '''
        if scale >= 1:
            return 1.0
        elif scale <= 0:
            return 0.0
        else:
            return scale

    def create_throttle_queues(self):
        '''
        Refreshes the list of redis queue keys for this spider/job and
        creates a throttled queue for every key not yet in memory, or for
        every key when a new config has been loaded
        '''
        new_conf = self.check_config()

        queue_key = '{spider_type}:{job_id}:*:queue'.format(
            spider_type=self.spider.name, job_id=self.job_id)
        self.queue_keys = self.redis_conn.keys(queue_key)

        for key in self.queue_keys:
            # build the throttle key, depending on type and ip bools
            throttle_key = ""
            if self.add_type:
                throttle_key = self.spider.name + ":"
            if self.add_ip:
                throttle_key = throttle_key + self.ip + ":"

            # the domain is the third part of `type:job:domain:queue`
            the_domain = re.split(':', key)[2]
            throttle_key += the_domain

            if key not in self.queue_dict or new_conf:
                self.logger.debug("Added new Throttled Queue {q}"
                                  .format(q=key))
                q = RedisPriorityQueue(self.redis_conn, key)

                if the_domain not in self.domain_config:
                    # use default window and hits
                    window = self.window
                    hits = self.hits
                else:
                    # use custom window and hits
                    window = self.domain_config[the_domain]['window']
                    hits = self.domain_config[the_domain]['hits']
                    if 'scale' in self.domain_config[the_domain]:
                        hits = int(hits * self.fit_scale(
                            self.domain_config[the_domain]['scale']))

                # [0] is the queue object, [1] the time it was last used
                self.queue_dict[key] = [
                    RedisThrottledQueue(self.redis_conn, q, window, hits,
                                        self.moderated, throttle_key,
                                        throttle_key, True),
                    time.time()]

    def expire_queues(self):
        '''
        Expires old queue_dict keys that have not been used in a long time.
        Prevents slow memory build up when crawling lots of different domains
        '''
        curr_time = time.time()
        for key in list(self.queue_dict):
            diff = curr_time - self.queue_dict[key][1]
            if diff > self.queue_timeout:
                self.logger.debug("Expiring domain queue key " + key)
                del self.queue_dict[key]
                if key in self.queue_keys:
                    self.queue_keys.remove(key)

    def check_config(self):
        '''
        Controls configuration for the scheduler

        @return: True if there is a new configuration
        '''
        if self.config_flag:
            self.config_flag = False
            return True

        return False

    def report_self(self):
        '''
        Writes the current time to redis under this node's stats key
        '''
        ip = DistributedScheduler.get_local_ip()
        key = "stats:spider:{ip}:{job}".format(ip=ip, job=self.job_id)
        self.redis_conn.set(key, time.time())

    @staticmethod
    def get_local_ip(ifname='enp1s0'):
        '''
        Looks up the IPv4 address of a network interface via ioctl

        @param ifname: the interface name (only the first 15 bytes are used)
        @return: the dotted-quad address string
        '''
        # struct.pack('256s', ...) needs bytes on Python 3; on Python 2 a
        # plain str is already bytes and is left untouched.
        if not isinstance(ifname, bytes):
            ifname = ifname.encode('utf-8')

        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            # 0x8915 is SIOCGIFADDR on Linux
            inet = fcntl.ioctl(s.fileno(), 0x8915,
                               struct.pack('256s', ifname[:15]))
        finally:
            # always release the descriptor, even when the ioctl fails
            s.close()
        return socket.inet_ntoa(inet[20:24])

    @classmethod
    def from_settings(cls, settings):
        '''
        Builds a scheduler from a settings object

        @param settings: object exposing get(name, default)
        @return: the configured scheduler
        '''
        server = redis.Redis(host=settings.get('REDIS_HOST'),
                             port=settings.get('REDIS_PORT'),
                             db=settings.get('REDIS_DB'))
        persist = settings.get('SCHEDULER_PERSIST', True)
        up_int = settings.get('SCHEDULER_QUEUE_REFRESH', 10)
        hits = settings.get('QUEUE_HITS', 10)
        window = settings.get('QUEUE_WINDOW', 60)
        mod = settings.get('QUEUE_MODERATED', False)
        timeout = settings.get('DUPEFILTER_TIMEOUT', 600)
        ip_refresh = settings.get('SCHEDULER_IP_REFRESH', 60)
        add_type = settings.get('SCHEDULER_TYPE_ENABLED', True)
        add_ip = settings.get('SCHEDULER_IP_ENABLED', False)
        # Prefer the correctly spelled setting; fall back to the historical
        # misspelling so existing configurations keep working.
        retries = settings.get('SCHEDULER_ITEM_RETRIES',
                               settings.get('SCHEUDLER_ITEM_RETRIES', 3))
        ip_regex = settings.get('IP_ADDR_REGEX', '.*')
        backlog_blacklist = settings.get('SCHEDULER_BACKLOG_BLACKLIST', True)
        queue_timeout = settings.get('SCHEDULER_QUEUE_TIMEOUT', 3600)

        my_level = settings.get('SC_LOG_LEVEL', 'INFO')
        my_name = settings.get('SC_LOGGER_NAME', 'sc-logger')
        my_output = settings.get('SC_LOG_STDOUT', True)
        my_json = settings.get('SC_LOG_JSON', False)
        my_dir = settings.get('SC_LOG_DIR', 'logs')
        my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
        my_file = settings.get('SC_LOG_FILE', 'main.log')
        my_backups = settings.get('SC_LOG_BACKUPS', 5)

        logger = LogFactory.get_instance(json=my_json,
                                         name=my_name,
                                         stdout=my_output,
                                         level=my_level,
                                         dir=my_dir,
                                         file=my_file,
                                         bytes=my_bytes,
                                         backups=my_backups)

        # initial set of ids placed on the hash ring; status_from_redis()
        # rebuilds the ring when the job is paused
        spider_ids = ['1', '2', '3']
        chose = ketama.Continuum(spider_ids)

        return cls(server, persist, up_int, timeout, retries, logger, hits,
                   window, mod, ip_refresh, add_type, add_ip, ip_regex,
                   backlog_blacklist, queue_timeout, chose)

    @classmethod
    def from_crawler(cls, crawler):
        '''Builds a scheduler from the crawler's settings.'''
        return cls.from_settings(crawler.settings)

    def open(self, spider):
        '''
        Binds the scheduler to a spider, builds the queues, connects to
        Zookeeper and Kafka, and registers this node in redis

        @param spider: the spider using this scheduler
        '''
        self.spider = spider
        self.ip = DistributedScheduler.get_local_ip()
        self.job_id = spider.settings['job_id']
        self.spider.set_logger(self.logger)
        self.create_throttle_queues()
        self.setup_zookeeper()  # connect to Zookeeper
        self.setup_kafka()  # connect to Kafka
        # register this node under its stats key
        self.report_self()
        self.dupefilter = RFPDupeFilter(self.redis_conn, self.spider.name)

    def close(self, reason):
        '''
        Clears the queues unless persisting, removes this node's status
        keys from redis and closes the Kafka producer

        @param reason: the close reason (unused)
        '''
        self.logger.info("Closing Spider", {'spiderid': self.spider.name})

        # clear the throttled queues backing the crawl queues
        if not self.persist:
            self.logger.warning("Clearing crawl queues")
            for key in self.queue_keys:
                self.queue_dict[key][0].clear()

        # remove this crawler node's status from redis
        ip = DistributedScheduler.get_local_ip()
        key = "stats:spider:{ip}:{job}".format(ip=ip, job=self.job_id)
        self.redis_conn.delete(key)
        key = "{job}:status".format(job=self.job_id)
        self.redis_conn.delete(key)

        # close the Kafka connection
        if self.producer is not None:
            self.logger.debug("Closing kafka producer")
            self.producer.close(timeout=10)

    def is_blacklisted(self, appid, crawlid):
        '''
        Checks the redis blacklist for crawls that should not be propagated
        either from expiring or stopped

        @return: True if the appid crawlid combo is blacklisted
        '''
        key_check = '{appid}||{crawlid}'.format(appid=appid,
                                                crawlid=crawlid)
        redis_key = self.spider.name + ":blacklist"
        return self.redis_conn.sismember(redis_key, key_check)

    def enqueue_request(self, request):
        '''
        Takes a new Request from the spider and hands it to Kafka

        @param request: the request produced by the spider
        '''
        # NOTE(review): `not True` short-circuits this check, so the
        # dupefilter is never consulted here. Looks intentional; confirm
        # before re-enabling duplicate filtering.
        if not True and self.dupefilter.request_seen(request):
            self.logger.debug("Request not added back to redis")
            return

        req_dict = request_to_dict(request, self.spider)
        # for splash requests the page url lives in the splash args
        if 'splash' in req_dict['meta']:
            real_url = req_dict['meta']['splash']['args']['url']
        else:
            real_url = req_dict['url']

        # important: tag the request with its spider type and target job id
        req_dict['spider_type'] = self.spider.name
        req_dict['job_id'] = self.chose[real_url.encode('utf-8')]
        req_dict['errback'] = 'parse_inform_index'

        self._feed_to_kafka(req_dict)

    def _feed_to_kafka(self, json_item):
        '''
        Sends one item to the configured Kafka topic and flushes

        @param json_item: the serializable item to publish
        @return: True when sent, False when any error occurred
        '''
        try:
            topic = settings['KAFKA_PRODUCER_TOPIC']
            self.logger.debug("Sending json to kafka at " + str(topic))
            self.producer.send(topic, json_item)
            self.producer.flush()
            return True
        except Exception:
            # best effort: report the failure to the caller instead of
            # raising, but keep the traceback in the log
            self.logger.error("Lost connection to Kafka",
                              {'ex': traceback.format_exc()})
            return False

    def find_item(self):
        '''
        Finds an item from the throttled queues

        @return: the popped item, or None when no queue yields one
        '''
        random.shuffle(self.queue_keys)
        count = 0

        while count <= self.item_retries:
            for key in self.queue_keys:
                # skip domains blacklisted through the zookeeper config
                if key.split(':')[2] in self.black_domains:
                    continue

                item = self.queue_dict[key][0].pop()
                if item:
                    # refresh the last-used time so the queue is not expired
                    self.queue_dict[key][1] = time.time()
                    return item

            count += 1

        return None

    def _resolve_callback(self, callback):
        '''Turns a spider method name into the bound method; else passthrough.'''
        if callback is None or callable(callback):
            return callback
        return getattr(self.spider, callback)

    def next_request(self):
        '''
        Pops a serialized task from redis and wraps it in a request for
        the engine

        @return: the request, or None when paused or nothing is queued
        '''
        if self.paused:
            return None

        item = self.find_item()
        if not item:
            return None

        # Two kinds of request are handled:
        #   1. a request that was already prepared for Splash
        #   2. a request passed in from the front end
        if 'splash' in item['meta']:
            self.logger.debug("Crawl url: %s via %s" % (
                item['meta']['splash']['args']['url'], item['url']))
            req = SplashRequest(url=item['url'],
                                meta=item['meta'],
                                method=item['method'],
                                body=item['body'],
                                dont_send_headers=True)
            if item.get('callback'):
                req.callback = self._resolve_callback(item['callback'])

            req.headers['content-type'] = 'application/json'
            if 'headers' in req.meta['splash']['args']:
                req.meta['splash']['args']['headers'] = {}
                req.meta['splash']['args']['content-type'] = 'application/json'
        else:
            # Optional fields go through the constructor: Request.body is
            # read-only after construction, and the constructor normalizes
            # method/headers/body.
            optional = {}
            for field in ('method', 'headers', 'body', 'cookies', 'priority'):
                if item.get(field) is not None:
                    optional[field] = item[field]

            # the stored callback is a spider method name; the request
            # needs the callable itself
            req = SplashRequest(
                url=item['url'],
                callback=self._resolve_callback(item.get('callback')),
                meta=item['meta'],
                dont_send_headers=True,
                **optional)
            self.logger.debug("Crawl url: %s" % item['url'])

        return req

    def status_from_redis(self):
        '''
        Refreshes the queues and applies the job status stored in redis:
        'pause' pauses the crawler and rebuilds the hash ring from the
        registered node keys, 'running' resumes it
        '''
        self.create_throttle_queues()
        self.expire_queues()

        status = self.redis_conn.get('{job}:status'.format(job=self.job_id))
        if status == 'pause':
            # pause the crawler and reset the consistent-hash distribution
            self.paused = True
            spiders = self.redis_conn.keys('stats:spider:*:*')
            # the job id is the last part of `stats:spider:ip:job`
            spider_ids = [spider.split(':')[3] for spider in spiders]
            self.chose = ketama.Continuum(spider_ids)
            return

        if status == 'running':
            self.paused = False

    def parse_cookie(self, string):
        '''
        Parses a cookie string like returned in a Set-Cookie header

        @param string: The cookie string
        @return: the cookie dict
        '''
        results = re.findall(r'([^=]+)=([^;]+);?\s?', string)
        my_dict = {}
        for item in results:
            my_dict[item[0]] = item[1]

        return my_dict

    def has_pending_requests(self):
        '''
        We never want to say we have pending requests
        If this returns True scrapy sometimes hangs.
        '''
        return False

    def setup_kafka(self):
        '''
        Creates the Kafka producer
        '''
        self.producer = self._create_producer()
        self.logger.debug("Successfully connected to Kafka")

    @retry(wait_exponential_multiplier=500, wait_exponential_max=10000)
    def _create_producer(self):
        '''
        Tries to create the Kafka producer; failures other than a missing
        setting are re-raised so the retry decorator can try again

        @return: the KafkaProducer, or None when closed or a setting is missing
        '''
        if not self.closed:
            try:
                self.logger.debug(
                    "Creating new kafka producer using brokers: " +
                    str(settings['KAFKA_HOSTS']))

                return KafkaProducer(
                    bootstrap_servers=settings['KAFKA_HOSTS'],
                    value_serializer=lambda v: json.dumps(v).encode('utf-8'),
                    retries=3,
                    linger_ms=settings['KAFKA_PRODUCER_BATCH_LINGER_MS'],
                    buffer_memory=settings['KAFKA_PRODUCER_BUFFER_BYTES'])
            except KeyError as e:
                # a missing setting is logged and not re-raised
                self.logger.error('Missing setting named ' + str(e),
                                  {'ex': traceback.format_exc()})
            except Exception:
                self.logger.error("Couldn't initialize kafka producer.",
                                  {'ex': traceback.format_exc()})
                raise
class DistributedScheduler(object):
    '''
    Scrapy request scheduler that utilizes Redis Throttled Priority Queues
    to moderate different domain scrape requests within a distributed scrapy
    cluster
    '''
    redis_conn = None  # the redis connection
    queue_dict = None  # the dict of throttled queues
    spider = None  # the spider using this scheduler
    queue_keys = None  # the list of current queues
    queue_class = None  # the class to use for the queue
    dupefilter = None  # the redis dupefilter
    global_page_per_domain_filter = None  # the global redis page per domain filter, applied to all domains.
    domain_max_page_filter = None  # the individual domain's redis max page filter.
    update_time = 0  # the last time the queues were updated
    update_ip_time = 0  # the last time the ip was updated
    update_interval = 0  # how often to update the queues
    extract = None  # the tld extractor
    hits = 0  # default number of hits for a queue
    window = 0  # default window to calculate number of hits
    my_ip = None  # the ip address of the scheduler (if needed)
    old_ip = None  # the old ip for logging
    ip_update_interval = 0  # the interval to update the ip address
    add_type = None  # add spider type to redis throttle queue key
    add_ip = None  # add spider public ip to redis throttle queue key
    item_retries = 0  # the number of extra tries to get an item
    my_uuid = None  # the generated UUID for the particular scrapy process

    # Zookeeper Dynamic Config Vars
    domain_config = {}  # The list of domains and their configs
    my_id = None  # The id used to read the throttle config
    config_flag = False  # Flag to reload queues if settings are wiped too
    assign_path = None  # The base assigned configuration path to read
    zoo_client = None  # The KazooClient to manage the config
    my_assignment = None  # Zookeeper path to read actual yml config
    black_domains = []  # the domains to ignore thanks to zookeeper config

    def __init__(self, server, persist, update_int, timeout, retries, logger,
                 hits, window, mod, ip_refresh, add_type, add_ip, ip_regex,
                 backlog_blacklist, queue_timeout,
                 global_page_per_domain_limit,
                 global_page_per_domain_limit_timeout,
                 domain_max_page_timeout):
        '''
        Initialize the scheduler

        @param server: the redis connection
        @param persist: keep queues and filters when the spider closes
        @param update_int: how often to update the queues
        @param timeout: the dupefilter timeout
        @param retries: the number of extra tries to get an item
        @param logger: the logger to use
        @param hits: default number of hits for a queue
        @param window: default window to calculate number of hits
        @param mod: passed to every RedisThrottledQueue as its moderation flag
        @param ip_refresh: the interval to update the ip address
        @param add_type: add spider type to redis throttle queue key
        @param add_ip: add spider public ip to redis throttle queue key
        @param ip_regex: regular expression used to extract the public ip
        @param backlog_blacklist: when True, requests for zookeeper
            blacklisted domains are still queued
        @param queue_timeout: idle seconds before an in-memory queue expires
        @param global_page_per_domain_limit: cluster wide page limit, falsy
            to disable the check
        @param global_page_per_domain_limit_timeout: passed to the global
            page filter
        @param domain_max_page_timeout: passed to the domain max page filter
        '''
        self.redis_conn = server
        self.persist = persist
        self.queue_dict = {}
        self.update_interval = update_int
        self.hits = hits
        self.window = window
        self.moderated = mod
        self.rfp_timeout = timeout
        self.ip_update_interval = ip_refresh
        self.add_type = add_type
        self.add_ip = add_ip
        self.item_retries = retries
        self.logger = logger
        self.ip_regex = re.compile(ip_regex)
        self.backlog_blacklist = backlog_blacklist
        self.queue_timeout = queue_timeout
        self.global_page_per_domain_limit = global_page_per_domain_limit
        self.global_page_per_domain_limit_timeout = \
            global_page_per_domain_limit_timeout
        self.domain_max_page_timeout = domain_max_page_timeout
        # per-instance containers so instances never share the class-level ones
        self.domain_config = {}
        self.black_domains = []

        # set up tldextract
        self.extract = tldextract.TLDExtract()

        self.update_ipaddress()

        # if we need better uuid's mod this line
        self.my_uuid = str(uuid.uuid4()).split('-')[4]

    def setup_zookeeper(self):
        '''
        Creates the ZookeeperWatcher for the throttle config and exits the
        process if the connection cannot be established or pinged
        '''
        self.assign_path = settings.get('ZOOKEEPER_ASSIGN_PATH', "")
        self.my_id = settings.get('ZOOKEEPER_ID', 'all')
        self.logger.debug("Trying to establish Zookeeper connection")
        try:
            self.zoo_watcher = ZookeeperWatcher(
                hosts=settings.get('ZOOKEEPER_HOSTS'),
                filepath=self.assign_path + self.my_id,
                config_handler=self.change_config,
                error_handler=self.error_config,
                pointer=False,
                ensure=True,
                valid_init=True)
        except KazooTimeoutError:
            self.logger.error("Could not connect to Zookeeper")
            sys.exit(1)

        if self.zoo_watcher.ping():
            self.logger.debug("Successfully set up Zookeeper connection")
        else:
            self.logger.error("Could not ping Zookeeper")
            sys.exit(1)

    def change_config(self, config_string):
        '''
        Handler for a changed Zookeeper config

        @param config_string: the raw yaml config, or None/empty when wiped
        '''
        if config_string and len(config_string) > 0:
            loaded_config = yaml.safe_load(config_string)
            self.logger.info("Zookeeper config changed", extra=loaded_config)
            self.load_domain_config(loaded_config)
            self.update_domain_queues()
        elif config_string is None or len(config_string) == 0:
            self.error_config("Zookeeper config wiped")

        self.create_queues()

    def load_domain_config(self, loaded_config):
        '''
        Loads the domain_config and sets up queue_dict

        @param loaded_config: the yaml loaded config dict from zookeeper
        '''
        self.domain_config = {}
        # vetting process to ensure correct configs
        if loaded_config:
            if 'domains' in loaded_config:
                for domain in loaded_config['domains']:
                    item = loaded_config['domains'][domain]
                    # only entries carrying both settings are accepted
                    if 'window' in item and 'hits' in item:
                        self.logger.debug(
                            "Added domain {dom} to loaded config".format(
                                dom=domain))
                        self.domain_config[domain] = item
            if 'blacklist' in loaded_config:
                self.black_domains = loaded_config['blacklist']

        self.config_flag = True

    def update_domain_queues(self):
        '''
        Check to update existing queues already in memory
        new queues are created elsewhere
        '''
        for key in self.domain_config:
            final_key = "{name}:{domain}:queue".format(name=self.spider.name,
                                                       domain=key)
            # only queues that already exist in memory are updated
            if final_key not in self.queue_dict:
                continue

            config = self.domain_config[key]
            queue = self.queue_dict[final_key][0]
            queue.window = float(config['window'])
            self.logger.debug(
                "Updated queue {q} with new config".format(q=final_key))
            # if scale is applied, scale back; otherwise use updated hits
            if 'scale' in config:
                # round to int
                hits = int(config['hits'] * self.fit_scale(config['scale']))
                queue.limit = float(hits)
            else:
                queue.limit = float(config['hits'])

    def error_config(self, message):
        '''
        Reverts every configured domain queue to the default window/hits
        and forgets the domain config

        @param message: the reason, attached to the log entry
        '''
        extras = {}
        extras['message'] = message
        extras['revert_window'] = self.window
        extras['revert_hits'] = self.hits
        extras['spiderid'] = self.spider.name
        self.logger.info("Lost config from Zookeeper", extra=extras)
        # lost connection to zookeeper, reverting back to defaults
        for key in self.domain_config:
            final_key = "{name}:{domain}:queue".format(name=self.spider.name,
                                                       domain=key)
            # A configured domain may have no queue in memory (queues are
            # only created for keys found in redis); skip it rather than
            # raising KeyError, mirroring update_domain_queues().
            if final_key in self.queue_dict:
                self.queue_dict[final_key][0].window = self.window
                self.queue_dict[final_key][0].limit = self.hits

        self.domain_config = {}

    def fit_scale(self, scale):
        '''
        @return: a scale >= 0 and <= 1
        '''
        if scale >= 1:
            return 1.0
        elif scale <= 0:
            return 0.0
        else:
            return scale

    def create_queues(self):
        '''
        Updates the in memory list of the redis queues
        Creates new throttled queue instances if it does not have them
        '''
        # new config could have loaded between scrapes
        new_conf = self.check_config()

        self.queue_keys = self.redis_conn.keys(self.spider.name + ":*:queue")

        for key in self.queue_keys:
            # build final queue key, depending on type and ip bools
            throttle_key = ""

            if self.add_type:
                throttle_key = self.spider.name + ":"
            if self.add_ip:
                throttle_key = throttle_key + self.my_ip + ":"

            # add the tld from the key `type:tld:queue`
            the_domain = re.split(':', key)[1]
            throttle_key = throttle_key + the_domain

            if key not in self.queue_dict or new_conf:
                self.logger.debug(
                    "Added new Throttled Queue {q}".format(q=key))
                q = RedisPriorityQueue(self.redis_conn, key, encoding=ujson)

                if the_domain not in self.domain_config:
                    # use default window and hits
                    window = self.window
                    hits = self.hits
                else:
                    # use custom window and hits
                    window = self.domain_config[the_domain]['window']
                    hits = self.domain_config[the_domain]['hits']
                    # adjust the crawl rate based on the scale if exists
                    if 'scale' in self.domain_config[the_domain]:
                        hits = int(hits * self.fit_scale(
                            self.domain_config[the_domain]['scale']))

                # this is a two item list, all access needs to use [0] to
                # get the object, use [1] to get the time
                self.queue_dict[key] = [
                    RedisThrottledQueue(self.redis_conn, q, window, hits,
                                        self.moderated, throttle_key,
                                        throttle_key, True),
                    time.time()
                ]

    def expire_queues(self):
        '''
        Expires old queue_dict keys that have not been used in a long time.
        Prevents slow memory build up when crawling lots of different domains
        '''
        curr_time = time.time()
        for key in list(self.queue_dict):
            diff = curr_time - self.queue_dict[key][1]
            if diff > self.queue_timeout:
                self.logger.debug("Expiring domain queue key " + key)
                del self.queue_dict[key]
                if key in self.queue_keys:
                    self.queue_keys.remove(key)

    def check_config(self):
        '''
        Controls configuration for the scheduler

        @return: True if there is a new configuration
        '''
        if self.config_flag:
            self.config_flag = False
            return True

        return False

    def update_ipaddress(self):
        '''
        Updates the scheduler so it knows its own ip address
        '''
        # assign local ip in case of exception
        self.old_ip = self.my_ip
        self.my_ip = '127.0.0.1'
        try:
            obj = urllib.request.urlopen(
                settings.get('PUBLIC_IP_URL', 'http://ip.42.pl/raw'))
            try:
                body = obj.read().decode('utf-8')
            finally:
                # close the response on every path, including read errors
                obj.close()

            results = self.ip_regex.findall(body)
            if len(results) > 0:
                self.my_ip = results[0]
            else:
                raise IOError("Could not get valid IP Address")
            self.logger.debug("Current public ip: {ip}".format(ip=self.my_ip))
        except IOError:
            # best effort: keep the local fallback address
            self.logger.error("Could not reach out to get public ip")

        if self.old_ip != self.my_ip:
            self.logger.info("Changed Public IP: {old} -> {new}".format(
                old=self.old_ip, new=self.my_ip))

    def report_self(self):
        '''
        Reports the crawler uuid to redis
        '''
        self.logger.debug("Reporting self id", extra={'uuid': self.my_uuid})
        key = "stats:crawler:{m}:{s}:{u}".format(m=socket.gethostname(),
                                                 s=self.spider.name,
                                                 u=self.my_uuid)
        self.redis_conn.set(key, time.time())
        # the key expires unless it is refreshed by a later report
        self.redis_conn.expire(key, self.ip_update_interval * 2)

    @classmethod
    def from_settings(cls, settings):
        '''
        Builds a scheduler from a settings object

        @param settings: object exposing get(name, default)
        @return: the configured scheduler
        '''
        server = redis.Redis(
            host=settings.get('REDIS_HOST'),
            port=settings.get('REDIS_PORT'),
            db=settings.get('REDIS_DB'),
            password=settings.get('REDIS_PASSWORD'),
            decode_responses=True,
            socket_timeout=settings.get('REDIS_SOCKET_TIMEOUT'),
            socket_connect_timeout=settings.get('REDIS_SOCKET_TIMEOUT'))
        persist = settings.get('SCHEDULER_PERSIST', True)
        up_int = settings.get('SCHEDULER_QUEUE_REFRESH', 10)
        hits = settings.get('QUEUE_HITS', 10)
        window = settings.get('QUEUE_WINDOW', 60)
        mod = settings.get('QUEUE_MODERATED', False)
        timeout = settings.get('DUPEFILTER_TIMEOUT', 600)
        ip_refresh = settings.get('SCHEDULER_IP_REFRESH', 60)
        add_type = settings.get('SCHEDULER_TYPE_ENABLED', False)
        add_ip = settings.get('SCHEDULER_IP_ENABLED', False)
        # Prefer the correctly spelled setting; fall back to the historical
        # misspelling so existing configurations keep working.
        retries = settings.get('SCHEDULER_ITEM_RETRIES',
                               settings.get('SCHEUDLER_ITEM_RETRIES', 3))
        ip_regex = settings.get('IP_ADDR_REGEX', '.*')
        backlog_blacklist = settings.get('SCHEDULER_BACKLOG_BLACKLIST', True)
        queue_timeout = settings.get('SCHEDULER_QUEUE_TIMEOUT', 3600)

        my_level = settings.get('SC_LOG_LEVEL', 'INFO')
        my_name = settings.get('SC_LOGGER_NAME', 'sc-logger')
        my_output = settings.get('SC_LOG_STDOUT', True)
        my_json = settings.get('SC_LOG_JSON', False)
        my_dir = settings.get('SC_LOG_DIR', 'logs')
        my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
        my_file = settings.get('SC_LOG_FILE', 'main.log')
        my_backups = settings.get('SC_LOG_BACKUPS', 5)

        logger = LogFactory.get_instance(json=my_json,
                                         name=my_name,
                                         stdout=my_output,
                                         level=my_level,
                                         dir=my_dir,
                                         file=my_file,
                                         bytes=my_bytes,
                                         backups=my_backups)

        global_page_per_domain_limit = settings.get(
            'GLOBAL_PAGE_PER_DOMAIN_LIMIT', None)
        global_page_per_domain_limit_timeout = settings.get(
            'GLOBAL_PAGE_PER_DOMAIN_LIMIT_TIMEOUT', 600)
        domain_max_page_timeout = settings.get('DOMAIN_MAX_PAGE_TIMEOUT', 600)

        return cls(server, persist, up_int, timeout, retries, logger, hits,
                   window, mod, ip_refresh, add_type, add_ip, ip_regex,
                   backlog_blacklist, queue_timeout,
                   global_page_per_domain_limit,
                   global_page_per_domain_limit_timeout,
                   domain_max_page_timeout)

    @classmethod
    def from_crawler(cls, crawler):
        '''Builds a scheduler from the crawler's settings.'''
        return cls.from_settings(crawler.settings)

    def open(self, spider):
        '''
        Binds the scheduler to a spider, builds the queues, connects to
        Zookeeper and creates the redis backed filters

        @param spider: the spider using this scheduler
        '''
        self.spider = spider
        self.spider.set_logger(self.logger)
        self.create_queues()
        self.setup_zookeeper()
        self.dupefilter = RFPDupeFilter(self.redis_conn,
                                        self.spider.name + ':dupefilter',
                                        self.rfp_timeout)
        self.global_page_per_domain_filter = RFGlobalPagePerDomainFilter(
            self.redis_conn,
            self.spider.name + ':global_page_count_filter',
            self.global_page_per_domain_limit,
            self.global_page_per_domain_limit_timeout)
        self.domain_max_page_filter = RFDomainMaxPageFilter(
            self.redis_conn,
            self.spider.name + ':domain_max_page_filter',
            self.domain_max_page_timeout)

    def close(self, reason):
        '''
        Clears the filters and the crawl queues unless persisting

        @param reason: the close reason (unused)
        '''
        self.logger.info("Closing Spider", {'spiderid': self.spider.name})
        if not self.persist:
            self.logger.warning("Clearing crawl queues")
            self.dupefilter.clear()
            self.global_page_per_domain_filter.clear()
            self.domain_max_page_filter.clear()
            for key in self.queue_keys:
                self.queue_dict[key][0].clear()

    def is_blacklisted(self, appid, crawlid):
        '''
        Checks the redis blacklist for crawls that should not be propagated
        either from expiring or stopped

        @return: True if the appid crawlid combo is blacklisted
        '''
        key_check = '{appid}||{crawlid}'.format(appid=appid, crawlid=crawlid)
        redis_key = self.spider.name + ":blacklist"
        return self.redis_conn.sismember(redis_key, key_check)

    def enqueue_request(self, request):
        '''
        Pushes a request from the spider into the proper throttled queue

        @param request: the request produced by the spider
        '''
        # Duplicate link filter
        if not request.dont_filter and self.dupefilter.request_seen(request):
            self.logger.debug("Request not added back to redis")
            return

        # An individual crawling request of a domain's page
        req_dict = request_to_dict(request, self.spider)
        meta = req_dict['meta']

        # Max page filter per individual domain; requests that carry no
        # `domain_max_pages` meta simply skip this check
        domain_max_pages = meta.get('domain_max_pages')
        if domain_max_pages and \
                self.domain_max_page_filter.request_page_limit_reached(
                    request=request, spider=self.spider):
            self.logger.debug(
                "Request {0} reached domain's page limit of {1}".format(
                    request.url, domain_max_pages))
            return

        # Global - cluster wide - max page filter
        if self.global_page_per_domain_limit and \
                self.global_page_per_domain_filter.request_page_limit_reached(
                    request=request, spider=self.spider):
            self.logger.debug(
                "Request {0} reached global page limit of {1}".format(
                    request.url, self.global_page_per_domain_limit))
            return

        # Blacklist filter
        if self.is_blacklisted(meta['appid'], meta['crawlid']):
            self.logger.debug(
                "Crawlid: '{id}' Appid: '{appid}' blacklisted".format(
                    appid=meta['appid'], id=meta['crawlid']))
            return

        # grab the tld of the request
        ex_res = self.extract(req_dict['url'])
        key = "{sid}:{dom}.{suf}:queue".format(sid=meta['spiderid'],
                                               dom=ex_res.domain,
                                               suf=ex_res.suffix)
        curr_time = time.time()
        domain = "{d}.{s}".format(d=ex_res.domain, s=ex_res.suffix)

        # allow only if we want all requests or we want
        # everything but blacklisted domains
        domain_allowed = (self.backlog_blacklist or
                          domain not in self.black_domains)
        # insert if crawl never expires (0) or time < expires
        not_expired = meta['expires'] == 0 or curr_time < meta['expires']

        if not (domain_allowed and not_expired):
            self.logger.debug(
                "Crawlid: '{id}' Appid: '{appid}' expired".format(
                    appid=meta['appid'], id=meta['crawlid']))
            return

        # we may already have the queue in memory
        if key in self.queue_keys:
            self.queue_dict[key][0].push(req_dict, meta['priority'])
        else:
            # shoving into a new redis queue, negative b/c of sorted sets
            # this will populate ourself and other schedulers when
            # they call create_queues
            # NOTE(review): positional (member, score) is the legacy
            # redis.Redis.zadd signature; redis-py >= 3.0 expects a
            # {member: score} mapping - confirm the installed version.
            self.redis_conn.zadd(key, ujson.dumps(req_dict),
                                 -meta['priority'])
        self.logger.debug(
            "Crawlid: '{id}' Appid: '{appid}' added to queue".format(
                appid=meta['appid'], id=meta['crawlid']))

    def find_item(self):
        '''
        Finds an item from the throttled queues

        @return: the popped item, or None when no queue yields one
        '''
        random.shuffle(self.queue_keys)
        count = 0

        while count <= self.item_retries:
            for key in self.queue_keys:
                # skip if the whole domain has been blacklisted in zookeeper
                if key.split(':')[1] in self.black_domains:
                    continue
                # the throttled queue only returns an item if it is allowed
                item = self.queue_dict[key][0].pop()

                if item:
                    # update timeout and return
                    self.queue_dict[key][1] = time.time()
                    return item

            count = count + 1

        return None

    def next_request(self):
        '''
        Logic to handle getting a new url request, from a bunch of
        different queues

        @return: the next request, or None when nothing is available
        '''
        t = time.time()
        # update the redis queues every so often
        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()
            self.expire_queues()

        # update the ip address every so often
        if t - self.update_ip_time > self.ip_update_interval:
            self.update_ip_time = t
            self.update_ipaddress()
            self.report_self()

        item = self.find_item()
        if not item:
            return None

        self.logger.debug(u"Found url to crawl {url}"
                          .format(url=item['url']))
        if 'meta' in item:
            # item is a serialized request
            req = request_from_dict(item, self.spider)
        else:
            # item is a feed from outside, parse it manually
            req = self.request_from_feed(item)

        # extra check to add items to request
        if 'useragent' in req.meta and req.meta['useragent'] is not None:
            req.headers['User-Agent'] = req.meta['useragent']
        if 'cookie' in req.meta and req.meta['cookie'] is not None:
            if isinstance(req.meta['cookie'], dict):
                req.cookies = req.meta['cookie']
            elif isinstance(req.meta['cookie'], string_types):
                req.cookies = self.parse_cookie(req.meta['cookie'])

        return req

    def request_from_feed(self, item):
        '''
        Builds a request from an externally fed item dict

        @param item: the feed dict; every key is copied into the request meta
        @return: the request
        '''
        try:
            req = Request(item['url'])
        except ValueError:
            # need absolute url
            # need better url validation here
            req = Request('http://' + item['url'])

        # defaults not in schema
        if 'curdepth' not in item:
            item['curdepth'] = 0
        if "retry_times" not in item:
            item['retry_times'] = 0

        for key in list(item.keys()):
            req.meta[key] = item[key]

        # extra check to add items to request
        if 'cookie' in item and item['cookie'] is not None:
            if isinstance(item['cookie'], dict):
                req.cookies = item['cookie']
            elif isinstance(item['cookie'], string_types):
                req.cookies = self.parse_cookie(item['cookie'])

        return req

    def parse_cookie(self, string):
        '''
        Parses a cookie string like returned in a Set-Cookie header

        @param string: The cookie string
        @return: the cookie dict
        '''
        # raw string: `\s` in a plain literal is an invalid escape sequence
        results = re.findall(r'([^=]+)=([^;]+);?\s?', string)
        my_dict = {}
        for item in results:
            my_dict[item[0]] = item[1]

        return my_dict

    def has_pending_requests(self):
        '''
        We never want to say we have pending requests
        If this returns True scrapy sometimes hangs.
        '''
        return False
class DistributedScheduler(object):
    """
    Scrapy request scheduler that utilizes Redis Throttled Priority Queues
    to moderate different domain scrape requests within a distributed
    scrapy cluster
    """

    redis_conn = None  # the redis connection
    queue_dict = None  # the dict of throttled queues
    spider = None  # the spider using this scheduler
    queue_keys = None  # the list of current queues
    queue_class = None  # the class to use for the queue
    dupefilter = None  # the redis dupefilter
    update_time = 0  # the last time the queues were updated
    update_ip_time = 0  # the last time the ip was updated
    update_interval = 0  # how often to update the queues
    extract = None  # the tld extractor
    hits = 0  # default number of hits for a queue
    window = 0  # default window to calculate number of hits
    my_ip = None  # the ip address of the scheduler (if needed)
    old_ip = None  # the old ip for logging
    ip_update_interval = 0  # the interval to update the ip address
    add_type = None  # add spider type to redis throttle queue key
    add_ip = None  # add spider public ip to redis throttle queue key
    item_retries = 0  # the number of extra tries to get an item
    my_uuid = None  # the generated UUID for the particular scrapy process

    # Zookeeper Dynamic Config Vars
    domain_config = {}  # The list of domains and their configs
    my_id = None  # The id used to read the throttle config
    config_flag = False  # Flag to reload queues if settings are wiped too
    assign_path = None  # The base assigned configuration path to read
    zoo_client = None  # The KazooClient to manage the config
    my_assignment = None  # Zookeeper path to read actual yml config

    def __init__(
        self,
        server,
        persist,
        update_int,
        timeout,
        retries,
        logger,
        hits,
        window,
        mod,
        ip_refresh,
        add_type,
        add_ip,
        ip_regex,
    ):
        """
        Initialize the scheduler

        @param server: the redis connection
        @param persist: if falsy, queues and dupefilter are cleared on close
        @param update_int: seconds between refreshes of the queue list
        @param timeout: timeout handed to the dupefilter
        @param retries: number of extra passes find_item() makes
        @param logger: the logger instance to use
        @param hits: default number of hits for a queue
        @param window: default window to calculate number of hits
        @param mod: the moderation flag handed to each throttled queue
        @param ip_refresh: seconds between public ip refreshes
        @param add_type: add spider type to the throttle queue key
        @param add_ip: add spider public ip to the throttle queue key
        @param ip_regex: regex used to pull the ip out of the lookup response
        """
        self.redis_conn = server
        self.persist = persist
        self.queue_dict = {}
        self.update_interval = update_int
        self.hits = hits
        self.window = window
        self.moderated = mod
        self.rfp_timeout = timeout
        self.ip_update_interval = ip_refresh
        self.add_type = add_type
        self.add_ip = add_ip
        # stored under the name find_item() actually reads; the previous
        # misspelling ('item_retires') left the retry count stuck at 0
        self.item_retries = retries
        self.logger = logger
        self.ip_regex = re.compile(ip_regex)

        # set up tldextract
        self.extract = tldextract.TLDExtract()

        self.update_ipaddress()

        # if we need better uuid's mod this line
        self.my_uuid = str(uuid.uuid4()).split("-")[4]

    def setup_zookeeper(self):
        """
        Creates the Zookeeper watcher for this scheduler's config path

        Exits the process if the connection times out or the ping fails.
        """
        self.assign_path = settings.get("ZOOKEEPER_ASSIGN_PATH", "")
        self.my_id = settings.get("ZOOKEEPER_ID", "all")
        self.logger.debug("Trying to establish Zookeeper connection")
        try:
            self.zoo_watcher = ZookeeperWatcher(
                hosts=settings.get("ZOOKEEPER_HOSTS"),
                filepath=self.assign_path + self.my_id,
                config_handler=self.change_config,
                error_handler=self.error_config,
                pointer=False,
                ensure=True,
                valid_init=True,
            )
        except KazooTimeoutError:
            self.logger.error("Could not connect to Zookeeper")
            sys.exit(1)

        if self.zoo_watcher.ping():
            self.logger.debug("Successfully set up Zookeeper connection")
        else:
            self.logger.error("Could not ping Zookeeper")
            sys.exit(1)

    def change_config(self, config_string):
        """
        Handler called by the watcher when the Zookeeper config changes

        @param config_string: the raw yaml config, empty or None when wiped
        """
        if config_string:
            loaded_config = yaml.safe_load(config_string)
            self.logger.info("Zookeeper config changed", extra=loaded_config)
            self.load_domain_config(loaded_config)
            self.update_domain_queues()
        else:
            self.error_config("Zookeeper config wiped")

        self.create_queues()

    def load_domain_config(self, loaded_config):
        """
        Loads the domain_config and sets up queue_dict

        Only domains that define both 'window' and 'hits' are kept.

        @param loaded_config: the yaml loaded config dict from zookeeper
        """
        self.domain_config = {}

        # vetting process to ensure correct configs
        if loaded_config and "domains" in loaded_config:
            domains = loaded_config["domains"]
            for domain in domains:
                item = domains[domain]
                # check valid
                if "window" in item and "hits" in item:
                    self.logger.debug("Added domain {dom} to loaded config".format(dom=domain))
                    self.domain_config[domain] = item

        # set even for an empty config so create_queues() rebuilds the queues
        self.config_flag = True

    def update_domain_queues(self):
        """
        Check to update existing queues already in memory
        new queues are created elsewhere
        """
        for key in self.domain_config:
            final_key = "{name}:{domain}:queue".format(name=self.spider.name, domain=key)
            # we already have a throttled queue for this domain, update it to new settings
            if final_key not in self.queue_dict:
                continue

            conf = self.domain_config[key]
            queue = self.queue_dict[final_key]
            queue.window = float(conf["window"])
            self.logger.debug("Updated queue {q} with new config".format(q=final_key))
            # if scale is applied, scale back; otherwise use updated hits
            if "scale" in conf:
                # round to int
                hits = int(conf["hits"] * self.fit_scale(conf["scale"]))
                queue.limit = float(hits)
            else:
                queue.limit = float(conf["hits"])

    def error_config(self, message):
        """
        Reverts every configured domain queue back to the default window
        and hits, then forgets the domain configuration

        @param message: the reason the config was lost, used for logging
        """
        extras = {}
        extras["message"] = message
        extras["revert_window"] = self.window
        extras["revert_hits"] = self.hits
        extras["spiderid"] = self.spider.name
        self.logger.info("Lost config from Zookeeper", extra=extras)
        # lost connection to zookeeper, reverting back to defaults
        for key in self.domain_config:
            final_key = "{name}:{domain}:queue".format(name=self.spider.name, domain=key)
            # a configured domain may not have an in-memory queue yet;
            # there is nothing to revert for it
            queue = self.queue_dict.get(final_key)
            if queue is None:
                continue
            queue.window = self.window
            queue.limit = self.hits

        self.domain_config = {}

    def fit_scale(self, scale):
        """
        Clamps the scale into the closed interval [0, 1]

        @param scale: the requested scale
        @return: a scale >= 0 and <= 1
        """
        if scale >= 1:
            return 1.0
        elif scale <= 0:
            return 0.0
        else:
            return scale

    def create_queues(self):
        """
        Updates the in memory list of the redis queues
        Creates new throttled queue instances if it does not have them
        """
        # new config could have loaded between scrapes
        new_conf = self.check_config()

        self.queue_keys = self.redis_conn.keys(self.spider.name + ":*:queue")

        for key in self.queue_keys:
            # build final queue key, depending on type and ip bools
            throttle_key = ""

            if self.add_type:
                throttle_key = self.spider.name + ":"
            if self.add_ip:
                throttle_key = throttle_key + self.my_ip + ":"

            # add the tld from the key `type:tld:queue`
            the_domain = key.split(":")[1]
            throttle_key = throttle_key + the_domain

            if key in self.queue_dict and not new_conf:
                continue

            self.logger.debug("Added new Throttled Queue {q}".format(q=key))
            q = RedisPriorityQueue(self.redis_conn, key)

            if the_domain not in self.domain_config:
                # use default window and hits
                window = self.window
                hits = self.hits
            else:
                # use custom window and hits
                conf = self.domain_config[the_domain]
                window = conf["window"]
                hits = conf["hits"]
                # adjust the crawl rate based on the scale if exists
                if "scale" in conf:
                    hits = int(hits * self.fit_scale(conf["scale"]))

            self.queue_dict[key] = RedisThrottledQueue(
                self.redis_conn, q, window, hits, self.moderated, throttle_key, throttle_key
            )

    def check_config(self):
        """
        Controls configuration for the scheduler

        Reading the flag also resets it.

        @return: True if there is a new configuration
        """
        if self.config_flag:
            self.config_flag = False
            return True

        return False

    def update_ipaddress(self):
        """
        Updates the scheduler so it knows its own ip address

        Falls back to 127.0.0.1 when the lookup fails or the response does
        not match the configured ip regex.
        """
        # assign local ip in case of exception
        self.old_ip = self.my_ip
        self.my_ip = "127.0.0.1"
        try:
            obj = urllib2.urlopen(settings.get("PUBLIC_IP_URL", "http://ip.42.pl/raw"))
            try:
                results = self.ip_regex.findall(obj.read())
            finally:
                # always release the handle, even if the read fails
                obj.close()

            if not results:
                raise IOError("Could not get valid IP Address")
            self.my_ip = results[0]
            self.logger.debug("Current public ip: {ip}".format(ip=self.my_ip))
        except IOError:
            self.logger.error("Could not reach out to get public ip")

        if self.old_ip != self.my_ip:
            self.logger.info("Changed Public IP: {old} -> {new}".format(old=self.old_ip, new=self.my_ip))

    def report_self(self):
        """
        Reports the crawler uuid to redis

        The key expires after twice the ip update interval.
        """
        self.logger.debug("Reporting self id", extra={"uuid": self.my_uuid})
        key = "stats:crawler:{m}:{s}:{u}".format(m=socket.gethostname(), s=self.spider.name, u=self.my_uuid)
        self.redis_conn.set(key, time.time())
        self.redis_conn.expire(key, self.ip_update_interval * 2)

    @classmethod
    def from_settings(cls, settings):
        """
        Builds a scheduler from the crawler settings

        @param settings: the settings object, read through .get(key, default)
        @return: a new scheduler instance
        """
        server = redis.Redis(host=settings.get("REDIS_HOST"), port=settings.get("REDIS_PORT"))
        persist = settings.get("SCHEDULER_PERSIST", True)
        up_int = settings.get("SCHEDULER_QUEUE_REFRESH", 10)
        hits = settings.get("QUEUE_HITS", 10)
        window = settings.get("QUEUE_WINDOW", 60)
        mod = settings.get("QUEUE_MODERATED", False)
        timeout = settings.get("DUPEFILTER_TIMEOUT", 600)
        ip_refresh = settings.get("SCHEDULER_IP_REFRESH", 60)
        add_type = settings.get("SCHEDULER_TYPE_ENABLED", False)
        add_ip = settings.get("SCHEDULER_IP_ENABLED", False)
        # prefer the correctly spelled key, but keep honouring the legacy
        # misspelling so existing deployments are unaffected
        retries = settings.get(
            "SCHEDULER_ITEM_RETRIES",
            settings.get("SCHEUDLER_ITEM_RETRIES", 3),
        )
        ip_regex = settings.get("IP_ADDR_REGEX", ".*")

        my_level = settings.get("SC_LOG_LEVEL", "INFO")
        my_name = settings.get("SC_LOGGER_NAME", "sc-logger")
        my_output = settings.get("SC_LOG_STDOUT", True)
        my_json = settings.get("SC_LOG_JSON", False)
        my_dir = settings.get("SC_LOG_DIR", "logs")
        my_bytes = settings.get("SC_LOG_MAX_BYTES", "10MB")
        my_file = settings.get("SC_LOG_FILE", "main.log")
        my_backups = settings.get("SC_LOG_BACKUPS", 5)

        logger = LogFactory.get_instance(
            json=my_json,
            name=my_name,
            stdout=my_output,
            level=my_level,
            dir=my_dir,
            file=my_file,
            bytes=my_bytes,
            backups=my_backups,
        )

        return cls(
            server, persist, up_int, timeout, retries, logger, hits, window, mod, ip_refresh, add_type, add_ip, ip_regex
        )

    @classmethod
    def from_crawler(cls, crawler):
        """
        Builds a scheduler from the settings of the given crawler
        """
        return cls.from_settings(crawler.settings)

    def open(self, spider):
        """
        Attaches the spider, then sets up queues, Zookeeper and dupefilter

        @param spider: the spider that will use this scheduler
        """
        self.spider = spider
        self.spider.set_logger(self.logger)
        self.spider.set_redis(self.redis_conn)
        self.spider.setup_stats()
        self.create_queues()
        self.setup_zookeeper()
        self.dupefilter = RFPDupeFilter(self.redis_conn, self.spider.name + ":dupefilter", self.rfp_timeout)

    def close(self, reason):
        """
        Clears the dupefilter and every known queue unless persisting

        @param reason: the close reason (unused)
        """
        self.logger.info("Closing Spider", {"spiderid": self.spider.name})
        if not self.persist:
            self.logger.warning("Clearing crawl queues")
            self.dupefilter.clear()
            for key in self.queue_keys:
                self.queue_dict[key].clear()

    def is_blacklisted(self, appid, crawlid):
        """
        Checks the redis blacklist for crawls that should not be propagated
        either from expiring or stopped

        @return: True if the appid crawlid combo is blacklisted
        """
        key_check = "{appid}||{crawlid}".format(appid=appid, crawlid=crawlid)
        redis_key = self.spider.name + ":blacklist"
        return self.redis_conn.sismember(redis_key, key_check)

    def enqueue_request(self, request):
        """
        Pushes a request from the spider into the proper throttled queue

        Requests that are duplicates, blacklisted or expired are dropped.
        """
        if not request.dont_filter and self.dupefilter.request_seen(request):
            self.logger.debug("Request not added back to redis")
            return

        req_dict = self.request_to_dict(request)
        meta = req_dict["meta"]

        if self.is_blacklisted(meta["appid"], meta["crawlid"]):
            self.logger.debug(
                "Crawlid: '{id}' Appid: '{appid}' blacklisted".format(appid=meta["appid"], id=meta["crawlid"])
            )
            return

        # grab the tld of the request
        ex_res = self.extract(req_dict["url"])
        key = "{sid}:{dom}.{suf}:queue".format(sid=meta["spiderid"], dom=ex_res.domain, suf=ex_res.suffix)

        curr_time = time.time()

        # insert if crawl never expires (0) or time < expires
        if meta["expires"] != 0 and curr_time >= meta["expires"]:
            self.logger.debug(
                "Crawlid: '{id}' Appid: '{appid}' expired".format(appid=meta["appid"], id=meta["crawlid"])
            )
            return

        # we may already have the queue in memory
        if key in self.queue_keys:
            self.queue_dict[key].push(req_dict, meta["priority"])
        else:
            # shoving into a new redis queue, negative b/c of sorted sets
            # this will populate ourself and other schedulers when
            # they call create_queues
            self.redis_conn.zadd(key, pickle.dumps(req_dict, protocol=-1), -meta["priority"])

        self.logger.debug(
            "Crawlid: '{id}' Appid: '{appid}' added to queue".format(appid=meta["appid"], id=meta["crawlid"])
        )

    def request_to_dict(self, request):
        """
        Convert Request object to a dict.
        modified from scrapy.utils.reqser

        @param request: the Request to serialize
        @return: the request as a plain dict
        """
        req_dict = {
            # urls should be safe (safe_string_url)
            "url": request.url.decode("ascii"),
            "method": request.method,
            "headers": dict(request.headers),
            "body": request.body,
            "cookies": request.cookies,
            "meta": request.meta,
            "_encoding": request._encoding,
            "priority": request.priority,
            "dont_filter": request.dont_filter,
            # callback/errback are assumed to be a bound instance of the spider
            "callback": None if request.callback is None else request.callback.func_name,
            "errback": None if request.errback is None else request.errback.func_name,
        }
        return req_dict

    def find_item(self):
        """
        Finds an item from the throttled queues

        Makes up to item_retries + 1 passes over the shuffled queue keys.

        @return: the first item a queue hands out, or None
        """
        random.shuffle(self.queue_keys)
        count = 0

        while count <= self.item_retries:
            for key in self.queue_keys:
                # the throttled queue only returns an item if it is allowed
                item = self.queue_dict[key].pop()

                if item:
                    return item
                # we want the spiders to get slightly out of sync
                # with each other for better performance
                time.sleep(random.random())

            count = count + 1

        return None

    def next_request(self):
        """
        Logic to handle getting a new url request, from a bunch of
        different queues

        @return: the next Request to crawl, or None if no item was found
        """
        t = time.time()

        # update the redis queues every so often
        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()

        # update the ip address every so often
        if t - self.update_ip_time > self.ip_update_interval:
            self.update_ip_time = t
            self.update_ipaddress()
            self.report_self()

        item = self.find_item()
        if not item:
            return None

        self.logger.debug("Found url to crawl {url}".format(url=item["url"]))
        try:
            req = Request(item["url"])
        except ValueError:
            # need absolute url
            # need better url validation here
            req = Request("http://" + item["url"])

        # a serialized request carries its crawl fields under 'meta'
        if "meta" in item:
            item = item["meta"]

        # defaults not in schema
        if "curdepth" not in item:
            item["curdepth"] = 0
        if "retry_times" not in item:
            item["retry_times"] = 0

        for key in item.keys():
            req.meta[key] = item[key]

        # extra check to add items to request
        if "useragent" in item and item["useragent"] is not None:
            req.headers["User-Agent"] = item["useragent"]
        if "cookie" in item and item["cookie"] is not None:
            if isinstance(item["cookie"], dict):
                req.cookies = item["cookie"]
            elif isinstance(item["cookie"], basestring):
                req.cookies = self.parse_cookie(item["cookie"])

        return req

    def parse_cookie(self, string):
        """
        Parses a cookie string like returned in a Set-Cookie header

        @param string: The cookie string
        @return: the cookie dict
        """
        # raw string: the backslash escapes are meant for the regex engine,
        # not for the Python string literal parser
        results = re.findall(r"([^=]+)=([^\;]+);?\s?", string)
        # later duplicates of a name overwrite earlier ones
        return dict(results)

    def has_pending_requests(self):
        """
        We never want to say we have pending requests
        If this returns True scrapy sometimes hangs.
        """
        return False