class RFPDupeFilter(BaseDupeFilter):

    def __init__(self, server, key):
        self.server = server
        self.key = key
        self.bf = BloomFilter(server, key, blockNum=1)  # increase blockNum if there are a lot of urls to filter

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings_filter(settings)
        key = "dupefilter:%s" % int(time.time())
        return cls(server, key)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def request_seen(self, request):
        fp = request_fingerprint(request)
        if self.bf.isContains(fp):  # fingerprint has already been seen
            return True
        else:
            self.bf.insert(fp)
            return False
        # Original set-based check: sadd returns the number of values added,
        # zero if the fingerprint already exists.
        # added = self.server.sadd(self.key, fp)
        # return added == 0

    def close(self, reason=''):
        self.clear()

    def clear(self):
        """Clears fingerprints data."""
        self.server.delete(self.key)
import time

from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint

from . import connection                    # provides from_settings_filter
from BloomfilterOnRedis import BloomFilter  # the Bloom filter module used below


class RFPDupeFilter(BaseDupeFilter):
    """Redis-based request duplication filter"""

    def __init__(self, server, key):
        """Initialize duplication filter

        Parameters
        ----------
        server : Redis instance
        key : str
            Where to store fingerprints
        """
        self.server = server
        self.key = key
        self.bf = BloomFilter(server, key, blockNum=1)  # you can increase blockNum if you are filtering too many urls

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings_filter(settings)
        # create one-time key. needed to support to use this
        # class as standalone dupefilter with scrapy's default scheduler
        # if scrapy passes spider on open() method this wouldn't be needed
        key = "dupefilter:%s" % int(time.time())
        return cls(server, key)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def request_seen(self, request):
        fp = request_fingerprint(request)
        if self.bf.isContains(fp):
            return True
        else:
            self.bf.insert(fp)
            return False
        # added = self.server.sadd(self.key, fp)
        # return not added

    def close(self, reason):
        """Delete data on close. Called by scrapy's scheduler"""
        self.clear()

    def clear(self):
        """Clears fingerprints data"""
        self.server.delete(self.key)
# encoding=utf-8
# Standalone check: compute a request fingerprint and test whether the
# Bloom filter already contains it.
import redis

from BloomfilterOnRedis import BloomFilter
from scrapy.utils.request import request_fingerprint
from scrapy import Request

rconn = redis.Redis('172.16.188.121', 6379)
bf = BloomFilter(rconn, 'spider_1:dupefilter')

if __name__ == '__main__':
    # while True:
    url = 'http://www.x14hack.com/'
    request = Request(url)
    fp = request_fingerprint(request)
    print fp,
    if bf.isContains(fp):
        print 'exist!'
    else:
        print 'not exist!'
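The BloomfilterOnRedis module imported above is not listed in this post. As a reference, here is a minimal sketch of what such a class could look like, assuming the common approach of hashing each fingerprint with several seeded hash functions and storing the bits in Redis bitmaps via SETBIT/GETBIT. The class name, the blockNum argument and the insert/isContains methods mirror the usage above; the seed values, bit-array size and hash scheme are assumptions, not the author's exact code.

# Sketch of a Redis-backed Bloom filter (assumed implementation, not the author's exact BloomfilterOnRedis)
class SimpleHash(object):
    """One seeded multiplicative hash mapped into the bit array."""

    def __init__(self, cap, seed):
        self.cap = cap    # number of bits in the array
        self.seed = seed  # multiplier that makes each hash function different

    def hash(self, value):
        ret = 0
        for ch in value:
            ret = self.seed * ret + ord(ch)
        return (self.cap - 1) & ret


class BloomFilter(object):

    def __init__(self, server, key, blockNum=1):
        self.bit_size = 1 << 31                  # 2^31 bits = 256 MB per Redis string
        self.seeds = [5, 7, 11, 13, 31, 37, 61]  # seven hash functions
        self.server = server
        self.key = key
        self.blockNum = blockNum
        self.hashfuncs = [SimpleHash(self.bit_size, seed) for seed in self.seeds]

    def _name(self, value):
        # spread fingerprints over blockNum bitmaps so a single one does not saturate;
        # the fingerprint is a hex sha1, so its first two characters pick the block
        return self.key + str(int(value[0:2], 16) % self.blockNum)

    def isContains(self, value):
        if not value:
            return False
        name = self._name(value)
        for f in self.hashfuncs:
            if not self.server.getbit(name, f.hash(value)):
                return False
        return True

    def insert(self, value):
        name = self._name(value)
        for f in self.hashfuncs:
            self.server.setbit(name, f.hash(value), 1)

Roughly speaking, a single 256 MB block with seven hash functions keeps the false-positive rate well below 1% up to around one or two hundred million fingerprints, which is why the dupefilter exposes blockNum for larger crawls.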
import logging
import time

from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint

from . import defaults
from .connection import get_redis_from_settings
from BloomfilterOnRedis import BloomFilter


logger = logging.getLogger(__name__)


class RFPDupeFilter(BaseDupeFilter):
    """Redis-based request duplicates filter.

    This class can also be used with default Scrapy's scheduler.

    """

    logger = logger

    def __init__(self, server, key, debug=False):
        """Initialize the duplicates filter.

        Parameters
        ----------
        server : redis.StrictRedis
            The redis server instance.
        key : str
            Redis key where to store fingerprints.
        debug : bool, optional
            Whether to log filtered requests.

        """
        self.server = server
        self.key = key
        self.debug = debug
        self.logdupes = True
        self.bf = BloomFilter(server, key, blockNum=1)  # added by me

    @classmethod
    def from_settings(cls, settings):
        """Returns an instance from given settings.

        This uses by default the key ``dupefilter:<timestamp>``. When using
        the ``scrapy_redis.scheduler.Scheduler`` class, this method is not
        used as it needs to pass the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RFPDupeFilter
            A RFPDupeFilter instance.

        """
        server = get_redis_from_settings(settings)
        # XXX: This creates one-time key. needed to support to use this
        # class as standalone dupefilter with scrapy's default scheduler
        # if scrapy passes spider on open() method this wouldn't be needed
        # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
        key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)

    @classmethod
    def from_crawler(cls, crawler):
        """Returns instance from crawler.

        Parameters
        ----------
        crawler : scrapy.crawler.Crawler

        Returns
        -------
        RFPDupeFilter
            Instance of RFPDupeFilter.

        """
        return cls.from_settings(crawler.settings)

    def request_seen(self, request):
        """Returns True if request was already seen.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        bool

        """
        fp = self.request_fingerprint(request)
        # Original set-based check: sadd returns the number of values added,
        # zero if the fingerprint already exists.
        # added = self.server.sadd(self.key, fp)
        # return added == 0
        if self.bf.isContains(fp):
            return True
        else:
            self.bf.insert(fp)
            return False

    def request_fingerprint(self, request):
        """Returns a fingerprint for a given request.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        str

        """
        return request_fingerprint(request)

    def close(self, reason=''):
        """Delete data on close. Called by Scrapy's scheduler.

        Parameters
        ----------
        reason : str, optional

        """
        self.clear()

    def clear(self):
        """Clears fingerprints data."""
        self.server.delete(self.key)

    def log(self, request, spider):
        """Logs given request.

        Parameters
        ----------
        request : scrapy.http.Request
        spider : scrapy.spiders.Spider

        """
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False
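Finally, the filter has to be enabled in the Scrapy project. A minimal settings.py sketch, assuming the modified class still lives at scrapy_redis.dupefilter and Redis runs locally; the host, port and debug flag are placeholders to adjust for your setup.

# settings.py (sketch)
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'  # the Bloom-filter dupefilter shown above
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'              # optional: use the scrapy_redis scheduler
SCHEDULER_PERSIST = True                                    # keep fingerprint data between runs
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
DUPEFILTER_DEBUG = False                                    # True logs every filtered request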