def test_cache_with_limit(self):
    """Inserting past the limit evicts the oldest entry (FIFO order)."""
    cache = LocalCache(limit=2)
    # Fill beyond the limit of 2; 'a' (oldest) should be evicted.
    for key, value in (('a', 1), ('b', 2), ('c', 3)):
        cache[key] = value
    self.assertEqual(len(cache), 2)
    self.assertNotIn('a', cache)
    # The two most recent entries survive with their values intact.
    for key, value in (('b', 2), ('c', 3)):
        self.assertIn(key, cache)
        self.assertEqual(cache[key], value)
from twisted.internet import defer
from twisted.internet.base import ThreadedResolver

from scrapy.utils.datatypes import LocalCache

# TODO: cache misses
dnscache = LocalCache(10000)


class CachingThreadedResolver(ThreadedResolver):
    """Caching DNS resolver.

    Wraps Twisted's thread-pool resolver and memoizes successful lookups
    in the module-level ``dnscache``, so repeated requests to the same
    host skip the (asynchronous) DNS round-trip.
    """

    def __init__(self, reactor, cache_size, timeout):
        super(CachingThreadedResolver, self).__init__(reactor)
        # NOTE: the cache is module-global; the last resolver constructed
        # wins when setting its size limit.
        dnscache.limit = cache_size
        self.timeout = timeout

    def getHostByName(self, name, timeout=None):
        if name in dnscache:
            # Cache hit: answer immediately without touching the network.
            return defer.succeed(dnscache[name])
        # in Twisted<=16.6, getHostByName() is always called with
        # a default timeout of 60s (actually passed as (1, 3, 11, 45) tuple),
        # so the input argument above is simply overridden
        # to enforce Scrapy's DNS_TIMEOUT setting's value
        timeout = (self.timeout,)
        d = super(CachingThreadedResolver, self).getHostByName(name, timeout)
        if dnscache.limit:
            d.addCallback(self._cache_result, name)
        # BUG FIX: the Deferred was never returned, so callers received
        # None instead of the pending resolution.
        return d

    def _cache_result(self, result, name):
        # Store the resolved address, then pass the result through the
        # callback chain unchanged.
        dnscache[name] = result
        return result
from twisted.internet import defer
from twisted.internet.base import ThreadedResolver
from twisted.internet.interfaces import IHostResolution, IHostnameResolver, IResolutionReceiver, IResolverSimple
from zope.interface.declarations import implementer, provider

from scrapy.utils.datatypes import LocalCache

# TODO: cache misses
dnscache = LocalCache(10000)  # LocalCache is essentially an OrderedDict with a size limit


# IResolverSimple is a zope interface declaring a single method,
# getHostByName, which resolves a domain name to an IP address.
@implementer(IResolverSimple)
class CachingThreadedResolver(ThreadedResolver):
    """
    Default caching resolver. IPv4 only, supports setting a timeout
    value for DNS requests.
    """

    def __init__(self, reactor, cache_size, timeout):
        super().__init__(reactor)
        # NOTE(review): dnscache is module-global, so the last resolver
        # constructed sets the cache's size limit for everyone.
        dnscache.limit = cache_size
        self.timeout = timeout

    @classmethod
    def from_crawler(cls, crawler, reactor):
        # Enabling DNSCACHE_ENABLED in settings reduces repeated DNS
        # lookups; a cache_size of 0 effectively disables caching.
        if crawler.settings.getbool('DNSCACHE_ENABLED'):
            cache_size = crawler.settings.getint('DNSCACHE_SIZE')
        else:
            cache_size = 0
        return cls(reactor, cache_size, crawler.settings.getfloat('DNS_TIMEOUT'))