def __init__(self, path=None, other=None):
    """Preload previously used links from MongoDB, then run the base initialiser.

    Every document returned by ``HousingListings.usedLinks.find()`` is read
    and the string form of its ``link`` field is stored in
    ``self.already_seen``.

    Args:
        path: Forwarded unchanged to ``RFPDupeFilter.__init__``.
        other: Forwarded unchanged as the second positional argument of
            ``RFPDupeFilter.__init__``.
    """
    client = pymongo.MongoClient(
        settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
    try:
        used_links = client.HousingListings.usedLinks.find()
        # Consume the cursor completely while the client is still open.
        self.already_seen = {str(doc['link']) for doc in used_links}
    finally:
        # The client is only needed for this one-off load; close it so its
        # connections are released instead of being left open.
        client.close()
    RFPDupeFilter.__init__(self, path, other)
def test_filter(self):
    """Check that ``request_seen`` flags a repeated URL and passes a new one.

    ``r2`` and ``r3`` are distinct ``Request`` objects for the same URL, so
    the last assertion checks that a duplicate is detected across objects
    and not only for the very same instance.
    """
    # Named ``dupefilter`` rather than ``filter`` so the builtin is not shadowed.
    dupefilter = RFPDupeFilter()
    dupefilter.open()

    r1 = Request('http://scrapytest.org/1')
    r2 = Request('http://scrapytest.org/2')
    r3 = Request('http://scrapytest.org/2')

    assert not dupefilter.request_seen(r1)
    assert dupefilter.request_seen(r1)

    assert not dupefilter.request_seen(r2)
    assert dupefilter.request_seen(r3)

    dupefilter.close('finished')
def test_request_fingerprint(self):
    """Check that overriding ``request_fingerprint`` changes ``request_seen``.

    Two URLs that differ only in letter case are expected to be reported as
    unseen by a plain ``RFPDupeFilter``. A subclass whose fingerprint is the
    SHA-1 of the lower-cased URL is expected to report the second one as
    already seen.
    """
    r1 = Request('http://scrapytest.org/index.html')
    r2 = Request('http://scrapytest.org/INDEX.html')

    dupefilter = RFPDupeFilter()
    dupefilter.open()

    assert not dupefilter.request_seen(r1)
    assert not dupefilter.request_seen(r2)

    dupefilter.close('finished')

    class CaseInsensitiveRFPDupeFilter(RFPDupeFilter):
        """Dupe filter whose fingerprint ignores the letter case of the URL."""

        def request_fingerprint(self, request):
            fp = hashlib.sha1()
            # hashlib hash objects only accept bytes-like input on Python 3,
            # so the lower-cased URL is encoded before it is hashed.
            fp.update(request.url.lower().encode('utf-8'))
            return fp.hexdigest()

    case_insensitive_dupefilter = CaseInsensitiveRFPDupeFilter()
    case_insensitive_dupefilter.open()

    assert not case_insensitive_dupefilter.request_seen(r1)
    assert case_insensitive_dupefilter.request_seen(r2)

    case_insensitive_dupefilter.close('finished')
def test_filter(self):
    """Verify duplicate detection of ``RFPDupeFilter.request_seen``.

    The filter is opened, queried four times in a fixed order, and then
    closed with the reason ``'finished'``.
    """
    dupefilter = RFPDupeFilter()
    dupefilter.open()

    first = Request('http://scrapytest.org/1')
    second = Request('http://scrapytest.org/2')
    second_again = Request('http://scrapytest.org/2')

    # (request, whether request_seen should report it as already seen);
    # the order of the rows matters because every call updates the filter.
    expectations = (
        (first, False),
        (first, True),
        (second, False),
        (second_again, True),
    )
    for request, already_seen in expectations:
        # Compare truthiness only, exactly like a bare ``assert`` would.
        assert bool(dupefilter.request_seen(request)) is already_seen

    dupefilter.close('finished')
def test_filter(self):
    """Check that ``request_seen`` flags a repeated URL and passes a new one.

    ``r2`` and ``r3`` are separate ``Request`` objects built from the same
    URL, so the final assertion covers duplicates across distinct objects.
    """
    # Named ``dupefilter`` rather than ``filter`` so the builtin is not shadowed.
    dupefilter = RFPDupeFilter()
    dupefilter.open()

    r1 = Request('http://scrapytest.org/1')
    r2 = Request('http://scrapytest.org/2')
    r3 = Request('http://scrapytest.org/2')

    assert not dupefilter.request_seen(r1)
    assert dupefilter.request_seen(r1)

    assert not dupefilter.request_seen(r2)
    assert dupefilter.request_seen(r3)

    # NOTE(review): close() is called without a reason argument here;
    # confirm this matches the RFPDupeFilter version under test.
    dupefilter.close()
def __init__(self, path=None):
    """Create an empty ``urls_seen`` set, then run the base initialiser.

    Args:
        path: Passed straight through to ``RFPDupeFilter.__init__``.
    """
    # Set up our own state first, before handing over to the parent class.
    self.urls_seen = set()

    RFPDupeFilter.__init__(self, path)
def __init__(self, path=None):
    """Create the ``urls_sbf`` bloom filter, then run the base initialiser.

    Args:
        path: Passed straight through to ``RFPDupeFilter.__init__``.
    """
    growth_mode = ScalableBloomFilter.SMALL_SET_GROWTH
    self.urls_sbf = ScalableBloomFilter(mode=growth_mode)

    RFPDupeFilter.__init__(self, path)
def __init__(self, path=None, debug=False):
    """Create an empty ``urls_seen`` set, then run the base initialiser.

    Args:
        path: Passed straight through to ``RFPDupeFilter.__init__``.
        debug: Passed straight through as the second positional argument
            of ``RFPDupeFilter.__init__``.
    """
    # Set up our own state first, before handing over to the parent class.
    self.urls_seen = set()

    RFPDupeFilter.__init__(self, path, debug)
def __init__(self):
    """Create an empty ``urls_seen`` set, then run the base initialiser.

    ``RFPDupeFilter.__init__`` is invoked with no extra arguments.
    """
    # Set up our own state first, before handing over to the parent class.
    self.urls_seen = set()

    RFPDupeFilter.__init__(self)
def __init__(self, path=None, other=None):
    """Preload previously used links from MongoDB, then run the base initialiser.

    Every document returned by ``HousingListings.usedLinks.find()`` is read
    and the string form of its ``link`` field is stored in
    ``self.already_seen``.

    Args:
        path: Forwarded unchanged to ``RFPDupeFilter.__init__``.
        other: Forwarded unchanged as the second positional argument of
            ``RFPDupeFilter.__init__``.
    """
    client = pymongo.MongoClient(
        settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
    try:
        used_links = client.HousingListings.usedLinks.find()
        # Consume the cursor completely while the client is still open.
        self.already_seen = {str(doc['link']) for doc in used_links}
    finally:
        # The client is only needed for this one-off load; close it so its
        # connections are released instead of being left open.
        client.close()
    RFPDupeFilter.__init__(self, path, other)