Ejemplo n.º 1
0
 def __init__(self, path=None, other=None):
     """Seed the filter with the links already stored in MongoDB.

     Reads every document of the ``usedLinks`` collection in the
     ``HousingListings`` database, stores the stringified ``link`` values
     in ``self.already_seen`` and then runs the ``RFPDupeFilter``
     initialiser.

     Args:
         path: Passed through unchanged to ``RFPDupeFilter.__init__``.
         other: Passed through unchanged to ``RFPDupeFilter.__init__``.
     """
     client = pymongo.MongoClient(
         settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
     try:
         stored_docs = client.HousingListings.usedLinks.find()
         # The cursor is fully consumed here, before the client is closed.
         self.already_seen = set(str(doc['link']) for doc in stored_docs)
     finally:
         # Release the connection; previously the client was created inline
         # and never closed.
         client.close()
     RFPDupeFilter.__init__(self, path, other)
Ejemplo n.º 2
0
    def test_filter(self):
        """Check that ``request_seen`` is falsy on first sight, truthy after."""
        # Named ``dupefilter`` so the builtin ``filter`` is not shadowed.
        dupefilter = RFPDupeFilter()
        dupefilter.open()

        r1 = Request('http://scrapytest.org/1')
        r2 = Request('http://scrapytest.org/2')
        r3 = Request('http://scrapytest.org/2')

        # The same Request object offered twice: new, then duplicate.
        assert not dupefilter.request_seen(r1)
        assert dupefilter.request_seen(r1)

        # r3 is a separate object with the same URL as r2, so it must be
        # reported as already seen.
        assert not dupefilter.request_seen(r2)
        assert dupefilter.request_seen(r3)

        dupefilter.close('finished')
Ejemplo n.º 3
0
    def test_request_fingerprint(self):
        """Test if customization of request_fingerprint method will change
        output of request_seen.

        The stock filter is expected to treat two URLs that differ only in
        letter case as distinct requests; a subclass whose fingerprint
        lower-cases the URL is expected to treat the second as a duplicate.
        """
        r1 = Request('http://scrapytest.org/index.html')
        r2 = Request('http://scrapytest.org/INDEX.html')

        dupefilter = RFPDupeFilter()
        dupefilter.open()

        assert not dupefilter.request_seen(r1)
        assert not dupefilter.request_seen(r2)

        dupefilter.close('finished')

        class CaseInsensitiveRFPDupeFilter(RFPDupeFilter):
            """Dupe filter whose fingerprint ignores the URL's letter case."""

            def request_fingerprint(self, request):
                """Return the SHA-1 hex digest of the lower-cased URL."""
                fp = hashlib.sha1()
                # hashlib rejects text strings on Python 3, so hash the
                # encoded bytes; the ASCII URLs used here encode identically
                # on Python 2.
                fp.update(request.url.lower().encode('utf-8'))
                return fp.hexdigest()

        case_insensitive_dupefilter = CaseInsensitiveRFPDupeFilter()
        case_insensitive_dupefilter.open()

        assert not case_insensitive_dupefilter.request_seen(r1)
        assert case_insensitive_dupefilter.request_seen(r2)

        case_insensitive_dupefilter.close('finished')
Ejemplo n.º 4
0
    def test_filter(self):
        """Verify ``request_seen`` is falsy for new requests, truthy for repeats."""
        seen_filter = RFPDupeFilter()
        seen_filter.open()

        first = Request('http://scrapytest.org/1')
        second = Request('http://scrapytest.org/2')
        second_twin = Request('http://scrapytest.org/2')

        # (request, expected truthiness of request_seen), in call order.
        expectations = [
            (first, False),
            (first, True),
            (second, False),
            (second_twin, True),
        ]
        for request, expected in expectations:
            assert bool(seen_filter.request_seen(request)) is expected

        seen_filter.close('finished')
Ejemplo n.º 5
0
    def test_filter(self):
        """Check that ``request_seen`` is falsy on first sight, truthy after."""
        # Named ``dupefilter`` so the builtin ``filter`` is not shadowed.
        dupefilter = RFPDupeFilter()
        dupefilter.open()

        r1 = Request('http://scrapytest.org/1')
        r2 = Request('http://scrapytest.org/2')
        r3 = Request('http://scrapytest.org/2')

        # The same Request object offered twice: new, then duplicate.
        assert not dupefilter.request_seen(r1)
        assert dupefilter.request_seen(r1)

        # r3 is a separate object with the same URL as r2, so it must be
        # reported as already seen.
        assert not dupefilter.request_seen(r2)
        assert dupefilter.request_seen(r3)

        # NOTE(review): close() is called without an argument here; confirm
        # this matches the signature of the RFPDupeFilter version under test.
        dupefilter.close()
Ejemplo n.º 6
0
    def test_request_fingerprint(self):
        """Test if customization of request_fingerprint method will change
        output of request_seen.

        With the stock filter, two URLs differing only in letter case are
        both expected to be new. With a subclass that fingerprints the
        lower-cased URL, the second one is expected to be a duplicate.
        """
        r1 = Request('http://scrapytest.org/index.html')
        r2 = Request('http://scrapytest.org/INDEX.html')

        dupefilter = RFPDupeFilter()
        dupefilter.open()

        assert not dupefilter.request_seen(r1)
        assert not dupefilter.request_seen(r2)

        dupefilter.close('finished')

        class CaseInsensitiveRFPDupeFilter(RFPDupeFilter):
            """Dupe filter whose fingerprint ignores the URL's letter case."""

            def request_fingerprint(self, request):
                """Return the SHA-1 hex digest of the lower-cased URL."""
                fp = hashlib.sha1()
                # hashlib.update() needs bytes on Python 3; passing the text
                # URL directly raises TypeError there. The ASCII URLs used
                # here encode identically on Python 2.
                fp.update(request.url.lower().encode('utf-8'))
                return fp.hexdigest()

        case_insensitive_dupefilter = CaseInsensitiveRFPDupeFilter()
        case_insensitive_dupefilter.open()

        assert not case_insensitive_dupefilter.request_seen(r1)
        assert case_insensitive_dupefilter.request_seen(r2)

        case_insensitive_dupefilter.close('finished')
Ejemplo n.º 7
0
 def __init__(self, path=None):
     """Create an empty ``urls_seen`` set, then run the base initialiser.

     Args:
         path: Passed through unchanged to ``RFPDupeFilter.__init__``.
     """
     # Presumably filled by a request_seen override that is not visible
     # in this snippet — TODO confirm.
     self.urls_seen = set()
     RFPDupeFilter.__init__(self, path)
Ejemplo n.º 8
0
 def __init__(self, path=None):
     """Create the ``urls_sbf`` Bloom filter, then run the base initialiser.

     Args:
         path: Passed through unchanged to ``RFPDupeFilter.__init__``.
     """
     # SMALL_SET_GROWTH selects the filter's growth mode; presumably the
     # slower-growing option of the Bloom filter library — TODO confirm.
     self.urls_sbf = ScalableBloomFilter(
         mode=ScalableBloomFilter.SMALL_SET_GROWTH)
     RFPDupeFilter.__init__(self, path)
Ejemplo n.º 9
0
 def __init__(self, path=None):
     """Start with an empty ``urls_seen`` set and initialise the base filter.

     Args:
         path: Passed through unchanged to ``RFPDupeFilter.__init__``.
     """
     self.urls_seen = set()
     RFPDupeFilter.__init__(self, path)
Ejemplo n.º 10
0
 def __init__(self, path=None, debug=False):
     """Create an empty ``urls_seen`` set, then run the base initialiser.

     Args:
         path: Passed through unchanged to ``RFPDupeFilter.__init__``.
         debug: Passed through unchanged to ``RFPDupeFilter.__init__``.
     """
     self.urls_seen = set()
     RFPDupeFilter.__init__(self, path, debug)
Ejemplo n.º 11
0
 def __init__(self):
     """Create an empty ``urls_seen`` set, then run the base initialiser.

     NOTE(review): this initialiser accepts no arguments, so the base class
     is always initialised with its own defaults — confirm that whatever
     instantiates this filter never passes a path.
     """
     self.urls_seen = set()
     RFPDupeFilter.__init__(self)
	def __init__(self, path=None, other=None):
		"""Seed the filter with the links already stored in MongoDB.

		Reads every document of the ``usedLinks`` collection in the
		``HousingListings`` database, stores the stringified ``link``
		values in ``self.already_seen`` and then runs the
		``RFPDupeFilter`` initialiser.

		Args:
			path: Passed through unchanged to ``RFPDupeFilter.__init__``.
			other: Passed through unchanged to ``RFPDupeFilter.__init__``.
		"""
		client = pymongo.MongoClient(
			settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
		try:
			stored_docs = client.HousingListings.usedLinks.find()
			# The cursor is fully consumed here, before the client is closed.
			self.already_seen = set(str(doc['link']) for doc in stored_docs)
		finally:
			# Release the connection; previously the client was created
			# inline and never closed.
			client.close()
		RFPDupeFilter.__init__(self, path, other)
Ejemplo n.º 13
0
 def __init__(self, path=None, debug=False):
     """Start with an empty ``urls_seen`` set and initialise the base filter.

     Args:
         path: Passed through unchanged to ``RFPDupeFilter.__init__``.
         debug: Passed through unchanged to ``RFPDupeFilter.__init__``.
     """
     self.urls_seen = set()
     RFPDupeFilter.__init__(self, path, debug)