def follow_links(self):
    # Recursively queue outgoing links from the current page, deduped through
    # a per-task Redis bloom filter.
    # Stop descending once the configured crawl depth is reached.
    if self.now_depth >= self.depth:
        return
    # Per-task dedupe filter. NOTE(review): the Redis host is hard-coded here.
    p = pyreBloom.pyreBloom('task%d' % self.task_id, 100000, 0.001, host='172.21.1.155')
    soup = BeautifulSoup(self.page_source)
    for link in soup.find_all('a', href=True):
        href = link.get('href').encode('utf8')
        if not href.startswith('http'):
            # Relative link: resolve against the current page URL, then follow.
            href = urlparse.urljoin(self.url, href)
            if not p.contains(href):
                p.extend(href)
                self.follow_links_delay(href, 1)
        elif href.find(self.netloc) != -1:
            # Absolute link on the same host: follow if not yet seen.
            if not p.contains(href):
                p.extend(href)
                self.follow_links_delay(href, 1)
        else:
            # Absolute link on a foreign host: follow only whitelisted domains.
            # NOTE(review): flag argument 0 vs 1 semantics inferred from call
            # sites elsewhere — confirm against follow_links_delay.
            for domain in self.allow_domains:
                if href.find(domain) != -1:
                    if not p.contains(href):
                        p.extend(href)
                        self.follow_links_delay(href, 0)
                    break
def open(self, spider_name):
    """Create the spider's bloom filter on the crawler's own Redis instance."""
    conn_kwargs = self.crawler.redis.connection_pool.connection_kwargs
    self.bfilter = pyreBloom.pyreBloom(
        "bf:%s" % spider_name,
        100000000,
        0.001,
        host=conn_kwargs['host'],
        port=conn_kwargs['port'],
        db=conn_kwargs['db'],
    )
def open(self, spider):
    """Point ``urls_seen`` at a Redis-backed bloom filter named after the spider."""
    conf = self.settings.get("REDIS_CONF")
    filter_key = "bfilter:{}".format(spider.name)
    self.urls_seen = pyreBloom.pyreBloom(
        filter_key,
        100000000,
        0.001,
        host=conf['host'],
        port=conf['port'],
        db=conf['db'],
    )
def test_select_db(self):
    '''Can instantiate a bloom filter in a separate db'''
    bloom = pyreBloom.pyreBloom(self.KEY, self.CAPACITY, self.ERROR_RATE, db=1)
    # After adding keys to our db=0 bloom filter (self.bloom), they should not
    # be visible through the db=1 filter.
    samples = sample_strings(20, 100)
    self.bloom.extend(samples)
    self.assertEqual(len(bloom.contains(samples)), 0)
def test_select_db(self):
    '''Can instantiate a bloom filter in a separate db'''
    other_db_bloom = pyreBloom.pyreBloom(
        self.KEY, self.CAPACITY, self.ERROR_RATE, db=1)
    words = sample_strings(20, 100)
    # Keys added via the db=0 filter must be invisible to the db=1 filter.
    self.bloom.extend(words)
    self.assertEqual(len(other_db_bloom.contains(words)), 0)
def __init__(self, settings):
    """Wire up the slice Redis connection and the shared dedupe bloom filter."""
    host = settings.get("REDIS_HOST")
    port = settings.get("REDIS_PORT")
    slice_db = settings.get("SLICE_REDIS_DB")
    self.srd = redis.Redis(host, port, slice_db)
    self.logdupes = True
    # NOTE(review): the bloom filter is created on the default db, not
    # SLICE_REDIS_DB — confirm this split is intentional.
    self.urls_seen = pyreBloom.pyreBloom(
        "bloomfilter", 100000000, 0.001, host=host, port=port)
def test_two_instances(self):
    """A second handle on the same key must see entries added via the first."""
    second = pyreBloom.pyreBloom('pyreBloomTesting', 10000, 0.1)
    words = ['hello', 'how', 'are', 'you', 'today']
    # Insert through the first instance...
    self.p.extend(words)
    self.assertEqual(words, self.p.contains(words))
    # ...and read them back through the second.
    self.assertEqual(words, second.contains(words))
def filter_ad_by_user(self, user_id, ad_id_list, filter_key='rec_id'):
    """Drop ads whose ``filter_key`` id is already in the user's bloom filter.

    On any error the original list is returned unchanged (fail-open).
    """
    try:
        bf = pyreBloom.pyreBloom(user_id, self.capacity, self.error_rate)
        candidate_ids = [str(item[filter_key]) for item in ad_id_list]
        seen = set(bf.contains(candidate_ids))
        return [item for item in ad_id_list
                if str(item[filter_key]) not in seen]
    except Exception as exc:
        logging.error(
            '[bloom filter]filter err, user_id:{0}, ad_id:{1}, err:{2}'.
            format(user_id, ad_id_list, exc))
        return ad_id_list
def test_two_instances(self):
    '''Make sure two bloom filters pointing to the same key work'''
    second = pyreBloom.pyreBloom(self.KEY, self.CAPACITY, 0.1)
    words = [b'hello', b'how', b'are', b'you', b'today']
    # Insert through the first handle, then read back through both.
    self.bloom.extend(words)
    self.assertEqual(words, self.bloom.contains(words))
    self.assertEqual(words, second.contains(words))
def test_two_instances(self):
    '''Make sure two bloom filters pointing to the same key work'''
    second = pyreBloom.pyreBloom('pyreBloomTesting', 10000, 0.1)
    words = ['hello', 'how', 'are', 'you', 'today']
    # Insert through the first handle, then read back through both.
    self.bloom.extend(words)
    self.assertEqual(words, self.bloom.contains(words))
    self.assertEqual(words, second.contains(words))
def __init__(self, server, key):
    """Build a bloom filter on the same Redis instance as ``server``."""
    kw = server.connection_pool.connection_kwargs
    self.filter = pyreBloom.pyreBloom(
        key,
        10000000,
        0.00001,
        host=kw['host'],
        port=kw['port'],
        password=kw['password'],
        db=kw['db'],
    )
def _bloom_filter(self, set_key):
    """Return the bloom filter for ``set_key``, creating and caching it on first use."""
    cache_key = self._base_key + set_key
    if cache_key not in self._bloom_filters:
        # It is expected that 10 in 1,000,000 URLs will be false positives.
        # pyreBloom segfaults when the key is not a byte string, hence encode().
        self._bloom_filters[cache_key] = pyreBloom.pyreBloom(
            cache_key.encode(), 1000000, 0.00001,
            host=self._host, port=self._port)
    return self._bloom_filters[cache_key]
def retrieve_page(task_id, url, from_url=None, depth=0, now_depth=0, allow_domains=None):
    """Fetch, store and recursively follow one URL, skipping URLs already crawled."""
    # Per-task dedupe filter; skip URLs that have been crawled before.
    seen = pyreBloom.pyreBloom('task%d' % task_id, 100000, 0.01, host='172.21.1.155')
    if seen.contains(url):
        return
    # Start crawling: mark the URL as seen, then fetch/store/follow.
    fps = Fetch_and_parse_and_store(task_id, url, from_url, depth,
                                    now_depth, allow_domains, __name__)
    seen.extend(url)
    # Strict '== True' comparison kept deliberately (behavior-preserving).
    if fps.fetch() == True:
        fps.store()
        fps.follow_links()
def __init__(self, settings):
    """Open the log Redis connection, the dedupe bloom filter and the Mongo db."""
    self.settings = settings
    host = settings.get("REDIS_HOST")
    port = settings.get("REDIS_PORT")
    log_db = settings.get("LOG_REDIS_DB")
    self.rd = redis.Redis(host, port, log_db)
    self.urls_seen = pyreBloom.pyreBloom(
        "bloomfilter", 100000000, 0.001, host=host, port=port)
    conn = pymongo.Connection(
        host=settings.get("MONGO_SERVER"),
        port=settings.get("MONGO_PORT"))
    self.db = conn[settings.get("MONGO_DB")]
def bf_conn(self):
    '''Lazily initialise and return the shared pyreBloom connection.'''
    if not self._bf_conn:
        prefix = force_utf8(self.PREFIX)
        # Lazy %-style logging args keep formatting cost off the hot path.
        logging.debug(
            'pyreBloom connect: redis://%s:%s/%s, (%s %s %s)',
            self._conf['host'], self._conf['port'], self._conf['db'],
            prefix, self.BF_SIZE, self.BF_ERROR,
        )
        self._bf_conn = pyreBloom(prefix, self.BF_SIZE, self.BF_ERROR,
                                  **self._conf)
    return self._bf_conn
def save(self, user_id, ad_id_list, method='rec'):
    """Record served ads in the user's bloom filter and Redis store.

    Also rebuilds the bloom filter from Redis when the per-user timestamp
    shows it is older than ``self.rebuild_time``. Errors around the rebuild
    bookkeeping are logged and swallowed (best-effort).
    """
    r = self.redis_base_obj.connect()
    time_now = time.time()
    # Accept a single id as well as a list. isinstance (rather than
    # type(...) != list) also handles list subclasses correctly.
    if not isinstance(ad_id_list, list):
        ad_id_list = [ad_id_list]
    bf = pyreBloom.pyreBloom(user_id, self.capacity, self.error_rate)
    bf.extend(ad_id_list)
    self.redis_obj.insert(user_id, ad_id_list, method)
    if r:
        try:
            time_update = r.get('bloom_filter_update' + user_id)
            if time_update and time_now - float(time_update) > self.rebuild_time:
                # Filter is stale: rebuild it from the authoritative Redis data.
                self.build_from_redis(user_id)
            r.set('bloom_filter_update' + user_id, time_now)
        except Exception as e:
            logging.error(
                '[bloom filter]save err, user_id:{0}, ad_id:{1}, err:{2}'.
                format(user_id, ad_id_list, e))
    else:
        logging.error('[bloom filter] redis obj is None')
def setUp(self):
    """Start each test with a fresh, empty (large-capacity) filter."""
    self.bloom = pyreBloom.pyreBloom(
        'pyreBloomTesting', 200000000, 0.00001)
    self.bloom.delete()
def setUp(self):
    """Start each test with a fresh, empty filter."""
    self.bloom = pyreBloom.pyreBloom('pyreBloomTesting', 10000, 0.1)
    self.bloom.delete()
def convert_utf8(data):
    """Recursively encode str values (and tuple/list/dict contents) to UTF-8 bytes.

    Dict *keys* are deliberately left untouched so the result can still be
    splatted as **kwargs.
    """
    if isinstance(data, str):
        return data.encode('utf8')
    elif isinstance(data, tuple):
        data = tuple([convert_utf8(item) for item in data])
    elif isinstance(data, list):
        for idx, i in enumerate(data):
            data[idx] = convert_utf8(i)
    elif isinstance(data, dict):
        for i in data:
            data[i] = convert_utf8(data[i])
    return data


redis_conf = {'host': '127.0.0.1', 'password': '', 'port': 6379, 'db': 0}
# Bug fix: the original looped `for k, v in redis_conf.items()` and
# re-converted the entire dict once per key; a single call suffices
# (conversion is idempotent, so the result is identical).
redis_conf = convert_utf8(redis_conf)
key = convert_utf8('tc')
value = convert_utf8('hello')
p = pyreBloom(key, 10000, 0.001, **redis_conf)
p.add(value)
print(p.contains(value))
def tearDown(self):
    '''Remove the bloom filter at the provided test key in all databases'''
    db_count = int(self.redis.config_get('databases').get('databases', 0))
    for db_index in range(db_count):
        pyreBloom.pyreBloom(self.KEY, 1, 0.1, db=db_index).delete()
def tearDown(self):
    """Drop the shared test filter so no state leaks between tests."""
    self.p = pyreBloom.pyreBloom('pyreBloomTesting', 10000, 0.1)
    self.p.delete()
import random
import string
import time
import unittest

import pyreBloom

# Benchmark parameters: filter sized for 2x the word count, 10% error rate.
count = 10000
capacity = count * 2
error = 0.1

print('Generating %i random test words' % (count * 2))
start = -time.time()
# Fix: `time` and `random` were used without being imported, and
# `string.lowercase` is Python-2-only — ascii_lowercase matches the
# print() style already used here.
included = [''.join(random.sample(string.ascii_lowercase, 20))
            for i in range(count)]
outcluded = [''.join(random.sample(string.ascii_lowercase, 20))
             for i in range(count)]
start += time.time()
print('Generated random test words in %fs' % start)

p = pyreBloom.pyreBloom('pyreBloomTesting', capacity, error)
p.delete()
print('Filter using %i hash functions and %i bits' % (p.hashes, p.bits))

# Batch insert: one extend() call for the whole word list.
start = -time.time()
p.extend(included)
start += time.time()
print('Batch insert : %fs (%f words / second)' % (start, (count / start)))

# Serial insert: one round trip per word, for comparison.
p.delete()
start = -time.time()
r = [p.add(word) for word in included]
start += time.time()
print('Serial insert: %fs (%f words / second)' % (start, (count / start)))
def tearDown(self):
    """Drop the shared test filter so no state leaks between tests."""
    self.bloom = pyreBloom.pyreBloom('pyreBloomTesting', 10000, 0.1)
    self.bloom.delete()
def get_redis_protocol(*args):
    # Serialize *args* as a Redis wire-protocol (RESP) command string.
    # See http://redis.io/topics/protocol — chr(13)+chr(10) is CRLF.
    # number of args
    s = "*%d%s%s" % (len(args), chr(13), chr(10))
    for arg in args:
        # Each argument: "$<length>\r\n<bytes>\r\n".
        s += "$%d%s%s" % (len(arg), chr(13), chr(10))
        s += "%s%s%s" % (arg, chr(13), chr(10))
    return s


redis = StrictRedis(host='localhost', port=6379, db=0)
redis.flushdb()
# Shared bloom filter of known gene symbols (Python 2 script).
bloom = pyreBloom('gene_symbols', 100000, 0.01)

# iterate through the file, each line is a gene symbol
# generate the redis request protocol for inserting the data
# http://redis.io/topics/protocol
command = ""
for row in csv.DictReader(open('hgnc_complete_set.txt'), delimiter="\t"):
    # skip non-approved genes
    if row["Status"] != "Approved":
        continue
    symbol = row["Approved Symbol"]
    # Single-character symbols are skipped as too short to be meaningful.
    if len(symbol) == 1:
        print "SKIPPING %s: SYMBOL TOO SHORT" % (symbol)
        continue
def __init__(self, *args, **kwargs):
    """Initialise the crawler and attach its title bloom filter."""
    super(TitleCrawler, self).__init__(*args, **kwargs)
    self.bloomfilt = pyreBloom.pyreBloom('titles_bloomfilter', 100000, 0.01)
def build_from_redis(self, user_id):
    """Rebuild the user's bloom filter from the ids stored in Redis (up to 500)."""
    bf = pyreBloom.pyreBloom(user_id, self.capacity, self.error_rate)
    bf.delete()
    bf.extend(self.redis_obj.select(user_id, num=500))
def setUp(self):
    # Build the filter with count=False (see pyreBloom docs for its effect)
    # plus a raw Redis client for direct inspection in tests.
    self.bloom = pyreBloom.pyreBloom(
        self.KEY, self.CAPACITY, self.ERROR_RATE, count=False)
    self.redis = Redis()
def before_request(): g.redis = redis.StrictRedis(host='localhost', port=6379, db=0) g.bloom = pyreBloom.pyreBloom('gene_symbols', 100000, 0.01) if app.debug: print "start request" g.start = time.time()
def setUp(self):
    # Fresh filter handle plus a raw Redis client for direct inspection.
    self.bloom = pyreBloom.pyreBloom(
        self.KEY, self.CAPACITY, self.ERROR_RATE)
    self.redis = Redis()
def setUp(self):
    """Start each test with a fresh, empty filter."""
    self.p = pyreBloom.pyreBloom('pyreBloomTesting', 10000, 0.1)
    self.p.delete()
import pyreBloom count = 10000 capacity = count * 2 error = 0.1 print 'Generating %i random test words' % (count * 2) start = -time.time() included = [''.join(random.sample(string.lowercase, 20)) for i in range(count)] outcluded = [ ''.join(random.sample(string.lowercase, 20)) for i in range(count) ] start += time.time() print 'Generated random test words in %fs' % start p = pyreBloom.pyreBloom('pyreBloomTesting', capacity, error) p.delete() print 'Filter using %i hash functions and %i bits' % (p.hashes, p.bits) start = -time.time() p.extend(included) start += time.time() print 'Batch insert : %fs (%f words / second)' % (start, (count / start)) p.delete() start = -time.time() r = [p.add(word) for word in included] start += time.time() print 'Serial insert: %fs (%f words / second)' % (start, (count / start))