Example #1
0
    def follow_links(self):
        """Schedule every link on the current page that has not been seen yet.

        Nothing is done once ``self.now_depth`` has reached ``self.depth``.
        Otherwise each ``<a href=...>`` found in ``self.page_source`` is
        classified as:

        * relative (does not start with ``http``): resolved against
          ``self.url`` and scheduled with second argument ``1``;
        * containing ``self.netloc``: scheduled with second argument ``1``;
        * containing one of ``self.allow_domains``: scheduled with second
          argument ``0``;
        * anything else: ignored.

        A link is only scheduled (via ``self.follow_links_delay``) when the
        task's bloom filter does not already contain it, and it is added to
        the filter right before being scheduled.
        """
        if self.now_depth >= self.depth:
            return

        p = pyreBloom.pyreBloom('task%d' % self.task_id, 100000, 0.001,
                host='172.21.1.155')
        # The whitelist may legitimately be missing (None); treat it as empty
        # instead of failing on the first off-site link.
        allow_domains = self.allow_domains or ()

        soup = BeautifulSoup(self.page_source)
        for link in soup.find_all('a', href=True):
            href = link.get('href').encode('utf8')
            if not href.startswith('http'):
                href = urlparse.urljoin(self.url, href)
                delay_arg = 1
            elif self.netloc in href:
                delay_arg = 1
            elif any(domain in href for domain in allow_domains):
                delay_arg = 0
            else:
                # Off-site link that is not whitelisted.
                continue

            # Single dedup point: one membership check per link, no matter
            # which rule accepted it.
            if not p.contains(href):
                p.extend(href)
                self.follow_links_delay(href, delay_arg)
Example #2
0
 def open(self, spider_name):
     """Create the bloom filter for ``spider_name`` on the crawler's Redis.

     The host, port and db are taken from the connection kwargs of the
     crawler's Redis connection pool. The filter is stored on
     ``self.bfilter`` under the key ``bf:<spider_name>``.
     """
     conn_kwargs = self.crawler.redis.connection_pool.connection_kwargs
     redis_target = {
         'host': conn_kwargs['host'],
         'port': conn_kwargs['port'],
         'db': conn_kwargs['db'],
     }
     self.bfilter = pyreBloom.pyreBloom("bf:%s" % spider_name, 100000000,
                                        0.001, **redis_target)
Example #3
0
 def open(self, spider):
     """Create the seen-URL bloom filter for ``spider``.

     Connection details come from the ``REDIS_CONF`` setting (``host``,
     ``port`` and ``db`` keys). The filter is stored on ``self.urls_seen``
     under the key ``bfilter:<spider.name>``.
     """
     redis_conf = self.settings.get("REDIS_CONF")
     filter_key = "bfilter:{}".format(spider.name)
     host, port, db = redis_conf['host'], redis_conf['port'], redis_conf['db']
     self.urls_seen = pyreBloom.pyreBloom(filter_key, 100000000, 0.001,
                                          host=host, port=port, db=db)
Example #4
0
    def test_select_db(self):
        '''Can instantiate a bloom filter in a separate db'''
        other_db_bloom = pyreBloom.pyreBloom(
            self.KEY, self.CAPACITY, self.ERROR_RATE, db=1)

        # Insert through self.bloom only; the filter opened on db=1 under the
        # same key must not report any of the samples.
        samples = sample_strings(20, 100)
        self.bloom.extend(samples)
        hits = other_db_bloom.contains(samples)
        self.assertEqual(len(hits), 0)
Example #5
0
    def test_select_db(self):
        '''Can instantiate a bloom filter in a separate db'''
        db1_bloom = pyreBloom.pyreBloom(self.KEY,
                                        self.CAPACITY,
                                        self.ERROR_RATE,
                                        db=1)

        # The samples go into the fixture filter (self.bloom); the db=1
        # filter sharing the same key is expected to contain none of them.
        samples = sample_strings(20, 100)
        self.bloom.extend(samples)
        found_in_db1 = db1_bloom.contains(samples)
        self.assertEqual(len(found_in_db1), 0)
Example #6
0
    def __init__(self, settings):
        """Set up the Redis client and the seen-URL bloom filter.

        Reads ``REDIS_HOST``, ``REDIS_PORT`` and ``SLICE_REDIS_DB`` from
        ``settings``.
        """
        host = settings.get("REDIS_HOST")
        port = settings.get("REDIS_PORT")
        slice_db = settings.get("SLICE_REDIS_DB")

        self.logdupes = True
        self.srd = redis.Redis(host, port, slice_db)

        # NOTE(review): unlike self.srd, the filter is created without a db
        # argument - confirm that is intended.
        self.urls_seen = pyreBloom.pyreBloom(
            "bloomfilter", 100000000, 0.001, host=host, port=port)
Example #7
0
 def test_two_instances(self):
     # Two filters opened on the same key are expected to report the same
     # members.
     words = ['hello', 'how', 'are', 'you', 'today']
     second = pyreBloom.pyreBloom('pyreBloomTesting', 10000, 0.1)

     # Insert via the fixture instance and confirm it reports every word.
     self.p.extend(words)
     self.assertEqual(words, self.p.contains(words))

     # The independently created instance must report the same words.
     self.assertEqual(words, second.contains(words))
Example #8
0
 def filter_ad_by_user(self, user_id, ad_id_list, filter_key='rec_id'):
     """Return the ads whose id is not in the user's bloom filter.

     Each entry of ``ad_id_list`` is indexed with ``filter_key`` and the
     stringified value is looked up in the filter keyed by ``user_id``.
     On any exception the error is logged and ``ad_id_list`` is returned
     unfiltered (best effort).
     """
     try:
         seen_filter = pyreBloom.pyreBloom(user_id, self.capacity,
                                           self.error_rate)
         candidate_ids = [str(ad[filter_key]) for ad in ad_id_list]
         already_seen = set(seen_filter.contains(candidate_ids))
         unseen = []
         for ad in ad_id_list:
             if str(ad[filter_key]) not in already_seen:
                 unseen.append(ad)
         return unseen
     except Exception as e:
         logging.error(
             '[bloom filter]filter err, user_id:{0}, ad_id:{1}, err:{2}'.
             format(user_id, ad_id_list, e))
         return ad_id_list
Example #9
0
    def test_two_instances(self):
        '''Make sure two bloom filters pointing to the same key work'''
        words = [b'hello', b'how', b'are', b'you', b'today']
        second = pyreBloom.pyreBloom(self.KEY, self.CAPACITY, 0.1)

        # Write through the fixture filter and check it reports every word.
        self.bloom.extend(words)
        self.assertEqual(words, self.bloom.contains(words))

        # The separately constructed filter must report the same words.
        self.assertEqual(words, second.contains(words))
Example #10
0
    def test_two_instances(self):
        '''Make sure two bloom filters pointing to the same key work'''
        words = ['hello', 'how', 'are', 'you', 'today']
        other = pyreBloom.pyreBloom('pyreBloomTesting', 10000, 0.1)

        # Populate via the fixture filter; it must report all the words back.
        self.bloom.extend(words)
        self.assertEqual(words, self.bloom.contains(words))

        # A second handle on the same key must see exactly the same words.
        self.assertEqual(words, other.contains(words))
Example #11
0
    def __init__(self, server, key):
        """Create a bloom filter on the same Redis that ``server`` uses.

        ``host``, ``port``, ``password`` and ``db`` are read from the
        connection kwargs of ``server``'s connection pool. The filter is
        stored on ``self.filter`` under ``key``.
        """
        conn = server.connection_pool.connection_kwargs
        self.filter = pyreBloom.pyreBloom(
            key, 10000000, 0.00001,
            host=conn['host'],
            port=conn['port'],
            password=conn['password'],
            db=conn['db'])
Example #12
0
 def _bloom_filter(self, set_key):
     """Return the bloom filter for ``set_key``, creating it on first use.

     Filters are cached in ``self._bloom_filters`` under
     ``self._base_key + set_key``.
     """
     key = self._base_key + set_key
     try:
         return self._bloom_filters[key]
     except KeyError:
         pass

     # With an error rate of 0.00001, about 10 in 1,000,000 URLs are expected
     # to be false positives.
     # The key is encoded because of a pyreBloom bug that makes it segfault
     # when the key is not a byte string.
     created = pyreBloom.pyreBloom(key.encode(), 1000000, 0.00001,
                                   host=self._host, port=self._port)
     self._bloom_filters[key] = created
     return created
Example #13
0
def retrieve_page(task_id, url, from_url=None, depth=0, now_depth=0, allow_domains=None):
	"""Fetch, store and expand ``url`` unless the task has already seen it.

	The url is added to the task's bloom filter before the fetch is
	attempted. When the fetch reports ``True`` the page is stored and its
	links are followed.
	"""
	# Skip any url the task's bloom filter already reports.
	crawled = pyreBloom.pyreBloom('task%d' % task_id, 100000, 0.01, host='172.21.1.155')
	if crawled.contains(url):
		return

	# start crawling...
	job = Fetch_and_parse_and_store(task_id, url, from_url, depth, now_depth, allow_domains, __name__)
	crawled.extend(url)

	# Explicit comparison kept: fetch()'s return type is not visible here.
	fetched = job.fetch() == True
	if not fetched:
		return
	job.store()
	job.follow_links()
Example #14
0
    def __init__(self, settings):
        """Open the Redis client, the URL bloom filter and the Mongo database.

        Reads ``REDIS_HOST``, ``REDIS_PORT``, ``LOG_REDIS_DB``,
        ``MONGO_SERVER``, ``MONGO_PORT`` and ``MONGO_DB`` from ``settings``.
        """
        self.settings = settings

        redis_host = settings.get("REDIS_HOST")
        redis_port = settings.get("REDIS_PORT")
        log_db = settings.get("LOG_REDIS_DB")
        self.rd = redis.Redis(redis_host, redis_port, log_db)
        self.urls_seen = pyreBloom.pyreBloom(
            "bloomfilter", 100000000, 0.001,
            host=redis_host, port=redis_port)

        server = settings.get("MONGO_SERVER")
        server_port = settings.get("MONGO_PORT")
        database_name = settings.get("MONGO_DB")
        # NOTE(review): pymongo.Connection only exists in older PyMongo
        # releases - confirm the pinned version before upgrading.
        mongo = pymongo.Connection(host=server, port=server_port)
        self.db = mongo[database_name]
Example #15
0
 def bf_conn(self):
     '''
     Return the pyreBloom instance, initializing it on first use.

     The instance is cached on ``self._bf_conn`` and built from
     ``self.PREFIX``, ``self.BF_SIZE``, ``self.BF_ERROR`` and the keyword
     arguments in ``self._conf``.
     '''
     if self._bf_conn:
         return self._bf_conn

     prefix = force_utf8(self.PREFIX)
     conf = self._conf
     logging.debug(
         'pyreBloom connect: redis://%s:%s/%s, (%s %s %s)',
         conf['host'], conf['port'], conf['db'],
         prefix, self.BF_SIZE, self.BF_ERROR,
     )
     self._bf_conn = pyreBloom(prefix, self.BF_SIZE, self.BF_ERROR, **conf)
     return self._bf_conn
Example #16
0
    def save(self, user_id, ad_id_list, method='rec'):
        """Record ads for a user and rebuild the user's filter when stale.

        ``ad_id_list`` may be a single id or a list of ids. The ids are added
        to the bloom filter keyed by ``user_id`` and inserted through
        ``self.redis_obj``. If a Redis connection is available and the stored
        ``bloom_filter_update<user_id>`` timestamp is older than
        ``self.rebuild_time`` seconds, the filter is rebuilt from Redis and
        the timestamp refreshed. Errors in that refresh step are logged, not
        raised.
        """
        r = self.redis_base_obj.connect()
        time_now = time.time()

        # isinstance (rather than an exact type comparison) so that list
        # subclasses are used as the sequence they already are instead of
        # being wrapped as a single element.
        if not isinstance(ad_id_list, list):
            ad_id_list = [ad_id_list]
        p = pyreBloom.pyreBloom(user_id, self.capacity, self.error_rate)
        p.extend(ad_id_list)
        self.redis_obj.insert(user_id, ad_id_list, method)

        if not r:
            logging.error('[bloom filter] redis obj is None')
            return

        try:
            update_key = 'bloom_filter_update' + user_id
            time_update = r.get(update_key)
            if time_update and time_now - float(
                    time_update) > self.rebuild_time:
                self.build_from_redis(user_id)
                r.set(update_key, time_now)
        except Exception as e:
            logging.error(
                '[bloom filter]save err, user_id:{0}, ad_id:{1}, err:{2}'.
                format(user_id, ad_id_list, e))
Example #17
0
 def setUp(self):
     """Create the test filter and call ``delete()`` on it before each test."""
     fresh = pyreBloom.pyreBloom('pyreBloomTesting', 200000000, 0.00001)
     fresh.delete()
     self.bloom = fresh
Example #18
0
 def setUp(self):
     """Create the test filter and call ``delete()`` on it before each test."""
     test_filter = pyreBloom.pyreBloom('pyreBloomTesting', 10000, 0.1)
     test_filter.delete()
     self.bloom = test_filter

def convert_utf8(data):
    """
    Recursively UTF-8 encode every ``str`` found in ``data``.

    A ``str`` is returned encoded; a tuple yields a new tuple of converted
    items; lists and dicts are converted in place (dict values only, keys
    are left untouched) and the same object is returned. Any other value is
    returned unchanged.
    """
    if isinstance(data, str):
        return data.encode('utf8')
    if isinstance(data, tuple):
        return tuple(convert_utf8(element) for element in data)
    if isinstance(data, list):
        for position, element in enumerate(data):
            data[position] = convert_utf8(element)
        return data
    if isinstance(data, dict):
        for name, current in data.items():
            data[name] = convert_utf8(current)
    return data


# Connection settings passed to pyreBloom as keyword arguments.
redis_conf = {'host': '127.0.0.1', 'password': '', 'port': 6379, 'db': 0}

# convert_utf8 walks the whole dict itself, so one call is enough; there is
# no need to repeat it once per key while iterating the dict being rebound.
redis_conf = convert_utf8(redis_conf)

key = convert_utf8('tc')
value = convert_utf8('hello')

# Add one value to the filter and print whether the filter reports it.
p = pyreBloom(key, 10000, 0.001, **redis_conf)
p.add(value)
print(p.contains(value))
Example #20
0
 def tearDown(self):
     '''Remove the bloom filter at the provided test key in all databases'''
     config = self.redis.config_get('databases')
     database_count = int(config.get('databases', 0))
     for index in range(database_count):
         # Capacity and error rate are placeholders; the handle is only used
         # to call delete() on the key in this db.
         stale = pyreBloom.pyreBloom(self.KEY, 1, 0.1, db=index)
         stale.delete()
Example #21
0
 def tearDown(self):
     '''Remove the bloom filter at the provided test key in all databases'''
     reported = self.redis.config_get('databases').get('databases', 0)
     total = int(reported)
     # One throwaway handle per database index, used only for delete().
     for number in range(total):
         handle = pyreBloom.pyreBloom(self.KEY, 1, 0.1, db=number)
         handle.delete()
Example #22
0
 def tearDown(self):
     """Re-open the test filter and call ``delete()`` on it after each test."""
     leftover = pyreBloom.pyreBloom('pyreBloomTesting', 10000, 0.1)
     self.p = leftover
     self.p.delete()
Example #23
0
import string
import unittest
import pyreBloom

# Benchmark parameters: number of words per set, filter capacity and the
# error rate passed to pyreBloom.
count    = 10000
capacity = count * 2
error    = 0.1

print('Generating %i random test words' % (count * 2))
# Timing idiom used throughout: start at -t0, add t1, leaving the elapsed time.
start = -time.time()
# string.ascii_lowercase exists on both Python 2 and Python 3, whereas
# string.lowercase is Python 2 only.
included  = [''.join(random.sample(string.ascii_lowercase, 20)) for _ in range(count)]
outcluded = [''.join(random.sample(string.ascii_lowercase, 20)) for _ in range(count)]
start += time.time()
print('Generated random test words in %fs' % start)

p = pyreBloom.pyreBloom('pyreBloomTesting', capacity, error)
p.delete()

print('Filter using %i hash functions and %i bits' % (p.hashes, p.bits))

# Time inserting all words with one extend() call.
start = -time.time()
p.extend(included)
start += time.time()
print('Batch insert : %fs (%f words / second)' % (start, (count / start)))

# Time inserting the same words one add() call at a time.
p.delete()
start = -time.time()
r = [p.add(word) for word in included]
start += time.time()
print('Serial insert: %fs (%f words / second)' % (start, (count / start)))
Example #24
0
 def tearDown(self):
     """Re-open the test filter and call ``delete()`` on it after each test."""
     leftover = pyreBloom.pyreBloom('pyreBloomTesting', 10000, 0.1)
     self.bloom = leftover
     self.bloom.delete()
Example #25
0

def get_redis_protocol(*args):
  """Encode ``args`` as one Redis protocol request and return it as a string.

  The result is an array header (``*<number of args>``) followed by one bulk
  string (``$<length>``, then the argument) per argument, each part
  terminated by CRLF.

  The bulk length is the argument's size in bytes: text arguments are
  measured by their UTF-8 encoding, so non-ASCII symbols get a correct
  length prefix.
  """
  crlf = "\r\n"
  # number of args
  parts = ["*%d%s" % (len(args), crlf)]
  for arg in args:
    if isinstance(arg, bytes):
      size = len(arg)
    else:
      # Character count and byte count differ for non-ASCII text.
      size = len(arg.encode("utf-8"))
    parts.append("$%d%s%s%s" % (size, crlf, arg, crlf))

  return "".join(parts)


# Connect to the local Redis and flush the selected database before loading.
redis = StrictRedis(host='localhost', port=6379, db=0)
redis.flushdb()

bloom = pyreBloom('gene_symbols', 100000, 0.01)

# iterate through the file, each line is a gene symbol
# generate the redis request protocol for inserting the data
# http://redis.io/topics/protocol
command = ""
# The context manager closes the input file once the rows have been read.
with open('hgnc_complete_set.txt') as hgnc_file:
  for row in csv.DictReader(hgnc_file, delimiter="\t"):

    # skip non-approved genes
    if row["Status"] != "Approved":
      continue

    symbol = row["Approved Symbol"]
    if len(symbol) == 1:
      # Single-argument print call: same output on Python 2 and Python 3.
      print("SKIPPING %s: SYMBOL TOO SHORT" % (symbol))
      continue
Example #26
0
 def __init__(self, *args, **kwargs):
     """Initialise the parent class, then create the titles bloom filter."""
     super(TitleCrawler, self).__init__(*args, **kwargs)
     title_filter = pyreBloom.pyreBloom('titles_bloomfilter', 100000, 0.01)
     self.bloomfilt = title_filter
Example #27
0
    def build_from_redis(self, user_id):
        """Rebuild the user's bloom filter from what ``self.redis_obj`` holds.

        Calls ``delete()`` on the filter keyed by ``user_id`` and then
        extends it with the result of
        ``self.redis_obj.select(user_id, num=500)``.
        """
        rebuilt = pyreBloom.pyreBloom(user_id, self.capacity, self.error_rate)
        rebuilt.delete()
        stored_ids = self.redis_obj.select(user_id, num=500)
        rebuilt.extend(stored_ids)
Example #28
0
 def setUp(self):
     """Create the Redis client and the ``count=False`` filter under test."""
     self.redis = Redis()
     self.bloom = pyreBloom.pyreBloom(
         self.KEY, self.CAPACITY, self.ERROR_RATE, count=False)
Example #29
0
def before_request():
  """Attach a Redis client and the gene-symbol bloom filter to ``g``.

  When ``app.debug`` is set, also prints a marker and stores the current
  time on ``g.start``.
  """
  g.redis = redis.StrictRedis(host='localhost', port=6379, db=0)
  g.bloom = pyreBloom.pyreBloom('gene_symbols', 100000, 0.01)
  if app.debug:
    # Single-argument print call: same output on Python 2 and Python 3.
    print("start request")
    g.start = time.time()
Example #30
0
    def __init__(self, server, key):
        """Create ``self.filter`` on the Redis instance behind ``server``.

        The ``host``, ``port``, ``password`` and ``db`` values are read from
        the connection kwargs of ``server``'s connection pool and passed to
        pyreBloom together with ``key``.
        """
        details = server.connection_pool.connection_kwargs
        redis_target = {
            'host': details['host'],
            'port': details['port'],
            'password': details['password'],
            'db': details['db'],
        }
        self.filter = pyreBloom.pyreBloom(key, 10000000, 0.00001,
                                          **redis_target)
Example #31
0
 def setUp(self):
     """Create the Redis client and the bloom filter used by the tests."""
     self.redis = Redis()
     self.bloom = pyreBloom.pyreBloom(
         self.KEY, self.CAPACITY, self.ERROR_RATE)
Example #32
0
 def setUp(self):
     """Create the test filter and call ``delete()`` on it before each test."""
     test_filter = pyreBloom.pyreBloom('pyreBloomTesting', 10000, 0.1)
     test_filter.delete()
     self.p = test_filter
Example #33
0
import pyreBloom

# Benchmark parameters: number of words per set, filter capacity and the
# error rate passed to pyreBloom.
count = 10000
capacity = count * 2
error = 0.1

# Every print below is a single-argument call, which produces the same output
# on Python 2 and Python 3 (the bare print statement is Python 2 only).
print('Generating %i random test words' % (count * 2))
# Timing idiom used throughout: start at -t0, add t1, leaving the elapsed time.
start = -time.time()
# string.ascii_lowercase exists on both Python 2 and Python 3, whereas
# string.lowercase is Python 2 only.
included = [
    ''.join(random.sample(string.ascii_lowercase, 20)) for _ in range(count)
]
outcluded = [
    ''.join(random.sample(string.ascii_lowercase, 20)) for _ in range(count)
]
start += time.time()
print('Generated random test words in %fs' % start)

p = pyreBloom.pyreBloom('pyreBloomTesting', capacity, error)
p.delete()

print('Filter using %i hash functions and %i bits' % (p.hashes, p.bits))

# Time inserting all words with one extend() call.
start = -time.time()
p.extend(included)
start += time.time()
print('Batch insert : %fs (%f words / second)' % (start, (count / start)))

# Time inserting the same words one add() call at a time.
p.delete()
start = -time.time()
r = [p.add(word) for word in included]
start += time.time()
print('Serial insert: %fs (%f words / second)' % (start, (count / start)))