def add(self, key):
    """Adds a key to this bloom filter.
    If the key already exists in this filter it will return True.
    Otherwise False.

    >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \
                                mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    >>> b.add("hello")
    False
    >>> b.add("hello")
    True

    """
    if key in self:
        return True
    if not self.filters:
        filter = BloomFilter(
            capacity=self.initial_capacity,
            error_rate=self.error_rate * (1.0 - self.ratio))
        self.filters.append(filter)
    else:
        filter = self.filters[-1]
        if filter.count >= filter.capacity:
            filter = BloomFilter(
                capacity=filter.capacity * self.scale,
                error_rate=filter.error_rate * self.ratio)
            self.filters.append(filter)
    filter.add(key, skip_check=True)
    return False
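# A short usage sketch for the method above, assuming the surrounding class is
# pybloom's ScalableBloomFilter: whenever the newest internal BloomFilter
# reaches its capacity, add() appends a larger one (capacity * scale), so the
# set keeps growing without rehashing the keys already stored.
from pybloom import ScalableBloomFilter

sbf = ScalableBloomFilter(initial_capacity=100, error_rate=0.001,
                          mode=ScalableBloomFilter.SMALL_SET_GROWTH)
for i in range(1000):
    sbf.add('key-%d' % i)
print(len(sbf.filters))  # > 1: several internal filters were created
print('key-42' in sbf)   # True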
def __init__(self, url, charset=None, headers=None, response_handle=None,
             timeout=3, retry_times=30, load_wait=None, execute_js=None,
             execute_js_wait=None, retry_delta=3, http_proxy_url=None,
             force=False):
    '''
    url              target URL
    charset          character encoding
    headers          custom request headers, dict
    response_handle  callback applied to the fetched result
    timeout          timeout in seconds, int, e.g. 3
    retry_times      number of retries, int, e.g. 3
    load_wait        seconds to wait after the page loads
    execute_js       JavaScript to run once the page has loaded
    execute_js_wait  seconds to wait after running the JavaScript
    retry_delta      seconds between retries on error, int
    http_proxy_url   proxy address, e.g. "http://192.168.1.1:80"
    force            crawl the URL even if it was crawled before
    '''
    if not PhantomjsSpider._url_buff:
        PhantomjsSpider._url_buff = [BloomFilter(1000000)]
    global _queue
    _hash = md5(url)
    self.url = url
    self.timeout = timeout
    self.retry_times = retry_times
    self.retry_delta = retry_delta
    self.response_handle = response_handle
    self.charset = charset
    self.headers = headers
    self.execute_js = execute_js
    self.execute_js_wait = execute_js_wait
    self.load_wait = load_wait
    self.proxy = http_proxy_url
    if not force:
        try:
            # a hit in any existing filter means the URL was crawled already
            for bloomfilter in PhantomjsSpider._url_buff:
                assert _hash not in bloomfilter
        except AssertionError:
            pass
        else:
            try:
                PhantomjsSpider._url_buff[-1].add(_hash)
            except IndexError:
                # the newest filter is at capacity: grow by 1,000,000 slots
                PhantomjsSpider._url_buff.append(
                    BloomFilter(PhantomjsSpider._url_buff[-1].capacity + 1000000))
                PhantomjsSpider._url_buff[-1].add(_hash)
            _queue.put(self._go)
    else:
        _queue.put(self._go)
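# A minimal sketch of the dedup pattern used above, assuming pybloom's
# BloomFilter, whose add() raises IndexError once the filter is at capacity.
# A list of filters then behaves like one set that grows in 1,000,000-key
# steps (the same pattern reappears in the Spider.__init__ further below).
from pybloom import BloomFilter

url_buff = [BloomFilter(1000000)]

def seen_before(h):
    """Record hash h; return True if it was already recorded."""
    if any(h in bf for bf in url_buff):
        return True
    try:
        url_buff[-1].add(h)
    except IndexError:  # newest filter is full, append a bigger one
        url_buff.append(BloomFilter(url_buff[-1].capacity + 1000000))
        url_buff[-1].add(h)
    return False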
def __init__(self, capacity=(1 << 30), error_rate=0.0001):
    self.bloomFilters = []
    self.leaderIndex = 0
    self.followerIndex = 1
    self.bloomFilters.append(BloomFilter(capacity, error_rate))
    self.bloomFilters.append(BloomFilter(capacity, error_rate))
    self.status = 0
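# A hypothetical sketch (not from the source) of how a leader/follower pair
# like the one initialised above is commonly used: inserts go to the leader,
# and when it fills up the stale follower is recycled as the new leader, so
# the set gradually ages out old keys while lookups consult both filters.
from pybloom import BloomFilter

class RotatingBloom(object):
    def __init__(self, capacity=100000, error_rate=0.0001):
        self.capacity = capacity
        self.error_rate = error_rate
        self.filters = [BloomFilter(capacity, error_rate),
                        BloomFilter(capacity, error_rate)]
        self.leader, self.follower = 0, 1

    def add(self, key):
        self.filters[self.leader].add(key)
        if self.filters[self.leader].count >= self.capacity:
            # leader is full: recycle the stale follower as the new leader
            self.filters[self.follower] = BloomFilter(self.capacity,
                                                      self.error_rate)
            self.leader, self.follower = self.follower, self.leader

    def __contains__(self, key):
        return (key in self.filters[self.leader]
                or key in self.filters[self.follower])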
class CrawlBSF:
    request_headers = {
        'host': "www.mafengwo.cn",
        'connection': "keep-alive",
        'cache-control': "no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
    }

    cur_level = 0
    max_level = 5
    dir_name = 'iterate/'
    iter_width = 50
    downloaded_urls = []

    du_md5_file_name = dir_name + 'download.txt'
    du_url_file_name = dir_name + 'urls.txt'

    bloom_downloaded_urls = BloomFilter(1024 * 1024 * 16, 0.01)
    bloom_url_queue = BloomFilter(1024 * 1024 * 16, 0.01)

    cur_queue = deque()
    child_queue = deque()

    def __init__(self, url):
        self.root_url = url
        self.cur_queue.append(url)
        self.du_file = open(self.du_url_file_name, 'a+')
        try:
            self.dumd5_file = open(self.du_md5_file_name, 'r')
            self.downloaded_urls = self.dumd5_file.readlines()
            self.dumd5_file.close()
            for urlmd5 in self.downloaded_urls:
                self.bloom_downloaded_urls.add(urlmd5[:-2])
        except IOError:
            print "File not found"
        finally:
            self.dumd5_file = open(self.du_md5_file_name, 'a+')

    def enqueueUrl(self, url):
        # enqueue only URLs that are neither queued nor already downloaded
        # (the original referenced a global `crawler`; the class's own
        # bloom_downloaded_urls attribute is what is meant here)
        if url not in self.bloom_url_queue and hashlib.md5(
                url).hexdigest() not in self.bloom_downloaded_urls:
            self.child_queue.append(url)
            self.bloom_url_queue.add(url)

    def dequeuUrl(self):
        try:
            url = self.cur_queue.popleft()
            return url
        except IndexError:
            return None

    def close(self):
        self.dumd5_file.close()
        self.du_file.close()
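# A hypothetical driver loop (not in the source) showing how cur_queue and
# child_queue support breadth-first crawling: the current level is drained,
# newly discovered links collect in child_queue, then the queues swap and the
# level counter advances. fetch_links(url) is an assumed helper returning the
# outgoing links of a page.
def crawl(crawler, fetch_links):
    while crawler.cur_level < crawler.max_level:
        url = crawler.dequeuUrl()
        if url is None:
            # current level exhausted: promote the children to current
            crawler.cur_queue, crawler.child_queue = (
                crawler.child_queue, crawler.cur_queue)
            crawler.cur_level += 1
            continue
        for link in fetch_links(url):
            crawler.enqueueUrl(link)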
def test_intersection_k_fail():
    # note: assumes a module-level import such as
    # `from nose.tools import assert_raises as assertRaises`
    bloom_one = BloomFilter(100, 0.001)
    bloom_two = BloomFilter(100, 0.01)

    def _run():
        new_bloom = bloom_one.intersection(bloom_two)

    assertRaises(ValueError, _run)
def test_intersection_capacity_fail(self):
    bloom_one = BloomFilter(1000, 0.001)
    bloom_two = BloomFilter(100, 0.001)

    def _run():
        new_bloom = bloom_one.intersection(bloom_two)

    self.assertRaises(ValueError, _run)
def test_union_k_fail(self):
    bloom_one = BloomFilter(100, 0.01)
    bloom_two = BloomFilter(100, 0.001)

    def _run():
        new_bloom = bloom_one.union(bloom_two)

    self.assertRaises(ValueError, _run)
def test_union_capacity_fail():
    # note: assumes a module-level import such as
    # `from nose.tools import assert_raises as assertRaises`
    bloom_one = BloomFilter(1000, 0.001)
    bloom_two = BloomFilter(100, 0.001)

    def _run():
        new_bloom = bloom_one.union(bloom_two)

    assertRaises(ValueError, _run)
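# The four failure tests above pin down pybloom's contract for set
# operations: union() and intersection() require both filters to share the
# same capacity and error_rate (hence the same bit-array layout and hash
# count), otherwise a ValueError is raised. A quick interactive check:
from pybloom import BloomFilter

a = BloomFilter(100, 0.001)
b = BloomFilter(100, 0.001)
a.add('x')
b.add('y')
u = a.union(b)                 # fine: parameters match
print('x' in u and 'y' in u)   # True
try:
    a.union(BloomFilter(200, 0.001))
except ValueError as e:
    print('mismatched filters: %s' % e)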
def __init__(self):
    self.monosyllableMorphemeBf = BloomFilter(capacity=20000,
                                              error_rate=0.0001)
    self.disyllableMorphemeBf = BloomFilter(capacity=50000,
                                            error_rate=0.0001)
    self.multisyllableMorphemeBf = BloomFilter(capacity=500000,
                                               error_rate=0.0001)
    # load the monosyllabic, disyllabic and multisyllabic morpheme tables
    self.load_morphemes()
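# A hypothetical lookup helper (not in the source) showing how the three
# filters above could back a fast morpheme test: dispatching on token length
# keeps each filter small, so each stays within its configured false-positive
# rate.
def is_morpheme(self, token):
    if len(token) == 1:
        return token in self.monosyllableMorphemeBf
    if len(token) == 2:
        return token in self.disyllableMorphemeBf
    return token in self.multisyllableMorphemeBf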
def test_union():
    # note: assumes a module-level import providing assert_,
    # e.g. `from nose.tools import assert_true as assert_`
    bloom_one = BloomFilter(100, 0.001)
    bloom_two = BloomFilter(100, 0.001)
    chars = [chr(i) for i in range(97, 123)]  # 'a'..'z'
    for char in chars[len(chars) / 2:]:
        bloom_one.add(char)
    for char in chars[:len(chars) / 2]:
        bloom_two.add(char)
    new_bloom = bloom_one.union(bloom_two)
    for char in chars:
        assert_(char in new_bloom)
def test_union(self):
    bloom_one = BloomFilter(100, 0.001)
    bloom_two = BloomFilter(100, 0.001)
    chars = [chr(i) for i in range_fn(97, 123)]  # 'a'..'z'
    for char in chars[int(len(chars) / 2):]:
        bloom_one.add(char)
    for char in chars[:int(len(chars) / 2)]:
        bloom_two.add(char)
    new_bloom = bloom_one.union(bloom_two)
    for char in chars:
        self.assertTrue(char in new_bloom)
def __init__(self, url, charset=None, data=None, headers=None, timeout=3,
             retry_times=30, retry_delta=3, http_proxy=None, force=False):
    '''
    url          target URL
    charset      character encoding
    data         POST payload, string
    headers      custom request headers, dict
    timeout      timeout in seconds, int, e.g. 3
    retry_times  number of retries, int, e.g. 3
    retry_delta  seconds between retries, int
    http_proxy   proxy address such as "192.168.1.1:3128", or a callable
                 like lambda: "192.168.1.1:3128"
    force        crawl the URL even if it was crawled before
    '''
    self.url = url
    self.data = data
    self.timeout = timeout
    self.retry_times = retry_times
    self.retry_delta = retry_delta
    self.charset = charset
    self.headers = headers
    self.http_proxy = http_proxy
    if not Spider._url_buff:
        Spider._url_buff = [BloomFilter(1000000)]
    global _queue
    if data:
        _hash = md5(url) + md5(data)
    else:
        _hash = md5(url)
    if not force:
        try:
            # a hit in any existing filter means the URL was crawled already
            for bloomfilter in Spider._url_buff:
                assert _hash not in bloomfilter
        except AssertionError:
            pass
        else:
            try:
                Spider._url_buff[-1].add(_hash)
            except IndexError:
                # the newest filter is at capacity: grow by 1,000,000 slots
                Spider._url_buff.append(
                    BloomFilter(Spider._url_buff[-1].capacity + 1000000))
                Spider._url_buff[-1].add(_hash)
            _queue.put_priority(self.__dict__, 0)
    else:
        _queue.put_priority(self.__dict__, 0)
def test_intersection(self):
    bloom_one = BloomFilter(100, 0.001)
    bloom_two = BloomFilter(100, 0.001)
    chars = [chr(i) for i in range(97, 123)]  # 'a'..'z'
    for char in chars:
        bloom_one.add(char)
    for char in chars[:len(chars) / 2]:
        bloom_two.add(char)
    new_bloom = bloom_one.intersection(bloom_two)
    for char in chars[:len(chars) / 2]:
        self.assert_(char in new_bloom)
    for char in chars[len(chars) / 2:]:
        self.assert_(char not in new_bloom)
def __init__(self, cachefile, capacity=1000000, error_rate=0.001):
    self.cachefile = cachefile
    if os.name == 'nt' or not cachefile:
        # pure-Python pybloom on Windows (or when no cache file is given);
        # fromfile() reads a binary dump, so open in 'rb' mode
        from pybloom import BloomFilter
        if self.cache():
            with open(cachefile, 'rb') as fp:
                self.filter = BloomFilter.fromfile(fp)
        else:
            self.filter = BloomFilter(capacity=capacity,
                                      error_rate=error_rate)
    elif os.name == 'posix':
        # C-backed, mmap-persisted pybloomfilter on POSIX systems
        from pybloomfilter import BloomFilter
        if self.cache():
            self.filter = BloomFilter.open(self.cachefile)
        else:
            self.filter = BloomFilter(capacity, error_rate, cachefile)
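# A minimal usage sketch, assuming the class above is named UrlCache and that
# its cache() method reports whether self.cachefile already exists on disk.
# The os.name branch is invisible to callers: both backends expose add() and
# the `in` operator on self.filter.
cache = UrlCache('seen_urls.bloom')
if 'http://example.com/' not in cache.filter:
    cache.filter.add('http://example.com/')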
def start():
    res = request_get(biqukan_url)
    index = BeautifulSoup(res, features=features)
    if os.path.exists(bf_file):
        LOG.info('loading bloom filter from file')
        bf = BloomFilter.fromfile(open(bf_file, 'rb'))
    else:
        LOG.info('creating new bloom filter')
        bf = BloomFilter(500000)
    try:
        pool = Pool(size=pool_size)
        # the Chinese literals below are section titles on the target site
        book_urls = find_wanben()
        book_urls += find_new_storage_block(index)
        book_urls += find_recommend_block(index, u'强力推荐')
        book_urls += find_type_block(index, u'玄幻小说')
        book_urls += find_type_block(index, u'修真小说')
        book_urls += find_type_block(index, u'都市小说')
        book_urls += find_type_block(index, u'穿越小说')
        book_urls += find_type_block(index, u'网游小说')
        book_urls += find_type_block(index, u'科幻小说')
        book_urls += find_new_update_block(index)
        book_num = len(book_urls)
        for i, url in enumerate(book_urls):
            pool.spawn(download_book, url, bf)
            # download_book(url, bf)
            LOG.info(u'started download %s, %s books left',
                     i + 1, book_num - i - 1)
        pool.join()
        LOG.info(u'all downloads finished')
    except Exception as e:
        LOG.exception(e)
    finally:
        bf.tofile(open(bf_file, 'wb'))
def determine_lookup_speed_threshold(self):
    from time import time
    # do each one 5 times
    bf = BloomFilter(capacity=self.bloom_size, error_rate=self.bloom_error)
    count = 1
    repetitions = 5

    self_bf_holder = self.bf
    self.bf = bf
    while True:
        bf.add('andrew_' + str(count))
        bin_faster_count = 0
        for j in xrange(repetitions):
            # Linear scan
            t1 = time()
            self.linear_scan_count('andrew')
            t2 = time()
            linear_time = t2 - t1

            # Binary search
            t1 = time()
            self.binsearch_count('andrew')
            t2 = time()
            bin_time = t2 - t1

            bin_faster_count += int(bin_time < linear_time)

        # stop once binary search wins at least 75% of the repetitions
        if 1. * bin_faster_count / repetitions >= 0.75:
            del bf
            self.bf = self_bf_holder
            return count

        count += 1
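# The routine above finds, empirically, the collection size at which binary
# search starts beating a linear scan. A generic, self-contained version of
# the same idea (with hypothetical probe functions, and timeit for steadier
# numbers) might look like this:
import timeit

def crossover_size(make_input, slow_fn, fast_fn, start=1, limit=1 << 20):
    """Return the first input size where fast_fn outruns slow_fn."""
    n = start
    while n < limit:
        data = make_input(n)
        slow = timeit.timeit(lambda: slow_fn(data), number=5)
        fast = timeit.timeit(lambda: fast_fn(data), number=5)
        if fast < slow:
            return n
        n *= 2  # grow geometrically instead of one element at a time
    return limit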
def ingest_payloads(filename):
    stats = {
        "days_per_user": {},
        "users_per_versions": {},
        "num_data_days": 0,
        "ignored_users": set(),
        "num_payloads": 0,
        "ignored_submissions": 0,
        "duplicate_submissions": 0,
        "duplicate_submission_interest": 0,
    }
    bloom_filter = BloomFilter(capacity=10000000, error_rate=0.001)

    with open(filename, "r") as infile:
        db = SQLBackend.instance()
        session = db.get_session()
        session.begin(subtransactions=True)
        for line in infile:
            payload = json.loads(line)
            try:
                ingest_payload(payload, session, stats, bloom_filter)
            except Exception:
                # a bad payload rolls back its own changes; the rest continue
                session.rollback()
        session.commit()

    sys.stdout.write("===== Payload ingestion =====")
    sys.stdout.flush()
    print "\n"
    return stats
def __init__(self, domain, first_url=None, first_url_callback=None,
             first_url_follow=True, url_amount=100000,
             requests_session=None, tls=False, max_depth=100):
    self.domain = domain
    self.max_depth = max_depth
    self.bf = BloomFilter(capacity=url_amount, error_rate=1 / url_amount)
    self.url_queue = queue.Queue()
    if first_url is None:
        if tls:
            first_url = 'https://' + domain
        else:
            first_url = 'http://' + domain
    self.url_queue.put(
        (first_url, first_url_callback, first_url_follow, 0))
    if requests_session is None:
        self.session = requests.Session()
    else:
        self.session = requests_session
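# A hypothetical worker loop (not in the source) consuming the queue seeded
# above: self.bf deduplicates URLs so each is fetched at most once, and the
# depth counter carried in each tuple enforces max_depth. extract_links is an
# assumed helper returning same-domain links from a response.
def run(self):
    while not self.url_queue.empty():
        url, callback, follow, depth = self.url_queue.get()
        if url in self.bf or depth > self.max_depth:
            continue
        self.bf.add(url)
        resp = self.session.get(url)
        if callback is not None:
            callback(resp)
        if follow:
            for link in extract_links(resp, self.domain):
                self.url_queue.put((link, callback, follow, depth + 1))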
def __init__(self):
    # mail
    # self.mailer = MailSender.from_settings(settings)

    # mongo
    self.connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                          settings['MONGODB_PORT'])
    db = self.connection[settings['MONGODB_DB']]
    self.collection = db[settings['MONGODB_COLLECTION']]
    self.collectionurllog = db[settings['MONGODB_COLLECTION'] + "_urllog"]

    # bloom file
    filename = settings['MONGODB_COLLECTION'] + ".blm"

    # pybloom: size the filter 10% above the expected number of records
    num = (int(settings['CrawlCar_Num']) + self.collection.count()) * 1.1
    self.df = BloomFilter(capacity=num, error_rate=0.01)

    # read hashes back from the .blm sidecar file if it exists,
    # otherwise seed the filter (and the file) from the collection
    isexists = os.path.exists(filename)
    self.fa = open(filename, "a")
    if isexists:
        fr = open(filename, "r")
        lines = fr.readlines()
        for line in lines:
            line = line.strip('\n')
            self.df.add(line)
        fr.close()
    else:
        for i in self.collection.find():
            if "status" in i.keys():
                item = i["status"]
                item = md5(item).hexdigest()
                self.df.add(item)
                self.fa.writelines(item + '\n')

    # count
    self.counts = 0
def estimate_overlap(source_files, target_files, gran='word', n=8,
                     capacity=10000, error_rate=1e-5, header=0,
                     interval=100000):
    """
    Estimate overlap of target_files with source_files using n-grams.

    gran: granularity of the token. It can be 'word' or 'char'
    header: number of lines of each file to skip. In our format the first
        line is the url.
    """
    if gran not in ('word', 'char'):
        raise ValueError("gran has to be 'word' or 'char'")

    if isinstance(source_files, str):
        source_files = [source_files]
    if isinstance(target_files, str):
        target_files = [target_files]

    bf = BloomFilter(capacity=capacity, error_rate=error_rate)
    for source_file in source_files:
        bf = build_ngram(file=source_file, bf=bf, gran=gran, n=n,
                         uncase=True, alphanumeric=True, interval=interval)

    results = []
    for file in target_files:
        print(file)
        # pass the caller's n through (the original hard-coded n=8 here)
        results.append(
            estimate_overlap_bf(bf, file, gran=gran, n=n, header=header))
    return results
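# A toy, self-contained illustration of the estimation idea above (the real
# build_ngram / estimate_overlap_bf helpers are not shown in the source):
# hash every source n-gram into a bloom filter, then report the fraction of
# target n-grams that hit it.
from pybloom import BloomFilter

def word_ngrams(text, n):
    words = text.lower().split()
    return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]

source = "the quick brown fox jumps over the lazy dog"
target = "a quick brown fox leaps over a sleepy dog"

bf = BloomFilter(capacity=1000, error_rate=1e-5)
for gram in word_ngrams(source, 2):
    bf.add(gram)

hits = sum(1 for gram in word_ngrams(target, 2) if gram in bf)
total = len(word_ngrams(target, 2))
print('estimated overlap: %.2f' % (hits / float(total)))  # 0.25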
def open_spider(self, spider):
    brandName = 'mybloom'
    isexists = os.path.exists(brandName + '.blm')
    if isexists:
        self.bf = BloomFilter.fromfile(open(brandName + '.blm', 'rb'))
    else:
        self.bf = BloomFilter(100000, 0.001)
def build_bloom_filter_and_iblt(m, include_value_in_iblt=False):
    c = 8 * math.pow(math.log(2), 2)
    tau = 16.5
    n = len(selected_txs)
    alpha = n / (c * tau)
    # print(alpha * tau)
    if m <= n:
        fpr = 0.1
    else:
        # Graphene-style FPR of a / (m - n); the original wrote
        # `alpha / m - n`, which precedence turns into (alpha / m) - n
        fpr = alpha / (m - n)
    print("Mempool difference", abs(m - n))
    n_cells = int((4 / 3) * abs(m - n)) + 30
    print('n_cells', n_cells)
    logging.info("Calculated FPR: %f" % fpr)
    fpr = 0.1  # NOTE: overrides the calculated FPR with a fixed value
    b = BloomFilter(capacity=n, error_rate=fpr)
    i = IBLT(m=n_cells, k=3, key_size=32, value_size=0)
    for tx in selected_txs:
        b.add(tx['hash'])
        v = ''
        if include_value_in_iblt:
            v = tx_to_bytes(tx)
        i.insert(tx['hash'], v)
    return b, i
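# A heavily hedged sketch of the receiving side of this Graphene-style
# exchange (the IBLT methods used here are assumptions, not confirmed by the
# source): the receiver passes its own mempool through the bloom filter,
# inserts the survivors into a local IBLT of the same shape, and peels the
# symmetric difference from the two IBLTs.
def reconcile(b, i, my_mempool, n_cells):
    candidates = [tx for tx in my_mempool if tx['hash'] in b]
    local = IBLT(m=n_cells, k=3, key_size=32, value_size=0)
    for tx in candidates:
        local.insert(tx['hash'], '')
    diff = i.subtract(local)    # assumed IBLT API
    return diff.list_entries()  # assumed IBLT API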
def returnItemsWithMinSupportV3(itemSet, lenItem, transactionList, minSupport,
                                freqSet):
    _itemSet = set()
    localSet = defaultdict(int)
    if len(itemSet):
        filterCdd = BloomFilter(capacity=len(itemSet), error_rate=0.0001)
    else:
        print("As I say, ValueError: Capacity must be > 0")
        return set([])

    print("Store cdds in BF ... - %s" % getTime())
    for val in itemSet:
        # TODO: use a counting BF and skip inserting candidates that already
        # reached minSupport * len(transactionList); or, without a counting
        # BF, skip re-inserting candidates already present in the filter.
        filterCdd.add(val)

    print("Mapping cddFromTrans on BF ... - %s" % getTime())
    for trans in transactionList:
        for cdd in combinations(trans, lenItem):
            cdd = frozenset(cdd)
            if cdd in filterCdd:
                freqSet[cdd] += 1    # global count
                localSet[cdd] += 1   # local count, filtered by minSupport below

    print("Filter cdds that less than minSup. - %s" % getTime())
    for item, count in localSet.items():
        support = float(count) / len(transactionList)
        if support > minSupport:
            _itemSet.add(item)
    return _itemSet
def user_init():
    users = BloomFilter(10000000, 0.001)
    # path kept verbatim from the source (a local data file of user ids)
    with open(u"D:/工作/数据美化/data/简书用户id1.txt") as f:
        for line in f:
            users.add(line.strip())
    return users
def spider_update_Init(dbname, website, carnum):
    # Mongo setting
    # spider_original_Init(dbname, website, carnum)

    # Mongo connection
    connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                     settings['MONGODB_PORT'])
    dbdata = connection[dbname]
    collectiondata = dbdata[website]

    # pybloom: size the filter 10% above the expected number of records
    num = (int(carnum) + collectiondata.count()) * 1.1
    df = BloomFilter(capacity=num, error_rate=0.01)

    # collect each distinct, unsold URL exactly once
    urllist = []
    for i in collectiondata.find():
        if "url" in i.keys():
            item = i["url"]
            if "status" in i.keys():
                if i['status'].find('sold') != -1:  # skip cars already sold
                    continue
            itemmd5 = md5(item).hexdigest()
            # add() returns True when the hash was already present
            returndf = df.add(itemmd5)
            if not returndf:
                urllist.append(item)
    connection.close()
    return urllist
def __init__(self, password_file=pw_file, fp_rate=0.001, ignore_case=True):
    self._log = logging.getLogger('passcheck.passcheck.PassCheck')
    self._fp_rate = fp_rate
    self._pw_file = os.path.realpath(password_file)
    self._ignore_case = ignore_case

    self._log.debug('Counting items in password file')
    with open(self._pw_file, 'r') as f:
        for line_num, line in enumerate(f):
            pass
    self._num_passwords = line_num + 1

    self._log.debug('Creating BloomFilter with capacity=%d' %
                    self._num_passwords)
    self._bf = BloomFilter(capacity=self._num_passwords,
                           error_rate=self._fp_rate)

    self._log.debug('Loading passwords into BloomFilter')
    num_added = 0
    with open(self._pw_file, 'r') as f:
        for line in f:
            pw = line[:-1]  # strip the trailing newline
            if self._ignore_case:
                pw = pw.lower()
            if not self._bf.add(pw):  # add() returns False for new keys
                num_added += 1
            if num_added > self._num_passwords:
                e = Exception('Password file was modified during load')
                self._log.error(e)
                raise e

    # Handle possibility of duplicates (especially if case is ignored)
    if num_added < self._num_passwords:
        self._log.warn('Expected %d passwords, but added %d' %
                       (self._num_passwords, num_added))
        self._num_passwords = num_added
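# A short usage sketch, assuming the class above is PassCheck (only its
# __init__ appears in the source), querying its internal filter directly.
# Bloom filters have no false negatives, so a miss is a definitive "not in
# the list"; a hit may be a false positive at roughly the configured fp_rate.
pc = PassCheck(password_file='rockyou.txt', fp_rate=0.001)
candidate = 'hunter2'
if candidate.lower() in pc._bf:
    print('password appears in the list (or is a rare false positive)')
else:
    print('password is definitely not in the list')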
def global_rebuild(self):
    # print "Lookup"
    # for e in sbf.lookup:
    #     print e
    opt_m = self._global_optimal(self.num_filter, self.total_bits,
                                 self.num_insert, self.lookup)
    self.bf = []
    for i, m in enumerate(opt_m):
        if m > 0:
            self.bf.append(BloomFilter(int(m), len(self.mapping_table[i])))
        else:
            self.bf.append(None)

    # Clear counters or not
    # self.lookup = [0]*self.num_filter

    # Re-insert all keys into the sbf
    for i, m in enumerate(self.mapping_table):
        for n in m:
            if self.bf[i] is not None:
                if not is_number:
                    self.bf[i].add(padding_zero(n, item_len))
                else:
                    self.bf[i].add(int(n))

    self.target_fpp = [
        b.target_fpp if b is not None else 1.0 for b in self.bf
    ]
    self.old_fr = [float(l) / total_lookup for l in self.lookup]
def __init__(self, lines, estimated_lines, dup_proportion, truncate):
    super().__init__()
    estimated_dups = estimated_lines * dup_proportion
    self.truncate = truncate
    self.potential = BloomFilter(capacity=estimated_dups, error_rate=0.001)
    self.seen = set()
    self._find_collisions(lines, estimated_lines)
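# A hypothetical _find_collisions (only its call appears in the source): pass
# every line through the bloom filter; anything the filter claims to have
# seen is a *potential* duplicate and goes into the exact set self.seen,
# which later passes can consult without false positives. Note the filter
# above is sized for the expected duplicates, so a faithful reconstruction
# may need a larger capacity or a different insertion rule.
def _find_collisions(self, lines, estimated_lines):
    for line in lines:
        key = line[:self.truncate] if self.truncate else line
        if key in self.potential:
            self.seen.add(key)  # probable duplicate, remembered exactly
        else:
            self.potential.add(key)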
def main(argv):
    error_rate = 0.01  # assumed default; overridden by the first CLI argument
    if argv:
        error_rate = float(argv[0])
        print "[BUILDING] Using error-rate: {}".format(error_rate)
    if os.path.isfile(nsrl_path):
        print "[BUILDING] Reading in NSRL Database"
        with open(nsrl_path) as f_line:
            # Strip off header
            _ = f_line.readline()
            print "[BUILDING] Calculating number of hashes in NSRL..."
            num_lines = sum(bl.count("\n") for bl in blocks(f_line))
            print "[BUILDING] There are %s hashes in the NSRL Database" % num_lines
        with open(nsrl_path) as f_nsrl:
            # Strip off header
            _ = f_nsrl.readline()
            print "[BUILDING] Creating bloomfilter"
            bf = BloomFilter(num_lines, error_rate)
            print "[BUILDING] Inserting hashes into bloomfilter"
            for line in f_nsrl:
                md5_hash = line.split(",")[1].strip('"')
                if md5_hash:
                    try:
                        md5 = binascii.unhexlify(md5_hash)
                        bf.add(md5)
                    except Exception as e:
                        print "[ERROR] %s" % e
            print "[BUILDING] NSRL bloomfilter contains {} items.".format(
                len(bf))
            with open('nsrl.bloom', 'wb') as nb:
                bf.tofile(nb)
            print "[BUILDING] Complete"
    else:
        print "[ERROR] No such file or directory: %s" % nsrl_path
    return
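# A complementary lookup sketch, assuming pybloom's BloomFilter wrote the
# nsrl.bloom file above: load it back and test an MD5. The digest must be
# unhexlified to raw bytes exactly as during the build.
import binascii
from pybloom import BloomFilter

with open('nsrl.bloom', 'rb') as f:
    bf = BloomFilter.fromfile(f)

md5_hex = 'd41d8cd98f00b204e9800998ecf8427e'  # MD5 of the empty string
if binascii.unhexlify(md5_hex) in bf:
    print('hash is probably in the NSRL set')
else:
    print('hash is definitely not in the NSRL set')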
def feedBloom(row):
    f = BloomFilter(capacity=200, error_rate=0.6)
    f.add(row.src_ip)
    f.add(row.src_ip[0:5])
    f.add(row.src_ip[5:8])
    f.add(row.target_ip)
    # np.int is removed in recent NumPy; the builtin int works everywhere
    return np.array(f.bitarray.tolist(), dtype=int)
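# The function above uses a tiny bloom filter as a fixed-width feature
# hasher: each row's IP fragments set a few bits, and the raw bit array
# becomes a binary feature vector of constant length. A hypothetical row
# type to exercise it:
import collections

Row = collections.namedtuple('Row', ['src_ip', 'target_ip'])
vec = feedBloom(Row(src_ip='192.168.1.10', target_ip='10.0.0.7'))
print(vec.shape)  # fixed length, set by the filter's capacity/error_rate
print(vec.sum())  # number of bits set by the four add() calls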