class SpellChecker:
    def __init__(self):
        self.dictionary = BloomFilter()
        word_list = open('/usr/share/dict/words', 'r')
        for word in word_list:
            self.dictionary.add(word.strip())

    def valid(self, string):
        return self.dictionary.includes(string)
def testDumpGzippedAndLoadBloomFilter(self):
    bloom_filter = BloomFilter(self.BLOOM_CAPACITY, self.BLOOM_ERROR_RATE)
    for key in self.all_keys:
        bloom_filter.add(key)
    dump_bytes = bloom_filter.dump(gzipped=True)
    bloom_filter2 = BloomFilter.load(dump_bytes)
    self.assertEqual(bloom_filter, bloom_filter2)
    self.check_contains(bloom_filter2)
def testDumpAndLoadBase64BloomFilter(self):
    bloom_filter = BloomFilter(self.BLOOM_CAPACITY, self.BLOOM_ERROR_RATE)
    for key in self.all_keys:
        bloom_filter.add(key)
    dump_str = bloom_filter.dump_to_base64_str(gzipped=True)
    bloom_filter2 = BloomFilter.load_from_base64_str(dump_str)
    self.assertEqual(bloom_filter, bloom_filter2)
    self.check_contains(bloom_filter2)
class EventValidator:
    last = -1
    current = -1
    bloom = BloomFilter(4980000, 0.01)
    orderErrors = []
    uniqueErrors = []

    def __init__(self):
        pass

    def checkOrder(self, value):
        if self.last == -1:
            self.last = value
        else:
            self.current = value
            if self.last > self.current:
                self.orderErrors.append(self.current)
            else:
                self.last = self.current

    def checkUnique(self, value):
        inside = value in self.bloom
        if inside:
            self.uniqueErrors.append(value)
        else:
            self.bloom.add(value)
def __init__(self, file_name):
    self.bf = BloomFilter(10000000, 8)
    input_file = open(file_name, "r")
    for file_line in input_file:
        file_line = file_line.rstrip()
        self.bf.add(file_line)
    input_file.close()
def build_entity_bloom(p="../../data/buboqa/indexes/names_2M.pkl",
                       outp="../../data/buboqa/data/names_2M.labels.bloom"):
    tt = q.ticktock("bloom-entities")
    tt.tick("loading names pkl")
    entnames = pkl.load(open(p, "rb"))
    tt.tock("loaded names pkl")
    tt.tick("building bloom filter")
    fset = BloomFilter(2e6, 1e-3)
    for uri, names in tqdm(entnames.items()):
        for name in names:
            fset.add(name)
    tt.tock("built")
    tt.tick("saving bloom filter")
    with open(outp, "wb") as f:
        pkl.dump(fset, f)
    tt.tock("saved")
    return fset
class AutoResetBloomFilter:
    RESET_TIME = 60 * 60 * 10  # NOTE: reset once every 10 hours

    def __init__(self):
        self.bf = BloomFilter()
        self.last_reset_time = int(time.time())

    def add(self, v):
        now = int(time.time())
        if now - self.last_reset_time > self.RESET_TIME:
            logging.info("bloom filter reset")
            self.bf = BloomFilter()
            self.last_reset_time = now
        self.bf.add(v)

    def __contains__(self, key):
        return key in self.bf
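# Hypothetical usage sketch for AutoResetBloomFilter above (the keys are
# illustrative; assumes the parameterless BloomFilter() plus the time and
# logging imports the class relies on):
seen = AutoResetBloomFilter()
seen.add("event-123")
print("event-123" in seen)  # True until the next 10-hour reset
print("event-456" in seen)  # False: never added (barring a false positive)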
def __init__(self, base_url: str, cfg: configparser.ConfigParser) -> None:
    self.base_url = base_url
    self.config = cfg
    self.closure_url = '{scheme}://{netloc}'.format(
        scheme=urlsplit(self.base_url).scheme,
        netloc=urlsplit(self.base_url).netloc)
    self.pool = ThreadPoolExecutor(
        max_workers=int(self.config['MAX_WORKER']))
    self.task_queue = Queue(maxsize=3 * int(self.config['MAX_WORKER']))
    self.task_queue.put(self.base_url)
    self.crawled_pages = BloomFilter(
        max_elements=int(self.config['MAX_ELEMENTS']),
        error_rate=float(self.config['ERROR_RATE']))
    self.crawled_pages.add(self.base_url)
    self.total = 1
    self.lock = Lock()
    self.run_time = time.time()
def process_two(pipe21, pipe23):
    pid = 1
    counter = BloomFilter(n, p, 0)
    counter = recv_message(pipe21, pid, counter, 'g')
    counter = send_message(pipe21, pid, counter, 'h')
    counter = send_message(pipe23, pid, counter, 'i')
    counter = recv_message(pipe23, pid, counter, 'j')
    print_history(counter, pid)
def __init__(self,
             event_bus: EndpointAPI,
             peer_pool: ETHProxyPeerPool,
             tx_validation_fn: Callable[[BaseTransactionFields], bool],
             token: CancelToken = None) -> None:
    super().__init__(token)
    self._event_bus = event_bus
    self._peer_pool = peer_pool
    if tx_validation_fn is None:
        raise ValueError('Must pass a tx validation function')
    self.tx_validation_fn = tx_validation_fn
    # 1m should give us 9000 blocks before that filter becomes less reliable
    # It should take up about 1mb of memory
    self._bloom = BloomFilter(max_elements=1000000)
    self._bloom_salt = str(uuid.uuid4())
async def filter(self, existed_vid_list=None):
    # Construct a bloom filter and seed it with the already-seen video ids.
    bloom = BloomFilter(max_elements=config.MAX_ESTIMATE_RECORD_NUMBER)
    for ele in existed_vid_list or []:
        bloom.add(ele)  # add origin_id into the filter
    latest_results = []  # final result to output
    # The paging on xinpianchang is unreliable, so keep a buffer of repeated
    # items before concluding we have reached the end.
    buffer = config.check_latest_buffer
    latest = await self.fetch()
    for ele in latest:
        if ele['vid'] in bloom:  # determine whether the ele is recorded
            # If the ele is recorded, the upcoming elements are usually
            # repeats, so we return the current latest_results. Because of
            # the unreasonable paging issue, the buffer makes sure we really
            # made it to the end.
            if buffer == 0:
                del bloom  # release memory
                return jmespath.search('[]', latest_results) if latest_results else []
            else:
                buffer -= 1
                continue
        else:
            bloom.add(ele['vid'])  # add origin_id into the filter
            latest_results.append(ele)
    else:
        return jmespath.search('[]', latest_results) if latest_results else []
class DoubleBloom:
    def __init__(self, values, probability):
        self.zero_bloom = BloomFilter(values, probability, False)
        self.one_bloom = BloomFilter(values, probability, False)
        self.next_level = None

    def insert(self, key, value):
        if value == '0':
            self.zero_bloom.insert(key)
        else:
            self.one_bloom.insert(key)

    def get_value(self, key):
        if self.zero_bloom.contains(key):
            if self.one_bloom.contains(key):
                if self.next_level is not None:
                    return self.next_level.get_value(key)
                return 'Both'
            return '0'
        elif self.one_bloom.contains(key):
            return '1'
        return None

    def add_level(self, values, probability):
        self.next_level = DoubleBloom(values, probability)
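# Hypothetical usage sketch for DoubleBloom above (the constructor arguments
# are illustrative and are passed straight through to BloomFilter):
db = DoubleBloom(values=1000, probability=0.01)
db.insert("key-a", "0")
db.insert("key-b", "1")
print(db.get_value("key-a"))    # '0'
print(db.get_value("key-b"))    # '1'
print(db.get_value("missing"))  # None, barring false positives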
def do_add_strings(self, string_lists):
    try:
        for key in string_lists:
            document = {}
            bf = BloomFilter(max_elements=2**16, error_rate=0.01)
            for element in string_lists.get(key):
                bf.add(element)
            bf_pickle = cPickle.dumps(bf)
            document[key] = bf_pickle
            sig = key.split('_')
            for type in self.collection:
                if type == sig[len(sig) - 1]:
                    size = sys.getsizeof(document[key])
                    print("async save bloom tree into mongoDB: %s \t size is %f M"
                          % (key, size / 1024 / 1024))
                    break
    except Exception as e:
        print(e)
def check():
    bloom_filter = BloomFilter(max_elements=2000000, error_rate=0.0001)
    with open('urls.jsonl', mode='w') as f:
        for file_name in os.listdir("url_outlinks"):
            with open(f"url_outlinks/{file_name}", mode='r') as fp:
                data = json.load(fp)
                for domain, obj in data.items():
                    for url, outlinks in obj.items():
                        if url not in bloom_filter:
                            f.write(f"""{json.dumps({"url": url, "no_outlinks": len(outlinks), "outlinks": outlinks})}\n""")
                            bloom_filter.add(url)
        f.flush()
class BLOOMDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None):
        self.file = None
        self.fingerprints = BloomFilter(10000, 0.00001)

    @classmethod
    def from_settings(cls, settings):
        return cls(job_dir(settings))

    def request_seen(self, request):
        fp = request.url
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)

    def close(self, reason):
        self.fingerprints = None
def process_one(pipe12):
    pid = 0
    counter = BloomFilter(n, p, 0)
    counter = event(pid, counter, 'b')
    counter = send_message(pipe12, pid, counter, 'c')
    counter = event(pid, counter, 'd')
    counter = recv_message(pipe12, pid, counter, 'e')
    counter = event(pid, counter, 'f')
    print_history(counter, pid)
def add(self, key):
    bfilter = self.sbfilters[-1]
    if not bfilter.can_accomodate():
        new_expected_inserts = bfilter.expected_inserts * self.space_scale
        new_error_rate = bfilter.error_rate * self.error_prob_ratio
        new_bfilter = BloomFilter(new_expected_inserts, new_error_rate)
        self.sbfilters.append(new_bfilter)
        bfilter = new_bfilter
    bfilter.add(key)
def put_all(self, kvps):
    timestamp = self.next_timestamp()
    txn_keys = None
    if self.algorithm == RAMPAlgorithm.Fast:
        txn_keys = kvps.keys()
    bloom_filter = None
    if self.algorithm == RAMPAlgorithm.Hybrid:
        bloom_filter = BloomFilter(BLOOM_FILTER_SIZE, BLOOM_FILTER_HASHES)
        bloom_filter.list_to_bloom(kvps.keys())
    for key in kvps:
        self.key_to_partition(key).prepare(
            key, DataItem(kvps[key], timestamp, txn_keys, bloom_filter),
            timestamp)
    for key in kvps:
        self.key_to_partition(key).commit(key, timestamp)
def __init__(self, start_urls: List[str], crawled_pages_count: int,
             chunk_size: int, fetch_workers: int, database_workers: int):
    # Using a set at large volumes would hit the memory limit, so we use a
    # Bloom filter instead, trading some speed for memory.
    self._visited = BloomFilter(max_elements=crawled_pages_count)
    self._logger = get_logger(__name__)
    self._stop_crawling = False
    self._urls = start_urls
    self._data = []
    self._buffer = []
    self._total_crawled_pages = 0
    self._fetch_error_rate = 0.9
    self._crawled_pages_count = crawled_pages_count
    self._chunk_size = chunk_size
    self._fetch_workers = fetch_workers
    self._database_workers = database_workers
    self._max_buffer_len = self._chunk_size * self._fetch_error_rate
def __init__(self, env, feature_transformer):
    self.env = env
    self.models = {}
    self.models_elite = {}
    self.feature_transformer = feature_transformer
    for a in env.actions_available:
        self.models[a] = PassiveAggressiveRegressor(
            C=1.0, fit_intercept=True, shuffle=False,
            loss='epsilon_insensitive', epsilon=0.1)
        self.models_elite[a] = PassiveAggressiveRegressor(
            C=1.0, fit_intercept=True, shuffle=False,
            loss='epsilon_insensitive', epsilon=0.1)
    self.bloom_states = BloomFilter(max_elements=256**2)
def __init__(self, settings, stats):
    self.never_cache = set(
        urlparse(url).path for url in settings.get("NEVER_CACHE", []))
    self.never_cache.add(urlparse("/robots.txt").path)
    logger.info(
        f"Initiating bloom filter.... Never Cache paths: {sorted(self.never_cache)}")
    self.visited = BloomFilter(
        max_elements=settings.getint("VISITED_FILTER_MAX_REQUESTS", 4000000),
        error_rate=settings.getfloat("VISITED_FILTER_ERROR_RATE", 1e-9),
        filename=settings.get("VISITED_FILTER_PATH"),
    )
    self.stats = stats
    logger.info(
        f"Loaded visited urls bloomfilter. Size {self.visited.num_bits_m / (1024 ** 2 * 8)} MiB.")
def add_test(self):
    try:
        for i in range(1000):
            # Pick a random number for string size.
            string_size = random.randrange(0, 10000)
            # Create random alphanumeric strings.
            random_string = ''.join(
                random.choices(string.ascii_uppercase + string.digits,
                               k=string_size))
            # The size of the filter is not important in this test.
            bloom = BloomFilter(bit_vector_size=10)
            # Add the random string to the filter.
            bloom.add(random_string)
        logging.info("add_test: PASS")
    except ValueError:
        logging.error("add_test: FAIL. A random string caused an error.")
def bloom_filter(fp, pattern):
    ret = f'Query {pattern} pattern in data set\n'
    for path in fp:
        stock_name = os.path.basename(path)[:-4]
        mvg, date = moving_average(path, 10)
        bmvg = trans2_01(mvg)
        bloom = BloomFilter(max_elements=10000)
        length = len(pattern)
        # Build the initial window, then slide it across bmvg, adding every
        # window to the bloom filter.
        ele = ''
        for i in range(length):
            ele = ele + str(bmvg[i])
        for i in range(length - 1, len(bmvg)):
            if i != length - 1:
                ele = ele + str(bmvg[i])
                ele = ele[1:]
            bloom.add(ele)
        if pattern in bloom:
            ret = ret + f'Find {pattern} pattern in {stock_name}\n'
    return ret
def __init__(self, server, key, debug=False):
    """Initialize the duplicates filter.

    Parameters
    ----------
    server : redis.StrictRedis
        The redis server instance.
    key : str
        Redis key where to store fingerprints.
    debug : bool, optional
        Whether to log filtered requests.
    """
    self.server = server
    self.key = key
    self.debug = debug
    self.logdupes = True
    self.bloomfilter = BloomFilter(redis_conn=server, key=key)
def __init__(self, *args, **kwargs):
    # Root URL of the site to crawl
    self.base_url = 'http://wzggzy.wz.gov.cn/'
    # super(QhSpider, self).__init__(*args, **kwargs)
    self.bloom_filter = BloomFilter(max_elements=1000000, error_rate=0.1,
                                    filename='bf.data')
    self.num = 0
    self.scrawl_mode = ScrawlMode.HISTORY
    self._stop_parse = False
class StatisticsHolder:
    MAX_DELAY = 5

    def __init__(self, start_ts, end_ts):
        self.counter = 0
        self.start_ts = start_ts
        self.end_ts = end_ts
        self.filter = BloomFilter(max_elements=50000, error_rate=0.01)

    def add(self, uid):
        if uid not in self.filter:
            self.counter += 1
            self.filter.add(uid)

    def __contains__(self, ts):
        return self.start_ts <= ts < self.end_ts

    def closed_by(self, ts):
        return ts >= self.end_ts + StatisticsHolder.MAX_DELAY
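# Hypothetical usage sketch for StatisticsHolder above: counting unique uids
# in the window [start_ts, end_ts) with the MAX_DELAY grace period (the
# timestamps and uids are illustrative):
holder = StatisticsHolder(start_ts=0, end_ts=60)
for uid in ["u1", "u2", "u1"]:
    holder.add(uid)
print(holder.counter)        # 2: "u1" is only counted once
print(30 in holder)          # True: 30 falls inside [0, 60)
print(holder.closed_by(66))  # True: 66 >= end_ts + MAX_DELAY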
class Node:
    def __init__(self, k, expected_num, fp_prob):
        """Represents a single node of a Bloom Tree."""
        self.children: List[Node] = []
        self.parent: Optional[Node] = None
        self.filter = BloomFilter(expected_num, fp_prob)
        self.dataset_id: Optional[str] = None
        self.k = k

    def populate_dataset_info(self, dataset: List[Read]) -> None:
        self.dataset_id = dataset[0].filename
        self.insert_kmers_from_dataset(dataset)

    def insert_kmers_from_dataset(self, dataset: List[Read]) -> None:
        for read in dataset:
            for kmer in read.kmers(self.k):
                self.filter.insert(kmer)

    def add_node_kmers(self, other: 'Node') -> None:
        self.filter.filter |= other.filter.filter

    def num_children(self) -> int:
        return len(self.children)

    def score(self, other: 'Node') -> int:
        """
        "Hamming distance" score where lower is better

        :param other: The node to compare against
        """
        return count_xor(self.filter.filter, other.filter.filter)

    def get_size(self):
        """Returns the total number of bytes occupied by the filter object."""
        return (sys.getsizeof(self.children) + sys.getsizeof(self.parent) +
                sys.getsizeof(self.dataset_id) + sys.getsizeof(self.k) +
                self.filter.get_size())
def false_positive_rate_benchmark(error_rate):
    trial_arr = []
    fps_rate = []
    capacity = 10000
    bfilter = BloomFilter(capacity, error_rate)
    for trial in range(10000, 100000, 1000):
        fp = 0
        for i in range(trial):
            bfilter.add(i)
        for i in range(trial, 2 * trial + 1):
            if i in bfilter:
                fp += 1
        fp_rate = fp / float(trial)
        fps_rate.append(fp_rate)
        trial_arr.append(trial)
    plt.plot(trial_arr, fps_rate, 'bs')
    plt.ylabel('False Positive Rate')
    plt.xlabel('No of Trials')
    plt.show()
def __init__(self):
    self.headers = {
        'User-Agent': 'Dalvik/2.1.0 (Linux; U; Android 8.0.0; MIX 2 MIUI/V10.2.2.0.ODECNXM) NewsArticle/8.4.4'
    }
    self.cookies = {
        'cookies': 'install_id=6672646082571388678; ttreq=1$a9ed7f4ce8fc84fced473d6e25c22226f381c13d; odin_tt=3e76568447d177856560d524c6ef5400407a437cfdd62767a36fb3b2decdeb01d43b9a7978232dc05c57af3c81bd10c277e78619093795e8392c1302c9aa8a75; sid_guard=c8f84a23bcce86b376964aeb42991709%7C1554173959%7C5184000%7CSat%2C+01-Jun-2019+02%3A59%3A19+GMT; uid_tt=2ad7176029f7302e11b7924e6e6566b7120075732cedcd39bc999fa5cbcf07a1; sid_tt=c8f84a23bcce86b376964aeb42991709; sessionid=c8f84a23bcce86b376964aeb42991709'
    }
    self.headers_details = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    }
    self.cookies_details = {
        'Cookie': 'tt_webid=6683297640216282629; __tea_sdk__user_unique_id=6683297640216282629; __tea_sdk__ssid=40d2e59e-696c-4a93-ace8-e1479b10aeef; csrf-token=61575f8b568b577d9d06c777d103ae53e6c10723; csrf-secret=6qDUsFL6WZ1aG2soaPw7PpmCtnxCv7fw'
    }
    self.post_video_url = 'http://127.0.0.1:30008/crawler/video/transfer'
    self.filter_url = 'http://console.cc.clipclaps.tv/crawler/log'
    self.have_met = BloomFilter(max_elements=100000, error_rate=0.1)
class WordLookup:
    def __init__(self, file_name):
        self.bf = BloomFilter(10000000, 8)
        input_file = open(file_name, "r")
        for file_line in input_file:
            file_line = file_line.rstrip()
            self.bf.add(file_line)
        input_file.close()

    def is_qualified(self, string):
        str_len = len(string)
        if str_len != 6:
            return False
        for i in range(1, str_len - 1):
            first = string[:i]
            second = string[i:]
            if self.bf.lookup(first) and self.bf.lookup(second):
                # print(first + '+' + second + '=>' + string)
                return True
        return False
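# Hypothetical usage sketch for WordLookup above ("words.txt" stands in for a
# newline-separated word list): is_qualified checks whether a six-letter word
# splits into two shorter words that are both (probably) in the filter.
lookup = WordLookup("words.txt")
print(lookup.is_qualified("sunday"))  # True if a split such as "sun" + "day" is present
print(lookup.is_qualified("qwerty"))  # False unless some split is a false positive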
def process_three(pipe32, return_dict):
    pid = 2
    counter = BloomFilter(n, p, 0)
    print('\nInitial Bit array: 3')
    print(counter.bit_array)
    print()
    counter = recv_message(pipe32, pid, counter, 'k')
    counter = send_message(pipe32, pid, counter, 'l')
    print_history(counter, pid)
    for i in counter.history.keys():
        return_dict[i] = counter.history[i]
def test_bloom_filter():
    bloomfilter = BloomFilter(NUM_KEYS, FALSE_POSITIVE_PROBABILITY)
    word_present = ['abound', 'abounds', 'abundance', 'abundant', 'accessable',
                    'bloom', 'blossom', 'bolster', 'bonny', 'bonus', 'bonuses',
                    'coherent', 'cohesive', 'colorful', 'comely', 'comfort',
                    'gems', 'generosity', 'generous', 'generously', 'genial']
    word_absent = ['facebook', 'twitter']
    for item in word_present:
        bloomfilter.add(item)
    test_words = word_present[:10] + word_absent
    shuffle(test_words)
    for word in test_words:
        if bloomfilter.is_member(word):
            if word in word_absent:
                print(f"'{word}' is a false positive!")
            else:
                print(f"'{word}' is probably present!")
        else:
            print(f"'{word}' is definitely not present!")
def generate_bf_list(cols):
    block_cnt = 20
    block_len = 30
    n = block_cnt * block_len  # code space; set it to the max size of a col for now
    p = 0.01  # false positive probability
    # build bloom filter for all cols
    bloom_filter_list = []
    for col in cols:
        bloom_filter = BloomFilter(n, p)
        for num in col:
            bloom_filter.add(chr(num))
        bloom_filter_list.append(bloom_filter)
    return bloom_filter_list
def process_two(pipe21, pipe23, return_dict):
    pid = 1
    counter = BloomFilter(n, p, 0)
    print('\nInitial Bit array: 2')
    print(counter.bit_array)
    print()
    counter = recv_message(pipe21, pid, counter, 'g')
    counter = send_message(pipe21, pid, counter, 'h')
    counter = send_message(pipe23, pid, counter, 'i')
    counter = recv_message(pipe23, pid, counter, 'j')
    print_history(counter, pid)
    for i in counter.history.keys():
        return_dict[i] = counter.history[i]
def do_add(self, string_lists):
    """
    :param string_lists: eg. {lib1_fn: string_list, lib1_1g: string_list,
                              lib1_2g: string_list, ..., }
    :return:
    """
    try:
        for key in string_lists:
            document = {}
            bf = BloomFilter(max_elements=2**16, error_rate=0.01)
            for element in string_lists.get(key):
                bf.add(element)
            bf_pickle = cPickle.dumps(bf)
            document[key] = bf_pickle
            sig = key.split('_')
            for type in self.collection:
                if type == sig[len(sig) - 1]:
                    self.collection[type].save(document)
                    size = sys.getsizeof(document[key])
                    print("save bloom tree into mongoDB: %s \t size is %f M"
                          % (key, size / 1024 / 1024))
                    break
    except Exception as e:
        print(e)
def main():
    initial_page = "http://yue.ifeng.com"
    url_queue = Queue.Queue()
    filter = BloomFilter()
    filter.add(initial_page)
    url_queue.put(initial_page)
    while True:
        urls = []
        current_url = url_queue.get()  # take the next URL from the queue
        try:
            store(current_url)
            urls = extract_urls(current_url)  # extract the links on the page
        except Exception as e:
            print("Error extract_urls")
            print(e)
        for next_url in urls:
            if filter.notcontains(next_url):
                filter.add(next_url)
                url_queue.put(next_url)
    count += 1
f.close()
num_of_items = count
print("Number of Items: " + str(num_of_items))
p = 0.005
print("Probability of false positive error " + str(p))
bit_size = bit_array_size(num_of_items, p)
print("Bit Size: " + str(bit_size))
hash_size = size_of_hash(num_of_items, bit_size)
print("Hash Size: " + str(hash_size))
bf = BloomFilter(num_of_items, hash_size)
word_list = open("word_list.txt")
for word in word_list.read().splitlines():
    bf.add(word)
word_list.close()
print(bf.lookup("99"))
print(bf.lookup("donkey"))
print(bf.lookup("oitqv"))
print(bf.lookup("fart"))
print(bf.lookup("Max"))
print(bf.lookup("Dichha"))
print(bf.lookup("Khuwalung"))
import sys

from trie import Trie
from bloom_filter import BloomFilter
from tools import get_size

if __name__ == '__main__':
    bf_dups = 0
    tr = Trie()
    bf = BloomFilter(capacity=700000, error_rate=0.001)
    with open("words.txt") as file:
        for line in file:
            tr.put(line.strip())
            if bf.put(line.strip()):
                print("Duplicate in bloom filter: {0}".format(line.strip()))
                bf_dups += 1
    print("Trie. number of objects put: {0}".format(len(tr)))
    print("Bloom filter. number of objects put: {0}".format(len(bf)))
    print()
    print("Trie. Size of the object: {0}".format(sys.getsizeof(tr)))
    print("Bloom filter. Size of the object: {0}".format(sys.getsizeof(bf)))
    print()
    print("Trie. Size of the object(full): {0}".format(get_size(tr)))
    print("Bloom filter. Size of the object(full): {0}".format(get_size(bf)))
    print()
    print("Bloom filter errors: {0}".format(bf_dups))
    print("----------------------------------------------------------")
def test_can_insert(self):
    bloom = BloomFilter(2000, 4)
    bloom.insert(5)
def test_inserted_is_probably_contained(self):
    bloom = BloomFilter(2000, 4)
    bloom.insert(42)
    self.assertTrue(42 in bloom)
def testLoadBloomFilterFromBase64File(self):
    with open(self.BF_DUMP_FILE_BASE_64, "r") as f:
        dump_str = f.read()
    bloom_filter = BloomFilter.load_from_base64_str(dump_str)
    self.check_contains(bloom_filter)
def testLoadBloomFilterFromFile(self):
    with open(self.BF_DUMP_FILE, "rb") as f:
        dump_bytes = f.read()
    bloom_filter = BloomFilter.load(dump_bytes)
    self.check_contains(bloom_filter)