def generate_mlbf(cls, stats, blocked, not_blocked):
    """Build a salted SHA256 filter cascade from two collections and verify it.

    Originally based on:
    https://github.com/mozilla/crlite/blob/master/create_filter_cascade/certs_to_crlite.py
    (only the false-positive-rate calculation still follows that script).

    ``stats`` is updated in place with the input counts, the error rates
    used, and the version, layer count and bit count of the cascade.

    Raises ``ZeroDivisionError`` when ``not_blocked`` is empty, because its
    length is the divisor of the first-layer error rate.

    Returns the initialized cascade after ``cascade.verify`` has been called
    with the same inputs.
    """
    filter_salt = secrets.token_bytes(16)

    stats['mlbf_blocked_count'] = len(blocked)
    stats['mlbf_notblocked_count'] = len(not_blocked)

    # The first layer rate depends on the blocked/not-blocked ratio; every
    # later layer uses a fixed 0.5.
    first_layer_rate = len(blocked) / (math.sqrt(2) * len(not_blocked))
    fprs = [first_layer_rate, 0.5]

    log.info("Generating filter")
    cascade = FilterCascade(
        error_rates=fprs,
        defaultHashAlg=HashAlgorithm.SHA256,
        salt=filter_salt,
    )
    cascade.initialize(include=blocked, exclude=not_blocked)

    stats['mlbf_fprs'] = fprs
    stats['mlbf_version'] = cascade.version
    stats['mlbf_layers'] = cascade.layerCount()
    stats['mlbf_bits'] = cascade.bitCount()

    log.debug(
        f"Filter cascade layers: {cascade.layerCount()}, "
        f"bit: {cascade.bitCount()}"
    )

    cascade.verify(include=blocked, exclude=not_blocked)
    return cascade
def generate_mlbf(stats, blocked, not_blocked):
    """Build, populate and verify a salted SHA256 filter cascade.

    The error rates are configured through ``set_crlite_error_rates``: the
    smaller of the two input sizes is passed as ``include_len`` and the
    larger as ``exclude_len``, regardless of which collection is which.

    ``stats`` is updated in place with the input counts and the version,
    layer count and bit count of the cascade. Returns the cascade after
    ``cascade.verify`` has been called with the same inputs.
    """
    log.info('Starting to generating bloomfilter')

    cascade = FilterCascade(
        defaultHashAlg=HashAlgorithm.SHA256,
        salt=secrets.token_bytes(16),
    )

    # Sorting the two sizes puts the smaller one first.
    smaller_len, larger_len = sorted((len(blocked), len(not_blocked)))
    cascade.set_crlite_error_rates(
        include_len=smaller_len, exclude_len=larger_len
    )

    stats['mlbf_blocked_count'] = len(blocked)
    stats['mlbf_notblocked_count'] = len(not_blocked)

    cascade.initialize(include=blocked, exclude=not_blocked)

    stats['mlbf_version'] = cascade.version
    stats['mlbf_layers'] = cascade.layerCount()
    stats['mlbf_bits'] = cascade.bitCount()

    summary = 'Filter cascade layers: {}, bit: {}'.format(
        cascade.layerCount(), cascade.bitCount()
    )
    log.info(summary)

    cascade.verify(include=blocked, exclude=not_blocked)
    return cascade
def generateMLBF(stats, *, blocked, not_blocked, capacity, diffMetaFile=None):
    """Build a filter cascade that includes ``blocked`` and excludes ``not_blocked``.

    Based on:
    https://github.com/mozilla/crlite/blob/master/create_filter_cascade/certs_to_crlite.py

    Args:
        stats: mapping updated in place with the error rates used and the
            cascade's version, layer count and bit count.
        blocked: sized collection of entries the cascade must include.
        not_blocked: sized collection of entries the cascade must exclude.
        capacity: multiplier applied to ``len(blocked)`` to size a brand-new
            cascade (ignored when ``diffMetaFile`` is given).
        diffMetaFile: optional path to an existing mlbf meta file; when set,
            the cascade is loaded from it via ``FilterCascade.loadDiffMeta``
            instead of being created from scratch.

    Returns:
        The initialized cascade.

    Raises:
        ZeroDivisionError: if ``not_blocked`` is empty, because its length is
            the divisor of the first-layer error rate.
    """
    fprs = [len(blocked) / (math.sqrt(2) * len(not_blocked)), 0.5]

    if diffMetaFile is not None:
        log.info(
            "Generating filter with characteristics from mlbf base file {}".
            format(diffMetaFile))
        # Use a context manager so the file handle is always released, even
        # if loading raises; previously the handle was never closed.
        # NOTE(review): assumes loadDiffMeta finishes reading before it
        # returns - confirm against the filtercascade version in use.
        with open(diffMetaFile, 'rb') as mlbf_meta_file:
            cascade = FilterCascade.loadDiffMeta(mlbf_meta_file)
        cascade.error_rates = fprs
    else:
        log.info("Generating filter")
        cascade = FilterCascade.cascade_with_characteristics(
            int(len(blocked) * capacity), fprs)

    cascade.version = 1
    cascade.initialize(include=blocked, exclude=not_blocked)

    stats['mlbf_fprs'] = fprs
    stats['mlbf_version'] = cascade.version
    stats['mlbf_layers'] = cascade.layerCount()
    stats['mlbf_bits'] = cascade.bitCount()

    log.debug("Filter cascade layers: {layers}, bit: {bits}".format(
        layers=cascade.layerCount(), bits=cascade.bitCount()))
    return cascade
def test_generate_and_write_filter(self):
    """Write a filter from DB data and check its membership, size and bits."""
    self.setup_data()
    mlbf = MLBF.generate_from_db(123456)
    mlbf.generate_and_write_filter()

    with open(mlbf.filter_path, 'rb') as handle:
        raw_filter = handle.read()
    bfilter = FilterCascade.from_buf(raw_filter)

    blocked_versions = fetch_blocked_from_db()
    blocked_guids = blocked_versions.values()

    # Every blocked (guid, version) pair must be reported as present.
    for guid, version_str in blocked_guids:
        assert MLBF.KEY_FORMAT.format(guid=guid, version=version_str) in bfilter

    all_addons = fetch_all_versions_from_db(blocked_versions.keys())
    for guid, version_str in all_addons:
        # A pair can appear in both collections; only pairs that are not
        # blocked are required to be absent from the filter.
        if (guid, version_str) not in blocked_guids:
            unblocked_key = MLBF.KEY_FORMAT.format(
                guid=guid, version=version_str
            )
            assert unblocked_key not in bfilter

    # The random salt from secrets.token_bytes combined with the version
    # strings from version_factory sometimes collides in layer 1, which adds
    # a second layer and makes both the file size and the bit count larger.
    if bfilter.layerCount() == 1:
        expected_size, expected_bit_count = 203, 1384
    else:
        expected_size, expected_bit_count = 393, 2824

    assert os.stat(mlbf.filter_path).st_size == expected_size, (
        blocked_guids,
        all_addons,
    )
    assert bfilter.bitCount() == expected_bit_count, (blocked_guids, all_addons)
def test_generate_and_write_mlbf(self):
    """Write a filter via MLBF and check its membership, size and bit count."""
    self.setup_data()
    mlbf = MLBF(123456)
    mlbf.generate_and_write_mlbf()

    with open(mlbf.filter_path, 'rb') as handle:
        raw_filter = handle.read()
    bfilter = FilterCascade.from_buf(raw_filter)

    blocked_versions = mlbf.fetch_blocked_from_db()
    blocked_guids = blocked_versions.values()

    # Every blocked (guid, version) pair must be reported as present.
    for guid, version_str in blocked_guids:
        assert mlbf.KEY_FORMAT.format(guid=guid, version=version_str) in bfilter

    all_addons = mlbf.fetch_all_versions_from_db(blocked_versions.keys())
    for guid, version_str in all_addons:
        # A pair can appear in both collections; only pairs that are not
        # blocked are required to be absent from the filter.
        if (guid, version_str) not in blocked_guids:
            unblocked_key = mlbf.KEY_FORMAT.format(
                guid=guid, version=version_str
            )
            assert unblocked_key not in bfilter

    written_size = os.stat(mlbf.filter_path).st_size
    assert written_size == 203, (blocked_guids, all_addons)
    assert bfilter.bitCount() == 1384, (blocked_guids, all_addons)
def generate_mlbf(stats, key_format, *, blocked=None, not_blocked=None):
    """Hash the inputs, build a filter cascade from them and check it.

    Based on:
    https://github.com/mozilla/crlite/blob/master/create_filter_cascade/certs_to_crlite.py

    A falsy ``blocked`` is replaced by ``get_blocked_guids()`` and a falsy
    ``not_blocked`` by ``get_all_guids()``. Both are passed through
    ``hash_filter_inputs`` with ``key_format``, and any hashed entry that is
    also blocked is removed from the not-blocked side.

    ``stats`` is updated in place with the two counts, the error rates used,
    and the cascade's version, layer count and bit count.

    Raises ``ZeroDivisionError`` when nothing is left on the not-blocked side
    after de-duplication, because that length is the divisor of the
    first-layer error rate.
    """
    blocked_source = blocked or get_blocked_guids()
    hashed_blocked = hash_filter_inputs(blocked_source, key_format)

    candidate_source = not_blocked or get_all_guids()
    hashed_candidates = hash_filter_inputs(candidate_source, key_format)
    # An entry cannot be both included and excluded; blocked wins.
    hashed_not_blocked = list(set(hashed_candidates) - set(hashed_blocked))

    stats['mlbf_blocked_count'] = len(hashed_blocked)
    stats['mlbf_unblocked_count'] = len(hashed_not_blocked)

    first_layer_rate = len(hashed_blocked) / (
        math.sqrt(2) * len(hashed_not_blocked)
    )
    fprs = [first_layer_rate, 0.5]

    log.info("Generating filter")
    cascade = FilterCascade.cascade_with_characteristics(
        int(len(hashed_blocked) * 1.1), fprs
    )
    cascade.version = 1
    cascade.initialize(include=hashed_blocked, exclude=hashed_not_blocked)

    stats['mlbf_fprs'] = fprs
    stats['mlbf_version'] = cascade.version
    stats['mlbf_layers'] = cascade.layerCount()
    stats['mlbf_bits'] = cascade.bitCount()

    log.debug(
        f"Filter cascade layers: {cascade.layerCount()}, "
        f"bit: {cascade.bitCount()}"
    )

    cascade.check(entries=hashed_blocked, exclusions=hashed_not_blocked)
    return cascade
def load_filter(self, *, filter_path, coverage_path):
    """Load a filter cascade and its coverage data onto this instance.

    ``filter_path`` must provide ``read_bytes()``; its contents are parsed
    with ``FilterCascade.from_buf``. ``coverage_path`` is opened as a text
    file containing a JSON sequence of objects with ``logID`` (base64),
    ``minTimestamp`` and ``maxTimestamp`` keys.

    Also resets ``issuer_to_revocations`` and ``stash_files`` to empty
    containers.
    """
    self.filter_file = filter_path
    self.coverage_file = coverage_path

    filter_bytes = self.filter_file.read_bytes()
    self.filtercascade = FilterCascade.from_buf(filter_bytes)

    self.issuer_to_revocations = collections.defaultdict(list)
    self.stash_files = []
    self.coverage = {}

    with open(coverage_path, "r") as coverage_fp:
        coverage_entries = json.load(coverage_fp)

    # Key the coverage map by the decoded log ID; the value is the
    # (minTimestamp, maxTimestamp) pair for that log.
    for entry in coverage_entries:
        decoded_log_id = base64.b64decode(entry["logID"])
        timestamp_range = (entry["minTimestamp"], entry["maxTimestamp"])
        self.coverage[decoded_log_id] = timestamp_range
def test_generate_and_write_mlbf(self):
    """Write a filter via MLBF and check its bit count, membership and size."""
    mlbf = MLBF(123456)
    mlbf.generate_and_write_mlbf()

    with open(mlbf.filter_path, 'rb') as handle:
        raw_filter = handle.read()
    bfilter = FilterCascade.from_buf(raw_filter)
    assert bfilter.bitCount() == 3008

    blocked_versions = mlbf.get_blocked_versions()

    # Every blocked (guid, version) pair must be reported as present.
    for guid, version_str in blocked_versions.values():
        assert mlbf.KEY_FORMAT.format(guid=guid, version=version_str) in bfilter

    # Every pair returned by get_all_guids must be reported as absent.
    unblocked_pairs = mlbf.get_all_guids(blocked_versions.keys())
    for guid, version_str in unblocked_pairs:
        unblocked_key = mlbf.KEY_FORMAT.format(guid=guid, version=version_str)
        assert unblocked_key not in bfilter

    written_size = os.stat(mlbf.filter_path).st_size
    assert written_size == 406
def generateMLBF(args, stats, *, revoked_certs, nonrevoked_certs,
                 nonrevoked_certs_len):
    """Build a MURMUR3 filter cascade of revoked vs. non-revoked certificates.

    The error rates are derived by ``set_crlite_error_rates`` from
    ``len(revoked_certs)`` and the caller-supplied ``nonrevoked_certs_len``
    (``nonrevoked_certs`` itself is never measured here).

    ``stats`` is updated in place with the cascade's error rates, version,
    layer count and bit count; the bit and layer counts are also reported as
    gauges through ``metrics``. ``args`` is not used by this function.

    Returns the initialized cascade.
    """
    include_count = len(revoked_certs)

    log.info("Generating filter")
    cascade = FilterCascade(
        [],
        version=1,
        defaultHashAlg=fileformats.HashAlgorithm.MURMUR3,
    )
    cascade.set_crlite_error_rates(
        include_len=include_count,
        exclude_len=nonrevoked_certs_len,
    )
    cascade.initialize(include=revoked_certs, exclude=nonrevoked_certs)

    stats["mlbf_fprs"] = cascade.error_rates
    stats["mlbf_version"] = cascade.version
    stats["mlbf_layers"] = cascade.layerCount()
    stats["mlbf_bits"] = cascade.bitCount()

    log.debug(
        f"Filter cascade layers: {cascade.layerCount()}, "
        f"bit: {cascade.bitCount()}"
    )

    metrics.gauge("GenerateMLBF.BitCount", cascade.bitCount())
    metrics.gauge("GenerateMLBF.LayerCount", cascade.layerCount())
    return cascade