def generate_new_bloom(self):
    """Build a fresh bloom filter and feed every NSRL file into it.

    When ``self.estimate_capacity`` is truthy, ``_est_num_recs_and_set()``
    is called first (presumably it refreshes ``self.capacity`` -- confirm
    against that helper). A new filter is then created from
    ``self.capacity`` and ``self.error_rate`` and stored on ``self._bloom``.
    Finally each file returned by ``get_all_nsrl_files`` for
    ``self.nsrl_dir`` is handed to ``_process_nsrl_file`` together with
    ``self._process_csv_chunk`` as the chunk callback.
    """
    if self.estimate_capacity:
        self._est_num_recs_and_set()

    # The filter must exist before any file is processed.
    make_bloom = self._get_bloom_class_inst
    self._bloom = make_bloom(self.capacity, self.error_rate)

    nsrl_files = get_all_nsrl_files(iso_base_dir=self.nsrl_dir)
    for nsrl_path in nsrl_files:
        self._process_nsrl_file(nsrl_path, self._process_csv_chunk)
def _estimate_number_of_recs(self):
    """Return the number of distinct MD5 values found in the NSRL files.

    Every file returned by ``get_all_nsrl_files`` for ``self.nsrl_dir`` is
    run through ``_process_nsrl_file``; the entries that survive
    ``_filter_csv_chunk`` contribute their ``MD5_KEY_CSV`` value to a set,
    so duplicates are counted only once.
    """
    unique_md5s = set()

    def collect_md5s(csv_chunk):
        # Passed to _process_nsrl_file as its callback; presumably invoked
        # once per CSV chunk -- confirm against that helper.
        unique_md5s.update(
            entry[MD5_KEY_CSV] for entry in self._filter_csv_chunk(csv_chunk)
        )

    for nsrl_path in get_all_nsrl_files(iso_base_dir=self.nsrl_dir):
        self._process_nsrl_file(nsrl_path, collect_md5s)

    # This set is going to get large, but at least we'll have a more exact
    # estimate that strips out doubles.
    return len(unique_md5s)