def set_bloom_filter(bbf_file, capacity=10000000, error_rate=.05):
    """Load the bloom filter stored at *bbf_file* into the module-level ``bloom``.

    If *bbf_file* does not exist, a warning is printed to stdout and the
    module-level ``bloom`` is left exactly as it was (it is neither cleared
    nor created).

    Args:
        bbf_file: Path of the bloom filter file to load.
        capacity: Capacity passed to ``pydablooms.load_dabloom``. Defaults to
            the value that used to be hard-coded here.
        error_rate: Error rate passed to ``pydablooms.load_dabloom``. Defaults
            to the value that used to be hard-coded here.

    Returns:
        None.
    """
    global bloom
    if not os.path.exists(bbf_file):
        # Single pre-formatted argument: prints the same text whether `print`
        # is the Python 2 statement or the Python 3 function.
        print("WARNING: bloom filter file %s doesn't exist" % (bbf_file,))
        return
    bloom = pydablooms.load_dabloom(capacity=capacity,
                                    error_rate=error_rate,
                                    filepath=bbf_file)
for line in words_file: bloom.add(line.rstrip(), i) i += 1 words_file.seek(0) i = 0 for line in words_file: if i % 5 == 0: bloom.delete(line.rstrip(), i) i += 1 bloom.flush() del bloom bloom = pydablooms.load_dabloom(capacity=capacity, error_rate=error_rate, filepath=bloom_fname) true_positives = 0 true_negatives = 0 false_positives = 0 false_negatives = 0 words_file.seek(0) i = 0 for line in words_file: exists = bloom.check(line.rstrip()) contains = line.rstrip() in bloom assert exists == contains, \ "ERROR: %r from 'bloom.check(x)', %i from 'x in bloom'" \ % (exists, contains)
def __build_arg_parser():
    """Return the argparse parser describing the command-line interface."""
    parser = argparse.ArgumentParser()
    parser.add_argument('infile', nargs='?', type=argparse.FileType('rb'),
                        default=sys.stdin, help="Defaults to stdin")
    parser.add_argument('outfile', nargs='?', type=argparse.FileType('wb'),
                        default=sys.stdout, help="Defaults to stdout")
    parser.add_argument('--offsetfile', type=argparse.FileType('w'),
                        default=None,
                        help="Defaults to None, requires a named outfile")
    parser.add_argument('-b', '--bloom',
                        help='The path to the bloom filter to check against')
    parser.add_argument('-c', '--capacity', type=int, default=CAPACITY,
                        help='The capacity of the bloom filter, DEFAULT: 1 million')

    # Add and delete are mutually exclusive.
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-a', '--add', action='store_true', default=False,
                       help='add the phone to the bloom file.')
    group.add_argument('-d', '--delete', action='store_true', default=False,
                       help='remove the phone from the bloom file.')
    return parser


def __select_bloom(args, bloom_self, tmpdir):
    """Return the bloom filter for this run and report the choice on stderr.

    An existing ``--bloom`` file is loaded; a missing one is created only
    when ``--add`` is given; in every other case the temporary bloom
    *bloom_self* (living in *tmpdir*) is used.
    """
    bloom_abspath = os.path.abspath(args.bloom) if args.bloom else None

    if bloom_abspath and os.path.isfile(bloom_abspath):
        return pydablooms.load_dabloom(capacity=args.capacity,
                                       error_rate=ERROR_RATE,
                                       filepath=bloom_abspath)

    if bloom_abspath and args.add:
        bloom = pydablooms.Dablooms(capacity=args.capacity,
                                    error_rate=ERROR_RATE,
                                    filepath=bloom_abspath)
        sys.stderr.write('Created bloom with capacity %d at %s'
                         % (args.capacity, bloom_abspath + os.linesep))
        return bloom

    # Fall back to the temporary bloom, explaining why when the user asked
    # for something that cannot be honoured.
    if args.bloom:
        sys.stderr.write("Bloom file does not exist and we cannot create new bloom without the --add flag" + os.linesep)
    elif args.add:
        sys.stderr.write("Add option ignored without bloom file specified" + os.linesep)
    sys.stderr.write('Created tmp bloom with capacity %d at %s'
                     % (args.capacity, tmpdir + os.linesep))
    return bloom_self


def main():
    """
    By default will read from stdin and write to stdout using a temporary
    bloom file.

    Parses the command line, picks the bloom filter to work against, then
    either removes the input from the bloom (``--delete``), filters while
    adding (``--add``), or filters without adding. The temporary bloom
    directory and the input/output/offset files are released even when
    processing raises.
    """
    args = __build_arg_parser().parse_args()
    bloom_self, tmpdir = __get_tmp_bloom()
    bloom = None
    try:
        try:
            if args.offsetfile:
                # Probe first: tell() raises IOError when outfile is a pipe.
                args.outfile.tell()
                args.offsetfile.write('#Input:%s; ' % args.infile.name)
                args.offsetfile.write('Output:%s%s' % (args.outfile.name, os.linesep))
                if args.bloom:
                    args.offsetfile.write('#Bloom:%s%s' % (args.bloom, os.linesep))
        except IOError:
            # File.tell() does not work with pipes.
            sys.stderr.write('Warning: Cannot write offset file when the output file is a pipe.'
                             + os.linesep)
            args.offsetfile = None

        bloom = __select_bloom(args, bloom_self, tmpdir)

        if args.delete:
            __remove_from_bloom(bloom, args.infile)
        elif args.add:
            __filter_with_add(bloom, args.infile, args.outfile, args.offsetfile)
        else:
            __filter_no_add(bloom, bloom_self, args.infile, args.outfile,
                            args.offsetfile)
    finally:
        # Cleanup used to run only on the success path, so any exception
        # leaked the temporary directory.
        if os.path.exists(tmpdir) and os.getcwd() != tmpdir:
            args.outfile.close()
            # Drop both names: `del bloom` alone left the temporary bloom
            # referenced through bloom_self while its directory was removed.
            bloom = bloom_self = None
            shutil.rmtree(tmpdir)
        args.infile.close()
        args.outfile.close()
        if args.offsetfile:
            args.offsetfile.close()