def call_main(author_name,
              filters_dir="/Users/vivekris/GS/code/gsbloomboost/input/filters/"):
    """Resolve an author name to an id via the on-disk bloom filters.

    Searches the bloom-filter files under ``filters_dir`` for
    ``author_name``, maps the resulting count through the id lookup
    table, prints the id, and returns it.

    :param author_name: name to search for in the filters.
    :param filters_dir: directory holding the bloom filter files
        (defaults to the previously hard-coded path, so existing
        callers are unaffected).
    :return: the id produced by ``get_id``.
    """
    bf = Bloom()
    count = bf.search(filters_dir, author_name)
    lookup = get_id_lookup()
    return_id = get_id(count, lookup)
    # print() with a single argument behaves the same on Python 2 and 3;
    # the original `print return_id` statement was Python-2-only.
    print(return_id)
    return return_id
def getMeme(cid, sub):
    """Fetch the next unseen media post from a subreddit as an image file.

    Keeps a per-``cid`` bloom filter of already-served posts under
    ``stats/<cid>/bloom.txt``. Scans the subreddit's hot listing for the
    first media post not yet in the filter, records it, downloads it to
    ``temp.jpg``, and returns an open binary handle to that file
    (caller is responsible for closing it).

    :param cid: channel/chat id used to namespace the stats directory.
    :param sub: subreddit name to pull from.
    :return: an open ``rb`` file object for the downloaded image.
    """
    stats_dir = Path(os.getcwd()) / "stats" / str(cid)
    os.makedirs(stats_dir, exist_ok=True)
    f = stats_dir / "bloom.txt"
    print(f)
    bloom = Bloom()  # renamed: `filter` shadowed the builtin
    if f.is_file():
        # Restore the previously persisted filter table.
        with open(f, "r") as fh:
            bloom.getTable(fh)
    else:
        # Ensure the file exists for the next run; the original opened a
        # "w+" handle here and never closed it (resource leak).
        f.touch()
    global reddit
    for s in reddit.subreddit(sub).hot():
        if not bloom.find(str(s)) and s.is_reddit_media_domain:
            bloom.insert(str(s))
            with open(f, "w") as fh:
                bloom.writeTable(fh)
            break
    # NOTE(review): if the loop finds no new media post, `s` is the last
    # listing entry (or unbound for an empty listing) — preserved from
    # the original; confirm the intended fallback behavior.
    request.urlretrieve(s.url, "temp.jpg")
    return open("temp.jpg", "rb")
#getMeme(123,"dankmemes")
def create_collection(self, genome):
    """Partition the genome into ``self.k`` equal sections and build one
    Bloom filter of fixed-length segments per section, storing the result
    as an immutable tuple in ``self.collection``.
    """
    sec_len = int(len(genome) / self.k)
    for idx in range(self.k):
        chunk = genome[idx * sec_len:(idx + 1) * sec_len]
        bf = Bloom(1000000000, 0.3)
        # Slide a window of `segment_size` (module-level setting) over
        # the section and add every segment to this section's filter.
        for start in range(len(chunk) - segment_size + 1):
            bf.bloom_add(chunk[start:start + segment_size])
        self.collection.append(bf)
    print("Collection created successfully")
    self.collection = tuple(self.collection)
def post(self):
    """Handle a form POST: record the submitted value in a memcached
    Bloom filter and redirect with whether it was newly added.
    """
    bloom = memcache.get("filter")
    if bloom is None:
        # First request (or cache eviction): start a fresh filter.
        bloom = Bloom(2**21, 5)
    value = self.request.POST["value"]
    # Membership must be tested before inserting the value itself.
    already_present = value in bloom
    bloom.add(value)
    memcache.set("filter", bloom)
    self.redirect("/?success=" + str(not already_present) +
                  "&value=" + value)
def call_main(lists_folder, bloom_folder):
    """Load every author-list file and add its entries to a matching
    bloom filter file.

    For each file in ``lists_folder``, derives the base name (``abc``
    from ``/Users/xya/asdf/abc.2bloom``), opens/creates
    ``<bloom_folder>/<base>.bloom``, and adds every stripped line of the
    list file to it.

    :param lists_folder: directory containing the input list files.
    :param bloom_folder: directory where ``.bloom`` files live.
    """
    list_files = get_files(lists_folder)
    for list_path in list_files:
        # Get "abc" out of "/Users/xya/asdf/abc.2bloom".
        file_name = list_path.rsplit('/', 1)[1].split('.')[0]
        bf = Bloom(bloom_folder + "/" + file_name + ".bloom")
        # The original used the Python-2-only `file()` builtin and never
        # closed the handle; a context manager fixes both.
        with open(list_path) as fh:
            list_of_authors = [element.strip() for element in fh]
        print(list_of_authors)
        bf.add_elements(list_of_authors)
def train(self, xs, ys, epochs): """ Train the model and setup the two amqs. """ # Filter pos/neg examples # TODO: make more efficient (don't necessarily need to compute pos/negs here) positives = [x for x, y in zip(xs, ys) if y] negatives = [x for x, y in zip(xs, ys) if not y] # Setup first filter self.amq1.add_set(positives) # Train the neural net on reported positives of first filter amq1_pos_indices = [ i for i, x in enumerate(xs) if self.amq1.contains(x) ] amq1_pos_xs = [xs[i] for i in amq1_pos_indices] amq1_pos_ys = [ys[i] for i in amq1_pos_indices] self.model.train(amq1_pos_xs, amq1_pos_ys, epochs) # Tune tau self.tau, fpr, fnr = self._choose_tau(amq1_pos_xs, amq1_pos_ys) # Get false negatives from model model_false_negs = [ x for x in amq1_pos_xs if not (self.model(x) > self.tau) ] num_model_false_negs = len(model_false_negs) # Setup second filter if we have false negs if num_model_false_negs > 0 and fnr > 0: # Compute optimal bitarray size ratio for second filter inside = fpr / ((1 - fpr) * (1 / fnr - 1)) m2 = int(0 if inside == 0 else -log2(inside) / log(2)) if m2 == 0: self.amq2 = WordBloom( Bloom.init_ne(num_model_false_negs, self.err)) else: self.amq2 = WordBloom(Bloom.init_nm(num_model_false_negs, m2)) self.amq2.add_set(model_false_negs)
def simulation():
    """Build a Bloom filter from random integers and evaluate it.

    Draws 10,000 random training integers and 1,000 random query
    integers from [1, 1000000), trains the filter at a 1% target error
    rate, and runs the filter's own test routine on the queries.
    """
    # Renamed from `input`, which shadowed the builtin of the same name.
    train_data = np.random.randint(1, 1000000, size=10000)
    test_data = np.random.randint(1, 1000000, size=1000)
    b = Bloom(train_data, 0.01)
    b.train_bloom()
    b.test_bloom(test_data)
def __init__(self, n, c, err, set_size, err1k):
    """
    n: number of letters in string
    c: size of alphabet
    err: total error rate of sandwich
    set_size: expected number of elements for the first filter
        (passed to Bloom.init_ne)
    err1k: fraction of the total error budget assigned to the first
        filter (err1 = err * err1k)
    """
    self.n = n
    self.c = c
    # Character-level model over strings of length n on a c-letter alphabet.
    self.model = WordNet(n, c)
    self.tau = 0.5  # default value, adjust by tuning later
    self.alpha = 0.618503137801576  # 2 ** -log(2)
    # AMQs can only be set up after training model
    self.err = err
    self.err1 = self.err * err1k
    # First filter sized up front from set_size and its error share.
    self.amq1 = WordBloom(Bloom.init_ne(set_size, self.err1))
    self.amq2 = None  # Determine size after training
def train(self, xs, ys, epochs):
    """ Train on examples for a certain number of epochs.

    Fits the neural model, tunes the decision threshold tau, then
    builds a backup filter holding the model's false negatives so the
    overall structure never reports a stored element as absent.
    """
    # Fit the model; the torch dataloader handles shuffling internally.
    self.model.train(xs, ys, epochs)
    # Pick the decision threshold on the training data.
    self.tau = self._choose_tau(xs, ys)
    # True positives the model scores at or below tau.
    false_negs = [
        x for x, y in zip(xs, ys)
        if y and not (self.model(x) > self.tau)
    ]
    if false_negs:
        # Back up the misses in a filter sized for half the error budget.
        self.amq = WordBloom(Bloom.init_ne(len(false_negs), self.err / 2))
        self.amq.add_set(false_negs)
def bloom_test(xs, ys, num_pos, num_neg, n, c, e):
    """ Perform a test on the Bloom filter.

    Builds a WordBloom over the positive examples, then measures the
    false-positive and false-negative rates over all of (xs, ys) and
    prints them along with overall accuracy.

    :param xs: examples.
    :param ys: truthy labels mark positives.
    :param num_pos: number of positive examples (filter capacity).
    :param num_neg: number of negative examples.
    :param n, c: unused here; kept for signature parity with other tests.
    :param e: target error rate for the filter.
    """
    bloom = WordBloom(Bloom.init_ne(num_pos, e))
    positives = [x for x, y in zip(xs, ys) if y]
    bloom.add_set(positives)
    false_pos = false_neg = 0
    for x, y in zip(xs, ys):
        filter_contains = bloom.contains(x)
        false_pos += not y and filter_contains
        false_neg += y and not filter_contains
    total = num_pos + num_neg
    # Guard the rate divisions: the original raised ZeroDivisionError
    # for an all-positive or all-negative test set.
    fpr = false_pos / num_neg if num_neg else 0.0
    fnr = false_neg / num_pos if num_pos else 0.0
    accuracy = 1 - (false_pos + false_neg) / total if total else 1.0
    print(bloom)
    print("fpr: {}, fnr: {}, correct%: {}".format(fpr, fnr, accuracy))
def main():
    """Main function.

    Demonstrates the Bloom filter: adds a couple of names, then checks
    one name that was never added and one that was, printing the
    probabilistic membership verdict for each.
    """
    bloom = Bloom(12)
    # Add some values to the set.
    for name in ("Curie", "Laplace"):
        bloom.add(name)
    # Test a value that was never added, then one that was; the
    # original duplicated this if/else stanza verbatim for each name.
    for person in ("Pasteur", "Curie"):
        if bloom.check(person):
            print("%s is probably in the set." % person)
        else:
            print("%s is definitely not in the set." % person)
class TestBloomMethods(unittest.TestCase):
    """Unit tests for the Bloom Filter."""

    def setUp(self):
        """Build a fresh 10-slot filter before every test."""
        self.size = 10
        self.b = Bloom(self.size)

    def test_add(self):
        """Adding an element activates exactly the expected table bits."""
        self.b.add('Archimedes')
        expected_table = [1, 0, 0, 1, 0, 0, 0, 0, 0, 0]
        self.assertEqual(self.b.table, expected_table)

    def test_check_element_in_table(self):
        """An added element is reported as (probably) present."""
        self.b.add('Copernicus')
        self.assertEqual(self.b.check('Copernicus'), True)

    def test_check_element_not_in_table(self):
        """An element that was never added is reported as absent."""
        self.assertEqual(self.b.check('Galileo'), False)
# rings, steps = gen_circle(rings=None, pixels_per=pixels_per, offset=0, invert=False) # layout = layout_from_rings(rings, origin=(0, 0, 0), z_diff=8) # driver = DriverSimPixel(sum(pixels_per), layout=layout) # led = LEDCircle(driver, rings=rings, maxAngleDiff=0) def shutdown(): log.debug('Force close server') driver.server.close() time.sleep(2) sys.exit() try: if isinstance(led, LEDMatrix): anim = Bloom(led, dir=True) anim.run(amt=2, fps=60) elif isinstance(led, LEDCube): # anim = cuby(led, color_list=c_list) # anim.run(amt=1, fps=8) anim = Simplex(led, freq=16, octaves=1) anim.run(amt=1, fps=30) # anim = Spectrum(led, vis_list=['Spread'], steps_per_vis=None, # bins=12, max_freq=4000, log_scale=True, auto_gain=False, gain=3) anim.run(amt=1, fps=8) elif isinstance(led, LEDCircle): anim = Diag(led, turns=1, angle=6, direction=False) anim.run(amt=6, fps=20) except Exception as e: shutdown() raise
def setUp(self):
    """Create a fresh fixed-size Bloom Filter before each test."""
    size = 10
    self.size = size
    self.b = Bloom(size)
# NOTE(review): this chunk is a fragment — it begins mid-function (the
# enclosing `def` of what appears to be a `replace(word)` wildcard
# expander is outside this view) and ends mid-loop; code kept verbatim.
    return [word]
    # Expand each '*' wildcard into every lowercase letter, one
    # position at a time, collecting all resulting candidate words.
    letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
               'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
               'w', 'x', 'y', 'z']
    words = []
    for char in word:
        if char == '*':
            for letter in letters:
                words.append(word.replace('*', letter, 1))
    # Second pass expands a remaining wildcard in the generated words;
    # iterate over a snapshot since `words` grows during the loop.
    for item in list(words):
        for char in list(item):
            if char == '*':
                for letter in letters:
                    words.append(item.replace('*', letter, 1))
    return words


bloom = Bloom(15000000, 8)
parser = argparse.ArgumentParser(description="Bloom filter. Builds a\
                                 bloom filter. Searches for a word.\
                                 Wildcards(*) allowed.")
parser.add_argument('file', type=str, help="File location")
parser.add_argument('word', type=str, help="Word to search")
args = parser.parse_args()
# Build the filter from every (stripped) line of the input file.
with open(args.file, 'r') as reader:
    for line in reader:
        bloom.insert(line.rstrip())
print("Bloom filter built.")
print("Searching for \"{}\"...".format(args.word))
if '*' in args.word:
    print("The more wildcards, the longer this takes...")
matches = []
# Loop body continues beyond this view.
for word in replace(args.word):
import datetime

from bloom import Bloom

# Benchmark Bloom.Add over the lines of a text corpus, reporting the
# false-positive count, new-key count, and per-line timing.

# Context manager fixes the leaked file handle from the original.
with open("data/emerson_essays.txt", "r") as f:
    sample = f.read()
lines = sample.splitlines()

mb = Bloom()
false_positive = 0
new_key = 0
start_time = datetime.datetime.now()
for line in lines:
    # Add() presumably returns falsy when the line is already present
    # (a collision / false positive) and truthy for a new key.
    if not mb.Add(line):
        false_positive += 1
    else:
        new_key += 1
end_time = datetime.datetime.now()

print(" false_positive : %s - new_key: %s on %s " %
      (false_positive, new_key, len(lines)))
# BUG FIX: the original computed (start_time - end_time).microseconds —
# the microseconds *component* of a negative timedelta, not the elapsed
# time — yet labeled it "ms". Use end - start and total_seconds().
elapsed_ms = (end_time - start_time).total_seconds() * 1000
ratio = elapsed_ms / len(lines) if lines else 0.0
print("{} ms for ratio: {}".format(elapsed_ms, ratio))
print(" {} LEN ".format(mb.Len()))