# Cuckoo filter probe: add a run of integers (as strings) to a filter, then
# query a second run of integers starting at `offset` and record which of
# those the filter reports as absent.
offset = 123456789
t = 5 * 1048576
a = []
b = []
i = 0

# Test element: a random integer whose decimal string is used as the filter name.
test_element = randint(ielements, offset - 1)
filter_name = str(test_element)

# Create the Cuckoo Filter
r = Client()
r.cfCreate(filter_name, cfsize)

# Insert a fraction of the elements
for x in range(1, ielements - 1):
    r.cfAdd(filter_name, str(x))

# Test a large number of elements
for x in range(offset, t + offset):
    pos = r.cfExists(filter_name, str(x))
    # Keep the probes for which cfExists returned 0.
    if pos == 0:
        a.append(x)

# Report how many probes were collected in `a`.
print("The length of list A is: ", len(a))
datapath = Path('../input')


def parse_json_body_text(json_filename):
    """Yield the text of every ``body_text`` entry in a JSON document.

    Args:
        json_filename: Path-like object exposing ``stem``; it is opened and
            parsed as JSON. The parsed document must contain the keys
            ``paper_id`` and ``body_text``.

    Yields:
        The ``text`` value of each item of ``data['body_text']``, in order.

    Raises:
        KeyError: If ``paper_id``, ``body_text`` or an item's ``text`` key
            is missing.
    """
    print("Processing ..", json_filename.stem)
    # Decode explicitly as UTF-8 instead of relying on the locale default,
    # and finish with the file before yielding so the handle is not held
    # open while the generator is suspended.
    with open(json_filename, encoding="utf-8") as json_data:
        data = json.load(json_data)
    # Not used below; the lookup still raises KeyError when the key is absent.
    paper_id = data['paper_id']
    for body_text in data['body_text']:
        yield body_text['text']


# Create the filter that tracks handled documents. A ResponseError is
# printed rather than raised — presumably because the filter may already
# exist from an earlier run (TODO confirm).
try:
    redisbloomclient.cfCreate('processed_documents', 40000)
except ResponseError as e:
    print("Error:", repr(e))


def process_file(f, redisbloomclient=redisbloomclient, rediscluster_client=rediscluster_client):
    """Store each paragraph of a document under its own key.

    Args:
        f: Path-like object exposing ``stem``; the stem is used as the
            article id and ``f`` is passed to ``parse_json_body_text``.
        redisbloomclient: Client queried with ``cfExists`` on the
            ``processed_documents`` filter. Defaults to the module-level
            client, bound when the function is defined.
        rediscluster_client: Client that receives ``setnx`` calls. Defaults
            to the module-level client, bound when the function is defined.

    Returns:
        The article id when the filter already reports it; the visible code
        has no other return statement.
    """
    # NOTE(review): nothing visible here adds article_id to
    # 'processed_documents' after the loop — SOURCE may be truncated; confirm.
    pid = 0
    article_id = f.stem
    print("Processing article_id ", article_id)
    if redisbloomclient.cfExists('processed_documents', article_id):
        print("already processed ", article_id)
        return article_id
    for para in parse_json_body_text(f):
        # setnx only writes when the key does not exist yet.
        rediscluster_client.setnx(f"paragraphs:{article_id}:pid:{pid}", para)
        pid += 1