def test_insert_then_test(self):
    result = create_index(
        '/tmp/fake.csv',   # input filename
        self.test_file,    # file-like object
        0.0001,            # error rate
        1,                 # skip lines
        [1, 2],            # fields
        ',',               # delimiter
        False)             # recursive domain
    self.assertEqual(
        {'/tmp/fake.csv.2.bfindex': 6,
         '/tmp/fake.csv.1.bfindex': 5},
        result)
    b1 = BloomFilter.fromfile(open('/tmp/fake.csv.1.bfindex', 'rb'))
    b2 = BloomFilter.fromfile(open('/tmp/fake.csv.2.bfindex', 'rb'))
    self.assertEqual(False, 'FieldA' in b1)
    self.assertEqual(False, 'FieldB' in b2)
    for word in ('apple', 'banana', 'orange', 'pear', 'pineapple'):
        self.assertEqual(True, word in b1)
        self.assertEqual(False, word in b2)
    for word in ('carrot', 'potato', 'leek', 'cauliflower', 'bean'):
        self.assertEqual(True, word in b2)
        self.assertEqual(False, word in b1)
def jaccard_ind(filename_1, filename_2):
    with open(filename_1, 'rb') as f_1:
        with open(filename_2, 'rb') as f_2:
            b_1 = BloomFilter.fromfile(f_1)
            b_2 = BloomFilter.fromfile(f_2)
    # Jaccard index = |A ∩ B| / |A ∪ B|, estimated from the set bits
    b_inter = b_1.intersection(b_2)
    b_union = b_1.union(b_2)
    bits_inter = b_inter.bitarray.count(True)
    bits_union = b_union.bitarray.count(True)
    j_i = float(bits_inter) / float(bits_union)
    print("%s %s %f" % (filename_1, filename_2, j_i))
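# Note on jaccard_ind: pybloom's intersection() and union() require both
# filters to share the same capacity and error rate and raise ValueError
# otherwise, so the dumps being compared must come from identically-sized
# filters. An illustrative guard before comparing:
#
#     assert b_1.capacity == b_2.capacity and b_1.error_rate == b_2.error_rate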
def __init__(self):
    try:
        # the filter is serialized as raw bytes, so open in binary mode
        with open(FILTER_FILE, 'rb') as f:
            self.f = BloomFilter.fromfile(f)
    except IOError:
        self.f = BloomFilter(capacity=10000000, error_rate=0.001)
    self.num = 0
def __init__(self,
             endpoint=config.config['general']['dbpedia']['endpoint'],
             one_hop_bloom_file=config.config['general']['dbpedia']['one_hop_bloom_file'],
             two_hop_bloom_file=config.config['general']['dbpedia']['two_hop_bloom_file']):
    super(DBpedia, self).__init__(endpoint)
    self.type_uri = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"
    if os.path.exists(one_hop_bloom_file):
        with open(one_hop_bloom_file, 'rb') as bloom_file:
            self.one_hop_bloom = BloomFilter.fromfile(bloom_file)
    else:
        self.one_hop_bloom = None
    self.two_hop_bloom_file = two_hop_bloom_file
    self.two_hop_bloom = dict()
    for item in [True, False]:
        file_path = two_hop_bloom_file.replace('spo2', 'spo2' + str(item))
        if os.path.exists(file_path):
            with open(file_path, 'rb') as bloom_file:
                self.two_hop_bloom[item] = ScalableBloomFilter.fromfile(bloom_file)
        else:
            self.two_hop_bloom[item] = ScalableBloomFilter(
                mode=ScalableBloomFilter.LARGE_SET_GROWTH)
    self.two_hop_bloom_counter = 0
def main():
    parser = argparse.ArgumentParser(prog='blacktop/nsrl')
    parser.add_argument("-v", "--verbose",
                        help="Display verbose output message",
                        action="store_true", required=False)
    parser.add_argument('name', metavar='FILE', type=str, nargs='+',
                        help='a file name to search for.')
    args = parser.parse_args()

    with open('nsrl.bloom', 'rb') as nb:
        bf = BloomFilter.fromfile(nb)

    for file_name in args.name:
        if args.verbose:
            if file_name in bf:
                print "File {} found in NSRL Database.".format(file_name)
            else:
                print "File {} was NOT found in NSRL Database.".format(file_name)
        else:
            print file_name in bf
    return
def update_bf(request):
    global proxies
    temp_list = request.split('\n')
    updated_proxy = temp_list[1]  # IP of the proxy whose filter changed

    # write the serialized bloom filter to a scratch file bf_upd
    temp_bf_recv = open('bf_upd', "w")
    temp_bf_recv.write(temp_list[2])
    temp_bf_recv.write('\n')
    temp_bf_recv.write(temp_list[3])
    temp_bf_recv.close()

    # de-serialize the bloom filter
    temp_bf_recv = open('bf_upd', 'r')
    temp_bf = BloomFilter.fromfile(temp_bf_recv)
    temp_bf_recv.close()
    if os.path.isfile('bf_upd'):
        os.remove('bf_upd')  # remove the scratch file

    index = 0
    for proxy in proxies:
        if proxy[0] == updated_proxy:
            # update only the proxy whose bloom filter changed
            print "UPDATING BF OF PROXY: ", proxy[0]
            curr_port = proxy[1]
            break
        index = index + 1
    proxies[index] = (updated_proxy, curr_port, temp_bf)  # replace the stale entry
def main():
    parser = argparse.ArgumentParser(prog='blacktop/nsrl')
    parser.add_argument("-v", "--verbose",
                        help="Display verbose output message",
                        action="store_true", required=False)
    parser.add_argument('hash', metavar='MD5', type=str, nargs='+',
                        help='an md5 hash to search for.')
    args = parser.parse_args()

    with open('nsrl.bloom', 'rb') as nb:
        bf = BloomFilter.fromfile(nb)

    for hash_hex in args.hash:
        hash = binascii.unhexlify(hash_hex)
        if args.verbose:
            if hash in bf:
                print "Hash {} found in NSRL Database.".format(hash_hex)
            else:
                print "Hash {} was NOT found in NSRL Database.".format(hash_hex)
        else:
            print hash in bf
    return
def open_spider(self, spider):
    brandName = 'mybloom'
    isexists = os.path.exists(brandName + '.blm')
    if isexists:
        self.bf = BloomFilter.fromfile(open(brandName + '.blm', 'rb'))
    else:
        self.bf = BloomFilter(100000, 0.001)
def start():
    res = request_get(biqukan_url)
    index = BeautifulSoup(res, features=features)
    if os.path.exists(bf_file):
        LOG.info('bs from file')
        bf = BloomFilter.fromfile(open(bf_file, 'rb'))
    else:
        LOG.info('init bs')
        bf = BloomFilter(500000)
    try:
        pool = Pool(size=pool_size)
        book_urls = find_wanben()
        book_urls += find_new_storage_block(index)
        # the non-ASCII arguments match the category headings on the site
        book_urls += find_recommend_block(index, u'强力推荐')
        book_urls += find_type_block(index, u'玄幻小说')
        book_urls += find_type_block(index, u'修真小说')
        book_urls += find_type_block(index, u'都市小说')
        book_urls += find_type_block(index, u'穿越小说')
        book_urls += find_type_block(index, u'网游小说')
        book_urls += find_type_block(index, u'科幻小说')
        book_urls += find_new_update_block(index)
        book_num = len(book_urls)
        for i, url in enumerate(book_urls):
            pool.spawn(download_book, url, bf)
            LOG.info(u'started download %s, %s remaining', i + 1, book_num - i - 1)
        pool.join()
        LOG.info(u'download finished')
    except Exception as e:
        LOG.exception(e)
    finally:
        bf.tofile(open(bf_file, 'wb'))
def __enter__(self):
    if os.path.exists(self.bloom_file):
        with open(self.bloom_file, 'rb') as f:
            self.bloom = BloomFilter.fromfile(f)
    else:
        self.bloom = BloomFilter(capacity=10000000, error_rate=0.001)
    return self.bloom
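# The matching __exit__ is not shown above; a minimal sketch (assuming the
# object keeps self.bloom_file and self.bloom from __enter__) would persist
# the filter on exit so the next run can reload it:
def __exit__(self, exc_type, exc_value, traceback):
    with open(self.bloom_file, 'wb') as f:
        self.bloom.tofile(f)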
def test():
    with open('blooms/boys', 'rb') as f:
        boys = BloomFilter.fromfile(f)
    with open('blooms/girls', 'rb') as f:
        girls = BloomFilter.fromfile(f)
    print "Enter a name:"
    while True:
        name = raw_input().strip().lower()
        if name in boys and name not in girls:
            print "That is a boy's name."
        elif name not in boys and name in girls:
            print "That is a girl's name."
        elif name in boys and name in girls:
            print "That could be either a boy's or a girl's name."
        else:
            print "That doesn't look like a boy's or a girl's name."
def add_proxy(request, conn):
    global proxies
    recv_list = request.split('\n')
    new_proxy = recv_list[1]
    new_port = int(recv_list[2])

    # write the bloom filter string to a scratch file, then de-serialize it
    temp_bf = open("temp_bf_rec", "w+")
    temp_bf.write(recv_list[3])
    temp_bf.write('\n')
    temp_bf.write(recv_list[4])
    temp_bf.close()
    temp_bf = open("temp_bf_rec", 'r')
    new_bf = BloomFilter.fromfile(temp_bf)
    temp_bf.close()
    if os.path.isfile('temp_bf_rec'):
        os.remove('temp_bf_rec')  # remove the scratch file
    print "GOT BLOOM FILTER"
    proxies.append((new_proxy, new_port, new_bf))

    temp_list = ['NEW LIST OF PROXIES']
    for proxy in proxies:
        temp_list.append('\n')
        temp_list.append(proxy[0])       # add IP
        temp_list.append('\n')
        temp_list.append(str(proxy[1]))  # add port
        temp_list.append('\n')
        # serialize the bloom filter for sending
        temp_bf = open('temp_bf_send', "w")
        proxy[2].tofile(temp_bf)
        temp_bf.close()
        # read it back as a string so it can be sent over the socket
        temp_bf = open('temp_bf_send', "r")
        temp2 = ''
        while 1:
            temp = temp_bf.read()
            if len(temp) > 0:
                temp2 = temp2 + temp
            else:
                break
        temp_bf.close()
        temp_list.append(temp2)  # add bloom filter
        if os.path.isfile('temp_bf_send'):
            os.remove('temp_bf_send')  # remove the scratch file

    temp_string = ''.join(temp_list)  # string representation of the list
    for proxy in proxies:
        if proxy[0] != bootstrap_proxy:  # only the bootstrapping proxy does this
            temp_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            # connect to each proxy and send the new list of proxies
            temp_sock.connect((proxy[0], proxy[1]))
            temp_sock.send(temp_string)
            temp_sock.close()
def init_bloom_filter(self, spider_name):
    self.bloom_file = '%s.bloom' % spider_name
    if os.path.exists(self.bloom_file):
        self.bloom_filter = BloomFilter.fromfile(open(self.bloom_file, 'rb'))
    else:
        self.bloom_filter = BloomFilter(capacity=100000000, error_rate=0.001)
def open_spider(self, spider):
    file_name = 'bloomfilter'
    is_exist = os.path.exists(file_name + '.blm')
    if is_exist:
        self.bf = BloomFilter.fromfile(open(file_name + '.blm', 'rb'))
        print('open blm file success')
    else:
        self.bf = BloomFilter(100000, 0.001)
        print("didn't find the blm file")
def __init__(self):
    print "Joint Linker initializing"
    try:
        f = open('../data/blooms/bloom1hoppredicate.pickle', 'rb')
        self.bloom1hoppred = BloomFilter.fromfile(f)
        f.close()
        f = open('../data/blooms/bloom1hopentity.pickle', 'rb')
        self.bloom1hopentity = BloomFilter.fromfile(f)
        f.close()
        f = open('../data/blooms/bloom2hoppredicate.pickle', 'rb')
        self.bloom2hoppredicate = BloomFilter.fromfile(f)
        f.close()
        f = open('../data/blooms/bloom2hoptypeofentity.pickle', 'rb')
        self.bloom2hoptypeofentity = BloomFilter.fromfile(f)
        f.close()
    except Exception, e:
        print e
        sys.exit(1)
def __init__(self, path):
    self.path = path
    self.rfile = None
    self.is_tofile = False
    if not os.path.isfile(path):
        self.bf = BloomFilter(100000, 0.001)
    else:
        self.rfile = open(path, 'rb')
        self.bf = BloomFilter.fromfile(self.rfile)
def __init__(self, endpoint="http://sda-srv01.iai.uni-bonn.de:8164/sparql", one_hop_bloom_file="./data/blooms/spo1.bloom"): super(DBpedia, self).__init__(endpoint) self.type_uri = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>" if os.path.exists(one_hop_bloom_file): with open(one_hop_bloom_file) as bloom_file: self.one_hop_bloom = BloomFilter.fromfile(bloom_file) else: self.one_hop_bloom = None
def _fromfile_(self):
    try:
        f = open('../out/filter', 'rb')
        self.bloom_cache = BloomFilter.fromfile(f)
        self.count = self.bloom_cache.count
        f.close()
    except Exception, ex:
        print(Exception, ex)
        self.bloom_cache = BloomFilter(capacity=10000000, error_rate=0.00001)
        self.count = 0
def __init__(self, name):
    super(BloomZip, self).__init__()
    self.__data = StringIO()
    self._name = name
    self._bf = None
    if os.path.isfile(self._name):
        with open(self._name, 'rb') as f:
            # the file starts with a 4-byte big-endian length prefix,
            # followed by exactly that many bytes of serialized filter
            length = struct.unpack(">L", f.read(4))[0]
            self._bf = BloomFilter.fromfile(f, length)
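# The writer side of the length-prefixed layout above is not shown; a
# minimal sketch (hypothetical _save_bloom helper, assuming self._bf holds
# a pybloom BloomFilter) would prefix the dump the same way:
def _save_bloom(self):
    import struct
    from io import BytesIO
    buf = BytesIO()
    self._bf.tofile(buf)                          # serialize to memory first
    payload = buf.getvalue()
    with open(self._name, 'wb') as f:
        f.write(struct.pack(">L", len(payload)))  # 4-byte big-endian length
        f.write(payload)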
def loadBloomFromFile(fileName=bloomDumpCapsName):
    from pybloom import BloomFilter
    try:
        bloom = BloomFilter.fromfile(open(fileName, 'rb'))
    except IOError as er:
        print 'load bloom from file failed, returning None', er
        return None
    except Exception as e:
        print 'load bloom from file got exception, returning None', e
        return None
    return bloom
def get_values_by_key_data(self, token, word_freq, offset, bloom_filter_dump_size):
    if word_freq is None:
        return numpy.zeros(0), None, 0
    if token in self.cache:
        self.update_cache(token)
        return self.cache[token][:3]
    self.values_file.seek(offset)
    codes = pickle.load(self.values_file)
    prob_filter = None
    if bloom_filter_dump_size:
        # the bloom filter dump follows the pickled codes at this offset
        prob_filter = BloomFilter.fromfile(self.values_file,
                                           bloom_filter_dump_size)
    self.update_cache(token, (codes, prob_filter, word_freq, offset,
                              bloom_filter_dump_size))
    return codes, prob_filter, word_freq
def read_redis_bf_from_file():
    try:
        f_r = open('bf_redis', 'rb')
        bf_redis = BloomFilter.fromfile(f_r)
        f_r.close()
        print "file exists, ok"
    except IOError:
        # no saved filter yet: create one and persist it
        f_w = open('bf_redis', 'wb')
        bf_redis = BloomFilter(capacity=10000000, error_rate=0.001)
        bf_redis.tofile(f_w)
        f_w.close()
        print "file does not exist, created new"
    return bf_redis
def __init__(self):
    self.faillog = open('fail.txt', 'a')
    if os.path.exists(settings['MONGODB_DB'] + '.urls'):
        self.bloomFilter = BloomFilter.fromfile(
            open(settings['MONGODB_DB'] + '.urls', 'rb'))
    else:
        self.bloomFilter = BloomFilter(1000000, 0.001)
    connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                     settings['MONGODB_PORT'])
    db = connection[settings['MONGODB_DB']]
    self.collection = db[settings['MONGODB_COLLECTION']]
    self.collection.ensure_index('url', unique=True)
    self.collection.create_index([("crawltime", DESCENDING)])
def __init__(self, cachefile, capacity=1000000, error_rate=0.001):
    self.cachefile = cachefile
    if os.name == 'nt' or not cachefile:
        # pure-Python pybloom on Windows or when no cache file is given
        from pybloom import BloomFilter
        if self.cache():
            with open(cachefile, 'rb') as fp:
                self.filter = BloomFilter.fromfile(fp)
        else:
            self.filter = BloomFilter(capacity=capacity, error_rate=error_rate)
    elif os.name == 'posix':
        # mmap-backed pybloomfilter on POSIX systems
        from pybloomfilter import BloomFilter
        if self.cache():
            self.filter = BloomFilter.open(self.cachefile)
        else:
            self.filter = BloomFilter(capacity, error_rate, cachefile)
def fromfile(cls, f):
    """Deserialize the ScalableBloomFilter in file object `f'."""
    filter = cls()
    filter._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT))))
    nfilters, = unpack('<l', f.read(calcsize('<l')))
    if nfilters > 0:
        # a table of nfilters little-endian uint64 lengths precedes the
        # concatenated per-filter dumps
        header_fmt = '<' + 'Q' * nfilters
        bytes = f.read(calcsize(header_fmt))
        filter_lengths = unpack(header_fmt, bytes)
        for fl in filter_lengths:
            filter.filters.append(BloomFilter.fromfile(f, fl))
    else:
        filter.filters = []
    return filter
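# The symmetric serializer, mirroring the layout fromfile reads above.
# pybloom ships its own tofile; this is only a sketch, assuming pack and
# calcsize are imported from struct as in fromfile:
def tofile(self, f):
    """Serialize this ScalableBloomFilter into file object `f'."""
    f.write(pack(self.FILE_FMT, self.scale, self.ratio,
                 self.initial_capacity, self.error_rate))
    f.write(pack('<l', len(self.filters)))  # number of sub-filters
    if len(self.filters) > 0:
        # reserve the uint64 length table, write each sub-filter's dump,
        # then seek back and fill in the real lengths
        header_fmt = '<' + 'Q' * len(self.filters)
        header_pos = f.tell()
        f.write(b'\x00' * calcsize(header_fmt))
        filter_sizes = []
        for bf in self.filters:
            begin = f.tell()
            bf.tofile(f)
            filter_sizes.append(f.tell() - begin)
        f.seek(header_pos)
        f.write(pack(header_fmt, *filter_sizes))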
def __init__(self):
    self.faillog = open('fail.txt', 'a')
    if os.path.exists(settings['DB_DB'] + '.urls'):
        self.bloomFilter = BloomFilter.fromfile(
            open(settings['DB_DB'] + '.urls', 'rb'))
    else:
        self.bloomFilter = BloomFilter(1000000, 0.001)
    connection = pymongo.MongoClient(settings['DB_SERVER'],
                                     settings['DB_PORT'])
    db = connection[settings['DB_DB']]
    self.collection = db[settings['DB_COLLECTION']]
    self.collection.ensure_index('url', unique=True)
    self.collection.create_index([("crawltime", DESCENDING)])
    self.url2name = self.loadDict(connection)
def waitForBloom(from_ip, print_labels=False, frame=None, print_start=0):
    # Create file for bloom filter import
    f = open('bloomFileIn', 'wb')

    # Listen on the given address
    host = from_ip
    port = 10000
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

    # Keep trying until the address can be bound
    while True:
        try:
            s.bind((host, port))
        except socket.error:
            pass
        else:
            break
    s.listen(1)
    print "Waiting for data..."
    conn, addr = s.accept()
    print "Connection from " + addr[0]

    # Receive bloom filter in increments
    data = conn.recv(1024)
    size = sys.getsizeof(data)
    while data:
        f.write(data)
        data = conn.recv(1024)
        size += sys.getsizeof(data)
    print "Received " + str(size / 1000) + " KB"

    # Print to GUI
    if print_labels:
        ttk.Label(frame, text=("Received bloom filter (" + str(size / 1000) +
                               " KB)")).grid(row=print_start, column=0)

    # Cleanup & bloom filter creation
    f.close()
    f = open('bloomFileIn', 'rb')
    bloom = BloomFilter.fromfile(f)
    f.close()
    conn.close()
    s.close()
    os.remove('bloomFileIn')
    return bloom
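# The sending side of this exchange is not shown; a minimal sketch
# (hypothetical sendBloom helper, streaming the dump in 1 KB chunks to the
# same port the receiver listens on):
def sendBloom(to_ip, bloom):
    import os
    import socket
    with open('bloomFileOut', 'wb') as f:
        bloom.tofile(f)  # serialize the filter to a temp file
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.connect((to_ip, 10000))
    with open('bloomFileOut', 'rb') as f:
        chunk = f.read(1024)
        while chunk:
            s.send(chunk)
            chunk = f.read(1024)
    s.close()
    os.remove('bloomFileOut')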
def __init__(self):
    self.redis = connects.RedisConnect(**config.redis_config)
    try:
        print u'initializing, please wait...'
        # try to open the file that holds the saved bloom filter
        f = open('/home/hujun/bloomfilter.txt', 'rb')
    except IOError:
        # the file does not exist, so create a fresh bloom filter
        print 'create a new bloomfilter without file'
        self.bloomfilter = BloomFilter(capacity=1000000, error_rate=0.00001)
        # seed the new filter with the first URL to crawl
        self.bloomfilter.add(config.first_url)
    else:
        print 'reload bloomfilter from a file'
        self.bloomfilter = BloomFilter.fromfile(f)
def test_recursive_domains(self):
    result = create_index(
        '/tmp/fake.csv',   # input filename
        self.test_file,    # file-like object
        0.0001,            # error rate
        1,                 # skip lines
        [3],               # fields
        ',',               # delimiter
        True)              # recursive domain
    self.assertEqual({'/tmp/fake.csv.3.bfindex': 9}, result)
    b = BloomFilter.fromfile(open('/tmp/fake.csv.3.bfindex', 'rb'))
    for word in ('subdomain.yahoo.com', 'yahoo.com', 'com',
                 'example.domain.com', 'domain.com',
                 'www.google.co.uk', 'google.co.uk', 'co.uk', 'uk'):
        self.assertEqual(True, word in b)
def fromfiles(cls, name, bufferSize=1024, dataDir=""):
    bloomFiles = glob.glob(os.path.join(dataDir, "%s-*.bloom" % name))
    dataFiles = glob.glob(os.path.join(dataDir, "%s-*.data" % name))
    if len(bloomFiles) == 0:
        return None
    factory = cls(name, bufferSize=bufferSize, dataDir=dataDir)
    for i in range(len(dataFiles)):
        b, d = bloomFiles[i], dataFiles[i]
        box = GadgetBox(b.split(".")[0].split(os.sep)[-1],
                        bufferSize, dataDir=dataDir)
        with open(os.path.join(dataDir, b), "rb") as f:
            box.filter = BloomFilter.fromfile(f)
        factory.boxes.append(box)
    return factory
def get_accurate_case(sql_result, blame_line, func_case_list):
    import os
    hash_code = sql_result[0]
    func_name = sql_result[1]
    file_path = sql_result[2]
    func_start_line = sql_result[3]
    func_end_line = sql_result[4]
    sql = "select CASE_ID,BLOOMFILTER from CASE_VERSION"
    cursor.execute(sql)
    rows = cursor.fetchall()
    for row in rows:
        case_id = row[0]
        BloomFilter_file = row[1]
        if os.path.exists(BloomFilter_file):
            bf = BloomFilter.fromfile(open(BloomFilter_file, "rb"))
            if hash_code in bf:
                global_accurate_case_list.append(case_id)
                analyze_git_blame(file_path, func_name, blame_line)
                func_case_list.append(case_id)
    return func_case_list
def convert_string_to_list(temp_buff):
    global proxies
    temp_list = temp_buff.split('\n')  # break up the string at each \n
    proxies = []
    size = len(temp_list)
    # build the list of (ip, port, bloom filter) tuples
    i = 1
    while i < size:
        temp_bf_read = open('bf_read', "w")   # scratch file bf_read
        temp_bf_read.write(temp_list[i + 2])  # write the bloom filter to it
        temp_bf_read.write('\n')
        temp_bf_read.write(temp_list[i + 3])
        temp_bf_read.close()
        temp_bf_read = open('bf_read', "r")
        temp_bf = BloomFilter.fromfile(temp_bf_read)  # de-serialize the bloom filter
        # add the tuple to the list of proxies
        proxies.append((temp_list[i], int(temp_list[i + 1]), temp_bf))
        i = i + 4
    if os.path.isfile('bf_read'):
        os.remove('bf_read')  # remove the scratch file
def test():
    with open('blooms/uni_names', 'rb') as f:
        uni_names = BloomFilter.fromfile(f)
    with open('ngpols/uni_names', 'rb') as f:
        uni_names_2 = NGPOLFilter.fromfile(f)
    with open('ngpols/uni_names_cluster', 'rb') as f:
        uni_names_3 = NGClusterFilter.fromfile(f)
    print "Enter a name:"

    def testname(name, filt):
        if name in filt:
            print "Yup, that name was in the owl doc."
        else:
            print "That name wasn't in the owl doc."

    while True:
        name = raw_input().strip().lower()
        print "Bloom:"
        testname(name, uni_names)
        print "NGOPL:"
        testname(name, uni_names_2)
        print "NGCluster:"
        testname(name, uni_names_3)
        print
def main():
    # url = raw_input('input the tieba you like')
    # the tieba url whose images you want to download; note the format '/f?kw=XXXXXXXXX'
    url = 'http://tieba.baidu.com/f?kw=轴心国画室'
    if not os.path.exists('%s' % (url[28:])):
        os.makedirs('%s' % (url[28:]))
        os.makedirs('./%s/check' % (url[28:]))
        record(url)
    else:
        numlist = getallnumlist(url)
        # use the bloom filter created on the first run to find the
        # posts that haven't been downloaded yet
        with open('./%s/check/bloomfilter' % (url[28:]), 'rb') as b:
            bloomfilter = BloomFilter.fromfile(b)
        img_no_download = []
        for i in numlist:
            if not bloomfilter.add(i):
                # add() returns False for new items, so this post is new
                img_no_download.append(i)
        if not img_no_download:
            print 'nothing to update'
        else:
            multiprocessdownload(img_no_download)
def test():
    with open('blooms/actor_names', 'rb') as f:
        bloom = BloomFilter.fromfile(f)
    with open('ngpols/actor_names', 'rb') as f:
        ngpol = NGPOLFilter.fromfile(f)
    with open('probsets/actor_names', 'rb') as f:
        probset = NgramProbSet.fromfile(f)
    print "Enter a title:"

    def testname(name, filt):
        if name in filt:
            print "Yup, that is an actor's name."
            if isinstance(filt, ProbabilitySet):
                print filt.getProbability(name)
        else:
            print "That wasn't an actor's name."

    while True:
        name = ''.join(re.findall('[A-Z0-9]+', raw_input().upper()))
        print "Bloom:"
        testname(name, bloom)
        print "NGOPL:"
        testname(name, ngpol)
        print "NgramProbSet:"
        testname(name, probset)
def start():
    res = request_get(biqukan_url)
    index = BeautifulSoup(res, features=features)
    if os.path.exists(bf_file):
        LOG.info('bs from file')
        bf = BloomFilter.fromfile(open(bf_file, 'rb'))
    else:
        LOG.info('init bs')
        bf = BloomFilter(500000)
    try:
        book_urls = find_wanben()
        book_urls += find_new_storage_block(index)
        # the non-ASCII arguments match the category headings on the site
        book_urls += find_recommend_block(index, u'强力推荐')
        book_urls += find_type_block(index, u'玄幻小说')
        book_urls += find_type_block(index, u'修真小说')
        book_urls += find_type_block(index, u'都市小说')
        book_urls += find_type_block(index, u'穿越小说')
        book_urls += find_type_block(index, u'网游小说')
        book_urls += find_type_block(index, u'科幻小说')
        book_urls += find_new_update_block(index)
        book_num = len(book_urls)
        start = time.time()
        for i, url in enumerate(book_urls[:10]):
            download_book(url, bf)
            LOG.info(u'downloaded %s, %s remaining', i + 1, book_num - i - 1)
        print '%s' % (time.time() - start)
        LOG.info(u'download finished')
    except Exception as e:
        LOG.exception(e)
    finally:
        bf.tofile(open(bf_file, 'wb'))
#!/usr/bin/env python
from rdp import *
from ngrams import *
from pybloom import BloomFilter
from edits import *

# initialize the actor inclusion set
with open('blooms/actor_names', 'rb') as f:
    actor_bloom = BloomFilter.fromfile(f)
with open('ngpols/actor_names', 'rb') as f:
    actor_ngpol = NGPOLFilter.fromfile(f)
actor_fuzzy = BloomFSS(actor_bloom, 1)
actor_filt = OrSet([actor_ngpol, actor_fuzzy])

# initialize the movie name inclusion set
with open('ngpols/film_titles', 'rb') as f:
    titles_filt = NGClusterFilter.fromfile(f)

# set up the grammar
S = Symbol('S')
is_ = InclusionSetTerminal('IS', set(['BE', 'IS', 'WAS', 'WERE', 'ARE', 'DOES']))
will_ = InclusionSetTerminal('WILL', set(['WILL', 'ARE']))
actor = InclusionSetTerminal('actor', actor_filt, max_words=3)
class UidQueue():
    """Uid queue, combining a queue with a bloom filter."""

    def __init__(self, max_count=200000, error_rate=0.001):
        """
        Initialize
        @param max_count: capacity of bloom filter
        @param error_rate: error rate of bloom filter
        @return: None
        """
        self.queue = Queue()
        self.bloom = BloomFilter(capacity=max_count, error_rate=error_rate)
        self.crawled = 0

    @staticmethod
    def _remove_duplicate(list_in):
        """
        Remove duplicated items in a list
        @param list_in: list
        @return: de-duplicated list
        """
        return list(set(list_in))

    def dump(self, path, encoding):
        """
        Dump data to file
        @param path: path prefix
        @param encoding: file encoding
        @return: None
        """
        try:
            print "Saving ... "
            with codecs.open(path + '-queue.bak', 'wb', encoding) as wf:
                tmp = {'queue': list(set(list(self.queue.queue))),
                       'count': self.crawled}
                json.dump(tmp, wf)
            with codecs.open(path + '-bloom.bak', 'wb') as wf:
                self.bloom.tofile(wf)
        except Exception as e:
            print "Dump Uid Queue Failed"
            print e

    def restore(self, path, encoding):
        """
        Restore data from file
        @param path: path prefix
        @param encoding: file encoding
        @return: None
        """
        try:
            with codecs.open(path + '-bloom.bak', 'rb') as rf:
                # fromfile is a classmethod that returns a new filter,
                # so the result must be assigned back
                self.bloom = BloomFilter.fromfile(rf)
            with codecs.open(path + '-queue.bak', 'rb', encoding) as rf:
                tmp = json.load(rf)
                [self.queue.put(uid) for uid in tmp['queue']]
                self.crawled = tmp['count']
            # setting encoding=utf-8 here is wrong; only deal with ascii
        except Exception as e:
            print "Restore Uid Queue Failed: ", e

    def _put_all(self, list_in, block, timeout):
        """
        Put every item in the list into the queue
        @param list_in: items source
        @param block: whether put may block
        @param timeout: timeout of put
        @return: None
        """
        [self.queue.put(uid, block, timeout) for uid in list_in]

    def extend(self, container, block=True, timeout=3):
        """
        Extend uid queue: skip items already seen, put the rest into the queue
        @param container: where the items come from
        @param block: whether put may block
        @param timeout: timeout of put
        @return: None
        """
        # TODO: it seems duplicates cannot be removed using set alone
        tmp = []
        for uid in container:
            if uid not in self.bloom and uid not in self.queue.queue:
                tmp.append(uid)
        self._put_all(list_in=tmp, block=block, timeout=timeout)

    def get(self, block=True, timeout=3):
        """
        Get uid from queue, and add it to the bloom filter
        @param block: whether get may block
        @param timeout: timeout of get
        @return: uid
        """
        uid = self.queue.get(block=block, timeout=timeout)
        self.bloom.add(uid)
        self.crawled += 1
        return uid

    def __len__(self):
        """Length of uid queue."""
        return self.queue.qsize()
def load_bloomfilter(flname):
    with open(flname, 'rb') as fl:
        bfilter = BloomFilter.fromfile(fl)
    return bfilter
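# The matching save step (hypothetical helper mirroring the loader above):
def save_bloomfilter(flname, bfilter):
    with open(flname, 'wb') as fl:
        bfilter.tofile(fl)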
from django.shortcuts import render
from django.http import HttpResponse
from django.template import loader
from django.http import JsonResponse
from django.views.decorators.csrf import csrf_exempt
from kafka import KafkaProducer
import random
import redis

from pybloom import BloomFilter

bloom_filter = BloomFilter(capacity=1000, error_rate=0.001)
try:
    with open("bloom", "rb") as bloom_file:
        bloom_filter = BloomFilter.fromfile(bloom_file)
except IOError:
    print "File not found"

_redis = redis.StrictRedis(password='******')
producer = KafkaProducer(bootstrap_servers='localhost:9092')


def validate_url(request):
    long_url = request.GET.get('long_url', None)
    data = {
        'is_taken': long_url == 'google.com'
    }
    return JsonResponse(data)


def manage_url(request, short_url):
    # check in bloom filter
    global bloom_filter
    try:
def main():
    default_config_file = '/nsrl/nsrl.conf'
    config = configparser.ConfigParser()
    config.read(default_config_file)

    # add command-line options
    hash_type = config.get('config', 'hashfile_type')
    parser = argparse.ArgumentParser(prog='nsrl')
    parser.add_argument("-v", "--verbose",
                        help="Display verbose output message",
                        action="store_true", required=False)
    parser.add_argument("-0", "--no-hits",
                        help="Suppress output of matching hashes",
                        action="store_true", required=False)
    parser.add_argument("-1", "--no-misses",
                        help="Suppress output of mismatching hashes",
                        action="store_true", required=False)
    inputs = parser.add_mutually_exclusive_group(required=True)
    inputs.add_argument('hash', metavar='<{}>'.format(hash_type), type=str,
                        nargs='*', default=[],
                        help='{} hash to search for.'.format(hash_type))
    inputs.add_argument('-s', '--stdin', help="Read hashes from stdin",
                        action="store_true")
    args = parser.parse_args()

    if args.verbose:
        print("Version INFO: {}".format(config.get('config', "rds_version")))
        print("Error Rate: {}".format(config.get('config', "error_rate")))
        print("Build Date: {}".format(config.get('config', "build_date")))
        print("Filename: {}".format(config.get('config', "hashfile_name")))
        print("Hashcount: {}".format(config.get('config', "hash_count")))

    with open('nsrl.bloom', 'rb') as nb:
        bf = BloomFilter.fromfile(nb)

    if args.stdin:
        hashlist = [hash.strip() for hash in sys.stdin.readlines()]
    else:
        hashlist = args.hash

    for hash_hex in hashlist:
        hash = binascii.unhexlify(hash_hex)
        output = ""
        # print only the hits and misses that were not suppressed
        hash_is_a_match = (hash in bf)
        if (hash_is_a_match and not args.no_hits) or \
                (not hash_is_a_match and not args.no_misses):
            if args.verbose:
                output = "{}:{}".format(hash_hex, hash_is_a_match)
            elif args.no_hits != args.no_misses:
                output = "{}".format(hash_hex)
            else:
                output = "{}:{}".format("+" if hash_is_a_match else "-",
                                        hash_hex)
            print(output)
    return
def get_bf_by_case(case_file_path):
    # the path must be quoted in the query; fetchone() returns a row tuple,
    # so take column 0 before opening the file
    sql = "select BLOOMFILTER from CASE_VERSION where PATH='%s'" % case_file_path
    cursor.execute(sql)
    bf_filename = cursor.fetchone()[0]
    return BloomFilter.fromfile(open(bf_filename, "rb"))
def fromfile(f):
    inst = BloomFFS(None, 0)
    inst.max_edits = pickle.load(f)
    inst.alphabet = pickle.load(f)
    inst.bloom = BloomFilter.fromfile(f)
    return inst
features = 'lxml'
url = 'https://www.biqukan.com'
wan_ben_url = 'https://www.biqukan.com/wanben'
bf_file = 'ikantxt2'
base_dir = u'/downloads/小说'
# base_dir = u'小说'
content_f = re.compile(u'.*正文卷')  # matches the main-text volume heading

req = requests.get(url=url, timeout=10)
html = req.text
index = BeautifulSoup(html, features=features)

if os.path.exists(bf_file):
    LOG.info('bs from file')
    bf = BloomFilter.fromfile(open(bf_file, 'rb'))
else:
    LOG.info('init bs')
    bf = BloomFilter(500000)


def find_title(name):
    return index.find('h2', text=name)


def find_container(name):
    return index.find('h2', text=name).find_next()


def find_wanben():
    book_urls = []
def _load_filter():
    with open(_BLOOM_DUMP, 'rb') as f:
        bf = BloomFilter.fromfile(f)
    return bf
#! /usr/bin/env python
import os
import sys

from pybloom import BloomFilter

if __name__ == "__main__":
    if os.path.exists("oom"):
        # a saved filter exists: report log entries it has not seen, then exit
        with open("oom", "rb") as ser:
            oom = BloomFilter.fromfile(ser)
        for f in os.listdir(sys.argv[1]):
            if f.find(".log") == -1:
                continue
            for line in open(os.path.join(sys.argv[1], f)):
                ss = line.strip().split("\t")
                if not (ss[len(ss) - 1] in oom):
                    print ss[len(ss) - 1]
        sys.exit(0)

    # otherwise build the filter from the logs and dump it to "oom"
    oom = BloomFilter(capacity=1000 * 1000 * 200, error_rate=0.0001)
    for f in os.listdir(sys.argv[1]):
        if f.find(".log") == -1:
            continue
        for line in open(os.path.join(sys.argv[1], f)):
            ss = line.strip().split("\t")
            oom.add(ss[len(ss) - 1])
            if not (ss[len(ss) - 1] in oom):  # sanity check; should never fire
                print ss[len(ss) - 1]
    with open("oom", "wb") as ser:
        oom.tofile(ser)