def setUpClass(cls) -> None:
    """Disable logging."""
    logging.getLogger().setLevel(logging.FATAL)
    shutil.rmtree(cls.test_dir, ignore_errors=True)
    os.makedirs(cls.test_dir, exist_ok=True)
    cls.records = [
        Record([0, 0, 0, 0, 0]),  # not in Bloom
        Record([1, 2, 3, 4, 5]),  # in Bloom
        Record([2, 2, 3, 4, 5]),  # in Bloom
        Record([3, 2, 3, 4, 5]),  # in Bloom
        Record([4, 2, 3, 4, 5]),  # not in Bloom
        Record([5, 2, 3, 4, 5]),  # not in Bloom
    ]
    for r in cls.records:
        r.set_hash_key(cls.hash_key)
    b = BloomFilter(100, 0.0001, cls.test_dir + "test.bloom")
    b.update([1, 2, 3, 4, 5, 6, 7, 8, 9, 'a', 'b', 'c'])
    b.add(b64encode(cls.records[1].get_long_hash()).decode())
    b.add(b64encode(cls.records[2].get_long_hash()).decode())
    b.add(b64encode(cls.records[3].get_long_hash()).decode())
    cls.b_encoded = b.to_base64().decode()
    cls.b = b
    cls.psi_ind = [
        cls.records[1].get_psi_index(),
        cls.records[2].get_psi_index(),
        cls.records[3].get_psi_index()
    ]
class List:
    """
    Class to read lists from disk, create regex and bloom filters,
    and finally write them back to disk on crawl completion
    """

    def __init__(self, path):
        print("Loading " + path + "...")
        self.path = path
        with open(path) as f:
            array = f.readlines()
        array = [x.strip() for x in array]
        array = list(set(array))
        self.array = array
        if path == "exclude.txt":
            self.regex = re.compile('(?:%s)' % '|'.join(self.array))
        self.bloom = BloomFilter(10000000, 0.01)
        self.bloom.update(self.array)

    def append(self, element):
        self.bloom.add(element)
        self.array.append(element)

    def concat(self, elements):
        self.array += elements

    def write(self):
        with open(self.path, 'w') as f:
            for item in self.array:
                f.write("%s\n" % item)
def main(): #Check for command line arguments if len(sys.argv) != 2: print 'Usage: %s [trace file]' % os.path.basename(sys.argv[0]) sys.exit(1) #Read arguments from command line inFile = sys.argv[1] bf1 = BloomFilter(100000000, 0.001, 'bf1') bf2 = BloomFilter(100000000, 0.001, 'bf2') outputFileName="converted-"+sys.argv[1] f = open(outputFileName, "a") for line in open(inFile,'r'): if (line[0:2]=="W," or line[0:2]=="R,"): hash1=int(hashlib.sha1(line[2:]).hexdigest(), 16) % (10 ** 10) hash2=int(hashlib.md5(line[2:]).hexdigest(), 16) % (10 ** 10) if (bf1.add(hash1) and bf2.add(hash2)): f.write('%s,%d\n' % (line[0],hash1*10000) ) else: f.write('%s,%d\n' % (line[0],hash2*10000) ) elif(line==''): break else: pass f.close()
class LinkFilter(): def __init__(self, domain): self.file_index = '%s_%s' % (domain, 'index.bf') self.file_html = '%s_%s' % (domain, 'html.bf') if os.path.exists(self.file_index): self.bf_index = BloomFilter.open(self.file_index) else: self.bf_index = BloomFilter(100000000, 0.001, self.file_index) if os.path.exists(self.file_html): self.bf_html = BloomFilter.open(self.file_html) else: self.bf_html = BloomFilter(100000000, 0.001, self.file_html) def index_filter(self, links): new_links = [] for link in links: if not self.bf_index.add(link.url): new_links.append(link) return new_links def html_filter(self, links): new_links = [] for link in links: #log.msg('This is a link : %s' % link, level=log.WARNING) if not self.bf_html.add(link.url): new_links.append(link) return new_links
def getbloomFilter(bf, bf_capacity, fem_kmers, kmer_size): if bf: print("Opening Bloom Filter of k-mers from female") female_kmers_bf = BloomFilter.open("data/female.bloom") print("Done") else: print("Need to make Bloom Filter of k-mers from female") bf_filename = "data/female.bloom" female_kmers_bf = BloomFilter(bf_capacity, .001, bf_filename) if fem_kmers: # if female kmers file exist female_kmers_file = "data/female_kmers" with open(female_kmers_file, 'r') as fm_kmers: #assumes kmers are uppercase first_line = fm_kmers.readline() kmers.test_valid_kmer_format(first_line, kmer_size) fm_kmers.seek(0) for line in fm_kmers: female_kmers_bf.add(line[:kmer_size]) else : print("Reading female reference one record at a time and k-merizing each record...") female_reference_file = "data/female.fasta" n_kmers = "N"*kmer_size for record in SeqIO.parse(female_reference_file,"fasta"): to_kmerize_fwd = str(record.seq).upper() length = len(to_kmerize_fwd) for i in range(0, length-kmer_size+1): female_kmer = to_kmerize_fwd[i:i+kmer_size] if female_kmer != n_kmers: female_kmers_bf.add(to_kmerize_fwd[i:i+kmer_size]) print("Done creating bloom filter") return female_kmers_bf
def get_spectrum(input_file, size=31):
    bams = pysam.AlignmentFile(input_file, 'rb')
    bloom_filter = BloomFilter(capacity=999999999, error_rate=0.1)
    # print(bloom_filter.bitarray)
    # print(bloom_filter.num_bits)
    # Count the number of occurrences (multiplicity) of every k-mer
    hash_dict = {}
    cnt = 0
    for r in bams:
        cnt += 1
        print(cnt)
        if cnt == 200000:
            break
        read = r.query_sequence
        kmers = get_kmers(read, size)
        # print(kmers)
        for kmer in kmers:
            is_in = kmer in bloom_filter
            if is_in is True:
                # Rule out false positives
                if kmer in hash_dict:
                    hash_dict[kmer] += 1
                else:
                    hash_dict[kmer] = 1
            else:
                bloom_filter.add(kmer)
                hash_dict[kmer] = 1
    # Drop k-mers that occur only once
    unique_kmer = []
    for key in hash_dict.keys():
        if hash_dict[key] == 1:
            unique_kmer.append(key)
    for i in range(len(unique_kmer)):
        hash_dict.pop(unique_kmer[i])
    # print(hash_dict)
    # Count how many k-mers share each multiplicity value
    stat_dict = {}
    for key in hash_dict.keys():
        multiplicity = hash_dict[key]
        if multiplicity not in stat_dict.keys():
            stat_dict[multiplicity] = 1
        else:
            stat_dict[multiplicity] += multiplicity
    frequency = []
    density = []
    for key in stat_dict.keys():
        frequency.append(key)
        density.append(stat_dict[key])
    return stat_dict, frequency, density
class Parser: def __init__(self, rule, item=None): self.rule = rule self.item = item self.parsing_urls = [] self.pre_parse_urls = [] self.filter_urls = BloomFilter(10000000, 0.01) self.done_urls = [] def add(self, urls): url = '{}'.format(urls) if url.encode('utf-8') not in self.filter_urls: self.filter_urls.add(url.encode('utf-8')) self.pre_parse_urls.append(url) def parse_urls(self, html): urls = re.findall(self.rule, html) for url in urls: self.add(url) async def parse_item(self, html): item = self.item(html) await item.save() self.item._item_count += 1 return item async def execute_url(self, spider, session, semaphore, url): html = await fetch(url, session, semaphore) if html is None: spider.error_urls.append(url) self.pre_parse_urls.append(url) return if url in spider.error_urls: spider.error_urls.remove(url) spider.urls_count += 1 self.parsing_urls.remove(url) self.done_urls.append(url) if self.item is not None: await self.parse_item(html) logger.info('Parsed({}/{}): {}'.format(len(self.done_urls), len(self.filter_urls), url)) else: spider.parse(html) logger.info('Followed({}/{}): {}'.format(len(self.done_urls), len(self.filter_urls), url)) async def task(self, spider, semaphore): with aiohttp.ClientSession() as session: while spider.is_running(): if len(self.pre_parse_urls) == 0: await asyncio.sleep(0.5) continue url = self.pre_parse_urls.pop() self.parsing_urls.append(url) asyncio.ensure_future( self.execute_url(spider, session, semaphore, url))
class URIBloomFilter(BaseDupeFilter): def __init__(self, settings, debug = False): self.capacity = settings.getint("DUPEFILTER_CAPACITY") self.filename = settings.get("DUPEFILTER_FILENAME") self.debug = debug self.error_rate = 0.01 self.logger = logging.getLogger(__name__) self.bloom_filter_ =BloomFilter(self.capacity, self.error_rate, self.filename) @classmethod def from_settings(cls, settings): debug = settings.getbool('DUPEFILTER_DEBUG') return cls(settings, debug) def request_seen(self, request): fp = self.request_fingerprint(request) if self.check(fp): return True else: self.insert(fp) ###-------todo-------## def request_fingerprint(self, request): return request_fingerprint(request) def check(self, request): ret = request in self.bloom_filter_ return ret def insert(self, request): self.bloom_filter_.add(request) #print len(self.bloom_filter_) #print self.bloom_filter_.hash_seeds #print self.bloom_filter_.num_bits #print self.bloom_filter_.num_hashes def reset(self): self.bloom_filter_.clear_all() def save(self): pass def load(self): self.bloom_filter_.sync() self.bloom_filter_.open("bloom.dump") pass def log(self, request, spider): if self.debug: msg = "Filtered duplicate request: %(request)s" self.logger.debug(msg, {'request': request}, extra={'spider': spider}) elif self.logdupes: msg = ("Filtered duplicate request: %(request)s" " - no more duplicates will be shown" " (see DUPEFILTER_DEBUG to show all duplicates)") self.logger.debug(msg, {'request': request}, extra={'spider': spider}) self.logdupes = False spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
def loadUsers(filename):
    usernames = BloomFilter(capacity=30000000, error_rate=0.00001)
    try:
        with open(filename, 'r') as f:
            for line in f:
                username = str(json.loads(line.strip())["username"])
                usernames.add(username)
    except IOError:
        print "file doesn't exist"
    return usernames
def slave_node(kmer_pool, slave_num, stat_pool):
    local_filter = BloomFilter(capacity=9999999999, error_rate=0.1)
    local_dict = {}
    try:
        print('Slave: ', slave_num, ' start')
        time_to_stop = 0
        while True:
            self_kmres = kmer_pool[slave_num]
            while len(self_kmres) != 0:
                time_to_stop = 0
                kmer = self_kmres[0]
                # print('Slave: ', slave_num, " del: ", kmer)
                del self_kmres[0]
                # Insert into the Bloom filter
                is_in = kmer in local_filter
                if is_in is True:
                    # Rule out false positives
                    if kmer in local_dict:
                        local_dict[kmer] += 1
                    else:
                        local_dict[kmer] = 1
                else:
                    local_filter.add(kmer)
                    local_dict[kmer] = 1
            kmer_pool[slave_num] = self_kmres
            time.sleep(0.001)
            time_to_stop += 1
            # Idle long enough: time to stop
            if time_to_stop == 50:
                # Drop k-mers that occur only once
                unique_kmer = []
                local_stat = {}
                # Discard unique k-mers and count how many k-mers share each multiplicity value
                for key in local_dict.keys():
                    multiplicity = local_dict[key]
                    if multiplicity == 1:
                        unique_kmer.append(key)
                    elif multiplicity not in local_stat.keys():
                        local_stat[multiplicity] = 1
                    else:
                        local_stat[multiplicity] += multiplicity
                for i in range(len(unique_kmer)):
                    local_dict.pop(unique_kmer[i])
                stat_pool = merge_dict(stat_pool, local_stat)
                break
        print('Slave: ', slave_num, ' end------------------')
    except Exception as ex:
        print('Slave: ', slave_num, ex)
def hashtable_with_bf(file_name, kmer_size, bf_capacity, bf_error, top_count, verbose): """ Hash table with bloom filter. :param file_name: File to be processed. :param kmer_size: Length of the kmer. :param bf_capacity: Capacity of the bloom filter. :param bf_error: Probability of false positive in bloom filter. :param top_count: Number of kmers to be printed :param verbose: Option to print elapsed time and memory usage. :return: """ start = time.time() # initialise a min heap h = Heap() h.populate(top_count) # initialise bloom filter bf = BF(bf_capacity, bf_error, "hashtable_with_bf") kmer_freq = dict() with open(file_name, "r") as file_from_read: count = 0 for line in file_from_read: # take the second line to parse kmers. if count % 4 == 1: line_length = len(line) for i in range(line_length - kmer_size + 1): kmer = line[i:kmer_size + i] if kmer in bf: if kmer not in kmer_freq: kmer_freq[kmer] = 1 kmer_freq[kmer] += 1 else: bf.add(kmer) count += 1 end = time.time() if verbose: print("Hash table done in {0} seconds".format(end - start)) start_heap = time.time() for kmer, freq in kmer_freq.iteritems(): if freq > h.min(): # h.pop() # h.push((freq, kmer)) h.push_pop((freq, kmer)) for item in h.nlargest(top_count): freq, kmer = item print(kmer, freq) end = time.time() # clean bf os.remove("hashtable_with_bf") if verbose: print("Heap done in {0} seconds".format(end - start_heap)) print("Process done in {0} seconds".format(end - start)) print("Hash table size: {0} MB".format( int(sys.getsizeof(kmer_freq) / 10**6)))
def bloom_full(basename: str): """Measure all values at once.""" if basename is None: raise ValueError("No basename given.") file_path = get_file_path("bloom_full", basename) partial_insert_file = file_path.replace(".csv", "_partial_insert.csv") rs = ROUNDS_START if not RESUME or not os.path.exists(file_path): # Write header if new file only row_fmt = f"ROUND;CAPACITY;ERROR_RATE;INSERTED ELEMENTS;" \ f"QUERIED ELEMENTS;SIZE;INSERT TIME;QUERY TIME;" \ f"# False Positives" write_header("Bloom Full", file_path, row_fmt) # row_fmt = f"ROUND;CAPACITY;ERROR_RATE;INSERTED ELEMENTS;" \ # f"SIZE;INSERT_TIME[s](for elements added in step);" # write_header("Bloom Partial Insert", partial_insert_file, row_fmt) else: # Read values to resume rs = get_round(file_path) for r in lb(range(rs, ROUNDS_END), "Rounds"): for capacity in lb(CAPACITY, "Capacities", leave=False): for error_rate in lb(ERROR_RATE, "Error Rates", leave=False): if FILL: i = [capacity] else: i = lb(INSERT, "Inserts", leave=False) for insert in i: with NamedTemporaryFile() as tmp: b = BloomFilter(capacity, error_rate, tmp.name) real_set = [random.random() for _ in range(insert)] start = time.monotonic() for s in real_set: # Add random value b.add(s) insert_time = time.monotonic() - start size = len(b.to_base64()) if QUERY_ALL: query_range = int(math.ceil(100 / error_rate)) else: query_range = QUERY for query in lb(query_range, "Queries", leave=False): # +1 because only values <1 stored query_set = [ random.random() + 1 for _ in range(query)] start = time.monotonic() false_positives = 0 for q in query_set: if q in b: false_positives += 1 query_time = time.monotonic() - start with open(file_path, "a") as fd: fd.write( f"{r};{capacity};{error_rate};" f"{insert};{query};{size};{insert_time};" f"{query_time};{false_positives}\n")
def test_bloom(self):
    s = server.StorageServer(test_dir)
    self.assertFalse(os.path.exists(self.bloom_path))
    with patch.object(s, "_initialize_bloom_filter") as m:
        b = s.bloom
        m.assert_called_once()
        b = BloomFilter(2, .1, self.bloom_path)
        b.add(5)
        c = s.bloom
        m.assert_called_once()  # No second call
        self.assertIn(5, c)
def load_bloom(kmers):
    """ Inserts all of the k-mers into the bloom filter """
    global bloom, filename
    filename = '%d_kmer_%d_rate.bloom' % (kmer_length, int(100 * error_rate))
    print(len(kmers) // 2)
    bloom = BloomFilter(len(kmers) // 2, error_rate, filename)
    for kmer in kmers:
        bloom.add(kmer)
class BloomFilterQueue(BaseQueue):
    def __init__(self, bloomfilter_path, capacity, wrong_rate, maxsize, *argv, **kw):
        super(BloomFilterQueue, self).__init__(maxsize)
        self.crawled = BloomFilter(capacity, wrong_rate, bloomfilter_path)

    def put_request(self, request, block=True, timeout=None):
        url = request["url"] if isinstance(request, ZRequest) else request
        if url in self.crawled:
            return False
        self.crawled.add(url)
        self._queue.put(request, block=block, timeout=timeout)
def get_spectrum(input_file, size=31):
    bams = pysam.AlignmentFile(input_file, 'rb')
    bloom_filter = BloomFilter(capacity=9999999999, error_rate=0.1)
    # print(bloom_filter.bitarray)
    # print(bloom_filter.num_bits)
    # Count the number of occurrences (multiplicity) of every k-mer
    hash_dict = {}
    cnt = 0
    for r in bams:
        cnt += 1
        print(cnt)
        # if cnt % 100000 == 0:
        #     print(cnt)
        if cnt == 100000:
            break
        read = r.query_sequence
        kmers = get_kmers(read, size)
        # Insert the k-mers into the Bloom filter
        for kmer in kmers:
            # a = int(hash(kmer) % 3)
            # get_hash(kmer, 3)
            is_in = kmer in bloom_filter
            if is_in is True:
                # Rule out false positives
                if kmer in hash_dict:
                    hash_dict[kmer] += 1
                else:
                    hash_dict[kmer] = 1
            else:
                bloom_filter.add(kmer)
                hash_dict[kmer] = 1
    unique_kmer = []
    stat_dict = {}
    # Count how many k-mers share each multiplicity value, and drop k-mers that occur only once
    for key in hash_dict.keys():
        multiplicity = hash_dict[key]
        if multiplicity == 1:
            unique_kmer.append(key)
        elif multiplicity not in stat_dict.keys():
            stat_dict[multiplicity] = 1
        else:
            stat_dict[multiplicity] += multiplicity
    for i in range(len(unique_kmer)):
        hash_dict.pop(unique_kmer[i])
    get_usage()
    return stat_dict
def create_bf():
    bf = BloomFilter(count, error_rate, 'filter_base.bloom')
    keyDigest_list = []
    FILE = open(keyDigestFile, 'r')
    for i in range(count):
        keyDigest = FILE.read(keyDigestLen)
        keyDigest_list.append(keyDigest)
    FILE.close()
    for publicKeyID in keyDigest_list:
        bf.add(publicKeyID)
def threaded_crawl(tid, n, proxies, lock, output_dir="."): global count global failures fails = 0 logger = logging.getLogger(__name__) fptr = open("top-1m.csv", "r") fail_thresh = 10 # Use a different proxy after 10 failed requests in a row proxy = dict() linum = fails = 0 start = tid * n # First seed site to crawl end = tid * n + n # Last seed site to crawl seed = BloomFilter(n * 1000000, 0.1, '/tmp/{}.bloom'.format(tid).encode()) frontier = deque() logger.info('[tid {}] Loading seed URLs {} - {}'.format(tid, start, end)) for line in fptr: if linum >= start and linum < end: url = "http://" + line.split(',')[1].strip() seed.add(url.encode()) frontier.append(url) linum += 1 fptr.close() while True: url = frontier.popleft() urls = [] try: urls = parse_url(url, proxy, output_dir) except Exception as e: logger.error( "[tid {}] Fatal error occured while crawling: {}.".format( tid, url)) if len(urls) == 0: with lock: failures += 1 fails += 1 if fails > fail_thresh: proxy['http'] = proxies[randint(0, len(proxies) - 1)] logger.error("[tid {}] Failure: Activating proxy:{}".format( tid, proxy['http'])) fails = 0 for u in urls: link = u.encode() if link not in seed: seed.add(link) frontier.append(link) with lock: count += 1 if (count % 1000 == 0): logger.info('Page count: {}'.format(count)) if len(frontier) % 1000 == 0: logger.info("[tid {}] Frontier count: {}".format( tid, len(frontier)))
def create(infile, outfile, capacity: int, error_rate: float = 0.05):
    import tqdm
    import urllib.parse
    from pybloomfilter import BloomFilter

    bf = BloomFilter(capacity, error_rate, outfile)
    with open(infile) as f:
        for _, word in enumerate(tqdm.tqdm(f, total=capacity)):
            if "%" in word:
                word = urllib.parse.unquote(word).lower()
            word = word.rstrip()
            bf.add(word)
    bf.close()
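# Hedged usage sketch for create() above; the word-list path, output name and
# sizing below are illustrative assumptions, not part of the original snippet.
if __name__ == "__main__":
    # Builds words.bloom from a newline-delimited word list (one entry per line).
    create("wordlist.txt", "words.bloom", capacity=1000000, error_rate=0.05)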
def process(files):
    # Iterate over the lines of all files listed in sys.argv[1:], defaulting to
    # sys.stdin if the list is empty.
    # If a filename is '-', it is also replaced by sys.stdin.
    if os.path.isfile(bloomfile):
        UNIQUES = BloomFilter.open(bloomfile)
    else:
        UNIQUES = BloomFilter(MAXUNIQUES, ACCUACY, bloomfile)
    for record in fileinput.input(files):
        record = str(record).strip()
        if record not in UNIQUES:
            UNIQUES.add(record)
            print record
    UNIQUES.sync()
    UNIQUES.close()
class DuplicatesPipeline(object): def __init__(self): self.bf = BloomFilter(10000000, 0.01, 'filter.bloom') self.f_write = open('visitedsites','w') self.si = SearchIndex() self.si.SearchInit() def process_item(self, item, spider): print '************%d pages visited!*****************' %len(self.bf) if self.bf.add(item['url']):#True if item in the BF raise DropItem("Duplicate item found: %s" % item) else: #print '%d pages visited!'% len(self.url_seen) self.save_to_file(item['url'],item['title']) self.si.AddIndex(item) return item def save_to_file(self,url,utitle): self.f_write.write(url) self.f_write.write('\t') self.f_write.write(utitle.encode('utf-8')) self.f_write.write('\n') def __del__(self): """docstring for __del__""" self.f_write.close() self.si.IndexDone()
class URLBloomFilter:
    dbconn = None
    cur = None
    urlbf = None
    sql = None

    def initdb(self, host='localhost', user='******', passwd='muye', db='muye', port=3306, charset='utf8'):
        self.dbconn = MySQLConnection.MySQLConn()
        self.dbconn.connect(m_host=host, m_user=user, m_passwd=passwd, m_db=db)
        self.cur = self.dbconn.cursor()

    def initfilter(self, filename='./url.filter'):
        if os.path.isfile(filename):
            self.urlbf = BloomFilter.open(filename)
        else:
            self.urlbf = BloomFilter(10000000, 0.001, filename)

    def initsql(self, m_sql):
        self.sql = m_sql

    def add(self, url):
        if not self.urlbf.add(url):
            self.cur.execute(self.sql, url)
            return True
        else:
            return False

    def close(self):
        self.dbconn.close()
class Filter(object): def __init__(self, capacity=1000000, errrate=0.01, fname="filter.bloom"): try: self.bf = BloomFilter.open(fname) except: self.bf = BloomFilter(capacity, errrate, fname) self.syncmax = 100 self.synccnt = 0 def isExists(self, value): if value: return value in self.bf return True def add(self, value): if value: try: ret = self.bf.add(value) self.synccnt += 1 if self.synccnt >= self.syncmax: self.bf.sync() self.synccnt = 0 return ret except Exception as e: mylog.info("bf add fail! %s %s" % (e, value)) return True def sync(self): self.bf.sync()
class MongoDBPipeline(object): def __init__(self): connection = pymongo.MongoClient( settings['MONGODB_SERVER'], settings['MONGODB_PORT'] ) db = connection[settings['MONGODB_DB']] self.collection = db[settings['MONGODB_COLLECTION']] self.bf = BloomFilter(10000000, 0.01, 'filter.bloom') self.si = SearchIndex() self.si.SearchInit() def process_item(self, item, spider): if self.bf.add(item['link']):#True if item in the BF raise DropItem("Duplicate item found: %s" % item) else: for data in item: if not data: raise DropItem("Missing data!") self.collection.update({'link': item['link']}, dict(item), upsert=True) log.msg("Question added to MongoDB database!",level=log.DEBUG, spider=spider) self.si.AddIndex(item) return item def __del__(self): self.si.IndexDone()
def seating_simulated (payloads, n, k, filter_filename=None): packets, ngrams, repeats = 0, 0, 0 add_ok, add_no, add_re = 0, 0, 0 add_repeat_but_bf_add_true = 0 bf = BloomFilter(n, 1/np.power(2,k), filter_filename) duniq = set() for payload in payloads: data = unhexlify(payload) dlen = len(data) packets += 1 for i in range(min(100, dlen-k+1)): d = data[i:i+k] ngrams += 1 if d not in duniq: duniq.add(d) repeated = False else: repeats += 1 repeated = True result = bf.add(d) if result == False: add_ok += 1 if repeated: add_repeat_but_bf_add_true += 1 elif result == True: if repeated: add_re += 1 else: add_no += 1 print("simulated: add_ok=(%d) add_no=(%d) add_re=(%d) repeats=(%d)" % (add_ok, add_no, add_re, repeats)) print("add_repeat_but_bf_add_true=(%d)" % add_repeat_but_bf_add_true) print("bf: num_bits=%d, capacity=%d, error_rate=%f, added=%d" % (bf.num_bits, bf.capacity, bf.error_rate, len(bf))) return add_ok
class bloomFilter():
    # Create an empty bloom filter
    def create_new_bf(self, capacity, error_rate, filename):
        self.bf = BloomFilter(capacity, error_rate, filename)

    # Open an existing bloom filter
    def open_bf(self, filename):
        self.bf = BloomFilter.open(filename)

    def add_item(self, item):
        self.bf.add(item)

    def check_membership(self, item):
        return item in self.bf

    def clear_all(self):
        self.bf.clear_all()
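# Minimal usage sketch of the bloomFilter wrapper above; the file name and the
# capacity/error-rate values are illustrative assumptions only.
wrapper = bloomFilter()
wrapper.create_new_bf(capacity=100000, error_rate=0.01, filename="example.bloom")
wrapper.add_item("alice")
print(wrapper.check_membership("alice"))  # True
print(wrapper.check_membership("bob"))    # False, up to the configured error rate
wrapper.clear_all()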
class TwoColumnBloomFilter(Filter):
    """
    Bloom filter that takes in inputs as 2-tuples of coordinates
    """
    def __init__(self, capacity, error_rate):
        super().__init__()
        self.bloom_filter = BloomFilter(capacity, error_rate)

    def build_filter(self, matrix):
        for row in matrix:
            self.bloom_filter.add(tuple(row))

    def __contains__(self, item):
        return tuple(item) in self.bloom_filter

    def size(self):
        return self.bloom_filter.num_bits // 8
class SpamCheck(object):
    def __init__(self):
        # Setup the logging
        self.ilog = logging.getLogger('prog')
        self.ilog.setLevel(logging.INFO)
        self.console = logging.StreamHandler(sys.stderr)
        self.console.setLevel(logging.INFO)
        self.console.setFormatter(logging.Formatter('%(message)s'))
        self.ilog.addHandler(self.console)
        # Try loading the filter
        try:
            self.__loadFilter__()
            self.ilog.debug("loading filter..")
        # Create the filter if not present
        except:
            self.ilog.debug("Exception in loading ....")
            self.__create__()
            self.ilog.debug("Creating the file ... ")

    def __loadFilter__(self):
        self.bf = BloomFilter.open('filter.bloom')

    def __create__(self):
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        # Let us initialize the first time, it's hacky but ok
        self.spam("000")
        # Generate the filter from a file
        with open("bad_numbers.txt") as f:
            for nums in f:
                self.bf.add(nums.rstrip())
                self.ilog.debug(".")

    def spam(self, bad_entity):
        with open("bad_numbers.txt", "a+") as f:
            f.write(bad_entity)
            f.write("\n")
        self.ilog.info("Added bad entry to file")
        self.bf.add(bad_entity)

    def isSpam(self, entity):
        return entity in self.bf
class BLOOMDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None):
        self.file = None
        self.fingerprints = BloomFilter(3000000, 0.00001, 'bloomTemp')

    @classmethod
    def from_settings(cls, settings):
        return cls(job_dir(settings))

    def request_seen(self, request):
        fp = request.url
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)

    def close(self, reason):
        self.fingerprints = None
def _count(self): """ Implements the Bloom Filter k-mer counting algorithm """ # initialize Bloom Filter bf = BloomFilter(self._reader.total_kmer, self._error_rate, 'kmer_bf') if self._verbose: # initialize progress bar current = 0 update_threshold = int(self._reader.total_kmer / 100) format_custom_text = FormatCustomText('Hash Size: %(value).1f MB', dict(value=0)) print('Hashing...') bar = ProgressBar( max_value=self._reader.total_kmer, widgets=[ Percentage(), ' ', SimpleProgress(format='(%s)' % SimpleProgress.DEFAULT_FORMAT, ), ' ', Bar(), ' ', Timer(), ' ', AdaptiveETA(), ' ', format_custom_text ]) bar.start() for kmer in self._reader.kmer(): if kmer not in bf: # not in Bloom Filter bf.add(kmer) else: # in Bloom Filter try: self._kmer_counter[kmer] += 1 # Increment except KeyError: self._kmer_counter[kmer] = 2 # Add to Hash Table if self._verbose: # update progress bar current += 1 if update_threshold == 0 or current % update_threshold == 0: size = sys.getsizeof(self._kmer_counter) / (1024**2) bar.update(current, format_custom_text.update_mapping(value=size)) os.remove('kmer_bf') # remove Bloom Filter from disk if self._verbose: bar.finish() print('Hashing Done!')
def _read_files_and_count(self): if self._verbose: print('Reading from files...') for j in range(self._np): if self._verbose: # initialize progress bar print('Partition #{}'.format(j + 1)) bar = ProgressBar(max_value=UnknownLength) bar.start() count = 0 bf = BloomFilter(self._capacity, self._error_rate, 'kmer_bf') kmer_counter = dict() with open(str(j), 'r') as f: # open file for the current partition for kmer in f: if kmer not in bf: # not in Bloom Filter bf.add(kmer) else: # in Bloom Filter try: kmer_counter[kmer] += 1 # in Hash Table except KeyError: # not in Hash Table kmer_counter[kmer] = 2 # Add to Hash Table if self._verbose: # update progress bar count += 1 bar.update(count) if self._verbose: bar.finish() print('Populating the heap...') for kmer, count in kmer_counter.items(): if count > self._heap[0][0]: # item is bigger than minimum # replace minimum item with the recent one # kmer.rstrip() is used to eliminate the new line heapq.heappushpop(self._heap, (count, kmer.rstrip())) if self._verbose: print('Heap is populated') print( ('Partition #{} has been completed with {:.1f} MB hash ' + 'table').format(j + 1, sys.getsizeof(kmer_counter) / (1024**2))) os.remove(str(j)) # remove the partition file os.remove('kmer_bf')
def dedup(fname):
    bf = BloomFilter(1E8, 0.01)
    with open(fname, 'r') as fin:
        with open('deduped.tsv', 'w') as fout:
            for line in fin:
                splitLine = line.split('\t')
                description = splitLine[5]
                if bf.add(md5.new(description).digest()):
                    continue
                else:
                    fout.write(line)
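# Hedged sketch of the dedup idiom used above: pybloomfilter's add() returns
# True when the element was (probably) already present, so "add and check" is a
# single call. The file name and sample values here are illustrative only.
from pybloomfilter import BloomFilter

demo = BloomFilter(1000, 0.01, 'demo.bloom')
print(demo.add('spam'))  # False: first time seen, element inserted
print(demo.add('spam'))  # True: already present (subject to the false-positive rate)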
def _create_or_return_bloom(self, elements=None, filename='hashdd.bloom'): """Creates and/or returns a bloom filter. If the filter does not exist, it will be created using the items in elements. If it does exist, it will be returned. Keyword Arguments: elements -- A list of strings to add to the bloom filter filename -- The filename where the bloom filter should be stored """ if os.path.isfile(filename): bf = BloomFilter.open(filename) else: print('[+] Creating Bloom filter with {} elements'.format(len(elements))) if not elements: raise Exception('Attempting to build a bloom filter, but have no items to add') limit = len(elements) bf = BloomFilter(limit, 0.0001, '{}'.format(filename)) for element in elements: bf.add(element) return bf
class LinkFilter():
    def __init__(self):
        if os.path.exists('bloomfilter'):
            self.bloomfilter = BloomFilter.open('bloomfilter')
        else:
            self.bloomfilter = BloomFilter(1000000, 0.01, 'bloomfilter')

    def process(self, links):
        new_links = []
        for link in links:
            if not self.bloomfilter.add(link.url):
                new_links.append(link)
        return new_links
class FilterPipeline(object): def __init__(self): self.bloomname = "filter" self.f = open("/home/hong/文档/sina_working/2to3_test/log.txt", 'a') self.now = time.time() self.es = Elasticsearch("10.13.1.126:9200") self.one_month_ago = datetime.datetime( time.localtime(self.now).tm_year, time.localtime(self.now).tm_mon - 1, time.localtime(self.now).tm_mday) def open_spider(self, spider): self.bloomname = "filter" isexists = os.path.exists(self.bloomname + ".bloom") if isexists: print("打开一个存在的filter文件", file=self.f) self.bf = BloomFilter.open(self.bloomname + ".bloom") else: print("创建一个新的filter文件", file=self.f) self.bf = BloomFilter(100000000, 0.001, self.bloomname + ".bloom") def process_item(self, item, spider): token = item['lost_mid'] time_temp = re.search(r'(\d+).?(\d+).?(\d+)', str(item['lost_time'])) time_stamp = datetime.datetime(int(time_temp.group(1)), int(time_temp.group(2)), int(time_temp.group(3))) if time.mktime(time_stamp.timetuple()) < time.mktime( self.one_month_ago.timetuple()): #print("At Time %s , the item[%s] : the datetime is overtimed._____________"%(time.ctime(),token),file=self.f) raise DropItem( "****************************The datetime is overtimed!!!!!") item['lost_title'] = item['lost_describe'] items = get_thing_array(item['lost_describe']) if not items: raise DropItem( "****************************the items has no match!!!!!") else: item['lost_describe'] = items flag = self.bf.add(token) if flag == False: return item #这里表示:如果没有重复,item接着放到下面的pipeline类中处理 else: self.f.write( "At Time %s , the item[%s] is overread url , Not Allowed._____________" % (time.ctime(), token)) self.f.close() raise DropItem( "****************************is the overread url!!!!!")
class DuplicatedFlowFilter(object):

    def __init__(self):
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')

    def add(self, flow):
        """
        :param flow: the flow dict received from Proxy.
        :return: if the flow already in the filter.
        """
        f = (flow[METHOD], flow[URL])
        return self.bf.add(f)

    def __contains__(self, flow):
        f = (flow[METHOD], flow[URL])
        return self.bf.__contains__(f)
class LinkFilter():
    def __init__(self, name):
        self.name = name + ".bf"
        self.bf = BloomFilter(100000000, 0.01, self.name)
        '''
        if os.path.exists(self.name):
            self.bf = BloomFilter.open(self.name)
        else:
            self.bf = BloomFilter(100000000, 0.01, self.name)
        '''

    def link_filter(self, links):
        new_links = []
        for link in links:
            if not self.bf.add(link.url):
                new_links.append(link)
        return new_links
def createBloomFilter(contentFile, filterFilename):
    bf = BloomFilter(10000000, 0.9999999, filterFilename)
    total = 0
    count = 0
    failed = 0
    with open(contentFile, "r") as f:
        for domain in f:
            total += 1
            d = domain.rstrip()
            if bf.add(d):
                count += 1
                print(d)
            else:
                failed += 1
    print "Total ", total
    print "Added ", count
    print "Conflicted", failed
class DuplicatesPipeline(object): def __init__(self): self.conn = Connection() self.db = self.conn.blog self.siteTables = self.db.siteTables self.bf = BloomFilter(100000000, 0.01, 'filter.bloom') self.f_write = open('visitedsites','w') def process_item(self, item, spider): print '***********%d pages visited! *********'%len(self.bf) if self.bf.add(item['web_urls']): raise DropItem("Duplicate item found: %s"%item) else: self.save_to_visited(item['web_urls'], item['title'], item['content']) return item def save_to_visited(self, url, utitle,content): self.siteTables.insert({"url":url,"title":utitle.encode('utf-8'), "content": content.encode('utf-8')}) self.f_write.write(url) self.f_write.write('\t') self.f_write.write(utitle.encode('utf-8')) self.f_write.write('\n') def __del__(self): self.f_write.close()
class DuplicateFilter(RFPDupeFilter): """ A dupe filter for url """ def __init__(self, path=FILTER_PATH, debug=False): if os.path.exists(FILTER_PATH): self.url_filter = BloomFilter.open(FILTER_PATH) else: print "created a new bloom filter. " self.url_filter = BloomFilter(100000, 0.00001, FILTER_PATH) super(DuplicateFilter, self).__init__(path, debug) def request_fingerprint(self, request): return request_fingerprint(request) def request_seen(self, request): if request.url.startswith("http://www.dianping.com/shop/"): fp = self.request_fingerprint(request) if self.url_filter.add(fp): print ">" * 5 + "filtered " + request.url + "<" * 5 return True def close(self, reason): self.url_filter = None
class MultinomiamNaiveBayes(object): def __init__(self, base, alpha, initial_capacity, error_rate, cache_size): self.initial_capacity = initial_capacity self.error_rate = error_rate self.alpha = alpha self.base = base #Tracks count | class for p(x|c) self.class_conditional_counts = BloomFreqMap(base) #Tracks count all tokens | class for p(x|c) self.tokens_per_class = {} #Tracks count(class) for p(c) self.class_freqs = {} #Counts vocab size for smoothing self.token_type_bf = BloomFilter(capacity=initial_capacity, error_rate=error_rate) self.vocab_sizes = {} #Tracks the tokens in each class so that we can penalize unseen tokens #self.class_to_toks_bf = {} self.N = 0 #instance count def makeTokenFreqmap(self, tokens): f = {} get = f.get for token in tokens: f[token] = get(token, 0) + 1 return f def fit(self, tokens, class_label): #if class_label not in self.class_to_toks_bf: # self.class_to_toks_bf[class_label] = BloomFilter(capacity=self.initial_capacity, error_rate=self.error_rate) if class_label not in self.vocab_sizes: self.vocab_sizes[class_label] = BloomFilter(capacity=self.initial_capacity, error_rate=self.error_rate) self.tokens_per_class[class_label] = self.tokens_per_class.get(class_label, 0) + len(tokens) tok_freqs = self.makeTokenFreqmap(tokens) for token, token_freq in tok_freqs.iteritems(): #self.class_to_toks_bf[class_label].add(token) self.token_type_bf.add(token) #conditional_counts_bf[token+'_'+class_label] += token_freq self.class_conditional_counts[token+'_'+class_label] += token_freq self.vocab_sizes[class_label].add(token) self.class_freqs[class_label] = self.class_freqs.get(class_label, 0) + 1 self.N += 1 def predict(self, tokens, tie_breaker='highest_freq', use_class_prior=True): N = self.N max_class, max_score = None, -inf tok_freqs = self.makeTokenFreqmap(tokens) num_instances = sum((item[1] for item in self.class_freqs.iteritems())) for c, cf in self.class_freqs.iteritems(): this_score = log(cf) - log(N) if use_class_prior else 0.0 f_t_c = self.tokens_per_class[c] num_unseen = 0 V = len(self.vocab_sizes[c]) theta_denominator = log(f_t_c + V) for token, freq in tok_freqs.iteritems(): count_in_c = self.class_conditional_counts[token+'_'+c] if count_in_c == 0: num_unseen += freq continue this_score += freq*(log(count_in_c + self.alpha) - theta_denominator) #Penalize unseen tokens this_score += num_unseen*(log(self.alpha) - log(theta_denominator)) max_score, max_class = max((max_score, max_class), (this_score, c)) return max_class, max_score
class Crawler: """Crawl a set of URLs. This manages two sets of URLs: 'urls' and 'done'. 'urls' is a set of URLs seen, and 'done' is a list of FetchStatistics. """ def __init__(self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = BloomFilter(10000000, 0.01) self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None def close(self): """Close resources.""" self.session.close() def host_okay(self, host): """Check if a host should be crawled. A literal match (after lowercasing) is always good. For hosts that don't look like IP addresses, some approximate matches are okay depending on the strict flag. """ host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): """Check if a host should be crawled, strict-ish version. This checks for equality modulo an initial 'www.' component. """ host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): """Check if a host should be crawled, lenient version. This compares the last two components of the host. """ return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): """Record the FetchStatistic for completed / failed URL.""" self.done.append(fetch_statistic) async def parse_links(self, response): """Return a FetchStatistic and list of links.""" links = set() content_type = None encoding = None body = await response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = await response.text() # Replace href with (?:href|src) to follow image links. 
urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text)) if urls: LOGGER.info('got %r distinct urls from %r', len(urls), response.url) for url in urls: LOGGER.info("response.url:%s,type:%s", response.url, type(response.url)) LOGGER.info("parse_links url:%s,type:%s", url, type(url)) normalized = urllib.parse.urljoin(str(response.url), url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) stat = FetchStatistic( url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links) - len(self.seen_urls)) return stat, links async def fetch(self, url, max_redirect): """Fetch one URL.""" tries = 0 exception = None while tries < self.max_tries: try: response = await self.session.get( url, allow_redirects=False) if tries > 1: LOGGER.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. LOGGER.error('%r failed after %r tries', url, self.max_tries) self.record_statistic(FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) return try: if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) self.record_statistic(FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) if next_url in self.seen_urls: return if max_redirect > 0: LOGGER.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: LOGGER.error('redirect limit reached for %r from %r', next_url, url) else: stat, links = await self.parse_links(response) self.record_statistic(stat) for link in utils.difference(links, self.seen_urls): # for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) # self.seen_urls.update(links) self.seen_urls.update(links) finally: await response.release() async def work(self): """Process queue items forever.""" try: while True: url, max_redirect = await self.q.get() assert url in self.seen_urls LOGGER.info("url:%s", url) LOGGER.info("max_redirect:%s", max_redirect) await self.fetch(url, max_redirect) self.q.task_done() except asyncio.CancelledError: pass def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): LOGGER.debug('skipping non-root host in %r', url) return False return True def add_url(self, url, max_redirect=None): """Add a URL to the queue if not seen before.""" if max_redirect is None: max_redirect = self.max_redirect LOGGER.debug('adding %r %r', url, max_redirect) self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) async def crawl(self): """Run the crawler until all finished.""" workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] self.t0 = time.time() yield self.q.join() self.t1 = time.time() for w in workers: w.cancel()
if __name__ == "__main__": processes = 4 urls_pool = Pool(processes = processes) for host in HOSTS: urls_to_crawl.append(host) try: while len(urls_to_crawl) > 0: time.sleep(1) for url in urls_to_crawl: #1__get items to crawl url = _remove_hash_in_url(url) #2__remove hash in each url if present urls_to_crawl.remove(url)#3__remove from list print len(urls_to_crawl) if in_visited(url) == False: #4__check to see if in visited visited.add(url) #5__add url to visited list VISITED_LIST.append(url) new_urls = urls_pool.apply_async(get_new_urls, [url]) #6__get new links and add them to tocrawl try: new_urls_get = new_urls.get() except (requests.exceptions.ConnectionError) as c: new_urls_get = False if new_urls_get: for new_url in new_urls_get: urls_to_crawl.append(new_url) write_crawled_sites(VISITED_LIST) except KeyboardInterrupt: print "quit..." exit(0)
path = os.path.join(base_dir, name) if not os.path.exists(path): os.mkdir(path) ##mm homepage url = "http:" + url mm_html = urllib2.urlopen(url).read().decode("GBK") mm_html = bs4.BeautifulSoup(mm_html).find_all("div", {"id":"J_AixiuShow"}) #find all pics pics = re.findall(rc, str(mm_html)) print pics count = 0 for pic_url in pics: count += 1 print "url", pic_url pic_name = os.path.join(path, str(count) + '.jpg') urllib.urlretrieve(pic_url, pic_name) print list_url # print tag_mm_name if __name__ == "__main__": print "hello" print checkIfInBloom("test") bf.add("test") print checkIfInBloom("test") get_mms(2, 3) print base_dir
count = 1000000
error_rate = 0.01
keyLen = 2048

# key digest length is 32 bytes
bf = BloomFilter(count, error_rate, 'filter.bloom')
keyDigest_list = []
FILE = open('/home/enzo/CCNx/python_projects/bloomfilter/PublicKeyDigest', 'r')
for i in range(count):
    keyDigest = FILE.read(32)
    keyDigest_list.append(keyDigest)
FILE.close()
for publicKeyID in keyDigest_list:
    bf.add(publicKeyID)

#print "length of list is %d" % len(keyDigest_list)
#randindex = random.randint(0, count-1)
#print "randindex is %s" % randindex
#publicKeyID = keyDigest_list[randindex]
#if publicKeyID in bf:
#    print "True"
#else:
#    print "False"
import sys
import glob
import cPickle
from pybloomfilter import BloomFilter

folderpath = sys.argv[1]

for book_filepath in glob.glob(folderpath + '/*.txt'):
    book = open(book_filepath).read()
    sentences = book.split('.')
    bf = BloomFilter(100000, 0.01, 'filter.bloom')
    for sentence in sentences:
        words = sentence.split()
        for word in words:
            bf.add(word.strip('"'))
    print 'the' in bf
    print 'wut' in bf
    print 'laughter' in bf
    BloomFilter.from_base64(book_filepath + '.bf', BloomFilter.to_base64(bf))
[0] https://github.com/andresriancho/w3af/issues/485 [1] https://github.com/axiak/pybloomfiltermmap/issues/50 """ print(OSX_MSG) else: try: # This might fail since it is a C library that only works in Linux from pybloomfilter import BloomFilter as CMmapFilter # There were reports of the C mmap filter not working properly in OSX, # just in case, I'm testing here... temp_file = GenericBloomFilter.get_temp_file() try: bf = CMmapFilter(1000, 0.01, temp_file) bf.add(1) assert 1 in bf assert 2 not in bf except: WrappedBloomFilter = FileSeekFilter else: WrappedBloomFilter = CMmapFilter except: WrappedBloomFilter = FileSeekFilter class BloomFilter(GenericBloomFilter): def __init__(self, capacity, error_rate): """ :param capacity: How many items you want to store, eg. 10000 :param error_rate: The acceptable false positive rate, eg. 0.001
# For each bloom filter
opened_bloom = []
for bloo in bloop_path_set:
    # Opening blooms
    opened_bloom.append(BloomFilter.open(bloo))

# For each hash of the paste
for line_hash in PST._get_hash_lines(min=5, start=1, jump=0):
    nb_hash_current += 1

    # Adding the hash in Redis & limiting the set
    if r_serv1.scard(line_hash) <= set_limit:
        r_serv1.sadd(line_hash, index)
        r_serv1.sadd("HASHS", line_hash)

    # Adding the hash in the bloom of the month
    bloom.add(line_hash)

    # Go through the Database of the bloom filter (of the month)
    for bloo in opened_bloom:
        if line_hash in bloo:
            db = bloo.name[-6:]
            # Go through the Database of the bloom filter (month)
            r_serv_bloom = dico_redis[db]

            # set of index paste: set([1,2,4,65])
            hash_current = r_serv_bloom.smembers(line_hash)
            # removing itself from the list
            hash_current = hash_current - set([index])

            # if the hash is present at least in 1 files
            # (already processed)
            if len(hash_current) != 0:
class ObjectTracker(object): invCleanPeriod = 300 invInitialCapacity = 50000 invErrorRate = 0.03 trackingExpires = 3600 initialTimeOffset = 60 def __init__(self): self.objectsNewToMe = RandomTrackingDict() self.objectsNewToThem = {} self.objectsNewToThemLock = RLock() self.initInvBloom() self.initAddrBloom() self.lastCleaned = time.time() def initInvBloom(self): if haveBloom: # lock? self.invBloom = BloomFilter(capacity=ObjectTracker.invInitialCapacity, error_rate=ObjectTracker.invErrorRate) def initAddrBloom(self): if haveBloom: # lock? self.addrBloom = BloomFilter(capacity=ObjectTracker.invInitialCapacity, error_rate=ObjectTracker.invErrorRate) def clean(self): if self.lastCleaned < time.time() - ObjectTracker.invCleanPeriod: if haveBloom: # FIXME if PendingDownloadQueue().size() == 0: self.initInvBloom() self.initAddrBloom() else: # release memory deadline = time.time() - ObjectTracker.trackingExpires with self.objectsNewToThemLock: self.objectsNewToThem = {k: v for k, v in self.objectsNewToThem.iteritems() if v >= deadline} self.lastCleaned = time.time() def hasObj(self, hashid): if haveBloom: return hashid in self.invBloom else: return hashid in self.objectsNewToMe def handleReceivedInventory(self, hashId): if haveBloom: self.invBloom.add(hashId) try: with self.objectsNewToThemLock: del self.objectsNewToThem[hashId] except KeyError: pass if hashId not in missingObjects: missingObjects[hashId] = time.time() self.objectsNewToMe[hashId] = True def handleReceivedObject(self, streamNumber, hashid): for i in network.connectionpool.BMConnectionPool().inboundConnections.values() + network.connectionpool.BMConnectionPool().outboundConnections.values(): if not i.fullyEstablished: continue try: del i.objectsNewToMe[hashid] except KeyError: if streamNumber in i.streams and \ (not Dandelion().hasHash(hashid) or \ Dandelion().objectChildStem(hashid) == i): with i.objectsNewToThemLock: i.objectsNewToThem[hashid] = time.time() # update stream number, which we didn't have when we just received the dinv # also resets expiration of the stem mode Dandelion().setHashStream(hashid, streamNumber) if i == self: try: with i.objectsNewToThemLock: del i.objectsNewToThem[hashid] except KeyError: pass def hasAddr(self, addr): if haveBloom: return addr in self.invBloom def addAddr(self, hashid): if haveBloom: self.addrBloom.add(hashid)
class Worker: def __init__(self, seeds, done_que, run_que): self.showpercounts = 10 self.timeout = 5 self.starttime = time.time() self.oldtime = 0 self.quit = 0 self.https_enable = 0 self.run_que = run_que self.done_que = done_que self.tasks = [] self.done = 1 self.errdone = set() self.err = Error() self.loadstate() self.blacklist = set (( '.blog.','.taobao.com','.baidu.com','.edu','.gov','.mil','mail','.google', 'weibo.com','t.cn','wikipedia','facebook','twitter','dropbox' )) self.allowdDomain = set(('com','net','org','cn','info','biz','me','name','cc','tv')) self.httpget = self.httpget_requests # down method self.httpget_requests | httpget_curl self.poolsize = 60 self.poolmaxfree = 20 self.freecount = 0 self.down_pool = Pool(size=self.poolsize) self.totalnettime = 0 self.cbcputime = 0 self.totaldownsize = 0 self.curspeed = 0 self.debugnosave = 1 self.tt = 1 self.done_sites_fname='done_sites.bin' try: self.bfdone = BloomFilter.open(self.done_sites_fname) except: self.bfdone = BloomFilter(2**23, 10**(-5), self.done_sites_fname) #8M if self.run_que.qsize() == 0: for seed in seeds: self.run_que.put( seed.split("http://")[1] ) if self.https_enable == 0: self.urlpatern = re.compile(r'href=["\']http://([^/?#\"\']+)',re.I) else: self.urlpatern = re.compile(r'href=["\']http[s]?://([^/?#\"\'"]+)',re.I) def cb_httpget(self, data = None): if not data: return seed, err, headers, content = data st = time.time() if err: self.handle_error(err,seed) return if self.https_enable == 0: seed = seed[7:] self.bfdone.add(seed) self.done += 1 data={'seed':seed,'headers':headers,'content':content} dat = cPickle.dumps(data) self.done_que.put(dat) et = time.time() self.cbcputime += (et-st) #self.tt=(et-st) if self.done % self.showpercounts == 0: self.out(seed) pass def out(self, seed): spendtime = time.time() - self.starttime spendtime = 1 if spendtime == 0 else spendtime nowh = str(int(spendtime)/3600)+":" if spendtime>3600 else "" now = "%s%02d:%02d" % (nowh, spendtime%3600/60, spendtime%60 ) print "%s D:%-4d R:%-7d [Speed: T%.2f/s C%3d/s A%.2f] CB:%0.4f Active:%d %s %s" % (now, (self.done), self.run_que.qsize(), \ (self.done)/(spendtime+self.oldtime), self.curspeed, self.tt, self.totalnettime / self.done ,self.poolsize-self.freecount, str(self.err), seed ) def work(self): while self.quit == 0: st = time.time() curdone = self.done self.freecount = self.down_pool.free_count() if self.freecount > self.poolmaxfree: self.tasks = [] minlen = min(self.freecount+1,self.run_que.qsize()) #if minlen <=0:break for i in range( minlen): stt = time.time() url = self.run_que.get() ett = time.time() if url in self.bfdone:# 5%-10% continue url = "http://"+url self.tasks.append(url) for url in self.tasks: self.down_pool.apply_async(self.httpget, (url,), callback=self.cb_httpget) time.sleep(0.1) et = time.time() self.curspeed = (self.done - curdone) / (et-st) #self.tt = (et-st) self.down_pool.join() print "All OVER" def handle_error(self,e,url): if e.find('DNSError') > 0 : self.err.dns += 1 self.err.rdns.append(url) elif e.find('reset') > 0 :#Connection reset self.err.reset += 1 self.err.rreset.append(url) elif e.find('Max retries') > 0 or e.find('Connection aborted'): # self.err.conntimeout += 1 self.err.rconntimeout.append(url) elif e.find('refused') > 0: #Connection refused self.err.refuse += 1 self.err.rrefuse.append(url) else: self.err.others +=1 self.err.rothers.append(url) print "Error", url, e # requests is better through test def httpget_requests(self, url): st = time.time() con = "" e = "" res_headers = "" headers = { 
'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.6', 'Accept-Encoding':'gzip,deflate', 'Connection':'close', 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36' } res = None try: # todo: query the ip of the website before get through dns req = requests req.max_redirects = 1 res = req.get(url, timeout = (3,2), headers = headers ) if self.https_enable == 0 and res.url.lower().startswith('http:'): if 'content-type' not in res.headers.keys() or 'html' not in res.headers['content-type']: return None con = res.content res.close() except KeyboardInterrupt: raise except Exception as e: e = str(e) if res: res.close() return url,e,None,None et = time.time() self.totalnettime += (et-st) self.tt = (et-st) return url, e, res.headers, con def savestate(self): self.quit = 1 now = time.time() self.oldtime += (now - self.starttime) #should hold on the singal for procdata done with open('state.txt','wb') as f: f.write(str(self.oldtime) + '\n') # tasks run_queue done f.write(str(len(self.tasks)) + '\n') for t in self.tasks: f.write(t + '\n') l = self.run_que.qsize() f.write(str(l)+ '\n') while l > 0: f.write( self.run_que.pop() + '\n') l-=1 f.write(str((self.done)) + '\n') with open('err_records.pack','wb') as f: cPickle.dump(self.err,f,2) print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), " Save state successfully." f.close() exit(0) def loadstate(self): try: with open('state.txt') as f: self.oldtime = float(f.readline()) tasks = int(f.readline()) for i in xrange(tasks): self.run_que.add(f.readline().rstrip('\n')) runnings = int(f.readline()) for i in xrange(runnings): self.run_que.add(f.readline().rstrip('\n')) self.done = int(f.readline()) with open('err_records.pack','rb') as f: self.err = cPickle.load(f) print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), " Load state successfuly." except Exception as e: print e
class MySpider(object): def __init__(self, start_url, basic_url): self.basic_url = basic_url self.start_url = start_url self.mysql = mysql.Mysql() self.re = re self.time = time self.datetime = datetime self.requests = requests # 使用bloom_filter去重,每次从文件中读取dump.bloom if os.path.isfile('filter.bloom'): self.bf = BloomFilter.open('filter.bloom') else: self.bf = BloomFilter(10000000, 0.01, 'filter.bloom') def __get_time(self): return self.datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") def __cal_time(self, date1, date2): date1 = self.time.strptime(date1, "%Y-%m-%d %H:%M:%S") date2 = self.time.strptime(date2, "%Y-%m-%d %H:%M:%S") date1 = self.datetime.datetime(date1[0], date1[1], date1[2], date1[3], date1[4], date1[5]) date2 = self.datetime.datetime(date2[0], date2[1], date2[2], date2[3], date2[4], date2[5]) return str(date2 - date1) def __log(self, log_str, *args, **kw): current_time = self.__get_time() print current_time + ' : ' + log_str, for each in args: print each, print '\n', for each in kw.keys(): print current_time + ' : ' + each + ' is ' + kw[each] def __process_text(self, my_str): my_str = self.re.sub("http.*?html", "", my_str).encode('utf-8') if type(my_str) == 'unicode': my_str = my_str.encode('utf-8') return my_str.replace(" ", "").replace(" ", "").replace('\n', '') def __open_url(self, url): req = self.requests.get(url) content = req.text soup = BeautifulSoup(content) return soup def __process_sub_content(self, result, insert_id): for each in result: # print '\n' + '楼层:' # 时间以及作者 # print each.attrs.get('js_restime') # print each.a.text # 内容 # bbs_content = each.select("[class~=bbs-content]") # text = bbs_content[0].text.strip() # 去除链接以及空格 # text = re.sub("http.*?html", "", text).encode('utf-8') # text = text.replace(" ", "").replace(" ", "") # print self.__process_text(text) # print process_text(text) replies = each.select('ul li') for reply in replies: self.__log('process the reply ... start') reply_time = reply.get('_replytime') reply_author = reply.get('_username') reply_content = reply.select("[class~=ir-content]") reply_text = reply_content[0].text reply_dict = { "title_id": insert_id, "author": reply_author, "time": reply_time, "text": reply_text } self.__log('content is', reply_text) self.__log('insert to database ...start') self.mysql.insert_data('reply', reply_dict) self.__log('insert to database ...done') self.__log('process the reply ... done') # 处理每一层楼 def process_content_page(self, url, author, reply_time, insert_id): self.__process_reply_page(url, author, reply_time, insert_id) def __process_reply_page(self, url, author, reply_time, insert_id): self.__log('process reply page... start') soup = self.__open_url(url) # 各层楼的tag result = soup.select("[class~=atl-item]") if len(result): self.__log('the html was read successfully') else: self.__log('html read fail. maybe the page is lose. function return') self.__log('process reply page ... 
done') return # 回复页总页数 page_id = soup.select("form a") if page_id: total_page_num = int(page_id[-2].text) else: total_page_num = 1 self.__log('have read', total_page_num, 'pages') # 首层楼的内容 main_content = result[0].select("[class~=bbs-content]") main_content = main_content[0].text.strip() main_text = self.__process_text(main_content) reply_dict = { "title_id": insert_id, "author": author, "time": reply_time, "text": main_text } self.mysql.insert_data('reply', reply_dict) result = result[1:] self.__log('process every floor') self.__process_sub_content(result, '1') if total_page_num > 1: for num in range(2, total_page_num + 1): self.__log('process the', str(num), 'reply page ... start') next_url = url[:-7]+str(num)+url[-6:] print next_url new_soup = self.__open_url(next_url) result = new_soup.select("[class~=atl-item]") self.__process_sub_content(result, insert_id) self.__log('process the', str(num), 'reply page ... done') self.__log('process reply page ... done') def __process_titles_page(self, page_url): self.__log('reading titles page .... start') req = self.requests.get(page_url) content = req.text soup = BeautifulSoup(content) # 获取所有标题 titles = soup.select('tbody tr') # 去掉不符合的部分 titles = titles[1:] # 对每一个标题进行处理 self.__log('reading titles page .... done') self.__log('processing all titles in', self.start_url, ' ... start') counter = 1 for each in titles: # 获取标题的tag信息 # 注意在beautifulSoup的tag中,空白也是标签,即相邻两个td之间标签还有空白 # 所以下面content索引需要考虑到这点 self.__log('process the', counter, 'title', ' ... start') counter += 1 title_content = each.contents title_href = title_content[1].a.get('href') # 获取标题链接 title_text = title_content[1].text.strip() # 获取标题内容 title_author = title_content[3].a.text # 获取作者 title_click_num = title_content[5].text # 点击数 title_reply_number = title_content[7].text # 获取回复数 title_time = title_content[9].get('title') # 获取时间 sub_href = self.basic_url + title_href # 子链接 # 构造标题的字典,插入标题 title_dict = { "reply_num": title_reply_number, "click_num": title_click_num, "author": title_author, "time": title_time, "link": sub_href, "text": title_text } # for each in title_dict: # print each # print type(title_dict[each]) # 利用链接地址和回复数判断是否重复 # flag = sub_href + title_click_num flag = sub_href if not (self.bf.add(flag)): self.__log('', flag, 'not in bloom filter') self.__log('insert to database ... start') insert_id = self.mysql.insert_data("titles", title_dict) self.__log('insert to database ... done') self.__process_reply_page(sub_href, title_author.encode('utf-8'), title_time, str(insert_id)) self.__log('process the', counter, 'title', ' ... done') # 下一页的链接 next_page_tag = soup.find('a', text='下一页') if next_page_tag: next_page = next_page_tag.get('href') next_page = self.basic_url + next_page else: next_page = None return next_page # 清空bloom filter def clean_bloom_filter(self): self.__log('clean all in bloom filter ... start') self.bf.clear_all() self.__log('clean all in bloom filter ... done') def bloom_filter_len(self): return len(self.bf) def main(self): self.__log('spider ... start') self.__log('process start url ... running') next_page = self.__process_titles_page(self.start_url) self.__log('process start url ... done') start_time = self.__get_time() print start_time depth = 1 while next_page: # if depth == 2: # break self.__log('now it is the', str(depth), 'page') next_page = self.__process_titles_page(next_page) depth += 1 end_time = self.__get_time() print end_time duration = self.__cal_time(start_time, end_time) self.__log('duration are', duration) self.__log('spider ... 
done') def clean_table(self, table): self.mysql.clean_table(table) def test(self): test_url = 'http://bbs.tianya.cn/post-333-778768-1.shtml' print self.bf.add(test_url)
class DuplicatesPipeline(object): def __init__(self): self.bf = BloomFilter(10000000, 0.01, 'filter.bloom') self.f_write = open('visitedsites','w') self.si = SearchIndex() self.si.SearchInit() self.count_num = 0 self.db = MySQLdb.connect("localhost","root","","storecount") self.cursor = self.db.cursor() self.cursor.execute("DROP TABLE IF EXISTS POPULAR") sql1 = """CREATE TABLE POPULAR(URL text(512),COUNT_MARK INT);""" try: self.cursor.execute(sql1) self.db.commit() # print "cao create" except: traceback.print_exc() self.db.rollback() # self.dbpool = adbapi.ConnectionPool('MySQLdb', # host = '127.0.0.1', # db = 'storecount', # user = '******', # passwd = '', # cursorclass = MySQLdb.cursors.DictCursor, # charset = 'utf8', # use_unicode = True) self.mark = 0 # def _conditional_insert(self,tx,item): # sql = 'insert into popular values (%s, %d)' # tx.execute(sql, (item['url'],self.mark)) def process_item(self, item, spider): # print '************%d pages visited!*****************' %len(self.bf) if self.bf.add(item['url']):#True if item in the BF sql2 = "UPDATE POPULAR SET COUNT_MARK = COUNT_MARK + 1 WHERE URL = '%s'" %item['url'] try: print "update" self.cursor.execute(sql2) self.db.commit() except: traceback.print_exc() self.db.rollback() # self.dbpool.runOperation("UPDATE popular SET mark+1") raise DropItem("Duplicate item found: %s" % item) else: #print '%d pages visited!'% len(self.url_seen) self.count_num += 1 self.save_to_file(item['url'],item['title']) self.si.AddIndex(item) sql3 = """INSERT INTO POPULAR(URL,COUNT_MARK) VALUES ("%s",0);""" % item['url'] try: self.cursor.execute(sql3) self.db.commit() except: traceback.print_exc() self.db.rollback() # self._conditional_insert(self,self.dbpool, item['url'], 0) # print self.count_num if self.count_num >=100000 and self.count_num % 10000 : print self.count_num return item def save_to_file(self,url,utitle): self.f_write.write(url) self.f_write.write('\t') self.f_write.write(utitle.encode('utf-8')) self.f_write.write('\n') def __del__(self): """docstring for __del__""" self.f_write.close() self.si.IndexDone()
import tempfile
import time

from pybloomfilter import BloomFilter

NS = 10**9

for _p in xrange(1, 3):
    p = 10 ** _p
    for e in xrange(9):
        with tempfile.NamedTemporaryFile() as f:
            X = int(1000 * 10 ** (e / 2.0))
            print X, p,
            name = f.name
            bloomfilter = BloomFilter(X + 1, 1.0 / p, name)
            t = time.time()
            for x in xrange(X):
                bloomfilter.add(x)
            print (time.time() - t) / X * NS,
            t = time.time()
            for x in xrange(X):
                x in bloomfilter
            print (time.time() - t) / X * NS,
            t = time.time()
            for x in xrange(X, 2*X):
                x in bloomfilter
            print (time.time() - t) / X * NS