def __init__(self, path=FILTER_PATH, debug=False): if os.path.exists(FILTER_PATH): self.url_filter = BloomFilter.open(FILTER_PATH) else: print "created a new bloom filter. " self.url_filter = BloomFilter(100000, 0.00001, FILTER_PATH) super(DuplicateFilter, self).__init__(path, debug)
class LinkFilter():
    """Per-domain link de-duplication backed by two on-disk bloom filters:
    one for index pages, one for html (content) pages."""

    def __init__(self, domain):
        self.file_index = '%s_%s' % (domain, 'index.bf')
        self.file_html = '%s_%s' % (domain, 'html.bf')
        # Reuse persisted filters when present, otherwise create them
        # (1e8 capacity, 0.001 error rate).
        if os.path.exists(self.file_index):
            self.bf_index = BloomFilter.open(self.file_index)
        else:
            self.bf_index = BloomFilter(100000000, 0.001, self.file_index)
        if os.path.exists(self.file_html):
            self.bf_html = BloomFilter.open(self.file_html)
        else:
            self.bf_html = BloomFilter(100000000, 0.001, self.file_html)

    def index_filter(self, links):
        """Return only links whose url was not already in the index filter."""
        # add() returns a truthy value when the url was already present.
        return [link for link in links if not self.bf_index.add(link.url)]

    def html_filter(self, links):
        """Return only links whose url was not already in the html filter."""
        return [link for link in links if not self.bf_html.add(link.url)]
def create_ref_bloom_filter(reference_file, error_rate, bf_file, format="fasta"):
    """From a given FASTA/FASTQ reference sequence create a bloom filter
    file from fixed-length reads taken from the first record.

    Fixes: the slice `sequence[i:i + read_len - 1]` yielded 108-char reads
    for a 109 read length (off-by-one); py2-only `.next()` replaced with
    `next()`; unused locals removed; the handle is closed via `with`.
    """
    if format == "fasta":
        file_it = FastaIterator
        record = lambda it: (seq.seq for seq in it)
    elif format == "fastq":
        file_it = FastqGeneralIterator
        record = lambda it: (seq for _, seq, _ in it)
    capacity = total_reads(reference_file)
    read_len = 109  # NOTE(review): hard-coded read length -- confirm
    bf = BloomFilter(capacity, error_rate, bf_file)
    with open(reference_file) as handle:
        read_it = record(file_it(handle))
        # NOTE(review): only the first record is consumed, as in the original.
        sequence = next(read_it)
        step = read_len
        i = 0
        while i < len(sequence):
            read = sequence[i:i + read_len]
            i += step
            print(read)
            # NOTE(review): BloomFilter.update() treats a string as an
            # iterable of characters -- confirm bf.add(read) wasn't intended.
            bf.update(read)
    bf.close()
def main(): #Check for command line arguments if len(sys.argv) != 2: print 'Usage: %s [trace file]' % os.path.basename(sys.argv[0]) sys.exit(1) #Read arguments from command line inFile = sys.argv[1] bf1 = BloomFilter(100000000, 0.001, 'bf1') bf2 = BloomFilter(100000000, 0.001, 'bf2') outputFileName="converted-"+sys.argv[1] f = open(outputFileName, "a") for line in open(inFile,'r'): if (line[0:2]=="W," or line[0:2]=="R,"): hash1=int(hashlib.sha1(line[2:]).hexdigest(), 16) % (10 ** 10) hash2=int(hashlib.md5(line[2:]).hexdigest(), 16) % (10 ** 10) if (bf1.add(hash1) and bf2.add(hash2)): f.write('%s,%d\n' % (line[0],hash1*10000) ) else: f.write('%s,%d\n' % (line[0],hash2*10000) ) elif(line==''): break else: pass f.close()
def __init__(self, seeds, done_que, run_que):
    """Crawler state: queues, pool/rate settings, and a persistent
    bloom filter of already-crawled sites."""
    self.showpercounts = 10      # progress is reported every N pages
    self.timeout = 5             # per-request timeout (seconds)
    self.starttime = time.time()
    self.oldtime = 0
    self.quit = 0
    self.https_enable = 0        # 0 = only follow plain http links
    self.run_que = run_que       # hosts still to crawl
    self.done_que = done_que     # crawled results
    self.tasks = []
    self.done = 1
    self.errdone = set()
    self.err = Error()
    self.loadstate()
    # host fragments never to crawl
    self.blacklist = set ((
        '.blog.','.taobao.com','.baidu.com','.edu','.gov','.mil','mail','.google',
        'weibo.com','t.cn','wikipedia','facebook','twitter','dropbox'
    ))
    # only these TLDs are followed
    self.allowdDomain = set(('com','net','org','cn','info','biz','me','name','cc','tv'))
    self.httpget = self.httpget_requests # down method self.httpget_requests | httpget_curl
    self.poolsize = 60           # greenlet pool size
    self.poolmaxfree = 20        # spawn more tasks when this many slots are free
    self.freecount = 0
    self.down_pool = Pool(size=self.poolsize)
    self.totalnettime = 0
    self.cbcputime = 0
    self.totaldownsize = 0
    self.curspeed = 0
    self.debugnosave = 1
    self.tt = 1
    self.done_sites_fname='done_sites.bin'
    # persistent "already done" set; recreate the filter when the file
    # is missing or unreadable
    try:
        self.bfdone = BloomFilter.open(self.done_sites_fname)
    except:
        self.bfdone = BloomFilter(2**23, 10**(-5), self.done_sites_fname) #8M
    if self.run_que.qsize() == 0:
        # seed the queue with bare hosts (scheme stripped)
        for seed in seeds:
            self.run_que.put( seed.split("http://")[1] )
    # link-extraction pattern; captures the host part of each href
    if self.https_enable == 0:
        self.urlpatern = re.compile(r'href=["\']http://([^/?#\"\']+)',re.I)
    else:
        self.urlpatern = re.compile(r'href=["\']http[s]?://([^/?#\"\'"]+)',re.I)
def __init__(self):
    """Wire up helper modules and load (or create) the persistent
    URL de-duplication bloom filter."""
    self.mysql = mysql.Mysql()
    self.re = re
    self.time = time
    self.datetime = datetime
    self.requests = requests
    # De-duplicate with a bloom filter persisted in new_filter.bloom;
    # start a fresh one only when no file exists yet.
    if not os.path.isfile("new_filter.bloom"):
        self.bf = BloomFilter(10000000, 0.01, "new_filter.bloom")
    else:
        self.bf = BloomFilter.open("new_filter.bloom")
def dedup(fname):
    """Copy *fname* to deduped.tsv, dropping rows whose sixth
    tab-separated column (the description) was seen before.

    Fixes: the removed py2-only `md5` module is replaced with
    `hashlib.md5`, and the float capacity 1E8 is made an int
    (BloomFilter expects an integer capacity).
    """
    bf = BloomFilter(int(1E8), 0.01)
    with open(fname, 'r') as fin:
        with open('deduped.tsv', 'w') as fout:
            for line in fin:
                description = line.split('\t')[5]
                # NOTE(review): encoding assumes UTF-8 input -- confirm.
                digest = hashlib.md5(description.encode('utf-8')).digest()
                # add() returns truthy when the digest was already present
                if not bf.add(digest):
                    fout.write(line)
def __init__(self, node_n, seen_persist, Q_logs=None):
    """URL-frontier state for crawl node *node_n*.

    seen_persist -- when true, try to reopen the persisted seen-URL
    bloom filter instead of starting empty.
    Q_logs -- optional queue that receives log messages.
    """
    self.node_n = node_n
    self.Q_logs = Q_logs
    self.total_crawled = 0
    self.payloads_dropped = 0
    # single variable for tracking whether node should be active or not
    self.active = True
    # crawl task Queue
    # Priority Queue ~ [ (next_pull_time, host_addr, url, parent_page_stats, seed_dist, parent_url) ]
    self.Q_crawl_tasks = Queue.PriorityQueue()
    # host queue dict
    # { host_addr: [(url, ref_page_stats, seed_dist, parent_url), ...] }
    self.hqs = {}
    # seen url check
    # Bloom Filter ~ [ url ]
    if seen_persist:
        try:
            self.seen = BloomFilter.open(BF_FILENAME)
        except:
            # fall back to a fresh filter if the file is missing/corrupt
            self.Q_logs.put('Error opening bloom filter, creating new one')
            self.seen = BloomFilter(BF_CAPACITY, BF_ERROR_RATE, BF_FILENAME)
    else:
        self.seen = BloomFilter(BF_CAPACITY, BF_ERROR_RATE, BF_FILENAME)
    # DNS Cache
    # { netloc: (host_addr, time_last_checked) }
    self.DNScache = {}
    # overflow url Queue
    # Queue ~ [ (host_addr, url, ref_page_stats, seen_dist, parent_url) ]
    self.Q_overflow_urls = Queue.Queue()
    # host queue cleanup Queue
    # Priority Queue ~ [ (time_to_delete, host_addr) ]
    self.Q_hq_cleanup = Queue.PriorityQueue()
    # active url count queue- for counting/tracking active
    # Queue ~ [ True ]
    self.Q_active_count = Queue.Queue()
    # thread active url dict- a dict of active urls by thread using, for restart dump
    # { thread_name: active_url }
    # NOTE: note that there are problems with this methodology, but that errors will only lead
    # to data redundancy (as opposed to omission)...
    self.thread_active = {}
    # Queue of messages to be sent to other nodes
    # Queue ~ [ (node_num_to, url, seed_dist, parent_page_stats) ]
    self.Q_to_other_nodes = Queue.Queue()
def create_bf():
    """Build 'filter_base.bloom' from the first *count* fixed-length key
    digests stored in *keyDigestFile* (module-level configuration).

    Fixes: the digest file is now closed via a context manager (it leaked
    on a read error) and the digests are added in a single pass instead of
    being buffered in an intermediate list.  Returns the filter.
    """
    bf = BloomFilter(count, error_rate, 'filter_base.bloom')
    with open(keyDigestFile, 'r') as fh:
        for _ in range(count):
            # each record is exactly keyDigestLen characters long
            bf.add(fh.read(keyDigestLen))
    return bf
def __init__(self, domain):
    """Open (or create) the two per-domain bloom filter files:
    one for index pages, one for html pages."""
    self.file_index = '%s_%s' % (domain, 'index.bf')
    self.file_html = '%s_%s' % (domain, 'html.bf')
    # Create fresh filters (1e8 capacity, 0.001 error rate) only when no
    # persisted file exists; otherwise reopen the mmap-backed file.
    if not os.path.exists(self.file_index):
        self.bf_index = BloomFilter(100000000, 0.001, self.file_index)
    else:
        self.bf_index = BloomFilter.open(self.file_index)
    if not os.path.exists(self.file_html):
        self.bf_html = BloomFilter(100000000, 0.001, self.file_html)
    else:
        self.bf_html = BloomFilter.open(self.file_html)
def __init__(self, start_url, basic_url):
    """Remember the crawl entry points and load (or create) filter.bloom."""
    self.basic_url = basic_url
    self.start_url = start_url
    self.mysql = mysql.Mysql()
    self.re = re
    self.time = time
    self.datetime = datetime
    self.requests = requests
    # De-duplicate with a bloom filter persisted on disk, reloaded per run.
    bloom_path = 'filter.bloom'
    if os.path.isfile(bloom_path):
        self.bf = BloomFilter.open(bloom_path)
    else:
        self.bf = BloomFilter(10000000, 0.01, bloom_path)
class URLBloomFilter:
    """URL de-duplicator: a persistent bloom filter backed by a MySQL
    table that records each URL the first time it is seen."""
    dbconn = None   # MySQL connection wrapper, set by initdb()
    cur = None      # database cursor, set by initdb()
    urlbf = None    # pybloomfilter BloomFilter, set by initfilter()
    sql = None      # INSERT statement template, set by initsql()

    def initdb(self, host = 'localhost', user = '******', passwd = 'muye', db = 'muye', port = 3306, charset = 'utf8'):
        # NOTE(review): the port/charset parameters are accepted but not
        # forwarded to connect() -- confirm whether that is intended.
        self.dbconn = MySQLConnection.MySQLConn()
        self.dbconn.connect(m_host = host, m_user = user, m_passwd = passwd, m_db = db)
        self.cur = self.dbconn.cursor()

    def initfilter(self, filename = './url.filter'):
        # Reuse the on-disk filter when present, else create it
        # (1e7 capacity, 0.001 error rate).
        if os.path.isfile(filename):
            self.urlbf = BloomFilter.open(filename)
        else:
            self.urlbf = BloomFilter(10000000, 0.001, filename)

    def initsql(self, m_sql):
        # Statement executed once per newly seen URL.
        self.sql = m_sql

    def add(self, url):
        """Record *url*: return True and persist it when new, False when seen."""
        # BloomFilter.add() returns truthy when the key was already present.
        if not self.urlbf.add(url):
            # NOTE(review): cursor.execute usually expects a parameter
            # sequence, e.g. (url,) -- confirm the wrapper accepts a str.
            self.cur.execute(self.sql, url)
            return True
        else:
            return False

    def close(self):
        self.dbconn.close()
class DuplicatesPipeline(object):
    """Scrapy pipeline that drops already-seen URLs (bloom filter),
    records new ones in 'visitedsites', and feeds the search index."""

    def __init__(self):
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.f_write = open('visitedsites', 'w')
        self.si = SearchIndex()
        self.si.SearchInit()

    def process_item(self, item, spider):
        print('************%d pages visited!*****************' % len(self.bf))
        # add() returns True when the url was already in the filter
        if not self.bf.add(item['url']):
            self.save_to_file(item['url'], item['title'])
            self.si.AddIndex(item)
            return item
        raise DropItem("Duplicate item found: %s" % item)

    def save_to_file(self, url, utitle):
        # one "<url>\t<title>\n" record per newly visited page
        self.f_write.write(url)
        self.f_write.write('\t')
        self.f_write.write(utitle.encode('utf-8'))
        self.f_write.write('\n')

    def __del__(self):
        """Release the sites file and finalize the search index."""
        self.f_write.close()
        self.si.IndexDone()
class DuplicatedFlowFilter(object):
    """De-duplicates proxy flows with a persistent bloom filter,
    keyed on the (method, url) pair."""

    def __init__(self):
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')

    def add(self, flow):
        """
        :param flow: the flow dict received from Proxy.
        :return: if the flow already in the filter.
        """
        return self.bf.add((flow[METHOD], flow[URL]))

    def __contains__(self, flow):
        return (flow[METHOD], flow[URL]) in self.bf
class MongoDBPipeline(object):
    """Pipeline that upserts unique items into MongoDB and indexes them."""

    def __init__(self):
        client = pymongo.MongoClient(
            settings['MONGODB_SERVER'],
            settings['MONGODB_PORT']
        )
        database = client[settings['MONGODB_DB']]
        self.collection = database[settings['MONGODB_COLLECTION']]
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.si = SearchIndex()
        self.si.SearchInit()

    def process_item(self, item, spider):
        # add() returns True when the link was already present -> duplicate
        if self.bf.add(item['link']):
            raise DropItem("Duplicate item found: %s" % item)
        # reject items with any empty field
        for data in item:
            if not data:
                raise DropItem("Missing data!")
        self.collection.update({'link': item['link']}, dict(item), upsert=True)
        log.msg("Question added to MongoDB database!",level=log.DEBUG, spider=spider)
        self.si.AddIndex(item)
        return item

    def __del__(self):
        self.si.IndexDone()
def __init__(self):
    """Load the bloom filter described by the app config, creating the
    backing file on first use."""
    bc = config.get_boolmfilter_config()
    bin_path = bc['bin_path']
    if os.path.exists(bin_path):
        self.bloomfilter = BloomFilter.open(bin_path)
    else:
        self.bloomfilter = BloomFilter(
            bc['capacity'], bc['wrong_rate'], bin_path)
def vote(request, poll):
    """Register a POSTed vote on *poll*, guarding against expired polls,
    repeat voters, and repeat IPs (tracked in a bloom filter serialized
    on the poll row)."""
    try:
        choice_name = request.POST['choice']
        selected_choice = poll.choice_set.get(choice=choice_name)
    except (KeyError, Choice.DoesNotExist):
        # No/unknown choice submitted: redisplay the poll with an error.
        return render_to_response('detail.html', {'poll':poll, 'error_message':"You didn't select a choice."}, context_instance= RequestContext(request))
    if not (poll.has_expired() or already_voted(request, poll)):
        # NOTE(review): `hash` shadows the builtin -- rename if touched.
        hash = request_hash(request)
        poll.total_votes += 1
        selected_choice.votes += 1
        poll.vote_set.create(hash=hash)
        selected_choice.save()
        #Update the seen ips
        from pybloomfilter import BloomFilter
        # Rehydrate the per-poll IP filter from its base64 form.
        bf = BloomFilter.from_base64('/tmp/bloom.filter', poll.ips_seen)
        alreadyseen = bf.add(request.META['REMOTE_ADDR'])
        if not alreadyseen:
            poll.ips_seen = bf.to_base64()
            poll.ips_count += 1
            # NOTE(review): poll.save() only runs for first-time IPs, so
            # the total_votes increment above is not persisted for repeat
            # IPs -- confirm this is intended.
            poll.save()
    return None
def count_matches(fastq_file, bf_files, sampling): """Goes through a fastq file and checks a sample of reads if they occur in the specified bloom filter. """ if isinstance(bf_files, basestring): bf_files = [bf_files] bf = {} observed = {} for bf_file in bf_files: bf[bf_file] = BloomFilter.open(bf_file) observed[bf_file] = 0 fastq_handle = open(fastq_file) fastq_it = FastqGeneralIterator(fastq_handle) checked = 0 sampling = int(sampling) # import ipdb # ipdb.set_trace() for i, (_, read, _) in enumerate(fastq_it): if not i + 1 % sampling: continue print read checked += 1 for bf_file in bf_files: if read in bf[bf_file]: observed[bf_file] += 1 fastq_handle.close() return checked, observed
def __init__(self, settings, debug = False):
    """Build the dupe filter from scrapy settings (capacity + filename)."""
    self.capacity = settings.getint("DUPEFILTER_CAPACITY")
    self.filename = settings.get("DUPEFILTER_FILENAME")
    self.debug = debug
    self.error_rate = 0.01  # fixed false-positive rate
    self.logger = logging.getLogger(__name__)
    self.bloom_filter_ = BloomFilter(self.capacity, self.error_rate, self.filename)
def __init__(self, roots, exclude=None, strict=True, # What to crawl.
             max_redirect=10, max_tries=4, # Per-url limits.
             max_tasks=10, *, loop=None):
    """Async crawler over *roots*: sets up the task queue, the seen-URL
    bloom filter, the shared aiohttp session, and the set of domains
    considered in scope."""
    self.loop = loop or asyncio.get_event_loop()
    self.roots = roots
    self.exclude = exclude
    self.strict = strict
    self.max_redirect = max_redirect
    self.max_tries = max_tries
    self.max_tasks = max_tasks
    self.q = Queue(loop=self.loop)
    # in-memory bloom filter standing in for a seen-URL set
    self.seen_urls = BloomFilter(10000000, 0.01)
    self.done = []
    self.session = aiohttp.ClientSession(loop=self.loop)
    self.root_domains = set()
    for root in roots:
        parts = urllib.parse.urlparse(root)
        # NOTE(review): urllib.parse.splitport is deprecated (removed in
        # recent Pythons) -- consider parts.hostname / parts.port.
        host, port = urllib.parse.splitport(parts.netloc)
        if not host:
            continue
        if re.match(r'\A[\d\.]*\Z', host):
            # literal IP address: kept verbatim
            self.root_domains.add(host)
        else:
            host = host.lower()
            if self.strict:
                self.root_domains.add(host)
            else:
                # strip subdomains for lenient matching
                self.root_domains.add(lenient_host(host))
    for root in roots:
        self.add_url(root)
    self.t0 = time.time()
    self.t1 = None
def __init__(self):
    """Pipeline state: bloom filter for dedup, visited-sites log file,
    search index, and a freshly created MySQL POPULAR table for counts."""
    self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
    self.f_write = open('visitedsites','w')
    self.si = SearchIndex()
    self.si.SearchInit()
    self.count_num = 0
    # NOTE(review): root connection with an empty password -- confirm.
    self.db = MySQLdb.connect("localhost","root","","storecount")
    self.cursor = self.db.cursor()
    # start from a clean table on every run
    self.cursor.execute("DROP TABLE IF EXISTS POPULAR")
    sql1 = """CREATE TABLE POPULAR(URL text(512),COUNT_MARK INT);"""
    try:
        self.cursor.execute(sql1)
        self.db.commit()
        # print "cao create"
    except:
        traceback.print_exc()
        self.db.rollback()
    # self.dbpool = adbapi.ConnectionPool('MySQLdb',
    #                                     host = '127.0.0.1',
    #                                     db = 'storecount',
    #                                     user = '******',
    #                                     passwd = '',
    #                                     cursorclass = MySQLdb.cursors.DictCursor,
    #                                     charset = 'utf8',
    #                                     use_unicode = True)
    self.mark = 0
class URIBloomFilter(BaseDupeFilter):
    """Scrapy dupe filter backed by a persistent bloom filter.

    Fixes: `self.logdupes` was read in log() but never initialized, which
    raised AttributeError on the first non-debug duplicate.
    """

    def __init__(self, settings, debug = False):
        self.capacity = settings.getint("DUPEFILTER_CAPACITY")
        self.filename = settings.get("DUPEFILTER_FILENAME")
        self.debug = debug
        self.logdupes = True  # log the first duplicate even when not debugging
        self.error_rate = 0.01
        self.logger = logging.getLogger(__name__)
        self.bloom_filter_ = BloomFilter(self.capacity, self.error_rate, self.filename)

    @classmethod
    def from_settings(cls, settings):
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(settings, debug)

    def request_seen(self, request):
        """Return True for a previously-seen request, else record it."""
        fp = self.request_fingerprint(request)
        if self.check(fp):
            return True
        self.insert(fp)

    def request_fingerprint(self, request):
        return request_fingerprint(request)

    def check(self, request):
        return request in self.bloom_filter_

    def insert(self, request):
        self.bloom_filter_.add(request)

    def reset(self):
        self.bloom_filter_.clear_all()

    def save(self):
        pass

    def load(self):
        # NOTE(review): BloomFilter.open is a classmethod; calling it on
        # the instance does not load "bloom.dump" into this filter --
        # confirm the intended behavior before relying on load().
        self.bloom_filter_.sync()
        self.bloom_filter_.open("bloom.dump")

    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False
        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
def _process_one(data_file):
    '''Process one output file to generate a bloom filter'''
    path, dump_name = os.path.split(data_file)
    parent_dir = os.path.split(path)[1]
    # mirror the source layout under bloom_filters/
    bf_dir_path = os.path.join('bloom_filters', parent_dir)
    if not os.path.isdir(bf_dir_path):
        os.mkdir(bf_dir_path)
    bf_file_path = os.path.join(bf_dir_path, dump_name)
    if not os.path.isfile(bf_file_path):
        # only build the filter when it does not exist yet
        ncpu, _, nparts, _, _, _, ids = read_output(data_file, header_only=False)
        bloom = BloomFilter(nparts, 1. / ncpu, bf_file_path)
        bloom.update(ids)
    return bf_file_path
class SpamCheck (object):
    """Checks entities against a persistent bloom filter of known-bad
    numbers (backed by bad_numbers.txt)."""

    def __init__(self):
        # Setup the logging
        self.ilog = logging.getLogger('prog')
        self.ilog.setLevel(logging.INFO)
        self.console = logging.StreamHandler(sys.stderr)
        self.console.setLevel(logging.INFO)
        self.console.setFormatter(logging.Formatter('%(message)s'))
        self.ilog.addHandler(self.console)
        # Try loading the filter; create it if not present.
        try:
            self.__loadFilter__()
            # Fix: this line used the undefined name `ilog`, so a
            # NameError always sent us into the except branch and rebuilt
            # the filter even when loading succeeded.
            self.ilog.debug("loading filter..")
        except:
            self.ilog.debug("Exception in loading ....")
            self.__create__()
            self.ilog.debug("Creating the file ... ")

    def __loadFilter__(self):
        self.bf = BloomFilter.open('filter.bloom')

    def __create__(self):
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        # Let us initalize the first time, it hacky but ok
        self.spam("000")
        # Generate the filter from a file, closing it deterministically
        with open("bad_numbers.txt") as f:
            for nums in f:
                self.bf.add(nums.rstrip())
        self.ilog.debug(".")

    def spam(self, bad_entity):
        """Append *bad_entity* to the data file and the filter."""
        with open("bad_numbers.txt","a+") as f:
            f.write(bad_entity)
            f.write("\n")
        self.ilog.info("Added bad entry to file")
        self.bf.add(bad_entity)

    def isSpam(self, entity):
        """Return True when *entity* is (probably) a known-bad entry."""
        return entity in self.bf
def __create__(self):
    """Create filter.bloom and seed it from bad_numbers.txt."""
    self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
    # First-time initialisation (hacky but harmless): record a sentinel.
    self.spam("000")
    # Populate the filter from the bad-numbers data file.
    with open("bad_numbers.txt") as fh:
        for entry in fh:
            self.bf.add(entry.rstrip())
    self.ilog.debug(".")
def __init__(self,seeds):
    """Crawler state: greenlet pool, run queue, and a persistent
    bloom filter of finished sites."""
    self.showpercounts = 10      # progress report frequency
    self.timeout = 5             # per-request timeout (seconds)
    self.starttime = time.time()
    self.oldtime = 0
    self.quit = 0
    self.https_enable = 0        # 0 = follow only plain http links
    self.run_queue = Queue()
    self.tasks = []
    self.done = 0
    self.errdone = set()
    self.err = Error()
    self.loadstate()
    #self.whitelist = ['html','htm','php','shtml','asp','jsp','do','action','aspx']
    # host fragments never to crawl
    self.blacklist = set ((
        '.blog.','.taobao.com','.baidu.com','.edu','.gov','.mil','mail','.google',
        'weibo.com','t.cn','worldpress.com','blogspot.com','youtube','wikipedia','facebook','twitter','dropbox'
    ))
    # only these TLDs are followed
    self.allowdDomain = set(('com','net','org','cn','info','biz','me','name','cc','tv'))
    self.httpget = self.httpget_requests # down method self.httpget_requests | httpget_curl
    self.poolsize = 100          # greenlet pool size
    self.poolmaxfree = 40        # spawn more tasks when this many slots are free
    self.freecount = 0
    self.down_pool = Pool(size=self.poolsize)
    self.mutex = gevent.coros.RLock()
    self.totalnettime = 0
    self.cbcputime = 0
    self.totaldownsize = 0
    self.curspeed = 0
    self.debugnosave = 1
    # persistent "already crawled" set; recreate if missing/corrupt
    try:
        self.bfdone = BloomFilter.open('done_sites.bin')
    except:
        self.bfdone = BloomFilter(2**23, 10**(-5), 'done_sites.bin')
    if self.run_queue.qsize() == 0:
        # seed the queue with bare hosts (scheme stripped)
        for seed in seeds:
            self.run_queue.put( seed.split("http://")[1] )
    # link-extraction pattern; captures the host part of each href
    if self.https_enable == 0:
        self.urlpatern = re.compile('href=[\"\']http://([^/?#\"\']+)')
    else:
        self.urlpatern = re.compile('href=[\"\']http[s]?://([^/?#\"\'"]+)')
def createBloomFilter(contentFile, filterFilename):
    """Load one domain per line from *contentFile* into a new bloom
    filter persisted at *filterFilename*; print add/conflict stats."""
    # NOTE(review): an error rate of 0.9999999 makes almost every add
    # report a collision -- 1e-7 was probably intended; confirm.
    bf = BloomFilter(10000000, 0.9999999, filterFilename)
    total = 0
    count = 0
    failed = 0
    with open(contentFile, "r") as f:
        for domain in f:
            total += 1
            d = domain.rstrip()
            # NOTE(review): pybloomfilter's add() returns True when the
            # key was *already* present, so "Added" appears to count
            # repeats and "Conflicted" first-time entries -- confirm.
            if bf.add(d):
                count += 1
                print(d)
            else:
                failed += 1
    print "Total ", total
    print "Added ", count
    print "Conflicted", failed
def __init__(self):
    """Connect to MongoDB, open the dedup bloom filter and the search index."""
    client = pymongo.MongoClient(
        settings['MONGODB_SERVER'],
        settings['MONGODB_PORT']
    )
    database = client[settings['MONGODB_DB']]
    self.collection = database[settings['MONGODB_COLLECTION']]
    self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
    self.si = SearchIndex()
    self.si.SearchInit()
class BLOOMDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None):
        self.file = None
        # URL-keyed bloom filter persisted in 'bloomTemp'
        self.fingerprints = BloomFilter(3000000, 0.00001, 'bloomTemp')

    @classmethod
    def from_settings(cls, settings):
        return cls(job_dir(settings))

    def request_seen(self, request):
        """True when request.url was met before; otherwise remember it."""
        fp = request.url
        if fp not in self.fingerprints:
            self.fingerprints.add(fp)
            return None
        return True

    def close(self, reason):
        # drop the reference so the mmap can be reclaimed
        self.fingerprints = None
def cpu_containing(particles, bloom_filters, yieldAll=True):
    '''Iterate over all bloom filter and yield the one containing the particle'''
    for cpu in tqdm(range(len(bloom_filters))):
        bf = BloomFilter.open(bloom_filters[cpu])
        # particles (probably) present in this cpu's filter; a non-empty
        # list is exactly the original yieldCPU flag
        cpu_contains = [p for p in particles if p in bf]
        if yieldAll or cpu_contains:
            yield cpu + 1, cpu_contains
def _get_bloom_filter(self) -> BloomFilter or None:
    """
    Retrieve the bloom filter from storage server
    :return: Bloom filter
    :raises RuntimeError: when the server reports failure

    Fixes: the response JSON was parsed three times; the exit-time cleanup
    used shutil.rmtree on a *file* path with ignore_errors=True, which
    silently did nothing and leaked the temp file -- os.remove is used now.
    """
    import contextlib
    import os

    resp = self.get(f"{self.STORAGESERVER}/bloom")
    payload = resp.json()  # parse once instead of three times
    if not payload['success']:
        raise RuntimeError(f"Failed to retrieve bloom filter: {payload['msg']}")
    log.debug("Successfully retrieved bloom filter.")
    tmp = helpers.get_temp_file() + '.bloom'
    b = BloomFilter.from_base64(tmp, payload['bloom'].encode())

    def _cleanup(path=tmp):
        # best-effort removal of the temp file at interpreter exit
        with contextlib.suppress(OSError):
            os.remove(path)

    atexit.register(_cleanup)
    return b
def __init__(self,*args,**kwargs): super(CollectorSpider,self).__init__(*args,**kwargs) #用一个list来存放所有的json配置中的k,v,变成了一个元祖list,遍历这个list #scrapy.log.start("./log.txt",loglevel=INFO,logstdout=True) self.log = open("/home/hong/文档/sina_working/2to3_test/log.txt",'a') print("\n\n__________________________分割线___________________________________", file=self.log) print("At Time %s : 爬虫启动............"%time.ctime(), file=self.log) self.now = time.time() self.one_month_ago = datetime.datetime(time.localtime(self.now).tm_year,time.localtime(self.now).tm_mon-1,time.localtime(self.now).tm_mday) self.config = [] self.Index_Url = "" self.flag = 0 #这里必须初始化bf,否则首次循环下面会报错 self.bf = "" self.isexists=os.path.exists("/home/hong/文档/sina_working/2to3_test/filter.bloom") if self.isexists: print("存在filter.bloom,打开!!!!",file=self.log) self.bf = BloomFilter.open("/home/hong/文档/sina_working/2to3_test/filter.bloom")
def bloom(self): """ Return bloom filter containing the record hashes (as base64 encoding). Needs to be a property to avoid concurrency problems with mutltiple threads. Initialize with database contents it no bloom filter exists. :return: Bloom Filter """ if self._bloom is None: # Initialize bloom_file = self.data_dir + config.BLOOM_FILE if os.path.isfile(bloom_file): self._bloom = BloomFilter.open(filename=bloom_file) log.info(f"Bloom Filter loaded from file {bloom_file}!") else: # new Bloom filter self._initialize_bloom_filter() return self._bloom
def threaded_crawl(tid, n, proxies, lock, output_dir="."):
    """Worker thread *tid*: crawl its slice of seed sites from
    top-1m.csv (rows tid*n .. tid*n+n), then follow discovered links
    breadth-first.  Shares the global page/failure counters under
    *lock* and rotates through *proxies* after repeated failures."""
    global count
    global failures
    fails = 0
    logger = logging.getLogger(__name__)
    fptr = open("top-1m.csv", "r")
    fail_thresh = 10 # Use a different proxy after 10 failed requests in a row
    proxy = dict()
    linum = fails = 0
    start = tid * n # First seed site to crawl
    end = tid * n + n # Last seed site to crawl
    # per-thread seen-URL bloom filter, persisted under /tmp
    seed = BloomFilter(n * 1000000, 0.1, '/tmp/{}.bloom'.format(tid).encode())
    frontier = deque()
    logger.info('[tid {}] Loading seed URLs {} - {}'.format(tid, start, end))
    for line in fptr:
        if linum >= start and linum < end:
            url = "http://" + line.split(',')[1].strip()
            seed.add(url.encode())
            frontier.append(url)
        linum += 1
    fptr.close()
    # NOTE(review): frontier.popleft() raises IndexError when the frontier
    # drains, which appears to be the only way this loop terminates --
    # confirm this is intended.
    while True:
        url = frontier.popleft()
        urls = []
        try:
            urls = parse_url(url, proxy, output_dir)
        except Exception as e:
            logger.error(
                "[tid {}] Fatal error occured while crawling: {}.".format(
                    tid, url))
        if len(urls) == 0:
            # count the failure; rotate proxy after fail_thresh in a row
            with lock:
                failures += 1
            fails += 1
            if fails > fail_thresh:
                proxy['http'] = proxies[randint(0, len(proxies) - 1)]
                logger.error("[tid {}] Failure: Activating proxy:{}".format(
                    tid, proxy['http']))
                fails = 0
        # enqueue unseen links
        for u in urls:
            link = u.encode()
            if link not in seed:
                seed.add(link)
                frontier.append(link)
        with lock:
            count += 1
            if (count % 1000 == 0):
                logger.info('Page count: {}'.format(count))
        if len(frontier) % 1000 == 0:
            logger.info("[tid {}] Frontier count: {}".format(
                tid, len(frontier)))
def __init__(
    self,
    directory: Path,
    filter_capacity: int,
    filter_error_rate: float,
    batch_count: int,
    batch_duration_sec: int,
):
    """Create a BatchedBloomFilter from a set of files, named `<unix_timestamp>.bloom`."""
    self.directory = directory
    self.filter_capacity = filter_capacity
    self.filter_error_rate = filter_error_rate
    self.batch_count = batch_count
    self.batch_duration_sec = batch_duration_sec
    # Map each well-named file to its timestamp, skipping anything else.
    ts_to_path = {}
    for path in self.directory.glob('*.bloom'):
        try:
            ts_to_path[int(path.stem)] = path
        except ValueError:
            log.info(
                'Ignoring invalid file name (expecting <unix_timestamp>.bloom): %s',
                path)
    # Keep only the newest batch_count filters.
    recent = sorted(ts_to_path)[-self.batch_count:]
    self.last_batch_ts = recent[-1] if recent else 0
    self.batches = [BloomFilter.open(str(ts_to_path[ts])) for ts in recent]
    log.info('Found existing bloom filters: %s',
             dict(zip(recent, self.batches)))
    self.rotate_if_needed()
def create(infile, outfile, capacity: int, error_rate: float = 0.05):
    """Build a bloom filter at *outfile* from one entry per line of *infile*."""
    import tqdm
    import urllib
    from pybloomfilter import BloomFilter
    bf = BloomFilter(capacity, error_rate, outfile)
    with open(infile) as f:
        for word in tqdm.tqdm(f, total=capacity):
            # percent-encoded entries are decoded and lower-cased first
            if "%" in word:
                word = urllib.parse.unquote(word).lower()
            bf.add(word.rstrip())
    bf.close()
class FilterPipeline(object):
    """Drops (url, song) pairs already recorded in a persistent bloom filter."""

    def __init__(self):
        self.bloomname = "filter"

    def open_spider(self, spider):
        # Reuse the on-disk filter when present, otherwise create it.
        bloom_file = self.bloomname + ".bloom"
        if os.path.exists(bloom_file):
            self.bf = BloomFilter.open(bloom_file)
        else:
            self.bf = BloomFilter(100000000, 0.001, bloom_file)

    def process_item(self, item, spider):
        # De-duplicate on url + song title: the same title seen at the
        # same url is considered a repeat.
        token = str(item['url']) + str(item['song_info'])
        # add() returns False for new elements, True for repeats.
        if self.bf.add(token):
            raise DropItem("find this link in bloomfilter!!!")
        return item
def fit(self, tokens, class_label):
    """Accumulate one training document: update per-class token counts,
    per-class vocabulary bloom filters, and class frequencies."""
    #if class_label not in self.class_to_toks_bf:
    #    self.class_to_toks_bf[class_label] = BloomFilter(capacity=self.initial_capacity, error_rate=self.error_rate)
    if class_label not in self.vocab_sizes:
        # one bloom filter per class approximates that class's vocabulary set
        self.vocab_sizes[class_label] = BloomFilter(
            capacity=self.initial_capacity, error_rate=self.error_rate)
    self.tokens_per_class[class_label] = self.tokens_per_class.get(
        class_label, 0) + len(tokens)
    tok_freqs = self.makeTokenFreqmap(tokens)
    # py2: iteritems(); conditional counts are keyed "<token>_<class>"
    for token, token_freq in tok_freqs.iteritems():
        #self.class_to_toks_bf[class_label].add(token)
        self.token_type_bf.add(token)
        #conditional_counts_bf[token+'_'+class_label] += token_freq
        self.class_conditional_counts[token + '_' + class_label] += token_freq
        self.vocab_sizes[class_label].add(token)
    self.class_freqs[class_label] = self.class_freqs.get(class_label, 0) + 1
    # total number of documents seen so far
    self.N += 1
def rotate_if_needed(self):
    """Remove stale filters, create a new filter if needed, named `<unix_timestamp>.bloom`.

    Fixes: with batch_count == 1 the old slices (`[:-0]` and `[-0:]`)
    evaluated to "keep everything, delete nothing", so stale filters were
    never unlinked; the retained/stale split is now computed explicitly.
    """
    ts = int(time.time())
    if ts - self.last_batch_ts > self.batch_duration_sec:
        # Keep the newest (batch_count - 1) filters; the fresh one created
        # below brings the total back to batch_count.
        retained = self.batch_count - 1
        if retained > 0:
            stale = self.batches[:-retained]
            self.batches = self.batches[-retained:]
        else:
            stale = self.batches
            self.batches = []
        for stale_bf in stale:
            file_name = Path(stale_bf.filename)
            stale_bf.close()
            file_name.unlink()
            log.info('Closed stale bloom filter: %s', file_name)
        bloom_filter_file = self.directory / f'{ts}.bloom'
        self.batches.append(
            BloomFilter(self.filter_capacity, self.filter_error_rate,
                        str(bloom_filter_file)))
        self.last_batch_ts = ts
        log.info('Created a new bloom filter: %s', bloom_filter_file)
        log.info('Operating with filters: %r',
                 [(bf.filename, bf) for bf in self.batches])
class DuplicateFilter(RFPDupeFilter): """ A dupe filter for url """ def __init__(self, path=FILTER_PATH, debug=False): if os.path.exists(FILTER_PATH): self.url_filter = BloomFilter.open(FILTER_PATH) else: print "created a new bloom filter. " self.url_filter = BloomFilter(100000, 0.00001, FILTER_PATH) super(DuplicateFilter, self).__init__(path, debug) def request_fingerprint(self, request): return request_fingerprint(request) def request_seen(self, request): if request.url.startswith("http://www.dianping.com/shop/"): fp = self.request_fingerprint(request) if self.url_filter.add(fp): print ">" * 5 + "filtered " + request.url + "<" * 5 return True def close(self, reason): self.url_filter = None
def getbloomFilter(bf, fem_kmers, kmer_size):
    """Return a bloom filter of all female k-mers.

    bf        -- when truthy, reuse the existing data/female.bloom file
    fem_kmers -- when truthy, load k-mers from data/female_kmers instead
                 of k-merizing data/female.fasta
    kmer_size -- k-mer length

    Fixes: the inner k-merizing loop sliced each record twice per k-mer;
    the slice is computed once and reused.  The k-mers file is now closed
    via `with` (it previously stayed open).
    """
    if bf:
        print("Opening Bloom Filter of k-mers from female")
        female_kmers_bf = BloomFilter.open("data/female.bloom")
        print("Done")
    else:
        print("Need to make Bloom Filter of k-mers from female")
        bf_size = 3 * 1000 * 1000 * 1000  # expected number of distinct k-mers
        bf_filename = "data/female.bloom"
        female_kmers_bf = BloomFilter(bf_size, .001, bf_filename)
        if fem_kmers:  # if female kmers file exist
            female_kmers_file = "data/female_kmers"
            with open(female_kmers_file, 'r') as fm_kmers:
                # assumes kmers are uppercase
                first_line = fm_kmers.readline()
                kmers.test_valid_kmer_format(first_line, kmer_size)
                fm_kmers.seek(0)
                for line in fm_kmers:
                    female_kmers_bf.add(line[:kmer_size])
        else:
            print(
                "Reading female reference one record at a time and k-merizing each record..."
            )
            female_reference_file = "data/female.fasta"
            n_kmers = "N" * kmer_size
            for record in SeqIO.parse(female_reference_file, "fasta"):
                to_kmerize_fwd = str(record.seq).upper()
                length = len(to_kmerize_fwd)
                for i in range(0, length - kmer_size + 1):
                    female_kmer = to_kmerize_fwd[i:i + kmer_size]
                    # skip windows that are all N; add the slice already taken
                    if female_kmer != n_kmers:
                        female_kmers_bf.add(female_kmer)
            print("Done creating bloom filter")
    return female_kmers_bf
def __init__(self, path):
    """Open the mmap-backed bloom filter at *path*; fail fast when the
    file is absent."""
    if os.path.exists(path):
        self.bloom = BloomFilter.open(path)
    else:
        raise RuntimeError(u"Missing Bloom: %s" % path)
#!/usr/bin/python # -*- coding:utf-8 -*- import requests from bs4 import BeautifulSoup from JobCrawler import JobCrawler from pybloomfilter import BloomFilter from time import time company_bf = BloomFilter(1024 * 1024 * 16, 0.01) total_page = 1 def get_company_info(url, page=1): if page > total_page: return wbdata = requests.get(url).content soup = BeautifulSoup(wbdata, 'lxml') # print soup.prettify() company_list = soup.select('div.el > span.t2') # print type(company_list), '\ncompany_list :', company_list for index, company in enumerate(company_list): if index != 0: company_result = company.find_all(name='a') company_link = company_result[0].attrs['href'] company_name = company_result[0].attrs['title'] print company_name, ' - ', company_link
def load_bf(self, filename, capacity, error_rate):
    """Build an in-memory bloom filter from the first tab-separated
    column of every line in *filename*."""
    bloom = BloomFilter(capacity=capacity, error_rate=error_rate)
    with open(filename) as source:
        for record in source:
            bloom.add(record.split('\t')[0].strip())
    return bloom
print("--- new folder... ---") print("--- OK ---") else: print("--- There is this folder! ---") if __name__ == '__main__': ProgramStarttime = datetime.datetime.now() try: #创建文件夹调用 file_ad1 = "/home/260199/爬虫/爬虫数据/政府公告/政府政策公告信息" + str(ProgramStarttime) + "/国家超链接/" mkdir(file_ad1) # 调用函数 all_href = [] href_bloom = BloomFilter.open('/home/260199/爬虫/爬虫代码/政策公告/government/country/all_href.bloom') #创建excel表并编辑表头 workbook = xlwt.Workbook() worksheet = workbook.add_sheet('国家级政府公告', cell_overwrite_ok=True) header = [u'标题', u'正文', u'发布部门',u'所在栏目', u'栏目类别',u'发布日期',u'爬取时间', u'政策链接', u'附件'] i = 0 # 写表头 for each_header in header: worksheet.write(0, i, each_header) i += 1 row = 1 print("当前时间为:", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) print("国家工信部 数据开始收集,请稍等...") row,href_list = gongxinbu.main(row,worksheet,href_bloom,file_ad1,ProgramStarttime) all_href.extend(href_list)
class urlFrontier:
    """Distributed-crawler URL frontier for one node.

    Owns the per-host queues (hqs), the global crawl-task priority queue,
    the seen-URL bloom filter, a DNS cache, and the overflow/cleanup queues
    that maintenance threads use to recycle host queues. Python 2 code
    (Queue module, has_key, iteritems).
    """

    def __init__(self, node_n, seen_persist, Q_logs=None):
        # node_n: this node's index; seen_persist: reuse the on-disk bloom
        # filter from a previous run; Q_logs: queue for log messages.
        self.node_n = node_n
        self.Q_logs = Q_logs
        self.total_crawled = 0
        self.payloads_dropped = 0

        # single variable for tracking whether node should be active or not
        self.active = True

        # crawl task Queue
        # Priority Queue ~ [ (next_pull_time, host_addr, url, parent_page_stats, seed_dist, parent_url) ]
        self.Q_crawl_tasks = Queue.PriorityQueue()

        # host queue dict
        # { host_addr: [(url, ref_page_stats, seed_dist, parent_url), ...] }
        self.hqs = {}

        # seen url check
        # Bloom Filter ~ [ url ]
        if seen_persist:
            try:
                self.seen = BloomFilter.open(BF_FILENAME)
            except:
                # NOTE(review): bare except — any failure falls back to a
                # fresh filter, which may silently re-crawl seen urls.
                self.Q_logs.put('Error opening bloom filter, creating new one')
                self.seen = BloomFilter(BF_CAPACITY, BF_ERROR_RATE, BF_FILENAME)
        else:
            self.seen = BloomFilter(BF_CAPACITY, BF_ERROR_RATE, BF_FILENAME)

        # DNS Cache
        # { netloc: (host_addr, time_last_checked) }
        self.DNScache = {}

        # overflow url Queue
        # Queue ~ [ (host_addr, url, ref_page_stats, seen_dist, parent_url) ]
        self.Q_overflow_urls = Queue.Queue()

        # host queue cleanup Queue
        # Priority Queue ~ [ (time_to_delete, host_addr) ]
        self.Q_hq_cleanup = Queue.PriorityQueue()

        # active url count queue- for counting/tracking active
        # Queue ~ [ True ]
        self.Q_active_count = Queue.Queue()

        # thread active url dict- a dict of active urls by thread using, for restart dump
        # { thread_name: active_url }
        # NOTE: note that there are problems with this methodology, but that errors will only lead
        # to data redundancy (as opposed to omission)...
        self.thread_active = {}

        # Queue of messages to be sent to other nodes
        # Queue ~ [ (node_num_to, url, seed_dist, parent_page_stats) ]
        self.Q_to_other_nodes = Queue.Queue()

    # primary routine for getting a crawl task from queue
    def get_crawl_task(self):
        # Blocking get; once the frontier is deactivated, the caller thread
        # is parked forever (until process shutdown).
        if self.active:
            return self.Q_crawl_tasks.get()
        # if url frontier shutdown, block indefinitely (until node shutdown)
        else:
            while True:
                time.sleep(10)

    # primary routine to log crawl task done & submit extracted urls
    # NOTE(review): url_pkgs=[] is a mutable default argument — harmless as
    # long as it is never mutated inside, but fragile; verify callers.
    def log_and_add_extracted(self, host_addr, host_seed_dist, success,
                              time_taken=0, url_pkgs=[]):
        # handle failure of page pull
        # NOTE: TO-DO!
        if not success:
            pass

        # add urls to either hq of host_addr or else overflow queue
        for url_pkg in url_pkgs:
            self._add_extracted_url(host_addr, host_seed_dist, url_pkg)

        # calculate time delay based on success
        # success -> politeness delay scaled by fetch time; failure -> a
        # randomized back-off around BASE_PULL_DELAY.
        now = datetime.datetime.now()
        r = random.random()
        td = 10 * time_taken + r * BASE_PULL_DELAY if success else (
            0.5 + r) * BASE_PULL_DELAY
        next_time = now + datetime.timedelta(0, td)

        # if the hq of host_addr is not empty, enter new task in crawl task queue
        if len(self.hqs[host_addr]) > 0:
            # add task to crawl task queue
            r = self.hqs[host_addr].pop()
            self.Q_crawl_tasks.put((next_time, host_addr) + r)
        # else if empty, add task to cleanup queue
        else:
            self.Q_hq_cleanup.put((next_time, host_addr))

        # report crawl task done to queue, HOWEVER do not submit as done till payload dropped
        self.Q_crawl_tasks.task_done()

    # subroutine to add a url extracted from a host_addr
    # Returns False on every rejection path; no return value on acceptance.
    def _add_extracted_url(self, ref_host_addr, ref_seed_dist, url_pkg,
                           from_other_node=False):
        url_in, ref_page_stats, parent_url = url_pkg

        # basic cleaning operations on url
        # NOTE: it is the responsibility of the crawlNode.py extract_links fn to server proper url
        url = re.sub(r'/$', '', url_in)

        # BLOCK certain urls based on manual block rgx
        if re.search(BLOCK_URL_RGX, url) is not None:
            return False

        # if url already seen do not proceed, else log as seen
        if url in self.seen:
            return False
        else:
            self.seen.add(url)

        # get host IP address of url
        url_parts = urlparse.urlsplit(url)
        host_addr = self._get_and_log_addr(url_parts.netloc)

        # if the page is not of a safe type log and do not proceed
        # NOTE: certain types e.g. pdf, doc will be passed and handled specially by crawl_page!
        if re.search(SAFE_PATH_RGX, url_parts.path) is None:
            if DEBUG_MODE:
                self.Q_logs.put("*UN-SAFE PAGE TYPE SKIPPED: %s" % (url, ))
            return False

        # if DNS was resolved error already reported, do not proceed any further
        if host_addr is None:
            return False

        # calculate url's seed distance
        # same host -> inherit parent's distance; new host -> one hop further.
        if not from_other_node:
            seed_dist = ref_seed_dist if host_addr == ref_host_addr else ref_seed_dist + 1
        else:
            seed_dist = ref_seed_dist

        # check for being past max seed distance
        if seed_dist > MAX_SEED_DIST and MAX_SEED_DIST > -1:
            return False

        # --> At this point, marker should be added to active count
        # This will be removed when url is either:
        # (A) sent to another node successfully
        # (B) dropped to payload database
        self.Q_active_count.put(True)
        if DEBUG_MODE:
            self.Q_logs.put("Active count: %s" % self.Q_active_count.qsize())

        # if the page belongs to another node, pass to message sending service
        if not from_other_node:
            if DISTR_ON_FULL_URL:
                url_node = hash(url) % NUMBER_OF_NODES
            else:
                url_node = hash(host_addr) % NUMBER_OF_NODES
            if url_node != self.node_n:
                self.Q_to_other_nodes.put(
                    (url_node, url, ref_page_stats, seed_dist, parent_url))
                return False

        # if this is an internal link, and not from other node, send directly to the serving hq
        if seed_dist == ref_seed_dist and not from_other_node:
            self.hqs[host_addr].append(
                (url, ref_page_stats, seed_dist, parent_url))
            # update total count
            self.total_crawled += 1
        # else send to overflow_urls to stay cautiously thread safe
        else:
            # add to overflow queue
            self.Q_overflow_urls.put(
                (host_addr, url, ref_page_stats, seed_dist, parent_url))
            # add to active count
            self.total_crawled += 1

    # subfunction for getting IP address either from DNS cache or web
    def _get_and_log_addr(self, hostname):
        # try looking up hostname in DNScache
        now = datetime.datetime.now()
        if self.DNScache.has_key(hostname):
            # check time for DNS refresh
            addr, created = self.DNScache[hostname]
            age = now - created
            if age.seconds > DNS_REFRESH_TIME:
                addr = self._get_addr(hostname)
                if addr is not None:
                    self.DNScache[hostname] = (addr, now)
                else:
                    # stale entry and re-resolution failed: evict it
                    del self.DNScache[hostname]
        else:
            addr = self._get_addr(hostname)
            if addr is not None:
                self.DNScache[hostname] = (addr, now)
        return addr

    # sub-subfunction for getting IP address from socket
    # Returns the first resolved address, or None (after logging) on failure.
    def _get_addr(self, hostname):
        try:
            addr_info = socket.getaddrinfo(hostname, None)
        except Exception as e:
            self.Q_logs.put('DNS ERROR: skipping ' + hostname)
            return None
        # ensure result is non-null
        if len(addr_info) > 0:
            return addr_info[0][4][0]
        else:
            self.Q_logs.put('DNS ERROR: skipping ' + hostname)
            return None

    # primary routine WITH INTERNAL LOOP for maintenance threads
    # routine is: get cleanup task --> delete old hq after wait --> fill from overflow
    # routine is looped so as not to get stuck in an impasse situation
    def clean_and_fill_loop(self):
        hqs_to_make = 0

        # primary loop- must loop so as not to get stuck in impasse situation
        while self.active:

            # get queue to delete & time to delete at; if no hqs to make then block
            get_block = (hqs_to_make == 0)
            try:
                time_to_delete, host_addr = self.Q_hq_cleanup.get(get_block)

                # wait till safe to delete, then delete
                wait_time = time_to_delete - datetime.datetime.now()
                time.sleep(max(0, wait_time.total_seconds()))
                del self.hqs[host_addr]
                hqs_to_make += 1

            # if there are still hqs to make, then don't block on getting more cleanup tasks
            except Queue.Empty:
                pass

            # try a bounded number of times to find a url in overflow that doesn't already have an hq
            for i in range(min(OVERFLOW_TRY_MAX, self.Q_overflow_urls.qsize())):

                # get an overflow url tuple
                r = list(self.Q_overflow_urls.get())
                host_addr = r[0]

                # if hq already exists for this host_addr then recycle and continue
                if self.hqs.has_key(host_addr):
                    self.Q_overflow_urls.task_done()
                    self.Q_overflow_urls.put(tuple(r))
                    continue

                # else create a new hq
                else:
                    self.hqs[host_addr] = []

                    # if OVERFLOW_MULTI enabled, try to fill the new hq with multiple consecutive
                    cn = 0
                    while cn < OVERFLOW_MULTI_TRY_L:
                        try:
                            s = list(self.Q_overflow_urls.get(False))

                        # don't block on attempt to fill additional urls from overflow here...
                        except Queue.Empty:
                            break

                        # check if the pulled url belongs in the hq, if not recycle
                        if s[0] == host_addr:
                            self.hqs[host_addr].append(tuple(s[1:]))
                        else:
                            self.Q_overflow_urls.put(tuple(s))
                        cn += 1
                        self.Q_overflow_urls.task_done()

                    # add the original url from overflow to crawl tasks
                    r.insert(0, datetime.datetime.now())
                    self.Q_crawl_tasks.put(tuple(r))
                    hqs_to_make -= 1
                    self.Q_overflow_urls.task_done()
                    self.Q_hq_cleanup.task_done()
                    break

    # primary routine for initialization of url frontier / hqs
    # NOTE: !!! Assumed that this is sole thread running when executed, prior to crawl start
    # NOTE(review): urls=[] is a mutable default argument AND is mutated via
    # urls.pop() — callers must always pass an explicit list.
    def initialize(self, urls=[]):
        now = datetime.datetime.now()

        # initialize all hqs as either full & tasked or empty & to be deleted
        i = 0
        while len(self.hqs) < HQ_TO_THREAD_RATIO * NUMBER_OF_CTHREADS:
            i += 1

            # expend all given urls
            if len(urls) > 0:
                self._init_add_url(urls.pop())

            # else add empty queues and mark to be cleared & replaced
            else:
                self.hqs[i] = []
                self.Q_hq_cleanup.put((now, i))

        # if there are urls left over, add to appropriate queues
        for url in urls:
            self._init_add_url(url)

    # subroutine for adding url to hq, assuming only one thread running (initialization)
    def _init_add_url(self, url_in):

        # basic cleaning operations on url
        url = re.sub(r'/$', '', url_in)

        # assume unseen and input to seen list, add to active count
        self.seen.add(url)

        # BLOCK certain urls based on manual block rgx
        if re.search(BLOCK_URL_RGX, url) is not None:
            return False

        # get host IP address of url
        url_parts = urlparse.urlsplit(url)
        host_addr = self._get_and_log_addr(url_parts.netloc)

        # if the page is not of a safe type log and do not proceed
        if re.search(SAFE_PATH_RGX, url_parts.path) is None:
            if DEBUG_MODE:
                self.Q_logs.put("*UN-SAFE PAGE TYPE SKIPPED: %s" % (url, ))
            return False

        # if DNS was resolved error already reported, do not proceed any further
        if host_addr is None:
            return False

        # if the page belongs to another node, pass to message sending service
        if DISTR_ON_FULL_URL:
            url_node = hash(url) % NUMBER_OF_NODES
        else:
            url_node = hash(host_addr) % NUMBER_OF_NODES
        if url_node != self.node_n:
            self.Q_to_other_nodes.put((url_node, url, None, 0, None))
            return False

        # add to an existing hq, or create new one & log new crawl task, or add to overflow
        self.Q_active_count.put(True)
        self.total_crawled += 1
        if DEBUG_MODE:
            self.Q_logs.put("Active count: %s" % self.Q_active_count.qsize())
        if self.hqs.has_key(host_addr):
            self.hqs[host_addr].append((url, None, 0, None))
        elif len(self.hqs) < HQ_TO_THREAD_RATIO * NUMBER_OF_CTHREADS:
            self.hqs[host_addr] = []
            self.Q_crawl_tasks.put(
                (datetime.datetime.now(), host_addr, url, None, 0, None))
        else:
            self.Q_overflow_urls.put((host_addr, url, None, 0, None))

    # routine called on abort (by user interrupt or by MAX_CRAWLED count being reached) to
    # save current contents of all queues to disk & seen filter flushed for restart
    def dump_for_restart(self):

        # ensure url frontier deactivated
        self.active = False

        # get all urls in Q_crawl_tasks, hqs, or Q_overflow_urls
        # only get urls as these will be re-injected through the initialize method of uf
        with open(RESTART_DUMP, 'w') as f:
            for thead_name, url in self.thread_active.iteritems():
                if url is not None:
                    f.write(url + '\n')
            while not self.Q_crawl_tasks.empty():
                try:
                    r = self.Q_crawl_tasks.get(True, 1)
                    f.write(r[2] + '\n')
                except:
                    continue
            for host_addr, paths in self.hqs.iteritems():
                for path in paths:
                    f.write(path[0] + '\n')
            while not self.Q_to_other_nodes.empty():
                try:
                    r = self.Q_to_other_nodes.get(True, 1)
                    f.write(r[1] + '\n')
                except:
                    continue
            while not self.Q_overflow_urls.empty():
                try:
                    r = self.Q_overflow_urls.get(True, 1)
                    f.write(r[1] + '\n')
                except:
                    continue

        # ensure seen filter file is synced
        self.seen.sync()
''' from core.data.bloomfilter.wrappers import GenericBloomFilter # This import can't fail, it is pure-python love ;) from core.data.bloomfilter.seekfile_bloom import FileSeekBloomFilter\ as FileSeekFilter try: # This might fail since it is a C library that only works in Linux from pybloomfilter import BloomFilter as CMmapFilter # There were reports of the C mmap filter not working properly in OSX, # just in case, I'm testing here... temp_file = GenericBloomFilter.get_temp_file() try: bf = CMmapFilter(1000, 0.01, temp_file) bf.add(1) assert 1 in bf assert 2 not in bf except: WrappedBloomFilter = FileSeekFilter else: WrappedBloomFilter = CMmapFilter except: WrappedBloomFilter = FileSeekFilter class BloomFilter(GenericBloomFilter): def __init__(self, capacity, error_rate): '''
def setup(database: dict,
          password: str,
          bloomfilter_file=None,
          bf_false_positive_rate=BLOOMFILTER_DEFAULT_FALSE_POSITIVE_RATE,
          paralleled=False,
          num_processes=None) -> tuple:
    """ Setup method of OXT for a database

    :param database: database with id -> list of words
    :param password: password to create keys
    :param bloomfilter_file: file to read/write bloomfilter
    :param bf_false_positive_rate: bloomfilter false positive rate
    :param bool paralleled: should we parallel the process or not
    :param num_processes: number of process used if parallel
    :return: (key, iv, g_serialized, edb1, bf, bits) — key tuple, AES IV,
        serialized group generator, encrypted database, bloom filter, and
        the mirrored bit array of the filter
    """
    global var_dict
    # TODO: generate keys from password
    # NOTE(review): *password* is currently unused — all keys are random.
    K_P = random_secure(1)  # key to XOR index
    K_S = random_secure(CMAC_AES128_KEY_LENGTH_IN_BYTES)  # Key for e
    iv = random_secure(
        CMAC_AES128_KEY_LENGTH_IN_BYTES)  # IV for AES encryption
    K_X = random_secure(CMAC_AES128_KEY_LENGTH_IN_BYTES)  # Key for xtag
    K_I = random_secure(CMAC_AES128_KEY_LENGTH_IN_BYTES)  # Key for index
    K_Z = random_secure(CMAC_AES128_KEY_LENGTH_IN_BYTES)  # Key for Z
    K_T = random_secure(CMAC_AES128_KEY_LENGTH_IN_BYTES)  # Key for keyword

    # Pairing group and pre-computed generator used by the xtag scheme.
    pairing = PairingGroup('SS512')
    g = pairing.random(GT)
    assert g.initPP(), "ERROR: Failed to init pre-computation table for g."

    total_pairs = 0
    inverted_index_all_pairs = defaultdict(
        list)  # word -> list of ids containing this word
    if paralleled:
        # parallel processing
        logger.info('Parallel gen_inverted_index')
        pool = multiprocessing.Pool()
        num_docs = len(database)
        inverted_tuples = pool.starmap(
            gen_inverted_index_paralleled,
            list(zip(database.items(), [K_P] * num_docs)))
        for inverted_list in inverted_tuples:
            for word, rind in inverted_list:
                inverted_index_all_pairs[word].append(rind)
                total_pairs += 1
    else:
        # sequential processing
        logger.info('Seq inverted_index_all_pairs')
        for (ind, words) in database.items():
            inverted_list = gen_inverted_index(ind, words, K_P)
            for word, rind in inverted_list:
                inverted_index_all_pairs[word].append(
                    rind)  # rind is now bytes
                total_pairs += 1

    # generate xtags. Each xtag is for a pair (word, index)
    xtags = set()
    if paralleled:
        logger.info('Parallel xtags')
        # parallel processing
        with multiprocessing.Pool(processes=num_processes,
                                  initializer=init_gen_xtags_parallel,
                                  initargs=(K_X, pairing, K_I, g)) as pool:
            xtags_lists = pool.map(gen_xtags_parallel,
                                   inverted_index_all_pairs.items())
            for xtags_list in xtags_lists:
                xtags.update(xtags_list)
        # reset the worker-globals dict used by the pool initializer
        var_dict = {}
    else:
        logger.info('Seq xtags')
        for word, indices in inverted_index_all_pairs.items():
            xtags.update(gen_xtags(word, indices, K_X, pairing, K_I, g))

    # Create a Bloom filter and bitarray
    if bloomfilter_file is not None:
        bf = BloomFilter(total_pairs, bf_false_positive_rate,
                         bloomfilter_file)
    else:
        bf = BloomFilter(total_pairs, bf_false_positive_rate)
    num_bits = bf.num_bits
    bits = bitarray(num_bits)
    bits.setall(False)

    # compute the positions of each xtag and set it
    # the reason we need to use bits array because the library doesn't expose bits. e.g. check if a bit is set or not
    xtag: str
    for xtag in xtags:
        bf.add(xtag)
        # mimic set in bits array
        for hash_seed in bf.hash_seeds:
            pos = bloomfilter_hash(xtag, hash_seed) % num_bits
            bits[pos] = True

    # generate encrypted database
    edb1 = dict()
    if paralleled:
        logger.info('Parallel edb1')
        # parallel processing
        with multiprocessing.Pool(processes=num_processes,
                                  initializer=init_gen_t_set_parallel,
                                  initargs=(K_S, K_I, K_Z, K_T, iv,
                                            pairing)) as pool:
            t_set_dict_lists = pool.map(gen_t_set_parallel,
                                        inverted_index_all_pairs.items())
            for t_set_dict in t_set_dict_lists:
                edb1.update(t_set_dict)
        var_dict = {}
    else:
        logger.info('Seq edb1')
        for word, indices in inverted_index_all_pairs.items():
            edb1.update(
                gen_t_set(word, indices, K_S, K_I, K_Z, K_T, iv, pairing))

    key = (K_P, K_S, K_X, K_I, K_Z, K_T)
    g_serialized = pairing.serialize(g)
    return key, iv, g_serialized, edb1, bf, bits
def __init__(self, capacity, error_rate):
    """Create the two independent bloom filters backing this structure,
    each with the same *capacity* and *error_rate*."""
    super().__init__()
    self.bloom_filter_1, self.bloom_filter_2 = (
        BloomFilter(capacity, error_rate) for _ in range(2))
1. 国内-省-目的地 可以获取该地区所有城市
2. 城市-景点 可以获取该城市所有景点
3. 城市-社区-游记 可以获取该城市所有游记
-- BloomFilter
"""
import os
import requests
import re
from pybloomfilter import BloomFilter

# Directory where downloaded travel-note pages are saved.
dir_name = 'notes/'
# URL-dedupe filter: 16M-element capacity, 1% false-positive rate.
bf = BloomFilter(1024 * 1024 * 16, 0.01)


def find_all_city_pages_url():
    # Fetch the destination index and collect relative city-page paths of
    # the form /travel-scenic-spot/mafengwo/NNNNN.html.
    req = requests.get('http://www.mafengwo.cn/mdd/')
    city_pages = re.findall('/travel-scenic-spot/mafengwo/\d{5}.html',
                            req.text)
    return city_pages


def get_city_number(url):
    # Slice out the 5-digit city id; positions 29:34 match the relative
    # paths produced by find_all_city_pages_url (not absolute urls).
    return url[29:34]


def save_html(file_name, html):
    # Persist page text to disk, UTF-8 encoded.
    with open(file_name, 'wb+') as f:
        f.write(html.encode())
def __init__(self, path=None):
    """Prepare fingerprint deduplication state.

    NOTE(review): *path* is accepted but not used — the filter always
    lives at 'bloomTemp'; confirm that is intentional.
    """
    # On-disk filter: 3M capacity, 0.001% false-positive rate.
    self.fingerprints = BloomFilter(3000000, 0.00001, 'bloomTemp')
    self.file = None
def __init__(self):
    """Set up crawl state: url bloom filter, visited-sites log, search index."""
    # Persistent url filter: 10M capacity, 1% false-positive rate.
    self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
    # Visited-sites log, truncated on every run.
    self.f_write = open('visitedsites', 'w')
    # Search-index backend, initialised immediately after construction.
    index = SearchIndex()
    index.SearchInit()
    self.si = index
def dsk_with_bf(file_name, n_iter, n_partition, kmer_size, bf_capacity,
                bf_error, top_count, verbose):
    """ Disk streaming of kmers with bloom filter.

    :param file_name: File to be processed.
    :param n_iter: Number of iterations to write kmers into disk.
    :param n_partition: Number of iterations to read files into memory.
    :param kmer_size: Length of the kmer.
    :param bf_capacity: Capacity of the bloom filter.
    :param bf_error: Probability of false positive in bloom filter.
    :param top_count: Number of kmers to be printed.
    :param verbose: Option to print elapsed time and memory usage.
    :return:
    """
    start_operation = time.time()

    # initialise a min heap (project Heap type) holding the running top-k
    h = Heap()
    h.populate(top_count)

    for iter_ in range(n_iter):
        start_iter = time.time()
        # initialise files where partitioned data is written.
        files = [open("{}".format(j), "w") for j in range(n_partition)]
        with open(file_name, "r") as file_from_read:
            count = 0
            for line in file_from_read:
                # take the second line to parse kmers.
                # (count % 4 == 1 -> sequence line of a FASTQ record)
                if count % 4 == 1:
                    # -1 drops the trailing newline from the window count
                    line_length = len(line) - 1
                    for i in range(line_length - kmer_size + 1):
                        kmer = line[i:kmer_size + i]
                        # assign kmers to partitions.
                        hash_result = mmh3.hash(kmer)
                        if hash_result % n_iter == iter_:
                            # assign kmers to files
                            j = int((hash_result / n_iter) % n_partition)
                            files[j].write(kmer + "\n")
                count += 1
        for f in files:
            f.close()
        end = time.time()
        if verbose:
            print("Disk write for iteration {0} done in {1} seconds".format(
                str(iter_), str(end - start_iter)))

        for j in range(n_partition):
            # initialise bloom filter (fresh per partition, file-backed)
            bf = BF(bf_capacity, bf_error, "bf_dsk")
            start_partition = time.time()
            kmer_freq = dict()
            with open(str(j), "r") as f:
                # NOTE: *kmer* here keeps its trailing '\n'; all lookups are
                # consistent with that, and the newline is stripped at print
                # time via kmer[:-1].
                for kmer in f:
                    if kmer in bf:
                        # Second-or-later occurrence: the first occurrence is
                        # implicit in the filter, so start the count at 1 and
                        # immediately bump to 2.
                        if kmer not in kmer_freq:
                            kmer_freq[kmer] = 1
                        kmer_freq[kmer] += 1
                    else:
                        bf.add(kmer)
            end = time.time()
            if verbose:
                print(
                    "Hash table for iteration {0}, partition {1} done in {2} seconds."
                    .format(str(iter_), str(j), str(end - start_partition)))
                print(
                    "Has table size for iteration {0} partition {1} is {2} Mb".
                    format(str(iter_), str(j),
                           str(int(sys.getsizeof(kmer_freq)) / 10**6)))
            start_heap = time.time()
            for kmer, freq in kmer_freq.items():
                if freq > h.min():
                    # h.pop()
                    # h.push((freq, kmer))
                    h.push_pop((freq, kmer))
            end = time.time()
            if verbose:
                print("Heap done in {0} seconds".format(end - start_heap))
            # clean file and bf
            os.remove(str(j))
            os.remove("bf_dsk")
        end_iter = time.time()
        if verbose:
            print("Iteration {0} done in {1} seconds.".format(
                str(iter_), str(end_iter - start_iter)))

    # emit the top-k kmers (newline stripped) with their frequencies
    for item in h.nlargest(top_count):
        freq, kmer = item
        print(kmer[:-1], freq)
    end = time.time()
    if verbose:
        print("Process done in {0} seconds.".format(str(end -
                                                        start_operation)))
#!/usr/bin/python # -*- coding:utf-8 -*- import requests from bs4 import BeautifulSoup import csv import sys import datetime from time import time from pybloomfilter import BloomFilter reload(sys) sys.setdefaultencoding('utf-8') download_bf = BloomFilter(1024*1024*16, 0.01) def request(url, isFirstPage): if url not in download_bf: download_bf.add(url) else: return res = requests.get(url).text soup = BeautifulSoup(res, 'html.parser') # print soup.prettify() keylist = soup.select('div.key-list > div.item-mod') for index, house in enumerate(keylist): # if index == 2: # print house
_, dim = T_des.shape # In[4]: LSH_random_vectors_set = [] #powers_of_two = 1 << np.arange(LSH_dim-1, -1, -1) # creating the multiple LSH random vectors for i in range(L_buckets): np.random.seed(i) LSH_random_vectors_set.append(np.random.randn(dim, LSH_dim)) # creating the multiple Bloom Filters BF_set = [] for i in range(L_buckets): BF_set.append(BloomFilter(2**(2 * LSH_dim), 0.01, None)) # In[5]: t0 = time.process_time() Q_kp, Q_des = detector.detectAndCompute(query_img, None) t1 = time.process_time() # We now add each LSH hash result to their dedicated Bloom Filter for i in range(L_buckets): Q_reflections = Q_des.dot(LSH_random_vectors_set[i]) >= 0 for q in np.array(Q_reflections, dtype=int): BF_set[i].add(q.tostring(None))
import requests import re import json from redis import Redis from rq import Queue from bs4 import BeautifulSoup from pybloomfilter import BloomFilter from utils import get_html,get_proxy,delete_proxy,get_content from urllib.parse import urlencode low = Queue('low',connection=Redis(host='localhost',port=6379)) bloom_f = BloomFilter(capacity=100000, error_rate=0.01) def spider_movie_comment(movie_id): # Get Pages url = "https://movie.douban.com/subject/"+movie_id+"/reviews?start=" head = get_html(url+str(0)) html = BeautifulSoup(head.content,"lxml") temp_html = html.select("#content > h1") print(temp_html) # f = open("index.html","w") # f.write(html.prettify()) # f.close() text = temp_html[0].text page = int(re.sub(r"\D*","", text)) data = [] for page_num in range(page//20+1):
from pybloomfilter import BloomFilter
import sys, signal
from time import time, sleep
import os
from worker_filter import Filter

st = time()

# Reuse the persisted "done" filter when present; otherwise create a new
# file-backed one (2^27 capacity, 1e-5 false-positive rate).
done_sites_fname = 'done_sites.bin'
if os.path.isfile(done_sites_fname):
    bfdone = BloomFilter.open(done_sites_fname)
else:
    print "no file"
    bfdone = BloomFilter(2**27, 10**(-5), done_sites_fname)  #8M

start = 0
filter = Filter()  # NOTE(review): shadows the builtin name `filter`

# Load the finished-url list and fold every url into the filter.
f = open('done_urls20160601.txt').read().strip().split('\n')
for url in f:
    bfdone.add(url)
print len(f)

# Sanity check: every url just added should test positive.
cnt = 0
for url in f:
    if url in bfdone:
        cnt += 1
print cnt
inc = 0
print time() - st
class PcautoAskSpider(scrapy.Spider):
    """Scrapy spider for pcauto.com.cn Q&A pages.

    Walks category listing pages, deduplicates question urls with a
    persistent bloom filter, and parses each question card into a
    PcautoItem.
    """
    name = "pcauto_ask"
    allowed_domains = ["pcauto.com.cn"]
    #start_urls = ['http://k.pcauto.com.cn/question/4035240.html']
    start_urls = ['http://k.pcauto.com.cn/question/k16/p1.html']

    def __init__(self):
        # Reuse the on-disk filter when possible; any failure to open falls
        # back to a fresh one (50M capacity, 5% false-positive rate).
        bloomfilterfilename = 'pcauto.filter'
        try:
            self.bf = BloomFilter.open(bloomfilterfilename)
        except:
            logging.info("new filter.bloom")
            self.bf = BloomFilter(50000000, 0.05, bloomfilterfilename)

    def start_requests(self):
        # Seed one request per category (k3 intentionally absent).
        urls = [
            "http://k.pcauto.com.cn/question/k%d/p1.html" % i
            for i in (1, 2, 4, 5, 6)
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse_category)

    def parse_category(self, response):
        # Each <li> (except the title row) is one question entry.
        for element_li in response.xpath(
                '//ul[@id="wtList"]/li[@class!="liTit"]'):
            url = element_li.xpath(
                'i[@class="iTitle"]/a/@href').extract_first()
            num = element_li.xpath('i[@class="iNum"]/text()').extract_first()
            # status flag: records whether the question has a best answer
            phase = True if element_li.xpath(
                'i[@class="iPhase"]/span[@class="icon_jj"]') else False
            #print("%s [%s]" % (url, num))
            if phase:
                # answered questions: dedupe on (url, phase)
                if (url, phase) not in self.bf:
                    yield scrapy.Request(url, callback=self.parse_askcard)
                    self.bf.add((url, phase))
            else:
                # open questions: include the answer count in the dedupe key
                # so the card is re-fetched when new answers appear
                if not num == '0' and (url, phase, num) not in self.bf:
                    yield scrapy.Request(url, callback=self.parse_askcard)
                    self.bf.add((url, phase, num))
        # follow pagination
        next_url = response.xpath(
            '//div[@class="pcauto_page"]/a[@class="next"]/@href'
        ).extract_first()
        if next_url:
            yield scrapy.Request(response.urljoin(next_url),
                                 callback=self.parse_category)

    def parse_askcard(self, response):
        # Parse one question page into a PcautoItem.
        item = PcautoItem()
        item['url'] = response.url
        question_title = response.xpath(
            '//div[@id="question_content"]/div[@class="modInner"]/div[1]//text()'
        ).extract()
        item['question_title'] = parseContentList2Str(question_title)
        item['question'] = response.xpath(
            '//div[@class="modInner"]/p/text()').extract_first()
        ask_time = response.xpath(
            '//div[@class="dInfo gray"]/span[@class="sTime"]/text()'
        ).extract_first()
        item['ask_time'] = str2Timestamp(ask_time)
        user_name = response.xpath(
            '//div[@class="dInfo gray"]/span[@class="sName"]/a/text()'
        ).extract_first()
        user_url = response.xpath(
            '//div[@class="dInfo gray"]/span[@class="sName"]/a/@href'
        ).extract_first()
        item['ask_user'] = {'name': user_name, 'url': user_url}
        element_best_answer = response.xpath(
            '//div[@class="modAnswer modBest mt10"]//div[@class="tb"]')
        item['best_answer'] = self.parse_answer(
            element_best_answer[0]) if element_best_answer else None
        answer_list = list()
        for element in response.xpath(
                '//div[@class="modAnswer mt10 modOut"]/div[@class="modInner"]/div[@class!="th"]'
        ):
            answer_list.append(self.parse_answer(element))
        item['answer_list'] = answer_list
        item['answer_count'] = len(answer_list)
        yield item

    def parse_answer(self, element):
        # Extract one answer block (id, author, timestamp, text) as a dict.
        answer = dict()
        answer['id'] = element.xpath('div[2]/@id').extract_first()
        user_icon = element.xpath('.//img/@src').extract_first()
        element_user = element.xpath('.//i[@class="blue"]') or element.xpath(
            './/div[@class="dTitle"]')
        user_name = element_user[0].xpath('a/text()').extract_first()
        user_url = element_user[0].xpath('a/@href').extract_first()
        answer['user'] = {
            'name': user_name,
            'url': user_url,
            'icon': user_icon
        }
        answer_time = ''.join(
            element.xpath('.//div[@class="gray"]/text()').extract())
        answer['answer_time'] = str2Timestamp(answer_time) or element.xpath(
            './/span[@class="sTime"]/text()').extract_first()
        answer['answer'] = element.xpath(
            './/div[@class="answerCon"]/p/text()').extract_first()
        return answer
def __init__(self):
    """Attach to the persisted bloom filter and prepare output + index.

    Raises if 'filter.bloom' does not exist — this class never creates it.
    """
    self.bf = BloomFilter.open('filter.bloom')
    # Output file for scraped records, truncated each run.
    self.f_write = open('jingdong.txt', 'w')
    # Search-index backend, initialised immediately.
    search_index = SearchIndex()
    search_index.SearchInit()
    self.si = search_index
#!/usr/bin/env python # coding:utf-8 # manning 2015-1-27 import time import os import urlparse import hashlib import sys #sys.path.append("..") #from config.config import * #reload(sys) #sys.setdefaultencoding("utf-8") from pybloomfilter import BloomFilter bf = BloomFilter(100000, 0.01) def format(url): ''' 策略是构建一个三元组 第一项为url的netloc 第二项为path中每项的拆分长度 第三项为query的每个参数名称(参数按照字母顺序排序,避免由于顺序不同而导致的重复问题) ''' if urlparse.urlparse(url)[2] == '': url = url + '/' url_structure = urlparse.urlparse(url) netloc = url_structure[1] path = url_structure[2]