def __init__(self, domain, first_url=None, first_url_callback=None,
             first_url_follow=True, url_amount=100000,
             requests_session=None, tls=False, max_depth=100):
    self.domain = domain
    self.max_depth = max_depth
    self.bf = BloomFilter(capacity=url_amount, error_rate=1 / url_amount)
    self.url_queue = queue.Queue()
    if first_url is None:
        if tls:
            first_url = 'https://' + domain
        else:
            first_url = 'http://' + domain
    self.url_queue.put((first_url, first_url_callback, first_url_follow, 0))
    if requests_session is None:
        self.session = requests.Session()
    else:
        self.session = requests_session
class UrlSpider(CrawlSpider):
    name = "urlspider"
    allowed_domains = ["tianya.cn"]
    start_urls = ("http://www.hao123.com", )
    rules = (
        Rule(SgmlLinkExtractor(allow=()), callback="parse_resp", follow=True),
    )

    def __init__(self, *args, **kwargs):
        # run using: scrapy crawl xss_spider -a url='http://example.com'
        super(UrlSpider, self).__init__(*args, **kwargs)
        self.start_urls = [kwargs.get('url')]
        hostname = urlparse(self.start_urls[0]).hostname
        # wrapping the hostname in a list lets the spider crawl its subdomains
        self.allowed_domains = [hostname]
        self.fingerprints = BloomFilter(3000000, 0.0001)

    def parse_start_url(self, response):
        print "start:" + response.url
        return

    def parse_resp(self, response):
        fp = response.url
        new_fp = obtain_key(fp)
        if new_fp in self.fingerprints:
            return
        self.fingerprints.add(new_fp)
        item = SiteCrawlItem()
        item["url"] = response.url
        yield item
def __init__(self, server):
    # redis server
    self.server = server
    # used to check whether a domain has already been seen
    allowed = [
        "qq.com", "163.com", "people.com.cn", "xinhuanet.com", "cntv.cn",
        "ifeng.com", "hexun.com", "sina.com.cn", "sohu.com", "dbw.cn",
    ]
    self.bloom_domain_filter = BloomFilter(capacity=32)
    for a in allowed:
        self.bloom_domain_filter.add(a)
    # regex filters for a few blog hosts
    self.qzone_filter = re.compile(r"^http://.*\.qzone\.qq\.com")
    self.wangyiblog_filter = re.compile(r"^http://.*\.blog\.163\.com")
    self.hexunblog_filter = re.compile(r"^http://.*\.blog\.hexun\.com")
    self.sohublog_filter = re.compile(r"http://.*\.blog\.sohu\.com")
    self.sohui_filter = re.compile(r"http://.*\.i\.sohu\.com")
    self.bloom_domain_vec = BloomFilter(capacity=1 << 16, error_rate=0.001)
    self.bloom_netloc_vec = BloomFilter(capacity=1 << 16, error_rate=0.001)
def user_init():
    import re
    users = BloomFilter(10000000, 0.001)
    f = open(u"D:/工作/数据美化/data/简书用户id1.txt")
    for line in f:
        users.add(line.strip())
    return users
class UrlManager(object):
    def __init__(self):
        self.urls = []
        self.url_bloom_filter = BloomFilter(capacity=500000, error_rate=0.001)

    def add_url(self, url):
        # if url not in self.url_bloom_filter:
        self.urls.append(url)
        # self.url_bloom_filter.add(url)

    def add_urls(self, urls):
        for url in urls:
            self.add_url(url)

    def is_empty(self):
        return len(self.urls) == 0

    def get_url(self):
        return self.urls.pop(0)

    def get_len(self):
        return len(self.urls)

    def is_viewed(self, url):
        return url in self.url_bloom_filter

    def add_viewed(self, url):
        self.url_bloom_filter.add(url)
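A minimal usage sketch for the UrlManager above; the crawl loop and the fetch_links helper are illustrative only and not part of the original snippet.

manager = UrlManager()
manager.add_url('http://example.com')
while not manager.is_empty():
    url = manager.get_url()
    if manager.is_viewed(url):
        continue
    manager.add_viewed(url)
    # links = fetch_links(url)   # hypothetical fetch step
    # manager.add_urls(links)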
def main(argv):
    if argv:
        error_rate = float(argv[0])
        print "[BUILDING] Using error-rate: {}".format(error_rate)
    if os.path.isfile(nsrl_path):
        print "[BUILDING] Reading in NSRL Database"
        with open(nsrl_path) as f_line:
            # Strip off header
            _ = f_line.readline()
            print "[BUILDING] Calculating number of hashes in NSRL..."
            num_lines = sum(bl.count("\n") for bl in blocks(f_line))
            print "[BUILDING] There are %s hashes in the NSRL Database" % num_lines
        with open(nsrl_path) as f_nsrl:
            # Strip off header
            _ = f_nsrl.readline()
            print "[BUILDING] Creating bloomfilter"
            bf = BloomFilter(num_lines, error_rate)
            print "[BUILDING] Inserting hashes into bloomfilter"
            for line in f_nsrl:
                sha1_hash = line.split(",")[0].strip('"')
                if sha1_hash:
                    try:
                        sha1 = binascii.unhexlify(sha1_hash)
                        bf.add(sha1)
                    except Exception as e:
                        print "[ERROR] %s" % e
            print "[BUILDING] NSRL bloomfilter contains {} items.".format(len(bf))
            with open('nsrl.bloom', 'wb') as nb:
                bf.tofile(nb)
            print "[BUILDING] Complete"
    else:
        print "[ERROR] No such file or directory: %s" % nsrl_path
    return
def spider_update_Init(dbname, website, carnum):
    # Mongo setting
    # spider_original_Init(dbname, website, carnum)
    # Mongo connection
    connection = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
    dbdata = connection[dbname]
    collectiondata = dbdata[website]
    # pybloom
    num = (int(carnum) + collectiondata.count()) * 1.1
    df = BloomFilter(capacity=num, error_rate=0.01)
    # urllist
    urllist = []
    for i in collectiondata.find():
        if "url" in i.keys():
            item = i["url"]
            if "status" in i.keys():
                if not (i['status'].find('sold') == -1):
                    continue
            itemmd5 = md5(item).hexdigest()
            returndf = df.add(itemmd5)
            if not returndf:
                urllist.append(item)
    connection.close()
    return urllist
def __init__(self, url, charset=None, headers=None, response_handle=None,
             timeout=3, retry_times=30, load_wait=None, execute_js=None,
             execute_js_wait=None, retry_delta=3, http_proxy_url=None,
             force=False):
    '''
    url                target url
    charset            encoding
    data               POST data, as a string
    headers            custom request headers, dict
    response_handle    callback that processes the fetched result
    timeout            timeout in seconds, int, e.g. 3
    retry_times        number of retries, int, e.g. 3
    load_wait          seconds to wait after the page has loaded
    execute_js         js to execute after the page has loaded
    execute_js_wait    seconds to wait after executing the js
    retry_delta        seconds between retries on error, int
    http_proxy_url     proxy, e.g. "http://192.168.1.1:80"
    force              crawl the url even if it has been crawled before
    '''
    if not PhantomjsSpider._url_buff:
        PhantomjsSpider._url_buff = [BloomFilter(1000000)]
    global _queue
    _hash = md5(url)
    self.url = url
    self.timeout = timeout
    self.retry_times = retry_times
    self.retry_delta = retry_delta
    self.response_handle = response_handle
    self.charset = charset
    self.headers = headers
    self.execute_js = execute_js
    self.execute_js_wait = execute_js_wait
    self.load_wait = load_wait
    self.proxy = http_proxy_url
    if not force:
        try:
            for bloomfilter in PhantomjsSpider._url_buff:
                assert _hash not in bloomfilter
        except:
            pass
        else:
            try:
                PhantomjsSpider._url_buff[-1].add(_hash)
            except:
                PhantomjsSpider._url_buff.append(
                    BloomFilter(PhantomjsSpider._url_buff[-1].capacity + 1000000))
                PhantomjsSpider._url_buff[-1].add(_hash)
            _queue.put(self._go)
    else:
        _queue.put(self._go)
def generateBloomFilter(file):
    "Generates the bloom filter for entries in file."
    # this probably isnt enough, need to look the data formatting over more
    # thoroughly
    d = BloomFilter(1000, 0.001)
    for line in file:
        d.add(line.split(None, 1)[0])
    return d
class BloomCheckPipeline(object):
    def __init__(self):
        file_name = 'bloomfilter'

    def open_spider(self, spider):
        file_name = 'bloomfilter'
        is_exist = os.path.exists(file_name + '.blm')
        if is_exist:
            self.bf = BloomFilter.fromfile(open('bloomfilter.blm', 'rb'))
            print('open blm file success')
        else:
            self.bf = BloomFilter(100000, 0.001)
            print('didn\'t find the blm file')

    def process_item(self, item, spider):
        # drop items whose url has already been seen; adjust the key to your needs
        if item['urlToken'] in self.bf or item['id'] in self.bf:
            print('drop one item for exist')
            raise DropItem('drop an item for exist')
        else:
            self.bf.add(item['urlToken'])
            print('add one success')
            return item

    def close_spider(self, spider):
        self.bf.tofile(open('bloomfilter.blm', 'wb'))
def __init__(self):
    try:
        with open(FILTER_FILE) as f:
            self.f = BloomFilter.fromfile(f)
    except IOError:
        self.f = BloomFilter(capacity=10000000, error_rate=0.001)
    self.num = 0
def __init__(self, lines, estimated_lines, dup_proportion, truncate):
    super().__init__()
    estimated_dups = estimated_lines * dup_proportion
    self.truncate = truncate
    self.potential = BloomFilter(capacity=estimated_dups, error_rate=0.001)
    self.seen = set()
    self._find_collisions(lines, estimated_lines)
def add(self, key):
    """Adds a key to this bloom filter.
    If the key already exists in this filter it will return True.
    Otherwise False.

    >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \
                                mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    >>> b.add("hello")
    False
    >>> b.add("hello")
    True

    """
    if key in self:
        return True
    if not self.filters:
        filter = BloomFilter(
            capacity=self.initial_capacity,
            error_rate=self.error_rate * (1.0 - self.ratio))
        self.filters.append(filter)
    else:
        filter = self.filters[-1]
        if filter.count >= filter.capacity:
            filter = BloomFilter(
                capacity=filter.capacity * self.scale,
                error_rate=filter.error_rate * self.ratio)
            self.filters.append(filter)
    filter.add(key, skip_check=True)
    return False
def determine_lookup_speed_threshold(self):
    from time import time
    # do each one 5 times
    bf = BloomFilter(capacity=self.bloom_size, error_rate=self.bloom_error)
    count = 1
    repetitions = 5

    self_bf_holder = self.bf
    self.bf = bf
    while True:
        bf.add('andrew_' + str(count))
        bin_faster_count = 0
        for j in xrange(repetitions):
            # Linear scan
            t1 = time()
            self.linear_scan_count('andrew')
            t2 = time()
            linear_time = t2 - t1

            # Binary search
            t1 = time()
            self.binsearch_count('andrew')
            t2 = time()
            bin_time = t2 - t1

            bin_faster_count += int(bin_time < linear_time)

        if 1. * bin_faster_count / repetitions >= 0.75:
            del bf
            self.bf = self_bf_holder
            return count

        count += 1
def start():
    res = request_get(biqukan_url)
    index = BeautifulSoup(res, features=features)
    if os.path.exists(bf_file):
        LOG.info('bf from file')
        bf = BloomFilter.fromfile(open(bf_file, 'rb'))
    else:
        LOG.info('init bf')
        bf = BloomFilter(500000)
    try:
        pool = Pool(size=pool_size)
        book_urls = find_wanben()
        book_urls += find_new_storage_block(index)
        book_urls += find_recommend_block(index, u'强力推荐')
        book_urls += find_type_block(index, u'玄幻小说')
        book_urls += find_type_block(index, u'修真小说')
        book_urls += find_type_block(index, u'都市小说')
        book_urls += find_type_block(index, u'穿越小说')
        book_urls += find_type_block(index, u'网游小说')
        book_urls += find_type_block(index, u'科幻小说')
        book_urls += find_new_update_block(index)
        book_num = len(book_urls)
        for i, url in enumerate(book_urls):
            pool.spawn(download_book, url, bf)
            # download_book(url, bf)
            LOG.info(u'started downloading book %s, %s remaining', i + 1, book_num - i - 1)
        pool.join()
        LOG.info(u'download finished')
    except Exception as e:
        LOG.exception(e)
    finally:
        bf.tofile(open(bf_file, 'wb'))
def __init__(self):
    # mail
    # self.mailer = MailSender.from_settings(settings)
    # mongo
    self.connection = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
    db = self.connection[settings['MONGODB_DB']]
    self.collection = db[settings['MONGODB_COLLECTION']]
    self.collectionurllog = db[settings['MONGODB_COLLECTION'] + "_urllog"]
    # bloom file
    filename = settings['MONGODB_COLLECTION'] + ".blm"
    # pybloom
    num = (int(settings['CrawlCar_Num']) + self.collection.count()) * 1.1
    self.df = BloomFilter(capacity=num, error_rate=0.01)
    # read
    isexists = os.path.exists(filename)
    self.fa = open(filename, "a")
    if isexists:
        fr = open(filename, "r")
        lines = fr.readlines()
        for line in lines:
            line = line.strip('\n')
            self.df.add(line)
        fr.close()
    else:
        for i in self.collection.find():
            if "status" in i.keys():
                item = i["status"]
                item = md5(item).hexdigest()
                self.df.add(item)
                self.fa.writelines(item + '\n')
    # count
    self.counts = 0
def test_insert_then_test(self):
    result = create_index(
        '/tmp/fake.csv',     # input filename
        self.test_file,      # file-like object
        0.0001,              # error rate
        1,                   # skip lines
        [1, 2],              # fields
        ',',                 # delimiter
        False)               # recursive domain
    self.assertEqual(
        {
            '/tmp/fake.csv.2.bfindex': 6,
            '/tmp/fake.csv.1.bfindex': 5
        },
        result)
    b1 = BloomFilter.fromfile(open('/tmp/fake.csv.1.bfindex', 'rb'))
    b2 = BloomFilter.fromfile(open('/tmp/fake.csv.2.bfindex', 'rb'))
    self.assertEqual(False, 'FieldA' in b1)
    self.assertEqual(False, 'FieldB' in b2)
    for word in ('apple', 'banana', 'orange', 'pear', 'pineapple'):
        self.assertEqual(True, word in b1)
        self.assertEqual(False, word in b2)
    for word in ('carrot', 'potato', 'leek', 'cauliflower', 'bean'):
        self.assertEqual(True, word in b2)
        self.assertEqual(False, word in b1)
def open_spider(self, spider):
    brandName = 'mybloom'
    isexists = os.path.exists(brandName + '.blm')
    if isexists:
        self.bf = BloomFilter.fromfile(open(brandName + '.blm', 'rb'))
    else:
        self.bf = BloomFilter(100000, 0.001)
class Filter(object):
    def __init__(self, cachefile, capacity=1000000, error_rate=0.001):
        self.cachefile = cachefile
        if os.name == 'nt' or not cachefile:
            from pybloom import BloomFilter
            if self.cache():
                with open(cachefile, 'rb') as fp:
                    self.filter = BloomFilter.fromfile(fp)
            else:
                self.filter = BloomFilter(capacity=capacity, error_rate=error_rate)
        elif os.name == 'posix':
            from pybloomfilter import BloomFilter
            if self.cache():
                self.filter = BloomFilter.open(self.cachefile)
            else:
                self.filter = BloomFilter(capacity, error_rate, cachefile)

    def __contains__(self, key):
        return key in self.filter

    def add(self, obj):
        self.filter.add(obj)
        if os.name == 'nt':
            with open(self.cachefile, 'wb') as fp:
                self.filter.tofile(fp)

    def cache(self):
        return os.path.exists(self.cachefile or '')
def __init__(self, capacity=(1 << 30), error_rate=0.0001):
    self.bloomFilters = []
    self.leaderIndex = 0
    self.followerIndex = 1
    self.bloomFilters.append(BloomFilter(capacity, error_rate))
    self.bloomFilters.append(BloomFilter(capacity, error_rate))
    self.status = 0
def create_empty_bloomfilter(self):
    """Create an empty bloom filter with byte alignment."""
    bf = BloomFilter(capacity=self.cache.quota, error_rate=self.error_rate)
    bs = bf.bitarray.tostring()
    bf.bitarray = bitarray()
    bf.bitarray.fromstring(bs)
    return bf
class BLOOMDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None):
        self.file = None
        # capacity
        #     this BloomFilter must be able to store at least *capacity*
        #     elements while maintaining no more than *error_rate* chance of
        #     false positives
        # error_rate
        #     the error_rate of the filter returning false positives. This
        #     determines the filter's capacity. Inserting more than capacity
        #     elements greatly increases the chance of false positives.
        self.fingerprints = BloomFilter(capacity=2000000, error_rate=0.00001)
        # get all the urls from database
        db = DynamoDBPipeline()
        urls = db.get_url_list()
        [self.fingerprints.add(url) for url in urls]

    @classmethod
    def from_settings(cls, settings):
        return cls(job_dir(settings))

    def request_seen(self, request):
        fp = request.url
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)

    def close(self, reason):
        self.fingerprints = None
def __init__(self, password_file=pw_file, fp_rate=0.001, ignore_case=True):
    self._log = logging.getLogger('passcheck.passcheck.PassCheck')
    self._fp_rate = fp_rate
    self._pw_file = os.path.realpath(password_file)
    self._ignore_case = ignore_case
    self._log.debug('Counting items in password file')
    with open(self._pw_file, 'r') as f:
        for line_num, line in enumerate(f):
            pass
    self._num_passwords = line_num + 1
    self._log.debug('Creating BloomFilter with capacity=%d' % self._num_passwords)
    self._bf = BloomFilter(capacity=self._num_passwords, error_rate=self._fp_rate)
    self._log.debug('Loading passwords into BloomFilter')
    num_added = 0
    with open(self._pw_file, 'r') as f:
        for line in f:
            pw = line[:-1]
            if self._ignore_case:
                pw = pw.lower()
            if not self._bf.add(pw):
                num_added += 1
            if num_added > self._num_passwords:
                e = Exception('Password file was modified during load')
                self._log.error(e)
                raise e
    # Handle possibility of duplicates (especially if case is ignored)
    if num_added < self._num_passwords:
        self._log.warn('Expected %d passwords, but added %d' %
                       (self._num_passwords, num_added))
        self._num_passwords = num_added
class UrlBloom:
    '''BloomFilter: check elements repetition'''

    def __init__(self, _capacity=1000000, _error_rate=0.00001):
        self.is_full = False
        # CONFIG['BACKUP'] decides whether the bloom data is backed up periodically
        if CONFIG.get('BACKUP', 0) == 1:
            self.bomb = TimeBomb(CONFIG['TMP_DIR'] + CONFIG['BLOOM_FILE'])
            self.filter = self.bomb.load()
            if self.filter is None:
                self.filter = BloomFilter(capacity=_capacity, error_rate=_error_rate)
                self.bomb.dump(self.filter)
        else:
            self.filter = BloomFilter(capacity=_capacity, error_rate=_error_rate)

    def add(self, links):
        if self.is_full:
            return
        try:
            for ele in links:
                self.filter.add(ele)
        except IndexError:
            # pybloom raises IndexError when the filter is at capacity
            self.is_full = True

    def clean(self, links):
        res = []
        for ele in links:
            if ele not in self.filter:
                res.append(ele)
        return res
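A small usage sketch for UrlBloom; it assumes CONFIG['BACKUP'] is 0 (no TimeBomb backup) and the URLs are placeholders.

bloom = UrlBloom(_capacity=100000, _error_rate=0.0001)
bloom.add(['http://example.com/a', 'http://example.com/b'])
# clean() keeps only links the filter has not seen yet
fresh = bloom.clean(['http://example.com/a', 'http://example.com/c'])
# fresh == ['http://example.com/c'], barring a bloom-filter false positive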
def main():
    if os.path.isfile(nsrl_path):
        print "BUILDING: Reading in NSRL Database"
        with open(nsrl_path) as f_line:
            # Strip off header
            _ = f_line.readline()
            print "BUILDING: Calculating number of hashes in NSRL..."
            num_lines = sum(bl.count("\n") for bl in blocks(f_line))
            print "BUILDING: There are %s hashes in the NSRL Database" % num_lines
        with open(nsrl_path) as f_nsrl:
            # Strip off header
            _ = f_nsrl.readline()
            print "BUILDING: Creating bloomfilter"
            bf = BloomFilter(num_lines, error_rate)
            print "BUILDING: Inserting hashes into bloomfilter"
            for line in f_nsrl:
                md5_hash = line.split(",")[1].strip('"')
                if md5_hash:
                    try:
                        bf.add(md5_hash)
                    except Exception as e:
                        print "ERROR: %s" % e
            print "BUILDING: NSRL bloomfilter contains {} items.".format(len(bf))
            with open('nsrl.bloom', 'wb') as nb:
                bf.tofile(nb)
            print "BUILDING: Complete"
    else:
        print "ERROR: No such file or directory: %s" % nsrl_path
    return
def build_bloom_filter_and_iblt(m, include_value_in_iblt=False):
    c = 8 * math.pow(math.log(2), 2)
    tau = 16.5
    n = len(selected_txs)
    alpha = n / (c * tau)
    # print(alpha * tau)
    if m <= n:
        fpr = 0.1
    else:
        fpr = alpha / (m - n)
    print("Mempool difference", abs(m - n))
    n_cells = int((4 / 3) * abs(m - n)) + 30
    print('n_cells', n_cells)
    logging.info("Calculated FPR: %f" % fpr)
    fpr = 0.1
    b = BloomFilter(capacity=n, error_rate=fpr)
    i = IBLT(m=n_cells, k=3, key_size=32, value_size=0)
    for tx in selected_txs:
        b.add(tx['hash'])
        v = ''
        if include_value_in_iblt:
            v = tx_to_bytes(tx)
        i.insert(tx['hash'], v)
    return b, i
def __enter__(self):
    if os.path.exists(self.bloom_file):
        with open(self.bloom_file, 'rb') as f:
            self.bloom = BloomFilter.fromfile(f)
    else:
        self.bloom = BloomFilter(capacity=10000000, error_rate=0.001)
    return self.bloom
def _build_filter():
    bf = BloomFilter(capacity=10000, error_rate=0.001)
    worst = [w[:-2] for w in open(_WORST_DUMP).readlines()]
    map(bf.add, worst)
    with open(_BLOOM_DUMP, 'wb') as f:
        bf.tofile(f)
    print "Serialized bloom filter to ", _BLOOM_DUMP
def returnItemsWithMinSupportV3(itemSet, lenItem, transactionList, minSupport, freqSet):
    _itemSet = set()
    localSet = defaultdict(int)
    if len(itemSet):
        filterCdd = BloomFilter(capacity=len(itemSet), error_rate=0.0001)
    else:
        print("Empty itemSet: BloomFilter capacity must be > 0")
        return set([])

    print("Store cdds in BF ... - %s" % getTime())
    for val in itemSet:
        # TODO: use a counting BF and stop inserting a candidate once it reaches
        # minSup * len(transactionList); or, without a counting BF, check first
        # and skip values that are already in the BF.
        filterCdd.add(val)

    print("Mapping cddFromTrans on BF ... - %s" % getTime())
    for trans in transactionList:
        for cdd in combinations(trans, lenItem):
            cdd = frozenset(cdd)
            if cdd in filterCdd:
                freqSet[cdd] += 1   # global count
                localSet[cdd] += 1  # local (item, count) map, filtered against minSupport below

    print("Filter cdds that less than minSup. - %s" % getTime())
    for item, count in localSet.items():
        support = float(count) / len(transactionList)
        if support > minSupport:
            _itemSet.add(item)
    return _itemSet
class CrawlBSF:
    request_headers = {
        'host': "www.mafengwo.cn",
        'connection': "keep-alive",
        'cache-control': "no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
    }

    cur_level = 0
    max_level = 5
    dir_name = 'iterate/'
    iter_width = 50
    downloaded_urls = []

    du_md5_file_name = dir_name + 'download.txt'
    du_url_file_name = dir_name + 'urls.txt'

    bloom_downloaded_urls = BloomFilter(1024 * 1024 * 16, 0.01)
    bloom_url_queue = BloomFilter(1024 * 1024 * 16, 0.01)

    cur_queue = deque()
    child_queue = deque()

    def __init__(self, url):
        self.root_url = url
        self.cur_queue.append(url)
        self.du_file = open(self.du_url_file_name, 'a+')
        try:
            self.dumd5_file = open(self.du_md5_file_name, 'r')
            self.downloaded_urls = self.dumd5_file.readlines()
            self.dumd5_file.close()
            for urlmd5 in self.downloaded_urls:
                self.bloom_downloaded_urls.add(urlmd5[:-2])
        except IOError:
            print "File not found"
        finally:
            self.dumd5_file = open(self.du_md5_file_name, 'a+')

    def enqueueUrl(self, url):
        if url not in self.bloom_url_queue and hashlib.md5(url).hexdigest() not in crawler.bloom_downloaded_urls:
            self.child_queue.append(url)
            self.bloom_url_queue.add(url)

    def dequeuUrl(self):
        try:
            url = self.cur_queue.popleft()
            return url
        except IndexError:
            return None

    def close(self):
        self.dumd5_file.close()
        self.du_file.close()
def vacuum_all(self, limit=None, time_limit=None, unupdated=False):
    logger.debug('Begin vacuum_all(limit=%s, time_limit=%s, unupdated=%s)', limit, time_limit, unupdated)

    ## TODO delete SCIFields with SCFilterId not found in SCFilter

    self.plugins = self.load_plugins()
    self.ts = self.term_stat('SupplierCatalogItemFields Vacuum', len(self.plugins))
    now = start_time = datetime.now()

    try:
        transaction.begin()
        for plug in self.plugins.itervalues():
            supplier_catalog_filter_id = plug.supplier_catalog_filter_id()

            ### Generate a bloom filter set of SCIF id's in VersionModel
            model_name = plug.version_model() + 'Model'
            VersionModel = getattr(model, model_name)
            query = DBSession.query(VersionModel.supplier_catalog_item_field_id)
            s = BloomFilter(capacity=query.count() + 1)
            self.ts['sub_total'] = query.count()
            for (supplier_catalog_item_field_id, ) in query.yield_per(100):
                s.add(supplier_catalog_item_field_id)
                self.ts['sub_done'] += 1
            del query

            ### Iterate through SCIFields, deleting any that don't appear in the bloom filter.
            query = DBSession.query(SupplierCatalogItemFieldModel)
            query = query.filter(SupplierCatalogItemFieldModel.supplier_catalog_filter_id == supplier_catalog_filter_id)
            if unupdated is not True:
                query = query.filter(SupplierCatalogItemFieldModel.updated != None)

            if limit is not None:
                query = query.order_by(SupplierCatalogItemFieldModel.vacuumed.nullsfirst())
                query = query.limit(limit)
                logger.debug("LIMIT %i, supplier_catalog_filter_id %s", limit, supplier_catalog_filter_id)

            self.ts['sub_done'] = 0
            self.ts['sub_total'] = query.count()
            for supplier_catalog_item_field in query.yield_per(100):
                if supplier_catalog_item_field.id not in s:
                    logger.debug("Deleting SupplierCatalogItemField %s", supplier_catalog_item_field.id)
                    DBSession.delete(supplier_catalog_item_field)
                else:
                    supplier_catalog_item_field.vacuumed = now
                if self.ts['sub_done'] % 1000 == 0:
                    DBSession.flush()
                self.ts['sub_done'] += 1
            del query
            DBSession.flush()
            if time_limit is not None:
                if datetime.now() > start_time + time_limit:
                    logger.info("Reached Time Limit at %i of %i", self.ts['done'], self.ts['total'])
                    transaction.commit()
                    break
            self.ts['done'] += 1
        transaction.commit()
    except Exception:
        logger.exception("Caught Exception: ")
        transaction.abort()
    finally:
        self.ts.finish()
    logger.debug('End vacuum()')
def get_bloom(self):
    bloom_cache = BloomFilter(capacity=10000000, error_rate=0.00001)
    sql = "select url from user_tbl"
    self.cursor.execute(sql)
    datalist = self.cursor.fetchall()
    for data in datalist:
        bloom_cache.add(data[0])
    return bloom_cache
def test_union_capacity_fail():
    bloom_one = BloomFilter(1000, 0.001)
    bloom_two = BloomFilter(100, 0.001)

    def _run():
        new_bloom = bloom_one.union(bloom_two)

    assertRaises(ValueError, _run)
def checkCinT(currentCSet, transactionList, minSupport, freqSet):
    filterTrans = BloomFilter(capacity=len(transactionList), error_rate=0.001)
    for val in transactionList:
        filterTrans.add(val)
    print filterTrans.count
    for cdd in currentCSet:
        pass
    return freqSet
def test_intersection_capacity_fail(self):
    bloom_one = BloomFilter(1000, 0.001)
    bloom_two = BloomFilter(100, 0.001)

    def _run():
        new_bloom = bloom_one.intersection(bloom_two)

    self.assertRaises(ValueError, _run)
def test_union_k_fail(self):
    bloom_one = BloomFilter(100, 0.01)
    bloom_two = BloomFilter(100, 0.001)

    def _run():
        new_bloom = bloom_one.union(bloom_two)

    self.assertRaises(ValueError, _run)
def test_intersection_k_fail():
    bloom_one = BloomFilter(100, 0.001)
    bloom_two = BloomFilter(100, 0.01)

    def _run():
        new_bloom = bloom_one.intersection(bloom_two)

    assertRaises(ValueError, _run)
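For contrast with the failure tests above, a sketch of the passing case: pybloom's union() and intersection() only accept filters built with the same capacity and error_rate. The keys used here are placeholders.

bloom_one = BloomFilter(100, 0.001)
bloom_two = BloomFilter(100, 0.001)
bloom_one.add('apple')
bloom_two.add('banana')
combined = bloom_one.union(bloom_two)        # bitwise OR: contains both keys
overlap = bloom_one.intersection(bloom_two)  # bitwise AND of the bit arrays
assert 'apple' in combined and 'banana' in combined
# 'apple' should not appear in overlap (barring a false positive),
# since it was only added to bloom_one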
class BlogSpider(Spider):
    def __init__(self):
        self.pageNumber = 0
        self.logfile = open("/home/hduser/Logs/csdnUserlog.log", "w")
        self.f = BloomFilter(capacity=10000000, error_rate=0.0001)

    name = "csdnUserScrapy"
    # slow the crawl down: delay between requests
    download_delay = 0.5
    allowed_domains = ["my.csdn.net"]
    start_urls = [
        "http://my.csdn.net/jiazhijun","http://my.csdn.net/sodino","http://my.csdn.net/bill_man","http://my.csdn.net/lhc2207221755","http://my.csdn.net/xgbing","http://my.csdn.net/LoongEmbedded","http://my.csdn.net/jdh99","http://my.csdn.net/zqiang_55","http://my.csdn.net/zhao_zepeng","http://my.csdn.net/linyt","http://my.csdn.net/kmyhy","http://my.csdn.net/lincyang","http://my.csdn.net/jdsjlzx","http://my.csdn.net/u011012932","http://my.csdn.net/yayun0516","http://my.csdn.net/qq_23547831","http://my.csdn.net/CHENYUFENG1991","http://my.csdn.net/qq_26787115","http://my.csdn.net/kongki","http://my.csdn.net/you23hai45","http://my.csdn.net/cometwo","http://my.csdn.net/yuanziok","http://my.csdn.net/woxueliuyun","http://my.csdn.net/gatieme","http://my.csdn.net/u010850027","http://my.csdn.net/yinwenjie","http://my.csdn.net/teamlet","http://my.csdn.net/wangyangzhizhou","http://my.csdn.net/xiaoxian8023","http://my.csdn.net/ooppookid","http://my.csdn.net/wsl211511","http://my.csdn.net/liyuanbhu","http://my.csdn.net/sxhelijian","http://my.csdn.net/raylee2007","http://my.csdn.net/luozhuang","http://my.csdn.net/shaqoneal","http://my.csdn.net/dc_726","http://my.csdn.net/tobacco5648","http://my.csdn.net/wowkk","http://my.csdn.net/csfreebird","http://my.csdn.net/xukai871105","http://my.csdn.net/tuzongxun","http://my.csdn.net/mchdba","http://my.csdn.net/lichangzai","http://my.csdn.net/leftfist","http://my.csdn.net/wonder4","http://my.csdn.net/fogyisland2000","http://my.csdn.net/smstong","http://my.csdn.net/david_520042","http://my.csdn.net/ghostbear","http://my.csdn.net/xuyaqun","http://my.csdn.net/force_eagle","http://my.csdn.net/Jmilk","http://my.csdn.net/xiangpingli","http://my.csdn.net/quqi99","http://my.csdn.net/michaelzhou224","http://my.csdn.net/zzq900503","http://my.csdn.net/pipisorry","http://my.csdn.net/zhangmike","http://my.csdn.net/foruok","http://my.csdn.net/fengbingchun","http://my.csdn.net/qingrun","http://my.csdn.net/harrymeng","http://my.csdn.net/pukuimin1226","http://my.csdn.net/lihuoming","http://my.csdn.net/zhazha1980518","http://my.csdn.net/redarmy_chen","http://my.csdn.net/yuanmeng001","http://my.csdn.net/yeka","http://my.csdn.net/xieqq","http://my.csdn.net/zhangxiaoxiang","http://my.csdn.net/oiio","http://my.csdn.net/jobchanceleo","http://my.csdn.net/broadview2006"
    ]

    def parse(self, response):
        sel = Selector(response)
        item = CsdnusersspyderItem()
        print "response URL %s\n" % str(response.url)
        self.f.add(str(response.url))
        # print "*********\nBloom added self.url: %s \n**********\n" % str(response.url)
        item["userName"] = str(response.url).split('/')[-1]
        relativeMarks = response.xpath("//div[@class='header clearfix']/a[@href]").extract()
        item["follow"] = []
        item["befollowed"] = []
        i = 0
        for u in relativeMarks:
            unameMark = re.findall(r'username="******"', u)
            (s, e) = re.search(r'".*"', unameMark[0]).span()
            uname = unameMark[0][s + 1:e - 1]
            if i <= 7:
                item["follow"].append(uname.encode('utf-8'))
            else:
                item["befollowed"].append(uname.encode('utf-8'))
            newUrl = "http://my.csdn.net/" + uname
            if newUrl in self.f:
                self.logfile.write("Duplicated URL: %s\n" % newUrl)
                pass
            else:
                # self.logfile.write("not a duplicate: %s\n" % newUrl)
                yield Request(newUrl, callback=self.parse)
            i += 1
        item["pageUrl"] = str(response.url)
        focusNumMark = response.xpath("//dd[@class='focus_num']").extract()[0]
        (s, e) = re.search(r'\d+', focusNumMark).span()
        focusNum = focusNumMark[s:e].encode('utf-8')
        item["followNum"] = focusNum
        fansNumMark = response.xpath("//dd[@class='fans_num']").extract()[0]
        (s, e) = re.search(r'\d+', fansNumMark).span()
        fansNum = fansNumMark[s:e].encode('utf-8')
        item["befollowedNum"] = fansNum
        item["pageID"] = self.pageNumber
        item["pageMD5"] = GetMD5.getMD5(item["pageUrl"])
        yield item
        self.pageNumber = self.pageNumber + 1
        if self.pageNumber % 1000 == 0:
            time.sleep(15)
def filterCdd(currentCSet, transactionList, minSupport, freqSet):
    filterCdd = BloomFilter(capacity=len(currentCSet), error_rate=0.0001)
    for val in currentCSet:
        filterCdd.add(val)
    for trans in transactionList:
        for cdd in combinations(trans, 4):
            if cdd in filterCdd:
                freqSet[cdd] += 1
    return freqSet
def __init__(self, path):
    self.path = path
    self.rfile = None
    self.is_tofile = False
    if not os.path.isfile(path):
        self.bf = BloomFilter(100000, 0.001)
    else:
        self.rfile = open(path, 'rb')
        self.bf = BloomFilter.fromfile(self.rfile)
def init_bloom_filter(self, spider_name):
    self.bloom_file = '%s.bloom' % spider_name
    if os.path.exists(self.bloom_file):
        self.bloom_filter = BloomFilter.fromfile(open(self.bloom_file, 'rb'))
    else:
        self.bloom_filter = BloomFilter(capacity=100000000, error_rate=0.001)
class DownloadCache(object):
    def __init__(self, capacity, error_rate):
        self.cache = BloomFilter(capacity=capacity, error_rate=error_rate)

    def add(self, url):
        self.cache.add(url)

    def __contains__(self, item):
        return item in self.cache
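A brief usage sketch for DownloadCache; the URL and the download step are placeholders.

cache = DownloadCache(capacity=100000, error_rate=0.001)
url = 'http://example.com/page'
if url not in cache:
    cache.add(url)
    # download the page here; later checks of `url in cache` return True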
def to_bloom(filename):
    with open(filename, 'r') as f:
        b = BloomFilter(capacity=1000000, error_rate=0.001)
        for line in f:
            if line != "":
                b.add(line)
        new_filename = filename + ".bloom"
        with open(new_filename, 'wb') as out_f:
            b.tofile(out_f)
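A possible counterpart for loading the serialized filter back; the name from_bloom is illustrative and not part of the original code. Note that to_bloom() adds whole lines (including the trailing newline), so membership tests need the same form.

def from_bloom(bloom_filename):
    # load a filter previously written with BloomFilter.tofile()
    with open(bloom_filename, 'rb') as f:
        return BloomFilter.fromfile(f)

# e.g. b = from_bloom("urls.txt.bloom"); "http://example.com\n" in b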
def main():
    uncompressedList, queryList = generateTestData(20, 1000, 100000, 10000000)
    # print(uncompressedList)
    # print("\n\n\n\n\n\n")
    # print(queryList)
    print(len(uncompressedList), len(queryList))

    f = BloomFilter(capacity=1000, error_rate=0.001)
    for x in range(10):
        f.add(x)
    print(10 in f)
    print(5 in f)
class product_spider_object_type_xml(CrawlSpider):
    # Default Data should be config in spiders
    name = "Product_Spider_Lazada"
    allowed_domains = []
    start_urls = []
    # rules = (
    # )

    # My Extra DATA
    data = []
    name_data = ''
    source = ''

    # Init Spider
    def __init__(self, *arg, **karg):
        self.init_yaml('scrapy_service/templates/product.yaml', 'lazada_sitemap')
        CrawlSpider.__init__(self, *arg)

    # Load information from YAML file
    def init_yaml(self, path_to_file, name_data):
        document = open(path_to_file, 'r')
        self.data = load(document)
        self.name_data = name_data
        self.source = self.data[self.name_data]['database']['name']
        document.close()

        self.allowed_domains = self.data[self.name_data]['allowed_domains']
        self.start_urls = self.data[self.name_data]['start_urls']

        # Get Links by Rule. This can be NULL
        temp_rule = []
        for rule in self.data[self.name_data]['pattern']:
            temp_rule.append(Rule(LinkExtractor(allow=(rule, )), callback='parse'))
        self.rules = set(temp_rule)

        self.crawled_links = BloomFilter(2000000, 0.00001)

    def parse(self, response):
        xpath_selector = HtmlXPathSelector(response)
        # Check to parse more links
        if response.headers.get('Content-Type', False) and 'xml' in response.headers['Content-Type']:
            extra_links = HtmlParser.extract_new_link_with_xpath(self.data[self.name_data], xpath_selector)
            for link in extra_links:
                current_link = link if 'http' in link else self.start_urls[0] + link
                if current_link not in self.crawled_links:
                    self.crawled_links.add(current_link)
                    yield Request(current_link, callback=self.parse)
        else:
            ### Get ALL Items which exist in the current link
            items = HtmlParser.extract_product_with_xpath(self.data[self.name_data], xpath_selector, self.source)
            for item in items:
                yield item
def test_bloom_string(self):
    f = BloomFilter(capacity=10000, error_rate=0.001)

    for i in xrange(0, f.capacity):
        rnd = "".join(random.choice(string.letters) for i in xrange(40))
        _ = f.add(rnd)

    self.assertEqual(rnd in f, True)

    for i in string.letters:
        self.assertEqual(i in f, False)

    self.assertEqual(rnd in f, True)
def test_bloom_int(self):
    f = BloomFilter(capacity=10000, error_rate=0.001)

    for i in xrange(0, f.capacity):
        _ = f.add(i)

    for i in xrange(0, f.capacity / 2):
        r = random.randint(0, f.capacity - 1)
        self.assertEqual(r in f, True)

    for i in xrange(0, f.capacity / 2):
        r = random.randint(f.capacity, f.capacity * 2)
        self.assertEqual(r in f, False)
def generate_write_bloomfilter(dir_name, capacity=1000000, error_rate=0.01):
    bf = BloomFilter(capacity, error_rate)
    data_dir = zhihu_util.get_data_directory(dir_name)
    data_file_list = zhihu_util.get_file_list(data_dir)
    for data_file in data_file_list:
        # read url_suffix from data file
        with open(data_file, "r") as file_object:
            for line in file_object:
                url_suffix = line.split(USER_FIELD_DELIMITER)[0]
                if url_suffix.strip() != '':
                    # print "......url suffix:%s added into bloom filter" % url_suffix
                    bf.add(str(url_suffix))
    return bf
class DataStore(object):
    def __init__(self):
        self.blacklist = Feed().parse_feeds()
        self.bloom = BloomFilter(capacity=6000, error_rate=0.001)
        self.generate_bloom()

    def generate_bloom(self):
        for blacklist in self.blacklist[0]:
            for ip in blacklist['ips']:
                self.bloom.add(ip)

    def is_threat(self, ip):
        search = ip in self.bloom
        return search
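A short usage sketch for DataStore; the IP address is a placeholder and Feed() is assumed to be importable from the surrounding module.

store = DataStore()
if store.is_threat('198.51.100.7'):
    print('blacklisted IP')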
def record(url):
    """
    first time download of tieba images;
    create a bloomfilter for the next download run
    """
    numlist = getallnumlist(url)
    bloomfilter = BloomFilter(1000000)
    for number in numlist:
        bloomfilter.add(number)
    with open('./%s/check/bloomfilter' % (url[28:]), 'ab+') as b:
        bloomfilter.tofile(b)
    # print 'pool'
    multiprocessdownload(numlist)
def jaccard_ind(filename_1, filename_2):
    with open(filename_1, 'rb') as f_1:
        with open(filename_2, 'rb') as f_2:
            print(filename_1)
            b_1 = BloomFilter.fromfile(f_1)
            b_2 = BloomFilter.fromfile(f_2)
            b_inter = b_1.intersection(b_2)
            b_union = b_1.union(b_2)
            bits_inter = b_inter.bitarray.count(True)
            bits_union = b_union.bitarray.count(True)
            j_i = float(bits_inter) / float(bits_union)
            # print("%s ~ %s, %f" % (filename_1, filename_2, j_i))
            print("%s %s %f" % (filename_1, filename_2, j_i))
            f_2.close()
            f_1.close()
class BloomDupeFilter(BaseDupeFilter):
    def __init__(self, path=None):
        self.file = path
        self.fingerprints = BloomFilter(5000000, 0.00001)

    @classmethod
    def from_settings(cls, settings):
        return cls(job_dir(settings))

    def request_seen(self, request):
        if request.url in self.fingerprints:
            return True
        self.fingerprints.add(request.url)

    def close(self, reason):
        self.fingerprints = None
def main():
    bf = BloomFilter(1000000, 0.001)  # creation of the bloomfilter
    text = raw_input('Enter the text: ')
    words_list = text.split()
    file = open("/usr/share/dict/words")
    for word in file:
        _ = bf.add(word.rstrip())  # add valid words
    for item in words_list:
        # check for invalid words and display them
        if item not in bf:
            print 'Spelling error:', item
    file.close()
def close(self):
    if self._bf is None and self.__data is None:
        return

    words = self.__data.getvalue().split()
    self._bf = BloomFilter(capacity=len(words) + 1)
    for word in words:
        self._bf.add(word, skip_check=True)

    def get_bl_size():
        t = tempfile.NamedTemporaryFile().name
        with open(t, 'wb') as fn:
            self._bf.tofile(fn)
        s = os.path.getsize(t)
        os.remove(t)
        return s

    if os.path.isfile(self._name):
        return

    a = open(self._name, 'wb')
    a.write(struct.pack(">L", get_bl_size()))
    self._bf.tofile(a)
    with GzipFile(self._name, 'w', fileobj=a) as f:
        f.write(self.__data.getvalue())
    a.close()