def main(argv):
    if argv:
        error_rate = float(argv[0])
        print "[BUILDING] Using error-rate: {}".format(error_rate)
    if os.path.isfile(nsrl_path):
        print "[BUILDING] Reading in NSRL Database"
        with open(nsrl_path) as f_line:
            # Strip off header
            _ = f_line.readline()
            print "[BUILDING] Calculating number of hashes in NSRL..."
            num_lines = sum(bl.count("\n") for bl in blocks(f_line))
            print "[BUILDING] There are %s hashes in the NSRL Database" % num_lines
        with open(nsrl_path) as f_nsrl:
            # Strip off header
            _ = f_nsrl.readline()
            print "[BUILDING] Creating bloomfilter"
            bf = BloomFilter(num_lines, error_rate)
            print "[BUILDING] Inserting hashes into bloomfilter"
            for line in f_nsrl:
                md5_hash = line.split(",")[1].strip('"')
                if md5_hash:
                    try:
                        md5 = binascii.unhexlify(md5_hash)
                        bf.add(md5)
                    except Exception as e:
                        print "[ERROR] %s" % e
        print "[BUILDING] NSRL bloomfilter contains {} items.".format(len(bf))
        with open('nsrl.bloom', 'wb') as nb:
            bf.tofile(nb)
        print "[BUILDING] Complete"
    else:
        print "[ERROR] No such file or directory: %s" % nsrl_path
        return
def build_bloom_filter_and_iblt(m, include_value_in_iblt=False):
    c = 8 * math.pow(math.log(2), 2)
    tau = 16.5
    n = len(selected_txs)
    alpha = n / (c * tau)
    # print(alpha * tau)
    if m <= n:
        fpr = 0.1
    else:
        fpr = alpha / m - n
    print("Mempool difference", abs(m - n))
    n_cells = int((4 / 3) * abs(m - n)) + 30
    print('n_cells', n_cells)
    logging.info("Calculated FPR: %f" % fpr)
    fpr = 0.1
    b = BloomFilter(capacity=n, error_rate=fpr)
    i = IBLT(m=n_cells, k=3, key_size=32, value_size=0)
    for tx in selected_txs:
        b.add(tx['hash'])
        v = ''
        if include_value_in_iblt:
            v = tx_to_bytes(tx)
        i.insert(tx['hash'], v)
    return b, i
def main():
    if os.path.isfile(nsrl_path):
        print "BUILDING: Reading in NSRL Database"
        with open(nsrl_path) as f_line:
            # Strip off header
            _ = f_line.readline()
            print "BUILDING: Calculating number of hashes in NSRL..."
            num_lines = sum(bl.count("\n") for bl in blocks(f_line))
            print "BUILDING: There are %s hashes in the NSRL Database" % num_lines
        with open(nsrl_path) as f_nsrl:
            # Strip off header
            _ = f_nsrl.readline()
            print "BUILDING: Creating bloomfilter"
            bf = BloomFilter(num_lines, error_rate)
            print "BUILDING: Inserting hashes into bloomfilter"
            for line in f_nsrl:
                md5_hash = line.split(",")[1].strip('"')
                if md5_hash:
                    try:
                        bf.add(md5_hash)
                    except Exception as e:
                        print "ERROR: %s" % e
        print "BUILDING: NSRL bloomfilter contains {} items.".format(len(bf))
        with open('nsrl.bloom', 'wb') as nb:
            bf.tofile(nb)
        print "BUILDING: Complete"
    else:
        print "ERROR: No such file or directory: %s" % nsrl_path
        return
class UrlSpider(CrawlSpider):
    name = "urlspider"
    allowed_domains = ["tianya.cn"]
    start_urls = ("http://www.hao123.com", )
    rules = (
        Rule(SgmlLinkExtractor(allow=()), callback="parse_resp", follow=True),
    )

    def __init__(self, *args, **kwargs):
        # run using: scrapy crawl xss_spider -a url='http://example.com'
        super(UrlSpider, self).__init__(*args, **kwargs)
        self.start_urls = [kwargs.get('url')]
        hostname = urlparse(self.start_urls[0]).hostname
        # adding [] around the value seems to allow it to crawl subdomains of the value
        self.allowed_domains = [hostname]
        self.fingerprints = BloomFilter(3000000, 0.0001)

    def parse_start_url(self, response):
        print "start:" + response.url
        return

    def parse_resp(self, response):
        fp = response.url
        new_fp = obtain_key(fp)
        if new_fp in self.fingerprints:
            return
        self.fingerprints.add(new_fp)
        item = SiteCrawlItem()
        item["url"] = response.url
        yield item
def main(argv):
    if argv:
        error_rate = float(argv[0])
        print "[BUILDING] Using error-rate: {}".format(error_rate)
    if os.path.isfile(nsrl_path):
        print "[BUILDING] Reading in NSRL Database"
        with open(nsrl_path) as f_line:
            # Strip off header
            _ = f_line.readline()
            print "[BUILDING] Calculating number of hashes in NSRL..."
            num_lines = sum(bl.count("\n") for bl in blocks(f_line))
            print "[BUILDING] There are %s hashes in the NSRL Database" % num_lines
        with open(nsrl_path) as f_nsrl:
            # Strip off header
            _ = f_nsrl.readline()
            print "[BUILDING] Creating bloomfilter"
            bf = BloomFilter(num_lines, error_rate)
            print "[BUILDING] Inserting hashes into bloomfilter"
            for line in f_nsrl:
                sha1_hash = line.split(",")[0].strip('"')
                if sha1_hash:
                    try:
                        sha1 = binascii.unhexlify(sha1_hash)
                        bf.add(sha1)
                    except Exception as e:
                        print "[ERROR] %s" % e
        print "[BUILDING] NSRL bloomfilter contains {} items.".format(len(bf))
        with open('nsrl.bloom', 'wb') as nb:
            bf.tofile(nb)
        print "[BUILDING] Complete"
    else:
        print "[ERROR] No such file or directory: %s" % nsrl_path
        return
class BloomCheckPipeline(object):
    def __init__(self):
        file_name = 'bloomfilter'

    def open_spider(self, spider):
        file_name = 'bloomfilter'
        is_exist = os.path.exists(file_name + '.blm')
        if is_exist:
            self.bf = BloomFilter.fromfile(open('bloomfilter.blm', 'rb'))
            print('open blm file success')
        else:
            self.bf = BloomFilter(100000, 0.001)
            print('didn\'t find the blm file')

    def process_item(self, item, spider):
        # Drop items whose url has already been seen; adjust the key to your own needs.
        if item['urlToken'] in self.bf or item['id'] in self.bf:
            print('drop one item for exist')
            raise DropItem('drop an item for exist')
        else:
            self.bf.add(item['urlToken'])
            print('add one success')
            return item

    def close_spider(self, spider):
        self.bf.tofile(open('bloomfilter.blm', 'wb'))
def user_init():
    import re
    users = BloomFilter(10000000, 0.001)
    f = open(u"D:/工作/数据美化/data/简书用户id1.txt")
    for line in f:
        users.add(line.strip())
    return users
def add(self, key):
    """Adds a key to this bloom filter.
    If the key already exists in this filter it will return True.
    Otherwise False.

    >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \
                                mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    >>> b.add("hello")
    False
    >>> b.add("hello")
    True

    """
    if key in self:
        return True
    if not self.filters:
        filter = BloomFilter(
            capacity=self.initial_capacity,
            error_rate=self.error_rate * (1.0 - self.ratio))
        self.filters.append(filter)
    else:
        filter = self.filters[-1]
        if filter.count >= filter.capacity:
            filter = BloomFilter(
                capacity=filter.capacity * self.scale,
                error_rate=filter.error_rate * self.ratio)
            self.filters.append(filter)
    filter.add(key, skip_check=True)
    return False
class BLOOMDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None):
        self.file = None
        # capacity
        #   this BloomFilter must be able to store at least *capacity* elements
        #   while maintaining no more than *error_rate* chance of false positives
        # error_rate
        #   the error_rate of the filter returning false positives. This
        #   determines the filter's capacity. Inserting more than capacity
        #   elements greatly increases the chance of false positives.
        self.fingerprints = BloomFilter(capacity=2000000, error_rate=0.00001)
        # get all the urls from database
        db = DynamoDBPipeline()
        urls = db.get_url_list()
        [self.fingerprints.add(url) for url in urls]

    @classmethod
    def from_settings(cls, settings):
        return cls(job_dir(settings))

    def request_seen(self, request):
        fp = request.url
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)

    def close(self, reason):
        self.fingerprints = None
def determine_lookup_speed_threshold(self):
    from time import time
    # do each one 5 times
    bf = BloomFilter(capacity=self.bloom_size, error_rate=self.bloom_error)
    count = 1
    repetitions = 5
    self_bf_holder = self.bf
    self.bf = bf
    while True:
        bf.add('andrew_' + str(count))
        bin_faster_count = 0
        for j in xrange(repetitions):
            # Linear scan
            t1 = time()
            self.linear_scan_count('andrew')
            t2 = time()
            linear_time = t2 - t1

            t1 = time()
            self.binsearch_count('andrew')
            t2 = time()
            bin_time = t2 - t1

            bin_faster_count += int(bin_time < linear_time)

        if 1. * bin_faster_count / repetitions >= 0.75:
            del bf
            self.bf = self_bf_holder
            return count

        count += 1
class UrlBloom:
    '''BloomFilter: check elements repetition'''

    def __init__(self, _capacity=1000000, _error_rate=0.00001):
        self.is_full = False
        # determine whether to load time-stamped backup bloom data
        if CONFIG.get('BACKUP', 0) == 1:
            self.bomb = TimeBomb(CONFIG['TMP_DIR'] + CONFIG['BLOOM_FILE'])
            self.filter = self.bomb.load()
            if self.filter is None:
                self.filter = BloomFilter(capacity=_capacity, error_rate=_error_rate)
            self.bomb.dump(self.filter)
        else:
            self.filter = BloomFilter(capacity=_capacity, error_rate=_error_rate)

    def add(self, links):
        if self.is_full:
            return
        try:
            for ele in links:
                self.filter.add(ele)
        except IndexError:
            # BloomFilter raises IndexError when it is at capacity
            self.is_full = True

    def clean(self, links):
        res = []
        for ele in links:
            if ele not in self.filter:
                res.append(ele)
        return res
class Filter(object):
    def __init__(self, cachefile, capacity=1000000, error_rate=0.001):
        self.cachefile = cachefile
        if os.name == 'nt' or not cachefile:
            from pybloom import BloomFilter
            if self.cache():
                # open in binary mode so the serialized filter is read back intact
                with open(cachefile, 'rb') as fp:
                    self.filter = BloomFilter.fromfile(fp)
            else:
                self.filter = BloomFilter(capacity=capacity, error_rate=error_rate)
        elif os.name == 'posix':
            from pybloomfilter import BloomFilter
            if self.cache():
                self.filter = BloomFilter.open(self.cachefile)
            else:
                self.filter = BloomFilter(capacity, error_rate, cachefile)

    def __contains__(self, key):
        return key in self.filter

    def add(self, obj):
        self.filter.add(obj)
        if os.name == 'nt':
            with open(self.cachefile, 'wb') as fp:
                self.filter.tofile(fp)

    def cache(self):
        return os.path.exists(self.cachefile or '')
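# A minimal usage sketch for the Filter wrapper above, assuming it is importable
# from the surrounding module; the cache file name and URL are illustrative.
seen = Filter('seen_urls.bloom')
if 'http://example.com/page' not in seen:
    seen.add('http://example.com/page')   # persists to the cache file on Windows
assert 'http://example.com/page' in seen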
def generateBloomFilter(file):
    "Generates the bloom filter for entries in file."
    # this probably isn't enough, need to look the data formatting over more thoroughly
    d = BloomFilter(1000, 0.001)
    for line in file:
        # split on whitespace and key the filter on the first field
        d.add(line.split(None, 1)[0])
    return d
class UrlManager(object):
    def __init__(self):
        self.urls = []
        self.url_bloom_filter = BloomFilter(capacity=500000, error_rate=0.001)

    def add_url(self, url):
        # if url not in self.url_bloom_filter:
        self.urls.append(url)
        # self.url_bloom_filter.add(url)

    def add_urls(self, urls):
        for url in urls:
            self.add_url(url)

    def is_empty(self):
        return len(self.urls) == 0

    def get_url(self):
        return self.urls.pop(0)

    def get_len(self):
        return len(self.urls)

    def is_viewed(self, url):
        return url in self.url_bloom_filter

    def add_viewed(self, url):
        self.url_bloom_filter.add(url)
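# A minimal usage sketch for the UrlManager above; the URLs are illustrative.
manager = UrlManager()
manager.add_urls(['http://example.com/a', 'http://example.com/b'])
while not manager.is_empty():
    url = manager.get_url()
    if not manager.is_viewed(url):
        manager.add_viewed(url)  # mark as crawled in the bloom filter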
def returnItemsWithMinSupportV3(itemSet, lenItem, transactionList, minSupport, freqSet):
    _itemSet = set()
    localSet = defaultdict(int)

    if len(itemSet):
        filterCdd = BloomFilter(capacity=len(itemSet), error_rate=0.0001)
    else:
        print("As I say, ValueError: Capacity must be > 0")
        return set([])

    print("Store cdds in BF ... - %s" % getTime())
    for val in itemSet:
        # TODO: introduce a counting BF and skip insertion once a candidate reaches
        # minSup * len(transactionList); or, without a counting BF, simply skip
        # candidates that are already in the BF.
        filterCdd.add(val)

    print("Mapping cddFromTrans on BF ... - %s" % getTime())
    for trans in transactionList:
        for cdd in combinations(trans, lenItem):
            cdd = frozenset(cdd)
            if cdd in filterCdd:
                freqSet[cdd] += 1     # global count
                localSet[cdd] += 1    # local (item, count) pairs, filtered against minSupport below

    print("Filter cdds that less than minSup. - %s" % getTime())
    for item, count in localSet.items():
        support = float(count) / len(transactionList)
        if support > minSupport:
            _itemSet.add(item)
    return _itemSet
def vacuum_all(self, limit=None, time_limit=None, unupdated=False):
    logger.debug('Begin vacuum_all(limit=%s, time_limit=%s, unupdated=%s)', limit, time_limit, unupdated)
    ##TODO delete SCIFields with SCFilterId not found in SCFilter
    self.plugins = self.load_plugins()
    self.ts = self.term_stat('SupplierCatalogItemFields Vacuum', len(self.plugins))
    now = start_time = datetime.now()
    try:
        transaction.begin()
        for plug in self.plugins.itervalues():
            supplier_catalog_filter_id = plug.supplier_catalog_filter_id()

            ### Generate a bloom filter set of SCIF id's in VersionModel
            model_name = plug.version_model() + 'Model'
            VersionModel = getattr(model, model_name)
            query = DBSession.query(VersionModel.supplier_catalog_item_field_id)
            s = BloomFilter(capacity=query.count() + 1)
            self.ts['sub_total'] = query.count()
            for (supplier_catalog_item_field_id, ) in query.yield_per(100):
                s.add(supplier_catalog_item_field_id)
                self.ts['sub_done'] += 1
            del query

            ### Iterate through SCIFields, deleting any that don't appear in the bloom filter.
            query = DBSession.query(SupplierCatalogItemFieldModel)
            query = query.filter(SupplierCatalogItemFieldModel.supplier_catalog_filter_id == supplier_catalog_filter_id)
            if unupdated is not True:
                query = query.filter(SupplierCatalogItemFieldModel.updated != None)
            if limit is not None:
                query = query.order_by(SupplierCatalogItemFieldModel.vacuumed.nullsfirst())
                query = query.limit(limit)
                logger.debug("LIMIT %i, supplier_catalog_filter_id %s", limit, supplier_catalog_filter_id)
            self.ts['sub_done'] = 0
            self.ts['sub_total'] = query.count()
            for supplier_catalog_item_field in query.yield_per(100):
                if supplier_catalog_item_field.id not in s:
                    logger.debug("Deleting SupplierCatalogItemField %s", supplier_catalog_item_field.id)
                    DBSession.delete(supplier_catalog_item_field)
                else:
                    supplier_catalog_item_field.vacuumed = now
                if self.ts['sub_done'] % 1000 == 0:
                    DBSession.flush()
                self.ts['sub_done'] += 1
            del query
            DBSession.flush()
            if time_limit is not None:
                if datetime.now() > start_time + time_limit:
                    logger.info("Reached Time Limit at %i of %i", self.ts['done'], self.ts['total'])
                    transaction.commit()
                    break
            self.ts['done'] += 1
        transaction.commit()
    except Exception:
        logger.exception("Caught Exception: ")
        transaction.abort()
    finally:
        self.ts.finish()
    logger.debug('End vacuum()')
def get_bloom(self):
    bloom_cache = BloomFilter(capacity=10000000, error_rate=0.00001)
    sql = "select url from user_tbl"
    self.cursor.execute(sql)
    datalist = self.cursor.fetchall()
    for data in datalist:
        bloom_cache.add(data[0])
    return bloom_cache
def checkCinT(currentCSet, transactionList, minSupport, freqSet):
    filterTrans = BloomFilter(capacity=len(transactionList), error_rate=0.001)
    for val in transactionList:
        filterTrans.add(val)
    print filterTrans.count
    for cdd in currentCSet:
        pass
    return freqSet
class DownloadCache(object):
    def __init__(self, capacity, error_rate):
        self.cache = BloomFilter(capacity=capacity, error_rate=error_rate)

    def add(self, url):
        self.cache.add(url)

    def __contains__(self, item):
        return item in self.cache
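# A minimal usage sketch for the DownloadCache above; the capacity, error rate
# and URL are illustrative values.
cache = DownloadCache(capacity=100000, error_rate=0.001)
if 'http://example.com/page' not in cache:
    cache.add('http://example.com/page')   # remember that this URL was downloaded
assert 'http://example.com/page' in cache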
def filterCdd(currentCSet, transactionList, minSupport, freqSet):
    filterCdd = BloomFilter(capacity=len(currentCSet), error_rate=0.0001)
    for val in currentCSet:
        filterCdd.add(val)
    for trans in transactionList:
        for cdd in combinations(trans, 4):
            if cdd in filterCdd:
                freqSet[cdd] += 1
    return freqSet
class BlogSpider(Spider):
    def __init__(self):
        self.pageNumber = 0
        self.logfile = open("/home/hduser/Logs/csdnUserlog.log", "w")
        self.f = BloomFilter(capacity=10000000, error_rate=0.0001)

    name = "csdnUserScrapy"
    # slow the crawl down (originally targeted at 2s)
    download_delay = 0.5
    allowed_domains = ["my.csdn.net"]
    start_urls = ["http://my.csdn.net/jiazhijun","http://my.csdn.net/sodino","http://my.csdn.net/bill_man","http://my.csdn.net/lhc2207221755","http://my.csdn.net/xgbing","http://my.csdn.net/LoongEmbedded","http://my.csdn.net/jdh99","http://my.csdn.net/zqiang_55","http://my.csdn.net/zhao_zepeng","http://my.csdn.net/linyt","http://my.csdn.net/kmyhy","http://my.csdn.net/lincyang","http://my.csdn.net/jdsjlzx","http://my.csdn.net/u011012932","http://my.csdn.net/yayun0516","http://my.csdn.net/qq_23547831","http://my.csdn.net/CHENYUFENG1991","http://my.csdn.net/qq_26787115","http://my.csdn.net/kongki","http://my.csdn.net/you23hai45","http://my.csdn.net/cometwo","http://my.csdn.net/yuanziok","http://my.csdn.net/woxueliuyun","http://my.csdn.net/gatieme","http://my.csdn.net/u010850027","http://my.csdn.net/yinwenjie","http://my.csdn.net/teamlet","http://my.csdn.net/wangyangzhizhou","http://my.csdn.net/xiaoxian8023","http://my.csdn.net/ooppookid","http://my.csdn.net/wsl211511","http://my.csdn.net/liyuanbhu","http://my.csdn.net/sxhelijian","http://my.csdn.net/raylee2007","http://my.csdn.net/luozhuang","http://my.csdn.net/shaqoneal","http://my.csdn.net/dc_726","http://my.csdn.net/tobacco5648","http://my.csdn.net/wowkk","http://my.csdn.net/csfreebird","http://my.csdn.net/xukai871105","http://my.csdn.net/tuzongxun","http://my.csdn.net/mchdba","http://my.csdn.net/lichangzai","http://my.csdn.net/leftfist","http://my.csdn.net/wonder4","http://my.csdn.net/fogyisland2000","http://my.csdn.net/smstong","http://my.csdn.net/david_520042","http://my.csdn.net/ghostbear","http://my.csdn.net/xuyaqun","http://my.csdn.net/force_eagle","http://my.csdn.net/Jmilk","http://my.csdn.net/xiangpingli","http://my.csdn.net/quqi99","http://my.csdn.net/michaelzhou224","http://my.csdn.net/zzq900503","http://my.csdn.net/pipisorry","http://my.csdn.net/zhangmike","http://my.csdn.net/foruok","http://my.csdn.net/fengbingchun","http://my.csdn.net/qingrun","http://my.csdn.net/harrymeng","http://my.csdn.net/pukuimin1226","http://my.csdn.net/lihuoming","http://my.csdn.net/zhazha1980518","http://my.csdn.net/redarmy_chen","http://my.csdn.net/yuanmeng001","http://my.csdn.net/yeka","http://my.csdn.net/xieqq","http://my.csdn.net/zhangxiaoxiang","http://my.csdn.net/oiio","http://my.csdn.net/jobchanceleo","http://my.csdn.net/broadview2006"]

    def parse(self, response):
        sel = Selector(response)
        item = CsdnusersspyderItem()
        print "response URL %s\n" % str(response.url)
        self.f.add(str(response.url))
        # print "*********\nBloom added self.url: %s \n**********\n" % str(response.url)
        item["userName"] = str(response.url).split('/')[-1]
        relativeMarks = response.xpath("//div[@class='header clearfix']/a[@href]").extract()
        item["follow"] = []
        item["befollowed"] = []
        i = 0
        for u in relativeMarks:
            unameMark = re.findall(r'username="******"', u)
            (s, e) = re.search(r'".*"', unameMark[0]).span()
            uname = unameMark[0][s + 1:e - 1]
            if i <= 7:
                item["follow"].append(uname.encode('utf-8'))
            else:
                item["befollowed"].append(uname.encode('utf-8'))
            newUrl = "http://my.csdn.net/" + uname
            if newUrl in self.f:
                self.logfile.write("Duplicated URL: %s\n" % newUrl)
            else:
                # self.logfile.write("not yet seen: %s\n" % newUrl)
                yield Request(newUrl, callback=self.parse)
            i += 1
        item["pageUrl"] = str(response.url)
        focusNumMark = response.xpath("//dd[@class='focus_num']").extract()[0]
        (s, e) = re.search(r'\d+', focusNumMark).span()
        focusNum = focusNumMark[s:e].encode('utf-8')
        item["followNum"] = focusNum
        fansNumMark = response.xpath("//dd[@class='fans_num']").extract()[0]
        (s, e) = re.search(r'\d+', fansNumMark).span()
        fansNum = fansNumMark[s:e].encode('utf-8')
        item["befollowedNum"] = fansNum
        item["pageID"] = self.pageNumber
        item["pageMD5"] = GetMD5.getMD5(item["pageUrl"])
        yield item
        self.pageNumber = self.pageNumber + 1
        if self.pageNumber % 1000 == 0:
            time.sleep(15)
class BloomZip(object):
    def __init__(self, name):
        super(BloomZip, self).__init__()
        self.__data = StringIO()
        self._name = name
        self._bf = None
        if os.path.isfile(self._name):
            with open(self._name, 'rb') as f:
                length = struct.unpack(">L", f.read(4))[0]
                self._bf = BloomFilter.fromfile(f, length)

    def contains(self, word):
        return word in self._bf

    def write(self, data):
        self.__data.write(data)

    def close(self):
        if self._bf is None and self.__data is None:
            return

        words = self.__data.getvalue().split()
        self._bf = BloomFilter(capacity=len(words) + 1)
        for word in words:
            self._bf.add(word, skip_check=True)

        def get_bl_size():
            t = tempfile.NamedTemporaryFile().name
            with open(t, 'w') as fn:
                self._bf.tofile(fn)
            s = os.path.getsize(t)
            os.remove(t)
            return s

        if os.path.isfile(self._name):
            return

        a = open(self._name, 'w')
        a.write(struct.pack(">L", get_bl_size()))
        self._bf.tofile(a)
        with GzipFile(self._name, 'w', fileobj=a) as f:
            f.write(self.__data.getvalue())
        a.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        if exc_type is not None:
            print(exc_tb)
            raise exc_val
def test_union():
    bloom_one = BloomFilter(100, 0.001)
    bloom_two = BloomFilter(100, 0.001)
    chars = [chr(i) for i in range(97, 123)]
    for char in chars[len(chars) / 2:]:
        bloom_one.add(char)
    for char in chars[:len(chars) / 2]:
        bloom_two.add(char)
    new_bloom = bloom_one.union(bloom_two)
    for char in chars:
        assert_(char in new_bloom)
def spider_new_Init(spidername, dbname, website, carnum, urltag='url', keycol='url'):
    # Mongo setting
    # spider_original_Init(dbname, website, carnum)
    # Mongo con
    connection = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
    dbdata = connection[dbname]
    collectiondata = dbdata[website]

    # bloom file
    filename = 'blm/' + dbname + '/' + spidername + ".blm"

    # pybloom
    num = int((int(carnum) + collectiondata.count()) * 1.1)
    df = BloomFilter(capacity=num, error_rate=0.001)

    # read
    isexists = os.path.exists(filename)
    itemmax = 0
    if isexists:
        fr = open(filename, "r")
        lines = fr.readlines()
        for line in lines:
            line = line.strip('\n')
            df.add(line)
        fr.close()
    else:
        fa = open(filename, "w")
        for i in collectiondata.find():
            if keycol in i.keys():
                if urltag == 'url':
                    item = i[keycol]
                    itemmd5 = md5(item).hexdigest()
                    returndf = df.add(itemmd5)
                    if not (returndf):
                        fa.writelines(itemmd5 + '\n')
                else:
                    item = re.findall('\d+', i["url"])
                    item = int(item[len(item) - 1])
                    if item > itemmax:
                        itemmax = item
        if urltag == 'num':
            for item in range(1, itemmax + 1):
                item = str(item)
                itemmd5 = md5(item).hexdigest()
                returndf = df.add(itemmd5)
                if not (returndf):
                    fa.writelines(itemmd5 + '\n')
        fa.close()
    connection.close()
    return df
def test_union(self):
    bloom_one = BloomFilter(100, 0.001)
    bloom_two = BloomFilter(100, 0.001)
    chars = [chr(i) for i in range(97, 123)]
    for char in chars[len(chars) / 2:]:
        bloom_one.add(char)
    for char in chars[:len(chars) / 2]:
        bloom_two.add(char)
    new_bloom = bloom_one.union(bloom_two)
    for char in chars:
        self.assert_(char in new_bloom)
def main():
    uncompressedList, queryList = generateTestData(20, 1000, 100000, 10000000)
    # print(uncompressedList)
    # print("\n\n\n\n\n\n")
    # print(queryList)
    print(len(uncompressedList), len(queryList))

    f = BloomFilter(capacity=1000, error_rate=0.001)
    for x in range(10):
        f.add(x)
    print(10 in f)
    print(5 in f)
def to_bloom(filename):
    with open(filename, 'r') as f:
        b = BloomFilter(capacity=1000000, error_rate=0.001)
        for line in f:
            if line != "":
                b.add(line)
    new_filename = filename + ".bloom"
    out_f = open(new_filename, 'wb')
    b.tofile(out_f)
def test_union(self):
    bloom_one = BloomFilter(100, 0.001)
    bloom_two = BloomFilter(100, 0.001)
    chars = [chr(i) for i in range_fn(97, 123)]
    for char in chars[int(len(chars) / 2):]:
        bloom_one.add(char)
    for char in chars[:int(len(chars) / 2)]:
        bloom_two.add(char)
    new_bloom = bloom_one.union(bloom_two)
    for char in chars:
        self.assertTrue(char in new_bloom)
class product_spider_object_type_xml(CrawlSpider):
    # Default data should be configured in spiders
    name = "Product_Spider_Lazada"
    allowed_domains = []
    start_urls = []
    # rules = (
    # )

    # My Extra DATA
    data = []
    name_data = ''
    source = ''

    # Init Spider
    def __init__(self, *arg, **karg):
        self.init_yaml('scrapy_service/templates/product.yaml', 'lazada_sitemap')
        CrawlSpider.__init__(self, *arg)

    # Load information from YAML file
    def init_yaml(self, path_to_file, name_data):
        document = open(path_to_file, 'r')
        self.data = load(document)
        self.name_data = name_data
        self.source = self.data[self.name_data]['database']['name']
        document.close()
        self.allowed_domains = self.data[self.name_data]['allowed_domains']
        self.start_urls = self.data[self.name_data]['start_urls']

        # Get Links by Rule. This can be NULL
        temp_rule = []
        for rule in self.data[self.name_data]['pattern']:
            temp_rule.append(Rule(LinkExtractor(allow=(rule, )), callback='parse'))
        self.rules = set(temp_rule)

        self.crawled_links = BloomFilter(2000000, 0.00001)

    def parse(self, response):
        xpath_selector = HtmlXPathSelector(response)
        # Check to parse more links
        if response.headers.get('Content-Type', False) and 'xml' in response.headers['Content-Type']:
            extra_links = HtmlParser.extract_new_link_with_xpath(self.data[self.name_data], xpath_selector)
            for link in extra_links:
                current_link = link if 'http' in link else self.start_urls[0] + link
                if current_link not in self.crawled_links:
                    self.crawled_links.add(current_link)
                    yield Request(current_link, callback=self.parse)
        else:
            ### Get ALL items that exist in the current link
            items = HtmlParser.extract_product_with_xpath(self.data[self.name_data], xpath_selector, self.source)
            for item in items:
                yield item
def generate_write_bloomfilter(dir_name, capacity=1000000, error_rate=0.01):
    bf = BloomFilter(capacity, error_rate)
    data_dir = zhihu_util.get_data_directory(dir_name)
    data_file_list = zhihu_util.get_file_list(data_dir)
    for data_file in data_file_list:
        # read url_suffix from data file
        with open(data_file, "r") as file_object:
            for line in file_object:
                url_suffix = line.split(USER_FIELD_DELIMITER)[0]
                if url_suffix.strip() != '':
                    # print "......url suffix:%s added into bloom filter" % url_suffix
                    bf.add(str(url_suffix))
    return bf
def test_intersection(self):
    bloom_one = BloomFilter(100, 0.001)
    bloom_two = BloomFilter(100, 0.001)
    chars = [chr(i) for i in range(97, 123)]
    for char in chars:
        bloom_one.add(char)
    for char in chars[:len(chars) / 2]:
        bloom_two.add(char)
    new_bloom = bloom_one.intersection(bloom_two)
    for char in chars[:len(chars) / 2]:
        self.assert_(char in new_bloom)
    for char in chars[len(chars) / 2:]:
        self.assert_(char not in new_bloom)
def hound():
    f = BloomFilter(capacity=100000, error_rate=0.01)
    text = requests.get('https://www.gutenberg.org/files/2852/2852-0.txt').text
    for word in text.split():
        word = word.lower().strip()
        f.add(word)
    print len(f)
    print len(text.split())
    for w in ('holmes', 'watson', 'hound', 'moor', 'queen'):
        print 'Found', w, w in f
class DataStore(object):
    def __init__(self):
        self.blacklist = Feed().parse_feeds()
        self.bloom = BloomFilter(capacity=6000, error_rate=0.001)
        self.generate_bloom()

    def generate_bloom(self):
        for blacklist in self.blacklist[0]:
            for ip in blacklist['ips']:
                self.bloom.add(ip)

    def is_threat(self, ip):
        search = ip in self.bloom
        return search
def record(url):
    """
    first time downloading tieba img:
    create a bloomfilter for the next download run
    """
    numlist = getallnumlist(url)
    bloomfilter = BloomFilter(1000000)
    for number in numlist:
        bloomfilter.add(number)
    with open('./%s/check/bloomfilter' % (url[28:]), 'ab+') as b:
        bloomfilter.tofile(b)
    # print 'pool'
    multiprocessdownload(numlist)
class BloomFilterDuplicateRemover(DuplicateRemover):
    def __init__(self, capacity=1000000, error_rate=0.001):
        self.bloomFilter = BloomFilter(capacity, error_rate)

    def dump(self, url):
        """Check whether the url already exists: return True if it does, otherwise False."""
        url_hash = self.url_hash(url)
        return url_hash in self.bloomFilter

    def add(self, url):
        url_hash = self.url_hash(url)
        self.bloomFilter.add(url_hash)

    def count(self):
        return self.bloomFilter.count
def create_bloom_filter(values, error_rate):
    """
    Create a BloomFilter object with the given error rate and a capacity
    given by the number of unique items in values. Add each value in
    values to the BloomFilter and return.
    """
    value_set = set(filter(lambda x: len(x), values))
    debug("Creating bloom filter, capacity=%d, error_rate=%f (%.4f%%)\n" % (
        len(value_set), error_rate, 100 * error_rate))
    b = BloomFilter(capacity=len(value_set), error_rate=error_rate)
    for value in value_set:
        debug("Adding '%s'\n" % value)
        b.add(value)
    return (b, len(value_set))
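# A minimal usage sketch for create_bloom_filter above, assuming the surrounding
# module's debug() helper is available; the values and error rate are illustrative.
bf, unique_count = create_bloom_filter(['alpha', 'beta', 'alpha', ''], 0.001)
assert unique_count == 2               # empty strings are filtered out, duplicates collapse
assert 'alpha' in bf and 'beta' in bf  # no false negatives for inserted values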
class BloomDupeFilter(BaseDupeFilter):
    def __init__(self, path=None):
        self.file = path
        self.fingerprints = BloomFilter(5000000, 0.00001)

    @classmethod
    def from_settings(cls, settings):
        return cls(job_dir(settings))

    def request_seen(self, request):
        if request.url in self.fingerprints:
            return True
        self.fingerprints.add(request.url)

    def close(self, reason):
        self.fingerprints = None
def spider_update_Init(dbname, website, carnum):
    # Mongo setting
    # spider_original_Init(dbname, website, carnum)
    # Mongo con
    connection = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
    dbdata = connection[dbname]
    collectiondata = dbdata[website]

    # pybloom
    num = (int(carnum) + collectiondata.count()) * 1.1
    df = BloomFilter(capacity=num, error_rate=0.01)

    # urllist
    urllist = []
    for i in collectiondata.find():
        if "url" in i.keys():
            item = i["url"]
            if "status" in i.keys():
                if not (i['status'].find('sold') == -1):
                    continue
            itemmd5 = md5(item).hexdigest()
            returndf = df.add(itemmd5)
            if not (returndf):
                urllist.append(item)
    connection.close()
    return urllist
class FileBloomFilter(object):
    def __init__(self, path):
        self.path = path
        self.rfile = None
        self.is_tofile = False
        if not os.path.isfile(path):
            self.bf = BloomFilter(100000, 0.001)
        else:
            self.rfile = open(path, 'r')
            self.bf = BloomFilter.fromfile(self.rfile)

    def __del__(self):
        if not self.is_tofile:
            self.tofile()
        if self.rfile:
            self.rfile.close()

    def tofile(self):
        if self.bf:
            wfile = open(self.path, 'w+')
            self.bf.tofile(wfile)
            wfile.close()
            self.is_tofile = True

    def have(self, item):
        key = item['ip'] + ":" + str(item['port'])
        if key in self.bf:
            return True
        else:
            return False

    def filter_proxy_ip_list(self, items):
        filter_items = []
        for item in items:
            if not self.have(item):
                filter_items.append(item)
        return filter_items

    def add_proxy_ip(self, item):
        key = item['ip'] + ":" + str(item['port'])
        self.bf.add(key)
        self.is_tofile = False

    def add_proxy_ip_all(self, items):
        for item in items:
            self.add_proxy_ip(item)
def bloom_filter(ifile, ofile, dupfile, blofile, maxsize=99999999):
    blf = BloomFilter(capacity=maxsize, error_rate=0.001)
    with open(ifile, 'rb') as f:
        lines = f.readlines()
    lines_len = len(lines)
    unique_lines = []
    for line in lines:
        if line in blf:
            # record the duplicate instead of mutating the list while iterating over it
            with open(dupfile, 'ab') as new_file:
                new_file.write(line)
        else:
            blf.add(line)
            unique_lines.append(line)
    print "lines:", lines_len, " => ", "nlines:", len(blf)
    with open(blofile, 'wb') as blfile:
        pickle.dump(blf, blfile)
    with open(ofile, 'wb') as outfile:
        outfile.writelines(unique_lines)
class BLOOMDupeFilter(BaseDupeFilter):
    def __init__(self, path=None):
        self.file = None
        self.fingerprints = BloomFilter(capacity=1000000, error_rate=0.001)

    @classmethod
    def from_settings(cls, settings):
        return cls(job_dir(settings))

    def request_seen(self, request):
        fp = request.url
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        return False

    def close(self, reason):
        self.fingerprints = None
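# A hedged configuration sketch for wiring the dupe filter above into a Scrapy
# project via its settings.py; the module path 'myproject.dupefilters' is
# illustrative and depends on where the class actually lives.
DUPEFILTER_CLASS = 'myproject.dupefilters.BLOOMDupeFilter'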
class BloomURLDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None):
        self.file = None
        self.fingerprints = BloomFilter(3000000, 0.0001)

    @classmethod
    def from_settings(cls, settings):
        return cls(job_dir(settings))

    def request_seen(self, request):
        fp = request.url
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)

    def close(self, reason):
        self.fingerprints = None
def returnItemsWithMinSupportV4(itemSet, lenItem, transactionList, minSupport, freqSet):
    _itemSet = set()
    localSet = defaultdict(int)

    filterCdd = BloomFilter(capacity=len(itemSet), error_rate=0.0001)
    for val in itemSet:
        filterCdd.add(val)

    for trans in transactionList:
        for cdd in combinations(trans, lenItem):
            if cdd in filterCdd:
                freqSet[cdd] += 1     # global count
                localSet[cdd] += 1    # local (item, count) pairs, filtered against minSupport below

    for item, count in localSet.items():
        support = float(count) / len(transactionList)
        if support > minSupport:
            _itemSet.add(item)
    return _itemSet
def compile():
    boys = BloomFilter(capacity=703)
    girls = BloomFilter(capacity=1003)
    with open('sample_data/names.csv', 'r') as f:
        reader = csv.reader(f)
        reader.next()
        for row in reader:
            if float(row[2]) < .0005:
                continue
            if row[3].lower() == 'boy':
                boys.add(row[1].lower())
            elif row[3].lower() == 'girl':
                girls.add(row[1].lower())
    with open('blooms/boys', 'w') as f:
        boys.tofile(f)
    with open('blooms/girls', 'w') as f:
        girls.tofile(f)
    print len(boys), len(girls)
def feedBloom(row):
    f = BloomFilter(capacity=200, error_rate=0.6)
    f.add(row.src_ip)
    f.add(row.src_ip[0:5])
    f.add(row.src_ip[5:8])
    f.add(row.target_ip)
    return np.array(f.bitarray.tolist(), dtype=np.int)
def crawl(url, seen=None):
    print("crawling: {0}".format(url))
    if not seen:
        seen = BloomFilter(capacity=50000, error_rate=0.0001)

    with Timeout(5, False):
        try:
            response = requests.get(url)
        except Exception:
            return

    location = domain(url)
    wanted_urls = []
    for url_match in url_regex.finditer(response.text):
        url = url_match.group(0)
        # To not destroy the internet, we only fetch URLs on the same domain.
        if url not in seen and location in domain(url):
            wanted_urls.append(url)
            seen.add(url)

    subtasks = group(crawl.s(url, seen) for url in wanted_urls)
    subtasks()
def test_bloom_string(self):
    f = BloomFilter(capacity=10000, error_rate=0.001)

    for i in xrange(0, f.capacity):
        rnd = "".join(random.choice(string.letters) for i in xrange(40))
        _ = f.add(rnd)

    self.assertEqual(rnd in f, True)

    for i in string.letters:
        self.assertEqual(i in f, False)

    self.assertEqual(rnd in f, True)
def test_bloom_int(self):
    f = BloomFilter(capacity=10000, error_rate=0.001)

    for i in xrange(0, f.capacity):
        _ = f.add(i)

    for i in xrange(0, f.capacity / 2):
        r = random.randint(0, f.capacity - 1)
        self.assertEqual(r in f, True)

    for i in xrange(0, f.capacity / 2):
        r = random.randint(f.capacity, f.capacity * 2)
        self.assertEqual(r in f, False)
def main(capacity=100000, request_error_rate=0.1):
    f = BloomFilter(capacity=capacity, error_rate=request_error_rate)
    assert (capacity == f.capacity)

    start = time.time()
    for i in range(0, f.capacity):
        f.add(i, skip_check=True)
    end = time.time()
    print("{:5.3f} seconds to add to capacity, {:10.2f} entries/second".format(
        end - start, f.capacity / (end - start)))

    one_bits = f.bitarray.count(True)
    # zero_bits = f.bitarray.count(False)
    # print "Number of 1 bits:", one_bits
    # print "Number of 0 bits:", zero_bits
    print("Number of Filter Bits:", f.num_bits)
    print("Number of slices:", f.num_slices)
    print("Bits per slice:", f.bits_per_slice)
    print("------")
    print("Fraction of 1 bits at capacity: {:5.3f}".format(one_bits / float(f.num_bits)))

    # Look for false positives and measure the actual fp rate
    trials = f.capacity
    fp = 0
    start = time.time()
    for i in range(f.capacity, f.capacity + trials + 1):
        if i in f:
            fp += 1
    end = time.time()
    print(("{:5.3f} seconds to check false positives, "
           "{:10.2f} checks/second".format(end - start, trials / (end - start))))

    print("Requested FP rate: {:2.4f}".format(request_error_rate))
    print("Experimental false positive rate: {:2.4f}".format(fp / float(trials)))

    # Compute theoretical fp max (Goel/Gupta)
    k = f.num_slices
    m = f.num_bits
    n = f.capacity
    fp_theory = math.pow((1 - math.exp(-k * (n + 0.5) / (m - 1))), k)
    print("Projected FP rate (Goel/Gupta): {:2.6f}".format(fp_theory))
def crawl(url, seen=None):
    print("crawling: %r" % (url, ))
    if not seen:
        seen = BloomFilter(capacity=50000, error_rate=0.0001)

    with Timeout(5, False):
        try:
            data = urllib2.urlopen(url).read()
        except (urllib2.HTTPError, IOError):
            return

    location = domain(url)
    wanted_urls = []
    for url_match in url_regex.finditer(data):
        url = url_match.group(0)
        # To not destroy the internet, we only fetch URLs on the same domain.
        if url not in seen and location in domain(url):
            wanted_urls.append(url)
            seen.add(url)

    subtasks = TaskSet(crawl.subtask((url, seen)) for url in wanted_urls)
    subtasks.apply_async()
def compile():
    uni_names = BloomFilter(capacity=719)
    name_strings = []
    with open('sample_data/uni_names.out', 'r') as f:
        for line in f:
            m = re.search(r'\| "(.*)"$', line.strip())
            if m:
                name = m.group(1).strip().lower()
                name_strings.append(name)
                uni_names.add(name)
    ngpol_filt = NGPOLFilter(4, name_strings)
    for name in name_strings:
        if name not in ngpol_filt:
            print name
    print ngpol_filt.min_rating
    print ngpol_filt.deviation
    ngpol_filt.clean()
    with open('blooms/uni_names', 'w') as f:
        uni_names.tofile(f)
    with open('ngpols/uni_names', 'w') as f:
        ngpol_filt.tofile(f)
    print len(uni_names)