Example #1
0
def main(argv):
    """Build an MD5 bloom filter from the NSRL CSV database.

    argv -- optional CLI arguments; argv[0], when given, is the desired
            false-positive rate for the filter (defaults to 0.01).

    Reads the module-level ``nsrl_path`` CSV, unhexlifies the MD5 column of
    every row into the filter, and writes the result to ``nsrl.bloom``.
    """
    # BUG FIX: error_rate was only assigned when argv was non-empty, so the
    # print below raised UnboundLocalError otherwise; default it explicitly.
    error_rate = float(argv[0]) if argv else 0.01
    print("[BUILDING] Using error-rate: {}".format(error_rate))
    if os.path.isfile(nsrl_path):
        print("[BUILDING] Reading in NSRL Database")
        with open(nsrl_path) as f_line:
            # Strip off header
            _ = f_line.readline()
            print("[BUILDING] Calculating number of hashes in NSRL...")
            num_lines = sum(bl.count("\n") for bl in blocks(f_line))
            print("[BUILDING] There are %s hashes in the NSRL Database" % num_lines)
        with open(nsrl_path) as f_nsrl:
            # Strip off header
            _ = f_nsrl.readline()
            print("[BUILDING] Creating bloomfilter")
            bf = BloomFilter(num_lines, error_rate)
            print("[BUILDING] Inserting hashes into bloomfilter")
            for line in f_nsrl:
                # MD5 is the second comma-separated, quoted column.
                md5_hash = line.split(",")[1].strip('"')
                if md5_hash:
                    try:
                        md5 = binascii.unhexlify(md5_hash)
                        bf.add(md5)
                    except Exception as e:
                        print("[ERROR] %s" % e)
            print("[BUILDING] NSRL bloomfilter contains {} items.".format(
                len(bf)))
            with open('nsrl.bloom', 'wb') as nb:
                bf.tofile(nb)
            print("[BUILDING] Complete")
    else:
        # BUG FIX: print() does not %-interpolate like logging; format it.
        print("[ERROR] No such file or directory: %s" % nsrl_path)

    return
Example #2
0
def build_bloom_filter_and_iblt(m, include_value_in_iblt=False):
    """Build a bloom filter and an IBLT over the module-level selected_txs.

    m -- size estimate of the remote set, used for filter FPR sizing.
    include_value_in_iblt -- when True, store the serialized transaction as
        the IBLT value; otherwise an empty string.
    Returns a (BloomFilter, IBLT) pair keyed by tx['hash'].
    """
    # Sizing constants: c = 8 * ln(2)^2, tau = 16.5.
    c = 8 * math.pow(math.log(2), 2)
    tau = 16.5
    n = len(selected_txs)
    alpha = n / (c * tau)
    # print(alpha * tau)

    if m <= n:
        fpr = 0.1
    else:
        fpr = alpha / m - n  # NOTE(review): likely meant alpha / (m - n) — verify
    print("Mempool difference", abs(m - n))
    # 4/3 * |m - n| cells plus fixed slack for the IBLT.
    n_cells = int((4 / 3) * abs(m - n)) + 30
    print('n_cells', n_cells)
    logging.info("Calculated FPR: %f" % fpr)
    fpr = 0.1  # NOTE(review): unconditionally overrides the FPR computed above
    b = BloomFilter(capacity=n, error_rate=fpr)
    i = IBLT(m=n_cells, k=3, key_size=32, value_size=0)
    for tx in selected_txs:
        b.add(tx['hash'])
        v = ''
        if include_value_in_iblt:
            v = tx_to_bytes(tx)
        i.insert(tx['hash'], v)
    return b, i
Example #3
0
def main():
    """Build an NSRL bloom filter of MD5 strings and write it to nsrl.bloom.

    Relies on module globals: nsrl_path (input CSV), error_rate (filter
    false-positive rate) and blocks() (chunked reader).
    """
    if os.path.isfile(nsrl_path):
        print("BUILDING: Reading in NSRL Database")
        with open(nsrl_path) as f_line:
            # Strip off header
            _ = f_line.readline()
            print("BUILDING: Calculating number of hashes in NSRL...")
            num_lines = sum(bl.count("\n") for bl in blocks(f_line))
            print("BUILDING: There are %s hashes in the NSRL Database" % num_lines)
        with open(nsrl_path) as f_nsrl:
            # Strip off header
            _ = f_nsrl.readline()
            print("BUILDING: Creating bloomfilter")
            bf = BloomFilter(num_lines, error_rate)
            print("BUILDING: Inserting hashes into bloomfilter")
            for line in f_nsrl:
                # MD5 is the second comma-separated, quoted column; stored
                # as the hex string itself (not unhexlified).
                md5_hash = line.split(",")[1].strip('"')
                if md5_hash:
                    try:
                        bf.add(md5_hash)
                    except Exception as e:
                        print("ERROR: %s" % e)
            print("BUILDING: NSRL bloomfilter contains {} items.".format(len(bf)))
            with open('nsrl.bloom', 'wb') as nb:
                bf.tofile(nb)
            print("BUILDING: Complete")
    else:
        # BUG FIX: print() does not %-interpolate its arguments.
        print("ERROR: No such file or directory: %s" % nsrl_path)

    return
Example #4
0
class UrlSpider(CrawlSpider):
    """Crawl a site given via `-a url=...`, yielding one item per new URL,
    de-duplicated with a bloom filter. Python 2 code (print statement)."""
    name = "urlspider"
    allowed_domains = ["tianya.cn"]
    start_urls = ("http://www.hao123.com", )
    rules = (
            Rule(SgmlLinkExtractor(allow=()), callback="parse_resp", follow= True),
            )

    def __init__(self, *args, **kwargs):
        # run using: scrapy crawl xss_spider -a url='http://example.com'
        super(UrlSpider, self).__init__(*args, **kwargs)
        self.start_urls = [kwargs.get('url')]
        hostname = urlparse(self.start_urls[0]).hostname
        self.allowed_domains = [hostname] # adding [] around the value seems to allow it to crawl subdomain of value
        # Probabilistic de-dup set: capacity 3M, 0.01% false-positive rate.
        self.fingerprints = BloomFilter(3000000, 0.0001)

    def parse_start_url(self, response):
        print "start:"+response.url
        return

    def parse_resp(self, response):
        # Skip URLs whose fingerprint (via obtain_key) was already seen;
        # bloom false positives may rarely drop an unseen URL.
        fp = response.url
        new_fp = obtain_key(fp)
        if new_fp in self.fingerprints:
            return
        self.fingerprints.add(new_fp)

        item = SiteCrawlItem()
        item["url"] = response.url
        yield item
Example #5
0
def main(argv):
    """Build an NSRL SHA-1 bloom filter and write it to nsrl.bloom.

    argv -- optional CLI args; argv[0], when present, is the desired
            false-positive rate (defaults to 0.01).
    """
    # BUG FIX: error_rate was only bound when argv was non-empty, raising
    # UnboundLocalError on the print below otherwise; default it explicitly.
    error_rate = float(argv[0]) if argv else 0.01
    print("[BUILDING] Using error-rate: {}".format(error_rate))
    if os.path.isfile(nsrl_path):
        print("[BUILDING] Reading in NSRL Database")
        with open(nsrl_path) as f_line:
            # Strip off header
            _ = f_line.readline()
            print("[BUILDING] Calculating number of hashes in NSRL...")
            num_lines = sum(bl.count("\n") for bl in blocks(f_line))
            print("[BUILDING] There are %s hashes in the NSRL Database" % num_lines)
        with open(nsrl_path) as f_nsrl:
            # Strip off header
            _ = f_nsrl.readline()
            print("[BUILDING] Creating bloomfilter")
            bf = BloomFilter(num_lines, error_rate)
            print("[BUILDING] Inserting hashes into bloomfilter")
            for line in f_nsrl:
                # SHA-1 is the first comma-separated, quoted column.
                sha1_hash = line.split(",")[0].strip('"')
                if sha1_hash:
                    try:
                        sha1 = binascii.unhexlify(sha1_hash)
                        bf.add(sha1)
                    except Exception as e:
                        print("[ERROR] %s" % e)
            print("[BUILDING] NSRL bloomfilter contains {} items.".format(len(bf)))
            with open('nsrl.bloom', 'wb') as nb:
                bf.tofile(nb)
            print("[BUILDING] Complete")
    else:
        # BUG FIX: print() does not %-interpolate its arguments.
        print("[ERROR] No such file or directory: %s" % nsrl_path)

    return
Example #6
0
class BloomCheckPipeline(object):
    """Scrapy pipeline that drops items whose urlToken or id was already seen.

    The bloom filter is loaded from ``bloomfilter.blm`` when the spider opens
    (if that file exists) and persisted back when the spider closes.
    """

    def __init__(self):
        # BUG FIX: the original defined __int__ (typo for __init__), so its
        # body never ran. The filter is created lazily in open_spider().
        self.bf = None

    def open_spider(self, spider):
        file_name = 'bloomfilter'
        is_exist = os.path.exists(file_name + '.blm')
        if is_exist:
            # Close the handle deterministically instead of leaking it;
            # fromfile/tofile stream raw bytes, hence binary mode.
            with open('bloomfilter.blm', 'rb') as fp:
                self.bf = BloomFilter.fromfile(fp)
            print('open blm file success')
        else:
            self.bf = BloomFilter(100000, 0.001)
            print('didn\'t find the blm file')

    def process_item(self, item, spider):
        # Drop items whose urlToken or id has been seen before.
        # BUG FIX: original tested `item['urlToken'] or item['id'] in self.bf`,
        # which checks the truthiness of urlToken, not filter membership.
        if item['urlToken'] in self.bf or item['id'] in self.bf:
            print('drop one item for exist')
            raise DropItem('drop an item for exist')
        else:
            self.bf.add(item['urlToken'])
            print('add one success')
            return item

    def close_spider(self, spider):
        # Persist the filter so duplicates survive across runs.
        with open('bloomfilter.blm', 'wb') as fp:
            self.bf.tofile(fp)
Example #7
0
def user_init():
    """Load Jianshu user ids (one per line) into a bloom filter and return it."""
    users = BloomFilter(10000000, 0.001)
    # BUG FIX: close the file deterministically; the original leaked the
    # handle. Also dropped the unused `import re`.
    with open(u"D:/工作/数据美化/data/简书用户id1.txt") as f:
        for line in f:
            users.add(line.strip())
    return users
Example #8
0
    def add(self, key):
        """Insert *key* into the scalable filter.

        Returns True when the key was already present, False otherwise.

        >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \
                                    mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        >>> b.add("hello")
        False
        >>> b.add("hello")
        True

        """
        if key in self:
            return True
        if self.filters:
            current = self.filters[-1]
            if current.count >= current.capacity:
                # Current stage is full: grow capacity geometrically while
                # tightening the per-stage error rate.
                current = BloomFilter(capacity=current.capacity * self.scale,
                                      error_rate=current.error_rate * self.ratio)
                self.filters.append(current)
        else:
            # First insertion: create the initial stage.
            current = BloomFilter(capacity=self.initial_capacity,
                                  error_rate=self.error_rate * (1.0 - self.ratio))
            self.filters.append(current)
        current.add(key, skip_check=True)
        return False
Example #9
0
class BLOOMDupeFilter(BaseDupeFilter):
    """Request fingerprint duplicates filter backed by a bloom filter."""

    def __init__(self, path=None):
        self.file = None
        # capacity
        #     this BloomFilter must be able to store at least *capacity* elements
        #     while maintaining no more than *error_rate* chance of false
        #     positives
        # error_rate
        #     the error_rate of the filter returning false positives. This
        #     determines the filters capacity. Inserting more than capacity
        #     elements greatly increases the chance of false positives.
        self.fingerprints = BloomFilter(capacity=2000000, error_rate=0.00001)
        # Seed the filter with every URL already stored in the database.
        # BUG FIX: the original used a list comprehension purely for its
        # side effects, which built a throwaway list.
        db = DynamoDBPipeline()
        for url in db.get_url_list():
            self.fingerprints.add(url)

    @classmethod
    def from_settings(cls, settings):
        return cls(job_dir(settings))

    def request_seen(self, request):
        # True when the URL was (probably) seen; records it otherwise.
        fp = request.url
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)

    def close(self, reason):
        # Release the (potentially large) filter.
        self.fingerprints = None
Example #10
0
    def add(self, key):
        """Add *key*; report whether it was already present.

        Returns True if the key already existed, else False.

        >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \
                                    mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        >>> b.add("hello")
        False
        >>> b.add("hello")
        True

        """
        if key in self:
            return True
        # A new stage is needed when there are no stages yet or the last
        # stage reached its capacity.
        needs_new_stage = (not self.filters
                           or self.filters[-1].count >= self.filters[-1].capacity)
        if needs_new_stage:
            if self.filters:
                last = self.filters[-1]
                stage = BloomFilter(capacity=last.capacity * self.scale,
                                    error_rate=last.error_rate * self.ratio)
            else:
                stage = BloomFilter(capacity=self.initial_capacity,
                                    error_rate=self.error_rate * (1.0 - self.ratio))
            self.filters.append(stage)
        else:
            stage = self.filters[-1]
        stage.add(key, skip_check=True)
        return False
Example #11
0
	def determine_lookup_speed_threshold(self):
		"""Empirically find the element count at which binsearch_count beats
		linear_scan_count, using a scratch bloom filter (Python 2: xrange)."""
		from time import time
		#do each one 5 times
		bf = BloomFilter(capacity=self.bloom_size, error_rate=self.bloom_error)
		count = 1
		repetitions = 5
		
		# Temporarily swap in the scratch filter so the timed methods use it.
		self_bf_holder = self.bf
		self.bf = bf
		while True:
			bf.add('andrew_' + str(count))
			bin_faster_count = 0
			for j in xrange(repetitions):
				#Linear scan
				t1 = time()
				self.linear_scan_count('andrew')
				t2 = time()
				linear_time = t2-t1
			
				t1 = time()
				self.binsearch_count('andrew')
				t2 = time()
				bin_time = t2-t1
			
				bin_faster_count += int(bin_time < linear_time)
		
			# Stop once binary search wins at least 75% of the repetitions.
			if 1.*bin_faster_count / repetitions >= 0.75:
				del bf
				# Restore the original filter before returning.
				self.bf = self_bf_holder
				return count
			
			count += 1
			
Example #12
0
class UrlBloom:
    '''BloomFilter: check elements repetition'''

    def __init__(self, _capacity=1000000, _error_rate=0.00001):
        self.is_full = False
        # determine if open backup bloom data by time
        if CONFIG.get('BACKUP', 0) == 1:
            self.bomb = TimeBomb(CONFIG['TMP_DIR'] + CONFIG['BLOOM_FILE'])
            restored = self.bomb.load()
            if restored is None:
                restored = BloomFilter(capacity=_capacity, error_rate=_error_rate)
            self.filter = restored
            self.bomb.dump(self.filter)
        else:
            self.filter = BloomFilter(capacity=_capacity, error_rate=_error_rate)

    def add(self, links):
        """Insert every element of *links*; mark the filter full on overflow."""
        if self.is_full:
            return
        try:
            for element in links:
                self.filter.add(element)
        except IndexError:
            # rasie IndexError when bloom is at capacity
            self.is_full = True

    def clean(self, links):
        """Return the subset of *links* not yet present in the filter."""
        return [element for element in links if element not in self.filter]
Example #13
0
class Filter(object):
    """URL de-dup filter with an on-disk cache, portable across OSes.

    On Windows (or when no cache file is given) it uses the pure-Python
    pybloom and serialises the filter to *cachefile*; on POSIX it uses the
    mmap-backed pybloomfilter, which persists itself.
    """

    def __init__(self, cachefile, capacity=1000000, error_rate=0.001):
        self.cachefile = cachefile
        if os.name == 'nt' or not cachefile:
            from pybloom import BloomFilter
            if self.cache():
                # BUG FIX: tofile/fromfile stream raw bytes, so the cache
                # file must be opened in binary mode.
                with open(cachefile, 'rb') as fp:
                    self.filter = BloomFilter.fromfile(fp)
            else:
                self.filter = BloomFilter(capacity=capacity,
                                          error_rate=error_rate)
        elif os.name == 'posix':
            from pybloomfilter import BloomFilter
            if self.cache():
                self.filter = BloomFilter.open(self.cachefile)
            else:
                self.filter = BloomFilter(capacity, error_rate, cachefile)

    def __contains__(self, key):
        return key in self.filter

    def add(self, obj):
        self.filter.add(obj)
        if os.name == 'nt':
            # BUG FIX: binary mode for the serialized filter (see __init__).
            with open(self.cachefile, 'wb') as fp:
                self.filter.tofile(fp)

    def cache(self):
        """Whether the cache file already exists on disk."""
        return os.path.exists(self.cachefile or '')
Example #14
0
def generateBloomFilter(file):
    """Generate a bloom filter of the first whitespace-separated field of
    each line in *file*, and return it."""
    # this probably isnt enough, need to look the data formatting over more
    # thoroughly
    d = BloomFilter(1000, 0.001)
    for line in file:
        # BUG FIX: str.split(1) raises TypeError (first argument is the
        # separator); split on whitespace with maxsplit=1 and keep field 0.
        d.add(line.split(None, 1)[0])
    # BUG FIX: the original built the filter and then discarded it.
    return d
Example #15
0
def user_init():
    """Load Jianshu user ids (one per line) into a bloom filter and return it."""
    users = BloomFilter(10000000, 0.001)
    # BUG FIX: close the file deterministically; the original leaked the
    # handle. Also dropped the unused `import re`.
    with open(u"D:/工作/数据美化/data/简书用户id1.txt") as f:
        for line in f:
            users.add(line.strip())
    return users
Example #16
0
class UrlManager(object):
    """FIFO queue of pending URLs plus a bloom filter of viewed ones."""

    def __init__(self):
        self.urls = []
        self.url_bloom_filter = BloomFilter(capacity=500000, error_rate=0.001)

    def add_url(self, url):
        # Duplicates are currently allowed; the bloom filter is only
        # consulted through is_viewed()/add_viewed().
        self.urls.append(url)

    def add_urls(self, urls):
        for each in urls:
            self.add_url(each)

    def is_empty(self):
        """True when no URLs are queued."""
        return not self.urls

    def get_url(self):
        """Pop and return the oldest queued URL."""
        return self.urls.pop(0)

    def get_len(self):
        return len(self.urls)

    def is_viewed(self, url):
        """Probabilistic check: has *url* been marked viewed?"""
        return url in self.url_bloom_filter

    def add_viewed(self, url):
        self.url_bloom_filter.add(url)
Example #17
0
def returnItemsWithMinSupportV3(itemSet, lenItem, transactionList, minSupport,
                                freqSet):
    """Return the subset of *itemSet* whose support exceeds *minSupport*.

    Candidates are pre-screened with a bloom filter before counting their
    occurrences (as lenItem-sized combinations) across *transactionList*;
    *freqSet* accumulates global counts as a side effect.
    """
    _itemSet = set()
    localSet = defaultdict(int)
    if len(itemSet):
        filterCdd = BloomFilter(capacity=len(itemSet), error_rate=0.0001)
    else:
        print("As I say, ValueError: Capacity must be > 0")
        return set([])
    print("Store cdds in BF ... - %s" % getTime())
    for val in itemSet:
        pass  # TODO: use a counting BF so candidates reaching minSup*len(transactionList) are not inserted; or, without one, skip keys already present in the BF.
        filterCdd.add(val)
    print("Mapping cddFromTrans on BF ... - %s" % getTime())
    for trans in transactionList:
        for cdd in combinations(trans, lenItem):
            cdd = frozenset(cdd)
            if cdd in filterCdd:
                freqSet[cdd] += 1  # global count for this candidate
                localSet[cdd] += 1  # local (item, count); entries below minSupport are filtered out next
    print("Filter cdds that less than minSup. - %s" % getTime())
    for item, count in localSet.items():
        support = float(count) / len(transactionList)
        if support > minSupport:
            _itemSet.add(item)

    return _itemSet
	def vacuum_all(self, limit=None, time_limit=None, unupdated=False):
		"""Vacuum SupplierCatalogItemField rows, one plugin at a time.

		For each plugin: collect the field ids present in its version model
		into a bloom filter, then delete SCIField rows absent from the
		filter (or stamp survivors with a `vacuumed` time). Honors optional
		row and wall-clock limits. Python 2 code (dict.itervalues).
		"""
		logger.debug('Begin vacuum_all(limit=%s, time_limit=%s, unupdated=%s)', limit, time_limit, unupdated)
		##TODO delete SCIFields with SCFilterId not found in SCFilter

		self.plugins = self.load_plugins()
		self.ts = self.term_stat('SupplierCatalogItemFields Vacuum', len(self.plugins))
		now = start_time = datetime.now()
		try:
			transaction.begin()
			for plug in self.plugins.itervalues():
				supplier_catalog_filter_id = plug.supplier_catalog_filter_id()
				
				### Generate a bloom filter set of SCIF id's in VersionModel
				model_name = plug.version_model()  + 'Model'
				VersionModel = getattr(model, model_name)
				query = DBSession.query(VersionModel.supplier_catalog_item_field_id)
				s = BloomFilter(capacity=query.count() + 1)
				self.ts['sub_total'] = query.count()
				for (supplier_catalog_item_field_id, )  in query.yield_per(100):
					s.add(supplier_catalog_item_field_id)
					self.ts['sub_done'] += 1
				del query
				
				### Iterate through SCIFields, deleting any that don't appear in the bloom filter.
				# NOTE(review): a bloom false positive here keeps a stale row
				# (marks it vacuumed) rather than deleting it — deletion-safe.
				query = DBSession.query(SupplierCatalogItemFieldModel)
				query = query.filter(SupplierCatalogItemFieldModel.supplier_catalog_filter_id == supplier_catalog_filter_id)
				if unupdated is not True:
					query = query.filter(SupplierCatalogItemFieldModel.updated != None)
				
				if limit is not None:
					query = query.order_by(SupplierCatalogItemFieldModel.vacuumed.nullsfirst())
					query = query.limit(limit)
					logger.debug("LIMIT %i, supplier_catalog_filter_id %s", limit, supplier_catalog_filter_id)
				self.ts['sub_done'] = 0
				self.ts['sub_total'] = query.count()
				for supplier_catalog_item_field in query.yield_per(100):
					if supplier_catalog_item_field.id not in s:
						logger.debug("Deleting SupplierCatalogItemField %s", supplier_catalog_item_field.id)
						DBSession.delete(supplier_catalog_item_field)
					else:
						supplier_catalog_item_field.vacuumed = now
					if self.ts['sub_done'] % 1000 == 0:
						DBSession.flush()
					self.ts['sub_done'] += 1
				del query
				DBSession.flush()
				if time_limit is not None:
					if datetime.now() > start_time + time_limit:
						logger.info("Reached Time Limit at %i of %i", self.ts['done'], self.ts['total'])
						transaction.commit()
						break;
				self.ts['done'] += 1
			transaction.commit()
		except Exception:
			logger.exception("Caught Exception: ")
			transaction.abort()
		finally:
			self.ts.finish()
		logger.debug('End vacuum()')
Example #19
0
 def get_bloom(self):
     """Build and return a bloom filter preloaded with every URL in user_tbl."""
     bf = BloomFilter(capacity=10000000, error_rate=0.00001)
     sql = "select url from user_tbl"
     self.cursor.execute(sql)
     for row in self.cursor.fetchall():
         bf.add(row[0])
     return bf
Example #20
0
def checkCinT(currentCSet, transactionList, minSupport, freqSet):
    """Insert every transaction into a bloom filter and return freqSet.

    NOTE: the candidate-checking loop below is still a stub (pass).
    """
    filterTrans = BloomFilter(capacity=len(transactionList), error_rate=0.001)
    for val in transactionList:
        filterTrans.add(val)
    # BUG FIX: Python 2 print statement -> print() function.
    print(filterTrans.count)
    for cdd in currentCSet:
        pass
    return freqSet
Example #21
0
class DownloadCache(object):
    """Membership cache of downloaded URLs backed by a bloom filter."""

    def __init__(self, capacity, error_rate):
        self.cache = BloomFilter(capacity=capacity, error_rate=error_rate)

    def add(self, url):
        """Record *url* as downloaded."""
        self.cache.add(url)

    def __contains__(self, item):
        """True when *item* was (probably) downloaded before."""
        return item in self.cache
Example #22
0
def filterCdd(currentCSet, transactionList, minSupport, freqSet):
    """Count occurrences of 4-item candidate combinations in freqSet.

    Candidates from *currentCSet* are loaded into a bloom filter; each
    4-combination of every transaction that hits the filter bumps its count.
    Returns the updated freqSet.
    """
    # Renamed the local so it no longer shadows this function's own name.
    candidates = BloomFilter(capacity=len(currentCSet), error_rate=0.0001)
    for candidate in currentCSet:
        candidates.add(candidate)
    for transaction in transactionList:
        for combo in combinations(transaction, 4):
            if combo in candidates:
                freqSet[combo] += 1
    return freqSet
Example #23
0
class BlogSpider(Spider):  
    """Crawl CSDN user profile pages, following follow/follower links.

    A bloom filter (self.f) de-duplicates visited profile URLs; scraped
    data is emitted as CsdnusersspyderItem. Python 2 code (print statements).
    """
    def __init__(self):
        self.pageNumber =0
        self.logfile = open("/home/hduser/Logs/csdnUserlog.log","w")
        # ~10M URLs at 0.01% false-positive rate.
        self.f = BloomFilter(capacity=10000000, error_rate=0.0001)
    name = "csdnUserScrapy"  
    # slow down the crawl speed (original comment said 2s; the value is 0.5)
    download_delay = 0.5 
    allowed_domains = ["my.csdn.net"]  
    start_urls = [ 
    "http://my.csdn.net/jiazhijun","http://my.csdn.net/sodino","http://my.csdn.net/bill_man","http://my.csdn.net/lhc2207221755","http://my.csdn.net/xgbing","http://my.csdn.net/LoongEmbedded","http://my.csdn.net/jdh99","http://my.csdn.net/zqiang_55","http://my.csdn.net/zhao_zepeng","http://my.csdn.net/linyt","http://my.csdn.net/kmyhy","http://my.csdn.net/lincyang","http://my.csdn.net/jdsjlzx","http://my.csdn.net/u011012932","http://my.csdn.net/yayun0516","http://my.csdn.net/qq_23547831","http://my.csdn.net/CHENYUFENG1991","http://my.csdn.net/qq_26787115","http://my.csdn.net/kongki","http://my.csdn.net/you23hai45","http://my.csdn.net/cometwo","http://my.csdn.net/yuanziok","http://my.csdn.net/woxueliuyun","http://my.csdn.net/gatieme","http://my.csdn.net/u010850027","http://my.csdn.net/yinwenjie","http://my.csdn.net/teamlet","http://my.csdn.net/wangyangzhizhou","http://my.csdn.net/xiaoxian8023","http://my.csdn.net/ooppookid","http://my.csdn.net/wsl211511","http://my.csdn.net/liyuanbhu","http://my.csdn.net/sxhelijian","http://my.csdn.net/raylee2007","http://my.csdn.net/luozhuang","http://my.csdn.net/shaqoneal","http://my.csdn.net/dc_726","http://my.csdn.net/tobacco5648","http://my.csdn.net/wowkk","http://my.csdn.net/csfreebird","http://my.csdn.net/xukai871105","http://my.csdn.net/tuzongxun","http://my.csdn.net/mchdba","http://my.csdn.net/lichangzai","http://my.csdn.net/leftfist","http://my.csdn.net/wonder4","http://my.csdn.net/fogyisland2000","http://my.csdn.net/smstong","http://my.csdn.net/david_520042","http://my.csdn.net/ghostbear","http://my.csdn.net/xuyaqun","http://my.csdn.net/force_eagle","http://my.csdn.net/Jmilk","http://my.csdn.net/xiangpingli","http://my.csdn.net/quqi99","http://my.csdn.net/michaelzhou224","http://my.csdn.net/zzq900503","http://my.csdn.net/pipisorry","http://my.csdn.net/zhangmike","http://my.csdn.net/foruok","http://my.csdn.net/fengbingchun","http://my.csdn.net/qingrun","http://my.csdn.net/harrymeng","http://my.csdn.net/pukuimin1226","http://m
y.csdn.net/lihuoming","http://my.csdn.net/zhazha1980518","http://my.csdn.net/redarmy_chen","http://my.csdn.net/yuanmeng001","http://my.csdn.net/yeka","http://my.csdn.net/xieqq","http://my.csdn.net/zhangxiaoxiang","http://my.csdn.net/oiio","http://my.csdn.net/jobchanceleo","http://my.csdn.net/broadview2006"
    ]  
    

    def parse(self, response):  
        sel = Selector(response)  
        item = CsdnusersspyderItem()
        print "response URL %s\n" % str(response.url)
        self.f.add(str(response.url))
        #print "*********\nBloom added self.url: %s \n**********\n" % str(response.url)
        item["userName"] = str(response.url).split('/')[-1]
        relativeMarks =response.xpath("//div[@class='header clearfix']/a[@href]").extract()
        item["follow"] = []
        item["befollowed"] = []
        # The first 8 anchors go into "follow", the rest into "befollowed"
        # (see the i <= 7 branch below).
        i = 0
        for u in relativeMarks:
            unameMark = re.findall(r'username="******"',u)
            (s,e) = re.search(r'".*"',unameMark[0]).span()
            uname = unameMark[0][s+1:e-1]
            if i <= 7:
                item["follow"].append(uname.encode('utf-8'))
            else:
                item["befollowed"].append(uname.encode('utf-8'))
            newUrl = "http://my.csdn.net/"+uname
            # Only enqueue profile URLs not already in the bloom filter.
            if newUrl in self.f:
                self.logfile.write("Duplicated URL: %s\n" % newUrl)
                pass
            else:
                #self.logfile.write("wei chong fu %s\n" % newUrl)
                yield Request(newUrl,callback=self.parse)
                i += 1
        item["pageUrl"] = str(response.url)
        focusNumMark = response.xpath("//dd[@class='focus_num']").extract()[0]
        (s ,e) = re.search(r'\d+',focusNumMark).span()
        focusNum = focusNumMark[s:e].encode('utf-8')
        item["followNum"] = focusNum

        fansNumMark = response.xpath("//dd[@class='fans_num']").extract()[0]
        (s ,e) = re.search(r'\d+',fansNumMark).span()
        fansNum = fansNumMark[s:e].encode('utf-8')
        item["befollowedNum"] = fansNum
        item["pageID"] = self.pageNumber
        item["pageMD5"] =GetMD5.getMD5(item["pageUrl"])
        yield item
        self.pageNumber = self.pageNumber +1
        # Throttle: pause 15s every 1000 pages.
        if self.pageNumber % 1000 == 0:
            time.sleep(15)
Example #24
0
class BloomZip(object):
    """Write-once container pairing a bloom filter header with gzipped data.

    File layout: 4-byte big-endian filter length, the serialized filter,
    then a gzip stream of the written payload.
    NOTE(review): Python 2-era I/O — several handles below are opened in
    text mode ('w') while bytes are written; on Python 3 they need 'wb'.
    """
    def __init__(self, name):
        super(BloomZip, self).__init__()
        self.__data = StringIO()
        self._name = name
        self._bf = None

        # If the file already exists, load just the bloom filter header.
        if os.path.isfile(self._name):
            with open(self._name, 'rb') as f:
                length = struct.unpack(">L", f.read(4))[0]
                self._bf = BloomFilter.fromfile(f, length)

    def contains(self, word):
        # Probabilistic membership test (false positives possible).
        return word in self._bf

    def write(self, data):
        # Buffer payload in memory until close().
        self.__data.write(data)

    def close(self):
        """Build the filter over the buffered words and persist everything."""
        if self._bf is None and self.__data is None:
            return

        words = self.__data.getvalue().split()

        self._bf = BloomFilter(capacity=len(words) + 1)

        for word in words:
            self._bf.add(word, skip_check=True)

        # Serialize once to a temp file just to learn the filter's byte size.
        def get_bl_size():
            t = tempfile.NamedTemporaryFile().name
            with open(t, 'w') as fn:
                self._bf.tofile(fn)
            s = os.path.getsize(t)
            os.remove(t)
            return s

        # Never overwrite an existing container.
        if os.path.isfile(self._name):
            return

        a = open(self._name, 'w')
        a.write(struct.pack(">L", get_bl_size()))
        self._bf.tofile(a)
        with GzipFile(self._name, 'w', fileobj=a) as f:
            f.write(self.__data.getvalue())
        a.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        if exc_type is not None:
            print(exc_tb)
            raise exc_val
Example #25
0
 def test_union():
     """Union of two half-alphabet filters must contain the whole alphabet."""
     bloom_one = BloomFilter(100, 0.001)
     bloom_two = BloomFilter(100, 0.001)
     chars = [chr(i) for i in range(97, 123)]
     # BUG FIX: len(chars) / 2 is a float on Python 3 and floats are invalid
     # slice indices; use integer division.
     for char in chars[len(chars) // 2:]:
         bloom_one.add(char)
     for char in chars[:len(chars) // 2]:
         bloom_two.add(char)
     new_bloom = bloom_one.union(bloom_two)
     for char in chars:
         assert_(char in new_bloom)
Example #26
0
def spider_new_Init(spidername,
                    dbname,
                    website,
                    carnum,
                    urltag='url',
                    keycol='url'):
    """Build (or restore) a per-spider bloom filter of seen item keys.

    When 'blm/<dbname>/<spidername>.blm' exists, the filter is reloaded from
    its md5-per-line contents; otherwise it is seeded from MongoDB and the
    file is written. urltag='url' hashes the key column; urltag='num' hashes
    the integer range 1..max(id found in url). Returns the BloomFilter.
    NOTE(review): Python 2 code — md5() is fed str directly.
    """
    #Mongo setting
    # spider_original_Init(dbname, website, carnum)
    # Mongo con
    connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                     settings['MONGODB_PORT'])
    dbdata = connection[dbname]
    collectiondata = dbdata[website]

    # bloom file
    filename = 'blm/' + dbname + '/' + spidername + ".blm"
    # pybloom: capacity = 110% of (expected new + already stored) documents
    num = int((int(carnum) + collectiondata.count()) * 1.1)
    df = BloomFilter(capacity=num, error_rate=0.001)
    # read
    isexists = os.path.exists(filename)
    itemmax = 0
    if isexists:
        fr = open(filename, "r")
        lines = fr.readlines()
        for line in lines:
            line = line.strip('\n')
            df.add(line)
        fr.close()
    else:
        fa = open(filename, "w")
        for i in collectiondata.find():
            if keycol in i.keys():
                if urltag == 'url':
                    item = i[keycol]
                    itemmd5 = md5(item).hexdigest()
                    # df.add returns truthy when the key was already present;
                    # only write fresh keys to the backup file.
                    returndf = df.add(itemmd5)
                    if not (returndf):
                        fa.writelines(itemmd5 + '\n')
                else:
                    # Track the largest numeric id embedded in the url.
                    item = re.findall('\d+', i["url"])
                    item = int(item[len(item) - 1])
                    if item > itemmax:
                        itemmax = item
        if urltag == 'num':
            for item in range(1, itemmax + 1):
                item = str(item)
                itemmd5 = md5(item).hexdigest()
                returndf = df.add(itemmd5)
                if not (returndf):
                    fa.writelines(itemmd5 + '\n')
        fa.close()
    connection.close()
    return df
 def test_union(self):
     """union() of two half-alphabet filters must contain every letter."""
     bloom_one = BloomFilter(100, 0.001)
     bloom_two = BloomFilter(100, 0.001)
     chars = [chr(i) for i in range(97, 123)]
     # BUG FIX: integer division — len(chars)/2 is a float slice index on
     # Python 3.
     for char in chars[len(chars) // 2:]:
         bloom_one.add(char)
     for char in chars[:len(chars) // 2]:
         bloom_two.add(char)
     new_bloom = bloom_one.union(bloom_two)
     for char in chars:
         self.assert_(char in new_bloom)
Example #28
0
def main():
    """Smoke-test BloomFilter membership against generated test data."""
    uncompressedList, queryList = generateTestData(20, 1000, 100000, 10000000)
    print(len(uncompressedList), len(queryList))
    # Insert 0..9 and probe one present and one absent value.
    bloom = BloomFilter(capacity=1000, error_rate=0.001)
    for value in range(10):
        bloom.add(value)
    print(10 in bloom)
    print(5 in bloom)
Example #29
0
def to_bloom(filename):
    """Serialize the lines of *filename* into a '<filename>.bloom' filter."""
    with open(filename, 'r') as f:
        b = BloomFilter(capacity=1000000, error_rate=0.001)

        # NOTE: lines from iteration keep their trailing newline, so the
        # != "" guard only skips a possible empty tail.
        for line in f:
            if line != "":
                b.add(line)

        new_filename = filename + ".bloom"
        # BUG FIX: close the output handle deterministically; it leaked.
        with open(new_filename, 'wb') as out_f:
            b.tofile(out_f)
Example #30
0
 def test_union(self):
     """Union of two disjoint half-alphabet filters covers the alphabet."""
     first = BloomFilter(100, 0.001)
     second = BloomFilter(100, 0.001)
     letters = [chr(code) for code in range_fn(97, 123)]
     half = int(len(letters) / 2)
     for letter in letters[half:]:
         first.add(letter)
     for letter in letters[:half]:
         second.add(letter)
     merged = first.union(second)
     for letter in letters:
         self.assertTrue(letter in merged)
Example #31
0
class product_spider_object_type_xml(CrawlSpider):
    """Scrapy spider for Lazada sitemap XML, configured from a YAML template.

    XML responses yield further Requests (de-duplicated with a bloom
    filter); other responses yield extracted product items.
    """
    # Default Data should be config in spiders
    name = "Product_Spider_Lazada"
    allowed_domains = []
    start_urls = []
    # rules = (
    # )

    # My Extra DATA
    data = []
    name_data = ''
    source = ''


     # Init Spider
    def __init__(self, *arg, **karg):
        self.init_yaml('scrapy_service/templates/product.yaml','lazada_sitemap')
        CrawlSpider.__init__(self, *arg)

    # Load information form YAML file
    def init_yaml(self, path_to_file, name_data):
        # SECURITY NOTE(review): yaml `load` without SafeLoader can execute
        # arbitrary constructors if the template file is untrusted.
        document = open(path_to_file, 'r')
        self.data = load(document)
        self.name_data = name_data
        self.source = self.data[self.name_data]['database']['name']
        document.close()
        
        self.allowed_domains = self.data[self.name_data]['allowed_domains']
        self.start_urls = self.data[self.name_data]['start_urls']
       
        # Get Links by Rule. This can be NULL
        temp_rule = []
        for rule in self.data[self.name_data]['pattern']:
            temp_rule.append(Rule(LinkExtractor(allow=(rule, )), callback='parse'))
        self.rules = set(temp_rule)
        # ~2M URLs at 0.001% false-positive rate.
        self.crawled_links = BloomFilter(2000000,0.00001)

    def parse(self, response):
        xpath_selector = HtmlXPathSelector(response)
        
        # Check to parse more links
        if response.headers.get('Content-Type',False) and 'xml' in response.headers['Content-Type']:
            extra_links = HtmlParser.extract_new_link_with_xpath(self.data[self.name_data], xpath_selector)
            for link in extra_links:
                current_link = link if 'http' in link else self.start_urls[0]+ link
                if current_link not in self.crawled_links:
                    self.crawled_links.add(current_link)
                    yield Request(current_link, callback=self.parse)
        else:
            ### Get ALL Items which existing in the current link 
            items = HtmlParser.extract_product_with_xpath(self.data[self.name_data], xpath_selector, self.source)
            for item in items:
                yield item
Example #32
0
def generate_write_bloomfilter(dir_name, capacity=1000000, error_rate=0.01):
    """Build a BloomFilter of the url suffixes found in every data file
    under *dir_name* (first delimited field of each line)."""
    result = BloomFilter(capacity, error_rate)
    directory = zhihu_util.get_data_directory(dir_name)
    for path in zhihu_util.get_file_list(directory):
        with open(path, "r") as handle:
            for record in handle:
                # The url suffix is the first field of each record.
                suffix = record.split(USER_FIELD_DELIMITER)[0]
                if suffix.strip() != '':
                    result.add(str(suffix))
    return result
Example #33
0
 def test_intersection(self):
     """Intersection keeps exactly the members present in both filters."""
     filter_a = BloomFilter(100, 0.001)
     filter_b = BloomFilter(100, 0.001)
     letters = [chr(code) for code in range(97, 123)]
     half = len(letters) / 2
     for letter in letters:
         filter_a.add(letter)
     for letter in letters[:half]:
         filter_b.add(letter)
     merged = filter_a.intersection(filter_b)
     for letter in letters[:half]:
         self.assert_(letter in merged)
     for letter in letters[half:]:
         self.assert_(letter not in merged)
def hound():
    f = BloomFilter(capacity=100000, error_rate=0.01)
    text = requests.get('https://www.gutenberg.org/files/2852/2852-0.txt').text

    for word in text.split():
        word = word.lower().strip()
        f.add(word)

    print len(f)
    print len(text.split())

    for w in ('holmes', 'watson', 'hound', 'moor', 'queen'):
        print 'Found', w, w in f
Example #35
0
class DataStore(object):
    """Holds parsed blacklist feeds plus a bloom filter of known-bad IPs."""

    def __init__(self):
        self.blacklist = Feed().parse_feeds()
        self.bloom = BloomFilter(capacity=6000, error_rate=0.001)
        self.generate_bloom()

    def generate_bloom(self):
        # Index every IP from the first feed group into the filter.
        for entry in self.blacklist[0]:
            for address in entry['ips']:
                self.bloom.add(address)

    def is_threat(self, ip):
        # Bloom membership: rare false positives are possible by design.
        return ip in self.bloom
def record(url):
    """
    First-time download of tieba images: persist a bloom filter of all
    post numbers so later runs can skip what was already fetched.
    """
    numbers = getallnumlist(url)

    seen = BloomFilter(1000000)
    for number in numbers:
        seen.add(number)
    # url[28:] is the board name embedded in the url.
    with open('./%s/check/bloomfilter' % (url[28:]), 'ab+') as handle:
        seen.tofile(handle)

    multiprocessdownload(numbers)
Example #37
0
def record(url):
    """
    First-time download of tieba images; writes a bloom filter checkpoint
    so subsequent runs can detect already-downloaded posts.
    """
    post_numbers = getallnumlist(url)

    known = BloomFilter(1000000)
    for num in post_numbers:
        known.add(num)
    checkpoint_path = './%s/check/bloomfilter' % (url[28:])
    with open(checkpoint_path, 'ab+') as checkpoint:
        known.tofile(checkpoint)

    multiprocessdownload(post_numbers)
Example #38
0
class BloomFilterDuplicateRemover(DuplicateRemover):
    """Duplicate remover backed by a probabilistic bloom filter."""

    def __init__(self, capacity=1000000, error_rate=0.001):
        self.bloomFilter = BloomFilter(capacity, error_rate)

    def dump(self, url):
        """Return True if *url* was already recorded, otherwise False."""
        url_hash = self.url_hash(url)
        # `in` is the idiomatic spelling; calling __contains__ directly
        # bypasses the operator protocol for no benefit.
        return url_hash in self.bloomFilter

    def add(self, url):
        """Record *url*'s hash in the filter."""
        url_hash = self.url_hash(url)
        self.bloomFilter.add(url_hash)

    def count(self):
        """Number of items added to the filter so far."""
        return self.bloomFilter.count
def create_bloom_filter(values, error_rate):
    """
    Create a BloomFilter object with the given error rate and a capacity
    given by the number of unique non-empty items in values. Add each such
    value and return a (filter, unique_count) tuple.
    """
    # Set comprehension replaces filter(lambda x: len(x), ...); len()
    # keeps the original semantics (values are sized objects, e.g. strings).
    value_set = {v for v in values if len(v)}

    debug("Creating bloom filter, capacity=%d, error_rate=%f (%.4f%%)\n" % (
        len(value_set), error_rate, 100 * error_rate))
    # Guard against an all-empty input: BloomFilter rejects capacity <= 0.
    b = BloomFilter(capacity=max(1, len(value_set)), error_rate=error_rate)
    for value in value_set:
        debug("Adding '%s'\n" % value)
        b.add(value)

    return (b, len(value_set))
Example #40
0
class BloomDupeFilter(BaseDupeFilter):
    """Scrapy dupe filter using a bloom filter over raw request URLs."""

    def __init__(self, path=None):
        self.file = path
        self.fingerprints = BloomFilter(5000000, 0.00001)

    @classmethod
    def from_settings(cls, settings):
        return cls(job_dir(settings))

    def request_seen(self, request):
        """Return True for a duplicate request URL, False for a new one."""
        if request.url in self.fingerprints:
            return True
        self.fingerprints.add(request.url)
        # Explicit False: the original fell through and returned None.
        return False

    def close(self, reason):
        # Drop the filter so its memory can be reclaimed.
        self.fingerprints = None
Example #41
0
def spider_update_Init(dbname, website, carnum):
    """Return urls already stored in Mongo that are still active (not
    marked sold), seeding a bloom filter sized for carnum plus the
    existing collection with 10% headroom."""
    client = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                 settings['MONGODB_PORT'])
    collection = client[dbname][website]

    # Capacity: expected crawl size plus current rows, with 10% headroom.
    capacity = (int(carnum) + collection.count()) * 1.1
    seen = BloomFilter(capacity=capacity, error_rate=0.01)

    urllist = []
    for doc in collection.find():
        if "url" not in doc.keys():
            continue
        item = doc["url"]
        # Listings whose status mentions 'sold' are skipped entirely.
        if "status" in doc.keys() and doc['status'].find('sold') != -1:
            continue
        digest = md5(item).hexdigest()
        # BloomFilter.add returns True when the key was already present,
        # so only first occurrences are collected.
        if not seen.add(digest):
            urllist.append(item)
    client.close()
    return urllist
class FileBloomFilter(object):
    """Bloom filter persisted at *path*, tracking proxy "ip:port" keys."""

    def __init__(self, path):
        self.path = path
        self.rfile = None
        self.is_tofile = False
        if not os.path.isfile(path):
            self.bf = BloomFilter(100000, 0.001)
        else:
            # BloomFilter serialization is binary data: the file must be
            # opened in binary mode ('r' fails outright on Python 3 and
            # can corrupt the stream on Windows).
            self.rfile = open(path, 'rb')
            self.bf = BloomFilter.fromfile(self.rfile)

    def __del__(self):
        # Best-effort flush on garbage collection.
        # NOTE(review): __del__ is unreliable for persistence — callers
        # should invoke tofile() explicitly.
        if not self.is_tofile:
            self.tofile()
        if self.rfile:
            self.rfile.close()

    def tofile(self):
        """Serialize the filter to self.path (binary)."""
        if self.bf:
            # 'wb' instead of 'w+': tofile() writes raw bytes.
            with open(self.path, 'wb') as wfile:
                self.bf.tofile(wfile)
            self.is_tofile = True

    def have(self, item):
        """True if the proxy item (dict with 'ip' and 'port') is recorded."""
        key = item['ip'] + ":" + str(item['port'])
        if key in self.bf:
            return True
        else:
            return False

    def filter_proxy_ip_list(self, items):
        """Return the subset of *items* not yet recorded in the filter."""
        filter_items = []
        for item in items:
            if not self.have(item):
                filter_items.append(item)
        return filter_items

    def add_proxy_ip(self, item):
        """Record one proxy item and mark the filter as dirty."""
        key = item['ip'] + ":" + str(item['port'])
        self.bf.add(key)
        self.is_tofile = False

    def add_proxy_ip_all(self, items):
        """Record every proxy item in *items*."""
        for item in items:
            self.add_proxy_ip(item)
Example #43
0
def bloom_filter(ifile, ofile, dupfile, blofile, maxsize=99999999):
    blf = BloomFilter(capacity=maxsize, error_rate=0.001)
    with open(ifile, 'rb') as f:
        lines = f.readlines()
    lines_len = len(lines)
    for line in lines:
        if line in blf:
            lines.remove(line)
            with open(dupfile, 'ab') as new_file:
                new_file.write(line)
        else:
            blf.add(line)
    print "lines:", lines_len, " => ", "nlines:", len(blf)
    with open(blofile, 'wb') as blfile:
        pickle.dump(blf, blfile)
    with open(ofile, 'wb') as outfile:
        outfile.writelines(lines)
Example #44
0
class BLOOMDupeFilter(BaseDupeFilter):
    """URL-based duplicate-request filter backed by a bloom filter."""

    def __init__(self, path=None):
        self.file = None
        self.fingerprints = BloomFilter(capacity=1000000, error_rate=0.001)

    @classmethod
    def from_settings(cls, settings):
        return cls(job_dir(settings))

    def request_seen(self, request):
        """True when the request URL has been seen before."""
        if request.url not in self.fingerprints:
            self.fingerprints.add(request.url)
            return False
        return True

    def close(self, reason):
        self.fingerprints = None
Example #45
0
class BLOOMDupeFilter(BaseDupeFilter):
    """Duplicate-request filter trading exactness for bounded memory via
    a bloom filter of request URLs."""

    def __init__(self, path=None):
        self.file = None
        self.fingerprints = BloomFilter(capacity=1000000, error_rate=0.001)

    @classmethod
    def from_settings(cls, settings):
        return cls(job_dir(settings))

    def request_seen(self, request):
        """True when the request URL has been seen before."""
        url = request.url
        seen = url in self.fingerprints
        if not seen:
            self.fingerprints.add(url)
        return seen

    def close(self, reason):
        self.fingerprints = None
Example #46
0
class BloomURLDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None):
        self.file = None
        self.fingerprints = BloomFilter(3000000, 0.0001)

    @classmethod
    def from_settings(cls, settings):
        return cls(job_dir(settings))

    def request_seen(self, request):
        """Return True for a duplicate URL, False for a new one."""
        fp = request.url
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        # Explicit False: the original fell through and returned None.
        return False

    def close(self, reason):
        self.fingerprints = None
def returnItemsWithMinSupportV4(itemSet, lenItem, transactionList, minSupport,
                                freqSet):
    """Return candidates of length *lenItem* whose support exceeds
    *minSupport*, pre-screening candidates with a bloom filter."""
    result = set()
    local_counts = defaultdict(int)
    candidate_filter = BloomFilter(capacity=len(itemSet), error_rate=0.0001)
    for candidate in itemSet:
        candidate_filter.add(candidate)
    for transaction in transactionList:
        for combo in combinations(transaction, lenItem):
            # Bloom pre-screen; rare false positives are possible.
            if combo in candidate_filter:
                freqSet[combo] += 1       # global tally across all calls
                local_counts[combo] += 1  # local tally, thresholded below
    for candidate, count in local_counts.items():
        support = float(count) / len(transactionList)
        if support > minSupport:
            result.add(candidate)

    return result
Example #48
0
class BloomURLDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None):
        self.file = None
        self.fingerprints = BloomFilter(3000000, 0.0001)

    @classmethod
    def from_settings(cls, settings):
        return cls(job_dir(settings))

    def request_seen(self, request):
        """Return True for a duplicate URL, False for a new one."""
        fp = request.url
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        # Explicit False: the original fell through and returned None.
        return False

    def close(self, reason):
        self.fingerprints = None
Example #49
0
def compile():
    boys = BloomFilter(capacity=703)
    girls = BloomFilter(capacity=1003)

    with open('sample_data/names.csv', 'r') as f:
        reader = csv.reader(f)
        reader.next()
        for row in reader:
            if float(row[2])<.0005:
                continue
            if row[3].lower() == 'boy':
                boys.add(row[1].lower())
            elif row[3].lower() == 'girl':
                girls.add(row[1].lower())

    with open('blooms/boys', 'w') as f:
        boys.tofile(f)
    with open('blooms/girls', 'w') as f:
        girls.tofile(f)
    print len(boys), len(girls)
Example #50
0
def feedBloom(row):
    """Encode a flow row's source/target IP features as the bloom
    filter's raw bit vector (numpy int array)."""
    f = BloomFilter(capacity=200, error_rate=0.6)
    f.add(row.src_ip)
    f.add(row.src_ip[0:5])
    f.add(row.src_ip[5:8])
    f.add(row.target_ip)
    # np.int was a deprecated alias of the builtin int and was removed in
    # NumPy 1.24; use int directly.
    return np.array(f.bitarray.tolist(), dtype=int)
Example #51
0
def feedBloom(row):
    """Return the bloom filter bit vector for a row's IP-derived features."""
    f = BloomFilter(capacity=200, error_rate=0.6)
    f.add(row.src_ip)
    f.add(row.src_ip[0:5])
    f.add(row.src_ip[5:8])
    f.add(row.target_ip)
    # BUG FIX: np.int was removed in NumPy 1.24 — it aliased the builtin
    # int, which is the documented replacement.
    return np.array(f.bitarray.tolist(), dtype=int)
Example #52
0
def crawl(url, seen=None):
    """Fetch *url*, record unseen same-domain links in *seen*, and fan
    out crawl subtasks for each new link."""
    print("crawling: {0}".format(url))
    if not seen:
        seen = BloomFilter(capacity=50000, error_rate=0.0001)

    response = None
    with Timeout(5, False):
        try:
            response = requests.get(url)
        except Exception:
            return
    # BUG FIX: Timeout(5, False) suppresses the timeout exception and
    # simply exits the block, so `response` could be unbound here and the
    # original crashed with NameError; bail out instead.
    if response is None:
        return

    location = domain(url)
    wanted_urls = []
    for url_match in url_regex.finditer(response.text):
        url = url_match.group(0)
        # To not destroy the internet, we only fetch URLs on the same domain.
        if url not in seen and location in domain(url):
            wanted_urls.append(url)
            seen.add(url)

    subtasks = group(crawl.s(url, seen) for url in wanted_urls)
    subtasks()
Example #53
0
    def test_bloom_string(self):
        """Random strings added up to capacity are found; bare letters
        that were never added are not."""
        f = BloomFilter(capacity=10000, error_rate=0.001)

        last_word = None
        for _ in xrange(0, f.capacity):
            last_word = "".join(random.choice(string.letters)
                                for _ in xrange(40))
            f.add(last_word)

        self.assertEqual(last_word in f, True)

        for letter in string.letters:
            self.assertEqual(letter in f, False)

        self.assertEqual(last_word in f, True)
Example #54
0
    def test_bloom_int(self):
        """Inserted ints are reported present; ints beyond capacity are not."""
        f = BloomFilter(capacity=10000, error_rate=0.001)

        for value in xrange(0, f.capacity):
            f.add(value)

        half = f.capacity / 2
        for _ in xrange(0, half):
            inserted = random.randint(0, f.capacity - 1)
            self.assertEqual(inserted in f, True)

        for _ in xrange(0, half):
            absent = random.randint(f.capacity, f.capacity * 2)
            self.assertEqual(absent in f, False)
Example #55
0
def main(capacity=100000, request_error_rate=0.1):
    """Fill a BloomFilter to capacity, report timing and bit statistics,
    and compare the measured false-positive rate against the requested
    one and the Goel/Gupta theoretical bound."""
    f = BloomFilter(capacity=capacity, error_rate=request_error_rate)
    assert (capacity == f.capacity)
    start = time.time()
    for i in range(0, f.capacity):
        # skip_check avoids the membership test on insert for speed.
        f.add(i, skip_check=True)
    end = time.time()
    print("{:5.3f} seconds to add to capacity, {:10.2f} entries/second".format(
        end - start, f.capacity / (end - start)))
    one_bits = f.bitarray.count(True)
    print("Number of Filter Bits:", f.num_bits)
    print("Number of slices:", f.num_slices)
    print("Bits per slice:", f.bits_per_slice)
    print("------")
    print("Fraction of 1 bits at capacity: {:5.3f}".format(one_bits /
                                                           float(f.num_bits)))
    # Look for false positives and measure the actual fp rate
    trials = f.capacity
    fp = 0
    start = time.time()
    # BUG FIX: the original range ended at capacity + trials + 1, doing
    # trials + 1 checks while dividing by trials below (off-by-one).
    for i in range(f.capacity, f.capacity + trials):
        if i in f:
            fp += 1
    end = time.time()
    print(("{:5.3f} seconds to check false positives, "
           "{:10.2f} checks/second".format(end - start,
                                           trials / (end - start))))
    print("Requested FP rate: {:2.4f}".format(request_error_rate))
    print("Experimental false positive rate: {:2.4f}".format(fp /
                                                             float(trials)))
    # Compute theoretical fp max (Goel/Gupta)
    k = f.num_slices
    m = f.num_bits
    n = f.capacity
    fp_theory = math.pow((1 - math.exp(-k * (n + 0.5) / (m - 1))), k)
    print("Projected FP rate (Goel/Gupta): {:2.6f}".format(fp_theory))
Example #56
0
def crawl(url, seen=None):
    """Fetch *url* with urllib2, record unseen same-domain links in
    *seen*, and dispatch crawl subtasks for each new link."""
    print("crawling: %r" % (url, ))
    if not seen:
        seen = BloomFilter(capacity=50000, error_rate=0.0001)

    data = None
    with Timeout(5, False):
        try:
            data = urllib2.urlopen(url).read()
        except (urllib2.HTTPError, IOError):
            return
    # BUG FIX: Timeout(5, False) exits silently on expiry, leaving `data`
    # unset in the original and crashing with NameError below.
    if data is None:
        return

    location = domain(url)
    wanted_urls = []
    for url_match in url_regex.finditer(data):
        url = url_match.group(0)
        # To not destroy the internet, we only fetch URLs on the same domain.
        if url not in seen and location in domain(url):
            wanted_urls.append(url)
            seen.add(url)

    subtasks = TaskSet(crawl.subtask((url, seen)) for url in wanted_urls)
    subtasks.apply_async()
Example #57
0
def compile():
    uni_names = BloomFilter(capacity=719)
    name_strings = []
    with open('sample_data/uni_names.out', 'r') as f:
        for line in f:
            m = re.search(r'\| "(.*)"$', line.strip())
            if m:
                name = m.group(1).strip().lower()
                name_strings.append(name)
                uni_names.add(name)
    ngpol_filt = NGPOLFilter(4, name_strings)
    for name in name_strings:
        if name not in ngpol_filt:
            print name
    print ngpol_filt.min_rating
    print ngpol_filt.deviation
    ngpol_filt.clean()
    with open('blooms/uni_names','w') as f:
        uni_names.tofile(f)
    with open('ngpols/uni_names','w') as f:
        ngpol_filt.tofile(f)
    print len(uni_names)