Example #1
class Sava_key(Thread):
    def __init__(self,save_queue,filename):
        super(Sava_key, self).__init__()
        self.save_queue = save_queue
        self.filename = filename
        self.boom = BloomFilter(capacity=1e7,error_rate=0.001)

    def run(self):
        while True:
            try:
                kw = self.save_queue.get()

                # First filter: the keyword must contain '实验室' (laboratory)
                if '实验室' in kw:

                    # Second filter: drop unwanted words
                    keywords = filter_key.clean_key(kw)
                    if keywords is None:
                        continue
                    else:
                        # Skip words that have already been saved
                        if keywords in self.boom:
                            continue
                        self.boom.add(keywords)

                        print('new word: {}'.format(keywords))
                        self.sava_txt(keywords)
            finally:
                self.save_queue.task_done()

    def sava_txt(self,kw):
        with open(self.filename,'a',encoding='utf-8') as fs:
            fs.write('{}\n'.format(kw))
Example #2
class BloomData:
    capacity = 50
    error_rate = 0.2

    def __init__(self, capacity=50, error_rate=0.2):
        """Store a Bloom filter alongside the true underlying values.
        :param capacity: Bloom filter capacity
        :param error_rate: target false-positive rate
        """
        self.capacity = capacity
        self.error_rate = error_rate
        self.bloom = BloomFilter(capacity=self.capacity, error_rate=self.error_rate)
        self.Ture_Data = []

    def AddDataToBloom(self, new_data, key=0):
        """Add data to the Bloom filter.
        :param new_data: data to add
        :param key: mode 1: merge another BloomData; mode 0: add a single item
        :return: number of true values now stored
        """
        if key == 0:
            self.Ture_Data.append(new_data)
            self.bloom.add(new_data)
        if key == 1:
            # union() returns a new filter rather than mutating in place
            self.bloom = self.bloom.union(new_data.bloom)
            self.Ture_Data.extend(new_data.Ture_Data)
        return len(self.Ture_Data)
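A minimal usage sketch (illustrative; it assumes a pybloom-style BloomFilter, whose union() requires both filters to share the same capacity and error rate):

a = BloomData()
a.AddDataToBloom('alpha')           # mode 0: add a single item
b = BloomData()
b.AddDataToBloom('beta')
total = a.AddDataToBloom(b, key=1)  # mode 1: merge another BloomData
assert 'beta' in a.bloom and total == 2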
Example #3
class TrainValBloom:
    def __init__(
        self,
        train_row_nrs: List[int],
        chosen_feats: List[int],
        full_dataset_size: int,
        val_row_nrs: List[int] = None,
    ):
        self.train_bloom = BloomFilter(capacity=len(train_row_nrs), error_rate=0.01)
        self.full_dataset_size = full_dataset_size
        self.subset_size = len(train_row_nrs)
        self.chosen_feats = chosen_feats
        all_row_nrs = list(range(0, full_dataset_size))

        if val_row_nrs is None:
            val_row_nrs = [x for x in all_row_nrs if x not in train_row_nrs]

        self.val_bloom = BloomFilter(capacity=len(val_row_nrs), error_rate=0.01)

        for row_nr in train_row_nrs:
            self.train_bloom.add(row_nr)

        for row_nr in val_row_nrs:
            self.val_bloom.add(row_nr)

    def into_dataset(self, train_set: "TrainSet", save_bloom: bool) -> "TrainValSet":
        train_inds, val_inds = self.into_indices()

        if save_bloom:
            train_val_bloom = self
        else:
            train_val_bloom = None

        return TrainValSet(
            train_set.x_train[np.ix_(train_inds, self.chosen_feats)],
            train_set.y_train[train_inds],
            train_set.x_train[np.ix_(val_inds, self.chosen_feats)],
            train_set.y_train[val_inds],
            train_val_bloom,
        )

    def into_indices(self) -> Tuple[List[int], List[int]]:
        train_inds = np.zeros(self.subset_size)
        val_inds = np.zeros(self.full_dataset_size - self.subset_size)

        train_cntr = 0
        val_cntr = 0
        for i in range(0, self.full_dataset_size):
            if i in self.train_bloom and train_cntr < self.subset_size:
                train_inds[train_cntr] = i
                train_cntr += 1
            else:
                val_inds[val_cntr] = i
                val_cntr += 1

        val_inds = np.trim_zeros(val_inds, "b")  # 'b' = trim only from back
        train_inds_lst: List[int] = train_inds.astype(int).tolist()
        val_inds_lst: List[int] = val_inds.astype(int).tolist()

        return train_inds_lst, val_inds_lst
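A small illustrative run (hypothetical values; numpy is assumed to be imported as np):

tvb = TrainValBloom(train_row_nrs=[0, 2, 4], chosen_feats=[0, 1], full_dataset_size=6)
train_inds, val_inds = tvb.into_indices()
print(train_inds)  # [0, 2, 4], barring rare Bloom-filter false positives
print(val_inds)    # [1, 3, 5]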
Example #4
class BloomCheckFunction(object):
    def __init__(self):
        self.filename = 'bloomFilter.blm'
        if os.path.exists(self.filename):
            with open(self.filename, 'rb') as f:
                self.bf = BloomFilter.fromfile(f)
        else:
            self.bf = BloomFilter(100000000, 0.001)

    def process_item(self, data):
        data_encode_md5 = hashlib.md5(
            data.encode(encoding='utf-8')).hexdigest()
        if data_encode_md5 in self.bf:
            # content unchanged: discard the item
            self.save_bloom_file()
            return False

        else:
            self.bf.add(data_encode_md5)
            self.save_bloom_file()
            return True

    def save_bloom_file(self):
        with open(self.filename, 'wb') as f:
            self.bf.tofile(f)
Example #5
class Related_Key(Thread,Downloader):
    def __init__(self,key_queue,save_queue):
        super(Related_Key, self).__init__()
        self.key_queue = key_queue  # crawl queue
        self.save_queue = save_queue    # save queue
        self.bloom = BloomFilter(capacity=1e7,error_rate=0.001) # dedupe filter

    def run(self):
        while True:
            try:
                kw = self.key_queue.get()
                # Dedupe: skip keywords already crawled; otherwise record and crawl
                if kw in self.bloom:
                    continue
                self.bloom.add(kw)

                # Download the page source
                source = self.download(kw)
                if source is None:
                    continue
                self.parse_html(source)
                time.sleep(0.5)  # pause 0.5s after each item
            finally:
                self.key_queue.task_done()

    def parse_html(self,html):
        ele = etree.HTML(html)
        keyList = ele.xpath('//table//tr//th/a/text()')
        for key in keyList:
            self.key_queue.put(key) # push to the crawl queue
            self.save_queue.put(key)    # push to the save queue
Example #6
def train_bloom_filter(elements, error_rate=0.001):
    elements = set(elements)
    bf = BloomFilter(capacity=len(elements), error_rate=error_rate)
    for element in elements:
        bf.add(element)

    return bf
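A quick usage sketch (illustrative values; membership uses the filter's "in" operator):

bf = train_bloom_filter(['alpha', 'beta', 'alpha'])  # duplicates collapse via set()
assert 'alpha' in bf   # inserted items never produce false negatives
print('gamma' in bf)   # usually False; can rarely be a false positive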
Example #7
class Filter_key(Thread):
    def __init__(self, save_queue,contain,filename):
        super().__init__()
        self.save_queue = save_queue    # save queue
        self.contain = contain  # required terms
        self.filename = filename  # output filename
        self.bloom = BloomFilter(capacity=1e7,error_rate=0.001)

    def run(self):
        while True:
            wd = self.save_queue.get()
            try:
                # Check whether the word contains any required term
                for con in self.contain:
                    if con in wd:
                        # Only save keywords longer than 10 characters
                        if len(wd) > 10:
                            # Dedupe words that passed the contains check
                            if wd in self.bloom:
                                continue
                            self.bloom.add(wd)
                            print('got new word: {}'.format(wd))
                            self.save_file(wd)
            finally:
                self.save_queue.task_done()  # exactly once per get()

    # Helper: append the word to the output file
    def save_file(self,wd):
        with open(self.filename, mode='a', encoding='utf-8') as f:
            f.write('{}\n'.format(wd))
Example #8
class MyBloomUtil:
    def __init__(self, bloom_name):
        bloom_dir = './bf'
        if not os.path.exists(bloom_dir):
            os.makedirs(bloom_dir)
        self.bloom_path = '%s/%s.blm' % (bloom_dir, bloom_name)
        is_exist = os.path.exists(self.bloom_path)
        if is_exist:
            with open(self.bloom_path, 'rb') as f:
                self.bf = BloomFilter.fromfile(f)
        else:
            self.bf = BloomFilter(20000, 0.001)

    def is_exists(self, item):
        if item in self.bf:
            print('%s is already in bloom_filter.' % item)
            return True
        return False

    def add_in_bf(self, item):
        print('add %s' % item)
        self.bf.add(item)
        with open(self.bloom_path, 'wb') as f:
            self.bf.tofile(f)

    def process_item(self, item):
        if item in self.bf:
            logger.info('[%s] is already in bloom.' % item)
            return None
        else:
            logger.info('add [%s]' % item)
            self.bf.add(item)
            with open(self.bloom_path, 'wb') as f:
                self.bf.tofile(f)
            return item
Example #9
    def create_DB(self, path_DB):
        db = []
        for j, filepath in enumerate(
                natsort.natsorted(os.listdir(path_DB))[:self.n]):
            dbBloom = BloomFilter(self.m, self.q)
            self.k = dbBloom.num_slices
            f = open('{}/{}'.format(path_DB, filepath), 'r')
            print "    DEBUG: Reading patient file {:3d} from: {}/{}".format(
                j, path_DB, filepath)
            for i, line in enumerate(f):
                # '#'-lines might result in too few added snps
                if not line.startswith('#'):
                    snp = line.strip().split('\t')
                    try:
                        dbBloom.add(snp[0] + snp[1] + snp[3] + snp[4])
                    except IndexError:  # malformed line: too few columns
                        pass
                if i + 1 >= self.m:
                    break

            db.append(dbBloom.bitarray)
            f.close()

        # Update n (needs to be done since len(db) could be smaller than specified n)
        self.n = len(db)

        # Reset Bloom filter length (the used library is slightly above the theoretical optimum)
        self.l = len(db[0])

        return db
Example #10
    def test_calculate_error_rate__large_test(self):

        # -- fetch test and train samples
        train, test = self.prepare_test_and_train(2)

        # -- train Bloom filter
        bf = BloomFilter(len(train), error_rate=0.1)
        for word in train:
            bf.add(word)

        assert round(calculate_error_rate(bf, test), 1) == 0.1
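calculate_error_rate itself is not shown here; a plausible implementation consistent with this test (purely illustrative, and assuming the test words are disjoint from the training words):

def calculate_error_rate(bf, test_words):
    # fraction of held-out words the filter wrongly reports as present
    false_positives = sum(1 for w in test_words if w in bf)
    return false_positives / len(test_words)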
Example #11
    def create_query(self, path_QRY):
        queryBloom = BloomFilter(self.m, self.q)
        with open(path_QRY, 'r') as f:
            for qrySNP in f:
                if not qrySNP.startswith('#'):
                    snp = qrySNP.strip().split('\t')
                    try:
                        queryBloom.add(snp[0] + snp[1] + snp[3] + snp[4])
                    except IndexError:  # malformed line: too few columns
                        pass
        return 0, queryBloom.bitarray
Example #12
def writeToFile(outpath, path, size, error):
    bloom = BloomFilter(size, error)

    with open(path, 'r') as f:
        for line in f:
            if not line.startswith('#'):
                snp = line.strip().split('\t')
                bloom.add(snp[0] + snp[1] + snp[3] + snp[4])

    # bitarray.tofile() requires a binary-mode file handle
    with open(outpath, 'wb') as dest:
        bloom.bitarray.tofile(dest)
Example #13
class Save_key(Thread):
    def __init__(self, save_queue, contain, db_config, filename):
        super(Save_key, self).__init__()
        self.save_queue = save_queue  # save queue
        self.contain = contain  # required terms
        self.db_config = db_config  # database config
        self.filename = filename  # output filename
        self.bloom = BloomFilter(capacity=1e7, error_rate=0.001)  # dedupe filter for collected keywords

    def run(self):
        while True:
            wd = self.save_queue.get()
            try:
                # First filter: the word must contain a required term
                if 'sem' in wd or '竟价' in wd:
                    # Second filter: drop unwanted words
                    keywords = filter_key.clean_key(wd)
                    if keywords is not None and wd not in self.bloom:
                        # Skip words that have already been saved
                        self.bloom.add(wd)
                        print('got new word: {}'.format(wd))
                        self.save_file(wd)
            finally:
                self.save_queue.task_done()  # always mark the task done

    def save_file(self, wd):
        # Option 1: insert into the database:
        # try:
        #     conn = pymysql.Connect(**self.db_config)
        #     try:
        #         sql = "insert ignore into sys(keywords) values(%s)"
        #         with conn.cursor() as cursor:
        #             cursor.execute(sql, args=(wd))
        #     except pymysql.err.Error as err:
        #         print('insert failed, word: {}, error: {}'.format(wd, err))
        #     else:
        #         conn.commit()
        #         conn.close()
        # except pymysql.err.MySQLError:
        #     print('failed to connect to the database!')

        # Option 2: append to a local file:
        with open(self.filename, mode='a', encoding='utf-8') as f:
            f.write('{}\n'.format(wd))
Example #14
def bloom_url(url):
    blm_path = r'C:\spiders\zhilian_celery\bloom.blm'
    if os.path.exists(blm_path):
        with open(blm_path, 'rb') as f:
            bf = BloomFilter.fromfile(f)
    else:
        bf = BloomFilter(10000000, 0.001)

    if url in bf:
        print(1)
        return 0
    else:
        bf.add(url)
        with open(blm_path, 'wb') as f:
            bf.tofile(f)
        return 1
Example #15
class Save_key(Thread):
    def __init__(self, save_queue, contain, filename):
        super().__init__()
        self.save_queue = save_queue  # save queue
        self.contain = contain  # required terms
        self.filename = filename  # output filename
        self.bloom = BloomFilter(capacity=1e7, error_rate=0.001)

    def run(self):
        while True:
            wd = self.save_queue.get()
            try:
                # Check whether the word contains any required term
                for con in self.contain:
                    if con in wd:
                        # Only save keywords longer than 5 characters
                        if len(wd) > 5:
                            # Dedupe words that passed the contains check
                            if wd in self.bloom:
                                continue
                            self.bloom.add(wd)
                            print('got new word: {}'.format(wd))
                            self.save_file(wd)
            finally:
                self.save_queue.task_done()  # exactly once per get()
            # Free memory
            gc.collect()

    # Helper: append the word to the output file
    def save_file(self, wd):
        # Optional: insert into the database instead:
        # try:
        #     conn = pymysql.Connect(**self.db_config)
        #     try:
        #         sql = "insert ignore into shiyanshi_key(keywords) values(%s)"
        #         with conn.cursor() as cursor:
        #             cursor.execute(sql, args=(wd))
        #     except pymysql.err.Error as err:
        #         print('insert failed, word: {}, error: {}'.format(wd, err))
        #     else:
        #         conn.commit()
        #         conn.close()
        # except pymysql.err.MySQLError:
        #     print('failed to connect to the database!')
        with open(self.filename, mode='a', encoding='utf-8') as f:
            f.write('{}\n'.format(wd))
        # Free memory
        gc.collect()
Example #16
def train_bloom_filter():
    # -- training the Bloom filter
    hot_display_names = set()
    with open('./resources/0.xml', 'rb') as f:
        for line in f:
            user = row_to_dict(line)
            hot_display_names.add(user['displayname'])

    bf = BloomFilter(len(hot_display_names), error_rate=0.001)

    for name in hot_display_names:
        bf.add(name)

    with open('./resources/hot_names_bloom_filter', 'wb') as f:
        bf.tofile(f)

    return bf
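The saved filter can later be reloaded without retraining; a short sketch using the same path:

with open('./resources/hot_names_bloom_filter', 'rb') as f:
    bf = BloomFilter.fromfile(f)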
Example #17
class BloomCheckPipeline(object):
    def __init__(self):
        self.file_name = r'Z:/朱靖/布隆滤波器过滤文件/carpicture/BloomFiltercnki.blm'
        self.bf = None
        self.cap_begin = 0
        self.cap_end = 0
        self.cnt = 0

    def open_spider(self, spider):
        if os.path.exists(self.file_name):
            with open(self.file_name, 'rb') as f:
                self.bf = BloomFilter.fromfile(f)
            self.cap_begin = len(self.bf)  # record the initial count when loading the .blm file
            print('open blm file success')
            print('initial size: %d' % self.cap_begin)
        else:
            self.bf = BloomFilter(100000000, 0.001)
            print('blm file not found, creating a new one')

    def process_item(self, item, spider):
        if item['image_url'] in self.bf:
            print('dropping item %s: already seen' % item['title'])
            raise DropItem('drop item %s: already exists' % item['title'])
        else:
            try:
                self.bf.add(item['image_url'])
                self.cnt += 1
            except Exception as reason:
                print('BloomFilter Error------:%s' % reason)
            # Persist the .blm file after every 10,000 new URLs
            if self.cnt > 10000:
                self.save_blm()
                self.cnt = 0
            return item

    def save_blm(self):
        print('Save Blm File ******')
        self.cap_end = len(self.bf)
        print('images stored this run: %d' % (self.cap_end - self.cap_begin))
        with open(self.file_name, 'wb') as f:
            self.bf.tofile(f)

    def close_spider(self, spider):
        print('close_spider tofile------')
        self.cap_end = len(self.bf)
        print('images stored this run: %d' % (self.cap_end - self.cap_begin))
        with open(self.file_name, 'wb') as f:
            self.bf.tofile(f)
Example #18
class BloomCheckPipeline(object):
    def __init__(self):
        self.file_name = 'Z:/朱靖/布隆滤波器过滤文件/学科网/bloomfilter_xuekew.blm'
        self.bf = None
        self.cap_begin = 0
        self.cap_end = 0
        self.cnt = 0

    def open_spider(self, spider):
        if os.path.exists(self.file_name):
            with open(self.file_name, 'rb') as f:
                self.bf = BloomFilter.fromfile(f)
            print('open blm file success')
            self.cap_begin = len(self.bf)
            print('initial size: %d' % self.cap_begin)
        else:
            self.bf = BloomFilter(100000000, 0.001)
            print('blm file not found, creating a new one')

    def process_item(self, item, spider):
        if item['url'] in self.bf:
            print('dropping item %s: already seen' % item['title'])
            raise DropItem('drop item %s: already exists' % item['title'])
        else:
            try:
                self.bf.add(item['url'])
                self.cnt += 1
            except Exception as reason:
                print("BloomFilter Error------:%s" % reason)
            # Persist the .blm file after every 10,000 new URLs
            if self.cnt > 10000:
                self.save_blm()
                self.cnt = 0
            return item

    def save_blm(self):
        print('Save Blm File ******')
        self.cap_end = len(self.bf)
        print('articles stored this run: %d' % (self.cap_end - self.cap_begin))
        with open(self.file_name, 'wb') as f:
            self.bf.tofile(f)

    def close_spider(self, spider):
        print('close spider tofile-------')
        self.cap_end = len(self.bf)
        print('articles stored this run: %d' % (self.cap_end - self.cap_begin))
        with open(self.file_name, 'wb') as f:
            self.bf.tofile(f)
Example #19
class Spider(Thread,Downloads):
    def __init__(self,key_queue,save_queue):
        super(Spider, self).__init__()
        self.key_queue = key_queue  # crawl keyword queue
        self.save_queue = save_queue # keyword save queue
        self.bloom = BloomFilter(capacity=1e7,error_rate=0.001) # dedupe filter for collected keywords

    def run(self):
        while True:
            try:
                kw = self.key_queue.get()   # pull one keyword from the queue

                # Skip keywords the filter has already seen; otherwise record them
                if kw in self.bloom:
                    continue
                self.bloom.add(kw)

                # Download the page source
                source = self.download(kw)
                if source is None:
                    continue

                # Extract content from the source
                self.parse_html(source)

            finally:
                self.key_queue.task_done()  # always mark the queue task done

    def parse_html(self,source):
        # recommended    //div[@class="hint-mid"]/a/text()
        # related        //div[@class="hintBox"]//table//tr/td/p/a/text()
        elt = etree.HTML(source)

        # Recommended keywords
        recommend_list = elt.xpath('//div[@class="hint-mid"]/a/text()')
        # Push every extracted word onto both the crawl queue and the save queue
        for recommend in recommend_list:
            self.key_queue.put(recommend)
            self.save_queue.put(recommend)

        # Related keywords
        related_list = elt.xpath('//div[@class="hintBox"]//table//tr/td/p/a/text()')
        for related in related_list:
            self.key_queue.put(related)
            self.save_queue.put(related)
Example #20
class BLOOMDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""
    def __init__(self, path=None):
        self.file = None
        self.fingerprints = BloomFilter(2000000, 0.00001)

    @classmethod
    def from_settings(cls, settings):
        return cls(job_dir(settings))

    def request_seen(self, request):
        fp = request.url
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)

    def close(self, reason):
        self.fingerprints = None
Example #21
class MyBloomUtil:
    def __init__(self, bloom_name):
        self.bloom_path = '%s.blm' % bloom_name
        is_exist = os.path.exists(self.bloom_path)
        if is_exist:
            with open(self.bloom_path, 'rb') as f:
                self.bf = BloomFilter.fromfile(f)
        else:
            self.bf = BloomFilter(20000, 0.001)

    def process_item(self, item):
        if item in self.bf:
            logger.info('[%s] is already in bloom.' % item)
            return None
        else:
            print('add one')
            self.bf.add(item)
            with open(self.bloom_path, 'wb') as f:
                self.bf.tofile(f)
            return item
Example #22
    def parse(self, response):

        # fname = "/media/common/娱乐/Electronic_Design/Coding/Python/Scrapy/tutorial/tutorial/spiders/temp"
        #
        # html = response.xpath('//html').extract()[0]
        # fobj = open(fname, 'w')
        # fobj.writelines(html.encode('utf-8'))
        # fobj.close()

        # bloom = BloomFilter(100, 10)
        bloom = BloomFilter(1000, 0.001)
        animals = [
            'dog', 'cat', 'giraffe', 'fly', 'mosquito', 'horse', 'eagle',
            'bird', 'bison', 'boar', 'butterfly', 'ant', 'anaconda', 'bear',
            'chicken', 'dolphin', 'donkey', 'crow', 'crocodile'
        ]
        # First insertion of animals into the bloom filter
        for animal in animals:
            bloom.add(animal)

        # Membership existence for already inserted animals
        # There should not be any false negatives
        for animal in animals:
            if animal in bloom:
                print('{} is in bloom filter as expected'.format(animal))
            else:
                print('Something went terribly wrong for {}'.format(animal))
                print('FALSE NEGATIVE!')

        # Membership existence for not inserted animals
        # There could be false positives
        other_animals = [
            'badger', 'cow', 'pig', 'sheep', 'bee', 'wolf', 'fox', 'whale',
            'shark', 'fish', 'turkey', 'duck', 'dove', 'deer', 'elephant',
            'frog', 'falcon', 'goat', 'gorilla', 'hawk'
        ]
        for other_animal in other_animals:
            if other_animal in bloom:
                print('{} was not inserted but tests positive: a false positive'.format(
                    other_animal))
            else:
                print('{} is not in the bloom filter, as expected'.format(
                    other_animal))
Example #23
class Spider(Thread,Downloads):
    def __init__(self,key_queue,save_queue):
        super(Spider, self).__init__()
        self.key_queue = key_queue
        self.save_queue = save_queue
        self.boom = BloomFilter(capacity=1e7,error_rate=0.001) # dedupe filter for collected keywords

    def run(self):
        while True:
            try:
                kw = self.key_queue.get()
                # Skip keywords the filter has already seen
                if kw in self.boom:
                    continue
                # Otherwise record them in the filter
                self.boom.add(kw)

                source = self.download(kw)
                if source is None:
                    continue
                self.parse_html(source)

                # Optionally sleep 3 seconds after each item
                # time.sleep(3)
            finally:
                self.key_queue.task_done()

    def parse_html(self,source):
        ele = etree.HTML(source)
        searchList = ele.xpath('//div[@class="c-row row-item row-item2"]/div/a/span/text()')
        for search in searchList:
            # lxml "smart strings" keep a reference to the whole tree; converting
            # to plain str lets the parsed tree be garbage-collected
            str_search = str(search)
            self.key_queue.put(str_search)
            self.save_queue.put(str_search)

        relatedList = ele.xpath('//div[@class="rw-list-new rw-list-new2"]/a/span/text()')
        for related in relatedList:
            str_related = str(related)
            self.key_queue.put(str_related)
            self.save_queue.put(str_related)
Example #24
    def domain(cls, domain_url):
        """Check the domain URL against the adult-site list and bad words.
        :return: True if the domain was found (reject it); False if it can be added.
        """
        bf = BloomFilter(10000000)
        path = os.path.dirname(os.path.abspath(__file__))
        with open(path + "/data/porn_sites_list.txt", "r+") as file:
            for item in file:
                bf.add(item.strip())
        if domain_url in bf:
            return True
        # else:
        #     for word in bad_domains_words:
        #         if word in domain_url:
        #             return True
        return False
Example #25
    def filter_url(self, url):
        """
        Deduplicate URLs so that repeated requests are not issued.
        :param url: URL to check for duplicates
        :return: True if the URL is new, False if it was seen before
        """
        bloom_path = '{}.blm'.format(self.name)
        # Load the filter from disk if it exists
        if os.path.exists(bloom_path):
            with open(bloom_path, 'rb') as f:
                bf = BloomFilter.fromfile(f)
        else:
            # Otherwise create a fresh in-memory filter
            bf = BloomFilter(1000000, 0.01)
        if url in bf:
            return False
        # Unseen URL: record it and persist the filter
        bf.add(url)
        with open(bloom_path, 'wb') as f:
            bf.tofile(f)
        return True
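A minimal usage sketch (hypothetical spider instance with a name attribute):

if spider.filter_url('https://example.com/item/1'):
    print('new URL, fetching')
else:
    print('duplicate URL, skipping')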
Example #26
def build(
    infile,
    outfile,
    error_rate=0.0001,
    delim=None,
    column=1,
    skip_first=False,
    unhex=False,
    comment_prefix=None,
    num_items=None,
):
    print("[BUILDING] Using error-rate: {}".format(error_rate))
    if os.path.isfile(infile):
        print("[BUILDING] Reading in Hashset: {}".format(infile))
        print("[BUILDING] Calculating number of hashes...")
        if not num_items:
            num_items = get_number_of_items(infile, skip_first, comment_prefix)
        print("[BUILDING] There are {} hashes in the Hashset".format(num_items))
        print("[BUILDING] Creating bloomfilter")
        bf = BloomFilter(num_items, error_rate)
        print("[BUILDING] Inserting hashes into bloomfilter")
        for item in get_items(
            infile,
            delim=delim,
            column=column,
            skip_first=skip_first,
            unhex=unhex,
            comment_prefix=comment_prefix,
        ):
            try:
                bf.add(item)
            except Exception as e:
                print("[ERROR] {}".format(e), file=sys.stderr)
        print("[BUILDING] Hashset bloomfilter contains {} items.".format(len(bf)))
        with open(outfile, "wb") as fh:
            bf.tofile(fh)
        print("[BUILDING] Complete")
    else:
        print("[ERROR] No such file or directory: {}".format(infile), file=sys.stderr)

    return
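A minimal invocation sketch (file names illustrative):

build('hashes.txt', 'hashes.bloom', error_rate=0.0001)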
Example #27
class BLOOMDupeFilter(BaseDupeFilter):
    """
    BLOOM Duplicate Filter
    This filter is interesting to use if you crawl a lot of url, it will take less memory to filter the urls.
    """
    def __init__(self, path=None):
        self.file = None
        self.fingerprints = BloomFilter(2000000, 0.00001)

    @classmethod
    def from_settings(cls, settings):
        return cls(job_dir(settings))

    def request_seen(self, request):
        fp = request.url
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)

    def close(self, reason):
        self.fingerprints = None
Example #28
def crawl(url, seen=None):
    print(f'crawling: {url}')
    if not seen:
        seen = BloomFilter(capacity=50000, error_rate=0.0001)

    with Timeout(5, False):
        try:
            response = requests.get(url)
        except requests.exceptions.RequestException:
            return

    location = domain(url)
    wanted_urls = []
    for url_match in url_regex.finditer(response.text):
        url = url_match.group(0)
        # To not destroy the internet, we only fetch URLs on the same domain.
        if url not in seen and location in domain(url):
            wanted_urls.append(url)
            seen.add(url)

    subtasks = group(crawl.s(url, seen) for url in wanted_urls)
    subtasks.delay()
Example #29
class Spider(Thread, Downloads):
    def __init__(self, key_queue, save_queue):
        super(Spider, self).__init__()
        self.key_queue = key_queue  # crawl keyword queue
        self.save_queue = save_queue  # keyword save queue
        self.bloom = BloomFilter(capacity=1e7, error_rate=0.001)  # dedupe filter for collected keywords

    def run(self):
        while True:
            try:
                kw = self.key_queue.get()  # pull one keyword from the queue

                # Skip keywords the filter has already seen; otherwise record them
                if kw in self.bloom:
                    continue
                self.bloom.add(kw)

                # Download the page source
                source = self.download(kw)
                if source is None:
                    continue

                # Extract content from the source
                self.parse_html(source)

            finally:
                self.key_queue.task_done()  # always mark the queue task done

    def parse_html(self, source):
        # related    //table//tr//th/a/text()
        elt = etree.HTML(source)

        # Related keywords
        related_list = elt.xpath('//table//tr//th/a/text()')
        for related in related_list:
            # Convert lxml smart strings to plain str so the parsed tree can be
            # garbage-collected instead of accumulating in memory
            str_related = str(related)
            self.key_queue.put(str_related)
            self.save_queue.put(str_related)
Example #30
def bloom_file_init():
    path = '../spiders/sites.blm'
    # Load the bloom file if it exists
    if os.path.exists(path):
        with open(path, 'rb') as f:
            bf = BloomFilter.fromfile(f)
    # Otherwise create a new filter; it is written to disk at the end
    else:
        bf = BloomFilter(10000000, 0.01)

    with MongoClient(get_project_settings()['MONGODB_URL']) as client:
        sites_coll = client.site.sites
        sites_unverified_coll = client.site.sites_unverified
        for x in sites_coll.find():
            result = bf.add(x['url'])
            print(x['url'], ' ', result)
        for x in sites_unverified_coll.find({}):
            result = bf.add(x['url'])
            print(x['url'], ' ', result)

    with open(path, 'wb') as f:
        bf.tofile(f)
Example #31
	# Calculate sourmash estimate of Jaccard index
	E1 = MH.CountEstimator(n=h, max_prime=prime, ksize=ksize, save_kmers='y')
	E2 = MH.CountEstimator(n=h, max_prime=prime, ksize=ksize, save_kmers='y')
	E1.add_sequence(seq1)
	E2.add_sequence(seq2)
	estimate_jaccard = E1.jaccard(E2)
	estimate_jaccards[it] = estimate_jaccard

	# Containment version.
	# Bloom filter
	f = BloomFilter(capacity=i_size+n1, error_rate=p)
	len_kmers_1 = 0
	for val in kmers1:
		if val not in f:
			len_kmers_1 += 1
			f.add(val)
	#len_kmers_1 *= (1 - p)  # adjust for the false positive rate, shouldn't need to do this as I'm just adding elements
	int_est = 0
	for val in E2._kmers:
		#if val in f:  # in python2, no distinguishing between byte and utf-8 string
		if val:  # skip empty kmer slots
			if val.decode("utf-8") in f:
				int_est += 1
	int_est -= p*h  # adjust for the false positive rate
	containment_est = int_est / float(h)

	# Calculate the containment estimate of jaccard, len(kmers2) is exact (as in practice this is part of the training
	# database and so only needs to be done once (and the genomes are relatively small so this is no big cost)
	containment_est_jaccard = \
		len(kmers2) * containment_est / \
		(len(kmers2) + len_kmers_1 - len(kmers2) * containment_est)