Python BloomFilter Examples

Programming Language: Python

Namespace/Package Name: bloompy

Class/Type: BloomFilter

Examples at hotexamples.com: 8

Python BloomFilter - 8 examples found. These are the top rated real world Python examples of bloompy.BloomFilter extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

BloomFilter(5)

add(5)

fromfile(1)

tofile(1)

Example #1

Show file

 def __init__(self, depth: int, max_url_nums: int, cookies: str, exclude_urls: List[str],
              domain_reg_list: List[str],
              path_dicts: List[str], header: dict = None):
     self.domain_reg = ''
     self.domain_reg_list = domain_reg_list
     self.complement = 0
     self.depth = 5 if not depth else depth
     self.max_url_nums = 5000 if not max_url_nums else max_url_nums
     self.cookie = cookies
     self.exclude_urls = [url.replace('*', '\\S*') for url in exclude_urls]
     self.url_dict = dict()
     self.url_cache = BloomFilter(element_num=max_url_nums * 5, error_rate=0.01)
     self.current_depth = 0
     self.current_crawl_queue = list()
     self.next_crawl_queue = list()
     self.max_queue_length = self.max_url_nums + 1000
     self.header = header
     self.path_dicts = path_dicts
     self.header = header
     self.filter_exts = [
         'css', 'png', 'gif', 'jpg', 'jpeg', 'swf', 'tiff',
         'pdf', 'ico', 'flv', 'mp4', 'mp3', 'avi', 'mpg', 'gz',
         'mpeg', 'iso', 'dat', 'mov', 'rar', 'exe', 'zip', 'tar',
         'bin', 'bz2', 'xsl', 'doc', 'docx', 'ppt', 'pptx', 'xls',
         'xlsx', 'csv', 'map', "ttf", 'tif', 'woff', 'woff2',
         'cab', 'apk', 'bmp', 'svg', 'exif', 'xml', 'rss', 'webp'
     ]
     self.exclude_urls_reg_str = ''

Example #2

Show file

def test_speed():
    
    n = 10000
    p = 0.0001
    b = BloomFilter(n, p)
    print b
    
    strings = set()
    string_size = 20
    for i in range(n):
    	string = ""
    	for j in range(string_size):
    		string += chr(random.randint(0, 255))
        strings.add(string)
    
    total_time = 0
    starttime = time.time()
    for string in strings:
    	b.add(string)
    total_time = (time.time() - starttime)
    	
    ns = float(len(strings))
    k = float(b.k)
    total_time = float(total_time)
    
    print "Number of hash functions: %d" % b.k
    print "Speed per hash: %f seconds" % (total_time / ns / k)
    print "Speed per add: %f seconds" % (total_time / ns)

Example #3

Show file

 def __init__(self, cookie: str = None, headers: dict = None, max_num: int = 10000, domain_regs: list = None,
              depth: int = 5):
     self.cookie = cookie
     self.headers = headers if headers else DEFAULT_HEADERS
     self.waiting_queue = Manager().Queue(maxsize=max_num * 2)
     self.current_queue = Manager().Queue(maxsize=max_num * 2)
     self.max_url_num = max_num
     self.crawled_urls = BloomFilter(element_num=max_num * 5, error_rate=0.01)
     self.url_dict = Manager().dict()
     self.domain_reg_list = domain_regs
     self.depth = depth
     self.current_depth = 0
     self.filter_exts = [
         'css', 'png', 'gif', 'jpg', 'jpeg', 'swf', 'tiff',
         'pdf', 'ico', 'flv', 'mp4', 'mp3', 'avi', 'mpg', 'gz',
         'mpeg', 'iso', 'dat', 'mov', 'rar', 'exe', 'zip', 'tar',
         'bin', 'bz2', 'xsl', 'doc', 'docx', 'ppt', 'pptx', 'xls',
         'xlsx', 'csv', 'map', "ttf", 'tif', 'woff', 'woff2',
         'cab', 'apk', 'bmp', 'svg', 'exif', 'xml', 'rss', 'webp', 'js'
     ]

Example #4

Show file

def test_error_rate():
    n = 10000
    p = 0.001
    b = BloomFilter(n, p)
    print "Creating BloomFilter for %d elements and false positive probability = %f ..." % (n, p)
    print "Optimal values are m = %d, k = %d" % (b.m, b.k)
    elt = 'apple'
    
    print "Testing..."
    assert elt not in b
    
    print "After adding '%s'..." % elt
    b.add(elt)
    
    print "Testing..."
    assert elt in b
    
    # create random strings
    strings = set()
    string_size = 20
    for i in range(n):
    	string = ""
    	for j in range(string_size):
    		string += chr(random.randint(0, 255))
    	strings.add(string)
    
    # other strings
    other_strings = set()
    for i in range(n):
    	string = ""
    	for j in range(string_size):
    		string += chr(random.randint(0, 255))
    	other_strings.add(string)
    
    # add all to set
    for s in list(strings):
    	b.add(s)
    
    # test for collisions
    other_strings = other_strings - strings
    collisions = 0
    for s in list(other_strings):
    	if s in b:
    		collisions += 1
    
    print "False positive rate was %d / %d = %f" % (
    	collisions, len(other_strings), 
    	float(collisions) / float(len(other_strings)))

Example #5

Show file

 def setUp(self):
     self.bf = BloomFilter(0.001,10**3)

Example #6

Show file

class TestBloomFilter(unittest.TestCase):

    def setUp(self):
        self.bf = BloomFilter(0.001,10**3)

    def test_init(self):
        self.assertTrue(isinstance(self.bf.bit_array,bitarray.bitarray))
        self.assertTrue(self.bf.error_rate>0)
        self.assertTrue(all(is_prime(i)for i in self.bf.seeds))
        self.assertTrue(self.bf.bit_array.count() == 0)

    def test_add(self):
        self.assertFalse(self.bf.add(12))
        self.assertFalse(self.bf.add(-12))
        self.assertFalse(self.bf.add(12.0))
        self.assertFalse(self.bf.add('12'))
        self.assertFalse(self.bf.add([12]))
        self.assertFalse(self.bf.add((12,)))
        self.assertFalse(self.bf.add({12:''}))
        self.assertFalse(self.bf.add(b'12'))
        self.assertFalse(self.bf.add(type('12',(),{})))

        self.assertTrue(self.bf.add(12))
        self.assertTrue(self.bf.add(-12))
        self.assertTrue(self.bf.add(12.0))
        self.assertTrue(self.bf.add('12'))
        self.assertTrue(self.bf.add([12]))
        self.assertTrue(self.bf.add((12,)))
        self.assertTrue(self.bf.add({12: ''}))
        self.assertTrue(self.bf.add(b'12'))
        self.assertTrue(self.bf.add(type('12', (), {})))

    def test_exists(self):
        self.bf.add(12)
        self.assertTrue((12 in self.bf))
        self.assertFalse('12' in self.bf)

    def test_tofile(self):
        self.assertFalse(self.bf.tofile(r'test\bf.bf'))

    def test_fromfile(self):
        self.assertTrue(isinstance(self.bf.fromfile(r'test\bf.bf'),BloomFilter))

Example #7

Show file

class Crawler(object):
    def __init__(self, cookie: str = None, headers: dict = None, max_num: int = 10000, domain_regs: list = None,
                 depth: int = 5):
        self.cookie = cookie
        self.headers = headers if headers else DEFAULT_HEADERS
        self.waiting_queue = Manager().Queue(maxsize=max_num * 2)
        self.current_queue = Manager().Queue(maxsize=max_num * 2)
        self.max_url_num = max_num
        self.crawled_urls = BloomFilter(element_num=max_num * 5, error_rate=0.01)
        self.url_dict = Manager().dict()
        self.domain_reg_list = domain_regs
        self.depth = depth
        self.current_depth = 0
        self.filter_exts = [
            'css', 'png', 'gif', 'jpg', 'jpeg', 'swf', 'tiff',
            'pdf', 'ico', 'flv', 'mp4', 'mp3', 'avi', 'mpg', 'gz',
            'mpeg', 'iso', 'dat', 'mov', 'rar', 'exe', 'zip', 'tar',
            'bin', 'bz2', 'xsl', 'doc', 'docx', 'ppt', 'pptx', 'xls',
            'xlsx', 'csv', 'map', "ttf", 'tif', 'woff', 'woff2',
            'cab', 'apk', 'bmp', 'svg', 'exif', 'xml', 'rss', 'webp', 'js'
        ]

    def run(self, urls):
        self.consist_headers()
        # 默认只爬取当前根域下的url
        self.domain_reg_list = self.parse_domain(urls) if not self.domain_reg_list else self.domain_reg_list
        self._init_reg()
        for url in urls:
            self.call_crawl_handler(url)
        print('all task done')
        print(self.url_dict)

    def call_crawl_handler(self, url):
        if 'http' not in url:
            init_url = 'http://' + url
        else:
            init_url = url
        self.current_queue.put_nowait(init_url)
        # 初始化url为了避免重复爬取，在初始化时就放入布隆过滤器
        init_url_without_protocol = url.split('//')[-1]
        self.crawled_urls.add(init_url_without_protocol)
        while self.current_depth < self.depth:
            if len(self.url_dict) >= self.max_url_num:
                print('达到预设爬去上限, 爬虫结束')
                break
            print('now crawl depth is :{}'.format(self.current_depth))
            tmp_results = []
            # 利用进程池去完成爬虫
            pool = Pool(os.cpu_count() * 2)
            while not self.current_queue.empty():
                print('当前在待爬队列中还有:{}个url'.format(self.current_queue.qsize()))
                url = self.current_queue.get_nowait()
                if not url.endswith('js'):
                    result = pool.apply_async(func=self.crawl_handler, args=(url,))
                    tmp_results.append(result)
                    # self.crawl_handler(url)
            pool.close()
            pool.join()
            tmp_reqs = []
            for result in tmp_results:
                for r in result.get():
                    tmp_reqs.append(r)
            self._handle_url(tmp_reqs)
            self.current_queue = self.waiting_queue
            self.waiting_queue = Manager().Queue(maxsize=self.max_url_num * 2)
            self.current_depth += 1
            print('depth:{} crawled done'.format(self.current_depth))

    def consist_headers(self):
        if self.cookie:
            self.headers['Cookie'] = self.cookie

    @staticmethod
    def parse_domain(domain_list):
        """
        需要将输入的url或者域名解析成域名, 用于后续同域判断等操作
        :param domain_list:
        :return:
        """

        def _split_url_protocol_and_path(domain):
            # 去掉协议
            if '://' in domain:
                domain = domain.split('://')[1]
            # 截取路径
            if '.com.cn' in domain:
                return domain.split('.com.cn')[0] + '.com.cn'
            if '.com' in domain:
                return domain.split('.com')[0] + '.com'
            if '.io' in domain:
                return domain.split('.xyz')[0] + '.io'
            # 针对ip:port形式的url,截取/即可
            return domain.split('/')[0]

        return [_split_url_protocol_and_path(domain) for domain in domain_list]

    @staticmethod
    def _parse_post_data(post_data) -> str:
        """
        解析动态请求获取里面的data成一个字符串
        :param post_data:
        :return:
        """
        if not post_data:
            return ''
        if not isinstance(post_data, dict):
            if '=' in post_data:
                param_dict = {}
                if '&' in post_data:
                    params_couples = post_data.split('&')
                    for param in params_couples:
                        if '=' not in param:
                            continue
                        k, v = param.split('=')
                        param_dict[k] = v
                else:
                    k, v = post_data.split('=')
                    param_dict[k] = v
                post_data = param_dict
            else:
                post_data = json.loads(post_data)
        post_data_list = [k for k, _ in post_data.items()]
        post_data_list.sort()
        return ''.join([param + '&' for param in post_data_list])[:-1]

    @staticmethod
    def parse_static_url(url):
        """
        把解析到的静态url, 重新组合成一个字典
        {
            'url': 'xxxxxx',
            'originUrl': 'xxxxxx/a=aa',
            'method': 'GET',
            'queryString': 'a=aa'
        }
        :param url:
        :return:
        """
        try:
            req = dict()
            req['method'] = 'GET'
            req['originUrl'] = url
            if '?' not in url:
                req['url'] = url
                return req
            url_consist = url.split('?')
            req['url'] = url_consist[0]
            params = url_consist[1]
            if '&' not in params:
                params_consist = params.split('=')
                req['queryString'] = params_consist[0] if params_consist[0] else ''
                return req
            multi_params = params.split('&')
            params_list = list(map(lambda y: y.split('=')[0], filter(lambda x: '=' in x, multi_params)))
            # 按首字母把参数排序
            params_list.sort()
            req['queryString'] = ''.join([key + '=&' for key in params_list])[:-2]
            return req
        except Exception:
            msg = traceback.format_exc()
            print(msg)
            return None

    def _init_reg(self):
        """
        根据解析出来的域名拼接一个正则, 用于同域校验
        :return:
        """
        domain_reg = ['^']
        domain_reg.extend(['(http|https):\/\/' + domain.replace('.', '\.') + '.*|' for domain in self.domain_reg_list])
        # domain_reg.extend(map(lambda x: '(http|https):\/\/' + x.replace('.', '\.') + '.*|', self.domain_reg_list))
        tmp_domain_reg = ''.join(domain_reg)
        self.domain_reg = tmp_domain_reg[:-1] + '$'

    def filter_ext(self, url):
        """
        过滤掉特殊后缀的url, 如一些静态资源等等
        如果存在url的后缀是需要排除的，则排除
        :param url:
        :return:
        """
        try:
            f = url.split('/')[-1].strip()
            if '.' in f:
                ext = f.split('.')[-1].strip().lower()
                if ext and ext in self.filter_exts:
                    return True
                else:
                    return False
            return False
        except Exception:
            msg = traceback.format_exc()
            print(msg)
            return False

    def filter_url_by_domain(self, url):
        """
        检验当前的url是否满足条件
        是  的url以及不在[不需要]的url集合里,返回True.不满足要求，返回false
        :param url:
        :return:
        """
        # 校验域名
        if not re.match(self.domain_reg, url, flags=0):
            return False
        # TODO: 后续补齐这部分功能
        # if len(self.exclude_urls) == 0:
        #    return True
        # 校验exclude_urls
        # if re.match(self.exclude_urls_reg_str, url, flags=0):
        #    return False
        return True

    def static_crawler(self, page, results, url) -> List["ElementHandle"]:
        """
        主要用于页面中静态url的解析, 目前涵盖了a标签的href属性和src属性
        """
        links = page.query_selector_all("//a")
        tmp_link = []
        for link in links:
            href = link.get_property("href").json_value()
            src = link.get_property("src").json_value()
            if not href or href == url:
                continue
            if not self.filter_ext(url=href) and self.filter_url_by_domain(url=href):
                req = self.parse_static_url(href)
                if req:
                    print('href:{}'.format(req))
                    results.append(req)

            if not src or src == url:
                continue
            if not self.filter_ext(url=src) and self.filter_url_by_domain(url=src):
                req = self.parse_static_url(src)
                if req:
                    print('src:{}'.format(req))
                    results.append(req)

            # 这里主要是用于有些a标签里的写法是<javascript>标签，用于执行某些js操作
            if 'javascript' in href or 'javascript' in src:
                tmp_link.append(link)
        return tmp_link

    def _check_crawled_url(self, url) -> bool:
        """
        检查是否已爬取,不存在,则返回True
        :param url:
        :return:
        """
        if url in self.crawled_urls:
            return False
        return True

    def _check_url_is_exist_by_md5(self, url_dict):
        """
        利用MD5去检查url是否重复
        :param url_dict:
        :return:
        """
        try:
            exist_md5 = list(url_dict.keys())[0]
            if exist_md5 in self.crawled_urls:
                return False
            return True
        except Exception:
            msg = traceback.format_exc()
            print(msg)
            return True

    @staticmethod
    def calculate_md5(url_har):
        """
        计算md5来去重
        :param url_har:
        :return:
        """
        url = url_har['url']
        # 有些post请求后缀会加timestamp时间戳来防重放
        tmp_list = url.split('//')[-1].split('?')
        url_without_protocol = tmp_list[0] if len(tmp_list) > 1 else url.split('//')[-1]
        method = url_har['method']
        query_string = ''
        post_data = ''
        if 'queryString' in url_har:
            query_string = url_har['queryString']
        if 'postData' in url_har:
            post_data = url_har['postData']
        tmp_str = url_without_protocol + '&' + method + '&' + query_string + post_data
        return hashlib.md5(tmp_str.encode('utf-8')).hexdigest()

    def _handle_url(self, req_list):
        """
        处理爬到的url, 看看是不是需要过滤或者是不是已经爬取过了
        :param req_list:
        :return:
        """
        if not req_list:
            return
        insert_req_list = list()
        for req in req_list:
            url = req['originUrl']
            if url.endswith('/'):
                url = url[:-1]
            url_without_protocol = url.split('//')[-1]
            '''
            解析完成后,返回的结构体包括:url,queryString(if exist),method
            需要对url做判断：
            1、是否存在于最后的url集合里
            2、是否已爬过
            3、url的后缀是否在需要过滤的集合里(最先判断,如果需要过滤则直接忽略)
            '''
            md5 = self.calculate_md5(req)
            tmp_dict = {
                md5: req
            }
            if self._check_url_is_exist_by_md5(tmp_dict):
                if len(self.url_dict.keys()) < self.max_url_num:
                    self.url_dict[md5] = req
                    # TODO:后面可以定制化插入taskId
                    insert_req_list.append({'taskId': 'test12', 'urlDict': json.dumps(req)})
                # 如果没有爬过,则放入下一轮要爬取的队列里
                if self._check_crawled_url(url_without_protocol) and not self.waiting_queue.full():
                    self.waiting_queue.put_nowait(req['originUrl'])
                    self.crawled_urls.add(url_without_protocol)

    def crawl_handler(self, url) -> list:
        result = []

        def intercept(route: Route, request: Request):
            # 拦截前端跳转,主要方法是修改请求响应为204 TODO: 后续在遇到前端跳转的时候，优化hook逻辑
            if request.is_navigation_request() and request.frame.parent_frame:
                request.response().status = 204
                route.continue_()
                return
            # 尝试拦截后端跳转
            if request.redirected_to:
                if request.post_data_json:
                    request.response().status = 200
                    self.waiting_queue.put_nowait(request.redirected_to.url)
                else:
                    ...
                route.continue_()
                return
            resource_type = request.resource_type
            # 过滤动态请求
            if resource_type in ['image', 'media', 'eventsource', 'websocket']:
                route.abort()
            else:
                url_origin = request.url
                if not url_origin:
                    route.continue_()
                    return
                if not self.filter_ext(url=url_origin) and self.filter_url_by_domain(url=url_origin):
                    headers = request.headers
                    method = request.method
                    post_data_json: dict = request.post_data_json
                    http_har = dict()
                    if method == 'POST' or method == 'PUT':
                        post_data_origin = post_data_json
                        post_data_handled = self._parse_post_data(post_data_origin)
                        content_type = headers['content-type'] if 'content-type' in headers else ''
                        http_har['originPostData'] = post_data_origin
                        http_har['postData'] = post_data_handled
                        http_har['contentType'] = content_type
                        http_har['url'] = url_origin
                        http_har['originUrl'] = url_origin
                        http_har['method'] = method
                    if method == 'GET':
                        http_har = self.parse_static_url(url_origin)
                    result.append(http_har)
                route.continue_()

        with sync_playwright() as p:
            browser = p.webkit.launch(headless=True, chromium_sandbox=True, )
            page = browser.new_page()
            page.set_default_navigation_timeout(30000)
            page.set_extra_http_headers(self.headers)
            page.route('**/*', intercept)
            page.goto(url)
            page.wait_for_load_state(state='networkidle', timeout=30000)

            tmp_links = self.static_crawler(page, result, url)
            page.evaluate(FORM_FILL_UPLOAD_JS)
            for link in tmp_links:
                link.click()
            page.close()
            browser.close()
        return result

Example #8

Show file

class Crawler(object):

    def __init__(self, depth: int, max_url_nums: int, cookies: str, exclude_urls: List[str],
                 domain_reg_list: List[str],
                 path_dicts: List[str], header: dict = None):
        self.domain_reg = ''
        self.domain_reg_list = domain_reg_list
        self.complement = 0
        self.depth = 5 if not depth else depth
        self.max_url_nums = 5000 if not max_url_nums else max_url_nums
        self.cookie = cookies
        self.exclude_urls = [url.replace('*', '\\S*') for url in exclude_urls]
        self.url_dict = dict()
        self.url_cache = BloomFilter(element_num=max_url_nums * 5, error_rate=0.01)
        self.current_depth = 0
        self.current_crawl_queue = list()
        self.next_crawl_queue = list()
        self.max_queue_length = self.max_url_nums + 1000
        self.header = header
        self.path_dicts = path_dicts
        self.header = header
        self.filter_exts = [
            'css', 'png', 'gif', 'jpg', 'jpeg', 'swf', 'tiff',
            'pdf', 'ico', 'flv', 'mp4', 'mp3', 'avi', 'mpg', 'gz',
            'mpeg', 'iso', 'dat', 'mov', 'rar', 'exe', 'zip', 'tar',
            'bin', 'bz2', 'xsl', 'doc', 'docx', 'ppt', 'pptx', 'xls',
            'xlsx', 'csv', 'map', "ttf", 'tif', 'woff', 'woff2',
            'cab', 'apk', 'bmp', 'svg', 'exif', 'xml', 'rss', 'webp'
        ]
        self.exclude_urls_reg_str = ''

    @staticmethod
    def parse_domain(domain_list):
        """
        需要将输入的url或者域名解析成域名, 用于后续同域判断等操作
        :param domain_list:
        :return:
        """

        def _split_url_protocol_and_path(domain):
            # 去掉协议
            if '://' in domain:
                domain = domain.split('://')[1]
            # 截取路径
            if '.com.cn' in domain:
                return domain.split('.com.cn')[0] + '.com.cn'
            if '.com' in domain:
                return domain.split('.com')[0] + '.com'
            if '.xyz' in domain:
                return domain.split('.xyz')[0] + '.xyz'
            # 针对ip:port形式的url,截取/即可
            return domain.split('/')[0]

        return [_split_url_protocol_and_path(domain) for domain in domain_list]

    def _init_reg(self):
        """
        根据解析出来的域名拼接一个正则, 用于同域校验
        :return:
        """
        domain_reg = ['^']
        if '*.  .com.cn' in self.domain_reg_list or '*.  .com' in self.domain_reg_list:
            domain_reg.append('.*\.  \.com.*$')
            self.domain_reg = ''.join(domain_reg)
            return
        domain_reg.extend(['(http|https):\/\/' + domain.replace('.', '\.') + '.*|' for domain in self.domain_reg_list])
        # domain_reg.extend(map(lambda x: '(http|https):\/\/' + x.replace('.', '\.') + '.*|', self.domain_reg_list))
        tmp_domain_reg = ''.join(domain_reg)
        self.domain_reg = tmp_domain_reg[:-1] + '$'

    def run(self, domain_list):
        if not isinstance(domain_list, list):
            raise Exception('domains must be list')
        self.domain_reg_list = self.parse_domain(domain_list) if not self.domain_reg_list else self.domain_reg_list
        self._init_reg()
        self._consist_exclude_urls_regex()
        # for domain in domain_list:
        #     self.crawl_url(domain)
        tasks = [gevent.spawn(self.crawl_url(domain)) for domain in domain_list]
        gevent.joinall(tasks)
        print('all tasks done')
        print(self.url_dict)

    def crawl_url(self, domain):
        print('enter crawler:{}'.format(domain))
        if 'http' not in domain:
            init_url = 'http://' + domain
        else:
            init_url = domain

        self.current_crawl_queue.append(init_url)
        while self.current_depth < self.depth:
            if len(self.url_dict.keys()) >= self.max_url_nums:
                break
            print('now depth is:{}'.format(self.current_depth))
            tasks = [gevent.spawn(self._crawler_handler, url) for url in self.current_crawl_queue if
                     not url.endswith('.js')]
            gevent.joinall(tasks)
            # 将下一轮待爬取的url提升到当前
            self.current_crawl_queue = self.next_crawl_queue
            self.current_depth += 1

    def filter_url_by_domain(self, url):
        """
        检验当前的url是否满足条件
        是  的url以及不在[不需要]的url集合里,返回True.不满足要求，返回false
        :param url:
        :return:
        """
        # 校验域名
        if not re.match(self.domain_reg, url, flags=0):
            return False
        if len(self.exclude_urls) == 0:
            return True
        # 校验exclude_urls
        if re.match(self.exclude_urls_reg_str, url, flags=0):
            return False
        return True

    def filter_ext(self, url):
        """
        过滤掉特殊后缀的url, 如一些静态资源等等
        如果存在url的后缀是需要排除的，则排除
        :param url:
        :return:
        """
        try:
            f = url.split('/')[-1].strip()
            if '.' in f:
                ext = f.split('.')[-1].strip().lower()
                if ext and ext in self.filter_exts:
                    return True
                else:
                    return False
            return True
        except Exception as e:
            msg = traceback.format_exc()
            print(msg)
            return False

    def _check_crawled_url(self, url):
        """
        检查是否已爬取,不存在,则返回True
        :param url:
        :return:
        """
        if url in self.url_cache:
            return False
        return True

    def _check_url_is_exist_by_md5(self, url_dict):
        """
        利用MD5去检查url是否重复
        :param url_dict:
        :return:
        """
        try:
            exist_md5 = list(url_dict.keys())[0]
            if exist_md5 in self.url_dict:
                return False
            return True
        except Exception as e:
            msg = traceback.format_exc()
            print(msg)
            return True

    def _handle_url(self, req_list):
        """
        处理爬到的url, 看看是不是需要过滤或者是不是已经爬取过了
        :param req_list:
        :return:
        """
        if not req_list:
            return
        insert_req_list = list()
        for req in req_list:
            url = req['originUrl']
            if url.endswith('/'):
                url = url[:-1]
            url_without_protocol = url.split('//')[-1]
            '''
            解析完成后,返回的结构体包括:url,queryString(if exist),method
            需要对url做判断：
            1、是否存在于最后的url集合里
            2、是否已爬过
            3、url的后缀是否在需要过滤的集合里(最先判断,如果需要过滤则直接忽略)
            '''
            md5 = self.calculate_md5(req)
            tmp_dict = {
                md5: req
            }
            if self._check_url_is_exist_by_md5(tmp_dict):
                if len(self.url_dict.keys()) < self.max_url_nums:
                    self.url_dict[md5] = req
                    insert_req_list.append({'taskId': self.task_id, 'urlDict': json.dumps(req)})
                # 如果没有爬过,则放入下一轮要爬取的队列里
                if self._check_crawled_url(url_without_protocol):
                    # 待爬队列已满,则不往队列里添加元素
                    if len(self.next_crawl_queue) < self.max_queue_length:
                        self.next_crawl_queue.append(req['originUrl'])
                        self.url_cache.add(url_without_protocol)

    # 计算MD5
    @staticmethod
    def calculate_md5(url_har):
        url = url_har['url']
        # 有些post请求后缀会加timestamp时间戳来防重放
        tmp_list = url.split('//')[-1].split("?")
        url_without_protocol = tmp_list[0] if len(tmp_list) > 1 else url.split('//')[-1]
        method = url_har['method']
        query_string = ''
        post_data = ''
        if 'queryString' in url_har:
            query_string = url_har['queryString']
        if 'postData' in url_har:
            post_data = url_har['postData']
        tmp_str = url_without_protocol + '&' + method + '&' + query_string + post_data
        return hashlib.md5(tmp_str.encode('utf-8')).hexdigest()

    @staticmethod
    def parse_static_url(url):
        """
        把解析到的静态url, 重新组合成一个字典
        {
            'url': 'xxxxxx',
            'originUrl': 'xxxxxx/a=aa',
            'method': 'GET',
            'queryString': 'a=aa'
        }
        :param url:
        :return:
        """
        try:
            req = dict()
            req['method'] = 'GET'
            req['originUrl'] = url
            if '?' not in url:
                req['url'] = url
                return req
            url_consist = url.split('?')
            req['url'] = url_consist[0]
            params = url_consist[1]
            if '&' not in params:
                params_consist = params.split('=')
                req['queryString'] = params_consist[0] if params_consist[0] else ''
                return req
            multi_params = params.split('&')
            params_list = list(map(lambda y: y.split('=')[0], filter(lambda x: '=' in x, multi_params)))
            # 按首字母把参数排序
            params_list.sort()
            req['queryString'] = ''.join([key + '=&' for key in params_list])[:-2]
            return req
        except Exception as e:
            msg = traceback.format_exc()
            print(msg)
            return None

    @staticmethod
    def _parse_post_data(post_data) -> str:
        """
        解析动态请求获取里面的data成一个字符串
        :param post_data:
        :return:
        """
        if not post_data:
            return ''
        if not isinstance(post_data, dict):
            if '=' in post_data:
                param_dict = {}
                if '&' in post_data:
                    params_couples = post_data.split('&')
                    for param in params_couples:
                        if '=' not in param:
                            continue
                        k, v = param.split('=')
                        param_dict[k] = v
                else:
                    k, v = post_data.split('=')
                    param_dict[k] = v
                post_data = param_dict
            else:
                post_data = json.loads(post_data)
        post_data_list = [k for k, __ in post_data.items()]
        post_data_list.sort()
        return ''.join([param + '&' for param in post_data_list])[:-1]

    def _consist_exclude_urls_regex(self):
        self.exclude_urls_reg_str = '|'.join(self.exclude_urls) if len(self.exclude_urls) else ''
        print('exclude url reg is'.format(self.exclude_urls_reg_str))

    def static_crawler(self, page, results, url) -> List["ElementHandle"]:
        """
        主要用于页面中静态url的解析, 目前涵盖了a标签的href属性和src属性
        """
        links = page.query_selector_all("//a")
        tmp_link = []
        for link in links:
            href = link.get_property("href").json_value()
            src = link.get_property("src").json_value()
            if not href or href == url:
                continue
            if self.filter_ext(url=href) and self.filter_url_by_domain(url=href):
                req = self.parse_static_url(href)
                if req:
                    print('href:{}'.format(req))
                    results.append(req)
            if not src or src == url:
                continue
            if self.filter_ext(url=src) and self.filter_url_by_domain(url=src):
                req = self.parse_static_url(src)
                if req:
                    print('src:{}'.format(req))
                    results.append(req)
            if 'javascript' in href or 'javascript' in src:
                tmp_link.append(link)
        return tmp_link

    def _crawler_handler(self, url):
        print('start crawling url:{}'.format(url))
        results = []

        def log_and_continue_request(route: Route, request: Request):
            resource_type = request.resource_type
            '''请求过滤'''
            if resource_type in ['image', 'media', 'eventsource', 'websocket']:
                route.abort()
            else:
                url_origin = request.url
                headers = request.headers
                method = request.method
                post_data_json = request.post_data_json
                print(url_origin, headers, method, post_data_json)
                if not url_origin:
                    route.continue_()
                    return
                if not self.filter_url_by_domain(url_origin) or self.filter_ext(url_origin):
                    route.continue_()
                    return
                http_har = dict()
                if method == 'POST' or method == 'PUT':
                    post_data_origin = post_data_json
                    post_data_handled = self._parse_post_data(post_data_origin)
                    content_type = headers['content-type'] if 'content-type' in headers else ''
                    http_har['originPostData'] = post_data_origin
                    http_har['postData'] = post_data_handled
                    http_har['contentType'] = content_type
                    http_har['url'] = url_origin
                    http_har['originUrl'] = url_origin
                    http_har['method'] = method
                if method == 'GET':
                    http_har = self.parse_static_url(url_origin)
                results.append(http_har)
                route.continue_()

        with sync_playwright() as p:
            browser = p.webkit.launch(headless=True, chromium_sandbox=True, )
            page = browser.new_page()
            page.set_default_navigation_timeout(30000)
            if self.cookie:
                self.header['Cookie'] = self.cookie
            page.set_extra_http_headers(headers=self.header)
            page.route('**/*', log_and_continue_request)
            page.goto(url)
            page.wait_for_load_state(state="networkidle", timeout=30000)

            tmp_link = self.static_crawler(page, results, url)
            for link in tmp_link:
                link.click()
                self.static_crawler(page, results, url)

            browser.close()

        self._handle_url(results)