def data_statistical(path, subset: str):
    data = pd.read_csv(path, iterator=True, dtype=str)
    loop = True
    indicator = 0
    df_stat = pd.DataFrame(
        columns=["start", "stop", "begin", "end", "min", "max", "mean"])
    process_pool = Pool(Processes)                # worker process pool
    queue_materials = Manager().Queue(Processes)  # "raw material" (input) queue
    cnt_materials = 0                             # number of chunks dispatched
    queue_products = Manager().Queue(Processes)   # "finished product" (output) queue
    cnt_products = 0                              # number of results collected
    while loop:
        try:
            chunk = data.get_chunk(ChunkSize)
            chunk_length = len(chunk)
            # Material prepare
            while queue_materials.full():
                time.sleep(BlockingTime)
            queue_materials.put((chunk, indicator, subset))
            process_pool.apply_async(func=chunk_statistical,
                                     args=(queue_materials, queue_products))  # non-blocking async submit
            cnt_materials = cnt_materials + 1     # count the dispatched chunk
            # Product collect
            cnt_products, df_stat = product_collect(queue_products, df_stat, cnt_products)
            # Indicator
            indicator = indicator + chunk_length
            # Debugging
            if (indicator > (ChunkSize * Debugging)) and (Debugging > 0):
                loop = False
                print("Iteration is stopped by debugging.")
        except StopIteration:
            loop = False
            print("Iteration is stopped at {0}.".format(indicator))
    # Product collect
    while cnt_products < cnt_materials:
        cnt_products, df_stat = product_collect(queue_products, df_stat, cnt_products)
    process_pool.close()
    process_pool.join()
    return {
        "DataFrame": df_stat,
        "lines": indicator,
    }
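
# --- Usage sketch (illustrative, not part of the original code) ---
# A minimal driver for data_statistical(). It assumes the module-level names the
# function relies on (Processes, ChunkSize, BlockingTime, Debugging,
# chunk_statistical, product_collect) are defined elsewhere in this file; the
# CSV path and the "value" column below are hypothetical placeholders.
def demo_data_statistical():
    result = data_statistical("./example_data.csv", subset="value")
    print("lines processed:", result["lines"])
    print(result["DataFrame"].head())
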
class Crawler(object):
    def __init__(self, cookie: str = None, headers: dict = None, max_num: int = 10000,
                 domain_regs: list = None, depth: int = 5):
        self.cookie = cookie
        self.headers = headers if headers else DEFAULT_HEADERS
        self.waiting_queue = Manager().Queue(maxsize=max_num * 2)
        self.current_queue = Manager().Queue(maxsize=max_num * 2)
        self.max_url_num = max_num
        self.crawled_urls = BloomFilter(element_num=max_num * 5, error_rate=0.01)
        self.url_dict = Manager().dict()
        self.domain_reg_list = domain_regs
        self.depth = depth
        self.current_depth = 0
        self.filter_exts = [
            'css', 'png', 'gif', 'jpg', 'jpeg', 'swf', 'tiff', 'pdf', 'ico', 'flv',
            'mp4', 'mp3', 'avi', 'mpg', 'gz', 'mpeg', 'iso', 'dat', 'mov', 'rar',
            'exe', 'zip', 'tar', 'bin', 'bz2', 'xsl', 'doc', 'docx', 'ppt', 'pptx',
            'xls', 'xlsx', 'csv', 'map', 'ttf', 'tif', 'woff', 'woff2', 'cab', 'apk',
            'bmp', 'svg', 'exif', 'xml', 'rss', 'webp', 'js'
        ]

    def run(self, urls):
        self.consist_headers()
        # By default only crawl urls under the root domains of the seed urls
        self.domain_reg_list = self.parse_domain(urls) if not self.domain_reg_list else self.domain_reg_list
        self._init_reg()
        for url in urls:
            self.call_crawl_handler(url)
        print('all tasks done')
        print(self.url_dict)

    def call_crawl_handler(self, url):
        if 'http' not in url:
            init_url = 'http://' + url
        else:
            init_url = url
        self.current_queue.put_nowait(init_url)
        # Put the seed url (without protocol) into the Bloom filter up front to avoid re-crawling it
        init_url_without_protocol = url.split('//')[-1]
        self.crawled_urls.add(init_url_without_protocol)
        while self.current_depth < self.depth:
            if len(self.url_dict) >= self.max_url_num:
                print('Reached the preset crawl limit, stopping the crawler')
                break
            print('now crawl depth is :{}'.format(self.current_depth))
            tmp_results = []
            # Use a process pool to run the crawl handlers
            pool = Pool(os.cpu_count() * 2)
            while not self.current_queue.empty():
                print('{} urls left in the current queue'.format(self.current_queue.qsize()))
                url = self.current_queue.get_nowait()
                if not url.endswith('js'):
                    result = pool.apply_async(func=self.crawl_handler, args=(url,))
                    tmp_results.append(result)
                    # self.crawl_handler(url)
            pool.close()
            pool.join()
            tmp_reqs = []
            for result in tmp_results:
                for r in result.get():
                    tmp_reqs.append(r)
            self._handle_url(tmp_reqs)
            self.current_queue = self.waiting_queue
            self.waiting_queue = Manager().Queue(maxsize=self.max_url_num * 2)
            self.current_depth += 1
            print('depth:{} crawl done'.format(self.current_depth))

    def consist_headers(self):
        if self.cookie:
            self.headers['Cookie'] = self.cookie

    @staticmethod
    def parse_domain(domain_list):
        """
        Parse the input urls or domains into bare domains, used later for same-domain checks.
        :param domain_list:
        :return:
        """
        def _split_url_protocol_and_path(domain):
            # Strip the protocol
            if '://' in domain:
                domain = domain.split('://')[1]
            # Truncate the path
            if '.com.cn' in domain:
                return domain.split('.com.cn')[0] + '.com.cn'
            if '.com' in domain:
                return domain.split('.com')[0] + '.com'
            if '.io' in domain:
                return domain.split('.io')[0] + '.io'
            # For ip:port style urls, just cut at the first '/'
            return domain.split('/')[0]

        return [_split_url_protocol_and_path(domain) for domain in domain_list]

    @staticmethod
    def _parse_post_data(post_data) -> str:
        """
        Parse the data of a dynamic request into a single string.
        :param post_data:
        :return:
        """
        if not post_data:
            return ''
        if not isinstance(post_data, dict):
            if '=' in post_data:
                param_dict = {}
                if '&' in post_data:
                    params_couples = post_data.split('&')
                    for param in params_couples:
                        if '=' not in param:
                            continue
                        k, v = param.split('=')
                        param_dict[k] = v
                else:
                    k, v = post_data.split('=')
                    param_dict[k] = v
                post_data = param_dict
            else:
                post_data = json.loads(post_data)
        post_data_list = [k for k, _ in post_data.items()]
        post_data_list.sort()
        return ''.join([param + '&' for param in post_data_list])[:-1]

    @staticmethod
    def parse_static_url(url):
        """
        Re-assemble a parsed static url into a dict:
        {
            'url': 'xxxxxx',
            'originUrl': 'xxxxxx/a=aa',
            'method': 'GET',
            'queryString': 'a=aa'
        }
        :param url:
        :return:
        """
        try:
            req = dict()
            req['method'] = 'GET'
            req['originUrl'] = url
            if '?' not in url:
                req['url'] = url
                return req
            url_consist = url.split('?')
            req['url'] = url_consist[0]
            params = url_consist[1]
            if '&' not in params:
                params_consist = params.split('=')
                req['queryString'] = params_consist[0] if params_consist[0] else ''
                return req
            multi_params = params.split('&')
            params_list = list(map(lambda y: y.split('=')[0], filter(lambda x: '=' in x, multi_params)))
            # Sort the parameter names alphabetically
            params_list.sort()
            req['queryString'] = ''.join([key + '=&' for key in params_list])[:-2]
            return req
        except Exception:
            msg = traceback.format_exc()
            print(msg)
            return None

    def _init_reg(self):
        """
        Build a regex from the parsed domains, used for same-domain checks.
        :return:
        """
        domain_reg = ['^']
        domain_reg.extend([r'(http|https):\/\/' + domain.replace('.', r'\.') + '.*|'
                           for domain in self.domain_reg_list])
        # domain_reg.extend(map(lambda x: r'(http|https):\/\/' + x.replace('.', r'\.') + '.*|', self.domain_reg_list))
        tmp_domain_reg = ''.join(domain_reg)
        self.domain_reg = tmp_domain_reg[:-1] + '$'

    def filter_ext(self, url):
        """
        Filter out urls with special extensions, e.g. static resources.
        Return True if the url's extension is in the exclusion list.
        :param url:
        :return:
        """
        try:
            f = url.split('/')[-1].strip()
            if '.' in f:
                ext = f.split('.')[-1].strip().lower()
                if ext and ext in self.filter_exts:
                    return True
                else:
                    return False
            return False
        except Exception:
            msg = traceback.format_exc()
            print(msg)
            return False

    def filter_url_by_domain(self, url):
        """
        Check whether the url satisfies the conditions: it matches the allowed-domain
        regex and is not in the excluded url set. Return True if it does, False otherwise.
        :param url:
        :return:
        """
        # Same-domain check
        if not re.match(self.domain_reg, url, flags=0):
            return False
        # TODO: complete this part later
        # if len(self.exclude_urls) == 0:
        #     return True
        # Check exclude_urls
        # if re.match(self.exclude_urls_reg_str, url, flags=0):
        #     return False
        return True

    def static_crawler(self, page, results, url) -> List["ElementHandle"]:
        """
        Parse static urls on the page; currently covers the href and src properties of <a> tags.
        """
        links = page.query_selector_all("//a")
        tmp_link = []
        for link in links:
            href = link.get_property("href").json_value()
            src = link.get_property("src").json_value()
            if not href or href == url:
                continue
            if not self.filter_ext(url=href) and self.filter_url_by_domain(url=href):
                req = self.parse_static_url(href)
                if req:
                    print('href:{}'.format(req))
                    results.append(req)
            if not src or src == url:
                continue
            if not self.filter_ext(url=src) and self.filter_url_by_domain(url=src):
                req = self.parse_static_url(src)
                if req:
                    print('src:{}'.format(req))
                    results.append(req)
            # Some <a> tags use javascript: pseudo-urls to run js actions; collect them for clicking later
            if 'javascript' in href or 'javascript' in src:
                tmp_link.append(link)
        return tmp_link

    def _check_crawled_url(self, url) -> bool:
        """
        Check whether the url has been crawled; return True if it has not.
        :param url:
        :return:
        """
        if url in self.crawled_urls:
            return False
        return True

    def _check_url_is_exist_by_md5(self, url_dict):
        """
        Use the MD5 digest to check whether a url is a duplicate.
        :param url_dict:
        :return:
        """
        try:
            exist_md5 = list(url_dict.keys())[0]
            if exist_md5 in self.crawled_urls:
                return False
            return True
        except Exception:
            msg = traceback.format_exc()
            print(msg)
            return True

    @staticmethod
    def calculate_md5(url_har):
        """
        Compute an MD5 digest for deduplication.
        :param url_har:
        :return:
        """
        url = url_har['url']
        # Some POST requests append a timestamp to the url to prevent replay, so drop the query part here
        tmp_list = url.split('//')[-1].split('?')
        url_without_protocol = tmp_list[0] if len(tmp_list) > 1 else url.split('//')[-1]
        method = url_har['method']
        query_string = ''
        post_data = ''
        if 'queryString' in url_har:
            query_string = url_har['queryString']
        if 'postData' in url_har:
            post_data = url_har['postData']
        tmp_str = url_without_protocol + '&' + method + '&' + query_string + post_data
        return hashlib.md5(tmp_str.encode('utf-8')).hexdigest()

    def _handle_url(self, req_list):
        """
        Process the crawled urls: decide whether they need to be filtered or have already been crawled.
        :param req_list:
        :return:
        """
        if not req_list:
            return
        insert_req_list = list()
        for req in req_list:
            url = req['originUrl']
            if url.endswith('/'):
                url = url[:-1]
            url_without_protocol = url.split('//')[-1]
            '''
            After parsing, the returned structure contains: url, queryString (if present), method.
            The url needs to be checked:
            1. whether it already exists in the final url set
            2. whether it has already been crawled
            3. whether the url's extension is in the filter set (checked first; such urls are simply ignored)
            '''
            md5 = self.calculate_md5(req)
            tmp_dict = {
                md5: req
            }
            if self._check_url_is_exist_by_md5(tmp_dict):
                if len(self.url_dict.keys()) < self.max_url_num:
                    self.url_dict[md5] = req
                    # TODO: allow a customized taskId to be inserted later
                    insert_req_list.append({'taskId': 'test12', 'urlDict': json.dumps(req)})
            # If the url has not been crawled yet, put it into the queue for the next round
            if self._check_crawled_url(url_without_protocol) and not self.waiting_queue.full():
                self.waiting_queue.put_nowait(req['originUrl'])
                self.crawled_urls.add(url_without_protocol)

    def crawl_handler(self, url) -> list:
        result = []

        def intercept(route: Route, request: Request):
            # Intercept front-end navigations by forcing the response status to 204.
            # TODO: refine the hook logic when front-end redirects are encountered
            if request.is_navigation_request() and request.frame.parent_frame:
                request.response().status = 204
                route.continue_()
                return
            # Try to intercept back-end redirects
            if request.redirected_to:
                if request.post_data_json:
                    request.response().status = 200
                    self.waiting_queue.put_nowait(request.redirected_to.url)
                else:
                    ...
                route.continue_()
                return
            resource_type = request.resource_type
            # Filter dynamic requests for resource types we do not need
            if resource_type in ['image', 'media', 'eventsource', 'websocket']:
                route.abort()
            else:
                url_origin = request.url
                if not url_origin:
                    route.continue_()
                    return
                if not self.filter_ext(url=url_origin) and self.filter_url_by_domain(url=url_origin):
                    headers = request.headers
                    method = request.method
                    post_data_json: dict = request.post_data_json
                    http_har = dict()
                    if method == 'POST' or method == 'PUT':
                        post_data_origin = post_data_json
                        post_data_handled = self._parse_post_data(post_data_origin)
                        content_type = headers['content-type'] if 'content-type' in headers else ''
                        http_har['originPostData'] = post_data_origin
                        http_har['postData'] = post_data_handled
                        http_har['contentType'] = content_type
                        http_har['url'] = url_origin
                        http_har['originUrl'] = url_origin
                        http_har['method'] = method
                    if method == 'GET':
                        http_har = self.parse_static_url(url_origin)
                    result.append(http_har)
                route.continue_()

        with sync_playwright() as p:
            browser = p.webkit.launch(headless=True, chromium_sandbox=True, )
            page = browser.new_page()
            page.set_default_navigation_timeout(30000)
            page.set_extra_http_headers(self.headers)
            page.route('**/*', intercept)
            page.goto(url)
            page.wait_for_load_state(state='networkidle', timeout=30000)
            tmp_links = self.static_crawler(page, result, url)
            page.evaluate(FORM_FILL_UPLOAD_JS)
            for link in tmp_links:
                link.click()
            page.close()
            browser.close()
        return result
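
# --- Usage sketch (illustrative, not part of the original code) ---
# A minimal way to drive the Crawler. It assumes DEFAULT_HEADERS, BloomFilter and
# FORM_FILL_UPLOAD_JS are defined elsewhere in this file and that the Playwright
# WebKit browser has been installed (playwright install webkit); the seed url is
# a hypothetical placeholder.
def demo_crawler():
    crawler = Crawler(max_num=100, depth=2)
    crawler.run(['http://example.com'])
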
class HostScanner:
    def __init__(self, function, number, method, port_range=None, ip_range=None,
                 verbose=False, write_json=False, json_file='./result.json'):
        self.function = function
        self.number = number
        self.method = method
        self.port_range = port_range
        self.ip_range = ip_range
        self.write_json = write_json
        self._verbose = verbose
        self.json_file = json_file
        self._PoolExecutor = ThreadPoolExecutor if self.method == 'thread' else ProcessPoolExecutor
        self.que = Manager().Queue(10)

    def _update_json_file(self):
        # Drain the result queue and merge it into the json file on disk
        if self._verbose:
            print(f"Write the results in the queue to {self.json_file}")
        json_update = {}
        while not self.que.empty():
            json_update.update(self.que.get())
        if os.path.exists(self.json_file):
            with open(self.json_file, 'r') as fr:
                json_content = json.loads(fr.read())
            json_content.update(json_update)
            with open(self.json_file, 'w') as fw:
                fw.write(json.dumps(json_content, indent=4))
        else:
            with open(self.json_file, 'w') as fw:
                fw.write(json.dumps(json_update, indent=4))

    def _ping_host_ip(self, ip):
        if self._verbose:
            print("pid is %s" % os.getpid())
        try:
            res = subprocess.call('ping -c 2 -t 2 %s' % ip, shell=True, stdout=subprocess.PIPE)
            status = 'Active' if res == 0 else 'Inactive'
            print(f'{ip} {status}')
            if self.write_json:
                # Flush the queue to disk when it is full, under the lock matching the executor type
                if self.method == 'proc':
                    with WRITE_LOCK_PROC:
                        if self.que.full():
                            self._update_json_file()
                elif self.method == 'thread':
                    with WRITE_LOCK_THREAD:
                        if self.que.full():
                            self._update_json_file()
                self.que.put({ip: status})
        except Exception as e:
            print('Failed to get status for {}: {}'.format(ip, e))

    def _scan_host_port(self, port):
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            s.connect((self.ip_range, port))
            print(f'{port} OPEN')
            if self.write_json:
                # Flush the queue to disk when it is full, under the lock matching the executor type
                if self.method == 'proc':
                    with WRITE_LOCK_PROC:
                        if self.que.full():
                            self._update_json_file()
                elif self.method == 'thread':
                    with WRITE_LOCK_THREAD:
                        if self.que.full():
                            self._update_json_file()
                self.que.put({port: 'OPEN'})
        except Exception:
            # Ignore the failed port
            pass
        finally:
            s.close()

    def _runMultiWorks(self):
        with self._PoolExecutor(self.number) as Executor:
            if self.function == 'tcp':
                print(f'The scanned host is {self.ip_range}')
                Executor.map(self._scan_host_port, list(self.port_range))
            elif self.function == 'ping':
                Executor.map(self._ping_host_ip, self.ip_range)

    def run(self):
        # To support repeated debugging runs, delete the json file generated by the previous run
        if os.path.exists(self.json_file):
            os.remove(self.json_file)
        if self._verbose:
            print('Start')
            print('*' * 20)
        start_time = time.time()
        self._runMultiWorks()
        end_time = time.time()
        if self._verbose:
            print('*' * 20)
            print('End')
            print("Total time spent: %0.2f" % (end_time - start_time))
        if self.write_json:
            print("Writing into {}".format(self.json_file))
            self._update_json_file()
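
# --- Usage sketch (illustrative, not part of the original code) ---
# Minimal examples of both scan modes. The target host and address range are
# hypothetical placeholders; WRITE_LOCK_PROC / WRITE_LOCK_THREAD are assumed to
# be defined elsewhere in this file.
def demo_host_scanner():
    # TCP port scan of one host over ports 1-1024 with 50 threads
    port_scanner = HostScanner(function='tcp', number=50, method='thread',
                               port_range=range(1, 1025), ip_range='192.168.1.1',
                               verbose=True, write_json=True)
    port_scanner.run()

    # Ping sweep of a small address list with 4 worker processes
    ping_scanner = HostScanner(function='ping', number=4, method='proc',
                               ip_range=['192.168.1.{}'.format(i) for i in range(1, 11)],
                               verbose=True)
    ping_scanner.run()
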