import os
from multiprocessing import Pool, Manager


def main():
    # Get the name of the folder to copy
    oldFolderName = input("Enter the folder you want to copy: ")
    print(oldFolderName)

    # Create the destination folder (fixed: concatenate strings, not a string and a list)
    newFolderName = oldFolderName + "-copy"
    os.mkdir(newFolderName)

    # Get the names of all files in the source folder
    filesName = os.listdir(oldFolderName)

    # Copy with a pool of worker processes
    pool = Pool(5)
    queue = Manager().Queue()
    for name in filesName:
        pool.apply_async(filesCopy, (name, oldFolderName, newFolderName, queue, ))

    # Show progress: each finished copy puts one item on the queue
    num = 0
    allNum = len(filesName)
    while num < allNum:
        queue.get()  # block until a worker reports completion
        num += 1
        copyRate = num / allNum
        print("\rcopy progress: %.2f%%" % (copyRate * 100), end="")

    pool.close()
    pool.join()
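# A minimal sketch of the filesCopy worker that main() above assumes exists; the
# name and signature are taken from the apply_async call, the body is illustrative.
def filesCopy(name, oldFolderName, newFolderName, queue):
    # Read the source file and write it into the destination folder
    with open(os.path.join(oldFolderName, name), "rb") as fr:
        content = fr.read()
    with open(os.path.join(newFolderName, name), "wb") as fw:
        fw.write(content)
    # Report completion so the parent can update its progress counter
    queue.put(name)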
class Pool(object):
    """
    The Pool class represents a pool of worker threads. It has methods
    which allow tasks to be offloaded to the worker processes in a few
    different ways.
    """

    def __init__(self, num_workers, name="Pool"):
        """
        \param num_workers (integer) number of worker threads to start
        \param name (string) prefix for the worker threads' name
        """
        self.queue = Manager().Queue()
        self.closed = False
        self.workers = []
        for idx in range(num_workers):
            process = PoolWorker(self.queue, name="%s-Worker-%d" % (name, idx))
            process.daemon = True
            try:
                process.start()
            except:
                # If one thread has a problem, undo everything
                self.terminate()
                raise
            else:
                self.workers.append(process)

    def submit(self, work_unit):
        self.queue.put(work_unit)

    def close(self):
        """Prevents any more tasks from being submitted to the pool.
        Once all the tasks have been completed the worker processes will exit."""
        # No lock here. We assume it's sufficiently atomic...
        self.closed = True

    def terminate(self):
        """Stops the worker processes immediately without completing
        outstanding work. When the pool object is garbage collected
        terminate() will be called immediately."""
        self.close()

        # Clearing the job queue
        try:
            while 1:
                self.queue.get_nowait()
        # except Manager().Queue.empty():
        except:
            pass

        # Send one sentinel for each worker thread: each thread will die
        # eventually, leaving the next sentinel for the next thread
        for process in self.workers:
            self.queue.put(SENTINEL)
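# Hypothetical usage of the Pool class above, assuming a PoolWorker implementation
# that consumes work units from the shared queue and exits when it sees SENTINEL;
# neither PoolWorker nor SENTINEL is defined in this snippet.
pool = Pool(num_workers=4, name="Demo")
for unit in ["job-1", "job-2", "job-3"]:
    pool.submit(unit)   # enqueue work for the PoolWorker processes
pool.close()            # no more submissions; per the docstring, workers drain and exit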
def main():
    from multiprocessing import Pool
    from multiprocessing import Manager

    pool = Pool(5)
    isright = 0
    precision_queue = Manager().Queue()
    for i_ep, root in enumerate(env.TreeList):
        pool.apply_async(func=sub_main, args=(root, precision_queue))
    pool.close()
    pool.join()

    # Drain the queue once and divide by the number of results actually
    # collected (the queue is empty after the loop, so qsize() would be 0).
    ql = precision_queue.qsize()
    for i in range(ql):
        isright += precision_queue.get_nowait()
    print(isright / ql)
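# A minimal sketch of the sub_main worker referenced above; env.TreeList and the
# evaluation step are assumptions (evaluate_tree is a hypothetical helper), but the
# queue protocol of putting 1 for a correct result and 0 otherwise matches how
# main() aggregates precision.
def sub_main(root, precision_queue):
    prediction_is_correct = evaluate_tree(root)   # hypothetical per-tree evaluation
    precision_queue.put(1 if prediction_is_correct else 0)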
class MultiProcessFile(object):
    """
    helper for testing multiprocessing

    multiprocessing poses a problem for doctests, since the strategy of
    replacing sys.stdout/stderr with file-like objects then inspecting the
    results won't work: the child processes will write to the objects, but
    the data will not be reflected in the parent doctest-ing process.

    The solution is to create file-like objects which will interact with
    multiprocessing in a more desirable way.

    All processes can write to this object, but only the creator can read.
    This allows the testing system to see a unified picture of I/O.
    """

    def __init__(self):
        # per advice at:
        # http://docs.python.org/library/multiprocessing.html#all-platforms
        self.__master = getpid()
        self.__queue = Manager().Queue()
        self.__buffer = StringIO()
        self.softspace = 0

    def buffer(self):
        if getpid() != self.__master:
            return

        from queue import Empty
        from collections import defaultdict
        cache = defaultdict(str)
        while True:
            try:
                pid, data = self.__queue.get_nowait()
            except Empty:
                break
            if pid == ():
                # show parent output after children
                # this is what users see, usually
                pid = (1e100,)  # googol!
            cache[pid] += data
        for pid in sorted(cache):
            # self.__buffer.write('%s wrote: %r\n' % (pid, cache[pid]))  # DEBUG
            self.__buffer.write(cache[pid])

    def write(self, data):
        # note that these pids are in the form of current_process()._identity
        # rather than OS pids
        from multiprocessing import current_process
        pid = current_process()._identity
        self.__queue.put((pid, data))

    def __iter__(self):
        "getattr doesn't work for iter()"
        self.buffer()
        return self.__buffer

    def seek(self, offset, whence=0):
        self.buffer()
        return self.__buffer.seek(offset, whence)

    def getvalue(self):
        self.buffer()
        return self.__buffer.getvalue()

    def __getattr__(self, attr):
        return getattr(self.__buffer, attr)
pool = Pool(processes=40)
queue = Manager().Queue(maxsize=40000)
cnt = 0
for i in range(len(label_raw)):
    pool.apply_async(label_expand, args=(i, label_raw[i], queue, ))
pool.close()
pool.join()

# Drain the result queue; each worker put an (index, expanded_label) tuple
label_ex = []
while True:
    try:
        (i, lb) = queue.get_nowait()
        label_ex.append((i, lb))
    except Exception:  # queue is empty
        break
print('finish label expansion ', len(label_ex))

# Restore the original ordering, then keep only the labels
label_ex = sorted(label_ex, key=lambda x: x[0])
label = []
for (i, lb) in label_ex:
    label.append(lb)
'''test code'''
# plt.plot(input[7])
# plt.plot(label[7])
# plt.show()

x_train = []
for dat in input:
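# A minimal sketch of the label_expand worker used above; the expansion itself is
# an assumption (here it simply forwards the raw label), but the protocol of
# putting (index, label) tuples onto the queue matches the collection loop.
def label_expand(i, lb_raw, queue):
    lb = lb_raw                # placeholder for the real expansion logic
    queue.put((i, lb))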
class VocabBuilder(object):
    """Class used to build a vocabulary."""

    def __init__(self):
        self.file_full_path_list = self.get_file_path_list()
        self.dict = {
            "[PAD]": 0,
            "[UNK]": 1,
            "[CLS]": 2,
            "[SEP]": 3,
            "[MASK]": 4
        }  # the vocabulary
        self.count_dict = dict()  # per-character counts, e.g. {char1: 10, char2: 12}
        self.sorted_count_list = None  # list of (token, count) sorted by frequency
        self.q_count_dic = Manager().Queue()  # holds the count dicts produced by all workers
        self.q_count_dic_done = Manager().Queue()  # holds one marker per count dict that has been merged

    @staticmethod
    def get_file_path_list():
        """Return the full paths of all corpus files."""
        data_path = "../corpus_processed"
        file_list = os.listdir(data_path)
        full_file_path_list = [
            os.path.join(data_path, file) for file in file_list
        ]
        return full_file_path_list

    def tokenize_and_count(self, file):
        """
        Split the text into characters and count their frequencies.
        :param file: txt file
        :return: ['乙', '女', 'は', 'お', '姉', 'さ', 'ま', 'に', '恋', 'し', 'て', 'る', '櫻', 'の', '園', 'の', 'エ', 'ト', 'ワ', 'ー', 'ル']
        """
        basic_tokenizer = BasicTokenizer(
            do_lower_case=False)  # do_lower_case=False is required so that じ is not turned into し
        token_list = []
        with open(file, mode="r", encoding="utf-8") as fin:
            text = fin.read()
        str_list = basic_tokenizer.tokenize(text)
        for str in str_list:
            token_list += list(str)
        # count frequencies
        tmp_count_dict = dict()
        for token in token_list:
            if token not in tmp_count_dict:
                tmp_count_dict[token] = 0
            tmp_count_dict[token] += 1
        self.q_count_dic.put(tmp_count_dict)  # push this file's counts onto the queue

    def join_dicts(self):
        """
        Run in a dedicated process: merge the dicts produced by all workers into the global dict.
        :return:
        """
        while True:
            tmp_count_dict = self.q_count_dic.get()  # fetch one per-file count dict from the queue
            all_keys = tmp_count_dict.keys() | self.count_dict.keys()
            self.count_dict = {
                key: tmp_count_dict.get(key, 0) + self.count_dict.get(key, 0)
                for key in all_keys
            }
            self.q_count_dic_done.put(1)  # record that one dict has been merged; a marker is enough, no need to put the whole dict
            if self.q_count_dic_done.qsize() == len(self.file_full_path_list):
                self.q_count_dic.put(self.count_dict)  # pass the final result back through the queue
                break

    def build_vocab(self, min_count=None, max_count=None, vocab_size=None):
        """
        Build the vocabulary.
        :param min_count: minimum number of occurrences of a character
        :param max_count: maximum number of occurrences of a character
        :param vocab_size: maximum number of tokens in the vocabulary
        :return:
        """
        self.count_dict = self.q_count_dic.get_nowait()  # fetch the final merged dict from the queue
        # Filter the count dict, keeping only tokens whose frequency lies in [min_count, max_count]
        if isinstance(min_count, int):
            self.count_dict = {
                key: value
                for key, value in self.count_dict.items() if min_count <= value
            }
        if isinstance(max_count, int):
            self.count_dict = {
                key: value
                for key, value in self.count_dict.items() if value <= max_count
            }
        # Sort the count dict by frequency in descending order and turn it into a list, e.g. [('b', 10), ('a', 9), ('c', 8)]
        self.sorted_count_list = sorted(self.count_dict.items(),
                                        key=lambda x: x[1],
                                        reverse=True)
        # Limit the vocabulary size
        if isinstance(vocab_size, int):
            # If vocab_size < len(sorted_count_list), keep only the first vocab_size entries
            if vocab_size < len(self.sorted_count_list):
                self.sorted_count_list = self.sorted_count_list[:vocab_size]
        # Store the surviving tokens of sorted_count_list in self.dict
        for token, _ in self.sorted_count_list:
            # if token not in self.dict.keys() and len(self.dict) < vocab_size:  # needed if sentences are passed in chunks
            self.dict[token] = len(
                self.dict)  # dict grows like {UNK:0, PAD:1} --> {UNK:0, PAD:1, token:2}

    def save_vocab(self):
        dict_path = "./vocab.txt"
        with open(dict_path, mode="w", encoding="utf-8") as fout:
            for token in self.dict.keys():
                fout.write(token + "\n")
        print("Vocabulary Size:%d" % len(self.dict))

    def show_progress_bar(self):
        """Display a progress bar."""
        with tqdm(total=len(self.file_full_path_list)) as pbar:
            while True:
                pbar.set_description("Building Vocabulary")
                processed_file_num = self.q_count_dic_done.qsize()
                pbar.n = processed_file_num
                pbar.refresh()
                if processed_file_num == len(self.file_full_path_list):
                    break

    def multi_process_token_counter(self):
        po = Pool()
        po.apply_async(self.join_dicts)  # submit the dict-merging task
        for file_path in self.file_full_path_list:
            # submit one tokenize-and-count task per file
            po.apply_async(self.tokenize_and_count, args=(file_path, ))
        self.show_progress_bar()
        po.close()
        po.join()

    def run(self):
        # Tokenize, count frequencies and merge the dicts with multiple processes
        self.multi_process_token_counter()
        # Build the vocabulary
        self.build_vocab()
        # Save the vocabulary
        self.save_vocab()
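# Hypothetical entry point for the builder above; run() uses the default
# build_vocab arguments, which is an assumption about how the original project
# invoked it.
if __name__ == "__main__":
    builder = VocabBuilder()
    builder.run()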
class RelationExtractor:
    """
    Relation Extraction based on Semantic Role Labeling of SENNA
    """

    def __init__(self, data_source=None, relation_sink=None, workers=4):
        """
        :param data_source: data_source object of type DataSource
        :param relation_sink: data_sink object of type DataSink
        :param workers: number of child process workers in source sink mode
        """
        if data_source:
            assert isinstance(data_source, DSource.MongoDataSource), \
                "data_source object must be instance of MongoDataSource"
            self.data_source = data_source

        if relation_sink:
            assert isinstance(relation_sink, DSink.ElasticDataSink), \
                "relation_sink object must be instance of ElasticDataSink"
            self.relation_sink = relation_sink
            self.model_class = self.relation_sink.model_identifier.model_class

        self.relation_annotator = pnt.Annotator()
        self.stemmer = PorterStemmer()
        self.workers = workers
        self.relation_queue = Manager().Queue(maxsize=10000)
        self.persist_attributes = ['relation_annotator', 'stemmer', 'model_class', 'relation_queue']

    def __getstate__(self):
        state = dict()
        for attr in self.persist_attributes:
            state[attr] = self.__dict__[attr]
        return state

    def __setstate__(self, d):
        self.__dict__.update(d)

    @staticmethod
    def __populate_arguments(semantic_element):
        """
        form an argument object from the srl semantic element
        :param semantic_element: SRL semantic element
        :return: RelationArgument instance
        """
        return RelationArgument(A0=semantic_element.get('A0'), A1=semantic_element.get('A1'),
                                A2=semantic_element.get('A2'), A3=semantic_element.get('A3'))

    @staticmethod
    def __populate_modifier(semantic_element):
        """
        form an argument modifier object from the srl semantic element
        :param semantic_element: SRL semantic element
        :return: RelationModifier instance
        """
        return RelationModifier(DIR=semantic_element.get('AM-DIR'), MNR=semantic_element.get('AM-MNR'),
                                LOC=semantic_element.get('AM-LOC'), TMP=semantic_element.get('AM-TMP'),
                                EXT=semantic_element.get('AM-EXT'), PNC=semantic_element.get('AM-PNC'),
                                CAU=semantic_element.get('AM-CAU'), NEG=semantic_element.get('AM-NEG'))

    def form_relations(self, text, block_id, payload, ff, persist=True):
        """
        form relation(s) on a given text
        :param text: text on which to get the relations on, text will be sentence
            tokenized and relations formed at sentence level
        :param block_id: unique identifier of the block
        :param persist: persist the relations extracted from the text in the sink,
            relation_sink needed to be specified
        :return: list of relations
        """
        text_sentences = pattern.tokenize(text)
        relations = []

        for sentence in text_sentences:
            # work with ascii string only
            sentence = "".join((c for c in sentence if 0 < ord(c) < 127))
            try:
                senna_annotation = self.relation_annotator.getAnnotations(sentence)
            except Exception as e:
                logger.error(e)
                continue

            chunk_parse, pos_tags, role_labeling, tokenized_sentence = \
                senna_annotation['chunk'], senna_annotation['pos'], senna_annotation['srl'], \
                senna_annotation['words']

            # nothing to do here, empty srl
            if not role_labeling:
                continue

            for semantic_element in role_labeling:
                arguments = RelationExtractor.__populate_arguments(semantic_element)
                modifiers = RelationExtractor.__populate_modifier(semantic_element)
                verb = semantic_element.get('V')

                # order of the arguments returned is important, A0 --> A1 --> A2 --> A3
                arguments = [v for v in vars(arguments).itervalues() if v]
                modifiers = [v for v in vars(modifiers).itervalues() if v]
                if not arguments:
                    continue

                argument_pairs = [e for e in ((ai, aj) for i, ai in enumerate(arguments)
                                              for j, aj in enumerate(arguments) if i < j)]
                verb = relation_util.normalize_relation(verb)

                for a0, a1 in argument_pairs:
                    en0 = relation_util.form_entity(tokenized_sentence, a0, chunk_parse, pos_tags)
                    en1 = relation_util.form_entity(tokenized_sentence, a1, chunk_parse, pos_tags)
                    if not en0 or not en1:
                        continue
                    relations.append(RelationTuple(left_entity=en0, right_entity=en1, relation=verb,
                                                   sentence=sentence, text=text, block_id=block_id,
                                                   payload=payload, ff=ff))
                    logger.info("generated a relation for ")
                    logger.info(block_id)

                for arg_modifier in modifiers:
                    mod_pos = sentence.find(arg_modifier)
                    linked_arg = min([(a, abs(mod_pos - sentence.find(a))) for a in arguments],
                                     key=lambda e: e[1])[0]
                    en0 = relation_util.form_entity(tokenized_sentence, linked_arg, chunk_parse, pos_tags)
                    en1 = relation_util.form_entity(tokenized_sentence, arg_modifier, chunk_parse, pos_tags)
                    if not en0 or not en1:
                        continue
                    relations.append(RelationTuple(left_entity=en0, right_entity=en1, relation=verb,
                                                   sentence=sentence, text=text, block_id=block_id,
                                                   payload=payload, ff=ff))
                    logger.info("generated a relation for ")
                    logger.info(block_id)

        return relations

    def form_relations_source(self, source_item):
        if not source_item:
            logger.error("got an empty source item")
            return

        item_entry = ""
        payload = ""
        ff = ""
        for f_name, f_value in source_item:
            if f_name == "payload":
                payload = f_value
            elif f_name == "ff":
                ff = f_value
            else:
                item_entry += f_value
        if item_entry == ' ':
            return

        try:
            block_id = str(uuid.uuid1())
            relations = self.form_relations(item_entry, block_id, payload, ff)
        except RuntimeError as e:
            logger.error("Error generating relations")
            logger.error(e)
            return

        for relation in relations:
            sink_relation = self.model_class()
            sink_relation.leftEntity = relation.left_entity
            sink_relation.rightEntity = relation.right_entity
            sink_relation.relation = relation.relation
            sink_relation.sentence = relation.sentence
            sink_relation.text = relation.text
            sink_relation.block_id = relation.block_id
            sink_relation.productName = relation.ff
            sink_relation.webLocation = relation.payload
            logger.info("generated a relation")
            logger.info(sink_relation)
            try:
                self.relation_queue.put(sink_relation, timeout=1)
            except Full as e:
                logger.error(e)

    def sink_relations(self):
        while not self.all_sinked:
            try:
                item = self.relation_queue.get_nowait()
                self.relation_sink.sink_item(item)
            except Empty as e:
                pass

    def form_relations_from_source(self):
        if not self.data_source or not self.relation_sink:
            raise RuntimeError("Data source and sink must be set")

        self.data_source.start()
        self.relation_sink.start()
        self.all_sinked = False

        pool = Pool(processes=self.workers)
        t1 = time.time()
        pool.imap(self.form_relations_source, self.data_source)

        sinker = Thread(target=self.sink_relations, name='Sink-Thread')
        sinker.start()

        pool.close()
        pool.join()
        self.all_sinked = True
        t2 = time.time()
        logger.info("process finished in :: %d seconds" % (t2 - t1))
pool = Pool(processes=40)
queue = Manager().Queue(maxsize=20000)
cnt = 0
for i in range(len(label_raw)):
    pool.apply_async(label_expand, args=(i, label_raw[i], queue, ))
pool.close()
pool.join()

# Drain the result queue; each worker put an (index, expanded_label) tuple
label_ex = []
while True:
    try:
        (i, lb) = queue.get_nowait()
        label_ex.append((i, lb))
    except Exception:  # queue is empty
        break
print('finish label expansion ', len(label_ex))

# Restore the original ordering, then keep only the labels
label_ex = sorted(label_ex, key=lambda x: x[0])
label = []
for (i, lb) in label_ex:
    label.append(lb)
'''test code'''
# plt.plot(input[7])
# plt.plot(label[7])
# plt.show()

if input_dim < output_dim:
    print('input_dim smaller than output_dim, quit task')
class Crawler(object):

    def __init__(self, cookie: str = None, headers: dict = None, max_num: int = 10000,
                 domain_regs: list = None, depth: int = 5):
        self.cookie = cookie
        self.headers = headers if headers else DEFAULT_HEADERS
        self.waiting_queue = Manager().Queue(maxsize=max_num * 2)
        self.current_queue = Manager().Queue(maxsize=max_num * 2)
        self.max_url_num = max_num
        self.crawled_urls = BloomFilter(element_num=max_num * 5, error_rate=0.01)
        self.url_dict = Manager().dict()
        self.domain_reg_list = domain_regs
        self.depth = depth
        self.current_depth = 0
        self.filter_exts = [
            'css', 'png', 'gif', 'jpg', 'jpeg', 'swf', 'tiff', 'pdf', 'ico', 'flv',
            'mp4', 'mp3', 'avi', 'mpg', 'gz', 'mpeg', 'iso', 'dat', 'mov', 'rar',
            'exe', 'zip', 'tar', 'bin', 'bz2', 'xsl', 'doc', 'docx', 'ppt', 'pptx',
            'xls', 'xlsx', 'csv', 'map', 'ttf', 'tif', 'woff', 'woff2', 'cab', 'apk',
            'bmp', 'svg', 'exif', 'xml', 'rss', 'webp', 'js'
        ]

    def run(self, urls):
        self.consist_headers()
        # By default only crawl URLs under the root domains of the seed URLs
        self.domain_reg_list = self.parse_domain(urls) if not self.domain_reg_list else self.domain_reg_list
        self._init_reg()
        for url in urls:
            self.call_crawl_handler(url)
        print('all task done')
        print(self.url_dict)

    def call_crawl_handler(self, url):
        if 'http' not in url:
            init_url = 'http://' + url
        else:
            init_url = url
        self.current_queue.put_nowait(init_url)
        # To avoid re-crawling the seed URL, add it to the Bloom filter right away
        init_url_without_protocol = url.split('//')[-1]
        self.crawled_urls.add(init_url_without_protocol)
        while self.current_depth < self.depth:
            if len(self.url_dict) >= self.max_url_num:
                print('reached the configured crawl limit, stopping the crawler')
                break
            print('now crawl depth is :{}'.format(self.current_depth))
            tmp_results = []
            # Crawl the current queue with a process pool
            pool = Pool(os.cpu_count() * 2)
            while not self.current_queue.empty():
                print('urls still waiting in the current queue: {}'.format(self.current_queue.qsize()))
                url = self.current_queue.get_nowait()
                if not url.endswith('js'):
                    result = pool.apply_async(func=self.crawl_handler, args=(url,))
                    tmp_results.append(result)
                    # self.crawl_handler(url)
            pool.close()
            pool.join()
            tmp_reqs = []
            for result in tmp_results:
                for r in result.get():
                    tmp_reqs.append(r)
            self._handle_url(tmp_reqs)
            self.current_queue = self.waiting_queue
            self.waiting_queue = Manager().Queue(maxsize=self.max_url_num * 2)
            self.current_depth += 1
            print('depth:{} crawled done'.format(self.current_depth))

    def consist_headers(self):
        if self.cookie:
            self.headers['Cookie'] = self.cookie

    @staticmethod
    def parse_domain(domain_list):
        """
        Reduce the input URLs or hostnames to bare domains, used later for
        same-domain checks and similar operations.
        :param domain_list:
        :return:
        """
        def _split_url_protocol_and_path(domain):
            # strip the protocol
            if '://' in domain:
                domain = domain.split('://')[1]
            # strip the path
            if '.com.cn' in domain:
                return domain.split('.com.cn')[0] + '.com.cn'
            if '.com' in domain:
                return domain.split('.com')[0] + '.com'
            if '.io' in domain:
                return domain.split('.io')[0] + '.io'
            # for ip:port style URLs, cutting at the first / is enough
            return domain.split('/')[0]

        return [_split_url_protocol_and_path(domain) for domain in domain_list]

    @staticmethod
    def _parse_post_data(post_data) -> str:
        """
        Flatten the body of a dynamic request into a single string.
        :param post_data:
        :return:
        """
        if not post_data:
            return ''
        if not isinstance(post_data, dict):
            if '=' in post_data:
                param_dict = {}
                if '&' in post_data:
                    params_couples = post_data.split('&')
                    for param in params_couples:
                        if '=' not in param:
                            continue
                        k, v = param.split('=')
                        param_dict[k] = v
                else:
                    k, v = post_data.split('=')
                    param_dict[k] = v
                post_data = param_dict
            else:
                post_data = json.loads(post_data)
        post_data_list = [k for k, _ in post_data.items()]
        post_data_list.sort()
        return ''.join([param + '&' for param in post_data_list])[:-1]

    @staticmethod
    def parse_static_url(url):
        """
        Repack a parsed static URL into a dict:
        {
            'url': 'xxxxxx',
            'originUrl': 'xxxxxx/a=aa',
            'method': 'GET',
            'queryString': 'a=aa'
        }
        :param url:
        :return:
        """
        try:
            req = dict()
            req['method'] = 'GET'
            req['originUrl'] = url
            if '?' not in url:
                req['url'] = url
                return req
            url_consist = url.split('?')
            req['url'] = url_consist[0]
            params = url_consist[1]
            if '&' not in params:
                params_consist = params.split('=')
                req['queryString'] = params_consist[0] if params_consist[0] else ''
                return req
            multi_params = params.split('&')
            params_list = list(map(lambda y: y.split('=')[0],
                                   filter(lambda x: '=' in x, multi_params)))
            # sort the parameter names alphabetically
            params_list.sort()
            req['queryString'] = ''.join([key + '=&' for key in params_list])[:-2]
            return req
        except Exception:
            msg = traceback.format_exc()
            print(msg)
            return None

    def _init_reg(self):
        """
        Build a regex from the parsed domains, used for same-domain checks.
        :return:
        """
        domain_reg = ['^']
        domain_reg.extend(['(http|https):\/\/' + domain.replace('.', '\.') + '.*|'
                           for domain in self.domain_reg_list])
        # domain_reg.extend(map(lambda x: '(http|https):\/\/' + x.replace('.', '\.') + '.*|', self.domain_reg_list))
        tmp_domain_reg = ''.join(domain_reg)
        self.domain_reg = tmp_domain_reg[:-1] + '$'

    def filter_ext(self, url):
        """
        Filter out URLs with special suffixes, such as static resources.
        Return True if the URL's extension is on the exclusion list.
        :param url:
        :return:
        """
        try:
            f = url.split('/')[-1].strip()
            if '.' in f:
                ext = f.split('.')[-1].strip().lower()
                if ext and ext in self.filter_exts:
                    return True
                else:
                    return False
            return False
        except Exception:
            msg = traceback.format_exc()
            print(msg)
            return False

    def filter_url_by_domain(self, url):
        """
        Check whether the URL satisfies the crawl conditions: return True if it
        matches the allowed domains and is not in the excluded URL set,
        otherwise return False.
        :param url:
        :return:
        """
        # check the domain
        if not re.match(self.domain_reg, url, flags=0):
            return False
        # TODO: complete this part later
        # if len(self.exclude_urls) == 0:
        #     return True
        # check exclude_urls
        # if re.match(self.exclude_urls_reg_str, url, flags=0):
        #     return False
        return True

    def static_crawler(self, page, results, url) -> List["ElementHandle"]:
        """
        Parse the static URLs on a page; currently covers the href and src
        attributes of <a> tags.
        """
        links = page.query_selector_all("//a")
        tmp_link = []
        for link in links:
            href = link.get_property("href").json_value()
            src = link.get_property("src").json_value()
            if not href or href == url:
                continue
            if not self.filter_ext(url=href) and self.filter_url_by_domain(url=href):
                req = self.parse_static_url(href)
                if req:
                    print('href:{}'.format(req))
                    results.append(req)
            if not src or src == url:
                continue
            if not self.filter_ext(url=src) and self.filter_url_by_domain(url=src):
                req = self.parse_static_url(src)
                if req:
                    print('src:{}'.format(req))
                    results.append(req)
            # some <a> tags use javascript: pseudo-links to trigger js actions
            if 'javascript' in href or 'javascript' in src:
                tmp_link.append(link)
        return tmp_link

    def _check_crawled_url(self, url) -> bool:
        """
        Check whether the URL has already been crawled; return True if it has not.
        :param url:
        :return:
        """
        if url in self.crawled_urls:
            return False
        return True

    def _check_url_is_exist_by_md5(self, url_dict):
        """
        Use an MD5 digest to check whether the URL is a duplicate.
        :param url_dict:
        :return:
        """
        try:
            exist_md5 = list(url_dict.keys())[0]
            if exist_md5 in self.crawled_urls:
                return False
            return True
        except Exception:
            msg = traceback.format_exc()
            print(msg)
            return True

    @staticmethod
    def calculate_md5(url_har):
        """
        Compute an MD5 digest for deduplication.
        :param url_har:
        :return:
        """
        url = url_har['url']
        # some POST requests append a timestamp to defend against replay
        tmp_list = url.split('//')[-1].split('?')
        url_without_protocol = tmp_list[0] if len(tmp_list) > 1 else url.split('//')[-1]
        method = url_har['method']
        query_string = ''
        post_data = ''
        if 'queryString' in url_har:
            query_string = url_har['queryString']
        if 'postData' in url_har:
            post_data = url_har['postData']
        tmp_str = url_without_protocol + '&' + method + '&' + query_string + post_data
        return hashlib.md5(tmp_str.encode('utf-8')).hexdigest()

    def _handle_url(self, req_list):
        """
        Post-process the crawled URLs: decide whether each one should be
        filtered out or has already been crawled.
        :param req_list:
        :return:
        """
        if not req_list:
            return
        insert_req_list = list()
        for req in req_list:
            url = req['originUrl']
            if url.endswith('/'):
                url = url[:-1]
            url_without_protocol = url.split('//')[-1]
            '''
            After parsing, each entry contains: url, queryString (if present) and method.
            The URL has to be checked for:
            1. whether it is already in the final URL set
            2. whether it has already been crawled
            3. whether its extension is on the filter list (checked first; if so, skip it)
            '''
            md5 = self.calculate_md5(req)
            tmp_dict = {
                md5: req
            }
            if self._check_url_is_exist_by_md5(tmp_dict):
                if len(self.url_dict.keys()) < self.max_url_num:
                    self.url_dict[md5] = req
                    # TODO: a custom taskId could be filled in here later
                    insert_req_list.append({'taskId': 'test12', 'urlDict': json.dumps(req)})
            # if the URL has not been crawled yet, put it on the queue for the next round
            if self._check_crawled_url(url_without_protocol) and not self.waiting_queue.full():
                self.waiting_queue.put_nowait(req['originUrl'])
                self.crawled_urls.add(url_without_protocol)

    def crawl_handler(self, url) -> list:
        result = []

        def intercept(route: Route, request: Request):
            # Intercept client-side navigation by rewriting the response status to 204.
            # TODO: refine the hook logic for client-side redirects later.
            if request.is_navigation_request() and request.frame.parent_frame:
                request.response().status = 204
                route.continue_()
                return
            # try to intercept server-side redirects
            if request.redirected_to:
                if request.post_data_json:
                    request.response().status = 200
                    self.waiting_queue.put_nowait(request.redirected_to.url)
                else:
                    ...
                route.continue_()
                return
            resource_type = request.resource_type
            # filter dynamic requests
            if resource_type in ['image', 'media', 'eventsource', 'websocket']:
                route.abort()
            else:
                url_origin = request.url
                if not url_origin:
                    route.continue_()
                    return
                if not self.filter_ext(url=url_origin) and self.filter_url_by_domain(url=url_origin):
                    headers = request.headers
                    method = request.method
                    post_data_json: dict = request.post_data_json
                    http_har = dict()
                    if method == 'POST' or method == 'PUT':
                        post_data_origin = post_data_json
                        post_data_handled = self._parse_post_data(post_data_origin)
                        content_type = headers['content-type'] if 'content-type' in headers else ''
                        http_har['originPostData'] = post_data_origin
                        http_har['postData'] = post_data_handled
                        http_har['contentType'] = content_type
                        http_har['url'] = url_origin
                        http_har['originUrl'] = url_origin
                        http_har['method'] = method
                    if method == 'GET':
                        http_har = self.parse_static_url(url_origin)
                    result.append(http_har)
                route.continue_()

        with sync_playwright() as p:
            browser = p.webkit.launch(headless=True, chromium_sandbox=True, )
            page = browser.new_page()
            page.set_default_navigation_timeout(30000)
            page.set_extra_http_headers(self.headers)
            page.route('**/*', intercept)
            page.goto(url)
            page.wait_for_load_state(state='networkidle', timeout=30000)
            tmp_links = self.static_crawler(page, result, url)
            page.evaluate(FORM_FILL_UPLOAD_JS)
            for link in tmp_links:
                link.click()
            page.close()
            browser.close()
        return result
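# Hypothetical invocation of the Crawler above; the seed URL, depth and max_num
# are illustrative values, not taken from the original project.
if __name__ == '__main__':
    crawler = Crawler(depth=2, max_num=100)
    crawler.run(['http://example.com'])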
    except IndexError as e:
        pass
    else:
        response = resp.group(1)
        if resp:
            ret = urllib.parse.unquote(response)
            return_rule = get_return_rule(json.loads(ret)['response'])
            result_queue.put_nowait(return_rule)
        else:
            print('no response')


if __name__ == '__main__':
    pool = Pool(4)
    qid_queue = Manager().Queue()
    # result_queue = Manager().Queue()
    pool.apply_async(find_log, args=('query', 0, qid_queue,), callback=call_back)
    print('one worker went to look for qids')
    while isinstance(qid_queue.get(), int):
        pool.apply_async(find_log, args=('responseServer', qid_queue.get_nowait(), qid_queue,),
                         callback=call_back)
    pool.close()
    pool.join()
    index = 1
    while qid_queue.qsize() > 0:
        if not isinstance(qid_queue.get_nowait(), int) and index < 30:
            csv_file = open('travco_return_rule.csv', 'a', encoding='utf8')
            writer = csv.writer(csv_file)
            writer.writerow([qid_queue.get()])
            csv_file.close()
            index += 1
args = parser.parse_args()

# init pool and queue
q_get = Manager().Queue()
q_put = Manager().Queue()
pool = Pool(processes=args.process)

# put pic path into get queue: q_get
pics = os.listdir(args.folder)
for pic in pics:
    path = os.path.join(args.folder, pic)
    q_get.put(path)

# use multi process to get hash result and put the result into q_put
n = q_get.qsize()
while True:
    try:
        image_path = q_get.get_nowait()
        i = q_get.qsize()
    except Exception:  # q_get is empty
        break
    else:
        pool.apply_async(get_hash_queue, (i, n, image_path, q_put, ))
pool.close()
pool.join()
sys.stdout.write('\n')

# get hash bucket
image_bucket, hash_bucket0, hash_bucket1, hash_bucket2, hash_bucket3 = get_hash_bucket(q_put)

# image deduplicate
image_deduplication(image_bucket, hash_bucket0, hash_bucket1, hash_bucket2, hash_bucket3)
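# A minimal sketch of the get_hash_queue worker referenced above. The real code
# appears to compute a perceptual image hash that get_hash_bucket later splits into
# four buckets; a plain file digest stands in for it here, so treat this only as an
# illustration of the queue protocol and progress output, not the original hashing.
import hashlib

def get_hash_queue(i, n, image_path, q_put):
    with open(image_path, 'rb') as f:
        digest = hashlib.md5(f.read()).hexdigest()      # placeholder hash
    q_put.put((image_path, digest))                     # result later consumed by get_hash_bucket
    sys.stdout.write('\rprocessed %d/%d' % (n - i, n))  # i is the remaining queue size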