Example #1
import os
from multiprocessing import Manager, Pool


def main():

	# get the name of the folder to copy
	oldFolderName = input("Enter the name of the folder to copy: ")
	print(oldFolderName)

	# build the name of the destination folder
	newFolderName = oldFolderName + "-copy"

	# create the destination folder
	os.mkdir(newFolderName)

	# list all file names in the source folder
	filesName = os.listdir(oldFolderName)

	# copy the files with a process pool
	pool = Pool(5)
	queue = Manager().Queue()

	for name in filesName:
		pool.apply_async(filesCopy, (name, oldFolderName, newFolderName, queue, ))

	# show copy progress
	num = 0
	allNum = len(filesName)
	while num < allNum:
		queue.get_nowait()
		num+=1
		copyRate = num/allNum
		print("\rcopy progress: %.2f%%" % (copyRate*100), end="")

	pool.close()
	pool.join()
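
The filesCopy worker referenced by pool.apply_async is not shown above; a minimal sketch under that assumption (only the signature is taken from the call, the body is illustrative), plus the usual entry-point guard:

def filesCopy(name, oldFolderName, newFolderName, queue):
    # copy one file from the source folder to the destination folder
    with open(os.path.join(oldFolderName, name), "rb") as fr:
        content = fr.read()
    with open(os.path.join(newFolderName, name), "wb") as fw:
        fw.write(content)
    # signal completion so the parent loop can update the progress display
    queue.put(name)


if __name__ == '__main__':
    main()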
Example #2
class Pool(object):
    """
    The Pool class represents a pool of worker threads. It has methods
    which allow tasks to be offloaded to the workers in a
    few different ways.
    """
    def __init__(self, num_workers, name="Pool"):
        """
        \param num_workers (integer) number of worker threads to start
        \param name (string) prefix for the worker threads' name
        """
        self.queue = Manager().Queue()
        self.closed = False
        self.workers = []

        for idx in range(num_workers):
            process = PoolWorker(self.queue, name="%s-Worker-%d" % (name, idx))
            process.daemon = True
            try:
                process.start()
            except:
                # If one thread has a problem, undo everything
                self.terminate()
                raise
            else:
                self.workers.append(process)

    def submit(self, work_unit):
        self.queue.put(work_unit)

    def close(self):
        """Prevents any more tasks from being submitted to the
        pool. Once all the tasks have been completed the worker
        processes will exit."""
        # No lock here. We assume it's sufficiently atomic...
        self.closed = True

    def terminate(self):
        """Stops the worker processes immediately without completing
        outstanding work. When the pool object is garbage collected
        terminate() will be called immediately."""
        self.close()

        # Clearing the job queue
        try:
            while 1:
                self.queue.get_nowait()
        # a drained managed queue raises queue.Empty
        except Exception:
            pass

        # Send one sentinel for each worker thread: each thread will die
        # eventually, leaving the next sentinel for the next thread
        for process in self.workers:
            self.queue.put(SENTINEL)
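
PoolWorker and SENTINEL are defined elsewhere in the original project. A minimal sketch, assuming each worker is a multiprocessing.Process that keeps pulling callables off the shared queue until it receives the sentinel:

from multiprocessing import Process

SENTINEL = None  # assumed sentinel value; the original project may use a dedicated object


class PoolWorker(Process):
    def __init__(self, queue, name=None):
        super().__init__(name=name)
        self.queue = queue

    def run(self):
        # keep pulling work units until the sentinel arrives
        while True:
            work_unit = self.queue.get()
            if work_unit is SENTINEL:
                break
            try:
                work_unit()  # a work unit is assumed to be a zero-argument callable
            except Exception:
                pass  # keep the worker alive if a single task fails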
Example #4
def main():
    from multiprocessing import Pool
    from multiprocessing import Manager
    pool = Pool(5)
    isright = 0
    precision_queue = Manager().Queue()
    for i_ep, root in enumerate(env.TreeList):
        pool.apply_async(func=sub_main, args=(root, precision_queue))
    pool.close()
    pool.join()

    ql = precision_queue.qsize()
    for i in range(ql):
        isright += precision_queue.get_nowait()
    print(isright / ql)  # qsize() is 0 here, the queue has already been drained
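
env.TreeList and sub_main come from the surrounding project. A hedged sketch of the worker, assuming it evaluates one tree and reports a 1/0 correctness flag through the managed queue (the evaluation itself is a placeholder):

def sub_main(root, precision_queue):
    # placeholder evaluation: the real project decides here whether the prediction for this tree is correct
    is_correct = 1 if root is not None else 0
    precision_queue.put(is_correct)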
Example #5
class MultiProcessFile(object):
    """
    helper for testing multiprocessing

    multiprocessing poses a problem for doctests, since the strategy
    of replacing sys.stdout/stderr with file-like objects then
    inspecting the results won't work: the child processes will
    write to the objects, but the data will not be reflected
    in the parent doctest-ing process.

    The solution is to create file-like objects which will interact with
    multiprocessing in a more desirable way.

    All processes can write to this object, but only the creator can read.
    This allows the testing system to see a unified picture of I/O.
    """
    def __init__(self):
        # per advice at:
        #    http://docs.python.org/library/multiprocessing.html#all-platforms
        self.__master = getpid()
        self.__queue = Manager().Queue()
        self.__buffer = StringIO()
        self.softspace = 0

    def buffer(self):
        if getpid() != self.__master:
            return

        from queue import Empty
        from collections import defaultdict
        cache = defaultdict(str)
        while True:
            try:
                pid, data = self.__queue.get_nowait()
            except Empty:
                break
            if pid == ():
                #show parent output after children
                #this is what users see, usually
                pid = ( 1e100, ) # googol!
            cache[pid] += data
        for pid in sorted(cache):
            #self.__buffer.write( '%s wrote: %r\n' % (pid, cache[pid]) ) #DEBUG
            self.__buffer.write( cache[pid] )
    def write(self, data):
        # note that these pids are in the form of current_process()._identity
        # rather than OS pids
        from multiprocessing import current_process
        pid = current_process()._identity
        self.__queue.put((pid, data))
    def __iter__(self):
        "getattr doesn't work for iter()"
        self.buffer()
        return self.__buffer
    def seek(self, offset, whence=0):
        self.buffer()
        return self.__buffer.seek(offset, whence)
    def getvalue(self):
        self.buffer()
        return self.__buffer.getvalue()
    def __getattr__(self, attr):
        return getattr(self.__buffer, attr)
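
A minimal usage sketch for the class above; the getpid, Manager, and StringIO names it relies on are assumed to come from os, multiprocessing, and io, and the sketch assumes the fork start method (the default on Linux) so the child processes inherit the same object:

from os import getpid
from io import StringIO
from multiprocessing import Manager, Process

mp_file = MultiProcessFile()


def child_writer():
    mp_file.write("hello from child %s\n" % getpid())


if __name__ == "__main__":
    mp_file.write("parent output\n")
    workers = [Process(target=child_writer) for _ in range(2)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    # only the creating process can read; the children's writes are merged in
    print(mp_file.getvalue())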
Example #6
class MultiProcessFile(object):
    """
    helper for testing multiprocessing

    multiprocessing poses a problem for doctests, since the strategy
    of replacing sys.stdout/stderr with file-like objects then
    inspecting the results won't work: the child processes will
    write to the objects, but the data will not be reflected
    in the parent doctest-ing process.

    The solution is to create file-like objects which will interact with
    multiprocessing in a more desirable way.

    All processes can write to this object, but only the creator can read.
    This allows the testing system to see a unified picture of I/O.
    """
    def __init__(self):
        # per advice at:
        #    http://docs.python.org/library/multiprocessing.html#all-platforms
        self.__master = getpid()
        self.__queue = Manager().Queue()
        self.__buffer = StringIO()
        self.softspace = 0

    def buffer(self):
        if getpid() != self.__master:
            return

        from Queue import Empty
        from collections import defaultdict
        cache = defaultdict(str)
        while True:
            try:
                pid, data = self.__queue.get_nowait()
            except Empty:
                break
            if pid == ():
                #show parent output after children
                #this is what users see, usually
                pid = ( 1e100, ) # googol!
            cache[pid] += data
        for pid in sorted(cache):
            #self.__buffer.write( '%s wrote: %r\n' % (pid, cache[pid]) ) #DEBUG
            self.__buffer.write( cache[pid] )
    def write(self, data):
        # note that these pids are in the form of current_process()._identity
        # rather than OS pids
        from multiprocessing import current_process
        pid = current_process()._identity
        self.__queue.put((pid, data))
    def __iter__(self):
        "getattr doesn't work for iter()"
        self.buffer()
        return self.__buffer
    def seek(self, offset, whence=0):
        self.buffer()
        return self.__buffer.seek(offset, whence)
    def getvalue(self):
        self.buffer()
        return self.__buffer.getvalue()
    def __getattr__(self, attr):
        return getattr(self.__buffer, attr)
Example #7
    pool = Pool(processes=40)
    queue = Manager().Queue(maxsize=40000)
    cnt = 0
    for i in range(len(label_raw)):
        pool.apply_async(label_expand, args=(
            i,
            label_raw[i],
            queue,
        ))
    pool.close()
    pool.join()

    label_ex = []
    while True:
        try:
            (i, lb) = queue.get_nowait()
            label_ex.append((i, lb))
        except Exception:  # queue.Empty once the queue has been drained
            break
    print('finish label expansion ', len(label_ex))
    label_ex = sorted(label_ex, key=lambda x: x[0])
    label = []
    for (i, lb) in label_ex:
        label.append(lb)
    '''test code'''
    # plt.plot(input[7])
    # plt.plot(label[7])
    # plt.show()

    x_train = []
    for dat in input:
Example #8
class VocabBuilder(object):
    """
    Class for building a vocabulary
    """
    def __init__(self):
        self.file_full_path_list = self.get_file_path_list()
        self.dict = {
            "[PAD]": 0,
            "[UNK]": 1,
            "[CLS]": 2,
            "[SEP]": 3,
            "[MASK]": 4
        }  # the vocabulary
        self.count_dict = dict()  # counts of each character, e.g. {char1: 10, char2: 12}
        self.sorted_count_list = None  # list of (token, count) pairs sorted by frequency
        self.q_count_dic = Manager().Queue()  # queue holding the count dicts produced by every worker process
        self.q_count_dic_done = Manager().Queue()  # queue of flags marking count dicts that have been merged

    @staticmethod
    def get_file_path_list():
        """
        Get the full paths of all corpus text files
        :return:
        """
        data_path = "../corpus_processed"
        file_list = os.listdir(data_path)
        full_file_path_list = [
            os.path.join(data_path, file) for file in file_list
        ]
        return full_file_path_list

    def tokenize_and_count(self, file):
        """
        Split the text into single characters and count character frequencies
        :param file: path of a txt file
        :return: ['乙', '女', 'は', 'お', '姉', 'さ', 'ま', 'に', '恋', 'し', 'て', 'る', '櫻', 'の', '園', 'の', 'エ', 'ト', 'ワ', 'ー', 'ル']
        """
        basic_tokenizer = BasicTokenizer(
            do_lower_case=False)  # do_lower_case=False is required, otherwise じ would be normalized to し
        token_list = []
        with open(file, mode="r", encoding="utf-8") as fin:
            text = fin.read()
            str_list = basic_tokenizer.tokenize(text)
            for str in str_list:
                token_list += list(str)
        # count token frequencies
        tmp_count_dict = dict()
        for token in token_list:
            if token not in tmp_count_dict:
                tmp_count_dict[token] = 0
            tmp_count_dict[token] += 1
        self.q_count_dic.put(tmp_count_dict)  # put the counts onto the queue

    def join_dicts(self):
        """
        Run in a dedicated process: merge the dicts counted by every worker into the global dict
        :return:
        """
        while True:
            tmp_count_dict = self.q_count_dic.get()  # fetch one count dict from the queue
            all_keys = tmp_count_dict.keys() | self.count_dict.keys()
            self.count_dict = {
                key: tmp_count_dict.get(key, 0) + self.count_dict.get(key, 0)
                for key in all_keys
            }
            self.q_count_dic_done.put(1)  # mark one dict as merged; a flag is enough, no need to put the whole dict
            if self.q_count_dic_done.qsize() == len(self.file_full_path_list):
                self.q_count_dic.put(self.count_dict)  # pass the final result back through the queue
                break

    def build_vocab(self, min_count=None, max_count=None, vocab_size=None):
        """
        Build the vocabulary
        :param min_count: minimum number of occurrences of a character
        :param max_count: maximum number of occurrences of a character
        :param vocab_size: maximum number of tokens in the vocabulary
        :return:
        """
        self.count_dict = self.q_count_dic.get_nowait()  # fetch the final merged dict from the queue
        # filter the frequency dict, keeping only tokens whose count lies in [min_count, max_count]
        if isinstance(min_count, int):
            self.count_dict = {
                key: value
                for key, value in self.count_dict.items() if min_count <= value
            }
        if isinstance(max_count, int):
            self.count_dict = {
                key: value
                for key, value in self.count_dict.items() if value <= max_count
            }
        # sort the count dict by frequency in descending order, then convert it to a list, e.g. [('b', 10), ('a', 9), ('c', 8)]
        self.sorted_count_list = sorted(self.count_dict.items(),
                                        key=lambda x: x[1],
                                        reverse=True)
        # cap the vocabulary size
        if isinstance(vocab_size, int):
            # if vocab_size < len(sorted_count_list), keep only the first vocab_size entries
            if vocab_size < len(self.sorted_count_list):
                self.sorted_count_list = self.sorted_count_list[:vocab_size]
        # store the qualifying tokens from sorted_count_list into self.dict
        for token, _ in self.sorted_count_list:
            # if token not in self.dict.keys() and len(self.dict) < vocab_size:  # this guard would allow feeding the corpus in per-sentence chunks
            self.dict[token] = len(
                self.dict)  # the dict grows from {UNK:0, PAD:1} --> {UNK:0, PAD:1, token:2}

    def save_vocab(self):
        dict_path = "./vocab.txt"
        with open(dict_path, mode="w", encoding="utf-8") as fout:
            for token in self.dict.keys():
                fout.write(token + "\n")
        print("Vocabulary Size:%d" % len(self.dict))

    def show_progress_bar(self):
        """Display a progress bar."""
        with tqdm(total=len(self.file_full_path_list)) as pbar:
            while True:
                pbar.set_description("Building Vocabulary")
                processed_file_num = self.q_count_dic_done.qsize()
                pbar.n = processed_file_num
                pbar.refresh()
                if processed_file_num == len(self.file_full_path_list):
                    break

    def multi_process_token_counter(self):
        po = Pool()
        po.apply_async(self.join_dicts)  # add the dict-merging task
        for file_path in self.file_full_path_list:
            # add one tokenize-and-count task per file
            po.apply_async(self.tokenize_and_count, args=(file_path, ))
        self.show_progress_bar()
        po.close()
        po.join()

    def run(self):
        # multi-process tokenization, frequency counting and dict merging
        self.multi_process_token_counter()
        # build the vocabulary
        self.build_vocab()
        # save the vocabulary
        self.save_vocab()
Example #9
class RelationExtractor:
    """
    Relation Extraction based on Semantic Role Labeling of SENNA
    """
    def __init__(self, data_source=None, relation_sink=None, workers=4):
        """
        :param data_source: data_source object of type DataSource
        :param relation_sink: data_sink object of type DataSink
        :param workers: number of child process workers in source sink mode
        """
        if data_source:
            assert isinstance(data_source, DSource.MongoDataSource),\
                "data_source object must be instance of MongoDataSource"
            self.data_source = data_source

        if relation_sink:
            assert isinstance(relation_sink, DSink.ElasticDataSink), \
                "relation_sink object must be instance of ElasticDataSink"
            self.relation_sink = relation_sink
            self.model_class = self.relation_sink.model_identifier.model_class

        self.relation_annotator = pnt.Annotator()
        self.stemmer = PorterStemmer()
        self.workers = workers
        self.relation_queue = Manager().Queue(maxsize=10000)
        self.persist_attributes = ['relation_annotator', 'stemmer', 'model_class', 'relation_queue']

    def __getstate__(self):
        state = dict()
        for attr in self.persist_attributes:
            state[attr] = self.__dict__[attr]
        return state

    def __setstate__(self, d):
        self.__dict__.update(d)

    @staticmethod
    def __populate_arguments(semantic_element):
        """
        form an argument object from the srl semantic element
        :param semantic_element: SRL semantic element
        :return: RelationArgument instance
        """
        return RelationArgument(A0=semantic_element.get('A0'), A1=semantic_element.get('A1'),
                                A2=semantic_element.get('A2'), A3=semantic_element.get('A3'))

    @staticmethod
    def __populate_modifier(semantic_element):
        """
        form an argument modifier object from the srl semantic element
        :param semantic_element: SRL semantic element
        :return: RelationModifier instance
        """
        return RelationModifier(DIR=semantic_element.get('AM-DIR'), MNR=semantic_element.get('AM-MNR'),
                                LOC=semantic_element.get('AM-LOC'), TMP=semantic_element.get('AM-TMP'),
                                EXT=semantic_element.get('AM-EXT'), PNC=semantic_element.get('AM-PNC'),
                                CAU=semantic_element.get('AM-CAU'), NEG=semantic_element.get('AM-NEG'))

    def form_relations(self, text, block_id, payload, ff, persist=True):
        """
        form relation(s) on a given text
        :param text: text from which to extract the relations;
        text will be sentence tokenized and relations formed at sentence level
        :param block_id: unique identifier of the block
        :param persist: persist the relations extracted from the text in the sink,
        relation_sink needed to be specified
        :return: list of relations
        """
        text_sentences = pattern.tokenize(text)
        relations = []
        for sentence in text_sentences:

            # work with ascii string only
            sentence = "".join((c for c in sentence if 0 < ord(c) < 127))
            try:
                senna_annotation = self.relation_annotator.getAnnotations(sentence)
            except Exception as e:
                logger.error(e)
                continue

            chunk_parse, pos_tags, role_labeling, tokenized_sentence = \
                senna_annotation['chunk'], senna_annotation['pos'], senna_annotation['srl'], \
                senna_annotation['words']

            # nothing to do here empty srl
            if not role_labeling: continue

            for semantic_element in role_labeling:
                arguments = RelationExtractor.__populate_arguments(semantic_element)
                modifiers = RelationExtractor.__populate_modifier(semantic_element)
                verb = semantic_element.get('V')
                # order of the arguments returned is important, A0 --> A1 --> A2 --> A3
                arguments = [v for v in vars(arguments).itervalues() if v]
                modifiers = [v for v in vars(modifiers).itervalues() if v]

                if not arguments: continue
                argument_pairs = [e for e in ((ai, aj) for i, ai in enumerate(arguments) for j, aj
                                              in enumerate(arguments) if i < j)]

                verb = relation_util.normalize_relation(verb)

                for a0, a1 in argument_pairs:
                    en0 = relation_util.form_entity(tokenized_sentence, a0, chunk_parse, pos_tags)
                    en1 = relation_util.form_entity(tokenized_sentence, a1, chunk_parse, pos_tags)
                    if not en0 or not en1: continue
                    relations.append(RelationTuple(left_entity=en0, right_entity=en1, relation=verb,
                                                   sentence=sentence, text=text, block_id=block_id,
                                                   payload=payload, ff = ff))
                    logger.info("generated a relation for ")
                    logger.info(block_id)

                for arg_modifier in modifiers:
                    mod_pos = sentence.find(arg_modifier)
                    linked_arg = min([(a, abs(mod_pos - sentence.find(a))) for a in arguments], key=lambda e: e[1])[0]
                    en0 = relation_util.form_entity(tokenized_sentence, linked_arg, chunk_parse, pos_tags)
                    en1 = relation_util.form_entity(tokenized_sentence, arg_modifier, chunk_parse, pos_tags)
                    if not en0 or not en1: continue
                    relations.append(RelationTuple(left_entity=en0, right_entity=en1, relation=verb,
                                                   sentence=sentence, text=text, block_id=block_id,
                                                   payload=payload, ff=ff))
                    logger.info("generated a relation for ")
                    logger.info(block_id)

        return relations

    def form_relations_source(self, source_item):
        if not source_item:
            logger.error("got an empty source item")
            return

        item_entry = ""
        payload = ""
        ff = ""

        for f_name, f_value in source_item:
            if f_name == "payload":
                payload = f_value
            elif f_name == "ff":
                ff = f_value
            else:
                item_entry += f_value

        if item_entry == ' ': return
        try:
            block_id = str(uuid.uuid1())
            relations = self.form_relations(item_entry, block_id, payload, ff)
        except RuntimeError as e:
            logger.error("Error generating relations")
            logger.error(e)
            return

        for relation in relations:
            sink_relation = self.model_class()
            sink_relation.leftEntity = relation.left_entity
            sink_relation.rightEntity = relation.right_entity
            sink_relation.relation = relation.relation
            sink_relation.sentence = relation.sentence
            sink_relation.text = relation.text
            sink_relation.block_id = relation.block_id
            sink_relation.productName = relation.ff
            sink_relation.webLocation = relation.payload

            logger.info("generated a relation")
            logger.info(sink_relation)

            try:
                self.relation_queue.put(sink_relation, timeout=1)
            except Full as e:
                logger.error(e)

    def sink_relations(self):
        while not self.all_sinked:
            try:
                item = self.relation_queue.get_nowait()
                self.relation_sink.sink_item(item)
            except Empty as e:
                pass

    def form_relations_from_source(self):

        if not self.data_source or not self.relation_sink:
            raise RuntimeError("Data source and sink must be set")

        self.data_source.start()
        self.relation_sink.start()

        self.all_sinked = False
        pool = Pool(processes=self.workers)
        t1 = time.time()
        pool.imap(self.form_relations_source, self.data_source)

        sinker = Thread(target=self.sink_relations, name='Sink-Thread')
        sinker.start()

        pool.close()
        pool.join()
        self.all_sinked = True
        t2 = time.time()
        logger.info("process finished in :: %d  seconds" %(t2 - t1))
Example #10
    pool = Pool(processes=40)
    queue = Manager().Queue(maxsize=20000)
    cnt = 0
    for i in range(len(label_raw)):
        pool.apply_async(label_expand, args=(
            i,
            label_raw[i],
            queue,
        ))
    pool.close()
    pool.join()

    label_ex = []
    while True:
        try:
            (i, lb) = queue.get_nowait()
            label_ex.append((i, lb))
        except Exception:  # queue.Empty once the queue has been drained
            break
    print('finish label expansion ', len(label_ex))
    label_ex = sorted(label_ex, key=lambda x: x[0])
    label = []
    for (i, lb) in label_ex:
        label.append(lb)
    '''test code'''
    # plt.plot(input[7])
    # plt.plot(label[7])
    # plt.show()

    if input_dim < output_dim:
        print('input_dim smaller than output_dim, quit task')
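
label_expand is defined elsewhere in the script. A hedged sketch of the worker used in Examples #7 and #10 above, with a placeholder expansion step; the index is returned alongside the result so the parent can restore the original order:

def label_expand(i, raw_label, queue):
    # placeholder expansion: the real project interpolates/expands the raw label here
    expanded = [raw_label] * 2
    queue.put((i, expanded))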
Example #11
class Crawler(object):
    def __init__(self, cookie: str = None, headers: dict = None, max_num: int = 10000, domain_regs: list = None,
                 depth: int = 5):
        self.cookie = cookie
        self.headers = headers if headers else DEFAULT_HEADERS
        self.waiting_queue = Manager().Queue(maxsize=max_num * 2)
        self.current_queue = Manager().Queue(maxsize=max_num * 2)
        self.max_url_num = max_num
        self.crawled_urls = BloomFilter(element_num=max_num * 5, error_rate=0.01)
        self.url_dict = Manager().dict()
        self.domain_reg_list = domain_regs
        self.depth = depth
        self.current_depth = 0
        self.filter_exts = [
            'css', 'png', 'gif', 'jpg', 'jpeg', 'swf', 'tiff',
            'pdf', 'ico', 'flv', 'mp4', 'mp3', 'avi', 'mpg', 'gz',
            'mpeg', 'iso', 'dat', 'mov', 'rar', 'exe', 'zip', 'tar',
            'bin', 'bz2', 'xsl', 'doc', 'docx', 'ppt', 'pptx', 'xls',
            'xlsx', 'csv', 'map', "ttf", 'tif', 'woff', 'woff2',
            'cab', 'apk', 'bmp', 'svg', 'exif', 'xml', 'rss', 'webp', 'js'
        ]

    def run(self, urls):
        self.consist_headers()
        # by default, only crawl urls under the root domains of the seed urls
        self.domain_reg_list = self.parse_domain(urls) if not self.domain_reg_list else self.domain_reg_list
        self._init_reg()
        for url in urls:
            self.call_crawl_handler(url)
        print('all task done')
        print(self.url_dict)

    def call_crawl_handler(self, url):
        if 'http' not in url:
            init_url = 'http://' + url
        else:
            init_url = url
        self.current_queue.put_nowait(init_url)
        # to avoid re-crawling the seed url, add it to the bloom filter up front
        init_url_without_protocol = url.split('//')[-1]
        self.crawled_urls.add(init_url_without_protocol)
        while self.current_depth < self.depth:
            if len(self.url_dict) >= self.max_url_num:
                print('reached the preset crawl limit, stopping the crawler')
                break
            print('current crawl depth: {}'.format(self.current_depth))
            tmp_results = []
            # use a process pool to crawl this depth level
            pool = Pool(os.cpu_count() * 2)
            while not self.current_queue.empty():
                print('{} urls remaining in the pending queue'.format(self.current_queue.qsize()))
                url = self.current_queue.get_nowait()
                if not url.endswith('js'):
                    result = pool.apply_async(func=self.crawl_handler, args=(url,))
                    tmp_results.append(result)
                    # self.crawl_handler(url)
            pool.close()
            pool.join()
            tmp_reqs = []
            for result in tmp_results:
                for r in result.get():
                    tmp_reqs.append(r)
            self._handle_url(tmp_reqs)
            self.current_queue = self.waiting_queue
            self.waiting_queue = Manager().Queue(maxsize=self.max_url_num * 2)
            self.current_depth += 1
            print('depth {} crawl done'.format(self.current_depth))

    def consist_headers(self):
        if self.cookie:
            self.headers['Cookie'] = self.cookie

    @staticmethod
    def parse_domain(domain_list):
        """
        Parse the input urls or domains into bare root domains, used later for same-domain checks
        :param domain_list:
        :return:
        """

        def _split_url_protocol_and_path(domain):
            # strip the protocol
            if '://' in domain:
                domain = domain.split('://')[1]
            # strip the path
            if '.com.cn' in domain:
                return domain.split('.com.cn')[0] + '.com.cn'
            if '.com' in domain:
                return domain.split('.com')[0] + '.com'
            if '.io' in domain:
                return domain.split('.io')[0] + '.io'
            # for ip:port style urls, just cut at the first '/'
            return domain.split('/')[0]

        return [_split_url_protocol_and_path(domain) for domain in domain_list]

    @staticmethod
    def _parse_post_data(post_data) -> str:
        """
        Parse the data of a dynamic (POST) request into a single string
        :param post_data:
        :return:
        """
        if not post_data:
            return ''
        if not isinstance(post_data, dict):
            if '=' in post_data:
                param_dict = {}
                if '&' in post_data:
                    params_couples = post_data.split('&')
                    for param in params_couples:
                        if '=' not in param:
                            continue
                        k, v = param.split('=')
                        param_dict[k] = v
                else:
                    k, v = post_data.split('=')
                    param_dict[k] = v
                post_data = param_dict
            else:
                post_data = json.loads(post_data)
        post_data_list = [k for k, _ in post_data.items()]
        post_data_list.sort()
        return ''.join([param + '&' for param in post_data_list])[:-1]

    @staticmethod
    def parse_static_url(url):
        """
        Reassemble a parsed static url into a dict of the form
        {
            'url': 'xxxxxx',
            'originUrl': 'xxxxxx/a=aa',
            'method': 'GET',
            'queryString': 'a=aa'
        }
        :param url:
        :return:
        """
        try:
            req = dict()
            req['method'] = 'GET'
            req['originUrl'] = url
            if '?' not in url:
                req['url'] = url
                return req
            url_consist = url.split('?')
            req['url'] = url_consist[0]
            params = url_consist[1]
            if '&' not in params:
                params_consist = params.split('=')
                req['queryString'] = params_consist[0] if params_consist[0] else ''
                return req
            multi_params = params.split('&')
            params_list = list(map(lambda y: y.split('=')[0], filter(lambda x: '=' in x, multi_params)))
            # sort the parameters alphabetically
            params_list.sort()
            req['queryString'] = ''.join([key + '=&' for key in params_list])[:-2]
            return req
        except Exception:
            msg = traceback.format_exc()
            print(msg)
            return None

    def _init_reg(self):
        """
        Build a regex from the parsed domains, used for same-domain validation
        :return:
        """
        domain_reg = ['^']
        domain_reg.extend(['(http|https):\/\/' + domain.replace('.', '\.') + '.*|' for domain in self.domain_reg_list])
        # domain_reg.extend(map(lambda x: '(http|https):\/\/' + x.replace('.', '\.') + '.*|', self.domain_reg_list))
        tmp_domain_reg = ''.join(domain_reg)
        self.domain_reg = tmp_domain_reg[:-1] + '$'

    def filter_ext(self, url):
        """
        Filter out urls with special extensions, such as static resources;
        return True if the url's extension is in the exclusion list
        :param url:
        :return:
        """
        try:
            f = url.split('/')[-1].strip()
            if '.' in f:
                ext = f.split('.')[-1].strip().lower()
                if ext and ext in self.filter_exts:
                    return True
                else:
                    return False
            return False
        except Exception:
            msg = traceback.format_exc()
            print(msg)
            return False

    def filter_url_by_domain(self, url):
        """
        Check whether the current url satisfies the conditions:
        return True if it matches the allowed domains and is not in the excluded url set, otherwise False
        :param url:
        :return:
        """
        # check the domain
        if not re.match(self.domain_reg, url, flags=0):
            return False
        # TODO: complete this part later
        # if len(self.exclude_urls) == 0:
        #    return True
        # check exclude_urls
        # if re.match(self.exclude_urls_reg_str, url, flags=0):
        #    return False
        return True

    def static_crawler(self, page, results, url) -> List["ElementHandle"]:
        """
        Parse static urls from the page; currently covers the href and src attributes of <a> tags
        """
        links = page.query_selector_all("//a")
        tmp_link = []
        for link in links:
            href = link.get_property("href").json_value()
            src = link.get_property("src").json_value()
            if not href or href == url:
                continue
            if not self.filter_ext(url=href) and self.filter_url_by_domain(url=href):
                req = self.parse_static_url(href)
                if req:
                    print('href:{}'.format(req))
                    results.append(req)

            if not src or src == url:
                continue
            if not self.filter_ext(url=src) and self.filter_url_by_domain(url=src):
                req = self.parse_static_url(src)
                if req:
                    print('src:{}'.format(req))
                    results.append(req)

            # some <a> tags use javascript: pseudo-urls to trigger js; collect these links so they can be clicked later
            if 'javascript' in href or 'javascript' in src:
                tmp_link.append(link)
        return tmp_link

    def _check_crawled_url(self, url) -> bool:
        """
        Check whether the url has already been crawled; return True if it has not
        :param url:
        :return:
        """
        if url in self.crawled_urls:
            return False
        return True

    def _check_url_is_exist_by_md5(self, url_dict):
        """
        Use an MD5 to check whether the url is a duplicate
        :param url_dict:
        :return:
        """
        try:
            exist_md5 = list(url_dict.keys())[0]
            if exist_md5 in self.crawled_urls:
                return False
            return True
        except Exception:
            msg = traceback.format_exc()
            print(msg)
            return True

    @staticmethod
    def calculate_md5(url_har):
        """
        Compute an MD5 hash for deduplication
        :param url_har:
        :return:
        """
        url = url_har['url']
        # some POST requests append a timestamp to the url to prevent replay
        tmp_list = url.split('//')[-1].split('?')
        url_without_protocol = tmp_list[0] if len(tmp_list) > 1 else url.split('//')[-1]
        method = url_har['method']
        query_string = ''
        post_data = ''
        if 'queryString' in url_har:
            query_string = url_har['queryString']
        if 'postData' in url_har:
            post_data = url_har['postData']
        tmp_str = url_without_protocol + '&' + method + '&' + query_string + post_data
        return hashlib.md5(tmp_str.encode('utf-8')).hexdigest()

    def _handle_url(self, req_list):
        """
        Process the crawled urls: decide whether each one should be filtered out or has already been crawled
        :param req_list:
        :return:
        """
        if not req_list:
            return
        insert_req_list = list()
        for req in req_list:
            url = req['originUrl']
            if url.endswith('/'):
                url = url[:-1]
            url_without_protocol = url.split('//')[-1]
            '''
            After parsing, each returned structure contains: url, queryString (if present), method.
            The url must be checked for:
            1. whether it is already in the final url collection
            2. whether it has already been crawled
            3. whether its extension is in the filter set (checked first; if so, skip it immediately)
            '''
            md5 = self.calculate_md5(req)
            tmp_dict = {
                md5: req
            }
            if self._check_url_is_exist_by_md5(tmp_dict):
                if len(self.url_dict.keys()) < self.max_url_num:
                    self.url_dict[md5] = req
                    # TODO: a customized taskId could be inserted here later
                    insert_req_list.append({'taskId': 'test12', 'urlDict': json.dumps(req)})
                # if not crawled yet, put it into the queue for the next round
                if self._check_crawled_url(url_without_protocol) and not self.waiting_queue.full():
                    self.waiting_queue.put_nowait(req['originUrl'])
                    self.crawled_urls.add(url_without_protocol)

    def crawl_handler(self, url) -> list:
        result = []

        def intercept(route: Route, request: Request):
            # intercept front-end navigations by forcing the response status to 204; TODO: refine the hook logic for front-end redirects later
            if request.is_navigation_request() and request.frame.parent_frame:
                request.response().status = 204
                route.continue_()
                return
            # try to intercept back-end redirects
            if request.redirected_to:
                if request.post_data_json:
                    request.response().status = 200
                    self.waiting_queue.put_nowait(request.redirected_to.url)
                else:
                    ...
                route.continue_()
                return
            resource_type = request.resource_type
            # filter dynamic requests by resource type
            if resource_type in ['image', 'media', 'eventsource', 'websocket']:
                route.abort()
            else:
                url_origin = request.url
                if not url_origin:
                    route.continue_()
                    return
                if not self.filter_ext(url=url_origin) and self.filter_url_by_domain(url=url_origin):
                    headers = request.headers
                    method = request.method
                    post_data_json: dict = request.post_data_json
                    http_har = dict()
                    if method == 'POST' or method == 'PUT':
                        post_data_origin = post_data_json
                        post_data_handled = self._parse_post_data(post_data_origin)
                        content_type = headers['content-type'] if 'content-type' in headers else ''
                        http_har['originPostData'] = post_data_origin
                        http_har['postData'] = post_data_handled
                        http_har['contentType'] = content_type
                        http_har['url'] = url_origin
                        http_har['originUrl'] = url_origin
                        http_har['method'] = method
                    if method == 'GET':
                        http_har = self.parse_static_url(url_origin)
                    result.append(http_har)
                route.continue_()

        with sync_playwright() as p:
            browser = p.webkit.launch(headless=True, chromium_sandbox=True, )
            page = browser.new_page()
            page.set_default_navigation_timeout(30000)
            page.set_extra_http_headers(self.headers)
            page.route('**/*', intercept)
            page.goto(url)
            page.wait_for_load_state(state='networkidle', timeout=30000)

            tmp_links = self.static_crawler(page, result, url)
            page.evaluate(FORM_FILL_UPLOAD_JS)
            for link in tmp_links:
                link.click()
            page.close()
            browser.close()
        return result
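
A short usage sketch for the crawler above; it assumes the project's BloomFilter, DEFAULT_HEADERS and FORM_FILL_UPLOAD_JS helpers and a working playwright installation are available:

if __name__ == '__main__':
    # crawl at most 100 urls, up to 2 levels deep, restricted to the seed's root domain by default
    crawler = Crawler(max_num=100, depth=2)
    crawler.run(['http://example.com'])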
Example #12
class RelationExtractor:
    """
    Relation Extraction based on Semantic Role Labeling of SENNA
    """
    def __init__(self, data_source=None, relation_sink=None, workers=4):
        """
        :param data_source: data_source object of type DataSource
        :param relation_sink: data_sink object of type DataSink
        :param workers: number of child process workers in source sink mode
        """
        if data_source:
            assert isinstance(data_source, DSource.MongoDataSource),\
                "data_source object must be instance of MongoDataSource"
            self.data_source = data_source

        if relation_sink:
            assert isinstance(relation_sink, DSink.ElasticDataSink), \
                "relation_sink object must be instance of ElasticDataSink"
            self.relation_sink = relation_sink
            self.model_class = self.relation_sink.model_identifier.model_class

        self.relation_annotator = pnt.Annotator()
        self.stemmer = PorterStemmer()
        self.workers = workers
        self.relation_queue = Manager().Queue(maxsize=10000)
        self.persist_attributes = [
            'relation_annotator', 'stemmer', 'model_class', 'relation_queue'
        ]

    def __getstate__(self):
        state = dict()
        for attr in self.persist_attributes:
            state[attr] = self.__dict__[attr]
        return state

    def __setstate__(self, d):
        self.__dict__.update(d)

    @staticmethod
    def __populate_arguments(semantic_element):
        """
        form an argument object from the srl semantic element
        :param semantic_element: SRL semantic element
        :return: RelationArgument instance
        """
        return RelationArgument(A0=semantic_element.get('A0'),
                                A1=semantic_element.get('A1'),
                                A2=semantic_element.get('A2'),
                                A3=semantic_element.get('A3'))

    @staticmethod
    def __populate_modifier(semantic_element):
        """
        form an argument modifier object from the srl semantic element
        :param semantic_element: SRL semantic element
        :return: RelationModifier instance
        """
        return RelationModifier(DIR=semantic_element.get('AM-DIR'),
                                MNR=semantic_element.get('AM-MNR'),
                                LOC=semantic_element.get('AM-LOC'),
                                TMP=semantic_element.get('AM-TMP'),
                                EXT=semantic_element.get('AM-EXT'),
                                PNC=semantic_element.get('AM-PNC'),
                                CAU=semantic_element.get('AM-CAU'),
                                NEG=semantic_element.get('AM-NEG'))

    def form_relations(self, text, block_id, payload, ff, persist=True):
        """
        form relation(s) on a given text
        :param text: text from which to extract the relations;
        text will be sentence tokenized and relations formed at sentence level
        :param block_id: unique identifier of the block
        :param persist: persist the relations extracted from the text in the sink,
        relation_sink needed to be specified
        :return: list of relations
        """
        text_sentences = pattern.tokenize(text)
        relations = []
        for sentence in text_sentences:

            # work with ascii string only
            sentence = "".join((c for c in sentence if 0 < ord(c) < 127))
            try:
                senna_annotation = self.relation_annotator.getAnnotations(
                    sentence)
            except Exception as e:
                logger.error(e)
                continue

            chunk_parse, pos_tags, role_labeling, tokenized_sentence = \
                senna_annotation['chunk'], senna_annotation['pos'], senna_annotation['srl'], \
                senna_annotation['words']

            # nothing to do here empty srl
            if not role_labeling: continue

            for semantic_element in role_labeling:
                arguments = RelationExtractor.__populate_arguments(
                    semantic_element)
                modifiers = RelationExtractor.__populate_modifier(
                    semantic_element)
                verb = semantic_element.get('V')
                # order of the arguments returned is important, A0 --> A1 --> A2 --> A3
                arguments = [v for v in vars(arguments).itervalues() if v]
                modifiers = [v for v in vars(modifiers).itervalues() if v]

                if not arguments: continue
                argument_pairs = [
                    e for e in ((ai, aj) for i, ai in enumerate(arguments)
                                for j, aj in enumerate(arguments) if i < j)
                ]

                verb = relation_util.normalize_relation(verb)

                for a0, a1 in argument_pairs:
                    en0 = relation_util.form_entity(tokenized_sentence, a0,
                                                    chunk_parse, pos_tags)
                    en1 = relation_util.form_entity(tokenized_sentence, a1,
                                                    chunk_parse, pos_tags)
                    if not en0 or not en1: continue
                    relations.append(
                        RelationTuple(left_entity=en0,
                                      right_entity=en1,
                                      relation=verb,
                                      sentence=sentence,
                                      text=text,
                                      block_id=block_id,
                                      payload=payload,
                                      ff=ff))
                    logger.info("generated a relation for ")
                    logger.info(block_id)

                for arg_modifier in modifiers:
                    mod_pos = sentence.find(arg_modifier)
                    linked_arg = min([(a, abs(mod_pos - sentence.find(a)))
                                      for a in arguments],
                                     key=lambda e: e[1])[0]
                    en0 = relation_util.form_entity(tokenized_sentence,
                                                    linked_arg, chunk_parse,
                                                    pos_tags)
                    en1 = relation_util.form_entity(tokenized_sentence,
                                                    arg_modifier, chunk_parse,
                                                    pos_tags)
                    if not en0 or not en1: continue
                    relations.append(
                        RelationTuple(left_entity=en0,
                                      right_entity=en1,
                                      relation=verb,
                                      sentence=sentence,
                                      text=text,
                                      block_id=block_id,
                                      payload=payload,
                                      ff=ff))
                    logger.info("generated a relation for ")
                    logger.info(block_id)

        return relations

    def form_relations_source(self, source_item):
        if not source_item:
            logger.error("got an empty source item")
            return

        item_entry = ""
        payload = ""
        ff = ""

        for f_name, f_value in source_item:
            if f_name == "payload":
                payload = f_value
            elif f_name == "ff":
                ff = f_value
            else:
                item_entry += f_value

        if item_entry == ' ': return
        try:
            block_id = str(uuid.uuid1())
            relations = self.form_relations(item_entry, block_id, payload, ff)
        except RuntimeError as e:
            logger.error("Error generating relations")
            logger.error(e)
            return

        for relation in relations:
            sink_relation = self.model_class()
            sink_relation.leftEntity = relation.left_entity
            sink_relation.rightEntity = relation.right_entity
            sink_relation.relation = relation.relation
            sink_relation.sentence = relation.sentence
            sink_relation.text = relation.text
            sink_relation.block_id = relation.block_id
            sink_relation.productName = relation.ff
            sink_relation.webLocation = relation.payload

            logger.info("generated a relation")
            logger.info(sink_relation)

            try:
                self.relation_queue.put(sink_relation, timeout=1)
            except Full as e:
                logger.error(e)

    def sink_relations(self):
        while not self.all_sinked:
            try:
                item = self.relation_queue.get_nowait()
                self.relation_sink.sink_item(item)
            except Empty as e:
                pass

    def form_relations_from_source(self):

        if not self.data_source or not self.relation_sink:
            raise RuntimeError("Data source and sink must be set")

        self.data_source.start()
        self.relation_sink.start()

        self.all_sinked = False
        pool = Pool(processes=self.workers)
        t1 = time.time()
        pool.imap(self.form_relations_source, self.data_source)

        sinker = Thread(target=self.sink_relations, name='Sink-Thread')
        sinker.start()

        pool.close()
        pool.join()
        self.all_sinked = True
        t2 = time.time()
        logger.info("process finished in :: %d  seconds" % (t2 - t1))
Example #13
        except IndexError as e:
            pass
        else:
            response = resp.group(1)
            if resp:
                ret = urllib.parse.unquote(response)
                return_rule = get_return_rule(json.loads(ret)['response'])
                result_queue.put_nowait(return_rule)
            else:
                print('no response')


if __name__ == '__main__':
    pool = Pool(4)
    qid_queue = Manager().Queue()
    # result_queue = Manager().Queue()
    pool.apply_async(find_log, args=('query', 0, qid_queue,), callback=call_back)
    print('dispatched a worker to look for qids')
    while True:
        qid = qid_queue.get()
        if not isinstance(qid, int):
            break
        pool.apply_async(find_log, args=('responseServer', qid, qid_queue,), callback=call_back)
    pool.close()
    pool.join()
    index = 1
    while qid_queue.qsize() > 0:
        item = qid_queue.get_nowait()
        if not isinstance(item, int) and index < 30:
            csv_file = open('travco_return_rule.csv', 'a', encoding='utf8')
            writer = csv.writer(csv_file)
            writer.writerow([item])
            csv_file.close()
            index += 1
Example #14
    args = parser.parse_args()

    # init pool and queue
    q_get = Manager().Queue()
    q_put = Manager().Queue()
    pool = Pool(processes = args.process)

    # put pic path into get queue: q_get
    pics = os.listdir(args.folder)
    for pic in pics:
        path = os.path.join(args.folder, pic)
        q_get.put(path)
    
    # use multi process to get hash result and put the result into q_put
    n = q_get.qsize()
    while True:
        try:
            image_path = q_get.get_nowait()
            i = q_get.qsize()
        except Exception:  # queue.Empty once q_get has been drained
            break
        else:
            pool.apply_async(get_hash_queue, (i, n, image_path, q_put, ))
    pool.close()
    pool.join()

    sys.stdout.write('\n')
    # get hash bucket
    image_bucket, hash_bucket0, hash_bucket1, hash_bucket2, hash_bucket3 = get_hash_bucket(q_put)
    # image deduplicate
    image_deduplication(image_bucket, hash_bucket0, hash_bucket1, hash_bucket2, hash_bucket3)
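
get_hash_queue, get_hash_bucket and image_deduplication are defined elsewhere in the script. A hedged sketch of the worker only, using an MD5 of the raw file bytes as a stand-in for whatever image hash the original computes:

import hashlib
import sys


def get_hash_queue(i, n, image_path, q_put):
    # hash the raw file bytes (the real script likely uses a perceptual image hash instead)
    with open(image_path, 'rb') as f:
        digest = hashlib.md5(f.read()).hexdigest()
    # push (path, hash) so the parent can bucket duplicates afterwards
    q_put.put((image_path, digest))
    # crude progress indicator: i is the number of items still waiting in q_get
    sys.stdout.write('\rprocessed %d/%d' % (n - i, n))
    sys.stdout.flush()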