import os
from multiprocessing import Pool, Manager


def main():
    # Get the name of the folder to copy
    oldFolderName = input("Enter the folder you want to copy: ")
    print(oldFolderName)

    # Create the destination folder (fixed: concatenate strings, not a string and a list)
    newFolderName = oldFolderName + "-copy"
    os.mkdir(newFolderName)

    # Get the names of all files in the source folder
    filesName = os.listdir(oldFolderName)

    # Copy with a pool of worker processes
    pool = Pool(5)
    queue = Manager().Queue()
    for name in filesName:
        pool.apply_async(filesCopy, (name, oldFolderName, newFolderName, queue, ))

    # Show progress: each finished copy puts one item on the queue
    num = 0
    allNum = len(filesName)
    while num < allNum:
        queue.get()  # block until a worker reports completion
        num += 1
        copyRate = num / allNum
        print("\rcopy progress: %.2f%%" % (copyRate * 100), end="")

    pool.close()
    pool.join()
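# A minimal sketch of the filesCopy worker that main() above assumes exists; the
# name and signature are taken from the apply_async call, the body is illustrative.
def filesCopy(name, oldFolderName, newFolderName, queue):
    # Read the source file and write it into the destination folder
    with open(os.path.join(oldFolderName, name), "rb") as fr:
        content = fr.read()
    with open(os.path.join(newFolderName, name), "wb") as fw:
        fw.write(content)
    # Report completion so the parent can update its progress counter
    queue.put(name)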
class Pool(object):
    """
    The Pool class represents a pool of worker threads. It has methods
    which allow tasks to be offloaded to the worker processes in a few
    different ways.
    """

    def __init__(self, num_workers, name="Pool"):
        """
        \param num_workers (integer) number of worker threads to start
        \param name (string) prefix for the worker threads' name
        """
        self.queue = Manager().Queue()
        self.closed = False
        self.workers = []
        for idx in range(num_workers):
            process = PoolWorker(self.queue, name="%s-Worker-%d" % (name, idx))
            process.daemon = True
            try:
                process.start()
            except:
                # If one thread has a problem, undo everything
                self.terminate()
                raise
            else:
                self.workers.append(process)

    def submit(self, work_unit):
        self.queue.put(work_unit)

    def close(self):
        """Prevents any more tasks from being submitted to the pool.
        Once all the tasks have been completed the worker processes will exit."""
        # No lock here. We assume it's sufficiently atomic...
        self.closed = True

    def terminate(self):
        """Stops the worker processes immediately without completing
        outstanding work. When the pool object is garbage collected
        terminate() will be called immediately."""
        self.close()

        # Clearing the job queue
        try:
            while 1:
                self.queue.get_nowait()
        # except Manager().Queue.empty():
        except:
            pass

        # Send one sentinel for each worker thread: each thread will die
        # eventually, leaving the next sentinel for the next thread
        for process in self.workers:
            self.queue.put(SENTINEL)
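# Hypothetical usage of the Pool class above, assuming a PoolWorker implementation
# that consumes work units from the shared queue and exits when it sees SENTINEL;
# neither PoolWorker nor SENTINEL is defined in this snippet.
pool = Pool(num_workers=4, name="Demo")
for unit in ["job-1", "job-2", "job-3"]:
    pool.submit(unit)   # enqueue work for the PoolWorker processes
pool.close()            # no more submissions; per the docstring, workers drain and exit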
def main():
    from multiprocessing import Pool
    from multiprocessing import Manager

    pool = Pool(5)
    isright = 0
    precision_queue = Manager().Queue()
    for i_ep, root in enumerate(env.TreeList):
        pool.apply_async(func=sub_main, args=(root, precision_queue))
    pool.close()
    pool.join()

    # Drain the queue once and divide by the number of results actually
    # collected (the queue is empty after the loop, so qsize() would be 0).
    ql = precision_queue.qsize()
    for i in range(ql):
        isright += precision_queue.get_nowait()
    print(isright / ql)
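# A minimal sketch of the sub_main worker referenced above; env.TreeList and the
# evaluation step are assumptions (evaluate_tree is a hypothetical helper), but the
# queue protocol of putting 1 for a correct result and 0 otherwise matches how
# main() aggregates precision.
def sub_main(root, precision_queue):
    prediction_is_correct = evaluate_tree(root)   # hypothetical per-tree evaluation
    precision_queue.put(1 if prediction_is_correct else 0)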
class MultiProcessFile(object):
    """
    helper for testing multiprocessing

    multiprocessing poses a problem for doctests, since the strategy of
    replacing sys.stdout/stderr with file-like objects then inspecting the
    results won't work: the child processes will write to the objects, but
    the data will not be reflected in the parent doctest-ing process.

    The solution is to create file-like objects which will interact with
    multiprocessing in a more desirable way.

    All processes can write to this object, but only the creator can read.
    This allows the testing system to see a unified picture of I/O.
    """

    def __init__(self):
        # per advice at:
        # http://docs.python.org/library/multiprocessing.html#all-platforms
        self.__master = getpid()
        self.__queue = Manager().Queue()
        self.__buffer = StringIO()
        self.softspace = 0

    def buffer(self):
        if getpid() != self.__master:
            return

        from queue import Empty
        from collections import defaultdict
        cache = defaultdict(str)
        while True:
            try:
                pid, data = self.__queue.get_nowait()
            except Empty:
                break
            if pid == ():
                # show parent output after children
                # this is what users see, usually
                pid = (1e100,)  # googol!
            cache[pid] += data
        for pid in sorted(cache):
            # self.__buffer.write('%s wrote: %r\n' % (pid, cache[pid]))  # DEBUG
            self.__buffer.write(cache[pid])

    def write(self, data):
        # note that these pids are in the form of current_process()._identity
        # rather than OS pids
        from multiprocessing import current_process
        pid = current_process()._identity
        self.__queue.put((pid, data))

    def __iter__(self):
        "getattr doesn't work for iter()"
        self.buffer()
        return self.__buffer

    def seek(self, offset, whence=0):
        self.buffer()
        return self.__buffer.seek(offset, whence)

    def getvalue(self):
        self.buffer()
        return self.__buffer.getvalue()

    def __getattr__(self, attr):
        return getattr(self.__buffer, attr)
pool = Pool(processes=40)
queue = Manager().Queue(maxsize=40000)
cnt = 0
for i in range(len(label_raw)):
    pool.apply_async(label_expand, args=(i, label_raw[i], queue, ))
pool.close()
pool.join()

# Drain the result queue; each worker put an (index, expanded_label) tuple
label_ex = []
while True:
    try:
        (i, lb) = queue.get_nowait()
        label_ex.append((i, lb))
    except Exception:  # queue is empty
        break
print('finish label expansion ', len(label_ex))

# Restore the original ordering, then keep only the labels
label_ex = sorted(label_ex, key=lambda x: x[0])
label = []
for (i, lb) in label_ex:
    label.append(lb)
'''test code'''
# plt.plot(input[7])
# plt.plot(label[7])
# plt.show()

x_train = []
for dat in input:
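# A minimal sketch of the label_expand worker used above; the expansion itself is
# an assumption (here it simply forwards the raw label), but the protocol of
# putting (index, label) tuples onto the queue matches the collection loop.
def label_expand(i, lb_raw, queue):
    lb = lb_raw                # placeholder for the real expansion logic
    queue.put((i, lb))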
class VocabBuilder(object):
    """Class used to build a vocabulary."""

    def __init__(self):
        self.file_full_path_list = self.get_file_path_list()
        self.dict = {
            "[PAD]": 0,
            "[UNK]": 1,
            "[CLS]": 2,
            "[SEP]": 3,
            "[MASK]": 4
        }  # the vocabulary
        self.count_dict = dict()  # per-character counts, e.g. {char1: 10, char2: 12}
        self.sorted_count_list = None  # list of (token, count) sorted by frequency
        self.q_count_dic = Manager().Queue()  # holds the count dicts produced by all workers
        self.q_count_dic_done = Manager().Queue()  # holds one marker per count dict that has been merged

    @staticmethod
    def get_file_path_list():
        """Return the full paths of all corpus files."""
        data_path = "../corpus_processed"
        file_list = os.listdir(data_path)
        full_file_path_list = [
            os.path.join(data_path, file) for file in file_list
        ]
        return full_file_path_list

    def tokenize_and_count(self, file):
        """
        Split the text into characters and count their frequencies.
        :param file: txt file
        :return: ['乙', '女', 'は', 'お', '姉', 'さ', 'ま', 'に', '恋', 'し', 'て', 'る', '櫻', 'の', '園', 'の', 'エ', 'ト', 'ワ', 'ー', 'ル']
        """
        basic_tokenizer = BasicTokenizer(
            do_lower_case=False)  # do_lower_case=False is required so that じ is not turned into し
        token_list = []
        with open(file, mode="r", encoding="utf-8") as fin:
            text = fin.read()
        str_list = basic_tokenizer.tokenize(text)
        for str in str_list:
            token_list += list(str)
        # count frequencies
        tmp_count_dict = dict()
        for token in token_list:
            if token not in tmp_count_dict:
                tmp_count_dict[token] = 0
            tmp_count_dict[token] += 1
        self.q_count_dic.put(tmp_count_dict)  # push this file's counts onto the queue

    def join_dicts(self):
        """
        Run in a dedicated process: merge the dicts produced by all workers into the global dict.
        :return:
        """
        while True:
            tmp_count_dict = self.q_count_dic.get()  # fetch one per-file count dict from the queue
            all_keys = tmp_count_dict.keys() | self.count_dict.keys()
            self.count_dict = {
                key: tmp_count_dict.get(key, 0) + self.count_dict.get(key, 0)
                for key in all_keys
            }
            self.q_count_dic_done.put(1)  # record that one dict has been merged; a marker is enough, no need to put the whole dict
            if self.q_count_dic_done.qsize() == len(self.file_full_path_list):
                self.q_count_dic.put(self.count_dict)  # pass the final result back through the queue
                break

    def build_vocab(self, min_count=None, max_count=None, vocab_size=None):
        """
        Build the vocabulary.
        :param min_count: minimum number of occurrences of a character
        :param max_count: maximum number of occurrences of a character
        :param vocab_size: maximum number of tokens in the vocabulary
        :return:
        """
        self.count_dict = self.q_count_dic.get_nowait()  # fetch the final merged dict from the queue
        # Filter the count dict, keeping only tokens whose frequency lies in [min_count, max_count]
        if isinstance(min_count, int):
            self.count_dict = {
                key: value
                for key, value in self.count_dict.items() if min_count <= value
            }
        if isinstance(max_count, int):
            self.count_dict = {
                key: value
                for key, value in self.count_dict.items() if value <= max_count
            }
        # Sort the count dict by frequency in descending order and turn it into a list, e.g. [('b', 10), ('a', 9), ('c', 8)]
        self.sorted_count_list = sorted(self.count_dict.items(),
                                        key=lambda x: x[1],
                                        reverse=True)
        # Limit the vocabulary size
        if isinstance(vocab_size, int):
            # If vocab_size < len(sorted_count_list), keep only the first vocab_size entries
            if vocab_size < len(self.sorted_count_list):
                self.sorted_count_list = self.sorted_count_list[:vocab_size]
        # Store the surviving tokens of sorted_count_list in self.dict
        for token, _ in self.sorted_count_list:
            # if token not in self.dict.keys() and len(self.dict) < vocab_size:  # needed if sentences are passed in chunks
            self.dict[token] = len(
                self.dict)  # dict grows like {UNK:0, PAD:1} --> {UNK:0, PAD:1, token:2}

    def save_vocab(self):
        dict_path = "./vocab.txt"
        with open(dict_path, mode="w", encoding="utf-8") as fout:
            for token in self.dict.keys():
                fout.write(token + "\n")
        print("Vocabulary Size:%d" % len(self.dict))

    def show_progress_bar(self):
        """Display a progress bar."""
        with tqdm(total=len(self.file_full_path_list)) as pbar:
            while True:
                pbar.set_description("Building Vocabulary")
                processed_file_num = self.q_count_dic_done.qsize()
                pbar.n = processed_file_num
                pbar.refresh()
                if processed_file_num == len(self.file_full_path_list):
                    break

    def multi_process_token_counter(self):
        po = Pool()
        po.apply_async(self.join_dicts)  # submit the dict-merging task
        for file_path in self.file_full_path_list:
            # submit one tokenize-and-count task per file
            po.apply_async(self.tokenize_and_count, args=(file_path, ))
        self.show_progress_bar()
        po.close()
        po.join()

    def run(self):
        # Tokenize, count frequencies and merge the dicts with multiple processes
        self.multi_process_token_counter()
        # Build the vocabulary
        self.build_vocab()
        # Save the vocabulary
        self.save_vocab()
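# Hypothetical entry point for the builder above; run() uses the default
# build_vocab arguments, which is an assumption about how the original project
# invoked it.
if __name__ == "__main__":
    builder = VocabBuilder()
    builder.run()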
class RelationExtractor:
    """
    Relation Extraction based on Semantic Role Labeling of SENNA
    """

    def __init__(self, data_source=None, relation_sink=None, workers=4):
        """
        :param data_source: data_source object of type DataSource
        :param relation_sink: data_sink object of type DataSink
        :param workers: number of child process workers in source sink mode
        """
        if data_source:
            assert isinstance(data_source, DSource.MongoDataSource), \
                "data_source object must be instance of MongoDataSource"
            self.data_source = data_source

        if relation_sink:
            assert isinstance(relation_sink, DSink.ElasticDataSink), \
                "relation_sink object must be instance of ElasticDataSink"
            self.relation_sink = relation_sink
            self.model_class = self.relation_sink.model_identifier.model_class

        self.relation_annotator = pnt.Annotator()
        self.stemmer = PorterStemmer()
        self.workers = workers
        self.relation_queue = Manager().Queue(maxsize=10000)
        self.persist_attributes = ['relation_annotator', 'stemmer', 'model_class', 'relation_queue']

    def __getstate__(self):
        state = dict()
        for attr in self.persist_attributes:
            state[attr] = self.__dict__[attr]
        return state

    def __setstate__(self, d):
        self.__dict__.update(d)

    @staticmethod
    def __populate_arguments(semantic_element):
        """
        form an argument object from the srl semantic element
        :param semantic_element: SRL semantic element
        :return: RelationArgument instance
        """
        return RelationArgument(A0=semantic_element.get('A0'), A1=semantic_element.get('A1'),
                                A2=semantic_element.get('A2'), A3=semantic_element.get('A3'))

    @staticmethod
    def __populate_modifier(semantic_element):
        """
        form an argument modifier object from the srl semantic element
        :param semantic_element: SRL semantic element
        :return: RelationModifier instance
        """
        return RelationModifier(DIR=semantic_element.get('AM-DIR'), MNR=semantic_element.get('AM-MNR'),
                                LOC=semantic_element.get('AM-LOC'), TMP=semantic_element.get('AM-TMP'),
                                EXT=semantic_element.get('AM-EXT'), PNC=semantic_element.get('AM-PNC'),
                                CAU=semantic_element.get('AM-CAU'), NEG=semantic_element.get('AM-NEG'))

    def form_relations(self, text, block_id, payload, ff, persist=True):
        """
        form relation(s) on a given text
        :param text: text on which to get the relations on, text will be sentence
            tokenized and relations formed at sentence level
        :param block_id: unique identifier of the block
        :param persist: persist the relations extracted from the text in the sink,
            relation_sink needed to be specified
        :return: list of relations
        """
        text_sentences = pattern.tokenize(text)
        relations = []

        for sentence in text_sentences:
            # work with ascii string only
            sentence = "".join((c for c in sentence if 0 < ord(c) < 127))
            try:
                senna_annotation = self.relation_annotator.getAnnotations(sentence)
            except Exception as e:
                logger.error(e)
                continue

            chunk_parse, pos_tags, role_labeling, tokenized_sentence = \
                senna_annotation['chunk'], senna_annotation['pos'], senna_annotation['srl'], \
                senna_annotation['words']

            # nothing to do here, empty srl
            if not role_labeling:
                continue

            for semantic_element in role_labeling:
                arguments = RelationExtractor.__populate_arguments(semantic_element)
                modifiers = RelationExtractor.__populate_modifier(semantic_element)
                verb = semantic_element.get('V')

                # order of the arguments returned is important, A0 --> A1 --> A2 --> A3
                arguments = [v for v in vars(arguments).itervalues() if v]
                modifiers = [v for v in vars(modifiers).itervalues() if v]
                if not arguments:
                    continue

                argument_pairs = [e for e in ((ai, aj) for i, ai in enumerate(arguments)
                                              for j, aj in enumerate(arguments) if i < j)]
                verb = relation_util.normalize_relation(verb)

                for a0, a1 in argument_pairs:
                    en0 = relation_util.form_entity(tokenized_sentence, a0, chunk_parse, pos_tags)
                    en1 = relation_util.form_entity(tokenized_sentence, a1, chunk_parse, pos_tags)
                    if not en0 or not en1:
                        continue
                    relations.append(RelationTuple(left_entity=en0, right_entity=en1, relation=verb,
                                                   sentence=sentence, text=text, block_id=block_id,
                                                   payload=payload, ff=ff))
                    logger.info("generated a relation for ")
                    logger.info(block_id)

                for arg_modifier in modifiers:
                    mod_pos = sentence.find(arg_modifier)
                    linked_arg = min([(a, abs(mod_pos - sentence.find(a))) for a in arguments],
                                     key=lambda e: e[1])[0]
                    en0 = relation_util.form_entity(tokenized_sentence, linked_arg, chunk_parse, pos_tags)
                    en1 = relation_util.form_entity(tokenized_sentence, arg_modifier, chunk_parse, pos_tags)
                    if not en0 or not en1:
                        continue
                    relations.append(RelationTuple(left_entity=en0, right_entity=en1, relation=verb,
                                                   sentence=sentence, text=text, block_id=block_id,
                                                   payload=payload, ff=ff))
                    logger.info("generated a relation for ")
                    logger.info(block_id)

        return relations

    def form_relations_source(self, source_item):
        if not source_item:
            logger.error("got an empty source item")
            return

        item_entry = ""
        payload = ""
        ff = ""
        for f_name, f_value in source_item:
            if f_name == "payload":
                payload = f_value
            elif f_name == "ff":
                ff = f_value
            else:
                item_entry += f_value
        if item_entry == ' ':
            return

        try:
            block_id = str(uuid.uuid1())
            relations = self.form_relations(item_entry, block_id, payload, ff)
        except RuntimeError as e:
            logger.error("Error generating relations")
            logger.error(e)
            return

        for relation in relations:
            sink_relation = self.model_class()
            sink_relation.leftEntity = relation.left_entity
            sink_relation.rightEntity = relation.right_entity
            sink_relation.relation = relation.relation
            sink_relation.sentence = relation.sentence
            sink_relation.text = relation.text
            sink_relation.block_id = relation.block_id
            sink_relation.productName = relation.ff
            sink_relation.webLocation = relation.payload
            logger.info("generated a relation")
            logger.info(sink_relation)
            try:
                self.relation_queue.put(sink_relation, timeout=1)
            except Full as e:
                logger.error(e)

    def sink_relations(self):
        while not self.all_sinked:
            try:
                item = self.relation_queue.get_nowait()
                self.relation_sink.sink_item(item)
            except Empty as e:
                pass

    def form_relations_from_source(self):
        if not self.data_source or not self.relation_sink:
            raise RuntimeError("Data source and sink must be set")

        self.data_source.start()
        self.relation_sink.start()
        self.all_sinked = False

        pool = Pool(processes=self.workers)
        t1 = time.time()
        pool.imap(self.form_relations_source, self.data_source)

        sinker = Thread(target=self.sink_relations, name='Sink-Thread')
        sinker.start()

        pool.close()
        pool.join()
        self.all_sinked = True
        t2 = time.time()
        logger.info("process finished in :: %d seconds" % (t2 - t1))
pool = Pool(processes=40)
queue = Manager().Queue(maxsize=20000)
cnt = 0
for i in range(len(label_raw)):
    pool.apply_async(label_expand, args=(i, label_raw[i], queue, ))
pool.close()
pool.join()

# Drain the result queue; each worker put an (index, expanded_label) tuple
label_ex = []
while True:
    try:
        (i, lb) = queue.get_nowait()
        label_ex.append((i, lb))
    except Exception:  # queue is empty
        break
print('finish label expansion ', len(label_ex))

# Restore the original ordering, then keep only the labels
label_ex = sorted(label_ex, key=lambda x: x[0])
label = []
for (i, lb) in label_ex:
    label.append(lb)
'''test code'''
# plt.plot(input[7])
# plt.plot(label[7])
# plt.show()

if input_dim < output_dim:
    print('input_dim smaller than output_dim, quit task')
class Crawler(object):

    def __init__(self, cookie: str = None, headers: dict = None, max_num: int = 10000,
                 domain_regs: list = None, depth: int = 5):
        self.cookie = cookie
        self.headers = headers if headers else DEFAULT_HEADERS
        self.waiting_queue = Manager().Queue(maxsize=max_num * 2)
        self.current_queue = Manager().Queue(maxsize=max_num * 2)
        self.max_url_num = max_num
        self.crawled_urls = BloomFilter(element_num=max_num * 5, error_rate=0.01)
        self.url_dict = Manager().dict()
        self.domain_reg_list = domain_regs
        self.depth = depth
        self.current_depth = 0
        self.filter_exts = [
            'css', 'png', 'gif', 'jpg', 'jpeg', 'swf', 'tiff', 'pdf', 'ico', 'flv',
            'mp4', 'mp3', 'avi', 'mpg', 'gz', 'mpeg', 'iso', 'dat', 'mov', 'rar',
            'exe', 'zip', 'tar', 'bin', 'bz2', 'xsl', 'doc', 'docx', 'ppt', 'pptx',
            'xls', 'xlsx', 'csv', 'map', 'ttf', 'tif', 'woff', 'woff2', 'cab', 'apk',
            'bmp', 'svg', 'exif', 'xml', 'rss', 'webp', 'js'
        ]

    def run(self, urls):
        self.consist_headers()
        # By default only crawl URLs under the root domains of the seed URLs
        self.domain_reg_list = self.parse_domain(urls) if not self.domain_reg_list else self.domain_reg_list
        self._init_reg()
        for url in urls:
            self.call_crawl_handler(url)
        print('all task done')
        print(self.url_dict)

    def call_crawl_handler(self, url):
        if 'http' not in url:
            init_url = 'http://' + url
        else:
            init_url = url
        self.current_queue.put_nowait(init_url)
        # To avoid re-crawling the seed URL, add it to the Bloom filter right away
        init_url_without_protocol = url.split('//')[-1]
        self.crawled_urls.add(init_url_without_protocol)
        while self.current_depth < self.depth:
            if len(self.url_dict) >= self.max_url_num:
                print('reached the configured crawl limit, stopping the crawler')
                break
            print('now crawl depth is :{}'.format(self.current_depth))
            tmp_results = []
            # Crawl the current queue with a process pool
            pool = Pool(os.cpu_count() * 2)
            while not self.current_queue.empty():
                print('urls still waiting in the current queue: {}'.format(self.current_queue.qsize()))
                url = self.current_queue.get_nowait()
                if not url.endswith('js'):
                    result = pool.apply_async(func=self.crawl_handler, args=(url,))
                    tmp_results.append(result)
                    # self.crawl_handler(url)
            pool.close()
            pool.join()
            tmp_reqs = []
            for result in tmp_results:
                for r in result.get():
                    tmp_reqs.append(r)
            self._handle_url(tmp_reqs)
            self.current_queue = self.waiting_queue
            self.waiting_queue = Manager().Queue(maxsize=self.max_url_num * 2)
            self.current_depth += 1
            print('depth:{} crawled done'.format(self.current_depth))

    def consist_headers(self):
        if self.cookie:
            self.headers['Cookie'] = self.cookie

    @staticmethod
    def parse_domain(domain_list):
        """
        Reduce the input URLs or hostnames to bare domains, used later for
        same-domain checks and similar operations.
        :param domain_list:
        :return:
        """
        def _split_url_protocol_and_path(domain):
            # strip the protocol
            if '://' in domain:
                domain = domain.split('://')[1]
            # strip the path
            if '.com.cn' in domain:
                return domain.split('.com.cn')[0] + '.com.cn'
            if '.com' in domain:
                return domain.split('.com')[0] + '.com'
            if '.io' in domain:
                return domain.split('.io')[0] + '.io'
            # for ip:port style URLs, cutting at the first / is enough
            return domain.split('/')[0]

        return [_split_url_protocol_and_path(domain) for domain in domain_list]

    @staticmethod
    def _parse_post_data(post_data) -> str:
        """
        Flatten the body of a dynamic request into a single string.
        :param post_data:
        :return:
        """
        if not post_data:
            return ''
        if not isinstance(post_data, dict):
            if '=' in post_data:
                param_dict = {}
                if '&' in post_data:
                    params_couples = post_data.split('&')
                    for param in params_couples:
                        if '=' not in param:
                            continue
                        k, v = param.split('=')
                        param_dict[k] = v
                else:
                    k, v = post_data.split('=')
                    param_dict[k] = v
                post_data = param_dict
            else:
                post_data = json.loads(post_data)
        post_data_list = [k for k, _ in post_data.items()]
        post_data_list.sort()
        return ''.join([param + '&' for param in post_data_list])[:-1]

    @staticmethod
    def parse_static_url(url):
        """
        Repack a parsed static URL into a dict:
        {
            'url': 'xxxxxx',
            'originUrl': 'xxxxxx/a=aa',
            'method': 'GET',
            'queryString': 'a=aa'
        }
        :param url:
        :return:
        """
        try:
            req = dict()
            req['method'] = 'GET'
            req['originUrl'] = url
            if '?' not in url:
                req['url'] = url
                return req
            url_consist = url.split('?')
            req['url'] = url_consist[0]
            params = url_consist[1]
            if '&' not in params:
                params_consist = params.split('=')
                req['queryString'] = params_consist[0] if params_consist[0] else ''
                return req
            multi_params = params.split('&')
            params_list = list(map(lambda y: y.split('=')[0],
                                   filter(lambda x: '=' in x, multi_params)))
            # sort the parameter names alphabetically
            params_list.sort()
            req['queryString'] = ''.join([key + '=&' for key in params_list])[:-2]
            return req
        except Exception:
            msg = traceback.format_exc()
            print(msg)
            return None

    def _init_reg(self):
        """
        Build a regex from the parsed domains, used for same-domain checks.
        :return:
        """
        domain_reg = ['^']
        domain_reg.extend(['(http|https):\/\/' + domain.replace('.', '\.') + '.*|'
                           for domain in self.domain_reg_list])
        # domain_reg.extend(map(lambda x: '(http|https):\/\/' + x.replace('.', '\.') + '.*|', self.domain_reg_list))
        tmp_domain_reg = ''.join(domain_reg)
        self.domain_reg = tmp_domain_reg[:-1] + '$'

    def filter_ext(self, url):
        """
        Filter out URLs with special suffixes, such as static resources.
        Return True if the URL's extension is on the exclusion list.
        :param url:
        :return:
        """
        try:
            f = url.split('/')[-1].strip()
            if '.' in f:
                ext = f.split('.')[-1].strip().lower()
                if ext and ext in self.filter_exts:
                    return True
                else:
                    return False
            return False
        except Exception:
            msg = traceback.format_exc()
            print(msg)
            return False

    def filter_url_by_domain(self, url):
        """
        Check whether the URL satisfies the crawl conditions: return True if it
        matches the allowed domains and is not in the excluded URL set,
        otherwise return False.
        :param url:
        :return:
        """
        # check the domain
        if not re.match(self.domain_reg, url, flags=0):
            return False
        # TODO: complete this part later
        # if len(self.exclude_urls) == 0:
        #     return True
        # check exclude_urls
        # if re.match(self.exclude_urls_reg_str, url, flags=0):
        #     return False
        return True

    def static_crawler(self, page, results, url) -> List["ElementHandle"]:
        """
        Parse the static URLs on a page; currently covers the href and src
        attributes of <a> tags.
        """
        links = page.query_selector_all("//a")
        tmp_link = []
        for link in links:
            href = link.get_property("href").json_value()
            src = link.get_property("src").json_value()
            if not href or href == url:
                continue
            if not self.filter_ext(url=href) and self.filter_url_by_domain(url=href):
                req = self.parse_static_url(href)
                if req:
                    print('href:{}'.format(req))
                    results.append(req)
            if not src or src == url:
                continue
            if not self.filter_ext(url=src) and self.filter_url_by_domain(url=src):
                req = self.parse_static_url(src)
                if req:
                    print('src:{}'.format(req))
                    results.append(req)
            # some <a> tags use javascript: pseudo-links to trigger js actions
            if 'javascript' in href or 'javascript' in src:
                tmp_link.append(link)
        return tmp_link

    def _check_crawled_url(self, url) -> bool:
        """
        Check whether the URL has already been crawled; return True if it has not.
        :param url:
        :return:
        """
        if url in self.crawled_urls:
            return False
        return True

    def _check_url_is_exist_by_md5(self, url_dict):
        """
        Use an MD5 digest to check whether the URL is a duplicate.
        :param url_dict:
        :return:
        """
        try:
            exist_md5 = list(url_dict.keys())[0]
            if exist_md5 in self.crawled_urls:
                return False
            return True
        except Exception:
            msg = traceback.format_exc()
            print(msg)
            return True

    @staticmethod
    def calculate_md5(url_har):
        """
        Compute an MD5 digest for deduplication.
        :param url_har:
        :return:
        """
        url = url_har['url']
        # some POST requests append a timestamp to defend against replay
        tmp_list = url.split('//')[-1].split('?')
        url_without_protocol = tmp_list[0] if len(tmp_list) > 1 else url.split('//')[-1]
        method = url_har['method']
        query_string = ''
        post_data = ''
        if 'queryString' in url_har:
            query_string = url_har['queryString']
        if 'postData' in url_har:
            post_data = url_har['postData']
        tmp_str = url_without_protocol + '&' + method + '&' + query_string + post_data
        return hashlib.md5(tmp_str.encode('utf-8')).hexdigest()

    def _handle_url(self, req_list):
        """
        Post-process the crawled URLs: decide whether each one should be
        filtered out or has already been crawled.
        :param req_list:
        :return:
        """
        if not req_list:
            return
        insert_req_list = list()
        for req in req_list:
            url = req['originUrl']
            if url.endswith('/'):
                url = url[:-1]
            url_without_protocol = url.split('//')[-1]
            '''
            After parsing, each entry contains: url, queryString (if present) and method.
            The URL has to be checked for:
            1. whether it is already in the final URL set
            2. whether it has already been crawled
            3. whether its extension is on the filter list (checked first; if so, skip it)
            '''
            md5 = self.calculate_md5(req)
            tmp_dict = {
                md5: req
            }
            if self._check_url_is_exist_by_md5(tmp_dict):
                if len(self.url_dict.keys()) < self.max_url_num:
                    self.url_dict[md5] = req
                    # TODO: a custom taskId could be filled in here later
                    insert_req_list.append({'taskId': 'test12', 'urlDict': json.dumps(req)})
            # if the URL has not been crawled yet, put it on the queue for the next round
            if self._check_crawled_url(url_without_protocol) and not self.waiting_queue.full():
                self.waiting_queue.put_nowait(req['originUrl'])
                self.crawled_urls.add(url_without_protocol)

    def crawl_handler(self, url) -> list:
        result = []

        def intercept(route: Route, request: Request):
            # Intercept client-side navigation by rewriting the response status to 204.
            # TODO: refine the hook logic for client-side redirects later.
            if request.is_navigation_request() and request.frame.parent_frame:
                request.response().status = 204
                route.continue_()
                return
            # try to intercept server-side redirects
            if request.redirected_to:
                if request.post_data_json:
                    request.response().status = 200
                    self.waiting_queue.put_nowait(request.redirected_to.url)
                else:
                    ...
                route.continue_()
                return
            resource_type = request.resource_type
            # filter dynamic requests
            if resource_type in ['image', 'media', 'eventsource', 'websocket']:
                route.abort()
            else:
                url_origin = request.url
                if not url_origin:
                    route.continue_()
                    return
                if not self.filter_ext(url=url_origin) and self.filter_url_by_domain(url=url_origin):
                    headers = request.headers
                    method = request.method
                    post_data_json: dict = request.post_data_json
                    http_har = dict()
                    if method == 'POST' or method == 'PUT':
                        post_data_origin = post_data_json
                        post_data_handled = self._parse_post_data(post_data_origin)
                        content_type = headers['content-type'] if 'content-type' in headers else ''
                        http_har['originPostData'] = post_data_origin
                        http_har['postData'] = post_data_handled
                        http_har['contentType'] = content_type
                        http_har['url'] = url_origin
                        http_har['originUrl'] = url_origin
                        http_har['method'] = method
                    if method == 'GET':
                        http_har = self.parse_static_url(url_origin)
                    result.append(http_har)
                route.continue_()

        with sync_playwright() as p:
            browser = p.webkit.launch(headless=True, chromium_sandbox=True, )
            page = browser.new_page()
            page.set_default_navigation_timeout(30000)
            page.set_extra_http_headers(self.headers)
            page.route('**/*', intercept)
            page.goto(url)
            page.wait_for_load_state(state='networkidle', timeout=30000)
            tmp_links = self.static_crawler(page, result, url)
            page.evaluate(FORM_FILL_UPLOAD_JS)
            for link in tmp_links:
                link.click()
            page.close()
            browser.close()
        return result
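# Hypothetical invocation of the Crawler above; the seed URL, depth and max_num
# are illustrative values, not taken from the original project.
if __name__ == '__main__':
    crawler = Crawler(depth=2, max_num=100)
    crawler.run(['http://example.com'])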
    except IndexError as e:
        pass
    else:
        response = resp.group(1)
        if resp:
            ret = urllib.parse.unquote(response)
            return_rule = get_return_rule(json.loads(ret)['response'])
            result_queue.put_nowait(return_rule)
        else:
            print('no response')


if __name__ == '__main__':
    pool = Pool(4)
    qid_queue = Manager().Queue()
    # result_queue = Manager().Queue()
    pool.apply_async(find_log, args=('query', 0, qid_queue,), callback=call_back)
    print('one worker went to look for qids')
    while isinstance(qid_queue.get(), int):
        pool.apply_async(find_log, args=('responseServer', qid_queue.get_nowait(), qid_queue,),
                         callback=call_back)
    pool.close()
    pool.join()
    index = 1
    while qid_queue.qsize() > 0:
        if not isinstance(qid_queue.get_nowait(), int) and index < 30:
            csv_file = open('travco_return_rule.csv', 'a', encoding='utf8')
            writer = csv.writer(csv_file)
            writer.writerow([qid_queue.get()])
            csv_file.close()
            index += 1
args = parser.parse_args()

# init pool and queue
q_get = Manager().Queue()
q_put = Manager().Queue()
pool = Pool(processes=args.process)

# put pic path into get queue: q_get
pics = os.listdir(args.folder)
for pic in pics:
    path = os.path.join(args.folder, pic)
    q_get.put(path)

# use multi process to get hash result and put the result into q_put
n = q_get.qsize()
while True:
    try:
        image_path = q_get.get_nowait()
        i = q_get.qsize()
    except Exception:  # q_get is empty
        break
    else:
        pool.apply_async(get_hash_queue, (i, n, image_path, q_put, ))
pool.close()
pool.join()
sys.stdout.write('\n')

# get hash bucket
image_bucket, hash_bucket0, hash_bucket1, hash_bucket2, hash_bucket3 = get_hash_bucket(q_put)

# image deduplicate
image_deduplication(image_bucket, hash_bucket0, hash_bucket1, hash_bucket2, hash_bucket3)
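# A minimal sketch of the get_hash_queue worker referenced above. The real code
# appears to compute a perceptual image hash that get_hash_bucket later splits into
# four buckets; a plain file digest stands in for it here, so treat this only as an
# illustration of the queue protocol and progress output, not the original hashing.
import hashlib

def get_hash_queue(i, n, image_path, q_put):
    with open(image_path, 'rb') as f:
        digest = hashlib.md5(f.read()).hexdigest()      # placeholder hash
    q_put.put((image_path, digest))                     # result later consumed by get_hash_bucket
    sys.stdout.write('\rprocessed %d/%d' % (n - i, n))  # i is the remaining queue size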