def __init__(self, model=None, tokenizer=None, embedding_file=None,
             num_workers=None):
    """
    Args:
        model: path to saved model file
        tokenizer: option string to select tokenizer class
        embedding_file: if provided, will expand dictionary to use all
            available pretrained vectors in this file.
        num_workers: number of CPU processes to use to preprocess batches.
    """
    logger.info('Initializing model...')
    self.model = DocReader.load(model or DEFAULTS['model'])

    if embedding_file:
        logger.info('Expanding dictionary...')
        words = utils.index_embedding_words(embedding_file)
        added = self.model.expand_dictionary(words)
        self.model.load_embeddings(added, embedding_file)

    logger.info('Initializing tokenizer...')
    annotators = tokenizers.get_annotators_for_model(self.model)
    if not tokenizer:
        tokenizer_class = DEFAULTS['tokenizer']
    else:
        tokenizer_class = tokenizers.get_class(tokenizer)

    if num_workers is None or num_workers > 0:
        self.workers = ProcessPool(
            num_workers,
            initializer=init,
            initargs=(tokenizer_class, annotators),
        )
    else:
        self.workers = None
        self.tokenizer = tokenizer_class(annotators=annotators)

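# The constructor above passes an `init` callable as the pool initializer so
# that each worker process builds its own tokenizer once instead of pickling
# one per task. A minimal sketch of what such an initializer could look like;
# the global name PROCESS_TOK is an assumption, not taken from the original.
PROCESS_TOK = None

def init(tokenizer_class, annotators):
    """Runs once inside every worker process when the pool starts."""
    global PROCESS_TOK
    PROCESS_TOK = tokenizer_class(annotators=annotators)

def tokenize(text):
    """Worker-side task function that reuses the per-process tokenizer."""
    return PROCESS_TOK.tokenize(text)
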
def calculateSystemRanks_multiprocessing(systemList, systemAddress,
                                         relevanceJudgementAddress,
                                         rankMetric):
    workers = ProcessPool(processes=20)
    system_metric_value = {}  # key is the system_name
    with tqdm(total=len(systemList)) as pbar:
        partial_calculateSystemRanks = partial(
            calculateSystemRanks,
            systemAddress=systemAddress,
            relevanceJudgementAddress=relevanceJudgementAddress,
            rankMetric=rankMetric)
        for system_info in tqdm(
                workers.imap_unordered(partial_calculateSystemRanks,
                                       systemList)):
            system_name = system_info[0]
            system_metric_val = system_info[1]
            system_metric_value[system_name] = system_metric_val
            pbar.update()
    workers.close()
    workers.join()

    # Collect the per-system values in a deterministic (sorted) order.
    system_metric_value_list = []
    for system_name in sorted(system_metric_value):
        system_metric_value_list.append(system_metric_value[system_name])

    return (system_metric_value, system_metric_value_list)

def __init__(self, PoolType):  # PoolType is a string, either 'Process' or 'Thread'
    name = 'MyProblem'  # problem name (can be set freely)
    M = 1  # number of objectives
    maxormins = [1]  # per-objective flags: 1 minimizes the objective, -1 maximizes it
    Dim = 4  # number of decision variables
    varTypes = [1, 1, 0, 0]  # variable types: 0 means continuous, 1 means discrete
    lb = [1, 1, 0, 0]  # lower bounds of the decision variables
    ub = [10000, 2000, 1, 1]  # upper bounds of the decision variables
    lbin = [0] * Dim  # lower-boundary inclusion flags (0 excludes the bound, 1 includes it)
    ubin = [1] * Dim  # upper-boundary inclusion flags (0 excludes the bound, 1 includes it)
    # Call the parent constructor to finish instantiation
    ea.Problem.__init__(self, name, M, maxormins, Dim, varTypes, lb, ub,
                        lbin, ubin)
    # Data used when evaluating the objective function
    X_train, X_test, Y_train, Y_test = get_data(hour_num=0,
                                                transform='sin+cos',
                                                drop_time=True,
                                                scale=True)
    # Replace zeros with a small constant; np.where returns a new array,
    # so the result must be assigned back
    X_train = np.where(X_train == 0, 0.001, X_train)
    Y_train = np.where(Y_train == 0, 0.001, Y_train)
    self.data = X_train  # training features (scaled)
    self.dataTarget = Y_train
    # Choose between multithreading and multiprocessing
    self.PoolType = PoolType
    if self.PoolType == 'Thread':
        self.pool = ThreadPool(2)  # pool size
    elif self.PoolType == 'Process':
        num_cores = int(mp.cpu_count())  # number of CPU cores
        self.pool = ProcessPool(num_cores)  # pool size

def calculate_matches(all_docs: Dict[Text, Tuple[Text, Text]],
                      closest_docs: List[Tuple[List[Text], np.ndarray]],
                      answers: List[List[Text]],
                      worker_num: int):
    global dpr_all_documents
    dpr_all_documents = all_docs

    tok_opts = {}
    tokenizer = SimpleTokenizer(**tok_opts)

    processes = ProcessPool(processes=worker_num)
    get_score_partial = partial(check_answer, tokenizer=tokenizer)

    closest_ids = [doc[0] for doc in closest_docs]
    answers_and_retrieved_docs = zip(answers, closest_ids)
    scores = processes.map(get_score_partial, answers_and_retrieved_docs)

    n_docs = len(closest_docs[0][0])
    top_k_hits = [0] * n_docs
    for question_hits in scores:
        best_hit = next((i for i, x in enumerate(question_hits) if x), None)
        if best_hit is not None:
            top_k_hits[best_hit:] = [v + 1 for v in top_k_hits[best_hit:]]

    return QAMatchStats(top_k_hits, scores)

def store_contents(data_path, save_path, preprocess, num_workers=None):
    """Preprocess and store a corpus of documents in sqlite.

    Args:
        data_path: Root path to directory (or directory of directories) of
            files containing json encoded documents (must have `id` and
            `text` fields).
        save_path: Path to output sqlite db.
        preprocess: Path to file defining a custom `preprocess` function.
            Takes in and outputs a structured doc.
        num_workers: Number of parallel processes to use when reading docs.
    """
    if os.path.isfile(save_path):
        raise RuntimeError('%s already exists! Not overwriting.' % save_path)

    logger.info('Reading into database...')
    conn = sqlite3.connect(save_path)
    c = conn.cursor()
    c.execute("CREATE TABLE documents (id PRIMARY KEY, title, text);")

    workers = ProcessPool(num_workers, initializer=init, initargs=(preprocess,))
    files = [f for f in iter_files(data_path)]
    count = 0
    with tqdm(total=len(files)) as pbar:
        for pairs in tqdm(workers.imap_unordered(get_contents, files)):
            count += len(pairs)
            c.executemany("INSERT INTO documents VALUES (?,?,?)", pairs)
            pbar.update()
    logger.info('Read %d docs.' % count)
    logger.info('Committing...')
    conn.commit()
    conn.close()

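# A sketch of how store_contents() above could be driven from the command
# line; the argument names and parser wiring here are only an assumption,
# not part of the original module.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('data_path', help='directory of JSON document files')
    parser.add_argument('save_path', help='output sqlite db path')
    parser.add_argument('--preprocess', default=None,
                        help='file defining a custom preprocess() function')
    parser.add_argument('--num-workers', type=int, default=None)
    args = parser.parse_args()

    store_contents(args.data_path, args.save_path, args.preprocess,
                   args.num_workers)
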
def ping_scan(network):
    # Scan the network with a pool of worker processes
    pool = ProcessPool(processes=150)
    net = ipaddress.ip_network(network)
    result_obj_dict = {}
    for ip in net:
        # Keep the AsyncResult object returned for each host
        result_obj = pool.apply_async(scapy_ping_one, args=(str(ip), ))
        result_obj_dict[str(ip)] = result_obj
    pool.close()
    pool.join()

    active_ip = []
    for ip, obj in result_obj_dict.items():
        # obj.get() yields (ip, status); status 1 means the host replied
        if obj.get()[1] == 1:
            active_ip.append(ip)
    return active_ip

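# ping_scan() assumes a helper that sends one ICMP echo request and reports
# whether the host answered. A possible sketch using scapy; the (ip, 0/1)
# return shape is inferred from the obj.get()[1] check above, not taken from
# the original code.
from scapy.all import IP, ICMP, sr1

def scapy_ping_one(ip, timeout=2):
    reply = sr1(IP(dst=ip) / ICMP(), timeout=timeout, verbose=0)
    return (ip, 1) if reply is not None else (ip, 0)
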
def batchFeedDownload(self, feedPack: list, procs: int) -> list:
    """
    Downloads a collection of feeds in parallel processes
    :param feedPack: Feed data
    :param procs: Number of parallel processes to get data over different feeds
    """
    logger.info('Download started')

    if procs == 1:
        downloadStartTime = datetime.now()
        feedData: list = []
        # Iterate over feed links and download feeds
        for link in feedPack:
            feedData.append(self.getFeed(link))
        downloadEndTime = datetime.now()
        downloadTime = downloadEndTime - downloadStartTime
        return feedData

    elif procs > 1:
        # Log download start time
        downloadStartTime = datetime.now()
        feedData: list = []
        # Define process pool and start downloading feeds from feedPack in parallel processes
        pool = ProcessPool(procs)
        # Download feeds in a number of separate processes
        feedData = pool.map(self.getFeed, feedPack)
        # Log download end time
        downloadEndTime = datetime.now()
        downloadTime = downloadEndTime - downloadStartTime
        # Calculate total feeds size
        totalFeedsSize: int = 0
        for dictItem in feedData:
            totalFeedsSize += dictItem['feedSize']
        # Log results
        logger.info(
            'Successfully downloaded {0} feeds of {1} Kbytes in {2} seconds {3} msec'
            .format(
                len(feedPack),
                round(totalFeedsSize, 1),
                downloadTime.seconds,
                downloadTime.microseconds,
            ),
        )
        pool.close()
        pool.join()
        return feedData

def _crawl_user(user_id):
    """
    Fetch a user's boards
    """
    if not user_id:
        return
    user_url = BASE_URL + "/{}".format(user_id)
    limit = 5
    try:
        # get first board data
        r = request.get(user_url).json()
    except requests.ConnectionError:
        request.headers.update({"User-Agent": choice(user_agent_list)})
        r = request.get(user_url).json()
    except Exception as e:
        printcolor("Crawl first page error, user_id: {}".format(user_id), "yellow")
        logging.error(e, exc_info=True)
    else:
        if "user" in r:
            user_data = r["user"]
        else:
            printcolor(r.get("msg"))
            return
        board_number = int(user_data['board_count'])
        retry = 2 * board_number / limit
        board_ids = user_data['boards']
        printcolor(
            "Current user <{}> boards number is {}, first boards number is {}".
            format(user_id, board_number, len(board_ids)), 'red')
        if len(board_ids) < board_number:
            last_board = user_data['boards'][-1]['board_id']
            while 1 <= retry:
                # get ajax pin data
                user_next_url = BASE_URL + \
                    "/{}?jhhft3as&max={}&limit={}&wfl=1".format(
                        user_id, last_board, limit)
                try:
                    user_next_data = request.get(user_next_url).json()["user"]
                except Exception as e:
                    logging.error(e, exc_info=True)
                    continue
                else:
                    board_ids += user_next_data["boards"]
                    printcolor(
                        "ajax load user with board_id {}, get boards number is {}, merged"
                        .format(last_board, len(user_next_data["boards"])), "blue")
                    if len(user_next_data["boards"]) == 0:
                        break
                    last_board = user_next_data["boards"][-1]["board_id"]
                    retry -= 1
                # Throttle the request rate
                sleep(SLEEP_TIME)
        board_ids = map(str, [board['board_id'] for board in board_ids])
        pool = ProcessPool()  # create a process pool
        # board_ids: the items to process; _crawl_board: the function applied to each item
        pool.map(_crawl_board, board_ids)
        pool.close()  # close the pool so it accepts no new tasks
        pool.join()   # the parent process blocks until the children exit
        printcolor("Current user {}, download over".format(user_id), "green")

def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        # doc_ids = doc_db.get_doc_ids()
        doc_ids = []
        res = es.search(index="htts", doc_type="htts",
                        body={"size": 500, "query": {"match_all": {}}},
                        scroll='10m')
        scroll = res['_scroll_id']
        # logger.info(scroll)
        # for doc in res['hits']['hits']:
        #     # print("%s" % (doc['_source']['documentId']))
        #     doc_ids.append(doc['_source']['documentId'])
        # res2 = es.scroll(scroll_id=scroll, scroll='1m')
        # for doc in res2['hits']['hits']:
        #     # print("%s" % (doc['_source']['documentId']))
        #     doc_ids.append(doc['_source']['documentId'])
        scroll_id = res['_scroll_id']
        for ref in scrollr(es, scroll_id, extract_references):
            print(ref)
            doc_ids.append(ref)
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(
        args.num_workers,
        initializer=init,
        initargs=(tok_class, db_class, db_opts)
    )

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    count_matrix = sp.csr_matrix(
        (data, (row, col)), shape=(args.hash_size, len(doc_ids))
    )
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)

def _open(self) -> None:
    """Open pools"""
    max_processes = cpu_count(logical=False)
    # Reserve one CPU for I/O bound tasks
    if max_processes > 2:
        max_processes = max_processes - 1
    self.__process_pool = ProcessPool(max_processes)
    self.__thread_pool = ThreadPool(max_processes)

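# Assumed imports for _open() above, inferred from the call signatures: the
# logical=False keyword suggests psutil's cpu_count rather than the standard
# library's, and the two pool classes come from multiprocessing.
from multiprocessing import Pool as ProcessPool
from multiprocessing.pool import ThreadPool
from psutil import cpu_count
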
def process_pool(num):
    p = ProcessPool(num)
    start_time = time.time()
    ret = p.map(run, range(max_range))
    p.close()
    p.join()
    print("process_pool %d, costTime: %fs ret.size: %d"
          % (num, (time.time() - start_time), len(ret)))

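# process_pool() above relies on a module-level `run` function and a
# `max_range` constant that are not shown. A minimal self-contained harness
# under those assumptions; the workload in run() is only a placeholder.
import time
from multiprocessing import Pool as ProcessPool

max_range = 1000

def run(n):
    # toy CPU-bound task
    return sum(i * i for i in range(n))

if __name__ == '__main__':
    process_pool(4)
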
def get_comic_index():
    urlpool = [url + str(i) + '/' for url in URLs for i in range(1, 470)]
    print("Crawl index: ", ctime())
    pool = ProcessPool(PROCESSNUMS)
    pool.map(_get_comic_index, urlpool)
    pool.close()
    pool.join()
    print('Finished:', ctime())

def run_processpool(some_function, list_of_args, threads):
    pool = ProcessPool(threads)
    # pass the function that we will be multiprocessing along with args per process
    results = pool.map(some_function, list_of_args)
    pool.close()
    # complete the process with join
    pool.join()
    return results

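# A small usage sketch for run_processpool(); despite the parameter name,
# `threads` is the number of worker processes in the pool. The square()
# worker is just an illustration, not part of the original code.
from multiprocessing import Pool as ProcessPool

def square(x):
    return x * x

if __name__ == '__main__':
    print(run_processpool(square, [1, 2, 3, 4], threads=2))  # [1, 4, 9, 16]
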
def iterate_list_with_parallel(f, data_length, thread):
    p = ProcessPool(thread)  # a pool of `thread` worker processes
    # Split [0, data_length) into `thread` contiguous (start, end) index ranges
    index_list = [(int(idx * data_length / thread),
                   int((idx + 1) * data_length / thread))
                  for idx in range(thread)]
    p.map(f, index_list)
    p.close()
    p.join()

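# iterate_list_with_parallel() hands each worker one (start, end) index tuple.
# A sketch of a compatible worker; `items` is a hypothetical module-level list
# that each worker process would need to see (e.g. defined at import time).
items = list(range(100))

def process_slice(bounds):
    start, end = bounds
    for i in range(start, end):
        # do something with items[i]
        print(items[i])

# iterate_list_with_parallel(process_slice, len(items), thread=4)
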
def __init__(self, M, file_dir_sce, file_dir_data, file_dir_eval, configure):
    name = 'AdaptObj'  # problem name (can be set freely)
    maxormins = [1] * M  # per-objective flags: 1 minimizes the objective, -1 maximizes it
    # Dim = 19  # number of decision variables (old fixed configuration)
    # varTypes = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]  # 0: real; 1: integer
    # lb = [150, 8, 20, 212, 4, 0, 20, 205, 4, 0, 1, 150, 4, 0, 0, 50, 10, 1, 4]
    # ub = [180, 16, 100, 217, 16, 3, 120, 210, 16, 3, 6, 250, 16, 3, 240, 250, 20, 3, 6]
    # lbin = [1] * Dim
    # ubin = [1] * Dim
    self.config = configure
    Dim = 16  # number of decision variables
    # varTypes = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]  # 0: real; 1: integer
    lb = [
        self.config.ego_s0[0], self.config.ego_v0[0], self.config.start_s[0],
        self.config.end_s[0], self.config.green_time[0],
        self.config.yellow_time[0], self.config.red_time[0],
        self.config.pos_s[0], self.config.pos_s[0], self.config.pos_y_1[0],
        self.config.velo_1[0], self.config.acc_1[0], self.config.pos_y_1[0],
        self.config.velo_1[0], self.config.acc_1[0],
        self.config.start_time_2[0]
    ]  # lower bounds of the decision variables
    ub = [
        self.config.ego_s0[1], self.config.ego_v0[1], self.config.start_s[1],
        self.config.end_s[1], self.config.green_time[1],
        self.config.yellow_time[1], self.config.red_time[1],
        self.config.pos_s[1], self.config.pos_s[1], self.config.pos_y_1[1],
        self.config.velo_1[1], self.config.acc_1[1], self.config.pos_y_1[1],
        self.config.velo_1[1], self.config.acc_1[1],
        self.config.start_time_2[1]
    ]  # upper bounds of the decision variables
    Dim = len(lb)
    # print(Dim)
    varTypes = [0] * Dim  # variable types: 0 means real-valued, 1 means integer
    lbin = [1] * Dim  # lower-boundary inclusion flags (0 excludes the bound, 1 includes it)
    ubin = [1] * Dim  # upper-boundary inclusion flags (0 excludes the bound, 1 includes it)
    self.file_dir_sce = file_dir_sce
    self.file_dir_data = file_dir_data
    self.file_dir_eval = file_dir_eval
    # Call the parent constructor to finish instantiation
    ea.Problem.__init__(self, name, M, maxormins, Dim, varTypes, lb, ub,
                        lbin, ubin)
    # Choose between multithreading and multiprocessing
    self.PoolType = configure.PoolType
    if self.PoolType == 'Thread':
        self.pool = ThreadPool(4)  # pool size
    elif self.PoolType == 'Process':
        num_cores = int(mp.cpu_count())  # number of CPU cores
        self.pool = ProcessPool(num_cores)  # pool size

def process_pool(num):
    p = ProcessPool(num)
    start_time = time.time()
    data = zip(['a', 'b', 'c'], [1, 3, 6])
    ret = p.map(run, data)
    p.close()
    p.join()
    print("process_pool %d, costTime: %fs ret.size: %d"
          % (num, (time.time() - start_time), len(ret)))

def getDataAsync3(baseurl: str, process_num=os.cpu_count()) -> List[List]:
    """Crawl data asynchronously with a pool of worker processes"""
    datalist = []
    urls = [baseurl + str(i * 25) for i in range(10)]
    with ProcessPool(process_num) as pool:
        htmls = pool.map(askUrl, urls)
        for html in htmls:
            datalist.extend(parseData(html))
    return datalist

def main():
    data_files = find_paths(args.input_dir)
    logger.info(f'{len(data_files)} data files found.')
    if len(data_files) == 0:
        return
    workers = ProcessPool(args.num_workers)
    with tqdm(total=len(data_files)) as pbar:
        for _ in tqdm(workers.imap_unordered(process_tweets_file, data_files)):
            pbar.update()

def sPool(callback, tnum=20, cnum='', arg=[]):
    from multiprocessing import Pool as ProcessPool  # process pool
    from multiprocessing import cpu_count  # number of CPUs

    if cnum == '':
        cnum = cpu_count()  # default to one worker per CPU core
    spool = ProcessPool(cnum)
    arr = list(
        map(
            lambda i: {
                'cnum': i,
                'tnum': tnum,
                'callback': callback,
                'arg': arg
            }, range(cnum)))
    spool.map(fleader.bPool, arr)
    spool.close()
    spool.join()

def stream_corpus(data_path, dictionary, files, num_workers=None):
    workers = ProcessPool(num_workers)
    # files = [f for f in iter_files(data_path)]
    with tqdm(total=len(files)) as pbar:
        for pairs in tqdm(workers.imap_unordered(get_contents, files)):
            # pairs[0][0] --> docId, pairs[0][1] --> documentContent
            yield dictionary.doc2bow(pairs[0][1].split())
            pbar.update()

def _start_process(self, more_sample_content):
    cpu = os.cpu_count()
    pool = ProcessPool(cpu)
    index_batch, sample_batch = corpus_cut(more_sample_content, cpu)
    result = [pool.apply_async(self._process_task,
                               args=(index_batch[i], sample_batch[i]))
              for i in range(cpu)]
    pool.close()
    pool.join()
    # Flatten the per-batch results, restore the original order by index,
    # and keep only the processed samples.
    result = map(lambda item: item[1],
                 sorted([item for res in result for item in res.get()],
                        key=lambda item: item[0]))
    return list(result)

def active_learning(topic_list, al_protocol, al_classifier, document_collection,
                    topic_seed_info, topic_complete_qrels_address,
                    train_per_centage, data_path, file_name, use_pooled_budget,
                    per_topic_budget_from_trec_qrels):
    workers = ProcessPool(processes=1)
    with tqdm(total=len(topic_list)) as pbar:
        partial_active_learning_multi_processing = partial(
            active_learning_multi_processing,
            al_protocol=al_protocol,
            al_classifier=al_classifier,
            document_collection=document_collection,
            topic_seed_info=topic_seed_info,
            topic_complete_qrels_address=topic_complete_qrels_address,
            train_per_centage=train_per_centage,
            use_pooled_budget=use_pooled_budget,
            per_topic_budget_from_trec_qrels=per_topic_budget_from_trec_qrels)
        for topic_all_info in tqdm(
                workers.imap_unordered(partial_active_learning_multi_processing,
                                       topic_list)):
            topicId = topic_all_info[0][0]  # 0 is the loopCounter index and 0 is the first tuple
            file_complete_path = data_path + file_name + str(topicId) + ".pickle"
            pickle.dump(topic_all_info, open(file_complete_path, 'wb'))
            pbar.update()

def get_map_fn(num_procs, use_threads=False):
    if num_procs == 1:
        return map
    if use_threads:
        if num_procs not in MT_POOLS:
            MT_POOLS[num_procs] = ThreadPool(num_procs)
        return MT_POOLS[num_procs].map
    if num_procs not in MP_POOLS:
        MP_POOLS[num_procs] = ProcessPool(num_procs)
    return MP_POOLS[num_procs].map

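# get_map_fn() caches pools in the module-level MT_POOLS / MP_POOLS dicts so
# repeated calls with the same worker count reuse the same pool. A minimal
# usage sketch under that assumption; the dicts and the double() worker are
# shown here only for illustration.
from multiprocessing import Pool as ProcessPool
from multiprocessing.pool import ThreadPool

MP_POOLS = {}
MT_POOLS = {}

def double(x):
    return 2 * x

if __name__ == '__main__':
    map_fn = get_map_fn(4)   # process pool with 4 workers, created once
    print(list(map_fn(double, range(8))))
    map_fn = get_map_fn(1)   # falls back to the built-in map
    print(list(map_fn(double, range(8))))
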
def calc_factorials(max_int=100, pool_size=8, threads=True, chunk_size=10):
    if threads:
        pool = ThreadPool(pool_size)
    else:
        pool = ProcessPool(pool_size)
    results = pool.imap_unordered(factorial_calc, range(max_int), chunk_size)
    return results

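# calc_factorials() returns the lazy iterator from imap_unordered, so results
# arrive in completion order and are computed only as the caller consumes
# them. factorial_calc is not shown in the original, so the worker below and
# the consuming loop are only a sketch under that assumption.
import math
from multiprocessing import Pool as ProcessPool
from multiprocessing.pool import ThreadPool

def factorial_calc(n):
    return n, math.factorial(n)

if __name__ == '__main__':
    for n, fact in calc_factorials(max_int=10, pool_size=4):
        print(n, fact)
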
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()[:args.num_docs]
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    unigrams, bigrams = [], []
    hash2gram = {}
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for b_row, b_col, b_data, b_unigrams, b_bigrams, b_hash2gram in \
                workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
            unigrams.extend(b_unigrams)
            bigrams.extend(b_bigrams)
            hash2gram.update(b_hash2gram)
    workers.close()
    workers.join()

    unigrams = list(set(unigrams))
    bigrams = list(set(bigrams))

    logger.info('Creating sparse matrix...')
    count_matrix = None
    if args.matrix_type == 'csr':
        count_matrix = sp.csr_matrix((data, (row, col)),
                                     shape=(args.hash_size, len(doc_ids)))
        count_matrix.sum_duplicates()
    elif args.matrix_type == 'csc':
        count_matrix = sp.csc_matrix((data, (row, col)),
                                     shape=(args.hash_size, len(doc_ids)))
        count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids), (unigrams, bigrams, hash2gram)

def get_source_proxies():
    get_config = GetConfig()
    # Get the names of all the free-proxy fetcher methods
    proxy_func_names = get_config.get_proxy_func()
    # Fetch the free proxies with multiple processes
    pool = ProcessPool(4)
    result = pool.map(run_cls_func, proxy_func_names)
    pool.close()
    pool.join()
    # Returns [[proxy, proxy, ...], [proxy, proxy, ...]]
    return result

def _map_multiprocess(func, iterable, chunksize=1):
    # type: (Callable[[S], T], Iterable[S], int) -> Iterator[T]
    """Chop iterable into chunks and submit them to a process pool.

    For very long iterables using a large value for chunksize can make
    the job complete much faster than using the default value of 1.

    Return an unordered iterator of the results.
    """
    with closing(ProcessPool()) as pool:
        return pool.imap_unordered(func, iterable, chunksize)

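# A small usage sketch for _map_multiprocess(); closing() only calls
# pool.close(), so work already submitted through imap_unordered keeps
# running while the returned iterator is consumed. The cube() worker is
# just an illustration, not part of the original code.
from contextlib import closing
from multiprocessing import Pool as ProcessPool

def cube(x):
    return x ** 3

if __name__ == '__main__':
    for value in _map_multiprocess(cube, range(20), chunksize=5):
        print(value)  # results arrive in completion order, not input order
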
def mk_process(proxies, url, jobs, processnum, threadnum, method, post):
    Processpools = ProcessPool(processnum)
    for i in range(processnum):
        Processpools.apply_async(mk_threading,
                                 args=(proxies[i], url, jobs, threadnum,
                                       method, post))
    print('CC start...')
    print('Waiting for all subprocesses done...')
    Processpools.close()
    Processpools.join()
    print('All subprocesses done.')

def main(opts):
    files = sorted(glob.iglob(opts.pattern))
    memc_conf = {'idfa': opts.idfa,
                 'gaid': opts.gaid,
                 'adid': opts.adid,
                 'dvid': opts.dvid}
    pool = ProcessPool(processes=FILES_PROCESSING_POOL_SIZE)
    load_file_fn = partial(load_file,
                           threads_count=LOAD_FILE_THREADS_COUNT,
                           memc_conf=memc_conf,
                           dry_run=opts.dry)
    for file_name in pool.imap(load_file_fn, files):
        dot_rename(file_name)

def refresh_parallelized(list_cubes, size_pool=5, multiprocessing=True):
    if list_cubes:
        if multiprocessing:
            pool = ProcessPool(size_pool)
        else:
            pool = ThreadPool(size_pool)
        cubes_list = list()
        segments_list = list()
        for cubes, segments in list_cubes:
            cubes_list.append(cubes)
            segments_list.append(segments)
        # Pair each cubes entry with its segments entry for refresh_star
        pool.map(refresh_star, itertools.izip(cubes_list, segments_list))
