def GroupByParallelProcess(tweetsDF, cores, groupMethod):
    """ Group by and aggregate on time via a parallel process """
    tweetsDF.label_date = tweetsDF.label_date.astype(int)
    tweetsDF = tweetsDF.set_index("label_date")
    # Parallelizing using Pool.map()
    df_split = GetListOfSplitDFs(tweetsDF, cores)
    # create the multiprocessing pool
    pool = Pool(cores)
    # process the DataFrame by mapping the function to each df across the pool
    logging.info("Starting the grouping and aggregating process.")
    if groupMethod == "weighted-average":
        df_out = pool.map(PerformGroupbyAndAggregate, df_split)
    elif groupMethod == "sum":
        df_out = pool.map(PerformSum, df_split)
    elif groupMethod == "mean":
        df_out = pool.map(PerformMean, df_split)
    else:
        logging.error("Choose a correct group by method.")
        return None
    # close down the pool and join
    pool.close()
    pool.join()
    pool.clear()
    logging.info("Ended the grouping and aggregating process.")
    return df_out
def parallelize_dataframe(df, func, n_cores=16):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df
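# Usage sketch for parallelize_dataframe above. `normalize_text` is a
# hypothetical per-chunk function, and the imports reflect what the helper
# appears to assume (numpy, pandas, and a Pool class; several snippets in
# this file use pathos's ProcessingPool as Pool).
import numpy as np
import pandas as pd

def normalize_text(chunk):
    # runs in a worker process on one chunk of the DataFrame
    chunk["text"] = chunk["text"].str.lower().str.strip()
    return chunk

if __name__ == "__main__":
    df = pd.DataFrame({"text": [" Foo", "BAR ", "baz "] * 1000})
    df = parallelize_dataframe(df, normalize_text, n_cores=4)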
def multi_word_cut(self, sentences):
    print('Multiprocessing Word cut')
    if self.language == 'ch':
        # initialize jieba first, or it will initialize in each process
        jieba.initialize()
        jieba.disable_parallel()

        def func(line):
            line = [i.strip() for i in jieba.cut(line, cut_all=False)]
            return [i for i in line
                    if (not i.isdigit()) and (i not in self.stop_words)]
    else:
        def func(line):
            return [i.lower() for i in line.split(" ")
                    if (not i.isdigit())
                    and (i not in self.stop_words)
                    and (len(i) > 1)]
    pool = Pool(nodes=5)
    t0 = time.time()
    word_cut = pool.map(func, sentences)
    pool.close()
    pool.join()
    pool.clear()
    print('MultiProcess time {:.0f}'.format(time.time() - t0))
    return word_cut
def perplexity(lang="eng"):
    """ finds the statistical perplexity of the language model in the
    Google Books N-Gram dataset. """
    pool = ProcessingPool(4)
    unigram_counter, mgram_counter, ngram_counter = pool.map(
        get_ngram_counter, [1, 2, 3], [lang] * 3)
    pool.close()
    pool.join()

    total_words = np.sum(np.array(list(unigram_counter.values())))
    print("total_words = ", total_words)

    ngram_conditionals = get_ngram_conditionals(ngram_counter, mgram_counter)

    probs = np.power(
        np.array(list(ngram_conditionals.values()), dtype=np.float64),
        -np.array(list(ngram_counter.values()), dtype=np.float64) / total_words)
    print("probs shape = ", probs.shape)

    PP = np.prod(probs, dtype=np.float64)
    return PP
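# Note on the computation above: it corresponds to the usual definition
#   PP = prod_i P(w_i | w_{i-2}, w_{i-1}) ** (-count_i / N),
# i.e. the inverse weighted geometric mean of the trigram conditionals.
# Multiplying many small floats can underflow; an equivalent log-space
# form (same inputs, illustrative) is often safer:
#   log_PP = -np.sum((counts / N) * np.log(conditionals))
#   PP = np.exp(log_PP)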
def preprocess(self):
    # Check if orderline should be extracted
    extract_orderline = conf['extract_orderline']
    pool = Pool()
    start = time.time()

    # Run in parallel
    if extract_orderline:
        res_orderline = pool.apipe(self.create_orderline, return_dataframe=False)
    res_warehouse = pool.apipe(self.create_warehouse)
    res_district = pool.apipe(self.create_district)
    res_order = pool.apipe(self.create_order)
    res_customer = pool.apipe(self.create_customer)
    res_stock = pool.apipe(self.create_stock)

    # Consolidate results
    pool.close()
    pool.join()
    list_of_processed_files = [res_warehouse.get(),
                               res_district.get(),
                               res_order.get(),
                               res_customer.get(),
                               res_stock.get()]
    if extract_orderline:
        list_of_processed_files.append(res_orderline.get())

    end = time.time()
    self.debug("Preprocessing of csv file took {}s".format(end - start))
    return list_of_processed_files
def multi_Non_Tweep_friends(self, handle):
    min_position, links = self.get_tweets(handle)
    print("Scraping last 100 days of activity")
    while True:
        min_position1, links1 = self.get_tweets(handle, min_position)
        links = links + links1
        if min_position1 is None:
            break
        min_position = min_position1
    link = [x for x in links if handle in x]
    link = self.duplicates(link)
    # Pool size tells how many requests run at a time;
    # the with-block closes the pool on exit
    with Pool(10) as p:
        records = list(tqdm(p.imap(self.get_people, link), total=len(link)))
    people_list = [item for sublist in records for item in sublist]
    people_list = self.duplicates(people_list)
    people_list = [x for x in people_list if x != handle]
    return people_list
def q5_plot_chromatic_num_bounds_by_prob(n, prange, pstep, k=None,
                                         clique_finder=greedy_find_clique_number,
                                         multi=False):
    """Plots a graph of number of colours against edge probability,
    for each of the various lower/upper bounds of chromatic number

    multi: True/False/int
        multiprocessing - yes/no/num processes (default 4 if True)
    """
    probs = np.arange(prange[0], prange[1], pstep)
    graphs = [[get_random_graph(n, p, k) for _ in range(10)] for p in probs]
    mean_bounds = []
    pool = Pool(multi if type(multi) is int else 4)
    f = lambda graphs_list: list(map(get_chromatic_number_bounds, graphs_list))
    graph_generator = pool.imap(f, graphs) if multi else map(f, graphs)
    for bounds in tqdm.tqdm(graph_generator, total=len(graphs)):
        mean_bounds.append(np.mean(bounds, axis=0))
    pool.close()
    pool.join()
    mean_bounds = np.array(mean_bounds)

    plt.figure()
    for i, label in zip(range(mean_bounds.shape[1]),
                        ['lb_comp', 'lb_clique', 'ub_clique',
                         'ub_greedy_rand', 'ub_greedy_msd']):
        plt.plot(probs, mean_bounds[:, i], label=label)
    plt.legend()
    return probs, mean_bounds
class ConsensusMHSampler(MHSampler):
    def __init__(self, log_f, log_g, g_sample, x0, iterations, shards=1):
        super(ConsensusMHSampler, self).__init__(log_f, log_g, g_sample,
                                                 x0, iterations)
        self.shards = shards
        assert len(self.log_distribution_fn) == self.shards
        self.log_fn_dict = {}  # for pickling purposes
        for i in range(self.shards):
            self.log_fn_dict[i] = self.log_distribution_fn[i]
        self.pool = Pool(nodes=self.shards)

    def sample(self):
        map_results = self.pool.map(self.map_sample, range(self.shards))
        self.pool.close()
        self.pool.join()
        self.pool.terminate()
        self.pool.restart()
        self.saved_states = self.reduce_sample(map_results)

    def map_sample(self, index):
        np.random.seed(1)
        cur_state = self.start_state
        sample_results = [cur_state]
        prob, count = 0, 0
        for i in range(self.iterations):
            if i % 5000 == 0:
                print("iteration {}".format(i))
            candidate_state = self.get_transition_sample(cur_state)
            acceptance = self.calculate_acceptance_ratio(candidate_state,
                                                         self.log_fn_dict[index])
            prob += acceptance
            count += 1
            new_state = self.transition_step(cur_state, candidate_state, acceptance)
            sample_results.append(new_state)
            cur_state = new_state
        sample_results = np.array(sample_results)
        print("INDEX {}: Avg acceptance prob is {}".format(index, prob / count))
        return (sample_results,
                1.0 / (1e-8 + self.get_sample_variance(sample_results)))

    def get_sample_variance(self, data):
        return np.linalg.norm(np.var(np.array(data), axis=0))

    def reduce_sample(self, results):
        '''results is a list of (sample_array, weight) tuples'''
        sample_results = 0
        total_weight = 0
        for sample, weight in results:
            sample_results += weight * sample
            total_weight += weight
        return sample_results / total_weight
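# Minimal self-contained sketch of the consensus reduction implemented by
# reduce_sample above: each shard's chain is weighted by the inverse of its
# sample variance, and the consensus chain is their weighted average.
# The data here is synthetic and purely illustrative.
import numpy as np

shard_chains = [np.random.randn(100, 2) for _ in range(3)]  # per-shard samples
weights = [1.0 / (1e-8 + np.linalg.norm(np.var(c, axis=0))) for c in shard_chains]
consensus = sum(w * c for w, c in zip(weights, shard_chains)) / sum(weights)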
def test_multiprocess():
    x_list = [1, 2, 3, 4, 5, 6, 7]
    y_list = ['1', '2', '3', '4', '5', '6', '7']
    epoch = 8
    pool = Pool(epoch)
    res = pool.amap(test_task, x_list, y_list)
    pool.pipe(test_task, '22', '222')
    pool.close()
    pool.join()
def parallelize_dataframe(df: pd.DataFrame, func, n_cores=4) -> pd.DataFrame:
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    # have to include this to prevent leakage and allow multiple
    # parallel function calls
    pool.terminate()
    pool.restart()
    return df
def parallelize(data, func, num_of_processes=8):
    '''Function for parallelizing any function on a dataframe.
    Stolen from Stack Overflow, user Tom Raz:
    https://stackoverflow.com/questions/26784164/pandas-multiprocessing-apply'''
    data_split = np.array_split(data, num_of_processes)
    pool = Pool(num_of_processes)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data
def goo():
    pool = Pool(4)
    stuff = list(tqdm.tqdm(pool.imap(foo, range(20)), total=20))
    print(stuff)
    print('aaa')
    pool.close()
    pool.join()
    print('bbb')
@contextmanager  # assumed: from contextlib import contextmanager
def Pool(cpus=cpu_count()) -> ProcessingPool:
    """Context manager for pathos ProcessingPool"""
    # Creates a pool with `cpus` processes
    p = ProcessingPool(cpus)
    yield p
    # Need to clear due to:
    # https://github.com/uqfoundation/pathos/issues/111
    p.close()
    p.join()
    p.clear()
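# Usage sketch for the Pool context manager above, assuming the
# @contextmanager decorator reconstructed there; `square` is a stand-in task.
def square(x):
    return x * x

with Pool(cpus=4) as p:
    results = p.map(square, range(10))
# on exit the pool has been closed, joined, and cleared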
def parallelize_dataframe(df, func, num_partitions=num_cores, num_cores=num_cores):
    df_split = np.array_split(df, num_partitions, axis=0)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    pool.clear()
    return df
def parallelmap(func, data, nodes=None):
    """
    Return the averaged signal and background (based on blank frames)
    over the given runs
    """
    if not nodes:
        nodes = multiprocessing.cpu_count() - 2
    pool = ProcessingPool(nodes=nodes)
    try:
        return pool.map(func, data)
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
@contextmanager  # assumed: from contextlib import contextmanager
def Pool(threads: int, multiplier: int, name: str):
    """Context manager for pathos ProcessingPool"""
    # Creates a pool with `threads` processes, else cpu_count * multiplier
    p = ProcessingPool(threads if threads else cpu_count() * multiplier)
    logging.debug(f"Created {name} pool")
    yield p
    # Need to clear due to:
    # https://github.com/uqfoundation/pathos/issues/111
    p.close()
    p.join()
    p.clear()
def finally_ip():
    (ipList, portList) = get_ip_list(url, headers)
    # Running pool = ThreadPool(2) sometimes raises a
    # "module '__main__' has no attribute '__spec__'" error; unclear how to fix.
    # Tried setting __spec__ = None, which didn't help.
    pool = ThreadPool(4)
    start_time = time.time()
    results = pool.map(test_ip, ipList, portList)
    pool.close()
    pool.join()
    end_time = time.time()
    print("Parallel time elapsed: " + str(end_time - start_time))
    return results
def compute_scores(args):
    results_file = args.results_file
    scores_file = args.score_file
    num_captions = args.num_captions
    is_exp = args.exp

    generated_image_tokens = get_generated_tokens(res_file=results_file,
                                                  num=num_captions)
    gt_image_tokens = get_gt_tokens()
    print('number of test images: %d, all images: %d' % (
        len(generated_image_tokens), len(gt_image_tokens)))
    all_image_ids = list(generated_image_tokens.keys())

    def f(image_id_thread):
        image_ids = image_id_thread[0]
        thread_num = image_id_thread[1]
        scores = {}
        for image_id in image_ids:
            res_tokens = generated_image_tokens[str(image_id)]
            gt_tokens = gt_image_tokens[str(image_id)]
            wmd_score = word_mover_distance(res_tokens, gt_tokens,
                                            wvmodel=wvmodel, is_exp=is_exp)
            scores[image_id] = wmd_score
            print('Thread: %d, Image ID: %s, WMD score: %.5f' % (
                thread_num, image_id, wmd_score))
        return scores

    num_images = len(all_image_ids)
    num_workers = 20
    num_per_split = num_images // num_workers
    images_split = []
    for i in range(num_workers):
        if i == (num_workers - 1):
            images_split.append([all_image_ids[(i * num_per_split):], i])
        else:
            images_split.append([
                all_image_ids[(i * num_per_split):((i + 1) * num_per_split)], i
            ])

    pool = Pool(num_workers)
    all_scores = pool.map(f, images_split)
    pool.close()
    pool.join()

    scores = {}
    for s in all_scores:
        scores.update(s)
    with open(scores_file, 'w') as f:
        json.dump(scores, f)

    total_score = 0
    for key in scores.keys():
        total_score += scores[key]
    print('WMD score: %.5f' % (total_score / len(scores)))
def parallelise_initsync(argv, ssp_params, process_control_id, logger):
    # Pivot the collection of source_system_profile records into
    # four separate lists to enable us to call pool.map on each record
    (source_schemas, tables,
     target_schemas, query_conditions) = map(list, zip(*ssp_params))

    source_conn_detail = dbuser.get_dbuser_properties(argv.sourceuser)
    target_conn_detail = dbuser.get_dbuser_properties(argv.targetuser)

    logger.info("Processing tables with {} dedicated worker processes".format(
        argv.numprocesses))

    pool = Pool(nodes=argv.numprocesses)

    argvs = [argv] * len(tables)
    source_conn_details = [source_conn_detail] * len(tables)
    target_conn_details = [target_conn_detail] * len(tables)
    pcids = [process_control_id] * len(tables)
    queues = [manager.Queue() for _ in tables]  # one result queue per table

    logger.debug("Starting a new process for each table in: {tables}".format(
        tables=tables))

    # Execute initsync for each schema/table combination in parallel
    pool.map(initsync_table,
             argvs,
             source_conn_details,
             target_conn_details,
             source_schemas,
             tables,
             target_schemas,
             pcids,
             query_conditions,
             queues,
             chunksize=1)  # Ensure tables are processed in sequence
                           # and workers are fully utilised
    pool.close()
    logger.debug("parallelise_initsync: Pool joining")
    pool.join()
    logger.debug("parallelise_initsync: Pool joined")

    all_table_results = {}
    for q in queues:
        size = q.qsize()
        message = q.get()
        logger.debug("Message queue size = {s}, message = {m}".format(
            s=size, m=message))
        all_table_results.update(message)

    logger.debug("all_table_results = {r}".format(r=all_table_results))
    return all_table_results
def parallel_apply(self, df, func):
    # add try statement re function not returning a DataFrame
    if self.preprocessing_checks(df, func):
        # split DataFrame into a list of smaller DataFrames
        self.df_split = np.array_split(df, self.partitions, axis=0)
        # create the multiprocessing pool
        pool = Pool(self.cores)
        # process the DataFrame by mapping the function to each df across the pool
        df = pd.concat(pool.map(func, self.df_split), axis=0).copy()
        # close down the pool and join
        pool.close()
        pool.join()
        pool.clear()
    return df
def get_gt_tokens(
        coco_file='../data/files/dataset_coco.json',
        coco_tokens_file='../data/files/coco_tokens_Google_news.json'):
    if os.path.exists(coco_tokens_file):
        with open(coco_tokens_file, 'r') as f:
            dataset = json.load(f)
        return dataset

    print('Processing ground-truth data...')
    with open(coco_file, 'r') as f:
        dataset = json.load(f)

    def f(images):
        image_tokens = {}
        for image in images:
            sentence = image['sentences']
            image_id = str(image['cocoid'])
            tokens = []
            for s in sentence:
                tokens.extend(s['tokens'])
            # keep only tokens that are in the vocabulary and not stop words
            filter_token = []
            for token in tokens:
                if (token not in stop_words) and (token in vocab):
                    filter_token.append(token)
            image_tokens[image_id] = filter_token
        return image_tokens

    all_images = dataset['images']
    num_images = len(all_images)
    num_workers = 30
    num_per_split = num_images // num_workers
    images_split = []
    for i in range(num_workers):
        if i == (num_workers - 1):
            images_split.append(all_images[(i * num_per_split):])
        else:
            images_split.append(
                all_images[(i * num_per_split):((i + 1) * num_per_split)])

    pool = Pool(num_workers)
    all_images_tokens = pool.map(f, images_split)
    pool.close()
    pool.join()

    all_token_dict = {}
    for d in all_images_tokens:
        all_token_dict.update(d)
    with open(coco_tokens_file, 'w') as f:
        json.dump(all_token_dict, f)
    return all_token_dict
def parallelmap(func, lst, nodes=None):
    """
    Return the averaged signal and background (based on blank frames)
    over the given runs using multiprocessing (as opposed to MPI).
    """
    from pathos.multiprocessing import ProcessingPool
    from pathos import multiprocessing
    if not nodes:
        nodes = multiprocessing.cpu_count() - 2
    pool = ProcessingPool(nodes=nodes)
    try:
        return pool.map(func, lst)
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
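# Usage sketch for parallelmap above; np.sqrt stands in for a real per-run
# processing function, and the input list is illustrative.
import numpy as np

values = list(range(8))
roots = parallelmap(np.sqrt, values, nodes=2)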
def make_query(self, size=1):
    ## quit if nr_unlabeled_samples == 1
    if self.dataset.len_unlabeled() == 1:
        return self.dataset.get_unlabeled_entries()[0].astype(int)

    ## Set the possible labels
    self.possible_labels = list(set(self.dataset.get_labeled_entries()[1]))

    ## Train the model
    self.model.train(self.dataset)

    ## Get probabilities
    X_ids, X = self.dataset.get_unlabeled_entries()
    pred = self.model.predict_proba(X)  # pred.shape = (n_unlabeled, nr_of_labels)

    ## Setup pool for cpu parallelisation
    p = Pool(cpu_count(), maxtasksperchild=1000)

    ## nr of unlabeled samples -> len(X)
    ## Get uncertainty after adding every sample with every label
    total = np.asarray(
        p.map(self._eer,
              X_ids,
              len(X) * [self.dataset],
              len(X) * [self.depth]))  # total.shape = (n_unlabeled, nr_of_labels)

    ## Close the Pool again
    p.close()
    p.join()
    p.clear()

    ## Get the total uncertainty of one sample after adding a label,
    ## weighted by the label's probability
    total = np.inner(pred, total).diagonal()  # total.shape = (n_unlabeled,)

    ## Zip it
    total = zipit(X_ids, total)

    ## Sort it
    results = sort_by_2nd(total, 'min')
    return results[:size, 0].astype(int)
def start(self, text_data_dir, res_dir, nprocs=8):
    '''Entry function.

    text_data_dir: folder of raw data
    res_dir: folder of output
    nprocs: number of cores used in parallel
    '''
    p = PathosPool(nprocs)
    filepathsvec, filenamesvec, respaths = list(), list(), list()
    for dirpath, _, filenames in os.walk(text_data_dir):
        for filename in filenames:
            if (("gz" in filename) and ('md5' not in filename)
                    and ('copy' not in filename)):
                filepath = os.path.join(dirpath, filename)
                print(filepath)
                res_name = filename.split(".")[0] + ".csv.gz"
                respath = os.path.join(res_dir, res_name)
                filepathsvec.append(filepath)
                filenamesvec.append(filename)
                respaths.append(respath)
    self.affildicts = p.amap(
        partial(self.process_data,
                stop_paths=[self.title_stop_path,
                            self.affil_stop_path,
                            self.mesh_stop_path],
                rm_stopwords=True,
                affiliation_correction=True,
                select_journals=self.select_journals),
        filepathsvec, filenamesvec, respaths)
    p.close()
    p.join()  # Having an issue joining
    print("joined")
    p.clear()  # Delete the pool
def _mp_improve(self, container, scenario_builder):
    """Improves b/2 best solutions from the container and updates
    the score table with the generated solutions
    """
    container.sort()
    pool = Pool(processes=self._proc_count)
    logging.info("Starting processes")
    start = datetime.now()
    best = []
    builders = []
    for i in range(self._b // 2):
        best.append(container.get(i))
        builders.append(scenario_builder)
    try:
        result = pool.map(self._improve, best, builders)
        pool.close()
        pool.join()
    except MemoryError as e:
        send_email("I crashed again, please help!")
        import pudb
        pudb.set_trace()
        print(e)
    logging.info("Processes finished - %s" % (datetime.now() - start))
    # How infuriating was that?!
    # pathos was being smart and was caching the pool, so this is needed
    # to prevent it from erroring out
    pool.restart()
    start = datetime.now()
    logging.info("mp_improve second loop")
    for entry in result:
        index = container.index(entry['individual'])
        best = entry['improvements'].get(0)
        if best.get_utility() < entry['individual'].get_utility():
            container.replace(best, index)
        for improvement in entry['improvements'].get_all():
            self._update_score_table(improvement)
    logging.info("mp_improve second loop - %s" % (datetime.now() - start))
    logging.info("Improved %d solutions" % container.get_changes())
    container.reset_changes()
    return container
class ParallelGridSearch(Experiment):
    param_queue = []

    def __init__(self, exp_class, parameters, parallel=4):
        """
        :param exp_class: subclass of Experiment to run
        :type exp_class: class<Experiment>
        :param parameters: dict of lists, experiment parameters to search within, i.e.:
            {
                "entropy": [1e-2, 1e-3],
                "learning_rate": [1e-3, 1e-4],
                ...
            }
            or a list of dicts-of-lists, representing multiple groups of parameters:
            [
                {
                    "entropy": [1e-2, 1e-3],
                    "learning_rate": [1e-3, 1e-4],
                    ...
                },
                {
                    "batch_size": [32, 64],
                    ...
                }
            ]
        """
        super(ParallelGridSearch, self).__init__()
        self._exp_class, self._parameters, self._parallel = \
            exp_class, parameters, parallel

    def run(self, args):
        self.log_root = args.logdir
        for parameter in GridSearch.product(self._parameters):
            label = GridSearch.labelize(parameter)
            ParallelGridSearch.param_queue.append(
                [self._exp_class, self.log_root, parameter, label, args])
        n = len(ParallelGridSearch.param_queue)
        task_index = list(range(n))
        logging.warning("total searched combination:%s", n)
        self.pool = Pool(self._parallel)
        ret = self.pool.amap(subprocess_run, task_index)
        ret.wait()
        self.pool.close()
        self.pool.join()
def combine_scores():
    """Combine the scores from all patients and dump into all_dict.txt."""
    all_dicts = {}
    duration_dict = {}
    all_dict_q = multiprocessing.Manager().Queue()
    duration_dict_q = multiprocessing.Manager().Queue()
    dirs = [y for y in os.listdir(patient_dir)
            if os.path.isdir(os.path.join(patient_dir, y))]
    bar = progressbar.ProgressBar(redirect_stdout=True, max_value=len(dirs))
    f = functools.partial(scores_and_duration_dict, all_dict_q, duration_dict_q)
    p = Pool()
    for i, _ in enumerate(p.imap(f, dirs, chunksize=50), 1):
        bar.update(i)
    p.close()
    p.join()
    while not all_dict_q.empty():
        patient_dict = all_dict_q.get()
        dur_dict = duration_dict_q.get()
        for i in patient_dict:
            print(i)
            if i not in all_dicts:
                all_dicts[i] = patient_dict[i]
            else:
                all_dicts[i].update(patient_dict[i])
        for i in dur_dict:
            print(i)
            if i not in duration_dict:
                duration_dict[i] = dur_dict[i]
            else:
                duration_dict[i].update(dur_dict[i])
    print('done combining scores, dumping...')
    json.dump(all_dicts, open(os.path.join(patient_dir, 'all_dict.txt'), 'w'))
    json.dump(duration_dict,
              open(os.path.join(patient_dir, 'duration_dict.txt'), 'w'))
def multi_get_followers_location(self, followers_ids, amount=-1, workers=10):
    # takes a list of follower ids, returns a dict of location counts
    locations = {}
    if amount != -1:
        followers_ids = random.sample(followers_ids, amount)
    p = Pool(workers)  # Pool size tells how many run at a time
    records = p.map(self.get_loc, followers_ids)
    p.close()
    p.join()
    print(records)
    for i in records:
        if i not in locations:
            locations[i] = 0
        locations[i] = locations[i] + 1
    return locations
class ProcessPool():
    def __init__(self, maxprocesses):
        self.pool = Pool(processes=maxprocesses)
        logger.info("Initialized process pool of size {}".format(maxprocesses))

    def start(self, function, local_dirs, remote_dirs):
        atexit.register(self.exit)
        logger.info(
            "Starting parallel rsync from a total of {} dirs ({} ...)".format(
                len(remote_dirs), remote_dirs[:3]))
        logger.info("Started with parallelization")
        results = self.pool.map(function, list(zip(local_dirs, remote_dirs)))
        logger.info("Stopped with results: {}".format(results))
        return results

    def exit(self):
        self.pool.close()
        self.pool.join()
def wrapper(*args, **kwargs):
    obj, data, _args = tuple(), tuple(), tuple()
    # Distinguish a bound-method call (self first) from a plain function call
    if hasattr(args[0].__class__, fn.__name__):
        obj, data, *_args = args
        obj = (obj,)
    else:
        data, *_args = args
    if type(data) != list:
        data = list(data)
    total_size = len(data)
    _batch_size = total_size // workers + 1 if batch_size is None else batch_size
    print(
        f"@Parallel[workers={workers}, data_size={total_size}, "
        f"batch_size={_batch_size}]: parallel for {fn.__qualname__}."
    )
    if shuffle:
        print(
            f"@Parallel[workers={workers}, data_size={total_size}, "
            f"batch_size={_batch_size}]: shuffle data for {fn.__qualname__}."
        )
        random.shuffle(data)
    pool = Pool(workers)
    # pathos caches pools; terminate and restart to get a clean one
    pool.terminate()
    pool.restart()
    proc = []
    for beg, end in zip(
            range(0, total_size, _batch_size),
            range(_batch_size, total_size + _batch_size, _batch_size)):
        batch = data[beg:end]
        p = pool.apipe(fn, *obj, batch, *_args, **kwargs)
        proc.append(p)
    pool.close()
    pool.join()
    result = reduce_seqs([p.get() for p in proc])
    if after_hook is not None:
        result = after_hook(result)
    return result
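# Self-contained mini version of the batch-parallel pattern used by `wrapper`
# above (split the data into batches, submit each with apipe, collect and
# concatenate). Assumes pathos; `_upper_batch` is a hypothetical task.
from pathos.multiprocessing import ProcessingPool

def _upper_batch(batch):
    return [s.upper() for s in batch]

if __name__ == "__main__":
    data = ["a", "b", "c", "d", "e"]
    batch_size = 2
    pool = ProcessingPool(2)
    handles = [pool.apipe(_upper_batch, data[i:i + batch_size])
               for i in range(0, len(data), batch_size)]
    pool.close()
    pool.join()
    result = [x for h in handles for x in h.get()]
    print(result)  # ['A', 'B', 'C', 'D', 'E']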
def get_result():
    success = []
    errors = []
    updates = {'success': success, 'errors': errors}
    keys = url_db.sql_saved_keys()
    list_keys = [key for key in keys]
    pool = Pool()
    results = pool.map(check_key, list_keys)
    for key in results:
        if key['bool']:
            success.append(key['update'])
        elif key['bool'] is False:
            errors.append(key['link'])
        else:
            continue
    pool.close()
    pool.join()
    return updates