def main(data_dir, file_dict, surfix, dry_run_dict):
    encoder_path = '{}/{}_encoder_source.txt'.format(data_dir, surfix)
    decoder_path = '{}/{}_decoder_source.txt'.format(data_dir, surfix)
    source_sentences, target_sentences = merge_blanks(
        os.path.join(data_dir, file_dict['source']),
        os.path.join(data_dir, file_dict['target']))

    print('String Preprocessing')
    source_sentences = str_utils_en.text_cleaning(source_sentences)
    target_sentences = str_utils_ch.text_cleaning(target_sentences)
    print('Double check source={}, target={}'.format(len(source_sentences),
                                                     len(target_sentences)))

    print('Word segmentation')
    jieba.initialize()
    jieba.disable_parallel()
    with ProcessingPool(nodes=min(os.cpu_count(), 5)) as pool:
        source_sentences = pool.map(
            lambda x: [i.strip() for i in x.strip().lower().split(' ')
                       if len(i) >= 1],
            source_sentences)
    with ProcessingPool(nodes=min(os.cpu_count(), 5)) as pool:
        target_sentences = pool.map(
            lambda x: [i.strip() for i in jieba.cut(x.strip(), cut_all=False)
                       if len(i) >= 1],
            target_sentences)
    print('Triple check source={}, target={}'.format(len(source_sentences),
                                                     len(target_sentences)))

    source_sentences, target_sentences = filter_sample(source_sentences,
                                                       target_sentences)
    print('Final check source={}, target={}'.format(len(source_sentences),
                                                    len(target_sentences)))

    print('Writing pair into encoder and decoder source at {}'.format(data_dir))
    with open(encoder_path, 'w', encoding='utf-8') as fe, \
         open(decoder_path, 'w', encoding='utf-8') as fd:
        for encoder_source, decoder_source in zip(source_sentences,
                                                  target_sentences):
            fe.write(' '.join(encoder_source).lower())
            fe.write('\n')
            fd.write(' '.join(decoder_source).lower())
            fd.write('\n')

    # A better sub-word tokenizer could be used to generate the dictionary.
    dump_dictionary(data_dir, source_sentences, prefix='source', debug=True,
                    dry_run=dry_run_dict)
    dump_dictionary(data_dir, target_sentences, prefix='target', debug=True,
                    dry_run=dry_run_dict)
def _featurize_compounds(self, df, featurizer, parallel=True):
    """Featurize individual compounds.

    Given a featurizer that operates on individual chemical compounds
    or macromolecules, compute and add features for each compound to
    the features dataframe.
    """
    sample_smiles = df["smiles"].tolist()
    if not parallel:
        features = []
        for ind, smiles in enumerate(sample_smiles):
            if ind % self.log_every_n == 0:
                log("Featurizing sample %d" % ind, self.verbose)
            mol = Chem.MolFromSmiles(smiles)
            features.append(featurizer.featurize([mol]))
    else:
        def featurize_wrapper(smiles):
            mol = Chem.MolFromSmiles(smiles)
            return featurizer.featurize([mol])

        features = ProcessingPool(mp.cpu_count()).map(featurize_wrapper,
                                                      sample_smiles)
    df[featurizer.__class__.__name__] = features
def _featurize_complexes(self, df, featurizer, parallel=True,
                         worker_pool=None):
    """Generates circular fingerprints for dataset."""
    protein_pdbs = list(df["protein_pdb"])
    ligand_pdbs = list(df["ligand_pdb"])

    def featurize_wrapper(ligand_protein_pdb_tuple):
        ligand_pdb, protein_pdb = ligand_protein_pdb_tuple
        print("Featurizing %s" % ligand_pdb[0:2])
        molecule_features = featurizer.featurize_complexes([ligand_pdb],
                                                           [protein_pdb])
        return molecule_features

    if not parallel:
        features = []
        for ligand_protein_pdb_tuple in zip(ligand_pdbs, protein_pdbs):
            features.append(featurize_wrapper(ligand_protein_pdb_tuple))
    elif worker_pool is None:
        worker_pool = ProcessingPool(mp.cpu_count())
        features = worker_pool.map(featurize_wrapper,
                                   zip(ligand_pdbs, protein_pdbs))
    else:
        # An externally supplied pool is expected to provide map_sync.
        features = worker_pool.map_sync(featurize_wrapper,
                                        zip(ligand_pdbs, protein_pdbs))
    df[featurizer.__class__.__name__] = list(features)
def simulate_walks_mult(self, num_walks, walk_length):
    '''
    Repeatedly simulate random walks from each node.
    '''
    G = self.G
    nodes = list(G.nodes())

    def _single_walk(walk_iter):
        print(str(walk_iter + 1), '/', str(num_walks))
        nodes2 = list(nodes)
        random.shuffle(nodes2)
        walks = list()
        for node in nodes2:
            walks.append(
                self.node2vec_walk(walk_length=walk_length, start_node=node))
        return walks

    flatten = lambda l: [item for sublist in l for item in sublist]
    print('Walk iteration:')
    pool = ProcessingPool()
    all_walks = pool.map(_single_walk, range(num_walks))
    return flatten(all_walks)
def preprocess_docs(self, docs):
    """
    Preprocess a string or list of strings.
    """
    if isinstance(docs, string_types):
        docs = [docs]

    if self.stemming is True:
        if not self.parallel:
            logger.info('preprocess %i documents without multiprocessing' %
                        len(docs))
            docs_preprocess = list(map(self.preprocess, docs))
        else:
            if sys.version_info[0] == 3:
                from multiprocessing import Pool
                pool = Pool()
                n_processes = pool._processes
            else:
                logger.info('use pathos for multiprocessing')
                from pathos.multiprocessing import ProcessingPool
                pool = ProcessingPool()
                n_processes = pool.nodes
            logger.info('preprocess %i documents with %i workers' %
                        (len(docs), n_processes))
            docs_preprocess = pool.map(self.preprocess, docs)
    else:
        logger.info('no preprocess function applied')
        docs_preprocess = docs
    return docs_preprocess
def parallelMap(func, args, batchFunc=None, zippedIn=True, zippedOut=False,
                cores=-1, quiet=False):
    """Parallel map using the multiprocessing library pathos.

    Args:
        func (function): function to be mapped over the arguments
        args (arguments): [arg1s, arg2s, ..., argns] (zippedIn==True) or
            [[arg1, arg2, ..., argn], ...] (zippedIn==False)
        batchFunc (func, optional): applied to each batch index list.
            Defaults to None (identity).
        zippedIn (bool, optional): see [args]. Defaults to True.
        zippedOut (bool, optional): see [Returns]. Defaults to False.
        cores (int, optional): how many processes. Defaults to -1 (all cores).
        quiet (bool, optional): do not print anything. Defaults to False.

    Returns:
        tuples: [out1s, out2s, ..., outns] (zippedOut==False) or
            [[out1, out2, ..., outn], ...] (zippedOut==True)
    """
    from pathos.multiprocessing import ProcessingPool
    if batchFunc is None:
        batchFunc = lambda x: x
    if zippedIn:
        args = list(map(list, zip(*args)))  # transpose
    if cores == -1:
        cores = os.cpu_count()
    pool = ProcessingPool(nodes=cores)
    batchIdx = list(range(len(args[0])))
    batches = array2batches(batchIdx, cores)
    out = []
    iterations = enumerate(batches) if quiet else progbar(enumerate(batches))
    for i, batch in iterations:
        batch = batchFunc(batch)  # hook point; identity by default
        batch_args = [[arg[i] for i in batch] for arg in args]
        out.extend(pool.map(func, *batch_args))
    if not zippedOut:
        if type(out[0]) is not tuple:
            out = [(item,) for item in out]
        out = list(map(list, zip(*out)))
    return out
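# A minimal usage sketch for parallelMap (an assumption, not from the original
# source): with zippedIn=True, args is a list of per-call argument tuples, and
# with zippedOut=True the raw list of outputs is returned. Assumes the helper
# functions array2batches/progbar used by parallelMap are importable.
def _add(a, b):
    return a + b

if __name__ == '__main__':
    sums = parallelMap(_add, [(1, 10), (2, 20), (3, 30)],
                       zippedIn=True, zippedOut=True, cores=2, quiet=True)
    print(sums)  # expected: [11, 22, 33]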
def paralel_fitness(ind):
    # Note: this pool is created but never used below; the games are run via
    # explicit multiprocessing Process objects instead.
    pool = ProcessingPool(nodes=3)
    global funcs
    global agents_one
    global agents_two
    global envs
    processes = []
    manager = Manager()
    return_vals = manager.dict()
    for i in range(6):
        funcs[i] = toolbox.compile(expr=ind)
        agents_one[i] = Genetic_agent(1, funcs[i])
        processes.append(
            Process(target=simulate_one_game,
                    args=(i, envs[i], agents_one[i], agents_two[i],
                          return_vals)))
    for i in range(len(processes)):
        processes[i].start()
    for i in range(len(processes)):
        processes[i].join()
    result = sum(return_vals.values()) // len(processes)
    print(f"sum: {result}" + 2 * "\n")
    return result,
def sample(self, num_observations: int,
           num_processes: Optional[int] = None) -> List[pd.DataFrame]:
    """Return dataframe of size (num_observations, len(variables))"""

    def fn(chunk_size: int):
        np.random.seed()  # reseed each worker process to ensure independence
        df = pd.DataFrame()
        for variable in self.variables:
            df[variable.idx] = variable.sample(df=df,
                                               num_observations=chunk_size)
        return df

    if num_processes is None:
        max_num_processes = max(int(num_observations / 10000), 1)
        num_processes = min(multiprocessing.cpu_count(), max_num_processes)

    pool = ProcessingPool()
    chunk_sizes = [
        int(num_observations / num_processes) for _ in range(num_processes)
    ]
    chunk_sizes[-1] = num_observations - sum(chunk_sizes[:-1])
    return pool.map(fn, chunk_sizes)
def main(args):
    """
    Parameters
    ----------
    args : dict
        See ``fragments`` subcommand
    """
    # list of genome files
    genomeList = Utils.parseGenomeList(args['<genomeList>'],
                                       filePath=args['--fp'])

    # analyzing each genome (in parallel)
    pfunc = functools.partial(by_genome, args=args)

    # diffusion calc in parallel
    pool = ProcessingPool(nodes=int(args['--np']))
    if args['--debug']:
        fragList = map(pfunc, genomeList)
    else:
        fragList = pool.map(pfunc, genomeList)

    # writing out table
    if args['--tbl']:
        write_fragList(fragList)
    else:
        dill.dump(fragList, sys.stdout)
def main(args):
    """
    Parameters
    ----------
    args : dict
        See ``genome_index`` subcommand
    """
    # loading genome list
    genomeList = Utils.parseGenomeList(args['<genomeList>'],
                                       filePath=args['--fp'])

    # setting function for parallel calling
    pfunc = functools.partial(index_genome,
                              faToTwoBitExe=args['--twobit'],
                              fmtExe=args['--fmt'],
                              idxExe=args['--idx'],
                              K_value=args['--K_value'],
                              quiet=args['--quiet'])

    # indexing genomes in parallel
    pool = ProcessingPool(nodes=int(args['--np']))
    if args['--debug']:
        KDE_BD = map(pfunc, genomeList)
    else:
        KDE_BD = pool.map(pfunc, genomeList)

    # status
    sys.stderr.write('#-- All genomes indexed --#\n')
def _run_multi_process(self, data_frame: DataFrame, wait=True, timeout=None):
    """Start the ingestion process with the specified number of processes.

    Args:
        data_frame (DataFrame): source DataFrame to be ingested.
        wait (bool): whether to wait for the ingestion to finish or not.
        timeout (Union[int, float]): ``concurrent.futures.TimeoutError`` will
            be raised if timeout is reached.
    """
    batch_size = math.ceil(data_frame.shape[0] / self.max_processes)

    args = []
    for i in range(self.max_processes):
        start_index = min(i * batch_size, data_frame.shape[0])
        end_index = min(i * batch_size + batch_size, data_frame.shape[0])
        args += [(data_frame[start_index:end_index], start_index, timeout)]

    def init_worker():
        # ignore keyboard interrupts in child processes.
        signal.signal(signal.SIGINT, signal.SIG_IGN)

    self._processing_pool = ProcessingPool(self.max_processes, init_worker)
    self._processing_pool.restart(force=True)

    f = lambda x: self._run_multi_threaded(*x)  # noqa: E731
    self._async_result = self._processing_pool.amap(f, args)

    if wait:
        self.wait(timeout=timeout)
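# A minimal standalone sketch (not from the original source) of the amap
# pattern used above: pathos' amap returns an async result object whose
# get() blocks until all workers have finished.
from pathos.multiprocessing import ProcessingPool

def _square(x):
    return x * x

if __name__ == '__main__':
    pool = ProcessingPool(2)
    async_result = pool.amap(_square, [1, 2, 3, 4])
    print(async_result.get())  # [1, 4, 9, 16]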
def closure(rolling_groupby, func, *args, **kwargs):
    groups = list(rolling_groupby._groupby.groups.items())
    chunks = chunk(len(groups), nb_workers)
    object_id = plasma_client.put(rolling_groupby.obj)
    groups_id = plasma_client.put(groups)

    attribute2value = {
        attribute: getattr(rolling_groupby, attribute)
        for attribute in rolling_groupby._attributes
    }

    worker_args = [
        (
            plasma_store_name,
            object_id,
            groups_id,
            attribute2value,
            chunk,
            func,
            args,
            kwargs,
        )
        for chunk in chunks
    ]

    with ProcessingPool(nb_workers) as pool:
        result_workers = pool.map(RollingGroupby.worker, worker_args)

    result = pd.concat(
        [plasma_client.get(result_worker) for result_worker in result_workers],
        copy=False,
    )

    return result
def closure(df_grouped, func, *args, **kwargs):
    groups = list(df_grouped.groups.items())
    chunks = chunk(len(groups), nb_workers)
    object_id = plasma_client.put(df_grouped.obj)
    groups_id = plasma_client.put(groups)

    workers_args = [(plasma_store_name, object_id, groups_id, chunk, func,
                     args, kwargs) for chunk in chunks]

    with ProcessingPool(nb_workers) as pool:
        result_workers = pool.map(DataFrameGroupBy.worker, workers_args)

    if len(df_grouped.grouper.shape) == 1:
        # One element in the "by" argument
        if type(df_grouped.keys) == list:
            # "by" argument is a list with only one element
            keys = df_grouped.keys[0]
        else:
            keys = df_grouped.keys
        index = pd.Series(list(df_grouped.grouper), name=keys)
    else:
        # A list in the "by" argument
        index = pd.MultiIndex.from_tuples(list(df_grouped.grouper),
                                          names=df_grouped.keys)

    result = pd.DataFrame(list(
        itertools.chain.from_iterable([
            plasma_client.get(result_worker)
            for result_worker in result_workers
        ])),
                          index=index).squeeze()
    return result
def get_raw_data(data_type):
    emb_path = EMBEDDINGS_PATH / data_type
    chunks = np.array_split(os.listdir(emb_path), mp.cpu_count())

    def process_chunk(files):
        data = {}
        for file in files:
            file_path = emb_path / file
            with open(file_path, 'r') as input_file:
                lines = input_file.readlines()
                raw_string = ''.join(lines).replace('\n', '').replace(
                    '[', '').replace(']', '')
                if 'None' not in raw_string:
                    embedding = np.fromstring(raw_string, dtype=np.float64,
                                              sep=' ')
                else:
                    embedding = None
                data[file] = embedding
        return data

    with ProcessingPool(mp.cpu_count()) as pool:
        proc_chunks = list(pool.map(process_chunk, chunks))

    merged = {}
    for dict_chunk in proc_chunks:
        merged = {**merged, **dict_chunk}
    return merged
def main(argv):
    logging.info('Building title features')
    config = configparser.ConfigParser()
    config.read([
        'config/database_config.ini', 'config/database_tables.ini',
        'config/inventor/build_title_map_sql.ini'
    ])

    # create output folder if it doesn't exist
    logging.info(
        'writing results to folder: %s',
        os.path.dirname(config['INVENTOR_BUILD_TITLES']['feature_out']))
    os.makedirs(os.path.dirname(
        config['INVENTOR_BUILD_TITLES']['feature_out']),
                exist_ok=True)

    feats = [n for n in ProcessingPool().imap(run, ['granted', 'pregranted'])]
    features = feats[0]
    for i in range(1, len(feats)):
        features.update(feats[i])

    with open(
            config['INVENTOR_BUILD_TITLES']['feature_out'] + '.%s.pkl' % 'both',
            'wb') as fout:
        pickle.dump(features, fout)
def sample_function(function, value_range=(-1, 1), resolution=(1000, 1000),
                    grid=True, parallel=True, **params):
    """
    Sample a function over an xy plane with the given value range and
    resolution. The function is called as function((x, y), **params).

    Returns an array of shape (resolution_x, resolution_y, *function_shape),
    e.g. (1000, 1000, 3) if f(p) = [a, b, c],
    or (1000, 1000, 3, 3) if f(p).shape == (3, 3).
    """
    # TODO make over any number of dimensions?
    xy = xy_plane(value_range, resolution, grid=grid)
    if parallel:
        # Flatten into an array of 2d points [(x, y), ...]
        points = xy.reshape(-1, xy.shape[-1])
        with ProcessingPool() as pool:
            values = pool.map(lambda p: function(p, **params), points)
        sampled = np.resize(
            values, xy.shape[:-1])  # TODO doesn't work for non-scalar functions
    else:
        sampled = np.apply_along_axis(lambda p: function(p, **params), 2, xy)
    # returned shape: (resolution_x, resolution_y, *function_shape)
    return sampled
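# A hypothetical usage sketch for sample_function (assumes the xy_plane helper
# from the surrounding module is available): sampling a scalar field over a
# 100x100 grid; with a scalar function the result has shape
# (resolution_x, resolution_y).
import numpy as np

def _radius(p):
    x, y = p
    return np.sqrt(x * x + y * y)

if __name__ == '__main__':
    field = sample_function(_radius, value_range=(-1, 1),
                            resolution=(100, 100), parallel=False)
    print(field.shape)  # (100, 100)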
def perplexity(lang="eng"):
    """
    Finds the statistical perplexity of the language model in the
    Google Books N-Gram dataset.
    """
    pool = ProcessingPool(4)
    unigram_counter, mgram_counter, ngram_counter = pool.map(
        get_ngram_counter, [1, 2, 3], [lang] * 3)
    pool.close()
    pool.join()

    total_words = np.sum(np.array(list(unigram_counter.values())))
    print("total_words = ", total_words)

    ngram_conditionals = get_ngram_conditionals(ngram_counter, mgram_counter)

    probs = np.power(
        np.array(list(ngram_conditionals.values()), dtype=np.float64),
        -np.array(list(ngram_counter.values()), dtype=np.float64) / total_words)
    print("probs shape = ", probs.shape)

    PP = np.prod(probs, dtype=np.float64)
    return PP
def parallel(fun, params, MC=None, passID=True):
    """
    Helper for parallel function evaluation.

    :param fun: function to be evaluated
    :param params: if MC is provided, params is treated as a single parameter
        set used for each repetition of the function evaluation; otherwise,
        params is treated as an iterable providing a different parameter set
        for each function evaluation
    :param MC: integer number of repeated function evaluations with a single
        parameter set (for Monte Carlo simulation)
    :param passID: pass the ID of each evaluation as an additional keyword
        argument to the function (e.g. for a random seed)
    :return: list of return values of all function evaluations
    """
    # create multiprocessing pool
    pool = ProcessingPool()

    # create iterator and corresponding callable for parallel function evaluations
    if passID:
        f = lambda x: fun(*x[0], id=x[1])
        if MC is None:
            it = ((p, ID) for (p, ID) in zip(params, count()))
        else:
            it = ((p, ID) for (p, ID) in zip(repeat(params, MC), range(MC)))
    else:
        f = lambda x: fun(*x)
        if MC is None:
            it = params
        else:
            it = repeat(params, MC)

    # run the workers and return the results
    return pool.map(f, it)
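# A usage sketch for the parallel() helper above (example values are
# assumptions): with MC set, the single parameter tuple is evaluated MC times
# and each evaluation receives its index as the keyword argument `id`.
def _noisy_eval(x, y, id=0):
    # in a real Monte Carlo run, `id` could seed a per-worker RNG
    return x + y + id

if __name__ == '__main__':
    results = parallel(_noisy_eval, (1, 2), MC=4, passID=True)
    print(results)  # [3, 4, 5, 6]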
def spawn_process(self):
    """Spawns the file serving process"""
    utils.kill_port(self.port)
    self._process = ProcessingPool()
    self._process.apipe(self._serve_file)
    logging.debug("Served RPKI File")
def write_image_name_on_images_in_dir_parallel(img_files,
                                               input_folder_path,
                                               output_folder_path,
                                               override=False,
                                               xy=(0, 0),
                                               text_color=(255, 255, 255),
                                               font_size=40,
                                               background_color=(0, 0, 0),
                                               number_gpus=6):
    logger.info('write_image_name_on_images_in_dir_parallel: ...')
    logger.info('Using ' + str(number_gpus) + ' of ' + str(cpu_count()) +
                ' CPUs')
    from Utility.List_Extension import ListExtension
    chunks = ListExtension.split_list_in_n_parts(img_files, number_gpus)

    with ProcessingPool() as pool:
        results = []
        for gpu_id in range(number_gpus):
            result = pool.apipe(
                ImageFileHandler.write_image_name_on_images_in_dir_sequential,
                *[chunks[gpu_id], input_folder_path, output_folder_path,
                  override, xy, text_color, font_size, background_color])
            results.append(result)

        # Collect the asynchronous results
        for result in results:
            result.get()
    logger.info('write_image_name_on_images_in_dir_parallel: Done')
def preprocess_transition_probs_multi(self):
    '''
    Preprocessing of transition probabilities for guiding the random walks.
    '''
    G = self.G
    is_directed = self.is_directed

    def _single_probe(node):
        unnormalized_probs = [
            G[node][nbr]['weight'] for nbr in sorted(G.neighbors(node))
        ]
        norm_const = sum(unnormalized_probs)
        normalized_probs = [
            float(u_prob) / norm_const for u_prob in unnormalized_probs
        ]
        return node, alias_setup(normalized_probs)

    pool = ProcessingPool()
    alias_nodes = dict(pool.map(_single_probe, G.nodes()))

    alias_edges = {}
    if is_directed:
        for edge in G.edges():
            alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
    else:
        for edge in G.edges():
            alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
            alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1],
                                                                  edge[0])

    self.alias_nodes = alias_nodes
    self.alias_edges = alias_edges
    return
def get_stats(self):
    """Get stats for all genomes. Concat the results into a DataFrame."""
    # pool.map needs an arg for each function call that will be run
    dmx_mean = [self.dmx.mean()] * len(self.genome_paths)
    with ProcessingPool() as pool:
        results = pool.map(genome.mp_stats, self.genome_paths, dmx_mean)
    self.stats = pd.concat(results)
    self.stats.to_csv(self.stats_path)
def compute_importance(self, alpha):
    """Average the per-tree importance errors over all trees."""
    pool = ProcessingPool(self._numJobs)
    errors = pool.map(self._computeImportanceOfTree,
                      [alpha] * self._numTree, range(self._numTree))
    return np.array(errors).mean(axis=0)
def process(f, iterable, n_workers=MAX_WORKERS):
    if n_workers < 0:
        n_workers = MAX_WORKERS + 1 + n_workers
    assert n_workers > 0, \
        f"n_workers must be between {-MAX_WORKERS} and {MAX_WORKERS}"
    if n_workers == 1:
        return [f(x) for x in iterable]
    pool = ProcessingPool(nodes=n_workers)  # use the requested worker count
    return pool.map(f, iterable)
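# Usage sketch for process() (assumes MAX_WORKERS is defined in the module):
# n_workers == 1 runs serially in the calling process; a negative value counts
# back from MAX_WORKERS, so -1 uses all available workers.
def _double(x):
    return 2 * x

if __name__ == '__main__':
    print(process(_double, range(5), n_workers=1))   # serial: [0, 2, 4, 6, 8]
    print(process(_double, range(5), n_workers=-1))  # MAX_WORKERS processes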
def main(argv):
    logging.info('Building coinventor features')
    feats = [n for n in ProcessingPool().imap(run, ['granted', 'pregranted'])]
    features = feats[0]
    for i in range(1, len(feats)):
        features.update(feats[i])
    with open(FLAGS.feature_out + '.%s.pkl' % 'both', 'wb') as fout:
        pickle.dump(features, fout)
def closure(df, func, *args, **kwargs):
    pool = ProcessingPool(nb_workers)
    manager = Manager()
    queue = manager.Queue()

    ProgressBars = (ProgressBarsNotebookLab
                    if in_notebook_lab else ProgressBarsConsole)

    axis = kwargs.get("axis", 0)
    if axis == "index":
        axis = 0
    elif axis == "columns":
        axis = 1
    opposite_axis = 1 - axis

    chunks = chunk(df.shape[opposite_axis], nb_workers)

    maxs = [chunk.stop - chunk.start for chunk in chunks]
    values = [0] * nb_workers
    finished = [False] * nb_workers

    if display_progress_bar:
        progress_bar = ProgressBars(maxs)

    object_id = plasma_client.put(df)

    workers_args = [(
        plasma_store_name,
        object_id,
        chunk,
        func,
        display_progress_bar,
        queue,
        index,
        args,
        kwargs,
    ) for index, chunk in enumerate(chunks)]

    result_workers = pool.amap(DataFrame.worker_apply, workers_args)

    if display_progress_bar:
        while not all(finished):
            for _ in range(finished.count(False)):
                index, value, status = queue.get()
                values[index] = value
                finished[index] = status
            progress_bar.update(values)

    result = pd.concat(
        [
            plasma_client.get(result_worker)
            for result_worker in result_workers.get()
        ],
        copy=False,
    )

    return result
def ident_CATU(self, ncpus=None):
    if ncpus is None:
        ncpus = cpu_count() - 1
    cols = ['NYMFID', 'TRANSACTION_DATE', 'PARENT_ID', 'AMOUNT']
    temp = self.df.loc[:, cols]
    nymfids = temp['NYMFID'].unique()
    pool = ProcessingPool(ncpus=ncpus)
    result = pool.map(lambda x: _ident_CATU_worker(temp, x), nymfids)
    return list(filter(None, result))
def handle_document(self, document):
    doc_image = document.load_image()  # load the document image
    doc_name = os.path.join(self.training_data_path,
                            "doc_" + str(uuid.uuid4()))  # name the document
    cv2.imwrite(doc_name + ".jpg", doc_image)  # save the document

    # apply the masks
    return list(ProcessingPool().imap(
        self.apply_masks_unpack,
        product([doc_image], [doc_name], self.mask_set.mask_set)))
from contextlib import contextmanager

@contextmanager
def Pool(cpus=cpu_count()):
    """Context manager for a pathos ProcessingPool"""
    # Creates a pool with `cpus` processes
    p = ProcessingPool(cpus)
    try:
        yield p
    finally:
        # Need to clear due to:
        # https://github.com/uqfoundation/pathos/issues/111
        p.close()
        p.join()
        p.clear()
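# Usage sketch for the Pool context manager above (example values are
# assumptions): the pool is closed, joined, and cleared when the block exits.
def _cube(x):
    return x ** 3

if __name__ == '__main__':
    with Pool(2) as p:
        print(p.map(_cube, [1, 2, 3]))  # [1, 8, 27]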
def run_all_control_analysis(self):
    dirs = dir_walker(self.encode_root)
    control_dir = None
    for d in dirs:
        if 'control' in d.lower():
            control_dir = d
    assert control_dir is not None
    replicates = dir_walker(control_dir, level=1)
    pool = ProcessingPool(nodes=14)
    pool.map(self.control_analysis, tuple(replicates))
    return replicates