def store_atom_enumeration(self, filename=None, multithread=False,
                           max_candidates=1):
    self.filename = filename
    DB = PrototypeSQL(filename=filename)
    DB._connect()
    N0 = DB.ase_db.count()
    prototypes = DB.select(max_atoms=self.max_atoms,
                           spacegroups=self.spacegroups,
                           source='prototype')
    Nprot = len(prototypes)
    pool = Pool()
    t0 = time.time()
    if multithread:
        res = pool.amap(self.store_atoms_for_prototype, prototypes)
        while not res.ready():
            N = DB.ase_db.count() - N0
            t = time.time() - t0
            N_per_t = N / t
            if N > 0:
                print('---------------------------------')
                print("{}/{} structures generated in {:.2f} sec".format(N, Nprot, t))
                print("{} sec / structure".format(t / N))
                print('Estimated time left: {:.2f} min'.format(Nprot / N_per_t / 60))
                print('---------------------------------')
            time.sleep(10)
        res = res.get()
    else:
        for prototype in prototypes:
            self.store_atoms_for_prototype(prototype)
def fit(self, train_feats, train_labels, feat_names=None):
    if self.__feature_names is None:
        self.__feature_names = feat_names
    self.__train_features, self.__train_labels = train_feats, train_labels
    pool = Pool(cpu_count())
    results = pool.amap(self.__build_forest, range(self.__n_estimators))
    self.__trees = results.get()
def predict(self, test_features):
    if self.__train_features is None or self.__train_labels is None:
        raise Exception("Training Data not fitted.")
    self.__test_features = list(test_features)
    pool = Pool(cpu_count())
    self.__predictions = pool.amap(self.__k_nearest_neighbours,
                                   self.__test_features).get()
    return self.__predictions
def closure(df, func, *args, **kwargs):
    pool = ProcessingPool(nb_workers)
    manager = Manager()
    queue = manager.Queue()

    ProgressBars = (ProgressBarsNotebookLab if in_notebook_lab
                    else ProgressBarsConsole)

    axis = kwargs.get("axis", 0)
    if axis == "index":
        axis = 0
    elif axis == "columns":
        axis = 1
    opposite_axis = 1 - axis

    chunks = chunk(df.shape[opposite_axis], nb_workers)
    maxs = [chunk.stop - chunk.start for chunk in chunks]
    values = [0] * nb_workers
    finished = [False] * nb_workers

    if display_progress_bar:
        progress_bar = ProgressBars(maxs)

    object_id = plasma_client.put(df)

    workers_args = [(plasma_store_name, object_id, chunk, func,
                     display_progress_bar, queue, index, args, kwargs)
                    for index, chunk in enumerate(chunks)]

    result_workers = pool.amap(DataFrame.worker_apply, workers_args)

    if display_progress_bar:
        while not all(finished):
            for _ in range(finished.count(False)):
                index, value, status = queue.get()
                values[index] = value
                finished[index] = status
            progress_bar.update(values)

    result = pd.concat(
        [plasma_client.get(result_worker)
         for result_worker in result_workers.get()],
        copy=False,
    )
    return result
def test_multiprocess():
    x_list = [1, 2, 3, 4, 5, 6, 7]
    y_list = ['1', '2', '3', '4', '5', '6', '7']
    epoch = 8
    pool = Pool(epoch)
    res = pool.amap(test_task, x_list, y_list)
    pool.pipe(test_task, '22', '222')
    pool.close()
    pool.join()
def closure(rolling, func, *args, **kwargs):
    pool = ProcessingPool(nb_workers)
    manager = Manager()
    queue = manager.Queue()

    ProgressBars = (ProgressBarsNotebookLab if in_notebook_lab
                    else ProgressBarsConsole)

    series = rolling.obj
    window = rolling.window
    chunks = chunk(len(series), nb_workers, window)
    maxs = [chunk.stop - chunk.start for chunk in chunks]
    values = [0] * nb_workers
    finished = [False] * nb_workers

    if display_progress_bar:
        progress_bar = ProgressBars(maxs)

    object_id = plasma_client.put(series)

    attribute2value = {attribute: getattr(rolling, attribute)
                       for attribute in rolling._attributes}

    workers_args = [(plasma_store_name, object_id, chunk, func,
                     display_progress_bar, queue, index, attribute2value,
                     args, kwargs)
                    for index, chunk in enumerate(chunks)]

    result_workers = pool.amap(SeriesRolling.worker, workers_args)

    if display_progress_bar:
        while not all(finished):
            for _ in range(finished.count(False)):
                index, value, status = queue.get()
                values[index] = value
                finished[index] = status
            progress_bar.update(values)

    result = pd.concat([
        plasma_client.get(result_worker)
        for result_worker in result_workers.get()
    ], copy=False)
    return result
class PPool:
    """pathos multi-processing pool"""

    def __init__(self, processor_num: int = None):
        self.processor_num = cpu_count() if processor_num is None \
            else min(processor_num, cpu_count())
        LOGGER.debug('Building Pathos multi-processing pool with {} cores.'
                     .format(self.processor_num))
        self._pool = Pool(self.processor_num)

    def flatten_params(self, params: List):
        """params: List[*args, **kwargs]"""
        # block_size = int(math.ceil(len(params) / self.processor_num))
        # block_num = int(math.ceil(len(params) / block_size))
        block_size = (len(params) + self.processor_num - 1) // self.processor_num
        block_num = (len(params) + block_size - 1) // block_size
        block_params = [params[i * block_size:(i + 1) * block_size]
                        for i in range(block_num)]
        return block_params

    def close(self):
        self._pool.close()
        self._pool.join()
        self._pool.clear()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def feed(self, func: Any, params: List, one_params: bool = False) -> List[Any]:
        if one_params:
            result = self._pool.amap(func, params).get()
        else:
            params = tuple(zip(*params))
            result = self._pool.amap(func, *params).get()
        return result
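# A minimal usage sketch for the PPool wrapper above (the worker `add` and the
# pool size are illustrative assumptions, not part of the original class; the
# sketch relies on the snippet's own LOGGER and cpu_count imports):
def add(a, b):
    return a + b

if __name__ == '__main__':
    with PPool(2) as pool:
        # one_params=False (the default): each entry of params is an argument
        # tuple, which feed() transposes into parallel argument lists for amap
        print(pool.feed(add, [(1, 2), (3, 4), (5, 6)]))  # -> [3, 7, 11]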
def start(self, text_data_dir, res_dir, nprocs=8):
    '''entry function

    text_data_dir: folder of raw data
    res_dir: folder of output
    nprocs: number of cores to use in parallel
    '''
    p = PathosPool(nprocs)
    filepathsvec, filenamesvec, respaths = list(), list(), list()
    for dirpath, _, filenames in os.walk(text_data_dir):
        for filename in filenames:
            if (("gz" in filename) and ('md5' not in filename)
                    and ('copy' not in filename)):
                filepath = os.path.join(dirpath, filename)
                print(filepath)
                res_name = filename.split(".")[0] + ".csv.gz"
                respath = os.path.join(res_dir, res_name)
                filepathsvec.append(filepath)
                filenamesvec.append(filename)
                respaths.append(respath)
    self.affildicts = p.amap(
        partial(self.process_data,
                stop_paths=[self.title_stop_path,
                            self.affil_stop_path,
                            self.mesh_stop_path],
                rm_stopwords=True,
                affiliation_correction=True,
                select_journals=self.select_journals),
        filepathsvec, filenamesvec, respaths)
    p.close()
    p.join()  # having an issue joining
    print("joined")
    p.clear()  # delete the pool
def main(args):
    """Main function for calculating BD shift.

    Parameters
    ----------
    args : dict
        See ``BD_shift`` subcommand
    """
    sys.stderr.write('Loading KDE objects...\n')
    kde1 = Utils.load_kde(args['<kde1>'])
    kde2 = Utils.load_kde(args['<kde2>'])

    # adding top-level library ID if not present
    kde1 = kde_add_lib(kde1)
    kde2 = kde_add_lib(kde2)

    sys.stderr.write('Calculating BD shifts...\n')
    print('\t'.join(['lib1', 'lib2', 'taxon', 'BD_shift']))
    for libID1, d1 in kde1.items():
        for libID2, d2 in kde2.items():
            msg = ' Comparing libraries: "{}", "{}"\n'
            sys.stderr.write(msg.format(libID1, libID2))

            # overlap of taxa between libraries
            taxa = taxon_overlap(d1, d2)

            # calculating BD shift (in parallel)
            pfunc = partial(kde_intersect,
                            start=float(args['--start']),
                            end=float(args['--end']),
                            step=float(args['--step']))
            pool = ProcessingPool(nodes=int(args['--np']))
            if args['--debug']:
                res = list(map(pfunc, [(taxon, d1[taxon], d2[taxon])
                                       for taxon in taxa]))
            else:
                res = pool.amap(pfunc, [(taxon, d1[taxon], d2[taxon])
                                        for taxon in taxa])
                while not res.ready():
                    time.sleep(2)
                res = res.get()

            # writing out table
            for line in res:
                print('\t'.join([libID1, libID2] + [str(x) for x in line]))
def run_service3(service, iterable, iterable_arguments,
                 iterable_argument_names, worker_count, log_function=print):
    start = timer()
    if log_function is not None:
        log_function("[run_service] running service {} with {} workers".format(
            service, worker_count))

    # build one kwargs dict per work item
    all_args = []
    for x in iterable:
        if type(x) is not tuple:
            x = [x]
        args = dict(dict(zip(iterable_argument_names, x)), **iterable_arguments)
        all_args.append(args)

    pool = Pool(worker_count)
    results = pool.amap(service, all_args)
    final_results = results.get()

    # if the example service model is used, metrics can be gathered in this way
    messages = []
    total = len(final_results)
    failure = 0
    for error, mem_usage in final_results:
        if error is not False:
            failure += 1
            if type(error) is str:
                messages.append(error)

    # if we should be logging and if there is material to be logged
    if log_function is not None and (total + failure + len(messages)) > 0:
        log_function(
            "[run_service] Summary {}:\n[run_service]\tTime: {}s\n"
            "[run_service]\tTotal: {}\n[run_service]\tFailure: {}"
            .format(service, int(timer() - start), total, failure))
        log_function("[run_service]\tMessages:\n[run_service]\t\t{}".format(
            "\n[run_service]\t\t".join(messages)))

    # return relevant info
    return total, failure, messages
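# Hedged usage sketch for run_service3 above: `echo_service` is a hypothetical
# worker that returns the (error, mem_usage) pairs the summary loop expects.
def echo_service(service_args):
    # a real service would do work with service_args['x'] here
    return False, 0  # no error, nothing to report

if __name__ == '__main__':
    total, failure, messages = run_service3(
        echo_service, iterable=[1, 2, 3], iterable_arguments={},
        iterable_argument_names=['x'], worker_count=2)
    assert (total, failure) == (3, 0)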
class ParallelGridSearch(Experiment):
    param_queue = []

    def __init__(self, exp_class, parameters, parallel=4):
        """
        :param exp_class: subclass of Experiment to run
        :type exp_class: class<Experiment>
        :param parameters: dict of lists giving the experiment parameters to
            search within, e.g.:

                {
                    "entropy": [1e-2, 1e-3],
                    "learning_rate": [1e-3, 1e-4],
                    ...
                }

            or a list of dicts-of-lists, representing multiple groups of
            parameters:

                [
                    {
                        "entropy": [1e-2, 1e-3],
                        "learning_rate": [1e-3, 1e-4],
                        ...
                    },
                    {
                        "batch_size": [32, 64],
                        ...
                    }
                ]
        """
        super(ParallelGridSearch, self).__init__()
        self._exp_class, self._parameters, self._parallel = \
            exp_class, parameters, parallel

    def run(self, args):
        self.log_root = args.logdir
        for parameter in GridSearch.product(self._parameters):
            label = GridSearch.labelize(parameter)
            ParallelGridSearch.param_queue.append(
                [self._exp_class, self.log_root, parameter, label, args])
        n = len(ParallelGridSearch.param_queue)
        task_index = list(range(n))
        logging.warning("total searched combination:%s", n)
        self.pool = Pool(self._parallel)
        ret = self.pool.amap(subprocess_run, task_index)
        ret.wait()
        self.pool.close()
        self.pool.join()
def launch_task():
    def chunks(l, n):
        """Yield successive n-sized chunks from l."""
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def list_fn(todo_list) -> PayloadSaver:
        payload_saver = PayloadSaver()
        for doc_id, text in todo_list:
            tokenize_doc_and_save(payload_saver, doc_id, text, tokenize_fn)
        return payload_saver

    from pathos.multiprocessing import ProcessingPool as Pool
    p = Pool(num_thread, daemon=True)
    split_n = int(len(todo_list) / num_thread) + 1
    args = chunks(todo_list, split_n)
    result_handle = p.amap(list_fn, args)
    return result_handle
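# A self-contained sketch of the chunk-then-amap pattern used by launch_task
# above, with a trivial worker standing in for list_fn (all names here are
# illustrative, not from the original snippet):
from pathos.multiprocessing import ProcessingPool as Pool

def _chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

def _total(chunk):  # stand-in for list_fn
    return sum(chunk)

if __name__ == '__main__':
    todo = list(range(10))
    handle = Pool(2).amap(_total, list(_chunks(todo, 5)))
    print(handle.get())  # -> [10, 35], one result per chunk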
class C(object):
    def __init__(self, files):
        self.pool = Pool(4)
        self.files = files

    def raw_processor(self, fi, prefix, somedict):
        df = pd.read_table(fi, header=None, names=['artist_id', 'ts'],
                           parse_dates=['ts']).sort_values(by='ts')
        user = fi.split('/')[-1][:-4]
        df.to_pickle('/Users/jaredlorince/git/MusicForaging/testData/'
                     'scrobbles_test/{}_{}.pkl'.format(prefix, user))
        rootLogger.info('preprocessing complete for user {} ({})'.format(user, fi))

    def run_p(self):
        func_partial = partial(self.raw_processor, prefix='blah', somedict=d)
        result = self.pool.amap(func_partial, self.files)
def main():
    output_review = []
    lines = get_data(valid_file)
    pool = Pool(8)
    block_num = 1000
    block_size = len(lines) // block_num
    for i in tqdm(range(block_num + 1)):
        if i == block_num:
            block = lines[i * block_size:]
        else:
            block = lines[i * block_size:(i + 1) * block_size]
        tunnel = pool.amap(sub_process, block)
        output = tunnel.get()
        output_review += output
    fw = open(output_valid, "w")
    fw.writelines(output_review)
    fw.close()
def closure(data, func, **kwargs):
    pool = ProcessingPool(nb_workers)
    manager = Manager()
    queue = manager.Queue()

    ProgressBars = (ProgressBarsNotebookLab if in_notebook_lab
                    else ProgressBarsConsole)

    chunks = chunk(data.size, nb_workers)
    maxs = [chunk.stop - chunk.start for chunk in chunks]
    values = [0] * nb_workers
    finished = [False] * nb_workers

    if display_progress_bar:
        progress_bar = ProgressBars(maxs)

    object_id = plasma_client.put(data)

    workers_args = [(plasma_store_name, object_id, chunk, func,
                     display_progress_bar, queue, index, kwargs)
                    for index, chunk in enumerate(chunks)]

    result_workers = pool.amap(Series.worker_map, workers_args)

    if display_progress_bar:
        while not all(finished):
            for _ in range(finished.count(False)):
                index, value, status = queue.get()
                values[index] = value
                finished[index] = status
            progress_bar.update(values)

    result = pd.concat([
        plasma_client.get(result_worker)
        for result_worker in result_workers.get()
    ], copy=False)
    return result
def generate_prescaled_dataset(self, sizes):
    if not self.prescaled_data:
        return
    print("Generating prescaled dataset...")
    data_path = self.prescaled_data_path
    if data_path is None:
        data_path = 'maua/datasets/%s_prescaled' % self.data_path.split('/')[-1]
    if not os.path.isdir(data_path) or \
            not len(self.dataloader) * len(sizes) == len(ProGANDataLoader(data_path=data_path)):
        # create a copy of the dataset on disk for each size
        from pathos.multiprocessing import ProcessingPool
        pool = ProcessingPool()

        def prescale_dataset(tup):
            image_file, size = tup
            try:
                Image.open(data_path + "/%s/%s" % (size, image_file.split("/")[-1]))
                return 1
            except:
                os.makedirs(data_path + "/%s" % size, exist_ok=True)
                image = Image.open(self.data_path + "/" + image_file)
                transforms = tn.Compose([self.transforms, tn.Resize(size), tn.ToTensor()])
                processed = th.clamp(transforms(image), min=0, max=1)
                save_image(processed, data_path + "/%s/%s" % (size, image_file.split("/")[-1]))
                return 1

        jobs = list(itertools.product(
            filter(lambda im: not im.startswith("."), os.listdir(self.data_path)),
            sizes))
        results = pool.amap(prescale_dataset, jobs)
        time.sleep(1)

        pbar = tqdm.tqdm(total=len(self.dataloader) * len(sizes))
        pbar.set_description("Images processed")
        while not results.ready():
            num_files = sum([len(os.listdir(data_path + "/%s" % size)) for size in sizes])
            pbar.update(num_files - pbar.n)
            time.sleep(1)
        pbar.close()

        pool.close()
        pool.join()
        assert sum(results.get()) == len(self.dataloader) * len(sizes)
    else:
        print("Dataset already generated.")
    self.data_path = data_path
def get_signatures(self, cpu_cores=1, verbose=False):
    if cpu_cores == 1:
        if self.nsims == 1:
            signatures = self.__signature(self._trajectories, self.parameters)
            signatures = signatures_to_dataframe(signatures, self.tspan, self.nsims)
            signatures = signatures.transpose().stack(0)
            return signatures
        else:
            signatures = [0] * self.nsims
            for idx in range(self.nsims):
                signatures[idx] = self.__signature(self._trajectories[idx],
                                                   self.parameters[idx])
            signatures = signatures_to_dataframe(signatures, self.tspan, self.nsims)
            signatures = signatures.transpose().stack(0)
            return signatures
    else:
        if Pool is None:
            raise Exception('Please install the pathos package for this feature')
        if self.nsims == 1:
            self._trajectories = [self._trajectories]
            self._parameters = [self._parameters]
        p = Pool(cpu_cores)
        res = p.amap(self.__signature, self._trajectories, self.parameters)
        if verbose:
            while not res.ready():
                print("We're not done yet, %s tasks to go!" % res._number_left)
                time.sleep(60)
        signatures = res.get()
        signatures = signatures_to_dataframe(signatures, self.tspan, self.nsims)
        signatures = signatures.transpose().stack(0)
        return signatures
class SOS:
    def __init__(self, func, bounds, niter=500, population=10, ftol=0.001,
                 workers=-1, restart=False, vec_dump=10, seed=None,
                 aggressive_parasite=False):
        """
        Initialise a symbiotic organisms search instance.

        Args:
            func (callable): Function to be minimised. f(x, *args) - x is the
                argument to be minimised, args is a tuple of any additional
                fixed parameters that specify the function
            bounds (list(Double)): list of pairs of (min, max) bounds for x
            niter (Int): number of iterations for the optimiser
            population (Int): number of members in the population
            ftol (Double): convergence criterion for the function
            workers (Int): number of multiprocessing workers to use.
                -1 sets workers to mp.cpu_count()
            vec_dump (Int): output a restart file every vec_dump steps
            restart (Bool): restart the run from a restart file
            seed (Int): seed for the random number generator, useful for tests
            aggressive_parasite (Bool): generate parasite vectors by crossover
                rather than by mutating a single component
        """
        self.function = func
        self.niter = niter
        self.population = population
        self.particles = []
        self.best_global_vec = None
        self.best_global_fit = math.inf
        self.ftol = ftol
        self.bounds = np.asarray(bounds)
        self.restart = restart
        self.vector_restart = VectorInOut(bounds, "sos.rst")
        self.vec_dump = vec_dump
        self.seed = seed
        self.aggressive_parasite = aggressive_parasite
        if workers == -1:
            self.pool = Pool(mp.cpu_count())
        else:
            self.pool = Pool(workers)

    def vector_to_pot(self, vector):
        """
        Converts an sos vector to actual x values

        Args:
            vector (numpy array): vector position in parameter space
        """
        return ((self.bounds[:, 1] - self.bounds[:, 0]) * vector) + self.bounds[:, 0]

    def part_init(self, vector):
        """
        Wrapper for particle initialisation for multiprocessing

        Args:
            vector (numpy array)
        Returns:
            vector (numpy array), result of function(vector)
        """
        return vector, self.function(self.vector_to_pot(vector), self.args)

    def initialise_particles(self):
        """
        Initialises the population: sets particle vectors using a latin
        hypercube, and sets the global bests

        Args:
            None
        """
        if self.restart:
            vec, fit = self.vector_restart.read_vectors()
            for i, vec in enumerate(vec):
                self.particles.append(Particle(np.asarray(vec), fit[i], i))
            self.set_global_best()
        else:
            vectors = lhs(len(self.bounds), self.population)
            res = self.pool.amap(self.part_init, vectors)
            for i, val in enumerate(res.get()):
                self.particles.append(Particle(val[0], val[1], i))
            self.best_global_fit = copy.deepcopy(self.particles[0].return_fit)
            self.best_global_vec = copy.deepcopy(self.particles[0].return_vec)

    def set_global_best(self):
        """
        Sets the current global best fit for the function, and the
        corresponding vector

        Args:
            None
        """
        for particle in self.particles:
            if particle.fit < self.best_global_fit:
                self.best_global_fit = copy.deepcopy(particle.return_fit)
                self.best_global_vec = copy.deepcopy(particle.return_vec)
        output("Current best fit:" + str(self.best_global_fit) + "\n")

    def mutualism(self, part):
        """
        Performs the mutualism step of sos

        Args:
            part (Particle): member of the population on which to perform
                mutualism
        Returns:
            part.vector (np.array): vector position in parameter space
            part.fit (Double): value of the function at the point in parameter
                space corresponding to part.vector
        """
        np.random.seed()
        b_ind = np.random.choice(
            [i for i in range(self.population) if i != part.index], 1,
            replace=False)[0]
        a = part.vector
        b = self.particles[b_ind].vector
        bf = np.random.randint(1, 3, 2)
        mutant = np.random.rand(len(self.bounds))
        mutual = (a + b) / 2
        new_a = np.clip(a + (mutant * (self.best_global_vec - (mutual * bf[0]))), 0, 1)
        new_b = np.clip(b + (mutant * (self.best_global_vec - (mutual * bf[1]))), 0, 1)
        for i, vec in enumerate([[part.index, new_a], [b_ind, new_b]]):
            trial_pot = self.vector_to_pot(vec[1])
            error = self.function(trial_pot, self.args)
            if error < self.particles[vec[0]].fit:
                self.particles[vec[0]].fit = error
                self.particles[vec[0]].vector = vec[1]
        return part.vector, part.fit

    def run_mutualism(self):
        """
        Wrapper for the mutualism step, for multiprocessing

        Args:
            None
        """
        res = self.pool.amap(self.mutualism, self.particles)
        for i, val in enumerate(res.get()):
            self.particles[i].vector, self.particles[i].fit = val

    def commensalism(self, part):
        """
        Performs the commensalism step of sos

        Args:
            part (Particle): member of the population on which to perform
                commensalism
        Returns:
            part.vector (np.array): vector position in parameter space
            part.fit (Double): value of the function at the point in parameter
                space corresponding to part.vector
        """
        np.random.seed()
        b_ind = np.random.choice(
            [i for i in range(self.population) if i != part.index], 1,
            replace=False)[0]
        a = part.vector
        b = self.particles[b_ind].vector
        mutant = np.random.uniform(-1, 1, len(self.bounds))
        new_a = np.clip(a + (mutant[0] * (self.best_global_vec - b)), 0, 1)
        trial_pot = self.vector_to_pot(new_a)
        error = self.function(trial_pot, self.args)
        if error < part.fit:
            part.fit = error
            part.vector = new_a
        return part.vector, part.fit

    def run_commensalism(self):
        """
        Wrapper for the commensalism step, for multiprocessing

        Args:
            None
        """
        res = self.pool.amap(self.commensalism, self.particles)
        for i, val in enumerate(res.get()):
            self.particles[i].vector, self.particles[i].fit = val

    def parasitism(self, part):
        """
        Performs the parasitism step of sos

        Args:
            part (Particle): member of the population on which to perform
                parasitism
        Returns:
            b_ind (Int): index of the particle challenged by the parasite
            fit (Double): fit of the challenged particle after the step
            vector (np.array): vector of the challenged particle after the step
        """
        np.random.seed()
        b_ind = np.random.choice(
            [i for i in range(self.population) if i != part.index], 1,
            replace=False)[0]
        if self.aggressive_parasite:
            trial = np.random.uniform(0, 1, len(self.bounds))
            cross_points = np.random.rand(len(self.bounds)) < 0.3
            if not np.any(cross_points):
                cross_points[np.random.randint(0, len(self.bounds))] = True
            parasite = np.where(cross_points, trial, part.vector)
        else:
            parasite = copy.deepcopy(part.vector)
            parasite[np.random.randint(0, len(self.bounds))] = np.random.rand()
        trial_pot = self.vector_to_pot(parasite)
        error = self.function(trial_pot, self.args)
        if error < self.particles[b_ind].fit:
            self.particles[b_ind].fit = error
            self.particles[b_ind].vector = parasite
        return b_ind, self.particles[b_ind].fit, self.particles[b_ind].vector

    def run_parasitism(self):
        """
        Wrapper for the parasitism step, for multiprocessing

        Args:
            None
        """
        res = self.pool.amap(self.parasitism, self.particles)
        for i, val in enumerate(res.get()):
            self.particles[val[0]].vector, self.particles[val[0]].fit = val[2], val[1]

    def optimise(self, args):
        """
        Optimise the function: run

        Args:
            args (Optional): any further args required by the function
        """
        self.args = args
        self.initialise_particles()
        for step in range(self.niter):
            output("Doing step: " + str(step) + "\n")
            self.run_mutualism()
            self.run_commensalism()
            self.run_parasitism()
            self.set_global_best()
            if self.best_global_fit < self.ftol:
                break
            if step % self.vec_dump == 0:
                output("Going to dump particle vectors\n")
                self.vector_restart.write_vectors(self.particles)
        results_min = OptimizeResult()
        results_min.x = self.vector_to_pot(self.best_global_vec)
        results_min.fun = self.best_global_fit
        self.vector_restart.write_vectors(self.particles)
        return results_min
@attr.s
class IngestionManagerPandas:
    """Class to manage the multi-threaded data ingestion process.

    This class will manage the data ingestion process which is multi-threaded.

    Attributes:
        feature_group_name (str): name of the Feature Group.
        sagemaker_fs_runtime_client_config (Config): instance of the Config
            class for boto calls.
        max_workers (int): number of threads to create.
        max_processes (int): number of processes to create. Each process
            spawns ``max_workers`` threads.
        profile_name (str): the profile credential should be used for
            ``PutRecord`` (default: None).
    """

    feature_group_name: str = attr.ib()
    sagemaker_fs_runtime_client_config: Config = attr.ib()
    max_workers: int = attr.ib(default=1)
    max_processes: int = attr.ib(default=1)
    profile_name: str = attr.ib(default=None)
    _async_result: AsyncResult = attr.ib(default=None)
    _processing_pool: ProcessingPool = attr.ib(default=None)
    _failed_indices: List[int] = attr.ib(factory=list)

    @staticmethod
    def _ingest_single_batch(
        data_frame: DataFrame,
        feature_group_name: str,
        client_config: Config,
        start_index: int,
        end_index: int,
        profile_name: str = None,
    ) -> List[int]:
        """Ingest a single batch of DataFrame rows into FeatureStore.

        Args:
            data_frame (DataFrame): source DataFrame to be ingested.
            feature_group_name (str): name of the Feature Group.
            client_config (Config): configuration for the sagemaker feature
                store runtime client to perform boto calls.
            start_index (int): starting position to ingest in this batch.
            end_index (int): ending position to ingest in this batch.
            profile_name (str): the profile credential should be used for
                ``PutRecord`` (default: None).

        Returns:
            List of row indices that failed to be ingested.
        """
        retry_config = client_config.retries
        if "max_attempts" not in retry_config and "total_max_attempts" not in retry_config:
            client_config = copy.deepcopy(client_config)
            client_config.retries = {"max_attempts": 10, "mode": "standard"}
        sagemaker_featurestore_runtime_client = boto3.Session(
            profile_name=profile_name).client(
                service_name="sagemaker-featurestore-runtime",
                config=client_config)

        logger.info("Started ingesting index %d to %d", start_index, end_index)
        failed_rows = list()
        for row in data_frame[start_index:end_index].itertuples():
            record = [
                FeatureValue(
                    feature_name=data_frame.columns[index - 1],
                    value_as_string=str(row[index]),
                )
                for index in range(1, len(row))
                if pd.notna(row[index])
            ]
            try:
                sagemaker_featurestore_runtime_client.put_record(
                    FeatureGroupName=feature_group_name,
                    Record=[value.to_dict() for value in record],
                )
            except Exception as e:  # pylint: disable=broad-except
                logger.error("Failed to ingest row %d: %s", row[0], e)
                failed_rows.append(row[0])
        return failed_rows

    @property
    def failed_rows(self) -> List[int]:
        """Get rows that failed to ingest.

        Returns:
            List of row indices that failed to be ingested.
        """
        return self._failed_indices

    def wait(self, timeout=None):
        """Wait for the ingestion process to finish.

        Args:
            timeout (Union[int, float]): ``concurrent.futures.TimeoutError``
                will be raised if timeout is reached.
        """
        try:
            results = self._async_result.get(timeout=timeout)
        except KeyboardInterrupt as i:
            # terminate workers abruptly on keyboard interrupt.
            self._processing_pool.terminate()
            self._processing_pool.close()
            self._processing_pool.clear()
            raise i
        else:
            # terminate normally
            self._processing_pool.close()
            self._processing_pool.clear()

        self._failed_indices = [
            failed_index
            for failed_indices in results
            for failed_index in failed_indices
        ]
        if len(self._failed_indices) > 0:
            raise IngestionError(
                self._failed_indices,
                f"Failed to ingest some data into FeatureGroup {self.feature_group_name}",
            )

    def _run_multi_process(self, data_frame: DataFrame, wait=True, timeout=None):
        """Start the ingestion process with the specified number of processes.

        Args:
            data_frame (DataFrame): source DataFrame to be ingested.
            wait (bool): whether to wait for the ingestion to finish or not.
            timeout (Union[int, float]): ``concurrent.futures.TimeoutError``
                will be raised if timeout is reached.
        """
        # pylint: disable=I1101
        batch_size = math.ceil(data_frame.shape[0] / self.max_processes)
        # pylint: enable=I1101

        args = []
        for i in range(self.max_processes):
            start_index = min(i * batch_size, data_frame.shape[0])
            end_index = min(i * batch_size + batch_size, data_frame.shape[0])
            args += [(
                self.max_workers,
                self.feature_group_name,
                self.sagemaker_fs_runtime_client_config,
                data_frame[start_index:end_index],
                start_index,
                timeout,
                self.profile_name,
            )]

        def init_worker():
            # ignore keyboard interrupts in child processes.
            signal.signal(signal.SIGINT, signal.SIG_IGN)

        self._processing_pool = ProcessingPool(self.max_processes, init_worker)
        self._processing_pool.restart(force=True)

        f = lambda x: IngestionManagerPandas._run_multi_threaded(*x)  # noqa: E731
        self._async_result = self._processing_pool.amap(f, args)

        if wait:
            self.wait(timeout=timeout)

    @staticmethod
    def _run_multi_threaded(
        max_workers: int,
        feature_group_name: str,
        sagemaker_fs_runtime_client_config: Config,
        data_frame: DataFrame,
        row_offset=0,
        timeout=None,
        profile_name=None,
    ) -> List[int]:
        """Start the ingestion process within one worker process.

        Args:
            data_frame (DataFrame): source DataFrame to be ingested.
            row_offset (int): if ``data_frame`` is a partition of a parent
                DataFrame, then the index of the parent where ``data_frame``
                starts. Otherwise, 0.
            timeout (Union[int, float]): ``concurrent.futures.TimeoutError``
                will be raised if timeout is reached.
            profile_name (str): the profile credential should be used for
                ``PutRecord`` (default: None).

        Returns:
            List of row indices that failed to be ingested.
        """
        executor = ThreadPoolExecutor(max_workers=max_workers)
        # pylint: disable=I1101
        batch_size = math.ceil(data_frame.shape[0] / max_workers)
        # pylint: enable=I1101

        futures = {}
        for i in range(max_workers):
            start_index = min(i * batch_size, data_frame.shape[0])
            end_index = min(i * batch_size + batch_size, data_frame.shape[0])
            futures[executor.submit(
                IngestionManagerPandas._ingest_single_batch,
                feature_group_name=feature_group_name,
                data_frame=data_frame,
                start_index=start_index,
                end_index=end_index,
                client_config=sagemaker_fs_runtime_client_config,
                profile_name=profile_name,
            )] = (start_index + row_offset, end_index + row_offset)

        failed_indices = list()
        for future in as_completed(futures, timeout=timeout):
            start, end = futures[future]
            result = future.result()
            if result:
                logger.error("Failed to ingest row %d to %d", start, end)
            else:
                logger.info("Successfully ingested row %d to %d", start, end)
            failed_indices += result

        executor.shutdown(wait=False)
        return failed_indices

    def run(self, data_frame: DataFrame, wait=True, timeout=None):
        """Start the ingestion process.

        Args:
            data_frame (DataFrame): source DataFrame to be ingested.
            wait (bool): whether to wait for the ingestion to finish or not.
            timeout (Union[int, float]): ``concurrent.futures.TimeoutError``
                will be raised if timeout is reached.
        """
        self._run_multi_process(data_frame=data_frame, wait=wait, timeout=timeout)
def main(uargs):
    """Main function for making the OTU table.

    Parameters
    ----------
    uargs : dict
        See ``OTU_table`` subcommand.
    """
    # args formatting
    try:
        uargs['--abs'] = int(float(uargs['--abs']))
    except TypeError:
        msg = '"{}" must be float-like'
        raise TypeError(msg.format(uargs['--abs']))

    # logging
    status = Utils.Status(uargs['--quiet'])

    # loading files
    sys.stderr.write('Loading files...\n')
    ## BD kde
    BD_KDE_all = Utils.load_kde(uargs['<BD_KDE>'])
    BD_KDE_all_type = Utils.KDE_type(BD_KDE_all)
    ## community file
    comm_tbl = CommTable.from_csv(uargs['<communities>'], sep='\t')
    comm_tbl.abs_abund = uargs['--abs']
    ## fraction file
    frac_tbl = FracTable.from_csv(uargs['<fractions>'], sep='\t')

    # iterating by library
    sys.stderr.write('Simulating OTUs...\n')
    u_taxon_names = comm_tbl.get_unique_taxon_names()
    OTU_counts = []  # list of all library-specific OTU_count dataframes
    for libID in comm_tbl.iter_libraries():
        sys.stderr.write('Processing library: "{}"\n'.format(libID))

        # dict of KDEs for the library (libID)
        BD_KDE = _get_KDEs_for_libID(BD_KDE_all, BD_KDE_all_type, libID)

        # fraction bin list for the library
        frac_bins = frac_tbl.BD_bins(libID)
        assert len(frac_bins) > 0, 'No fractions for library "{}"'.format(libID)
        libFracBins = [x for x in frac_bins]

        # iterating over taxa in parallel
        pfunc = partial(sim_OTU,
                        comm_tbl=comm_tbl,
                        libID=libID,
                        libFracBins=libFracBins,
                        maxsize=int(uargs['--max']))
        pool = ProcessingPool(nodes=int(uargs['--np']))
        if uargs['--debug']:
            ret = list(map(pfunc, [(i, taxon, BD_KDE[taxon])
                                   for i, taxon in enumerate(u_taxon_names)]))
        else:
            ret = pool.amap(pfunc, [(i, taxon, BD_KDE[taxon])
                                    for i, taxon in enumerate(u_taxon_names)])
            while not ret.ready():
                time.sleep(2)
            ret = ret.get()

        # converting to a pandas dataframe
        df = pd.DataFrame([x[1] for x in ret]).fillna(0)
        df['taxon'] = [x[0] for x in ret]
        df = pd.melt(df, id_vars=['taxon'])
        df.columns = ['taxon', 'fraction', 'count']
        df['library'] = libID
        x = df['fraction'].apply(_get_BD_range).apply(pd.Series)
        x.columns = ['BD_min', 'BD_mid', 'BD_max']
        df = pd.concat([df, x], axis=1)
        df = df[['library', 'taxon', 'fraction',
                 'BD_min', 'BD_mid', 'BD_max', 'count']]
        df.sort_values(by=['taxon', 'fraction'], inplace=True)

        # adding to the dataframe of all libraries
        OTU_counts.append(df)

    # combining library-specific dataframes
    df_comb = pd.concat(OTU_counts, ignore_index=False)

    # calculating taxon relative abundances
    df_comb['count'] = df_comb['count'].astype('int')
    cols = ['library', 'fraction']
    df_comb['rel_abund'] = df_comb.groupby(cols).transform(tss)['count']

    # writing out the long form of the table
    df_comb.sort_values(by=['library', 'taxon', 'BD_mid'], inplace=True)
    df_comb.to_csv(sys.stdout, sep='\t', index=False)
if __name__ == '__main__':
    import time

    num = 10000

    # serial baseline (assumes square() appends its result to `results`)
    start = time.time()
    results = []
    res = [square(x, results) for x in range(num)]
    print(results)
    end = time.time()
    print(end - start)

    # parallel version (assumes f() is the worker function)
    start = time.time()
    with Pool(5) as p:
        results = p.amap(f, range(num))
        results = results.get()
        print(results)
    end = time.time()
    print(end - start)
    regex_search.save_regex(args.save_regex[0])
elif args.existing_regex is not None:
    regex_search.load_regex(args.existing_regex[0])

print("Loaded %d regex in %d seconds!"
      % (len(regex_search.regex), time.time() - start_regex_time))
start_regex_time = None

logging.info("Starting searches")
# results = search_regex(regex_worker, regex_res, fastq_file, args.cpus)

if __name__ == '__main__':
    result_1 = []
    output_1 = []
    if args.multithread is True:
        # errors = Queue()
        p = Pool(nodes=args.cpus[0])
        res = p.amap(regex_worker_multithread, fastq_file.sequences)
        count = 0
        while not res.ready():
            count += 2
            print("\rWaiting. Timer: %d" % count, end='')
            time.sleep(2)
        # for e in errors.get():
        #     print(e)
        # p.terminate()
        result_1 = res.get()
        print("\nDone searching")
    elif args.multithread is False:
        num_seq = 0
        for seq in fastq_file.sequences:
            num_seq += 1
    clus.diss_matrix(n_jobs=cpus)
    sil_df = clus.silhouette_score_spectral_range(cluster_range=range(2, 31),
                                                  n_jobs=4, random_state=1234)
    if sil_threshold:
        silh_diff = sil_df['cluster_silhouette'].max() - sil_threshold
        # define n_clus as the minimum number of clusters when silhouette
        # scores are too similar
        best_silhs = sil_df.loc[sil_df['cluster_silhouette'] > silh_diff]
        best_silh, n_clus = best_silhs.loc[best_silhs['num_clusters'].idxmin()]
    else:
        best_silh, n_clus = sil_df.loc[sil_df['cluster_silhouette'].idxmax()]
    n_clus = int(n_clus)
    clus.spectral_clustering(n_clusters=n_clus, n_jobs=4, random_state=1234)
    cluster_information = {signatures_idx: clus.cluster_percentage_color(),
                           'best_silh': best_silh,
                           'labels': clus.labels}
    return cluster_information


drivers = list(all_signatures.keys())  # list() needed on Python 3 to allow remove()
drivers.remove('species_combinations')
drivers_to_analyze = []
for dr in drivers:
    if len(all_signatures['species_combinations'][dr]['products'][1]) > 1:
        drivers_to_analyze.append(dr)

p = Pool(cpus)
res = p.amap(cluster_percentage_color_aggomerative, drivers_to_analyze)
results = res.get()

with open('cluster_info_agglomerative_pydream_consumption.pickle', 'wb') as fp:
    pickle.dump(results, fp)
#!/usr/bin/env python
#
# Author: Mike McKerns (mmckerns @caltech and @uqfoundation)
# Copyright (c) 1997-2014 California Institute of Technology.
# License: 3-clause BSD.  The full license text is available at:
#  - http://trac.mystic.cacr.caltech.edu/project/pathos/browser/pathos/LICENSE

# instantiate and configure the worker pool
from pathos.multiprocessing import ProcessingPool
pool = ProcessingPool(nodes=4)

_result = list(map(pow, [1, 2, 3, 4], [5, 6, 7, 8]))

# do a blocking map on the chosen function
result = pool.map(pow, [1, 2, 3, 4], [5, 6, 7, 8])
assert result == _result

# do a non-blocking map, then extract the result from the iterator
result_iter = pool.imap(pow, [1, 2, 3, 4], [5, 6, 7, 8])
result = list(result_iter)
assert result == _result

# do an asynchronous map, then get the results
result_queue = pool.amap(pow, [1, 2, 3, 4], [5, 6, 7, 8])
result = result_queue.get()
assert result == _result
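# Follow-on sketch to the demo above: the handle returned by amap also
# supports ready() and wait(), so a caller can poll for completion before
# collecting results -- the pattern several snippets in this listing rely on.
import time

result_queue = pool.amap(pow, [1, 2, 3, 4], [5, 6, 7, 8])
while not result_queue.ready():
    time.sleep(0.1)  # poll instead of blocking inside get()
assert result_queue.get() == _result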
def extract_nc(path, coord_path, variable_name, precision=3, num_pool=4):
    """extract a variable (over the region given by coord) from .nc files

    input:
        path: path of the source nc files
        coord_path: path of the coord extracted by fishnet: OID_, lon, lat
        variable_name: name of the variable to read
        precision: the minimum precision of lat/lon, to match the lat/lon of
            the source nc file
        num_pool: the number of processes

    output:
        {variable_name}.txt  [i, j]: i (file number), j (grid point number)
        lat_index.txt / lon_index.txt
        coord.txt
    """
    print(f"variable:{variable_name}")
    coord = pd.read_csv(coord_path, sep=",")  # read coord (extracted by fishnet)
    print(f"grid point number:{len(coord)}")
    coord = coord.round(precision)  # coord precision matching the .nc file lat/lon
    result = [path + "/" + d for d in os.listdir(path) if d[-4:] == ".nc4"]
    print(f"file number:{len(result)}")
    variable = np.zeros((len(result), len(coord) + 1))
    # column 0 saves the timestamp correlated with the read order

    # calculate the index of lat/lon in coord from the source nc file
    f1 = Dataset(result[0], 'r')
    Dataset.set_auto_mask(f1, False)
    lat_index = []
    lon_index = []
    lat = f1.variables["lat"][:]
    lon = f1.variables["lon"][:]
    for j in range(len(coord)):
        lat_index.append(np.where(lat == coord["lat"][j])[0][0])
        lon_index.append(np.where(lon == coord["lon"][j])[0][0])
    f1.close()

    # read the variable based on lat_index/lon_index, using multiprocessing
    def read(i):
        """read the variable from nc file i, used in the pool"""
        vb = []
        f = Dataset(result[i], 'r')
        vb.append(float(re.search(r"\d{6}", result[i])[0]))
        # re: the digit count depends on the nc file name (daily=8, monthly=6)
        Dataset.set_auto_mask(f, False)
        for j in range(len(coord)):
            vb.append(f.variables[variable_name][0, lat_index[j], lon_index[j]])
            # requires: the nc file has only three dimensions
            # f.variables['Rainf_f_tavg'][0, lat_index_lp, lon_index_lp] would
            # be a mistake: we only need the points where lat/lon cross (1057),
            # not meshgrid(lat, lon) (1057*1057)
        print(f"complete read file:{i}")
        return vb

    po = Pool(num_pool)  # pool
    res_po = [po.amap(read, (i,)) for i in range(len(result))]
    # the results of every process
    po.close()
    po.join()
    for i in range(len(result)):
        variable[i, :] = res_po[i].get()[0]  # get the variable from the result

    # sort by time
    variable = variable[variable[:, 0].argsort()]

    # save
    np.savetxt(f'{variable_name}.txt', variable, delimiter=' ')
    np.savetxt('lat_index.txt', lat_index, delimiter=' ')
    np.savetxt('lon_index.txt', lon_index, delimiter=' ')
    coord.to_csv("coord.txt")
        silh_diff = sil_df['cluster_silhouette'].max() - sil_threshold
        # define n_clus as the minimum number of clusters when silhouette
        # scores are too similar
        best_silhs = sil_df.loc[sil_df['cluster_silhouette'] > silh_diff]
        best_silh, n_clus = best_silhs.loc[best_silhs['num_clusters'].idxmin()]
    else:
        best_silh, n_clus = sil_df.loc[sil_df['cluster_silhouette'].idxmax()]
    n_clus = int(n_clus)
    clus.spectral_clustering(n_clusters=n_clus, n_jobs=4, random_state=1234)
    cluster_information = {
        signatures_idx: clus.cluster_percentage_color(),
        'best_silh': best_silh,
        'labels': clus.labels
    }
    return cluster_information


drivers = list(all_signatures.keys())  # list() needed on Python 3 to allow remove()
drivers.remove('species_combinations')
drivers_to_analyze = []
for dr in drivers:
    if len(all_signatures['species_combinations'][dr]['products'][1]) > 1:
        drivers_to_analyze.append(dr)

p = Pool(cpus)
res = p.amap(cluster_percentage_color_spectral, drivers_to_analyze)
results = res.get()

with open('cluster_info_spectral_sampled_kd_consumption.pickle', 'wb') as fp:
    pickle.dump(results, fp)
class DiffEvolution:
    def __init__(self, func, bounds, niter=100, population=100, ftol=0.001,
                 workers=-1, vec_dump=10, restart=False, mut_fac=0.3,
                 cross_prob=0.7):
        """
        Initialise a differential evolution optimisation instance.

        Args:
            func (callable): Function to be minimised. f(x, *args) - x is the
                argument to be minimised, args is a tuple of any additional
                fixed parameters that specify the function
            bounds (list(Double)): list of pairs of (min, max) bounds for x
            niter (Int): number of iterations for the optimiser
            population (Int): number of members in the population
            ftol (Double): convergence criterion for the function
            workers (Int): number of multiprocessing workers to use.
                -1 sets workers to mp.cpu_count()
            vec_dump (Int): output a restart file every vec_dump steps
            restart (Bool): restart the run from a restart file
            mut_fac (Double): mutation factor of differential evolution
            cross_prob (Double): crossover probability for a mutant to
                generate a trial vector
        """
        self.function = func
        self.bounds = bounds
        self.niter = niter
        self.population = population
        self.ftol = ftol
        self.vec_dump = vec_dump
        self.restart = restart
        self.mut_fac = mut_fac
        self.cross_prob = cross_prob
        self.particles = []
        self.best_global_vec = None
        self.best_global_fit = math.inf
        self.dim = len(self.bounds)
        self.vector_restart = VectorInOut(bounds, "sos.rst")
        if workers == -1:
            self.pool = Pool(mp.cpu_count())
        else:
            self.pool = Pool(workers)

    def part_init(self, vector):
        """
        Wrapper for particle initialisation for multiprocessing

        Args:
            vector (numpy array)
        Returns:
            vector (numpy array), result of function(vector)
        """
        return vector, self.function(self.vector_to_pot(vector), self.args)

    def initialise_particles(self):
        """
        Initialises the population: sets particle vectors using a latin
        hypercube, and sets the global bests

        Args:
            None
        """
        if self.restart:
            vec, fit = self.vector_restart.read_vectors()
            for i, vec in enumerate(vec):
                self.particles.append(Particle(np.asarray(vec), fit[i], i))
            self.set_global_best()
        else:
            vectors = lhs(len(self.bounds), self.population)
            res = self.pool.amap(self.part_init, vectors)
            for i, val in enumerate(res.get()):
                self.particles.append(Particle(val[0], val[1], i))
            self.best_global_fit = copy.deepcopy(self.particles[0].return_fit)
            self.best_global_vec = copy.deepcopy(self.particles[0].return_vec)

    def vector_to_pot(self, vector):
        """
        Converts a particle vector to actual x values

        Args:
            vector (numpy array): vector position in parameter space
        """
        return ((self.bounds[:, 1] - self.bounds[:, 0]) * vector) + self.bounds[:, 0]

    def set_global_best(self):
        """
        Sets the current global best fit for the function, and the
        corresponding vector

        Args:
            None
        """
        for particle in self.particles:
            if particle.fit < self.best_global_fit:
                self.best_global_fit = copy.deepcopy(particle.return_fit)
                self.best_global_vec = copy.deepcopy(particle.return_vec)
        output("Current best fit:" + str(self.best_global_fit) + "\n")

    def evolve(self, part):
        np.random.seed()
        ind = np.random.choice(
            [i for i in range(self.population) if i != part.index], 3,
            replace=False)
        a = self.particles[ind[0]]
        b = self.particles[ind[1]]
        c = self.particles[ind[2]]
        mutant = a.vector + self.mut_fac * (b.vector - c.vector)
        mutant[mutant > 1.0] = np.random.uniform()
        mutant[mutant < 0.0] = np.random.uniform()
        cross_points = np.random.rand(self.dim) < self.cross_prob
        if not np.any(cross_points):
            cross_points[np.random.randint(0, self.dim)] = True
        trial = np.where(cross_points, mutant, part.vector)
        fit = self.function(self.vector_to_pot(trial), self.args)
        if fit < part.fit:
            return trial, fit
        else:
            return part.vector, part.fit

    def run_evolution(self):
        res = self.pool.amap(self.evolve, self.particles)
        for i, val in enumerate(res.get()):
            self.particles[i].vector, self.particles[i].fit = val

    def optimise(self, args):
        """
        Optimise the function: run

        Args:
            args (Optional): any further args required by the function
        """
        self.args = args
        self.initialise_particles()
        self.set_global_best()
        for step in range(self.niter):
            output("Doing step: " + str(step) + "\n")
            self.run_evolution()
            self.set_global_best()
            if self.best_global_fit < self.ftol:
                break
            if step % self.vec_dump == 0:
                output("Going to dump particle vectors\n")
                self.vector_restart.write_vectors(self.particles)
import numpy as np
from pysb.simulator import ScipyOdeSimulator
from pathos.multiprocessing import ProcessingPool as Pool
from earm2_flat import model
import pickle

tspan = np.linspace(0, 20000, 100)


def run_simulation(param_values):
    sim = ScipyOdeSimulator(model, tspan=tspan).run(param_values=param_values).species
    return sim


all_parameters = np.load('sampled_kd_ic_parameter.npy')

cpu_cores = 31
p = Pool(cpu_cores)
res = p.amap(run_simulation, all_parameters)
sims = res.get()

with open('sims_sampled_kd_ic_list.pickle', 'wb') as fp:
    pickle.dump(sims, fp)
    fuzzy = False
else:
    regex_search = MicrosatelliteRegex(fuzzy_substitution=0,
                                       min_seq_length=min_seq_len)
    fuzzy = False

regex_pregen = regex_search.create_regex(preloaded=True)

logging.info("Starting searches")

if __name__ == '__main__':
    result_1 = []
    output_1 = []
    if args.multithread is True:
        p = Pool(nodes=args.cpus[0])
        res = p.amap(regex_worker_multithread, fastq_file.sequences)
        count = 0
        while not res.ready():
            count += 2
            print("\rWaiting. Timer: %d" % count, end='')
            time.sleep(2)
        result_1 = res.get()
        print("\nDone searching")
    elif args.multithread is False:
        num_seq = 0
        for seq in fastq_file.sequences:
            num_seq += 1
            rexex = regex_worker(seq.seq)
            if rexex:
                result_1.append(rexex)
        sim1 = ScipyOdeSimulator(model, tspan=tspan,
                                 param_values=pars_label1).run()
        sim1.save('sims_baxkd80_sensitivities_sampled_kd/'
                  'earm_scipyode_sims_good{0}.h5'.format(label))
    if label == 6:
        """
        Cluster 6:
        Dominant reactions: BidM_BaxC, BidM_Bcl2M
        """
        pars_label1 = pars[np.where(clus_sp37_labels == label)]
        pars_label1[:, 63] = pars_ref1[63] * 0.2  # 80% Knock down of bax
        # pars_label1[:, 58] = pars_ref1[58] * 0.2  # 20% Knock down of Bcl2
        # pars_label1[:, 64] = pars_label1[64] * 0.8  # 80% Knock down of bak
        # pars_label1[:, 57] = pars_label1[57] * 0.8  # 80% Knock down of mcl1
        sim1 = ScipyOdeSimulator(model, tspan=tspan,
                                 param_values=pars_label1).run()
        sim1.save('sims_baxkd80_sensitivities_sampled_kd/'
                  'earm_scipyode_sims_good{0}.h5'.format(label))
    return


p = Pool(25)
res = p.amap(sims_kd, unique_labels)
res.get()