def certScanner(self):
    p = Pool(nodes=512)
    cprint("[+] Keywords : " + " ".join(str(x) for x in self.keywordList), 'green')
    # self.allipAddrList = self.shuffleList()
    self.allipAddrList = [x for x in self.shuffleList() if self.region in x]
    for self.tryipClass in self.allipAddrList:
        self.ipExtractResult = self.ipExtract(self.tryipClass.split("@")[0])
        _max = len(self.ipExtractResult)
        cprint("[+] Scanning IP Addr Class : " + self.tryipClass +
               "\t-- Number of scan targets is : " + str(len(self.ipExtractResult)), 'green')
        with tqdm(total=_max) as pbar:
            pbar.set_description("[+] Progressing : %s " % self.tryipClass)
            for i, domain in tqdm(enumerate(p.imap(self.certChecker, self.ipExtractResult))):
                pbar.update()
                if domain is not None:
                    self.resList.append(domain)
            pbar.close()
        p.terminate()  # like p.close()
        p.restart()    # like p.join()
        if self.resList:
            self.printRes()
        else:
            cprint("[!] No keywords found on this IP class \n", 'red')
        time.sleep(1)
        self.ipExtractResult = []
        self.resList = []
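A minimal, self-contained sketch of the terminate()/restart() pattern the scanner above relies on, assuming pathos' ProcessingPool; `check` and the batches are made up for illustration.

# Minimal sketch (not part of the scanner): reuse one pathos pool across
# loop iterations by terminating and restarting it between batches.
from pathos.multiprocessing import ProcessingPool as Pool

def check(item):
    # hypothetical worker: keep even numbers, drop the rest
    return item if item % 2 == 0 else None

if __name__ == "__main__":
    pool = Pool(nodes=4)
    for batch in ([1, 2, 3, 4], [5, 6, 7, 8]):
        results = [r for r in pool.imap(check, batch) if r is not None]
        print(results)
        pool.terminate()  # drop the workers used for this batch
        pool.restart()    # re-create them before the next batch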
class ConsensusMHSampler(MHSampler):

    def __init__(self, log_f, log_g, g_sample, x0, iterations, shards=1):
        super(ConsensusMHSampler, self).__init__(log_f, log_g, g_sample, x0, iterations)
        self.shards = shards
        assert len(self.log_distribution_fn) == self.shards

        self.log_fn_dict = {}  # for pickling purposes
        for i in range(self.shards):
            self.log_fn_dict[i] = self.log_distribution_fn[i]

        self.pool = Pool(nodes=self.shards)

    def sample(self):
        map_results = self.pool.map(self.map_sample, range(self.shards))
        self.pool.close()
        self.pool.join()
        self.pool.terminate()
        self.pool.restart()
        self.saved_states = self.reduce_sample(map_results)

    def map_sample(self, index):
        np.random.seed(1)
        cur_state = self.start_state
        sample_results = [cur_state]
        prob, count = 0, 0
        for i in range(self.iterations):
            if i % 5000 == 0:
                print("iteration {}".format(i))
            candidate_state = self.get_transition_sample(cur_state)
            acceptance = self.calculate_acceptance_ratio(candidate_state, self.log_fn_dict[index])
            prob += acceptance
            count += 1
            new_state = self.transition_step(cur_state, candidate_state, acceptance)
            sample_results.append(new_state)
            cur_state = new_state

        sample_results = np.array(sample_results)
        print("INDEX {}: Avg acceptance prob is {}".format(index, prob / count))
        return (sample_results, 1.0 / (1e-8 + self.get_sample_variance(sample_results)))

    def get_sample_variance(self, data):
        return np.linalg.norm(np.var(np.array(data), axis=0))

    def reduce_sample(self, results):
        '''results is a list of (sample_array, weight) tuples'''
        sample_results = 0
        total_weight = 0
        for sample, weight in results:
            sample_results += weight * sample
            total_weight += weight
        return sample_results / total_weight
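A toy illustration of the reduce step above: each shard's chain is weighted by the inverse of its sample variance and the chains are averaged. The two shard arrays are synthetic stand-ins, not output of the sampler.

# Toy consensus-combination example mirroring reduce_sample()
import numpy as np

shard_a = np.random.normal(0.0, 1.0, size=(1000, 2))  # pretend chain from shard 0
shard_b = np.random.normal(0.5, 2.0, size=(1000, 2))  # pretend chain from shard 1
results = [
    (shard_a, 1.0 / (1e-8 + np.linalg.norm(np.var(shard_a, axis=0)))),
    (shard_b, 1.0 / (1e-8 + np.linalg.norm(np.var(shard_b, axis=0)))),
]

# variance-weighted average of the shard chains
combined = sum(w * s for s, w in results) / sum(w for _, w in results)
print(combined.shape)  # (1000, 2): one consensus chain of the same length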
def parallelize_dataframe(df: pd.DataFrame, func, n_cores=4) -> pd.DataFrame:
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    # have to include this to prevent leakage and allow multiple parallel function calls
    pool.terminate()
    pool.restart()
    return df
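A hedged usage sketch for parallelize_dataframe, assuming the function above is in scope and that Pool is pathos' ProcessingPool; `add_total` is a made-up per-partition transform.

# Hypothetical usage of parallelize_dataframe defined above
import numpy as np
import pandas as pd
from pathos.multiprocessing import ProcessingPool as Pool

def add_total(chunk: pd.DataFrame) -> pd.DataFrame:
    # per-partition work: add a derived column
    chunk = chunk.copy()
    chunk["total"] = chunk["a"] + chunk["b"]
    return chunk

if __name__ == "__main__":
    df = pd.DataFrame({"a": range(8), "b": range(8)})
    out = parallelize_dataframe(df, add_total, n_cores=2)
    print(out)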
def _mp_improve(self, container, scenario_builder):
    """Improves b/2 best solutions from the container and updates the
    score table with the generated solutions.
    """
    container.sort()
    pool = Pool(processes=self._proc_count)
    logging.info("Starting processes")
    start = datetime.now()
    best = []
    builders = []
    for i in range(self._b // 2):
        best.append(container.get(i))
        builders.append(scenario_builder)
    try:
        result = pool.map(self._improve, best, builders)
        pool.close()
        pool.join()
    except MemoryError as e:
        send_email("I crashed again, please help!")
        import pudb
        pudb.set_trace()
        print(e)
    logging.info("Processes finished - %s" % (datetime.now() - start))
    # pathos caches the pool, so restart() is needed here to avoid
    # errors on subsequent calls
    pool.restart()
    start = datetime.now()
    logging.info("mp_improve second loop")
    for entry in result:
        index = container.index(entry['individual'])
        best = entry['improvements'].get(0)
        if best.get_utility() < entry['individual'].get_utility():
            container.replace(best, index)
        for improvement in entry['improvements'].get_all():
            self._update_score_table(improvement)
    logging.info("mp_improve second loop - %s" % (datetime.now() - start))
    logging.info("Improved %d solutions" % container.get_changes())
    container.reset_changes()
    return container
def wrapper(*args, **kwargs):
    obj, data, _args = tuple(), tuple(), tuple()
    if hasattr(args[0].__class__, fn.__name__):
        obj, data, *_args = args
        obj = (obj,)
    else:
        data, *_args = args
    if type(data) != list:
        data = list(data)
    total_size = len(data)
    _batch_size = total_size // workers + 1 if batch_size is None else batch_size
    # assert type(data) == list, "Type of data must be list"
    print(
        f"@Parallel[workers={workers}, data_size={total_size}, batch_size={_batch_size}]: parallel for {fn.__qualname__}."
    )
    if shuffle:
        print(
            f"@Parallel[workers={workers}, data_size={total_size}, batch_size={_batch_size}]: shuffle data for {fn.__qualname__}."
        )
        random.shuffle(data)

    pool = Pool(workers)
    pool.terminate()
    pool.restart()
    proc = []
    for beg, end in zip(
            range(0, total_size, _batch_size),
            range(_batch_size, total_size + _batch_size, _batch_size)):
        batch = data[beg:end]
        p = pool.apipe(fn, *obj, batch, *_args, **kwargs)
        proc.append(p)
    pool.close()
    pool.join()
    result = reduce_seqs([p.get() for p in proc])
    if after_hook is not None:
        result = after_hook(result)
    return result
@attr.s  # attrs decorator is required for the attr.ib() fields below to take effect
class IngestionManagerPandas:
    """Class to manage the multi-threaded data ingestion process.

    This class will manage the data ingestion process which is multi-threaded.

    Attributes:
        feature_group_name (str): name of the Feature Group.
        sagemaker_fs_runtime_client_config (Config): instance of the Config class
            for boto calls.
        data_frame (DataFrame): pandas DataFrame to be ingested to the given feature group.
        max_workers (int): number of threads to create.
        max_processes (int): number of processes to create. Each process spawns
            ``max_workers`` threads.
        profile_name (str): the profile credential should be used for ``PutRecord``
            (default: None).
    """

    feature_group_name: str = attr.ib()
    sagemaker_fs_runtime_client_config: Config = attr.ib()
    max_workers: int = attr.ib(default=1)
    max_processes: int = attr.ib(default=1)
    profile_name: str = attr.ib(default=None)
    _async_result: AsyncResult = attr.ib(default=None)
    _processing_pool: ProcessingPool = attr.ib(default=None)
    _failed_indices: List[int] = attr.ib(factory=list)

    @staticmethod
    def _ingest_single_batch(
        data_frame: DataFrame,
        feature_group_name: str,
        client_config: Config,
        start_index: int,
        end_index: int,
        profile_name: str = None,
    ) -> List[int]:
        """Ingest a single batch of DataFrame rows into FeatureStore.

        Args:
            data_frame (DataFrame): source DataFrame to be ingested.
            feature_group_name (str): name of the Feature Group.
            client_config (Config): Configuration for the sagemaker feature store runtime
                client to perform boto calls.
            start_index (int): starting position to ingest in this batch.
            end_index (int): ending position to ingest in this batch.
            profile_name (str): the profile credential should be used for ``PutRecord``
                (default: None).

        Returns:
            List of row indices that failed to be ingested.
        """
        retry_config = client_config.retries
        if "max_attempts" not in retry_config and "total_max_attempts" not in retry_config:
            client_config = copy.deepcopy(client_config)
            client_config.retries = {"max_attempts": 10, "mode": "standard"}
        sagemaker_featurestore_runtime_client = boto3.Session(
            profile_name=profile_name
        ).client(
            service_name="sagemaker-featurestore-runtime", config=client_config
        )

        logger.info("Started ingesting index %d to %d", start_index, end_index)
        failed_rows = list()
        for row in data_frame[start_index:end_index].itertuples():
            record = [
                FeatureValue(
                    feature_name=data_frame.columns[index - 1],
                    value_as_string=str(row[index]),
                )
                for index in range(1, len(row))
                if pd.notna(row[index])
            ]
            try:
                sagemaker_featurestore_runtime_client.put_record(
                    FeatureGroupName=feature_group_name,
                    Record=[value.to_dict() for value in record],
                )
            except Exception as e:  # pylint: disable=broad-except
                logger.error("Failed to ingest row %d: %s", row[0], e)
                failed_rows.append(row[0])
        return failed_rows

    @property
    def failed_rows(self) -> List[int]:
        """Get rows that failed to ingest.

        Returns:
            List of row indices that failed to be ingested.
        """
        return self._failed_indices

    def wait(self, timeout=None):
        """Wait for the ingestion process to finish.

        Args:
            timeout (Union[int, float]): ``concurrent.futures.TimeoutError`` will be raised
                if timeout is reached.
        """
        try:
            results = self._async_result.get(timeout=timeout)
        except KeyboardInterrupt as i:
            # terminate workers abruptly on keyboard interrupt.
            self._processing_pool.terminate()
            self._processing_pool.close()
            self._processing_pool.clear()
            raise i
        else:
            # terminate normally
            self._processing_pool.close()
            self._processing_pool.clear()

        self._failed_indices = [
            failed_index
            for failed_indices in results
            for failed_index in failed_indices
        ]

        if len(self._failed_indices) > 0:
            raise IngestionError(
                self._failed_indices,
                f"Failed to ingest some data into FeatureGroup {self.feature_group_name}",
            )

    def _run_multi_process(self, data_frame: DataFrame, wait=True, timeout=None):
        """Start the ingestion process with the specified number of processes.

        Args:
            data_frame (DataFrame): source DataFrame to be ingested.
            wait (bool): whether to wait for the ingestion to finish or not.
            timeout (Union[int, float]): ``concurrent.futures.TimeoutError`` will be raised
                if timeout is reached.
        """
        # pylint: disable=I1101
        batch_size = math.ceil(data_frame.shape[0] / self.max_processes)
        # pylint: enable=I1101

        args = []
        for i in range(self.max_processes):
            start_index = min(i * batch_size, data_frame.shape[0])
            end_index = min(i * batch_size + batch_size, data_frame.shape[0])
            args += [
                (
                    self.max_workers,
                    self.feature_group_name,
                    self.sagemaker_fs_runtime_client_config,
                    data_frame[start_index:end_index],
                    start_index,
                    timeout,
                    self.profile_name,
                )
            ]

        def init_worker():
            # ignore keyboard interrupts in child processes.
            signal.signal(signal.SIGINT, signal.SIG_IGN)

        self._processing_pool = ProcessingPool(self.max_processes, init_worker)
        self._processing_pool.restart(force=True)

        f = lambda x: IngestionManagerPandas._run_multi_threaded(*x)  # noqa: E731
        self._async_result = self._processing_pool.amap(f, args)

        if wait:
            self.wait(timeout=timeout)

    @staticmethod
    def _run_multi_threaded(
        max_workers: int,
        feature_group_name: str,
        sagemaker_fs_runtime_client_config: Config,
        data_frame: DataFrame,
        row_offset=0,
        timeout=None,
        profile_name=None,
    ) -> List[int]:
        """Start the ingestion process.

        Args:
            data_frame (DataFrame): source DataFrame to be ingested.
            row_offset (int): if ``data_frame`` is a partition of a parent DataFrame, then
                the index of the parent where ``data_frame`` starts. Otherwise, 0.
            wait (bool): whether to wait for the ingestion to finish or not.
            timeout (Union[int, float]): ``concurrent.futures.TimeoutError`` will be raised
                if timeout is reached.
            profile_name (str): the profile credential should be used for ``PutRecord``
                (default: None).

        Returns:
            List of row indices that failed to be ingested.
        """
        executor = ThreadPoolExecutor(max_workers=max_workers)
        # pylint: disable=I1101
        batch_size = math.ceil(data_frame.shape[0] / max_workers)
        # pylint: enable=I1101

        futures = {}
        for i in range(max_workers):
            start_index = min(i * batch_size, data_frame.shape[0])
            end_index = min(i * batch_size + batch_size, data_frame.shape[0])
            futures[
                executor.submit(
                    IngestionManagerPandas._ingest_single_batch,
                    feature_group_name=feature_group_name,
                    data_frame=data_frame,
                    start_index=start_index,
                    end_index=end_index,
                    client_config=sagemaker_fs_runtime_client_config,
                    profile_name=profile_name,
                )
            ] = (start_index + row_offset, end_index + row_offset)

        failed_indices = list()
        for future in as_completed(futures, timeout=timeout):
            start, end = futures[future]
            result = future.result()
            if result:
                logger.error("Failed to ingest row %d to %d", start, end)
            else:
                logger.info("Successfully ingested row %d to %d", start, end)
            failed_indices += result

        executor.shutdown(wait=False)
        return failed_indices

    def run(self, data_frame: DataFrame, wait=True, timeout=None):
        """Start the ingestion process.
        Args:
            data_frame (DataFrame): source DataFrame to be ingested.
            wait (bool): whether to wait for the ingestion to finish or not.
            timeout (Union[int, float]): ``concurrent.futures.TimeoutError`` will be raised
                if timeout is reached.
        """
        self._run_multi_process(data_frame=data_frame, wait=wait, timeout=timeout)
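A hedged usage sketch for the class above. The feature group name, region, and DataFrame columns are placeholders; it assumes the feature group already exists in SageMaker Feature Store and that AWS credentials are configured.

# Hypothetical usage of IngestionManagerPandas defined above
import pandas as pd
from botocore.config import Config

df = pd.DataFrame({
    "record_id": [1, 2, 3],
    "value": [0.1, 0.2, 0.3],
    "event_time": [1.7e9, 1.7e9, 1.7e9],
})

manager = IngestionManagerPandas(
    feature_group_name="my-feature-group",          # placeholder name
    sagemaker_fs_runtime_client_config=Config(region_name="us-east-1"),
    max_workers=2,
    max_processes=2,
)
manager.run(df, wait=True, timeout=600)
print(manager.failed_rows)  # indices that could not be ingested, if any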
random_strains.append(x0)

# reshape the generated strains
basin_strains = random_strains[:basin_tests * num_strains]

if multiprocess:
    if verbose:
        print('Done')
        print('Initializing multiprocessing ...', end=' ')
    # initialize a multiprocessing pool
    try:
        tp = ProcessingPool(max_workers=max_workers)
    except:
        tp.restart()
    map_function = tp.map
else:
    map_function = map

if verbose:
    print('Done')
    print('Calculating costs ...', end=' ')

# calculate costs for random and random orthonormalized strain sets
try:
    random_costs = map_function(cost_function, random_strains)
except Exception as e:
    print(e)
    tp.restart()
def parmap(f, X, nprocs=multiprocessing.cpu_count(), force_parallel=False, chunk_size=1):
    from ResearchNLP import Constants as cn
    from ResearchNLP.util_files import function_cache
    if len(X) == 0:  # like map
        return []
    # nprocs = min(nprocs, cn.max_procs)
    if nprocs != multiprocessing.cpu_count() and len(X) < nprocs * chunk_size:
        chunk_size = 1  # use chunk_size = 1 if there are enough procs for a batch size of 1
    nprocs = max(1, min(nprocs, len(X) / chunk_size))  # at least 1
    if len(X) < nprocs:
        if cn.verbose and nprocs != multiprocessing.cpu_count():
            print "parmap too much procs"
        nprocs = len(X)  # too many procs
    if nprocs == 1 or (cn.serial_parmap and not force_parallel):
        # we want it serial (maybe for profiling)
        return map(f, X)

    def _spawn_fun(input, func):
        import random, numpy
        from ResearchNLP import Constants as cn2
        from ResearchNLP.util_files import function_cache as function_cache2
        random.seed(1554 + i)
        numpy.random.seed(42 + i)  # set random seeds
        try:
            res = func(input)
            res_dict = dict()
            res_dict["res"] = res
            res_dict["functions_dict"] = function_cache2.caches_dicts
            res_dict["experiment_purpose"] = cn2.experiment_purpose
            res_dict["curr_params_list"] = cn2.curr_experiment_params_list
            return res_dict
        except:
            import traceback
            traceback.print_exc()
            raise  # re-raise exception

    # if chunk_size == 1:
    #     chunk_size = math.ceil(float(len(X)) / nprocs)  # all procs work on an equal chunk

    try:  # try-catch hides bugs
        global proc_count
        old_proc_count = proc_count
        proc_count = nprocs
        p = Pool(nprocs)
        p.restart(force=True)
        retval_par = p.map(_spawn_fun, X, [f] * len(X),
                           chunk_size=chunk_size)  # can throw if current proc is daemon
        p.terminate()
        for res_dict in retval_par:  # add all experiment params we missed
            curr_params_list = res_dict["curr_params_list"]
            for param in curr_params_list:
                cn.add_experiment_param(param)
        cn.experiment_purpose = retval_par[0]["experiment_purpose"]  # use the "experiment_purpose" from the fork
        function_cache.merge_cache_dicts_from_parallel_runs(
            map(lambda a: a["functions_dict"], retval_par))  # merge all
        retval = map(lambda res_dict: res_dict["res"], retval_par)  # make it like the original map
        proc_count = old_proc_count
        global i
        i += 1
    except AssertionError as e:
        if e.message == "daemonic processes are not allowed to have children":
            retval = map(f, X)  # can't have a pool inside a pool
        else:
            print "error message is: " + str(e.message)
            raise  # re-raise orig exception
    return retval