def certScanner(self):
    p = Pool(nodes=512)
    cprint("[+] Keywords : " + " ".join(str(x) for x in self.keywordList), 'green')
    # self.allipAddrList = self.shuffleList()
    self.allipAddrList = [x for x in self.shuffleList() if self.region in x]
    for self.tryipClass in self.allipAddrList:
        self.ipExtractResult = self.ipExtract(self.tryipClass.split("@")[0])
        _max = len(self.ipExtractResult)
        cprint("[+] Scanning IP Addr Class : " + self.tryipClass +
               "\t-- Number of scan targets is : " + str(len(self.ipExtractResult)), 'green')
        with tqdm(total=_max) as pbar:
            pbar.set_description("[+] Progressing : %s " % self.tryipClass)
            for i, domain in tqdm(enumerate(p.imap(self.certChecker, self.ipExtractResult))):
                pbar.update()
                if domain is not None:
                    self.resList.append(domain)
            pbar.close()
        p.terminate()  # like p.close()
        p.restart()    # like p.join()
        if self.resList:
            self.printRes()
        else:
            cprint("[!] No keywords found on this IP class \n", 'red')
        time.sleep(1)
        self.ipExtractResult = []
        self.resList = []
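A minimal, self-contained sketch of the terminate()/restart() pattern the scanner above relies on, assuming pathos' ProcessingPool; `check` and the batches are made up for illustration.

# Minimal sketch (not part of the scanner): reuse one pathos pool across
# loop iterations by terminating and restarting it between batches.
from pathos.multiprocessing import ProcessingPool as Pool

def check(item):
    # hypothetical worker: keep even numbers, drop the rest
    return item if item % 2 == 0 else None

if __name__ == "__main__":
    pool = Pool(nodes=4)
    for batch in ([1, 2, 3, 4], [5, 6, 7, 8]):
        results = [r for r in pool.imap(check, batch) if r is not None]
        print(results)
        pool.terminate()  # drop the workers used for this batch
        pool.restart()    # re-create them before the next batch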
class ConsensusMHSampler(MHSampler):

    def __init__(self, log_f, log_g, g_sample, x0, iterations, shards=1):
        super(ConsensusMHSampler, self).__init__(log_f, log_g, g_sample, x0, iterations)
        self.shards = shards
        assert len(self.log_distribution_fn) == self.shards

        self.log_fn_dict = {}  # for pickling purposes
        for i in range(self.shards):
            self.log_fn_dict[i] = self.log_distribution_fn[i]

        self.pool = Pool(nodes=self.shards)

    def sample(self):
        map_results = self.pool.map(self.map_sample, range(self.shards))
        self.pool.close()
        self.pool.join()
        self.pool.terminate()
        self.pool.restart()
        self.saved_states = self.reduce_sample(map_results)

    def map_sample(self, index):
        np.random.seed(1)
        cur_state = self.start_state
        sample_results = [cur_state]
        prob, count = 0, 0
        for i in range(self.iterations):
            if i % 5000 == 0:
                print("iteration {}".format(i))
            candidate_state = self.get_transition_sample(cur_state)
            acceptance = self.calculate_acceptance_ratio(candidate_state, self.log_fn_dict[index])
            prob += acceptance
            count += 1
            new_state = self.transition_step(cur_state, candidate_state, acceptance)
            sample_results.append(new_state)
            cur_state = new_state

        sample_results = np.array(sample_results)
        print("INDEX {}: Avg acceptance prob is {}".format(index, prob / count))
        return (sample_results, 1.0 / (1e-8 + self.get_sample_variance(sample_results)))

    def get_sample_variance(self, data):
        return np.linalg.norm(np.var(np.array(data), axis=0))

    def reduce_sample(self, results):
        '''results is a list of (sample_array, weight) tuples'''
        sample_results = 0
        total_weight = 0
        for sample, weight in results:
            sample_results += weight * sample
            total_weight += weight
        return sample_results / total_weight
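A toy illustration of the reduce step above: each shard's chain is weighted by the inverse of its sample variance and the chains are averaged. The two shard arrays are synthetic stand-ins, not output of the sampler.

# Toy consensus-combination example mirroring reduce_sample()
import numpy as np

shard_a = np.random.normal(0.0, 1.0, size=(1000, 2))  # pretend chain from shard 0
shard_b = np.random.normal(0.5, 2.0, size=(1000, 2))  # pretend chain from shard 1
results = [
    (shard_a, 1.0 / (1e-8 + np.linalg.norm(np.var(shard_a, axis=0)))),
    (shard_b, 1.0 / (1e-8 + np.linalg.norm(np.var(shard_b, axis=0)))),
]

# variance-weighted average of the shard chains
combined = sum(w * s for s, w in results) / sum(w for _, w in results)
print(combined.shape)  # (1000, 2): one consensus chain of the same length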
def parallelize_dataframe(df: pd.DataFrame, func, n_cores=4) -> pd.DataFrame:
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    # have to include this to prevent leakage and allow multiple parallel function calls
    pool.terminate()
    pool.restart()
    return df
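A hedged usage sketch for parallelize_dataframe, assuming the function above is in scope and that Pool is pathos' ProcessingPool; `add_total` is a made-up per-partition transform.

# Hypothetical usage of parallelize_dataframe defined above
import numpy as np
import pandas as pd
from pathos.multiprocessing import ProcessingPool as Pool

def add_total(chunk: pd.DataFrame) -> pd.DataFrame:
    # per-partition work: add a derived column
    chunk = chunk.copy()
    chunk["total"] = chunk["a"] + chunk["b"]
    return chunk

if __name__ == "__main__":
    df = pd.DataFrame({"a": range(8), "b": range(8)})
    out = parallelize_dataframe(df, add_total, n_cores=2)
    print(out)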
def _mp_improve(self, container, scenario_builder):
    """Improves b/2 best solutions from the container and updates the
    score table with the generated solutions.
    """
    container.sort()
    pool = Pool(processes=self._proc_count)
    logging.info("Starting processes")
    start = datetime.now()
    best = []
    builders = []
    for i in range(self._b // 2):
        best.append(container.get(i))
        builders.append(scenario_builder)
    try:
        result = pool.map(self._improve, best, builders)
        pool.close()
        pool.join()
    except MemoryError as e:
        send_email("I crashed again, please help!")
        import pudb
        pudb.set_trace()
        print(e)
    logging.info("Processes finished - %s" % (datetime.now() - start))
    # pathos caches the pool, so restart() is needed here to avoid
    # errors on subsequent calls
    pool.restart()
    start = datetime.now()
    logging.info("mp_improve second loop")
    for entry in result:
        index = container.index(entry['individual'])
        best = entry['improvements'].get(0)
        if best.get_utility() < entry['individual'].get_utility():
            container.replace(best, index)
        for improvement in entry['improvements'].get_all():
            self._update_score_table(improvement)
    logging.info("mp_improve second loop - %s" % (datetime.now() - start))
    logging.info("Improved %d solutions" % container.get_changes())
    container.reset_changes()
    return container
def wrapper(*args, **kwargs):
    obj, data, _args = tuple(), tuple(), tuple()
    if hasattr(args[0].__class__, fn.__name__):
        obj, data, *_args = args
        obj = (obj,)
    else:
        data, *_args = args
    if type(data) != list:
        data = list(data)
    total_size = len(data)
    _batch_size = total_size // workers + 1 if batch_size is None else batch_size
    # assert type(data) == list, "Type of data must be list"
    print(
        f"@Parallel[workers={workers}, data_size={total_size}, batch_size={_batch_size}]: parallel for {fn.__qualname__}."
    )
    if shuffle:
        print(
            f"@Parallel[workers={workers}, data_size={total_size}, batch_size={_batch_size}]: shuffle data for {fn.__qualname__}."
        )
        random.shuffle(data)

    pool = Pool(workers)
    pool.terminate()
    pool.restart()
    proc = []
    for beg, end in zip(
            range(0, total_size, _batch_size),
            range(_batch_size, total_size + _batch_size, _batch_size)):
        batch = data[beg:end]
        p = pool.apipe(fn, *obj, batch, *_args, **kwargs)
        proc.append(p)
    pool.close()
    pool.join()
    result = reduce_seqs([p.get() for p in proc])
    if after_hook is not None:
        result = after_hook(result)
    return result
@attr.s  # attrs decorator is required for the attr.ib() fields below to take effect
class IngestionManagerPandas:
    """Class to manage the multi-threaded data ingestion process.

    This class will manage the data ingestion process which is multi-threaded.

    Attributes:
        feature_group_name (str): name of the Feature Group.
        sagemaker_fs_runtime_client_config (Config): instance of the Config class
            for boto calls.
        data_frame (DataFrame): pandas DataFrame to be ingested to the given feature group.
        max_workers (int): number of threads to create.
        max_processes (int): number of processes to create. Each process spawns
            ``max_workers`` threads.
        profile_name (str): the profile credential should be used for ``PutRecord``
            (default: None).
    """

    feature_group_name: str = attr.ib()
    sagemaker_fs_runtime_client_config: Config = attr.ib()
    max_workers: int = attr.ib(default=1)
    max_processes: int = attr.ib(default=1)
    profile_name: str = attr.ib(default=None)
    _async_result: AsyncResult = attr.ib(default=None)
    _processing_pool: ProcessingPool = attr.ib(default=None)
    _failed_indices: List[int] = attr.ib(factory=list)

    @staticmethod
    def _ingest_single_batch(
        data_frame: DataFrame,
        feature_group_name: str,
        client_config: Config,
        start_index: int,
        end_index: int,
        profile_name: str = None,
    ) -> List[int]:
        """Ingest a single batch of DataFrame rows into FeatureStore.

        Args:
            data_frame (DataFrame): source DataFrame to be ingested.
            feature_group_name (str): name of the Feature Group.
            client_config (Config): Configuration for the sagemaker feature store runtime
                client to perform boto calls.
            start_index (int): starting position to ingest in this batch.
            end_index (int): ending position to ingest in this batch.
            profile_name (str): the profile credential should be used for ``PutRecord``
                (default: None).

        Returns:
            List of row indices that failed to be ingested.
        """
        retry_config = client_config.retries
        if "max_attempts" not in retry_config and "total_max_attempts" not in retry_config:
            client_config = copy.deepcopy(client_config)
            client_config.retries = {"max_attempts": 10, "mode": "standard"}
        sagemaker_featurestore_runtime_client = boto3.Session(
            profile_name=profile_name
        ).client(
            service_name="sagemaker-featurestore-runtime", config=client_config
        )

        logger.info("Started ingesting index %d to %d", start_index, end_index)
        failed_rows = list()
        for row in data_frame[start_index:end_index].itertuples():
            record = [
                FeatureValue(
                    feature_name=data_frame.columns[index - 1],
                    value_as_string=str(row[index]),
                )
                for index in range(1, len(row))
                if pd.notna(row[index])
            ]
            try:
                sagemaker_featurestore_runtime_client.put_record(
                    FeatureGroupName=feature_group_name,
                    Record=[value.to_dict() for value in record],
                )
            except Exception as e:  # pylint: disable=broad-except
                logger.error("Failed to ingest row %d: %s", row[0], e)
                failed_rows.append(row[0])
        return failed_rows

    @property
    def failed_rows(self) -> List[int]:
        """Get rows that failed to ingest.

        Returns:
            List of row indices that failed to be ingested.
        """
        return self._failed_indices

    def wait(self, timeout=None):
        """Wait for the ingestion process to finish.

        Args:
            timeout (Union[int, float]): ``concurrent.futures.TimeoutError`` will be raised
                if timeout is reached.
        """
        try:
            results = self._async_result.get(timeout=timeout)
        except KeyboardInterrupt as i:
            # terminate workers abruptly on keyboard interrupt.
            self._processing_pool.terminate()
            self._processing_pool.close()
            self._processing_pool.clear()
            raise i
        else:
            # terminate normally
            self._processing_pool.close()
            self._processing_pool.clear()

        self._failed_indices = [
            failed_index
            for failed_indices in results
            for failed_index in failed_indices
        ]

        if len(self._failed_indices) > 0:
            raise IngestionError(
                self._failed_indices,
                f"Failed to ingest some data into FeatureGroup {self.feature_group_name}",
            )

    def _run_multi_process(self, data_frame: DataFrame, wait=True, timeout=None):
        """Start the ingestion process with the specified number of processes.

        Args:
            data_frame (DataFrame): source DataFrame to be ingested.
            wait (bool): whether to wait for the ingestion to finish or not.
            timeout (Union[int, float]): ``concurrent.futures.TimeoutError`` will be raised
                if timeout is reached.
        """
        # pylint: disable=I1101
        batch_size = math.ceil(data_frame.shape[0] / self.max_processes)
        # pylint: enable=I1101

        args = []
        for i in range(self.max_processes):
            start_index = min(i * batch_size, data_frame.shape[0])
            end_index = min(i * batch_size + batch_size, data_frame.shape[0])
            args += [
                (
                    self.max_workers,
                    self.feature_group_name,
                    self.sagemaker_fs_runtime_client_config,
                    data_frame[start_index:end_index],
                    start_index,
                    timeout,
                    self.profile_name,
                )
            ]

        def init_worker():
            # ignore keyboard interrupts in child processes.
            signal.signal(signal.SIGINT, signal.SIG_IGN)

        self._processing_pool = ProcessingPool(self.max_processes, init_worker)
        self._processing_pool.restart(force=True)

        f = lambda x: IngestionManagerPandas._run_multi_threaded(*x)  # noqa: E731
        self._async_result = self._processing_pool.amap(f, args)

        if wait:
            self.wait(timeout=timeout)

    @staticmethod
    def _run_multi_threaded(
        max_workers: int,
        feature_group_name: str,
        sagemaker_fs_runtime_client_config: Config,
        data_frame: DataFrame,
        row_offset=0,
        timeout=None,
        profile_name=None,
    ) -> List[int]:
        """Start the ingestion process.

        Args:
            data_frame (DataFrame): source DataFrame to be ingested.
            row_offset (int): if ``data_frame`` is a partition of a parent DataFrame, then
                the index of the parent where ``data_frame`` starts. Otherwise, 0.
            wait (bool): whether to wait for the ingestion to finish or not.
            timeout (Union[int, float]): ``concurrent.futures.TimeoutError`` will be raised
                if timeout is reached.
            profile_name (str): the profile credential should be used for ``PutRecord``
                (default: None).

        Returns:
            List of row indices that failed to be ingested.
        """
        executor = ThreadPoolExecutor(max_workers=max_workers)
        # pylint: disable=I1101
        batch_size = math.ceil(data_frame.shape[0] / max_workers)
        # pylint: enable=I1101

        futures = {}
        for i in range(max_workers):
            start_index = min(i * batch_size, data_frame.shape[0])
            end_index = min(i * batch_size + batch_size, data_frame.shape[0])
            futures[
                executor.submit(
                    IngestionManagerPandas._ingest_single_batch,
                    feature_group_name=feature_group_name,
                    data_frame=data_frame,
                    start_index=start_index,
                    end_index=end_index,
                    client_config=sagemaker_fs_runtime_client_config,
                    profile_name=profile_name,
                )
            ] = (start_index + row_offset, end_index + row_offset)

        failed_indices = list()
        for future in as_completed(futures, timeout=timeout):
            start, end = futures[future]
            result = future.result()
            if result:
                logger.error("Failed to ingest row %d to %d", start, end)
            else:
                logger.info("Successfully ingested row %d to %d", start, end)
            failed_indices += result

        executor.shutdown(wait=False)
        return failed_indices

    def run(self, data_frame: DataFrame, wait=True, timeout=None):
        """Start the ingestion process.
        Args:
            data_frame (DataFrame): source DataFrame to be ingested.
            wait (bool): whether to wait for the ingestion to finish or not.
            timeout (Union[int, float]): ``concurrent.futures.TimeoutError`` will be raised
                if timeout is reached.
        """
        self._run_multi_process(data_frame=data_frame, wait=wait, timeout=timeout)
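A hedged usage sketch for the class above. The feature group name, region, and DataFrame columns are placeholders; it assumes the feature group already exists in SageMaker Feature Store and that AWS credentials are configured.

# Hypothetical usage of IngestionManagerPandas defined above
import pandas as pd
from botocore.config import Config

df = pd.DataFrame({
    "record_id": [1, 2, 3],
    "value": [0.1, 0.2, 0.3],
    "event_time": [1.7e9, 1.7e9, 1.7e9],
})

manager = IngestionManagerPandas(
    feature_group_name="my-feature-group",          # placeholder name
    sagemaker_fs_runtime_client_config=Config(region_name="us-east-1"),
    max_workers=2,
    max_processes=2,
)
manager.run(df, wait=True, timeout=600)
print(manager.failed_rows)  # indices that could not be ingested, if any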
random_strains.append(x0)

# reshape the generated strains
basin_strains = random_strains[:basin_tests * num_strains]

if multiprocess:
    if verbose:
        print('Done')
        print('Initializing multiprocessing ...', end=' ')
    # initialize a multiprocessing pool
    try:
        tp = ProcessingPool(max_workers=max_workers)
    except:
        tp.restart()
    map_function = tp.map
else:
    map_function = map

if verbose:
    print('Done')
    print('Calculating costs ...', end=' ')

# calculate costs for random and random orthonormalized strain sets
try:
    random_costs = map_function(cost_function, random_strains)
except Exception as e:
    print(e)
    tp.restart()
def parmap(f, X, nprocs=multiprocessing.cpu_count(), force_parallel=False, chunk_size=1):
    from ResearchNLP import Constants as cn
    from ResearchNLP.util_files import function_cache
    if len(X) == 0:  # like map
        return []
    # nprocs = min(nprocs, cn.max_procs)
    if nprocs != multiprocessing.cpu_count() and len(X) < nprocs * chunk_size:
        chunk_size = 1  # use chunk_size = 1 if there are enough procs for a batch size of 1
    nprocs = max(1, min(nprocs, len(X) / chunk_size))  # at least 1
    if len(X) < nprocs:
        if cn.verbose and nprocs != multiprocessing.cpu_count():
            print "parmap too much procs"
        nprocs = len(X)  # too many procs
    if nprocs == 1 or (cn.serial_parmap and not force_parallel):
        # we want it serial (maybe for profiling)
        return map(f, X)

    def _spawn_fun(input, func):
        import random, numpy
        from ResearchNLP import Constants as cn2
        from ResearchNLP.util_files import function_cache as function_cache2
        random.seed(1554 + i)
        numpy.random.seed(42 + i)  # set random seeds
        try:
            res = func(input)
            res_dict = dict()
            res_dict["res"] = res
            res_dict["functions_dict"] = function_cache2.caches_dicts
            res_dict["experiment_purpose"] = cn2.experiment_purpose
            res_dict["curr_params_list"] = cn2.curr_experiment_params_list
            return res_dict
        except:
            import traceback
            traceback.print_exc()
            raise  # re-raise exception

    # if chunk_size == 1:
    #     chunk_size = math.ceil(float(len(X)) / nprocs)  # all procs work on an equal chunk

    try:  # try-catch hides bugs
        global proc_count
        old_proc_count = proc_count
        proc_count = nprocs
        p = Pool(nprocs)
        p.restart(force=True)
        retval_par = p.map(_spawn_fun, X, [f] * len(X),
                           chunk_size=chunk_size)  # can throw if current proc is daemon
        p.terminate()
        for res_dict in retval_par:  # add all experiment params we missed
            curr_params_list = res_dict["curr_params_list"]
            for param in curr_params_list:
                cn.add_experiment_param(param)
        cn.experiment_purpose = retval_par[0]["experiment_purpose"]  # use the "experiment_purpose" from the fork
        function_cache.merge_cache_dicts_from_parallel_runs(
            map(lambda a: a["functions_dict"], retval_par))  # merge all
        retval = map(lambda res_dict: res_dict["res"], retval_par)  # make it like the original map
        proc_count = old_proc_count
        global i
        i += 1
    except AssertionError as e:
        if e.message == "daemonic processes are not allowed to have children":
            retval = map(f, X)  # can't have a pool inside a pool
        else:
            print "error message is: " + str(e.message)
            raise  # re-raise orig exception
    return retval