def parcompute_example():
    dc = PMPExample()
    dc2 = PMPExample()
    dc3 = PMPExample()
    dc4 = PMPExample()
    n_datapoints = 100
    inp_data = range(n_datapoints)
    r1 = dc.threadcompute(inp_data)
    assert len(dc.cache) == n_datapoints
    r2 = dc2.processcompute(inp_data)
    assert len(dc2.cache) == 0
    assert r1 == r2
    r3 = ProcessPool(4).map(dc3.compute, inp_data)
    r4 = ThreadPool(4).map(dc4.compute, inp_data)
    assert r4 == r3 == r2
    assert len(dc3.cache) == 0
    assert len(dc4.cache) == n_datapoints
    log.info("Size of threadpooled class caches: {0}, {1}".format(
        len(dc.cache), len(dc4.cache)))
    log.info("Size of processpooled class caches: {0}, {1}".format(
        len(dc2.cache), len(dc3.cache)))
def __init__(self, func, schema, ds, scheduler: str = "single", workers: int = 1, **kwargs):
    """| Transform applies a user defined function to each sample of the input dataset.

    Parameters
    ----------
    func: function
        user defined function func(x, **kwargs)
    schema: dict of dtypes
        the structure of the final dataset that will be created
    ds: Iterable
        input dataset or a list that can be iterated
    scheduler: str
        choice between "single", "threaded", "processed" and "ray"
    workers: int
        how many threads or processes to use
    **kwargs:
        additional arguments that will be passed to func as static arguments for all samples
    """
    self.schema = schema
    self._ds = ds
    self.workers = workers

    if isinstance(self._ds, Transform):
        # chaining on another Transform: reuse its base dataset and extend the pipeline
        self.base_ds = self._ds.base_ds
        self._func = self._ds._func[:]
        self._func.append(func)
        self.kwargs = self._ds.kwargs[:]
        self.kwargs.append(kwargs)
    else:
        self.base_ds = ds
        self._func = [func]
        self.kwargs = [kwargs]

    if scheduler == "threaded" or (scheduler == "single" and workers > 1):
        self.map = ThreadPool(nodes=workers).map
    elif scheduler == "processed":
        self.map = ProcessPool(nodes=workers).map
    elif scheduler == "single":
        self.map = map
    elif scheduler == "ray":
        try:
            from ray.util.multiprocessing import Pool as RayPool
        except ImportError as e:
            raise Exception("Scheduler 'ray' requires the ray package to be installed") from e
        self.map = RayPool().map
    else:
        raise Exception(
            f"Scheduler {scheduler} not understood, please use 'single', 'threaded', 'processed' or 'ray'"
        )
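# Hedged usage sketch (not from the source): how the Transform constructor above might
# be driven. `double_sample`, `schema`, and `samples` are hypothetical placeholders;
# only the constructor signature and the scheduler names come from the code above.
def double_sample(sample, factor=2):
    return {"value": sample["value"] * factor}

samples = [{"value": i} for i in range(10)]   # any iterable can serve as `ds`
schema = {"value": "int64"}                   # structure of the resulting dataset

tf = Transform(double_sample, schema, samples, scheduler="threaded", workers=4, factor=3)
# Chaining another Transform on top reuses the same base dataset and extends the
# pipeline (tf2._func and tf2.kwargs grow) instead of re-wrapping `tf` itself.
tf2 = Transform(double_sample, schema, tf, scheduler="single")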
def download_top_melee_gifs(pages=1):
    print('Looking for gifs on the top {} reddit pages'.format(pages))
    saveDir = create_timestamped_dir()
    print('Will save to {}'.format(saveDir))
    urls = get_melee_gif_urls(pages)
    print('Found {} gif urls'.format(len(urls)))
    pool = ThreadPool(50)
    results = pool.map(lambda url: download_gif_and_convert_to_images(url, saveDir), urls)
    # filter() returns an iterator on Python 3, so materialise it before taking len()
    print('Done downloading and converting {} gifs'.format(len(list(filter(None, results)))))
def fit(self, words):
    self.coherence_scores = {}
    self.pairwise_probability = {}
    self.word_probability = {}
    self.pairwise_hits = {}
    self.word_hits = {}
    self.pairwise = {}  # pairwise metadata keyed by "<rare>_<common>" (was never initialised)

    pool = ThreadPool(N_CPUS)
    pool.map(self.compute_word_hits, words)

    sorted_desc = sorted(self.word_hits.items(), key=operator.itemgetter(1), reverse=True)
    sorted_asc = sorted(self.word_hits.items(), key=operator.itemgetter(1))

    for most_common in sorted_desc:
        most_common_ngram = most_common[0]
        most_common_hits = most_common[1]
        for most_rare in sorted_asc:
            most_rare_ngram = most_rare[0]
            most_rare_hits = most_rare[1]
            if most_common_ngram != most_rare_ngram and most_rare_hits < most_common_hits:
                pairwise_key = most_rare_ngram + "_" + most_common_ngram
                if pairwise_key not in self.pairwise_probability:
                    self.pairwise_probability[pairwise_key] = 0
                self.pairwise[pairwise_key] = {
                    "most_common_ngram": most_common_ngram,
                    "most_common_hits": most_common_hits,
                    "most_rare_ngram": most_rare_ngram,
                    "most_rare_hits": most_rare_hits,
                }

    pool.map(self.compute_pairwise_hits, self.pairwise.keys())
    return sum(self.coherence_scores.values())
def main():
    npool = 4
    ppool = ProcessPool(npool)
    tpool = ThreadPool(npool)
    parapool = ParallelPool(npool)
    spool = SerialPool()
    pool = Pool(npool)
    nloops = 8
    print('For Loop')
    forloop(nloops)
    print('ThreadPool')
    test(nloops, tpool)
    print('ParallelPool')
    test(nloops, parapool)
    print('SerialPool')
    test(nloops, spool)
    print('Pool')
    test(nloops, pool)
    print('ProcessPool')
    test(nloops, ppool)
def fit(self, words):
    self.coherence_scores = {}
    self.pairwise_probability = {}
    self.word_probability = {}
    self.pairwise_hits = {}
    self.word_hits = {}
    self.pairwise = []  # list of pairwise keys (was never initialised)

    for word_i in words:
        for word_j in words:
            if word_i != word_j:
                pairwise_key = "_".join(sorted([word_i, word_j]))
                if pairwise_key not in self.pairwise_probability:
                    self.pairwise_probability[pairwise_key] = 0
                    self.pairwise.append(pairwise_key)

    pool = ThreadPool(N_CPUS)
    pool.map(self.compute_word_hits, words)
    pool.map(self.compute_pairwise_hits, self.pairwise)
    return sum(self.coherence_scores.values())
print("Running serial python ...") y = list(map(sin2, x)) print("Output: %s\n" % np.asarray(y)) if HAS_PYINA: # map sin2 to the workers, then print to screen print("Running mpi4py on %d cores..." % nodes) y = MpiPool(nodes).map(sin2, x) print("Output: %s\n" % np.asarray(y)) # map sin2 to the workers, then print to screen print("Running multiprocesing on %d processors..." % nodes) y = ProcessPool(nodes).map(sin2, x) print("Output: %s\n" % np.asarray(y)) # map sin2 to the workers, then print to screen print("Running multiprocesing on %d threads..." % nodes) y = ThreadPool(nodes).map(sin2, x) print("Output: %s\n" % np.asarray(y)) # map sin2 to the workers, then print to screen print("Running parallelpython on %d cpus..." % nodes) y = ParallelPool(nodes).map(sin2, x) print("Output: %s\n" % np.asarray(y)) # EOF
xp = np.arange(N * nodes, dtype=np.float64)[::-1]

print("Input: %s\n" % x)

# map sin_diff to the workers, then print to screen
print("Running serial python ...")
y = list(map(sin_diff, x, xp))
print("Output: %s\n" % np.asarray(y))

if HAS_PYINA:
    # map sin_diff to the workers, then print to screen
    print("Running mpi4py on %d cores..." % nodes)
    y = MpiPool(nodes).map(sin_diff, x, xp)
    print("Output: %s\n" % np.asarray(y))

# map sin_diff to the workers, then print to screen
print("Running multiprocessing on %d processors..." % nodes)
y = ProcessPool(nodes).map(sin_diff, x, xp)
print("Output: %s\n" % np.asarray(y))

# map sin_diff to the workers, then print to screen
print("Running multiprocessing on %d threads..." % nodes)
y = ThreadPool(nodes).map(sin_diff, x, xp)
print("Output: %s\n" % np.asarray(y))

# map sin_diff to the workers, then print to screen
print("Running parallelpython on %d cpus..." % nodes)
y = ParallelPool(nodes).map(sin_diff, x, xp)
print("Output: %s\n" % np.asarray(y))

# EOF
def threadcompute(self, xs):
    pool = ThreadPool(4)
    results = pool.map(self.compute, xs)
    return results
# NOTE: `g` is not defined in this snippet; it is assumed to be a one-argument
# function defined elsewhere in the enclosing module.
def h(x):
    return sum(tmap(g, x))

def f(x, y):
    return x * y

x = range(10)
y = range(5)

if __name__ == '__main__':
    from pathos.helpers import freeze_support
    freeze_support()

    from pathos.pools import ProcessPool, ThreadPool
    amap = ProcessPool().amap
    tmap = ThreadPool().map

    print(amap(f, [h(x), h(x), h(x), h(x), h(x)], y).get())

    def _f(m, g, x, y):
        return sum(m(g, x)) * y

    print(amap(_f, [tmap]*len(y), [g]*len(y), [x]*len(y), y).get())

    from math import sin, cos
    print(amap(tmap, [sin, cos], [x, x]).get())
def __init__(self):
    self.num_partitions = self.num_cores
    self.pool = ThreadPool(self.num_cores)
def orm_extract(args):
    """
    Function for the ORMExtractParser
    :param args: Namespace
    :return: nothing
    """
    # Load database
    Base = databaseManage.WebsiteBase(args.database[0])
    Base.create_tables()

    if type(args.thread) is list:
        args.thread = args.thread[0]

    # Load data
    URLs = list(importData.csv_to_list(args.path[0])[1].keys())

    # ---------------------
    # Filter the results already in the database
    # ---------------------
    alreadyIn = []
    for url in Base.session.query(Base.__getattribute__(args.table[0])).all():
        alreadyIn.append(url.url)

    for url in URLs:
        if "http://" in url[:7]:
            URLs[URLs.index(url)] = url[7:]
        elif "https://" in url[:8]:
            URLs[URLs.index(url)] = url[8:]

    URLs = set(URLs)
    for url in alreadyIn:
        try:
            URLs.remove(url)
        except KeyError:
            pass

    logger.info("{} websites will be added to the database".format(len(URLs)))

    itera = iter(URLs)
    URLs = zip(*[itera] * args.thread)

    # ---------------------
    # Add to the database
    # ---------------------
    dBase = databaseManage.NormalizationBase("DB/norm.db")
    normDict = {}
    for norm in dBase.session.query(dBase.Normalization).all():
        normDict[norm.feature] = {"data": norm.data,
                                  "normalizer": norm.normalizer,
                                  "scaler": norm.scaler}

    i = 1
    for url in URLs:
        logger.debug(str(i))
        logger.info("Add : {}".format(url))
        i += args.thread

        # Create URL objects
        result1 = ThreadPool().map(Website.website, url)
        result2 = []
        tmp = []
        for web in result1:
            if web.html is None:
                result2.append(web)
            else:
                tmp.append(web)

        if args.extraction:
            # Extract features
            fct = partial(Website.website.features_extraction, normDict=normDict)
            ThreadPool().map(fct, tmp)
            result2 += tmp
            for web in result2:
                print(web)
                # Add to the database
                Base.adding(web, args.table[0])
        else:
            for web in result1:
                # Add to the database
                Base.adding(web, args.table[0])

        if i % ((50 // args.thread) * args.thread) == 1 and i != 1:
            # Get a new identity with tor
            with Controller.from_port(port=9051) as controller:
                controller.authenticate()
                controller.signal(Signal.NEWNYM)
def df_apply(df, f, pool=None, n_cpus=None, return_df=True):
    """Apply the function `f` to each row in `df` in a parallel fashion.
    """
    if pool is None:
        if n_cpus is None:
            n_cpus = cpu_count()
        pool = ThreadPool(n_cpus)

    class RecordProxy:
        """A proxy object to wrap a `DataFrame.iat[row_i, col_i]` access model
        and provide a dictionary style interface.
        """
        __df = df
        __field_names = list(df.columns)

        @classmethod
        def _field_i(cls, name):
            try:
                return cls.__field_names.index(name)
            except ValueError:
                raise KeyError(
                    f"key '{name}' not found on record. Available keys are: {cls.__field_names}"
                )

        @classmethod
        def wrap_map_func(cls, f):
            """Wraps the given function to be passed to a map() style function.

            Returns a function that expects to be called with an index value and
            it will call the given function passing it an object with a python
            dictionary style interface to the row.
            """
            return lambda row_i: f(cls(row_i))

        @property
        def index(self):
            return self.__row_i

        def __init__(self, row_i):
            self.__row_i = row_i

        def __getitem__(self, key):
            i = self._field_i(key)
            return self.__df.iat[self.__row_i, i]

        def __setitem__(self, key, value):
            i = self._field_i(key)
            self.__df.iat[self.__row_i, i] = value

        def get(self, key, value=None):
            try:
                i = self._field_i(key)
                return self.__df.iat[self.__row_i, i]
            except KeyError:
                return value

        def __str__(self):
            parts = ["Record({"]
            fields_repr = []
            for field_name in self.__field_names:
                field_repr = self.__getitem__(field_name).__repr__()
                fields_repr.append(f"'{field_name}': {field_repr}")
            # join the field reprs as one string (extend() would add it char by char)
            parts.append(",".join(fields_repr))
            parts.append("})")
            return "".join(parts)

        def dict(self, keys=None):
            if keys is None:
                keys = self.__field_names
            return {
                key: self.__df.iat[self.__row_i, i]
                for i, key in enumerate(self.__field_names)
                if key in keys
            }

        def __iter__(self):
            return (self.__df.iat[self.__row_i, i]
                    for i in range(len(self.__field_names)))

    results = pool.map(RecordProxy.wrap_map_func(f), range(df.shape[0]))
    if return_df:
        return df
    else:
        return results
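# Hedged usage example (assumed, not from the source): driving df_apply with a small
# DataFrame. The columns and the `scale_price` helper are made up for illustration;
# only the df_apply signature and the RecordProxy row interface come from the code above.
import pandas as pd

def scale_price(rec):
    # `rec` is a RecordProxy giving dict-style access to one row of the DataFrame
    rec["price"] = rec["price"] * 1.1   # writes back through DataFrame.iat
    return rec["price"]

df = pd.DataFrame({"item": ["a", "b", "c"], "price": [1.0, 2.0, 3.0]})
new_prices = df_apply(df, scale_price, n_cpus=2, return_df=False)  # list of updated prices
df_updated = df_apply(df, scale_price)  # default: mutate in place and return the DataFrame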