Code example #1
File: mp_class_example.py Project: yodeng/pathos
def parcompute_example():
    dc = PMPExample()
    dc2 = PMPExample()
    dc3 = PMPExample()
    dc4 = PMPExample()

    n_datapoints = 100
    inp_data = range(n_datapoints)
    r1 = dc.threadcompute(inp_data)
    assert (len(dc.cache) == n_datapoints)

    r2 = dc2.processcompute(inp_data)
    assert (len(dc2.cache) == 0)
    assert (r1 == r2)

    r3 = ProcessPool(4).map(dc3.compute, inp_data)
    r4 = ThreadPool(4).map(dc4.compute, inp_data)
    assert (r4 == r3 == r2)
    assert (len(dc3.cache) == 0)
    assert (len(dc4.cache) == n_datapoints)

    log.info("Size of threadpooled class caches: {0}, {1}".format(
        len(dc.cache), len(dc4.cache)))
    log.info("Size of processpooled class caches: {0}, {1}".format(
        len(dc2.cache), len(dc3.cache)))
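For readers who want to run the example above end to end, here is a hypothetical stand-in for the PMPExample class with just the members the function uses (compute, threadcompute, processcompute, and a cache dict); the real class in pathos' mp_class_example.py may differ:

from pathos.pools import ProcessPool, ThreadPool

class PMPExample:
    """Hypothetical stand-in for the class used above; the real PMPExample may differ."""
    def __init__(self):
        self.cache = {}

    def compute(self, x):
        # the cache lives in whichever process or thread runs this call
        self.cache[x] = x ** 2
        return self.cache[x]

    def threadcompute(self, xs):
        # threads share this instance, so self.cache ends up fully populated
        return ThreadPool(4).map(self.compute, xs)

    def processcompute(self, xs):
        # worker processes operate on pickled copies, so this instance's cache stays empty
        return ProcessPool(4).map(self.compute, xs)

This matches the assertions in the example: only the thread-pooled instances accumulate cache entries, because a process pool sends a copy of the object to each worker.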
Code example #2
    def __init__(self,
                 func,
                 schema,
                 ds,
                 scheduler: str = "single",
                 workers: int = 1,
                 **kwargs):
        """| Transform applies a user defined function to each sample in single threaded manner.

        Parameters
        ----------
        func: function
            user-defined function func(x, **kwargs)
        schema: dict of dtypes
            the structure of the final dataset that will be created
        ds: Iterable
            input dataset or a list that can be iterated
        scheduler: str
            choice between "single", "threaded", "processed"
        workers: int
            how many threads or processes to use
        **kwargs:
            additional arguments that will be passed to func as static arguments for all samples
        """
        self._func = func
        self.schema = schema
        self._ds = ds
        self.kwargs = kwargs
        self.workers = workers

        if isinstance(self._ds, Transform):
            self.base_ds = self._ds.base_ds
            self._func = self._ds._func[:]
            self._func.append(func)
            self.kwargs = self._ds.kwargs[:]
            self.kwargs.append(kwargs)
        else:
            self.base_ds = ds
            self._func = [func]
            self.kwargs = [kwargs]

        if scheduler == "threaded" or (scheduler == "single" and workers > 1):
            self.map = ThreadPool(nodes=workers).map
        elif scheduler == "processed":
            self.map = ProcessPool(nodes=workers).map
        elif scheduler == "single":
            self.map = map
        elif scheduler == "ray":
            try:
                from ray.util.multiprocessing import Pool as RayPool
            except Exception:
                pass
            self.map = RayPool().map
        else:
            raise Exception(
                f"Scheduler {scheduler} not understood, please use 'single', 'threaded', 'processed'"
            )
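As a side note, here is a minimal, self-contained sketch of the scheduler-to-map dispatch that the constructor above performs, using only pathos pools; the choose_map helper and its arguments are illustrative and not part of the original class:

from pathos.pools import ProcessPool, ThreadPool

def choose_map(scheduler="single", workers=1):
    # mirror the branch above: threads, processes, or the builtin map
    if scheduler == "threaded" or (scheduler == "single" and workers > 1):
        return ThreadPool(nodes=workers).map
    if scheduler == "processed":
        return ProcessPool(nodes=workers).map
    if scheduler == "single":
        return map
    raise ValueError(f"Scheduler {scheduler} not understood")

squares = list(choose_map("threaded", workers=2)(lambda x: x * x, range(8)))
print(squares)  # [0, 1, 4, 9, 16, 25, 36, 49]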
Code example #3
def download_top_melee_gifs(pages = 1):
    print('Looking for gifs on the top {} reddit pages'.format(pages))

    saveDir = create_timestamped_dir()
    print('Will save to {}'.format(saveDir))

    urls = get_melee_gif_urls(pages)
    print('Found {} gif urls'.format(len(urls)))

    pool = ThreadPool(50)
    results = pool.map(lambda url: download_gif_and_convert_to_images(url, saveDir), urls)

    print('Done downloading and converting {} gifs'.format(len([r for r in results if r])))
Code example #4
    def fit(self, words):
        self.coherence_scores = {}

        self.pairwise_probability = {}
        self.word_probability = {}

        self.pairwise_hits = {}
        self.word_hits = {}
        self.pairwise = {}

        pool = ThreadPool(N_CPUS)
        pool.map(self.compute_word_hits, words)

        # for word_i in self.words:

        sorted_desc = sorted(self.word_hits.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
        sorted_asc = sorted(self.word_hits.items(), key=operator.itemgetter(1))

        for most_common in sorted_desc:
            most_common_ngram = most_common[0]
            most_common_hits = most_common[1]

            for most_rare in sorted_asc:
                most_rare_ngram = most_rare[0]
                most_rare_hits = most_rare[1]

                if most_common_ngram != most_rare_ngram:
                    if most_rare_hits < most_common_hits:
                        pairwise_key = most_rare_ngram + "_" + most_common_ngram

                        if pairwise_key not in self.pairwise_probability:
                            self.pairwise_probability[pairwise_key] = 0
                            self.pairwise[pairwise_key] = {
                                "most_common_ngram": most_common_ngram,
                                "most_common_hits": most_common_hits,
                                "most_rare_ngram": most_rare_ngram,
                                "most_rare_hits": most_rare_hits
                            }

        pool.map(self.compute_pairwise_hits, self.pairwise.keys())

        return sum(self.coherence_scores.values())
Code example #5
def main():
    npool = 4
    ppool = ProcessPool(npool)
    tpool = ThreadPool(npool)
    parapool = ParallelPool(npool)
    spool = SerialPool()
    pool = Pool(npool)

    nloops = 8
    print('For Loop')
    forloop(nloops)
    print('ThreadPool')
    test(nloops, tpool)
    print('ParallelPool')
    test(nloops, parapool)
    print('SerialPool')
    test(nloops, spool)
    print('Pool')
    test(nloops, pool)
    print('ProcessPool')
    test(nloops, ppool)
Code example #6
    def fit(self, words):

        self.coherence_scores = {}

        self.pairwise_probability = {}
        self.word_probability = {}

        self.pairwise_hits = {}
        self.word_hits = {}
        self.pairwise = []

        for word_i in words:
            for word_j in words:
                if word_i != word_j:
                    pairwise_key = "_".join(sorted([word_i, word_j]))
                    if pairwise_key not in self.pairwise_probability:
                        self.pairwise_probability[pairwise_key] = 0
                        self.pairwise.append(pairwise_key)

        pool = ThreadPool(N_CPUS)
        pool.map(self.compute_word_hits, words)
        pool.map(self.compute_pairwise_hits, self.pairwise)

        return sum(self.coherence_scores.values())
Code example #7
    print("Running serial python ...")
    y = list(map(sin2, x))
    print("Output: %s\n" % np.asarray(y))


    if HAS_PYINA:
        # map sin2 to the workers, then print to screen
        print("Running mpi4py on %d cores..." % nodes)
        y = MpiPool(nodes).map(sin2, x)
        print("Output: %s\n" % np.asarray(y))


    # map sin2 to the workers, then print to screen
    print("Running multiprocesing on %d processors..." % nodes)
    y = ProcessPool(nodes).map(sin2, x)
    print("Output: %s\n" % np.asarray(y))


    # map sin2 to the workers, then print to screen
    print("Running multiprocesing on %d threads..." % nodes)
    y = ThreadPool(nodes).map(sin2, x)
    print("Output: %s\n" % np.asarray(y))


    # map sin2 to the workers, then print to screen
    print("Running parallelpython on %d cpus..." % nodes)
    y = ParallelPool(nodes).map(sin2, x)
    print("Output: %s\n" % np.asarray(y))

# EOF
Code example #8
File: all_scatter_gather2.py Project: vreuter/pathos
    xp = np.arange(N * nodes, dtype=np.float64)[::-1]
    print("Input: %s\n" % x)

    # map sin_diff to the workers, then print to screen
    print("Running serial python ...")
    y = list(map(sin_diff, x, xp))
    print("Output: %s\n" % np.asarray(y))

    if HAS_PYINA:
        # map sin_diff to the workers, then print to screen
        print("Running mpi4py on %d cores..." % nodes)
        y = MpiPool(nodes).map(sin_diff, x, xp)
        print("Output: %s\n" % np.asarray(y))

    # map sin_diff to the workers, then print to screen
    print("Running multiprocesing on %d processors..." % nodes)
    y = ProcessPool(nodes).map(sin_diff, x, xp)
    print("Output: %s\n" % np.asarray(y))

    # map sin_diff to the workers, then print to screen
    print("Running multiprocesing on %d threads..." % nodes)
    y = ThreadPool(nodes).map(sin_diff, x, xp)
    print("Output: %s\n" % np.asarray(y))

    # map sin_diff to the workers, then print to screen
    print("Running parallelpython on %d cpus..." % nodes)
    y = ParallelPool(nodes).map(sin_diff, x, xp)
    print("Output: %s\n" % np.asarray(y))

# EOF
Code example #9
File: mp_class_example.py Project: yodeng/pathos
    def threadcompute(self, xs):
        pool = ThreadPool(4)
        results = pool.map(self.compute, xs)
        return results
Code example #10
def h(x):
  return sum(tmap(g, x))

def f(x,y):
  return x*y

x = range(10)
y = range(5)


if __name__ == '__main__':
    from pathos.helpers import freeze_support
    freeze_support()

    from pathos.pools import ProcessPool, ThreadPool
    amap = ProcessPool().amap
    tmap = ThreadPool().map

    print(amap(f, [h(x),h(x),h(x),h(x),h(x)], y).get())

    def _f(m, g, x, y):
      return sum(m(g,x))*y

    print(amap(_f, [tmap]*len(y), [g]*len(y), [x]*len(y), y).get())

    from math import sin, cos

    print(amap(tmap, [sin,cos], [x,x]).get())


Code example #11
File: CMerModel.py Project: hochshi/SeaTFIDF
    def __init__(self):
        self.num_partitions = self.num_cores
        self.pool = ThreadPool(self.num_cores)
Code example #12
File: phishGan.py Project: riyadics/PhishGan
def orm_extract(args):
    """
        Function for the ORMExtractParser
        :param args: Namespace
        :return: nothing
        """

    # Load database
    Base = databaseManage.WebsiteBase(args.database[0])
    Base.create_tables()

    if type(args.thread) is list:
        args.thread = args.thread[0]

    # Load data
    URLs = list(importData.csv_to_list(args.path[0])[1].keys())

    # ---------------------
    #  Filter the results already in database
    # ---------------------
    alreadyIn = []
    for url in Base.session.query(Base.__getattribute__(args.table[0])).all():
        alreadyIn.append(url.url)

    for url in URLs:
        if "http://" in url[:7]:
            URLs[URLs.index(url)] = url[7:]
        elif "https://" in url[:8]:
            URLs[URLs.index(url)] = url[8:]

    URLs = set(URLs)

    for url in alreadyIn:
        try:
            URLs.remove(url)
        except KeyError:
            pass
    logger.info("{} websites will be added to the database".format(len(URLs)))
    itera = iter(URLs)
    URLs = zip(*[itera] * args.thread)

    # ---------------------
    #  Add to the database
    # --------------------
    dBase = databaseManage.NormalizationBase("DB/norm.db")
    normDict = {}
    for norm in dBase.session.query(dBase.Normalization).all():
        normDict[norm.feature] = {"data": norm.data, "normalizer": norm.normalizer, "scaler": norm.scaler}

    i = 1
    for url in URLs:
        logger.debug(str(i))
        logger.info("Add : {}".format(url))
        i += args.thread

        # Create URL object
        result1 = ThreadPool().map(Website.website, url)
        result2 = []
        tmp = []
        for web in result1:
            if web.html is None:
                result2.append(web)
                # result1.remove(web)
            else:
                tmp.append(web)
        if args.extraction:
            # Extract features
            fct = partial(Website.website.features_extraction, normDict=normDict)
            ThreadPool().map(fct, tmp)
            result2 += tmp
            for web in result2:
                print(web)
                # Add in database
                Base.adding(web, args.table[0])
        else:
            for web in result1:
                # Add in database
                Base.adding(web, args.table[0])

        if i % ((50 // args.thread) * args.thread) == 1 and i != 1:
            # Get new identity with tor
            with Controller.from_port(port=9051) as controller:
                controller.authenticate()
                controller.signal(Signal.NEWNYM)
Code example #13
File: parallel.py Project: samhug/luigi_report_utils
def df_apply(df, f, pool=None, n_cpus=None, return_df=True):
    """Apply the function `f` to each row in `df` in a parallel fashion.
    """
    if pool is None:
        if n_cpus is None:
            n_cpus = cpu_count()
        pool = ThreadPool(n_cpus)

    class RecordProxy:
        """A proxy object to wrap a `DataFrame.iat[row_i, col_i]` access model and
        provide a dictionary style interface.
        """

        __df = df
        __field_names = list(df.columns)

        @classmethod
        def _field_i(cls, name):
            try:
                return cls.__field_names.index(name)
            except ValueError as e:
                raise KeyError(
                    f"key '{name}' not found on record. Available keys are: {cls.__field_names}"
                ) from e

        @classmethod
        def wrap_map_func(cls, f):
            """Wraps the given function to be passed to a map() style function.
            Returns a function that expects to be called with an index value and it will call
            the given function passing it an object with a python dictionary style interface to the row.
            """
            return lambda row_i: f(cls(row_i))

        @property
        def index(self):
            return self.__row_i

        def __init__(self, row_i):
            self.__row_i = row_i

        def __getitem__(self, key):
            i = self._field_i(key)
            return self.__df.iat[self.__row_i, i]

        def __setitem__(self, key, value):
            i = self._field_i(key)
            self.__df.iat[self.__row_i, i] = value

        def get(self, key, value=None):
            try:
                i = self._field_i(key)
                return self.__df.iat[self.__row_i, i]
            except KeyError:
                return value

        def __str__(self):
            parts = ["Record({"]
            fields_repr = []
            for field_name in self.__field_names:
                field_repr = self.__getitem__(field_name).__repr__()
                fields_repr.append(f"'{field_name}': {field_repr}")
            parts.extend(",".join(fields_repr))
            parts.append("})")
            return "".join(parts)

        def dict(self, keys=None):
            if keys is None:
                keys = self.__field_names

            return {
                key: self.__df.iat[self.__row_i, i]
                for i, key in enumerate(self.__field_names) if key in keys
            }

        def __iter__(self):
            return (self.__df.iat[self.__row_i, i]
                    for i in range(len(self.__field_names)))

    results = pool.map(RecordProxy.wrap_map_func(f), range(df.shape[0]))

    if return_df:
        return df
    else:
        return results
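A minimal usage sketch for the df_apply helper above, assuming pandas is installed; the column names and the double_value function are illustrative:

import pandas as pd
from pathos.pools import ThreadPool

df = pd.DataFrame({"name": ["a", "b", "c"], "value": [1, 2, 3]})

def double_value(rec):
    # rec is a RecordProxy; item access reads and writes the underlying DataFrame in place
    rec["value"] = rec["value"] * 2

df_apply(df, double_value, pool=ThreadPool(2))
print(df["value"].tolist())  # [2, 4, 6]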