Example #1
    def extract(self) -> None:
        self.__n_cores = cpu_count()
        self.__get_video_details()

        # Number of frames each worker will process
        self.__jump_unit = ceil(self.n_frames / self.__n_cores)

        if self.verbose:
            # Install a shared lock so the workers' progress bars don't
            # interleave their terminal output
            tqdm.set_lock(RLock())
            pool = Pool(
                processes=self.__n_cores,
                initializer=tqdm.set_lock,
                initargs=(tqdm.get_lock(),),
            )
        else:
            pool = Pool(processes=self.__n_cores)

        pool.map(self._batch_process, range(self.__n_cores))

        self.__combine()
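The pattern above, a shared tqdm lock installed in each worker via the pool's initializer, can be seen in isolation in the following minimal sketch (the `worker` function is hypothetical and stands in for `_batch_process`):

    from multiprocessing import Pool, RLock, freeze_support
    from time import sleep

    from tqdm import tqdm

    def worker(position):
        # Each worker draws its own bar; the shared lock keeps the
        # bars from garbling each other's terminal output.
        for _ in tqdm(range(100), desc=f"worker {position}", position=position):
            sleep(0.01)

    if __name__ == "__main__":
        freeze_support()  # no-op everywhere except Windows
        tqdm.set_lock(RLock())
        with Pool(processes=4, initializer=tqdm.set_lock,
                  initargs=(tqdm.get_lock(),)) as pool:
            pool.map(worker, range(4))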
Example #2
    freeze_support()  # for Windows support
    L = list(range(NUM_SUBITERS))[::-1]

    print("Simple thread mapping")
    thread_map(partial(progresser, write_safe=not PY2), L, max_workers=4)

    print("Simple process mapping")
    process_map(partial(progresser), L, max_workers=4)

    print("Manual nesting")
    for i in trange(16, desc="1"):
        for _ in trange(16, desc="2 @ %d" % i, leave=i % 2):
            sleep(0.01)

    print("Multi-processing")
    tqdm.set_lock(RLock())
    p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(), ))
    p.map(partial(progresser, progress=True), L)

    print("Multi-threading")
    tqdm.set_lock(TRLock())
    pool_args = {}
    if not PY2:
        pool_args.update(initializer=tqdm.set_lock,
                         initargs=(tqdm.get_lock(), ))
    with ThreadPoolExecutor(**pool_args) as p:
        p.map(
            partial(progresser,
                    progress=True,
                    write_safe=not PY2,
                    blocking=False), L)
Example #3
    DASHBOARD_STARTED_EVENT = None

    def get_function_details(_):
        pass

    def get_manager_client_dicts():
        raise NotImplementedError


logger = logging.getLogger(__name__)

DATETIME_FORMAT = "%Y-%m-%d, %H:%M:%S"

# Set lock for TQDM such that race conditions are avoided when using multiple progress bars
TQDM_LOCK = Lock()
tqdm.set_lock(TQDM_LOCK)


class ProgressBarHandler:
    def __init__(self, func: Callable, n_jobs: int, show_progress_bar: bool,
                 progress_bar_total: int, progress_bar_position: int,
                 worker_comms: WorkerComms,
                 worker_insights: WorkerInsights) -> None:
        """
        :param func: Function passed on to a WorkerPool map function
        :param n_jobs: Number of workers that are used
        :param show_progress_bar: When ``True`` will display a progress bar
        :param progress_bar_total: Total number of tasks that will be processed
        :param progress_bar_position: Denotes the position (line nr) of the progress bar. This is useful when using
            multiple progress bars at the same time
        :param worker_comms: Worker communication objects (queues, locks, events, ...)
        :param worker_insights: WorkerInsights object for obtaining worker insights
        """
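``progress_bar_position`` maps onto tqdm's ``position`` argument. A small hypothetical sketch (not the library's internals) of pinning several concurrent bars to separate terminal lines under one shared lock:

    from concurrent.futures import ThreadPoolExecutor
    from threading import Lock
    from time import sleep

    from tqdm import tqdm

    tqdm.set_lock(Lock())  # module-level lock, as in the snippet above

    def task(position):
        # `position` pins each bar to its own terminal line
        for _ in tqdm(range(100), desc=f"job {position}", position=position):
            sleep(0.01)

    with ThreadPoolExecutor(max_workers=3) as ex:
        ex.map(task, range(3))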
Example #4
    def sample(self,
               n_samples,
               epsilon=None,
               prop_scale=0.5,
               burn=100,
               tune=True,
               tune_iter=500,
               tune_interval=100,
               stat_weight=1.,
               stat_scale=1.,
               use_pilot=False,
               chains=2,
               seed=None,
               return_journal=False):
        """
        tune : bool
            Flag for tuning. Defaults to ``True``.
        tune_interval : int
            The frequency of tuning. Defaults to 100 iterations.
        burn : int
            Number of initial samples to discard as burn-in; these are
            drawn in addition to ``n_samples``.

        Due to multiprocessing, timing estimates (iterations per second,
        total time, etc.) can be unstable, but the progress bar itself
        works correctly.

        A good choice for the number of jobs is the number of cores or
        processors on your computer. If your processor supports
        hyperthreading, you can select an even higher number of jobs. By
        default, the number of jobs is set to the number of cores found
        on the system.

        Theoretical results suggest that random-walk Metropolis-Hastings
        is most efficient when about 23.4% of proposed samples are
        accepted, and lowering the step size increases the probability of
        accepting a proposal. PyMC3, for example, spends the first 500
        steps increasing and decreasing the step size to find the value
        of ``sd`` that yields an acceptance rate of 23.4% (other target
        acceptance rates can also be set).
        """

        if self._log:
            self.logger.info("Run MCMC sampler.")

        if use_pilot:
            if not self._done_pilot_study:
                msg = ("In order to use tuning from pilot study, the "
                       "pilot_study method must be run in advance.")
                raise PilotStudyMissing(msg)
        else:
            if epsilon is None:
                msg = ("epsilon must be passed.")
                raise ValueError(msg)
            self._epsilon = epsilon
            self._quantile = None
            self._stat_scale = stat_scale

        self._n_samples = n_samples
        self._burn = burn
        self._stat_weight = stat_weight

        # These are set in base instead
        #self._prior_logpdfs = [prior.logpdf for prior in self._priors]
        #self._rng = np.random.default_rng
        #self._uniform_distr = stats.uniform(loc=0, scale=1)

        # mcmc knobs
        self._prop_scale = prop_scale

        # Distribute n_samples equally across chains
        n_samples, chains, tasks, seeds = self.batches(n_samples,
                                                       chains,
                                                       seed,
                                                       force_equal=True)
        # n_samples + burn

        if self._log:
            # for managing output contention
            tqdm.set_lock(RLock())
            initializer = tqdm.set_lock
            initargs = (tqdm.get_lock(),)
        else:
            initializer = None
            initargs = None

        with ProcessPool(chains) as pool:
            r0, r1, r2, r3 = zip(*pool.map(self._sample,
                                           tasks,
                                           range(chains),
                                           seeds,
                                           initializer=initializer,
                                           initargs=initargs))

        #self._original_samples = np.stack(r0)
        self._original_samples = np.concatenate(r0, axis=0)
        self._samples = copy.deepcopy(self._original_samples)
        self._distances = np.concatenate(r1, axis=0)
        self._sum_stats = np.concatenate(r2, axis=0)
        self._n_accepted = np.sum(r3)

        self._done_sampling = True

        if return_journal:
            return self.journal()
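The 23.4% target mentioned in the docstring comes from theoretical results on random-walk Metropolis-Hastings. A toy, self-contained sketch of interval-based step-size tuning toward a target acceptance rate (all names and adjustment factors here are illustrative, not the library's actual implementation):

    import numpy as np

    def tune_scale(scale, acc_rate, target=0.234):
        # Illustrative rule: widen the proposal when accepting too often,
        # shrink it when accepting too rarely.
        return scale * (1.1 if acc_rate > target else 0.9)

    def random_walk_mh(logpdf, x0, n_samples, prop_scale=0.5,
                       tune_iter=500, tune_interval=100, seed=None):
        rng = np.random.default_rng(seed)
        x, accepted, samples = x0, 0, []
        for i in range(n_samples):
            proposal = x + rng.normal(scale=prop_scale)
            # Accept with probability min(1, pi(proposal) / pi(x))
            if np.log(rng.uniform()) < logpdf(proposal) - logpdf(x):
                x, accepted = proposal, accepted + 1
            samples.append(x)
            # Adapt the step size during the tuning phase only
            if i < tune_iter and (i + 1) % tune_interval == 0:
                prop_scale = tune_scale(prop_scale, accepted / (i + 1))
        return np.array(samples), prop_scale

    # Example: sample a standard normal target
    samples, tuned_scale = random_walk_mh(
        lambda x: -0.5 * x ** 2, x0=0.0, n_samples=2000, seed=42)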
Example #5
    def pilot_study(
        self,
        n_sim=500,
        quantile=None,
        stat_scale=None,
        stat_weight=1.,
        n_jobs=-1,
        seed=None,
    ):
        r"""Perform pilot study.

        The pilot study runs the simulator ``n_sim`` times and sets the
        threshold parameter ``epsilon`` automatically as the q-quantile of
        simulated distances from the prior predictive distribution. For
        instance, the 0.5-quantile (the median) will give a threshold that
        accepts 50% of the simulations.

        The pilot study can also be used to provide an estimate of the
        ``stat_scale`` parameter, used in the weighted Euclidean distance, from
        the prior predictive distribution by passing the ``stat_scale`` keyword
        as ``sd`` or ``mad``. The ``stat_scale`` parameter is used to avoid
        dominance of particular summary statistics. ``stat_scale=sd`` scales the
        summary statistics according to their standard deviation (SD) estimated
        from the prior predictive samples, and ``stat_scale=mad`` according to
        their median absolute deviation (MAD).

        It is important to note that if more than 50% of the prior predictive
        samples for a particular summary statistic have identical values, MAD
        will equal zero. In this case, the logger will raise a warning and
        the scale for the particular summary statistic will be set to SD
        instead. If there is no variability at all, the scale will be set to 1.

        It is recommended to check that there are not too many identical
        samples before setting the scale, to avoid surprises.

        Parameters
        ----------
        n_sim : `int`, optional
            Number of simulator runs. Default: ``500``.
        quantile : `float`
            Quantile of the Euclidean distances.
        stat_scale : `str`, optional
            Summary statistics scale to estimate; can be set as either ``sd``
            (standard deviation) or ``mad`` (median absolute deviation).
            If ``None``, scale is set to ``1.0``. Default: ``None``.
        stat_weight : {`int`, `float`}, `numpy.ndarray`, optional
            Importance weights of summary statistics. Default: ``1.0``.
        n_jobs : `int`, optional
            Number of processes (workers). If ``n_jobs=-1``, then ``n_jobs`` is
            set to half of the CPUs found by
            `Pathos <https://pathos.readthedocs.io/en/latest/pathos.html>`_
            (we assume half of the CPUs are hardware threads only and ignore
            those). Default: ``-1``.
        seed : `int`
            User-provided seed. Will be used to generate seed for each
            worker. Default: ``None``.
        """

        if quantile is None:
            msg = ("quantile must be passed. The pilot study sets the "
                   "accept/reject threshold as the provided q-quantile of the "
                   "distances.")
            raise ValueError(msg)

        if not 0 < quantile <= 1.0:
            msg = ("quantile must be a value in (0, 1].")
            raise ValueError(msg)

        if isinstance(stat_scale, str):
            if stat_scale not in VALID_STAT_SCALES:
                msg = ("scale can be set as either sd (standard deviation) or "
                       "mad (median absolute deviation). If None, it defaults "
                       "to 1.")
                raise ValueError(msg)

        if self._log:
            msg = f"Run pilot study to estimate:\n"
            msg += f"* epsilon as the {quantile}-quantile of the distances"

            if stat_scale is not None:
                msg += f"\n* summary statistics scale ({stat_scale.upper()}) "
                msg += f"from the prior predictive distribution"

            self.logger.info(msg)

        self._quantile = quantile
        _, n_jobs, tasks, seeds = self.batches(n_sim, n_jobs, seed)

        if self._log:
            # for managing output contention
            tqdm.set_lock(RLock())
            initializer = tqdm.set_lock
            initargs = (tqdm.get_lock(),)
        else:
            initializer = None
            initargs = None

        if n_jobs == 1:
            sum_stats = self._pilot_study(tasks[0], 0, seeds[0])

        else:
            with ProcessPool(n_jobs) as pool:

                results = pool.map(self._pilot_study,
                                   tasks,
                                   range(n_jobs),
                                   seeds,
                                   initializer=initializer,
                                   initargs=initargs
                                   )

            sum_stats = np.concatenate(results, axis=0)

        if stat_scale is None:
            self._stat_scale = 1.

        elif stat_scale == "sd":
            self._stat_scale = self._sd(sum_stats)

            if 0 in self._stat_scale:
                idx = np.where(self._stat_scale == 0)

                if self._log:
                    msg = (f"Encounterd SD = 0 for summary statistic at index:"
                           f" {idx[0]}. Setting this to 1.")
                    self.logger.warn(msg)

                self._stat_scale[idx] = 1.

        elif stat_scale == "mad":
            self._stat_scale = self._mad(sum_stats)

            if 0 in self._stat_scale:
                # Check if MAD=0 for some sum stats
                idx = np.where(self._stat_scale == 0)

                if self._log:
                    msg = (f"Encounterd MAD = 0 for summary statistic at index:"
                           f" {idx[0]}. Setting this to SD "
                           "(or 1 if also SD = 0).")
                    self.logger.warn(msg)

                backup_stat_scale = sum_stats.std(axis=0)
                self._stat_scale[idx] = backup_stat_scale[idx]

                if 0 in self._stat_scale:
                    # Ensure the replacement scales are not themselves SD = 0
                    idx = np.where(self._stat_scale == 0)
                    self._stat_scale[idx] = 1.

        distances = []
        for sum_stat in sum_stats:
            distance = self.distance(sum_stat,
                                     self._obs_sumstat,
                                     weight=stat_weight,
                                     scale=self._stat_scale
                                     )
            distances.append(distance)

        distances = np.array(distances, dtype=np.float64)
        distances[distances == np.inf] = np.nan
        self._epsilon = np.nanquantile(distances, self._quantile)
        self._done_pilot_study = True

        if self._log:
            self.logger.info(f"epsilon = {self._epsilon}")
            self.logger.info(f"stat_scale = {self._stat_scale}")
Example #6
    def sample(self,
               n_samples,
               epsilon=None,
               stat_weight=1.,
               stat_scale=1.,
               use_pilot=False,
               n_jobs=-1,
               seed=None,
               return_journal=False):
        """
        Due to multiprocessing, timing estimates (iterations per second,
        total time, etc.) can be unstable, but the progress bar itself
        works correctly.

        A good choice for the number of jobs is the number of cores or
        processors on your computer. If your processor supports
        hyperthreading, you can select an even higher number of jobs. By
        default, the number of jobs is set to the number of cores found
        on the system.
        """

        if self._log:
            self.logger.info("Run rejection sampler.")

        if use_pilot:
            if not self._done_pilot_study:
                msg = ("In order to use tuning from pilot study, the "
                       "pilot_study method must be run in advance.")
                raise PilotStudyMissing(msg)
        else:
            if epsilon is None:
                msg = ("epsilon must be passed.")
                raise ValueError(msg)
            self._epsilon = epsilon
            self._quantile = None
            self._stat_scale = stat_scale

        self._n_samples = n_samples
        self._stat_weight = stat_weight

        _, n_jobs, tasks, seeds = self._batches(n_samples, n_jobs, seed)

        if self._log:
            tqdm.set_lock(RLock())  # for managing output contention
            initializer = tqdm.set_lock
        else:
            initializer = None

        with ProcessPool(n_jobs) as pool:
            r0, r1, r2, r3 = zip(*pool.map(self._sample,
                                           tasks,
                                           range(n_jobs),
                                           seeds,
                                           initializer=initializer))

        self._original_samples = np.concatenate(r0, axis=0)
        self._samples = copy.deepcopy(self._original_samples)
        self._distances = np.concatenate(r1, axis=0)
        self._sum_stats = np.concatenate(r2, axis=0)
        self._n_sims = np.sum(r3)

        self._done_sampling = True

        if return_journal:
            return self.journal()
Example #7
    def pilot_study(
        self,
        n_sim=500,
        quantile=None,
        stat_scale=None,  # accept sd, mad
        stat_weight=1.,
        n_jobs=-1,
        seed=None,
    ):
        """
        Pilot study to set the threshold and, optionally, the summary
        statistics scale.

        Sets ``stat_scale`` and ``epsilon``. (TODO: add a flag for
        estimating weights as well?)
        """

        if self._log:
            msg = "Run pilot study to estimate:\n"
            msg += "* epsilon as the p-quantile of the distances"

            if stat_scale is not None:
                msg += "\n* summary statistics scale from the prior "
                msg += "predictive distribution"

            self.logger.info(msg)

        if quantile is None:
            msg = ("quantile must be passed. The pilot study sets the "
                   "accept/reject threshold as the provided p-quantile of the "
                   "distances.")
            raise ValueError(msg)

        if stat_scale is not None:
            if stat_scale not in VALID_STAT_SCALES:
                msg = ("scale can be set as either sd (standard deviation) or "
                       "mad (median absolute deviation). If None, it defaults "
                       "to 1.")
                raise ValueError(msg)

        self._quantile = quantile
        _, n_jobs, tasks, seeds = self._batches(n_sim, n_jobs, seed)

        if self._log:
            tqdm.set_lock(RLock())  # for managing output contention
            initializer = tqdm.set_lock
        else:
            initializer = None

        with ProcessPool(n_jobs) as pool:
            results = pool.map(self._pilot_study,
                               tasks,
                               range(n_jobs),
                               seeds,
                               initializer=initializer)

        sum_stats = np.concatenate(results, axis=0)

        if stat_scale is None:
            self._stat_scale = 1.
        elif stat_scale == "sd":
            self._stat_scale = sum_stats.std(axis=0)
        elif stat_scale == "mad":
            self._stat_scale = np.median(
                np.absolute(sum_stats - np.median(sum_stats, axis=0)), axis=0)
        else:
            msg = ("scale can be set as either sd (standard deviation) or "
                   "mad (median absolute deviation). If None, defaults to 1.")
            raise ValueError(msg)

        distances = []
        for sum_stat in sum_stats:
            distance = self._distance(sum_stat,
                                      self._obs_sumstat,
                                      weight=stat_weight,
                                      scale=self._stat_scale)
            distances.append(distance)

        self._epsilon = np.quantile(np.array(distances), self._quantile)
        self._done_pilot_study = True
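The manual MAD computation above can be cross-checked against SciPy, which ships the same statistic as `scipy.stats.median_abs_deviation` (pass `scale=1`; `scale='normal'` would instead apply the normal-consistency factor of roughly 1.4826):

    import numpy as np
    from scipy.stats import median_abs_deviation

    rng = np.random.default_rng(42)
    sum_stats = rng.normal(size=(500, 3))

    manual = np.median(np.abs(sum_stats - np.median(sum_stats, axis=0)),
                       axis=0)
    assert np.allclose(manual, median_abs_deviation(sum_stats, axis=0,
                                                    scale=1))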
Example #8
    def sample(self, n_samples, epsilon=None, quantile=None, n_tune=500, n_jobs=-1, log=False):
        """
        Due to multiprocessing, timing estimates (iterations per second,
        total time, etc.) can be unstable, but the progress bar itself
        works correctly.

        A good choice for the number of jobs is the number of cores or
        processors on your computer. If your processor supports
        hyperthreading, you can select an even higher number of jobs. By
        default, the number of jobs is set to the number of cores found
        on the system.
        """

        _inference_scheme = "Rejection ABC"
        self._epsilon = epsilon

        if log:
            self.logger = setup_logger(self.__class__.__name__)
            self.logger.info(f"Run {_inference_scheme} sampler.")
            n_jobs = check_and_set_jobs(n_jobs, self.logger)
        else:
            n_jobs = check_and_set_jobs(n_jobs)

        seeds = generate_seed_sequence(self._seed, n_jobs)
        tasks = distribute_workload(n_samples, n_jobs)

        if quantile is not None:
            # Tune epsilon as the given quantile of distances from a
            # short pilot run
            distances_tune = self._pilot_study(n_tune, seeds[0])
            self._epsilon = np.quantile(np.array(distances_tune), quantile)

        if log:
            tqdm.set_lock(RLock())  # for managing output contention
            with ProcessPool(n_jobs) as pool:
                samples, distances, sum_stats, epsilons, n_sims = zip(*pool.map(
                    self._sample_with_log,
                    tasks,
                    range(n_jobs),
                    seeds,
                    initializer=tqdm.set_lock)
                )
        else:
            with ProcessPool(n_jobs) as pool:
                samples, distances, sum_stats, epsilons, n_sims = zip(*pool.map(
                    self._sample,
                    tasks,
                    seeds)
                )

        samples = np.concatenate(samples, axis=0)
        distances = np.concatenate(distances, axis=0)
        sum_stats = np.concatenate(sum_stats, axis=0)
        epsilons = np.concatenate(epsilons, axis=0)
        n_sims = np.sum(n_sims)

        journal = Journal()
        journal._write_to_journal(
            observation=self._obs_data,
            simulator=self._simulator,
            stat_calc=self._stat_calc,
            priors=self._priors,
            distance_metric=self._distance_metric,
            inference_scheme=_inference_scheme,
            n_samples=n_samples,
            n_simulations=n_sims,
            posterior_samples=samples,
            summary_stats=sum_stats,
            distances=distances,
            epsilons=epsilons,
            log=log)

        return journal
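`distribute_workload` and `generate_seed_sequence` are the package's own helpers; a plausible standalone sketch of what such helpers typically do, splitting the sample budget into near-equal tasks and spawning independent per-worker seeds with `numpy.random.SeedSequence` (illustrative, not the package's actual implementation):

    import numpy as np

    def distribute_workload(n_samples, n_jobs):
        # Split n_samples into n_jobs near-equal task sizes
        base, extra = divmod(n_samples, n_jobs)
        return [base + (1 if i < extra else 0) for i in range(n_jobs)]

    def generate_seed_sequence(user_seed, n_jobs):
        # Spawn statistically independent child seeds, one per worker
        return np.random.SeedSequence(user_seed).spawn(n_jobs)

    tasks = distribute_workload(1000, 4)     # [250, 250, 250, 250]
    seeds = generate_seed_sequence(42, 4)    # 4 independent SeedSequences
    rngs = [np.random.default_rng(s) for s in seeds]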