def make_pdf(distribution: st.rv_continuous, params: Tuple[float, ...], size: int = 25_000) -> pd.Series:
    """
    Build the Probability Density Function of a fitted distribution as a pandas Series.

    The PDF is evaluated on an evenly spaced grid running from the 1st to the
    99th percentile of the distribution.

    Args:
        distribution (st.rv_continuous): a scipy.stats generator object.
        params (Tuple[float, ...]): the parameters for this generator given back by
            the fit, ordered as ``(*shape_parameters, loc, scale)``.
        size (int): the number of points to evaluate.

    Returns:
        A pandas Series object with the PDF as values, corresponding axis values as index.
    """
    # The last two entries are loc and scale; whatever precedes them are shape parameters
    *shape_args, loc, scale = params
    placement = {"loc": loc, "scale": scale}

    logger.debug("Getting sane start and end points of distribution")
    # Unpacking an empty shape list passes nothing, so distributions without
    # shape parameters need no special case
    lower = distribution.ppf(0.01, *shape_args, **placement)
    upper = distribution.ppf(0.99, *shape_args, **placement)

    logger.debug("Building PDF")
    grid = np.linspace(lower, upper, size)
    density = distribution.pdf(grid, *shape_args, **placement)
    return pd.Series(density, grid)
def discrete_distrb(distrb: rv_continuous) -> np.ndarray:
    """
    Discretise a continuous distribution onto the integers 0, 1, 2, ..., N.

    N is the ceiling of the (1 - 10^-6) quantile of ``distrb``. Integer k > 0
    receives the probability of the interval (k - 0.5, k + 0.5]; integer 0
    receives everything below 0.5. The resulting masses are renormalised so
    that they sum to one.
    """
    top = np.ceil(distrb.ppf(1 - 1e-6))
    n_points = int(top + 1)

    # Upper bin edges sit half-way between consecutive integers: 0.5, 1.5, ..., N + 0.5
    upper_edges = np.linspace(0.5, top + 0.5, n_points)
    cumulative = distrb.cdf(upper_edges)

    # Differencing against a leading 0 puts all mass below 0.5 into the first bin
    masses = np.diff(cumulative, prepend=0)
    return masses / masses.sum()
def generate_type_differentiated_rates(x: int, y: int,
                                       mRNA_distribution: stats.rv_continuous = stats.truncnorm,
                                       mRNA_parameters: np.ndarray = np.array([5, 15]),
                                       miRNA_distribution: stats.rv_continuous = stats.truncnorm,
                                       miRNA_parameters: np.ndarray = np.array([5, 15])) -> np.ndarray:
    """
    Draw one value per species, using a separate distribution for each species type.

    The first ``x`` entries of the result are drawn from ``mRNA_distribution``
    and the remaining ``y`` entries from ``miRNA_distribution``; each parameter
    array is unpacked as positional arguments to the matching ``rvs`` call.
    Used to get arrival, decay, and burst rates.
    """
    # mRNA values are sampled first, then miRNA values
    mRNA_draws = mRNA_distribution.rvs(*mRNA_parameters, size=x)
    miRNA_draws = miRNA_distribution.rvs(*miRNA_parameters, size=y)

    rates = np.zeros(x + y)
    rates[:x] = mRNA_draws
    rates[x:] = miRNA_draws
    return rates
def __init__(self, ds: Tuple, ηs: Tuple, event_size: stats.rv_continuous):
    """
    Store the inputs, summarise the event-size distribution and create empty result tables.

    Args:
        ds: stored unchanged as ``self.ds``.
        ηs: stored unchanged as ``self.ηs``.
        event_size: distribution object whose ``mean()`` and ``std()`` can be
            called without arguments (presumably a frozen scipy.stats
            distribution -- confirm against callers).
    """
    # Inputs, kept exactly as given
    self.ds, self.ηs = ds, ηs
    self.event_size = event_size

    # Cached description of the event-size distribution
    self.event_size_distribution_name = str(event_size.__class__)
    self.event_size_mean = float(event_size.mean())
    self.event_size_std = float(event_size.std())

    # Empty tables; ``self.columns`` must already be available on the instance/class
    parameter_columns = ["MC_type", "MC_name", "count", "params"]
    self.df_waveforms = pd.DataFrame(columns=self.columns)
    self.df_parameters = pd.DataFrame(columns=parameter_columns)

    self.t_max = 420e-9  # sec. See https://arxiv.org/abs/0810.4930v2
    self.tqdm = True
def normalize(a: np.ndarray, dist: rv_continuous = norm, **kwargs) -> np.ndarray:
    """Map the values of a 1d array onto evenly spaced quantiles of ``dist``.

    The smallest element of ``a`` is replaced by the ``1 / (n + 1)`` quantile of
    ``dist``, the next smallest by the ``2 / (n + 1)`` quantile, and so on up to
    ``n / (n + 1)``, where ``n = len(a)``. Rank order is therefore preserved.

    Args:
        a: 1d array (or sequence) of values. It is not modified.
        dist: distribution providing ``ppf``; defaults to the standard normal.
        **kwargs: forwarded to ``dist.ppf`` (e.g. ``loc``, ``scale``).

    Returns:
        A new float array with the same length as ``a``.
    """
    order = np.argsort(a)
    # ``np.float`` was removed in NumPy 1.24; the builtin ``float`` is the same dtype
    out = np.array(a, dtype=float)
    # Only interior quantiles are needed, so the 0 and 1 endpoints (whose ppf is
    # typically infinite) are dropped before evaluating ppf
    probabilities = np.linspace(0, 1, len(out) + 2, endpoint=True)[1:-1]
    out[order] = dist.ppf(probabilities, **kwargs)
    return out
def generate_iid_species_rates(x: int, y: int,
                               distribution: stats.rv_continuous = stats.truncnorm,
                               parameters: np.ndarray = np.array([5, 15])):
    """
    Draw ``x + y`` independent values from a single distribution, one per species.

    ``parameters`` is unpacked as positional arguments to ``distribution.rvs``.
    Used to generate arrival, decay, and bursting rates.
    """
    n_species = x + y
    return distribution.rvs(*parameters, size=n_species)
def find_distribution_area(distribution: stats.rv_continuous, lower, upper, *args, **kwargs):
    """
    Find the area between the lower and upper bounds of a scipy.stats
    continuous random variable distribution.

    Args:
        distribution: object exposing ``cdf`` and ``sf`` (a scipy.stats
            continuous distribution, frozen or not).
        lower: lower bound, or ``None`` for "everything below ``upper``".
        upper: upper bound, or ``None`` for "everything above ``lower``".
        *args, **kwargs: forwarded unchanged to ``cdf`` / ``sf`` (shape
            parameters, ``loc``, ``scale``).

    Returns:
        The probability mass of the requested region.

    Raises:
        UserInputError: if both ``lower`` and ``upper`` are ``None``.
    """
    # Verify that either lower or upper is specified
    if lower is None and upper is None:
        print("Lower and upper bounds can't both be None")
        raise UserInputError

    # If lower is not specified we want to find the area below upper
    if lower is None:
        return distribution.cdf(upper, *args, **kwargs)

    # If upper is not specified we want to find the area above lower.
    # The survival function is used instead of ``1 - cdf`` because the
    # subtraction loses all precision once cdf rounds to 1 in the upper tail.
    if upper is None:
        return distribution.sf(lower, *args, **kwargs)

    # Otherwise we find the area between the lower and upper bounds
    area_below_upper = distribution.cdf(upper, *args, **kwargs)
    area_below_lower = distribution.cdf(lower, *args, **kwargs)
    return area_below_upper - area_below_lower
def read_flights(filename: str, basetime: datetime.datetime,
                 rtc_dist: sps.rv_continuous,
                 weight_dist: sps.rv_continuous):
    """
    Read flights from an Excel sheet and attach randomly drawn rtc and weight values.

    Each row must provide the columns ``fid``, ``Airline``, ``DT`` and ``FCA``.
    ``DT`` and ``FCA`` are interpreted as minutes: the departure time is
    ``basetime + DT`` and the flight duration is ``FCA - DT``.

    Args:
        filename: path of the Excel file, passed to ``pandas.read_excel``.
        basetime: reference time that the ``DT`` offsets are added to.
        rtc_dist: distribution sampled once per flight; the draw is used as a
            number of seconds for the flight's ``rtc``.
        weight_dist: distribution sampled once per flight for its ``weight``.

    Returns:
        A set of ``bctop.allocations.Flight`` objects, one per row.
    """
    # ``read_excel`` has no ``index_row`` parameter (current pandas rejects it);
    # ``index_col=None`` is the documented way to keep a default integer index.
    test_data = pandas.read_excel(io=filename, index_col=None)

    flights = set()
    for row in test_data.itertuples():
        departure_offset = datetime.timedelta(minutes=row.DT)
        arrival_offset = datetime.timedelta(minutes=row.FCA)

        # Take the single element explicitly: calling float() on a size-1 array
        # is deprecated in recent NumPy. rtc is drawn before weight, as before.
        rtc_seconds = float(rtc_dist.rvs(size=1)[0])
        weight = float(weight_dist.rvs(size=1)[0])

        flights.add(
            bctop.allocations.Flight(fid=row.fid,
                                     airline=row.Airline,
                                     deptime=basetime + departure_offset,
                                     flight_duration=arrival_offset - departure_offset,
                                     rtc=datetime.timedelta(seconds=rtc_seconds),
                                     weight=weight))
    return flights
def plot_cts_distribution(distribution: stats.rv_continuous,
                          epsilon: float = 5e-5,
                          epsilon_end: float = 1e-10,
                          from_samples: bool = False,
                          num_samples: int = 10000) -> None:
    """
    Plot a continuous distribution using seaborn and matplotlib.

    See scipy.stats for cts distributions:
    https://docs.scipy.org/doc/scipy/reference/stats.html

    epsilon      -> Controls the plot resolution: the PDF is evaluated at
                    int(1 / epsilon) evenly spaced points
    epsilon_end  -> Plot on interval [x, y] where P(X<x) = epsilon_end and
                    P(X<y) = 1 - epsilon_end
    from_samples -> True: plot from random sampling
                    False: plot the PDF
    num_samples  -> Number of samples drawn when from_samples is True

    Example:
        plot_cts_distribution(stats.arcsine(), epsilon_end=1e-2)
        plot_cts_distribution(stats.arcsine(), from_samples=True)
    User-defined:
        plot_cts_distribution(exponential_rv(k=0.5))
        plot_cts_distribution(exponential_rv(k=0.5), from_samples=True, num_samples=1000)
    """
    if not from_samples:
        # Evaluate the PDF between the two extreme quantiles
        left = distribution.ppf(0 + epsilon_end)
        right = distribution.ppf(1 - epsilon_end)
        grid = np.linspace(left, right, int(1 / epsilon))
        density = pd.DataFrame({'Values': grid, 'Probability': distribution.pdf(grid)})
        sns.lineplot(x='Values', y='Probability', data=density)
        plt.title('Density')
    else:
        # NOTE(review): distplot is deprecated in recent seaborn releases
        draws = distribution.rvs(size=num_samples)
        sns.distplot(draws, color="m")
        plt.title('Samples')
    plt.show()
def exceedance_probability(distribution: rv_continuous, n_samples: Optional[int] = None):
    """
    Calculates the exceedance probability of a random variable following a
    continuous multivariate distribution.

    Exceedance probability: φ_i = p(∀j != i: x_i > x_j | x ~ ``distribution``).

    :param distribution: the continuous multivariate distribution. Numerical
        integration supports frozen multivariate normal and frozen Dirichlet
        distributions only; Monte Carlo only needs ``rvs(size=...)``.
    :param n_samples: the number of realizations sampled from the distribution
        to approximate the exceedance probability. Defaults to ``None``, in
        which case numerical integration is used instead of Monte Carlo
        simulation.
    :return: the exceedance probabilities, normalised to sum to one.
    :raises NotImplementedError: if numerical integration is requested for an
        unsupported distribution type.
    """
    if n_samples is not None:
        # Monte Carlo simulation: count how often each component is the row maximum
        samples = distribution.rvs(size=n_samples)
        row_max = np.amax(samples, axis=1, keepdims=True)
        φ = (samples == row_max).sum(axis=0)
        return φ / φ.sum()

    # Numerical integration
    from scipy.stats._multivariate import dirichlet_frozen, multivariate_normal_frozen

    kind = type(distribution)
    if kind is multivariate_normal_frozen:
        # Speekenbrink, M., & Konstantinidis, E. (2015). Uncertainty and exploration in a restless bandit problem.
        # https://onlinelibrary.wiley.com/doi/pdf/10.1111/tops.12145: p. 4.
        μ, Σ = distribution.mean, distribution.cov
        n = len(μ)
        φ = np.zeros(n)
        neg_identity = -np.eye(n - 1)
        for i in range(n):
            # Row k of A computes x_i - x_j for the k-th component j != i
            A = np.insert(neg_identity, i, 1, axis=1)
            φ[i] = mvn.cdf(A @ μ, cov=A @ Σ @ A.T)
    elif kind is dirichlet_frozen:
        # Soch, J. & Allefeld, C. (2016). Exceedance Probabilities for the Dirichlet Distribution.
        # https://arxiv.org/pdf/1611.01439.pdf: p. 361.
        α = distribution.alpha
        n = len(α)
        γ = [gammaln(a) for a in α]

        def integrand(x, i):
            others = 1
            for j in range(n):
                if j != i:
                    others *= gammainc(α[j], x)
            return others * exp((α[i] - 1) * log(x) - x - γ[i])

        φ = [integrate.quad(integrand, 0, np.inf, args=(i,))[0] for i in range(n)]
    else:
        raise NotImplementedError(
            'Numerical integration not implemented for this distribution!')

    φ = np.array(φ)
    return φ / φ.sum()
def _fit_parametric_family(dist: stats.rv_continuous, sample: np.ndarray) -> _tp.Tuple[float, ...]:
    """
    Estimate the parameters of ``dist`` from ``sample``.

    For ``stats.multivariate_normal`` the sample mean (over axis 0) and the
    sample covariance (``ddof=1``) are returned. For every other distribution
    ``dist.fit`` is called, with the location fixed at 0 for gamma, lognorm,
    invgauss, pareto, f and beta, and the scale additionally fixed at 1 for
    f and beta.
    """
    if dist == stats.multivariate_normal:
        # has no fit method, so the two moments are estimated directly
        mean = np.mean(sample, axis=0)
        covariance = np.cov(sample.T, ddof=1)
        return mean, covariance

    unit_interval_like = {stats.f, stats.beta}
    zero_origin = {stats.gamma, stats.lognorm, stats.invgauss, stats.pareto}

    fixed = {}
    if dist in unit_interval_like or dist in zero_origin:
        fixed["floc"] = 0
    if dist in unit_interval_like:
        fixed["fscale"] = 1
    return dist.fit(sample, **fixed)  # type: ignore
def _resample_parametric(
        sample: np.ndarray, size: int, dist: stats.rv_continuous,
        rng: np.random.Generator) -> _tp.Generator[np.ndarray, None, None]:
    """
    Yield ``size`` parametric-bootstrap replicates of ``sample``.

    The parameters of ``dist`` are estimated from ``sample`` and every yielded
    replicate holds ``len(sample)`` fresh draws from the fitted distribution,
    using ``rng`` as the source of randomness.
    """
    n = len(sample)

    if dist == stats.poisson:
        # - poisson has no fit method and there is no scale parameter
        # - random number generation for poisson distribution in scipy seems to be buggy
        # so the rate is estimated by the sample mean and numpy's generator is used
        mu = np.mean(sample)
        for _ in range(size):
            yield rng.poisson(mu, size=n)
        return

    # fit parameters by maximum likelihood and sample from the frozen result
    fitted = dist(*_fit_parametric_family(dist, sample))
    for _ in range(size):
        yield fitted.rvs(size=n, random_state=rng)
def generate_gammas(x: int, y: int,
                    distribution: stats.rv_continuous = stats.truncnorm,
                    parameters: np.ndarray = np.array([5, 15])) -> np.ndarray:
    """
    Build a symmetric (x + y) by (x + y) matrix of gamma values.

    An x-by-y block of values is drawn row by row from ``distribution`` (with
    ``parameters`` unpacked as positional arguments). Culling via
    ``Network.new_cull_gammas`` is repeated on a fresh copy of that block until
    ``Network.new_check_network_legality`` accepts the result. The accepted
    block fills the upper-right corner of the output and its transpose the
    lower-left corner; the two diagonal blocks stay zero.
    """
    mRNA_to_miRNA = np.zeros([x, y])
    for row_index in range(x):
        mRNA_to_miRNA[row_index] = distribution.rvs(*parameters, size=y)

    # Remove some gamma values, retrying until the culled network is legal.
    # Each attempt gets its own deep copy of the drawn block.
    while True:
        culled_gammas = Network.new_cull_gammas(x, y, deepcopy(mRNA_to_miRNA))
        if Network.new_check_network_legality(x, y, culled_gammas):
            break

    n_species = x + y
    gammas = np.zeros([n_species, n_species])
    gammas[:x, x:] = culled_gammas
    gammas[x:, :x] = culled_gammas.T
    return gammas
def generate_iid_gammas(x: int, y: int,
                        distribution: stats.rv_continuous = stats.truncnorm,
                        parameters: np.ndarray = np.array([1, 5])) -> np.ndarray:
    """
    Generates iid gamma rates between each mRNA and miRNA.

    An x-by-y block of rates is drawn row by row from the given distribution
    with the given parameters and passed once through ``cull_gammas``. The
    culled block is written to the upper-right corner of the
    (x + y) by (x + y) result and its transpose to the lower-left corner;
    the diagonal blocks stay zero.
    """
    mRNA_to_miRNA = np.zeros([x, y])
    for row_index in range(x):
        mRNA_to_miRNA[row_index] = distribution.rvs(*parameters, size=y)

    # Remove some gamma values. The drawn block is handed over directly (no copy).
    culled_gammas = cull_gammas(x, y, mRNA_to_miRNA)

    n_species = x + y
    rates = np.zeros([n_species, n_species])
    rates[:x, x:] = culled_gammas
    rates[x:, :x] = culled_gammas.T
    return rates
def run_experiment(n: int, k_vals: dict[str, int], dist: rv_continuous,
                   batch_size: int) -> dict[str, int]:
    """
    Run an experiment with the given number of experts, k_vals, distribution, and batch size.

    :param n: Number of experts.
    :param k_vals: Dictionary with keys that are the names of the k values
        (i.e., "sqrt", "logn") and values that are the actual k values.
    :param dist: Distribution to draw the expert competencies. The sample
        shape is passed as the first positional argument of ``dist.rvs``.
    :param batch_size: Number of instances to run at once (makes computation
        faster as one big array).
    :return: Dictionary keyed by k_value name, holding whatever
        ``best_k_accuracies`` returns for that k (per the original
        documentation: the number of instances, out of batch_size total,
        that the top k experts got the correct answer).
    """
    # One competency per (expert, instance) pair
    shape = (n, batch_size)
    competencies = dist.rvs(shape)

    # Order the experts of each instance by competency (ascending along axis 0)
    sorted_comps = np.sort(competencies, axis=0)

    # Each opinion is a Bernoulli draw with the expert's competency as success probability
    expert_opinions = bernoulli(sorted_comps).rvs()

    # Calculate number correct for each k.
    results = {}
    for k_name, k_val in k_vals.items():
        results[k_name] = best_k_accuracies(expert_opinions, k_val)
    return results
def initialize(self, rv_p: stats.rv_continuous, rv_p_kwargs: dict):
    """
    Freeze the distribution of ``p`` and draw one value per variable.

    Args:
        rv_p: scipy.stats distribution (unfrozen) that is called with
            ``rv_p_kwargs`` to obtain a frozen distribution.
        rv_p_kwargs: keyword arguments for ``rv_p``. When empty or ``None``,
            ``{'loc': 0, 'scale': 1}`` is used.

    Sets:
        self.rv_p: the frozen distribution ``rv_p(**rv_p_kwargs)``.
        self.p: ``self.n_vars`` samples drawn from ``self.rv_p``.
    """
    if not rv_p_kwargs:
        rv_p_kwargs = {'loc': 0, 'scale': 1}
    self.rv_p = rv_p(**rv_p_kwargs)
    # Sample from the frozen distribution: sampling the unfrozen ``rv_p``
    # ignored ``rv_p_kwargs`` and failed for distributions that need shape
    # parameters. With the default kwargs the result is unchanged.
    self.p = self.rv_p.rvs(size=self.n_vars)