Example #1
0
def get_minmax(sample, min_samples = 8):
    """

    We consider that a ratio of connections is typical if it falls within the 99.99 % percentile of the Normal distribution N(m, v) modelling ratios. This ensures that the expected rate of false alarms is about 1/10000, and therefore only a handful a week (given the large number of jurisdictions). Similarly, we infer the range of the rate of usage from each jurisdiction (given Ci j) to be the 99.99 % percentile range of a Poisson distribution with parameter Ci j. This full range must be within the typical range of ratios to avoid raising an alarm.

    Args:
        sample (pandas.core.series.Series): A series containing the relative change values for a set of countries.
    """

    log.debug("Getting min and max for a sample on {0}'s data: {1}".format(sample.name, sample))
    initial_sample_len = len(sample)

    if initial_sample_len > min_samples:
        sample = drop_outliers(sample)

        num_outliers = initial_sample_len - len(sample)
        log.debug("Sample had {0} outliers removed. Current sample: {1}".format(num_outliers, sample))

        if len(sample) > min_samples:
            mu, sigma = norm.fit(sample)
            sample_max = norm.ppf(0.9999, mu, sigma)
            sample_min = norm.ppf(1 - 0.9999, mu, sigma)

            log.debug("Sample min == {0}, Sample max == {1}".format(sample_min, sample_max))

            return pd.Series({"max":sample_max, "min":sample_min})
        else:
            log.debug("After removing outliers the sample was a length of {0}. This is shorter than acceptable minimum length of {1}.".format(len(sample), min_samples))

            return pd.Series({"max":None, "min":None})
    else:
        log.debug("Sample with length of {0} is shorter than acceptable minimum length of {1}.".format(initial_sample_len, min_samples))

        return pd.Series({"max":None, "min":None})
Example #2
0
def Z_test(x1, x2, Alpha=0.95):
    ResultsTable = pd.DataFrame()

    # Compute standard deviations and numbers of observations
    S_x1 = x1.std(ddof=1)
    S_x2 = x2.std(ddof=1)
    N_x1 = len(x1)
    N_x2 = len(x2)

    # Test statistic and p value
    Z = (x1.mean() - x2.mean()) / np.sqrt(S_x1**2 / N_x1 + S_x2**2 / N_x2)
    p = 2 * (1 - norm.cdf(abs(Z)))

    # Rejection range
    MinValue = norm.ppf((1 - Alpha) / 2)
    MaxValue = norm.ppf(1 - (1 - Alpha) / 2)
    RejectionRange = np.array([[-np.inf, round(MinValue, 3)],
                               [round(MaxValue, 3), np.inf]])

    Results = {
        'Test statistic': round(Z, 3),
        'p value': round(p, 9),
        'Confidence level (%)': Alpha * 100,
        'Rejection range': RejectionRange
    }

    # DataFrame.append was removed in pandas 2.0; build a one-row frame instead
    ResultsTable = pd.concat(
        [ResultsTable, pd.DataFrame({key: [value] for key, value in Results.items()})],
        ignore_index=True)

    return ResultsTable
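# Hedged usage sketch: two synthetic samples compared with the Z_test helper
# above; it assumes numpy, pandas and scipy.stats.norm are imported as in the
# snippet (np, pd, norm).
rng = np.random.default_rng(1)
x1 = pd.Series(rng.normal(10.0, 2.0, 200))
x2 = pd.Series(rng.normal(10.5, 2.0, 200))
print(Z_test(x1, x2, Alpha=0.95).T)   # test statistic, p value, rejection range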
Example #3
0
def make_tendencies_minmax(l, INTERVAL=1):
    lminus1 = dict([(ccode, n_day_rel(l[ccode], INTERVAL)) for ccode in l])
    c = lminus1[next(iter(lminus1))]
    dists = []
    minx = []
    maxx = []
    for i in range(len(c)):
        vals = [
            lminus1[ccode][i] for ccode in lminus1
            if lminus1[ccode][i] is not None
        ]
        if len(vals) < 8:
            dists += [None]
            minx += [None]
            maxx += [None]
        else:
            vals.sort()
            median = vals[len(vals) // 2]
            q1 = vals[len(vals) // 4]
            q2 = vals[(3 * len(vals)) // 4]
            qd = q2 - q1
            vals = [
                v for v in vals if median - qd * 4 < v and v < median + qd * 4
            ]
            if len(vals) < 8:
                dists += [None]
                minx += [None]
                maxx += [None]
                continue
            mu, sigma = norm.fit(vals)
            dists += [(mu, sigma)]
            maxx += [norm.ppf(0.9999, mu, sigma)]
            minx += [norm.ppf(1 - 0.9999, mu, sigma)]
    ## print minx[-1], maxx[-1]
    return minx, maxx
Example #4
0
def make_tendencies_minmax(l, INTERVAL = 1):
  lminus1 = dict([(ccode, n_day_rel(l[ccode], INTERVAL)) for ccode in l])
  c = lminus1[next(iter(lminus1))]
  dists = []
  minx = []
  maxx = []
  for i in range(len(c)):
    vals = [lminus1[ccode][i] for ccode in lminus1 if lminus1[ccode][i] is not None]
    if len(vals) < 8:
      dists += [None]
      minx += [None]
      maxx += [None]
    else:
      vals.sort()
      median = vals[len(vals)//2]
      q1 = vals[len(vals)//4]
      q2 = vals[(3*len(vals))//4]
      qd = q2 - q1
      vals = [v for v in vals if median - qd*4 < v and  v < median + qd*4]
      if len(vals) < 8:
        dists += [None]
        minx += [None]
        maxx += [None]
        continue
      mu, sigma = norm.fit(vals)
      dists += [(mu, sigma)]
      maxx += [norm.ppf(0.9999, mu, sigma)]
      minx += [norm.ppf(1 - 0.9999, mu, sigma)]
  ## print minx[-1], maxx[-1]
  return minx, maxx
Example #5
0
def QQPlot(DataValues, Alpha_CI=0.95, DataLabel='Data', FigFile='QQPlot.png'):

    ### Based on: https://www.tjmahr.com/quantile-quantile-plots-from-scratch/
    ### Itself based on Fox book: Fox, J. (2015)
    ### Applied Regression Analysis and Generalized Linear Models.
    ### Sage Publications, Thousand Oaks, California.

    # Data analysis
    N = len(DataValues)
    X_Bar = np.mean(DataValues)
    S_X = np.std(DataValues,ddof=1)

    # Sort data to get the rank
    Data_Sorted = np.zeros(N)
    Data_Sorted += DataValues
    Data_Sorted.sort()

    # Compute quantiles
    EmpiricalQuantiles = np.arange(0.5, N + 0.5) / N
    TheoreticalQuantiles = norm.ppf(EmpiricalQuantiles, X_Bar, S_X)
    ZQuantiles = norm.ppf(EmpiricalQuantiles,0,1)

    # Compute data variance
    DataIQR = np.quantile(DataValues, 0.75) - np.quantile(DataValues, 0.25)
    NormalIQR = np.sum(np.abs(norm.ppf(np.array([0.25, 0.75]), 0, 1)))
    Variance = DataIQR / NormalIQR
    Z_Space = np.linspace(min(ZQuantiles), max(ZQuantiles), 100)
    Variance_Line = Z_Space * Variance + np.median(DataValues)

    # Compute alpha confidence interval (CI)
    Z_SE = np.sqrt(norm.cdf(Z_Space) * (1 - norm.cdf(Z_Space)) / N) / norm.pdf(Z_Space)
    Data_SE = Z_SE * Variance
    Z_CI_Quantile = norm.ppf(np.array([(1 - Alpha_CI) / 2]), 0, 1)

    # Create point in the data space
    Data_Space = np.linspace(min(TheoreticalQuantiles), max(TheoreticalQuantiles), 100)

    # QQPlot
    BorderSpace = max( 0.05*abs(Data_Sorted.min()), 0.05*abs(Data_Sorted.max()))
    Y_Min = Data_Sorted.min() - BorderSpace
    Y_Max = Data_Sorted.max() + BorderSpace
    Figure, Axes = plt.subplots(1, 1, figsize=(5.5, 4.5), dpi=100)
    Axes.plot(TheoreticalQuantiles, Data_Sorted, linestyle='none', marker='o', mew=0.5, fillstyle='none', color=(0, 0, 0), label=DataLabel)
    Axes.plot(Data_Space, Variance_Line, linestyle='--', color=(1, 0, 0), label='Variance :' + str(format(np.round(Variance, 2),'.2f')))
    Axes.plot(Data_Space, Variance_Line + Z_CI_Quantile * Data_SE, linestyle='--', color=(0, 0, 1), label=str(int(100*Alpha_CI)) + '% CI')
    Axes.plot(Data_Space, Variance_Line - Z_CI_Quantile * Data_SE, linestyle='--', color=(0, 0, 1))
    plt.xlabel('Theoretical quantiles (-)')
    plt.ylabel('Empirical quantiles (-)')
    plt.ylim([Y_Min, Y_Max])
    plt.legend(loc='upper center', ncol=3, bbox_to_anchor=(0.5, 1.15), prop={'size':10})
    plt.savefig(FigFile)
    plt.show()
    plt.close(Figure)

    return Variance
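# Hedged usage sketch for the QQPlot helper above on a synthetic normal sample;
# it assumes numpy, matplotlib.pyplot and scipy.stats.norm are imported as in
# the snippet, and note that it writes 'QQPlot.png' to the working directory.
import numpy as np
Values = np.random.normal(loc=5.0, scale=2.0, size=100)
Var = QQPlot(Values, Alpha_CI=0.95, DataLabel='Synthetic N(5, 2)')
print('IQR-based variance estimate:', round(Var, 3))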
Example #6
0
    def summary(self, yname=None, xname=None, title=None, alpha=0.05):

        df = pd.DataFrame()

        df["Type"] = (["Mean"] * self.k_exog + ["Scale"] * self.k_scale +
                      ["Smooth"] * self.k_smooth + ["SD"] * self.k_noise)
        df["coef"] = self.params

        try:
            df["std err"] = np.sqrt(np.diag(self.cov_params()))
        except Exception:
            df["std err"] = np.nan

        from scipy.stats.distributions import norm
        df["tvalues"] = df.coef / df["std err"]
        df["P>|t|"] = 2 * norm.sf(np.abs(df.tvalues))

        f = norm.ppf(1 - alpha / 2)
        df["[%.3f" % (alpha / 2)] = df.coef - f * df["std err"]
        df["%.3f]" % (1 - alpha / 2)] = df.coef + f * df["std err"]

        df.index = self.model.data.param_names

        summ = summary2.Summary()
        if title is None:
            title = "Gaussian process regression results"
        summ.add_title(title)
        summ.add_df(df)

        return summ
Example #7
0
    def startSimulation(self) -> bool:
        if self.is_running():
            logging.info(self.__class__.__name__,
                         ":startSimulation already in progress.")
            return False

        if not self._validate():
            return False

        if len(self._simulation_profiles) == 0:
            return False

        days_range: range = range(self._num_trading_days) if self._trading_days_order == "A" \
            else range(self._num_trading_days - 1, -1, -1)

        for sim_num in range(self.simulations_number):
            logging.debug(self.__class__.__name__,
                          'Starting simulation {}'.format(sim_num))

            for day in days_range:
                daily_return = norm.ppf(
                    np.random.rand(self._num_years_per_sim),
                    self._ret_dist_mean, self._ret_dist_std)
                done = day == days_range[-1]

                for profile_name, profile in self._simulation_profiles.items():
                    logging.debug(
                        self.__class__.__name__,
                        ": Simulating profile {}".format(profile_name))

                    profile.performTransition(daily_return,
                                              (sim_num, day, done))

        return True
Example #8
0
def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
    """
    Computes the alpha confidence interval for the selected quantiles of the
    data, with Maritz-Jarrett estimators.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    alpha : float, optional
        Confidence level of the intervals.
    axis : int or None, optional
        Axis along which to compute the quantiles.
        If None, use a flattened array.

    Returns
    -------
    ci_lower : ndarray
        The lower boundaries of the confidence interval.  Of the same length as
        `prob`.
    ci_upper : ndarray
        The upper boundaries of the confidence interval.  Of the same length as
        `prob`.

    """
    alpha = min(alpha, 1 - alpha)
    z = norm.ppf(1 - alpha/2.)
    xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
    smj = mjci(data, prob, axis=axis)
    return (xq - z * smj, xq + z * smj)
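# Hedged usage sketch: the same computation is exposed by scipy as
# scipy.stats.mstats.mquantiles_cimj, shown here on a synthetic skewed sample
# (the parameters below are illustrative only).
import numpy as np
from scipy.stats import mstats

data = np.random.default_rng(2).gamma(shape=2.0, scale=1.0, size=500)
ci_lower, ci_upper = mstats.mquantiles_cimj(data, prob=[0.25, 0.5, 0.75], alpha=0.05)
print(np.round(ci_lower, 3), np.round(ci_upper, 3))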
Example #9
0
def test_null_constrained():

    # Create a mixed population of Z-scores: 1000 standard normal and
    # 20 uniformly distributed between 3 and 4.
    grid = np.linspace(0.001, 0.999, 1000)
    z0 = norm.ppf(grid)
    z1 = np.linspace(3, 4, 20)
    zs = np.concatenate((z0, z1))

    for estimate_mean in False,True:
        for estimate_scale in False,True:
            for estimate_prob in False,True:

                emp_null = NullDistribution(zs, estimate_mean=estimate_mean,
                                            estimate_scale=estimate_scale,
                                            estimate_null_proportion=estimate_prob)

                if not estimate_mean:
                    assert_allclose(emp_null.mean, 0, atol=1e-5, rtol=1e-5)
                if not estimate_scale:
                    assert_allclose(emp_null.sd, 1, atol=1e-5, rtol=1e-2)
                if not estimate_prob:
                    assert_allclose(emp_null.null_proportion, 1, atol=1e-5, rtol=1e-2)

                # consistency check
                assert_allclose(emp_null.pdf(np.r_[-1, 0, 1]),
                                norm.pdf(np.r_[-1, 0, 1], loc=emp_null.mean,
                                         scale=emp_null.sd),
                                rtol=1e-13)
Example #10
0
    def summary(self, yname=None, xname=None, title=None, alpha=0.05):

        df = pd.DataFrame()

        df["Type"] = (["Mean"] * self.k_exog + ["Scale"] * self.k_scale +
                      ["Smooth"] * self.k_smooth + ["SD"] * self.k_noise)
        df["coef"] = self.params

        try:
            df["std err"] = np.sqrt(np.diag(self.cov_params()))
        except Exception:
            df["std err"] = np.nan

        from scipy.stats.distributions import norm
        df["tvalues"] = df.coef / df["std err"]
        df["P>|t|"] = 2 * norm.sf(np.abs(df.tvalues))

        f = norm.ppf(1 - alpha / 2)
        df["[%.3f" % (alpha / 2)] = df.coef - f * df["std err"]
        df["%.3f]" % (1 - alpha / 2)] = df.coef + f * df["std err"]

        df.index = self.model.data.param_names

        summ = summary2.Summary()
        if title is None:
            title = "Gaussian process regression results"
        summ.add_title(title)
        summ.add_df(df)

        return summ
Example #11
0
def sampleSize(stdev, tolerance, percentConfidence, printLatex=False):
    from scipy.stats.distributions import norm
    k = round(norm.ppf(0.5 + percentConfidence / 200., 0, 1) *
              100) / 100.  # 1.-(100-percentConfidence)/200.
    if printLatex:
        print('${0}^2\\frac{{{1}^2}}{{{2}^2}}$'.format(k, stdev, tolerance))
    return (k * stdev / tolerance)**2
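# Hedged worked example for sampleSize above: observations needed so that a
# 95 % confidence interval on the mean has half-width `tolerance`
# (k = z_0.975 rounded to 1.96, n = (k * stdev / tolerance)**2).
n = sampleSize(stdev=2.3, tolerance=0.5, percentConfidence=95)
print(round(n, 1))   # ~81.3, so about 82 observations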
Example #12
0
def test_null_constrained(estimate_mean, estimate_scale, estimate_prob):

    # Create a mixed population of Z-scores: 1000 standard normal and
    # 20 uniformly distributed between 3 and 4.
    grid = np.linspace(0.001, 0.999, 1000)
    z0 = norm.ppf(grid)
    z1 = np.linspace(3, 4, 20)
    zs = np.concatenate((z0, z1))

    emp_null = NullDistribution(zs,
                                estimate_mean=estimate_mean,
                                estimate_scale=estimate_scale,
                                estimate_null_proportion=estimate_prob)

    if not estimate_mean:
        assert_allclose(emp_null.mean, 0, atol=1e-5, rtol=1e-5)
    if not estimate_scale:
        assert_allclose(emp_null.sd, 1, atol=1e-5, rtol=1e-2)
    if not estimate_prob:
        assert_allclose(emp_null.null_proportion, 1, atol=1e-5, rtol=1e-2)

    # consistency check
    assert_allclose(emp_null.pdf(np.r_[-1, 0, 1]),
                    norm.pdf(np.r_[-1, 0, 1],
                             loc=emp_null.mean,
                             scale=emp_null.sd),
                    rtol=1e-13)
Example #13
0
def mquantiles_cimj(data, prob=[0.25, 0.50, 0.75], alpha=0.05, axis=None):
    """
    Computes the alpha confidence interval for the selected quantiles of the
    data, with Maritz-Jarrett estimators.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    alpha : float, optional
        Confidence level of the intervals.
    axis : int or None, optional
        Axis along which to compute the quantiles.
        If None, use a flattened array.

    Returns
    -------
    ci_lower : ndarray
        The lower boundaries of the confidence interval.  Of the same length as
        `prob`.
    ci_upper : ndarray
        The upper boundaries of the confidence interval.  Of the same length as
        `prob`.

    """
    alpha = min(alpha, 1 - alpha)
    z = norm.ppf(1 - alpha / 2.)
    xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
    smj = mjci(data, prob, axis=axis)
    return (xq - z * smj, xq + z * smj)
Example #14
0
	def regr_data_prep(self,kk,N_i=1):
		''' Regression data preparation via nested simulations
		'''
		import customML as cm

		# --- Computation budget allocation ---
		N_o = int(kk/N_i)

		# --- portfolio price @ t = \tau via Nested simulations---
		t0 = time.time()
		ran1 = npr.standard_normal((N_o,1))
		S1 = np.zeros((N_o,1))
		S1[:] = self.S0
		S1[:] = S1[:] * np.exp((self.mu - 0.5*self.sigma*self.sigma)*self.tau + \
								self.sigma * np.sqrt(self.tau) * ran1[:])

		ran2 = npr.standard_normal((N_o,N_i))
		S2 = np.zeros((N_o,N_i))
		S2[:,:] = np.dot(S1[:],np.ones((1,N_i))) * np.exp((self.rfr - 0.5*self.sigma*self.sigma)*(self.T-self.tau) \
						+ self.sigma * np.sqrt(self.T-self.tau) * ran2[:,:])

		prob0 = (1.-np.exp(-2.*(np.log(np.dot(S1[:],np.ones((1,N_i)))/(self.H[0]*np.ones((N_o,N_i))))\
			                   *np.log(S2[:,:]/(self.H[0]*np.ones((N_o,N_i))))/(self.sigma**2)/(self.T-self.tau))))\
		                       *(np.dot(S1[:],np.ones((1,N_i))) >= self.H[0]*np.ones((N_o,N_i))).astype(float)\
		                       *(S2[:,:] >= self.H[0]*np.ones((N_o,N_i))).astype(float)
		prob1 = (1.-np.exp(-2.*(np.log(np.dot(S1[:],np.ones((1,N_i)))/(self.H[1]*np.ones((N_o,N_i))))\
			                   *np.log(S2[:,:]/(self.H[1]*np.ones((N_o,N_i))))/(self.sigma**2)/(self.T-self.tau))))\
		                       *(np.dot(S1[:],np.ones((1,N_i))) >= self.H[1]*np.ones((N_o,N_i))).astype(float)\
		                       *(S2[:,:] >= self.H[1]*np.ones((N_o,N_i))).astype(float)
		prob2 = (1.-np.exp(-2.*(np.log(np.dot(S1[:],np.ones((1,N_i)))/(self.H[2]*np.ones((N_o,N_i))))\
			                   *np.log(S2[:,:]/(self.H[2]*np.ones((N_o,N_i))))/(self.sigma**2)/(self.T-self.tau))))\
		                       *(np.dot(S1[:],np.ones((1,N_i))) >= self.H[2]*np.ones((N_o,N_i))).astype(float)\
		                       *(S2[:,:] >= self.H[2]*np.ones((N_o,N_i))).astype(float)


		Vtau0 = np.dot((np.maximum(self.K[0]-S2[:,:],0)*prob0), np.ones((N_i,1))) / \
						float(N_i) * np.exp(-self.rfr*(self.T-self.tau))
		Vtau1 = np.dot((np.maximum(self.K[1]-S2[:,:],0)*prob1), np.ones((N_i,1))) / \
						float(N_i) * np.exp(-self.rfr*(self.T-self.tau))
		Vtau2 = np.dot((np.maximum(self.K[2]-S2[:,:],0)*prob2), np.ones((N_i,1))) / \
						float(N_i) * np.exp(-self.rfr*(self.T-self.tau))


		ValueTau = Vtau0*self.pos[0] + Vtau1*self.pos[1] + Vtau2*self.pos[2]

		t_ns = time.time() - t0

		# prediction samples
		#ran3 = norm(loc=0, scale=1).ppf(lhs(D, samples=I_pred))
		stratified_gaussian  = np.array([(i-0.5)/self.I_pred for i in range(1,self.I_pred+1)])
		ran3 = norm.ppf(stratified_gaussian[:,np.newaxis])
		S_pred = np.zeros((self.I_pred,1))
		S_pred[:] = self.S0
		S_pred[:] = S_pred[:] * np.exp((self.mu - 0.5*self.sigma*self.sigma)*self.tau +\
										self.sigma*np.sqrt(self.tau) * ran3[:])

		self.X = S1
		self.X_pred = S_pred
		self.y = ValueTau
Example #15
0
def test_sqrt_lasso():

    np.random.seed(234923)

    # Based on the example in the Belloni paper
    n = 100
    p = 500
    ii = np.arange(p)
    cx = 0.5**np.abs(np.subtract.outer(ii, ii))
    cxr = np.linalg.cholesky(cx)

    x = np.dot(np.random.normal(size=(n, p)), cxr.T)
    b = np.zeros(p)
    b[0:5] = [1, 1, 1, 1, 1]

    from scipy.stats.distributions import norm
    alpha = 1.1 * np.sqrt(n) * norm.ppf(1 - 0.05 / (2 * p))

    # Use very low noise level for a unit test
    y = np.dot(x, b) + 0.25 * np.random.normal(size=n)

    # At low noise levels, the sqrt lasso should be around a
    # factor of 3 from the oracle without refit, and should
    # almost equal the oracle with refit.
    expected_oracle = {False: 3, True: 1}

    # Used for regression testing
    expected_params = {
        False: np.r_[0.87397122, 0.96051874, 0.9905915, 0.93868953,
                     0.90771773],
        True: np.r_[0.95114241, 1.0302987, 1.01723074, 0.97587343, 0.99846403]
    }

    for refit in False, True:

        rslt = OLS(y, x).fit_regularized(method="sqrt_lasso",
                                         alpha=alpha,
                                         refit=refit)
        err = rslt.params - b
        numer = np.sqrt(np.dot(err, np.dot(cx, err)))

        oracle = OLS(y, x[:, 0:5]).fit()
        oracle_err = np.zeros(p)
        oracle_err[0:5] = oracle.params - b[0:5]
        denom = np.sqrt(np.dot(oracle_err, np.dot(cx, oracle_err)))

        # Check performance relative to oracle, should be around expected_oracle[refit]
        assert_allclose(numer / denom,
                        expected_oracle[refit],
                        rtol=0.5,
                        atol=0.1)

        # Regression test the parameters
        assert_allclose(rslt.params[0:5],
                        expected_params[refit],
                        rtol=1e-5,
                        atol=1e-5)
Example #16
0
def sampleSize(stdev,
               tolerance,
               percentConfidence,
               nRoundingDigits=None,
               printLatex=False):
    from scipy.stats.distributions import norm
    if nRoundingDigits is None:
        k = round(norm.ppf(0.5 + percentConfidence / 200., 0, 1),
                  2)  # 1.-(100-percentConfidence)/200.
    else:
        k = round(norm.ppf(0.5 + percentConfidence / 200., 0, 1),
                  nRoundingDigits)
        stdev = round(stdev, nRoundingDigits)
        tolerance = round(tolerance, nRoundingDigits)
    if printLatex:
        print('$z_{{{}}}^2\\frac{{s^2}}{{e^2}}={}^2\\frac{{{}^2}}{{{}^2}}$'.
              format(0.5 + percentConfidence / 200., k, stdev, tolerance))
    return (k * stdev / tolerance)**2
Example #17
0
def logrank_power(n, surv1, surv2, alpha=0.05):
    # Freedman-type approximation: d is the expected number of events given n
    # subjects per group and end-of-study survival fractions surv1 and surv2.
    d = n * (2 - surv1 - surv2)
    if surv1 == 1 or surv2 == 1:
        return 0
    elif surv1 == 0 or surv2 == 0:
        return -1
    phi = log(surv1) / log(surv2) if surv1 < surv2 else log(surv2) / log(surv1)
    z_a = norm.ppf(1 - alpha)
    z_1_beta = sqrt(d * (1 - phi) * (1 - phi) / (1 + phi) / (1 + phi)) - z_a
    return norm.cdf(z_1_beta)
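# Hedged usage sketch for logrank_power above; it assumes `log` and `sqrt`
# come from math and `norm` from scipy.stats, as in the source module.
from math import log, sqrt
from scipy.stats import norm

power = logrank_power(n=100, surv1=0.5, surv2=0.7, alpha=0.05)
print(round(power, 3))   # roughly 0.89 for these inputs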
Example #18
0
def getdprime(A_correct, A_total, B_correct, B_total, corrected):
    if corrected == True:
        if A_correct == A_total:
            tA = 1 - 1/(2*A_total)
        elif A_correct == 0:
            tA = 1 / (2*A_total)
        else:
            tA = A_correct/(A_total)
        
        if B_correct == B_total:
            tB = 1 - 1/(2*B_total)
        elif B_correct == 0:
            tB = 1 / (2*B_total)
        else:
            tB = B_correct/(B_total)
    else:
        tA = A_correct/(A_total)
        tB = B_correct/(B_total)
    dp = norm.ppf(tA) - norm.ppf(1-(tB))
    return dp
Example #19
0
def getdprime(A_correct, A_total, B_correct, B_total, corrected):
    if corrected == True:
        if A_correct == A_total:
            tA = 1 - 1 / (2 * A_total)
        elif A_correct == 0:
            tA = 1 / (2 * A_total)
        else:
            tA = A_correct / (A_total)

        if B_correct == B_total:
            tB = 1 - 1 / (2 * B_total)
        elif B_correct == 0:
            tB = 1 / (2 * B_total)
        else:
            tB = B_correct / (B_total)
    else:
        tA = A_correct / (A_total)
        tB = B_correct / (B_total)
    dp = norm.ppf(tA) - norm.ppf(1 - (tB))
    return dp
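# Hedged worked example for getdprime above (norm is scipy.stats.norm): with a
# 0.9 hit rate on A and a 0.8 correct-rejection rate on B, the false-alarm
# rate is 0.2 and d' = norm.ppf(0.9) - norm.ppf(0.2), roughly 2.12.
dprime = getdprime(A_correct=45, A_total=50, B_correct=40, B_total=50, corrected=True)
print(round(dprime, 3))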
Example #20
0
def statsmodels_to_results(model):
    """
    Convert statsmodels summary to a dataframe.

    Parameters
    ----------
    model : statsmodels model output
        The output of a statsmodels analysis. For example rlm or mixedlm.

    Returns
    -------
    df : Pandas dataframe.
    """
    from statsmodels.regression.mixed_linear_model import MixedLMResultsWrapper
    from scipy.stats.distributions import norm
    df = summary_to_dataframe(model.summary())
    # deal with numerical precision loss in at least some of the values
    for col, attr in _REPLACEMENTS:
        if col in df.columns:
            df[col] = getattr(model, attr, df[col])

    # This one messes up the standard error and quartiles, too
    if isinstance(model, MixedLMResultsWrapper):
        sl = slice(model.k_fe)
        mu = np.asarray(df.iloc[sl, df.columns == 'Coef.'])[:, 0]
        # Adapted from statsmodels, see
        # https://github.com/statsmodels/statsmodels/blob/master/statsmodels/regression/mixed_linear_model.py#L2710-L2736  # noqa: E501
        stderr = np.sqrt(np.diag(model.cov_params()[sl]))
        df.iloc[sl, df.columns == 'Std.Err.'] = stderr
        # Confidence intervals
        qm = -norm.ppf(0.05 / 2)
        df.iloc[sl, df.columns == '[0.025'] = mu - qm * stderr
        df.iloc[sl, df.columns == '0.975]'] = mu + qm * stderr
        # All random effects variances and covariances
        sdf = np.zeros((model.k_re2 + model.k_vc, 2))
        jj = 0
        for i in range(model.k_re):
            for j in range(i + 1):
                sdf[jj, 0] = np.asarray(model.cov_re)[i, j]
                sdf[jj, 1] = np.sqrt(model.scale) * model.bse[model.k_fe + jj]
                jj += 1

        # Variance components
        for i in range(model.k_vc):
            sdf[jj, 0] = model.vcomp[i]
            sdf[jj, 1] = np.sqrt(model.scale) * model.bse[model.k_fe + jj]
            jj += 1

        df.iloc[model.k_fe:, df.columns == 'Coef.'] = sdf[:, 0]
        df.iloc[model.k_fe:, df.columns == 'Std.Err.'] = sdf[:, 1]

    df = expand_summary_dataframe(df)
    return df
Example #21
0
def _calculate_ci_approx(p,sigma,n):
    """Return index j and k that correspond to confidence interval
    of level sigma for percentile p*100 along with the respective
    confidence level

    Large n approximation
    """
    nu = norm.ppf((1+sigma)/2)*np.sqrt(p*(1-p))
    # print(nu)
    j = np.floor(n*p-nu*np.sqrt(n))
    k = np.ceil(n*p+nu*np.sqrt(n))
    return (j,k,sigma)
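# Hedged usage sketch for _calculate_ci_approx above (norm from scipy.stats,
# np is numpy): order-statistic indices bracketing an approximate 95 % CI for
# the median of n = 1000 observations.
j, k, sigma = _calculate_ci_approx(p=0.5, sigma=0.95, n=1000)
print(j, k)   # sort the sample and take elements j and k as the CI bounds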
Example #22
0
def test_sqrt_lasso():

    np.random.seed(234923)

    # Based on the example in the Belloni paper
    n = 100
    p = 500
    ii = np.arange(p)
    cx = 0.5 ** np.abs(np.subtract.outer(ii, ii))
    cxr = np.linalg.cholesky(cx)

    x = np.dot(np.random.normal(size=(n, p)), cxr.T)
    b = np.zeros(p)
    b[0:5] = [1, 1, 1, 1, 1]

    from scipy.stats.distributions import norm
    alpha = 1.1 * np.sqrt(n) * norm.ppf(1 - 0.05 / (2 * p))

    # Use very low noise level for a unit test
    y = np.dot(x, b) + 0.25 * np.random.normal(size=n)

    # At low noise levels, the sqrt lasso should be around a
    # factor of 3 from the oracle without refit, and should
    # almost equal the oracle with refit.
    expected_oracle = {False: 3, True: 1}

    # Used for regression testing
    expected_params = {False: np.r_[0.87397122, 0.96051874, 0.9905915 , 0.93868953, 0.90771773],
                       True: np.r_[0.95114241, 1.0302987 , 1.01723074, 0.97587343, 0.99846403]}

    for refit in False, True:

        rslt = OLS(y, x).fit_regularized(method="sqrt_lasso", alpha=alpha, refit=refit)
        err = rslt.params - b
        numer = np.sqrt(np.dot(err, np.dot(cx, err)))

        oracle = OLS(y, x[:, 0:5]).fit()
        oracle_err = np.zeros(p)
        oracle_err[0:5] = oracle.params - b[0:5]
        denom = np.sqrt(np.dot(oracle_err, np.dot(cx, oracle_err)))

        # Check performance relative to oracle, should be around expected_oracle[refit]
        assert_allclose(numer / denom, expected_oracle[refit],
             rtol=0.5, atol=0.1)

        # Regression test the parameters
        assert_allclose(rslt.params[0:5], expected_params[refit],
                rtol=1e-5, atol=1e-5)
Example #23
0
def confidenceInterval(mean, stdev, nSamples, percentConfidence, trueStd = True, printLatex = False):
    '''if trueStd, use normal distribution, otherwise, Student

    Use otherwise t.interval or norm.interval
    ex: norm.interval(0.95, loc = 0., scale = 2.3/sqrt(11))
    t.interval(0.95, 10, loc=1.2, scale = 2.3/sqrt(nSamples))
    loc is mean, scale is sigma/sqrt(n) (for Student, 10 is df)'''
    from math import sqrt
    from scipy.stats.distributions import norm, t
    if trueStd:
        k = round(norm.ppf(0.5+percentConfidence/200., 0, 1)*100)/100. # 1.-(100-percentConfidence)/200.
    else: # use Student
         k = round(t.ppf(0.5+percentConfidence/200., nSamples-1)*100)/100.
    e = k*stdev/sqrt(nSamples)
    if printLatex:
        print('${0} \\pm {1}\\frac{{{2}}}{{\\sqrt{{{3}}}}}$'.format(mean, k, stdev, nSamples))
    return mean-e, mean+e
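# Hedged usage sketch for confidenceInterval above: a 95 % normal-based
# interval around a sample mean (k is rounded to 1.96 inside the helper).
low, high = confidenceInterval(mean=1.2, stdev=2.3, nSamples=11, percentConfidence=95)
print(round(low, 3), round(high, 3))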
Example #24
0
def test_local_fdr():

    # Create a mixed population of Z-scores: 1000 standard normal and
    # 20 uniformly distributed between 3 and 4.
    grid = np.linspace(0.001, 0.999, 1000)
    z0 = norm.ppf(grid)
    z1 = np.linspace(3, 4, 20)
    zs = np.concatenate((z0, z1))

    # Exact local FDR for U(3, 4) component.
    f1 = np.exp(-z1**2 / 2) / np.sqrt(2 * np.pi)
    r = len(z1) / float(len(z0) + len(z1))
    f1 /= (1 - r) * f1 + r

    fdr = local_fdr(zs)
    fdr1 = fdr[len(z0):]

    assert_allclose(f1, fdr1, rtol=0.05, atol=0.1)
Example #25
0
def test_local_fdr():

    # Create a mixed population of Z-scores: 1000 standard normal and
    # 20 uniformly distributed between 3 and 4.
    grid = np.linspace(0.001, 0.999, 1000)
    z0 = norm.ppf(grid)
    z1 = np.linspace(3, 4, 20)
    zs = np.concatenate((z0, z1))

    # Exact local FDR for U(3, 4) component.
    f1 = np.exp(-z1**2 / 2) / np.sqrt(2*np.pi)
    r = len(z1) / float(len(z0) + len(z1))
    f1 /= (1 - r) * f1 + r

    fdr = local_fdr(zs)
    fdr1 = fdr[len(z0):]

    assert_allclose(f1, fdr1, rtol=0.05, atol=0.1)
Example #26
0
def avg_with_error(_d):
    try:
        d = list(map(float, _d))
    except Exception:
        print(_d)
        raise
    if has_confidence:
        n = len(d)
        avg = mean(d)
        sd = std(d)
        alpha = 1.0 - confidence
        intv = norm.ppf(1.0 - alpha / 2.0) * (sd / sqrt(n))
        return (avg, intv)
    else:
        n = len(d)
        if n > 0:
            return (sum(d) / float(n), 0.0)
        else:
            return (0.0, 0.0)
Example #27
0
def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
    """Computes the alpha confidence interval for the selected quantiles of the
    data, with Maritz-Jarrett estimators.
    
:Input:
    data : sequence
        Input data.
    prob : sequence *[0.25,0.5,0.75]*
        Sequence of quantiles whose standard error must be estimated.
    alpha : float *[0.05]*
        Confidence degree.
    axis : integer *[None]*
        Axis along which to compute the standard error.
    """
    alpha = min(alpha, 1-alpha)
    z = norm.ppf(1-alpha/2.)
    xq = mquantiles(data, prob, alphap=0, betap=0, axis=axis)
    smj = mjci(data, prob, axis=axis)
    return (xq - z * smj, xq + z * smj)
Example #28
0
def avg_with_error(_d):
    try:
        d = list(map(float, _d))
    except Exception:
        print(_d)
        raise
    if has_confidence:
        n = len(d)
        avg = mean(d)
        sd = std(d)
        alpha = 1.0 - confidence
        intv = norm.ppf(1.0 - alpha / 2.0) * (sd / sqrt(n))
        return (avg, intv)
    else:
        n = len(d)
        if n > 0:
            return (sum(d) / float(n), 0.0)
        else:
            return (0.0, 0.0)
Example #29
0
def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
    """Computes the alpha confidence interval for the selected quantiles of the
data, with Maritz-Jarrett estimators.

Parameters
----------
    data: ndarray
        Data array.
    prob: sequence
        Sequence of quantiles to compute.
    alpha : float
        Confidence level of the intervals.
    axis : integer
        Axis along which to compute the quantiles. If None, use a flattened array.
    """
    alpha = min(alpha, 1-alpha)
    z = norm.ppf(1-alpha/2.)
    xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
    smj = mjci(data, prob, axis=axis)
    return (xq - z * smj, xq + z * smj)
Example #30
0
def mquantiles_cimj(data, prob=[0.25, 0.50, 0.75], alpha=0.05, axis=None):
    """Computes the alpha confidence interval for the selected quantiles of the
data, with Maritz-Jarrett estimators.

Parameters
----------
    data: ndarray
        Data array.
    prob: sequence
        Sequence of quantiles to compute.
    alpha : float
        Confidence level of the intervals.
    axis : integer
        Axis along which to compute the quantiles. If None, use a flattened array.
    """
    alpha = min(alpha, 1 - alpha)
    z = norm.ppf(1 - alpha / 2.)
    xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
    smj = mjci(data, prob, axis=axis)
    return (xq - z * smj, xq + z * smj)
Example #31
0
def confidenceInterval(mean,
                       stdev,
                       nSamples,
                       percentConfidence,
                       trueStd=True,
                       printLatex=False):
    '''if trueStd, use normal distribution, otherwise, Student

    Use otherwise t.interval or norm.interval
    ex: norm.interval(0.95, loc = 0., scale = 2.3/sqrt(11))
    t.interval(0.95, 10, loc=1.2, scale = 2.3/sqrt(nSamples))
    loc is mean, scale is sigma/sqrt(n) (for Student, 10 is df)'''
    from math import sqrt
    from scipy.stats.distributions import norm, t
    if trueStd:
        k = round(norm.ppf(0.5 + percentConfidence / 200., 0, 1), 2)
    else:  # use Student
        k = round(t.ppf(0.5 + percentConfidence / 200., nSamples - 1), 2)
    e = k * stdev / sqrt(nSamples)
    if printLatex:
        print('${0} \\pm {1}\\frac{{{2}}}{{\\sqrt{{{3}}}}}$'.format(
            mean, k, stdev, nSamples))
    return mean - e, mean + e
Example #32
0
    def __init__(self,
                 n_normal,
                 normal_max_value,
                 p_zeros,
                 rhos,
                 i_normal=None,
                 i_ps=None):
        """
        Constructor.

        The Gaussian Conditional Independence Model for Credit Risk
        Reference: https://arxiv.org/abs/1412.1183

        Args:
            n_normal (int): number of qubits to represent the latent normal random variable Z
            normal_max_value (float): min/max value to truncate the latent normal random variable Z
            p_zeros (list or array): standard default probabilities for each asset
            rhos (list or array): sensitivities of default probability of assets with respect to latent variable Z
            i_normal (list or array): indices of qubits to represent normal variable
            i_ps (list or array): indices of qubits to represent asset defaults
        """
        self.n_normal = n_normal
        self.normal_max_value = normal_max_value
        self.p_zeros = p_zeros
        self.rhos = rhos
        self.K = len(p_zeros)
        num_qubits = [n_normal] + [1] * self.K

        # set and store indices
        if i_normal is not None:
            self.i_normal = i_normal
        else:
            self.i_normal = range(n_normal)

        if i_ps is not None:
            self.i_ps = i_ps
        else:
            self.i_ps = range(n_normal, n_normal + self.K)

        # get normal (inverse) CDF and pdf
        F = lambda x: norm.cdf(x)
        F_inv = lambda q: norm.ppf(q)
        f = lambda x: norm.pdf(x)

        # set low/high values
        low = [-normal_max_value] + [0] * self.K
        high = [normal_max_value] + [1] * self.K

        # call super constructor
        super().__init__(num_qubits, low=low, high=high)

        # create normal distribution
        self._normal = NormalDistribution(n_normal, 0, 1, -normal_max_value,
                                          normal_max_value)

        # create linear rotations for conditional defaults
        self._slopes = np.zeros(self.K)
        self._offsets = np.zeros(self.K)
        self._rotations = []
        for k in range(self.K):

            psi = F_inv(p_zeros[k]) / np.sqrt(1 - rhos[k])

            # compute slope / offset
            slope = -np.sqrt(rhos[k]) / np.sqrt(1 - rhos[k])
            slope *= f(psi) / np.sqrt(1 - F(psi)) / np.sqrt(F(psi))
            offset = 2 * np.arcsin(np.sqrt(F(psi)))

            # adjust for integer to normal range mapping
            offset += slope * (-normal_max_value)
            slope *= 2 * normal_max_value / (2**n_normal - 1)

            self._offsets[k] = offset
            self._slopes[k] = slope

            lry = LinearYRotation(slope,
                                  offset,
                                  n_normal,
                                  i_state=self.i_normal,
                                  i_target=self.i_ps[k])
            self._rotations += [lry]
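# Hedged classical sketch (not Qiskit code): the linear Y-rotations above
# approximate, for each asset k, the conditional default probability of the
# Gaussian conditional independence model,
#   p_k(z) = Phi((Phi^{-1}(p_k) - sqrt(rho_k) * z) / sqrt(1 - rho_k)),
# evaluated on the truncated, discretized latent variable Z. Parameter values
# below are illustrative only.
import numpy as np
from scipy.stats import norm

n_normal, normal_max_value = 2, 2.0
p_zeros, rhos = [0.15, 0.25], [0.1, 0.05]

z_grid = np.linspace(-normal_max_value, normal_max_value, 2 ** n_normal)
for p0, rho in zip(p_zeros, rhos):
    p_cond = norm.cdf((norm.ppf(p0) - np.sqrt(rho) * z_grid) / np.sqrt(1 - rho))
    print(np.round(p_cond, 4))   # default probability at each grid point of Z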
Example #33
0
    Pairs = [SFS, SC0, SC1, SC2, SObsr, SLHD, SMHD, SESD, SLC]
    PairsLength = [len(SFS), len(SC0), len(SC1), len(SC2), len(SObsr), len(SLHD), len(SMHD), len(SESD), len(SLC)]
    Combinations = len(SFS)*len(SC0)*len(SC1)*len(SC2)*len(SObsr)*len(SLHD)*len(SMHD)*len(SESD)*len(SLC)
#=================================================================================
# Latin Hypercube Sampling Design
# Candidate sets
    sam = 300 # sample size
    LHsets = lhs(9, samples = sam)      				# Generate candidate sets            
    LHsets = norm(loc=0, scale=1).ppf(LHsets)       # Normalized the value of candidate sets to N(0,1)
    for i in range(9):                             # Substitute the LH matrix to real value
        Range = Pairs[i]
        N = PairsLength[i]
        Prob = 1/N
        Interval = []
        for k in range(N-1):
            Interval.append(norm.ppf(Prob*(k+1)))
        for j in range(sam):
            for q in range(N-1):
                if Interval[q] > LHsets[j,i]:
                    LHsets[j,i] = Range[q]
                    break
                if q == N-2:
                    LHsets[j,i] = Range[N-1]
#=================================================================================
# Groundtruth Travel Time Data    
	# INRIX Travel Time Data
    Data = xlrd.open_workbook('Your Path')
    Table = Data.sheet_by_name(u'Sheet1')
    I1 = Table.col_values(0) # Travel time for segment 1
    I2 = Table.col_values(1) # Travel time for segment 2
#=================================================================================  
Example #34
0
def real_power(ms, mus, alpha=0.05):
    za = norm.ppf(1-alpha/2)
    sms = sqrt(ms)
    true_hits = (mus != 0).astype(double)
    return true_hits.dot(norm.cdf(-za+sms*mus) + norm.cdf(-za-sms*mus))
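# Hedged usage sketch for real_power above; it assumes the snippet's module
# imports (norm from scipy.stats, sqrt and double from numpy). The result is
# the expected number of true discoveries among the nonzero means in `mus`
# when each is tested with a two-sided level-alpha Z test on ms observations.
import numpy as np
mus = np.array([0.0, 0.3, 0.5, 0.0, 0.8])
print(real_power(ms=50.0, mus=mus, alpha=0.05))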
Example #35
0
 def F_inv(x): return norm.ppf(x)
 def f(x): return norm.pdf(x)
Example #36
0
    def quantile_ci(self, p, alpha=0.05, method='cloglog'):
        """
        Returns a confidence interval for a survival quantile.

        Parameters
        ----------
        p : float
            The probability point for which a confidence interval is
            determined.
        alpha : float
            The confidence interval has nominal coverage probability
            1 - `alpha`.
        method : str
            Function to use for g-transformation, must be ...

        Returns
        -------
        lb : float
            The lower confidence limit.
        ub : float
            The upper confidence limit.

        Notes
        -----
        The confidence interval is obtained by inverting Z-tests.  The
        limits of the confidence interval will always be observed
        event times.

        References
        ----------
        The method is based on the approach used in SAS, documented here:

          http://support.sas.com/documentation/cdl/en/statug/68162/HTML/default/viewer.htm#statug_lifetest_details03.htm
        """

        tr = norm.ppf(1 - alpha / 2)

        method = method.lower()
        if method == "cloglog":
            g = lambda x: np.log(-np.log(x))
            gprime = lambda x: -1 / (x * np.log(x))
        elif method == "linear":
            g = lambda x: x
            gprime = lambda x: 1
        elif method == "log":
            g = lambda x: np.log(x)
            gprime = lambda x: 1 / x
        elif method == "logit":
            g = lambda x: np.log(x / (1 - x))
            gprime = lambda x: 1 / (x * (1 - x))
        elif method == "asinsqrt":
            g = lambda x: np.arcsin(np.sqrt(x))
            gprime = lambda x: 1 / (2 * np.sqrt(x) * np.sqrt(1 - x))
        else:
            raise ValueError("unknown method")

        r = g(self.surv_prob) - g(1 - p)
        r /= (gprime(self.surv_prob) * self.surv_prob_se)

        ii = np.flatnonzero(np.abs(r) <= tr)
        if len(ii) == 0:
            return np.nan, np.nan

        lb = self.surv_times[ii[0]]

        if ii[-1] == len(self.surv_times) - 1:
            ub = np.inf
        else:
            ub = self.surv_times[ii[-1] + 1]

        return lb, ub
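# Hedged usage sketch: this method belongs to a right-censored survival-function
# object (it matches statsmodels' SurvfuncRight.quantile_ci); assuming that
# class, a confidence interval for the median survival time looks like this.
# The time/status arrays are made-up illustration data.
import numpy as np
import statsmodels.api as sm

time = np.array([3, 5, 6, 7, 8, 10, 12, 15, 18, 21], dtype=float)
status = np.array([1, 1, 0, 1, 1, 1, 0, 1, 1, 1])   # 1 = event, 0 = censored
sf = sm.SurvfuncRight(time, status)
print(sf.quantile_ci(0.5, alpha=0.05, method='cloglog'))   # (lb, ub) for the median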
Example #37
0
def __sample_with_corr(mean_values,
                       std_dev,
                       desired_corr,
                       num_samples,
                       distro='normal'):
    """
    Randomly samples from a multivariate normal distribution using LHS while attempting to achieve desired_corr

    Parameters
    ----------
    mean_values
    std_dev
    desired_corr
    num_samples
    distro : str
        normal, lognormal (no proper handling of corr conversion)

    Returns
    -------
    zb : ndarray
        Array of shape (num_samples, num_vars) containing the correlated samples.

    """

    # raise Exception("This method is deprecated please use sample_with_corr")

    # draw samples in an uncorrelated manner
    num_vars = len(mean_values)
    samples = lhs_normal_sample(num_samples, np.zeros(num_vars),
                                np.ones(num_vars))
    # samples = lhs_uniform_sample(num_vars, num_samples)

    # Cholesky-like decomposition for non-PD matrices.
    T = np.corrcoef(samples.T)
    # this decomposition might be right but it's used wrong..
    # permutation, Q, e = gmw_cholesky(T)

    Q = np.linalg.cholesky(T)

    # this matrix has the same correlation as the desired RStar.
    # It is known to be PD since any neg eigenvalues were removed already.
    # this can be changed to using gmw_cholesky to be more general though.
    P = np.linalg.cholesky(desired_corr)

    dependent_samples = np.dot(samples, np.dot(P, np.linalg.inv(Q)).T)

    # for il=1:ntry
    #     for j=1:nvar
    #         % rank RB
    #         [r,id]=ranking(RB(:,j));
    #         % sort R
    #         [RS,id]=sort(R(:,j));
    #         % permute RS so has the same rank as RB
    #         z(:,j) = RS(r).*xsd(j)+xmean(j);
    #     end
    #     ae=sum(sum(abs(corrcoef(z)-corr)));
    #     if(ae<amin)
    #         zb=z;
    #         amin=ae;
    #     end
    # end

    ntry = 1
    amin = 1.8e308
    z = np.zeros(np.shape(samples))
    for il in range(ntry):
        for j in range(num_vars):
            r = np.argsort(dependent_samples[:, j])
            rank = np.zeros(np.shape(r), dtype=int)
            rank[r] = np.array(range(num_samples))
            rs = np.sort(samples[:, j])
            z[:, j] = np.multiply(rs[rank], std_dev[j]) + mean_values[j]

        ae = np.abs(np.corrcoef(z.T) - desired_corr).sum().sum()

        if ae < amin:
            zb = z
            amin = ae
        else:
            raise Exception('Could not order samples ae={0}'.format(ae))

    # zb are the uniform correlated samples, now transform them to desired
    #
    # transform the uniform sample about the mean to the unit interval
    for i in range(num_vars):
        zb[:, i] = (zb[:, i] - min(zb[:, i]))
        zb[:, i] = zb[:, i] / max(zb[:, i])

        slightly_lt0 = zb[:, i] <= 0.0  # + 1e-5
        slightly_gt1 = zb[:, i] >= 1.0  # - 1e-5

        zb[slightly_lt0, i] = 1e-10  # 1e-5
        zb[slightly_gt1, i] = 1 - 1e-10  # 1.0 - 1e-5

    distro = distro.lower()

    # using the desired distro's ppf, sample the distro with the correlated uniform sample
    for i in range(num_vars):
        # create a norm distro with mean/std_dev then sample from it using percent point func (inv of cdf percentiles)
        if distro == 'normal':
            zb[:, i] = norm.ppf(zb[:, i], loc=mean_values[i], scale=std_dev[i])
        elif distro == 'lognormal':
            # using mu/sigma from wiki + the scipy convention of loc and scale to specify the mean and sigma
            mean = np.log(mean_values[i] /
                          (1 + std_dev[i]**2 / mean_values[i]**2)**0.5)
            sigma = (np.log(1 + std_dev[i]**2 / mean_values[i]**2))**0.5
            zb[:, i] = np.exp(norm.ppf(zb[:, i], loc=mean, scale=sigma))
        elif distro == 'uniform':
            zb[:, i] = uniform.ppf(zb[:, i],
                                   loc=mean_values[i],
                                   scale=std_dev[i])
        else:
            raise Exception(
                "Distro {0} not supported at the moment".format(distro))

    return zb
Example #38
0
def plot_fdc(series, multimode=True, plot_enso=False,
             starting_month=None, lag=6,
             scale='log', xmin=0.0005, xmax=0.9995, ax=None, **kwargs):
    """
    Plots one or several flow duration curves (FDCs) for the series.
    
    The input series should be 1D or 2D.
    By default, if the series is 1D, one curve only will be plotted, whereas if
    the series is 2D, a curve will be plotted for each line of the series.
    
    A 1D series can also be converted into an annual series
    with the :keyword:`starting_month` parameter. 
    In that case, ``starting_month`` should be an integer between 1 and 12
    precising the month at which the 12-month period should start.
    For example, to plot the FDCs for each water year (usually from April
    to the following March), use ``starting_month=4``.
    
    When ``enso=True``, ENSO phases are plotted with different colors. 
    When the series is 2D or if it has been converted to an annual frequency, 
    the ENSO indices are defined with the ``full_year=True`` option, where an ENSO
    episode lasts at least 12 consecutive months.

    Parameters
    ----------
    series : TimeSeries
        Flow data.
    ax : {None, :class:`matplotlib.axes.Axes`}, optional
        Subplot where to plot the flow duration curves.
        If None, use the current plot.
    multimode : {True, False}, optional
        Whether to interpret a 2D input series as several series or a single one.
    starting_month : {None, integer}, optional
        First month of each year.
        If None, plots the global flow duration curve.
        Otherwise, ``starting_month`` must be an integer between 1 and 12,
        corresponding to the first month of the water year
        (usually, 4 for April).
    plot_enso : {True, False}, optional
        Whether to plot each ENSO phase with a different color.
    lag : {integer}, optional
        Number of months of lag for the definition of ENSO indices. For example,
        if lag=6, the ENSO phase starting in Oct. 2001 is applied starting on 
        Apr. 2002.
        If None, use a lag computed as the time difference between ``starting_month``
        and the first month of the reference season of the ENSO indicator (or October
        if undefined).
    scale : {'log','lin'}, optional
        String indicating whether the x-axis is in log (``'log'``) or linear
        (``'lin'``) scale.
        If ``'log'``, each plotting position is expressed as a Gaussian pdf.
    other parameters :
        The parameters recognized by the :func:`matplotlib.pyplot.plot` function are 
        also recognized.

    Raises
    ------
    TypeError
        If ``plot_enso=True`` but the series is not a
        :class:`~scikits.hydroclimpy.enso.ClimateSeries`.

    ValueError
        * If ``starting_month`` is not between 1 and 12.
        * If ``starting_month`` is defined but the initial series is not 1D.
    """
    if ax is None:
        ax = gca()
    # Make sure we have at most a 2D series ...............
    if series.ndim > 2:
        raise ValueError("The input series should be 2D at most!")
    # Get the ENSO indicator associated w/ the series (if any)
    ensoindicator = getattr(series, 'ensoindicator', None)
    # Check the starting month ............................
    if starting_month is not None:
        # Make sure we have an integer between 1 and 12
        starting_month = int(starting_month)
        if (starting_month < 1) or (starting_month > 12):
            errmsg = "The starting month should be between 1 (Jan.) and "\
                     "12 (Dec.)! (got %s instead)" % starting_month
            raise ValueError(errmsg)

    # Check whether we need to plot the ENSO information ..
    if plot_enso is True:
        # Make sure we have some ENSO information .........
        if ensoindicator is None:
            errmsg = "No ENSO information is associated with the input series."
            raise InvalidENSOError(errmsg)
        # Reset the indices if we have a starting_month ...
        if starting_month is not None:
            if lag is None:
                refmonth = (ensoindicator.reference_season or [10, ])[0]
                lag = (starting_month + 12 - refmonth) % 12
            series.set_ensoindices(full_year=True, lag=lag)
        else:
            # Make sure that the indices are already set
            series.set_ensoindices()
        # Load the default marker colors ..................
        from scikits.hydroclimpy.plotlib.ensotools import ENSOlines, \
                                                          ENSOmarkers, \
                                                          ENSOlabels
    # No ENSO information to plot : get basic lines & markers
    else:
        ENSOlines = {'G':'#cccccc'}
        ENSOmarkers = {'G':'#cccccc'}
    # Check whether we are in multimode or not ............
    ## 1D input
    if series.ndim == 1:
        # Convert to annual if needed
        if starting_month:
            multimode = True
            series = series.convert(FR_ANNSTART[starting_month - 1], func=None)
        else:
            multimode = False
        _series = series.view(ma.MaskedArray)
    ## 2D input
    else:
        #  w/ starting month
        if starting_month is not None:
            errmsg = "The input series should be 2D! (got %s instead)"
            raise ValueError(errmsg % str(series.shape))
        # w/o multimode
        if not multimode:
            _series = series.view(ma.MaskedArray).ravel()
        else:
            _series = series.view(ma.MaskedArray)
    # Get the number of valid data per year (ie, per row)
    n = _series.count(axis= -1)
    # Get the xdata .........
    scale = scale[:3].lower()
    if scale == 'lin':
        if multimode:
            xdata = [np.linspace(1. / (nx + 1), 1 - 1. / (nx + 1), nx)
                     for nx in n]
        else:
            xdata = np.linspace(1. / (n + 1), 1 - 1. / (n + 1), n)
#            xdata = ma.empty(len(series), dtype=float)
#            xdata[:n] = np.linspace(1. / (n + 1), 1 - 1. / (n + 1), n)
    elif scale == 'log':
        if multimode:
            xdata = [norm.ppf(np.linspace(1. / (nx + 1), 1 - 1. / (nx + 1), nx))
                     for nx in n]
        else:
            xdata = norm.ppf(np.linspace(1. / (n + 1), 1 - 1. / (n + 1), n))
#            xdata = ma.empty(len(series), dtype=float)
#            xdata[:n] = norm.ppf(np.linspace(1. / (n + 1), 1 - 1. / (n + 1), n))
    else:
        raise ValueError("Unrecognized option '%s' for scale: "
                         "should be in ['lin','log']" % scale)
    # Get some defaults .....
    if multimode:
        lwdefault = 0.8
        zorderdefault = 3
        colordefault = ENSOlines['G']
    else:
        lwdefault = 2
        zorderdefault = 10
        colordefault = 'k'
    marker = kwargs.pop('marker', 'o')
    markersize = kwargs.get('markersize', kwargs.get('ms', 3))
    lw = kwargs.pop('linewidth', kwargs.pop('lw', lwdefault))
    zorder = kwargs.pop('zorder', zorderdefault)
    color = kwargs.pop('color', kwargs.pop('c', colordefault))

    # Multi-mode : one line per year ......................
    if multimode:
        if plot_enso:
            ensoindices = series.ensoindices
            if ensoindices.ndim > 1:
                ensoindices = ensoindices[:, 0]
            # ENSO mode : different colors for different phases
#            eidx = series.ensoindices._data
#            # Take the first column if it's 2D
#            if eidx.ndim > 1:
#                eidx=eidx[:,0]
            for(i, attr) in zip((-1, 0, 1), ('cold', 'neutral', 'warm')):
                key = attr[0].upper()
                label = ENSOlabels[key]
                ydata = series[ensoindices == i]
                ydata = [np.sort(_).compressed()[::-1] for _ in ydata]
#                ydata = np.sort(getattr(series, attr).compressed())[::-1]
                points = [list(zip(x, y)) for (x, y) in zip(xdata, ydata)]
                collec = LineCollection(points,
                                        label=ENSOlabels[key],
                                        color=ENSOlines[key],
                                        zorder=zorder, linewidth=lw)
                ax.add_collection(collec, autolim=True)
        else:
            ydata = [np.sort(y.compressed())[::-1] for y in _series]
            points = [list(zip(x, y)) for (x, y) in zip(xdata, ydata)]
            label = kwargs.pop('label', None)
            collec = LineCollection(points, label=label, linewidth=lw,
                                    colors=ENSOlines['G'])
            ax.add_collection(collec, autolim=True)
    # One line for the while dataset ......................
    else:
        ydata = ma.sort(series.compressed(), endwith=False)[::-1]
        points = [list(zip(xdata, ydata._series))]
        label = kwargs.pop('label', 'none')
        collec = LineCollection(points, label=label, linewidth=lw,
                                colors=color, zorder=zorder)
        ax.add_collection(collec, autolim=True)
        # If we need to add some colors
        if plot_enso and marker:
            for attr in ('cold', 'neutral', 'warm'):
                key = attr[0].upper()
                label = ENSOlabels[key]
                color = ENSOmarkers[key]
                #ydata = ma.sort(getattr(series, attr), endwith=False)[::-1]
                current = getattr(ydata, attr)._series
                _fdc = ax.plot(xdata, current, ls='', lw=0,
                               marker=marker, ms=markersize,
                               mfc=color, mec=color,
                               label=label, zorder=zorder)
    #........................
    set_normal_limits(ax, xmin=xmin, xmax=xmax, scale=scale)
    ax.set_ylim(_series.min(), _series.max())
    return ax
Example #39
0
    def quantile_ci(self, p, alpha=0.05, method='cloglog'):
        """
        Returns a confidence interval for a survival quantile.

        Parameters
        ----------
        p : float
            The probability point for which a confidence interval is
            determined.
        alpha : float
            The confidence interval has nominal coverage probability
            1 - `alpha`.
        method : string
            Function to use for g-transformation, must be ...

        Returns
        -------
        lb : float
            The lower confidence limit.
        ub : float
            The upper confidence limit.

        Notes
        -----
        The confidence interval is obtained by inverting Z-tests.  The
        limits of the confidence interval will always be observed
        event times.

        References
        ----------
        The method is based on the approach used in SAS, documented here:

          http://support.sas.com/documentation/cdl/en/statug/68162/HTML/default/viewer.htm#statug_lifetest_details03.htm
        """

        tr = norm.ppf(1 - alpha / 2)

        method = method.lower()
        if method == "cloglog":
            g = lambda x: np.log(-np.log(x))
            gprime = lambda x: -1 / (x * np.log(x))
        elif method == "linear":
            g = lambda x: x
            gprime = lambda x: 1
        elif method == "log":
            g = lambda x: np.log(x)
            gprime = lambda x: 1 / x
        elif method == "logit":
            g = lambda x: np.log(x / (1 - x))
            gprime = lambda x: 1 / (x * (1 - x))
        elif method == "asinsqrt":
            g = lambda x: np.arcsin(np.sqrt(x))
            gprime = lambda x: 1 / (2 * np.sqrt(x) * np.sqrt(1 - x))
        else:
            raise ValueError("unknown method")

        r = g(self.surv_prob) - g(1 - p)
        r /= (gprime(self.surv_prob) * self.surv_prob_se)

        ii = np.flatnonzero(np.abs(r) <= tr)
        if len(ii) == 0:
            return np.nan, np.nan

        lb = self.surv_times[ii[0]]

        if ii[-1] == len(self.surv_times) - 1:
            ub = np.inf
        else:
            ub = self.surv_times[ii[-1] + 1]

        return lb, ub
Example #40
0
def test_0(size_N, mu_0, sgm_0):
    x = spnorm.rvs(size=size_N, loc=mu_0, scale=sgm_0)
    inv_x_0 = spnorm.ppf(x, loc=mu_0, scale=sgm_0)
    opencl_kernel = " "" ""  "
Example #41
0
 def F_inv(x):
     return norm.ppf(x)
Example #42
0
# https://www.hko.gov.hk/en/wxinfo/season/fcvsobs_seasonal.htm
climat_yrs = [(1981, 2011), (1991, 2021)]

# seasonal prediction by HKO
# 0: "Normal to below normal"
# 1: "Normal to above normal"
# https://www.hko.gov.hk/en/wxinfo/season/fcvsobs_seasonal.htm
b_norm = [
    [1,1,1,1,1,1,1,0], #temp
    [1,1,0,0,0,1,1,0]  #rf
]

# CDF to Z-scores for above normal and below normal
CDF_AN = 0.7
CDF_BN = 0.3
Z_AN = norm.ppf(CDF_AN)
Z_BN = norm.ppf(CDF_BN)

# Get all data filter by season only
def extract_by_season(season='spring'):
    if season == 'spring':
        return df_all[df_all['month'].isin([3, 4, 5])]
    elif season == 'summer':
        return df_all[df_all['month'].isin([6, 7, 8])]
    elif season == 'autumn':
        return df_all[df_all['month'].isin([9, 10, 11])]
    elif season == 'winter':
        return df_all[df_all['month'].isin([12, 1, 2])]
    else:
        raise ValueError('Season not defined')
Example #43
0
pname = fname.replace("patterns_", "patterns_plot_")
pname = pname.replace(".txt", ".pdf")
pdf = PdfPages(pname)

lb = ["1→1", "0→1", "1→0", "0→0", "-1→0", "0→-1", "-1→-1"]

from turbo_colormap import turbo_colormap_data
c = turbo_colormap_data
ii = np.linspace(15, len(c) - 30, 7)  # Avoid very dark colors
ii = np.round(ii).astype(int)
cols = [c[i] for i in ii]
syms = ['s', 'o', 'x', '+', 'D', '>', '<']

next(fid)  # Skip initial ```

fq = norm.ppf(1 - 0.025)

while True:

    group = next(fid).rstrip()
    if group == "```":
        break

    head = next(fid).strip().split()
    table = [next(fid).rstrip() for k in range(7)]

    rows = [x[0:36].rstrip() for x in table]
    table = [x[36:].lstrip() for x in table]
    next(fid)
    next(fid)
    next(fid)
Example #44
0
##
## results summary
##

import numpy as np
import pandas as pd
from scipy.stats.distributions import norm

from .tools import maybe_diag

##
## constants
##

z95 = norm.ppf(0.975)

##
## param summary
##


def param_table(beta, y_name, x_names, sigma=None):
    # basic frame
    frame = pd.DataFrame({
        'coeff': beta,
    }, index=x_names)
    frame = frame.rename_axis(y_name, axis=1)

    # handle sigma cases
    if sigma is None:
        return frame
Example #45
0
# (x'x)^{-1} = (vs^2v')^{-1}
xtx = np.dot(vt.T / s**2, vt)

# Standard error for the interaction term
se = np.sqrt(uv * xtx[3, 3])

# Z-scores for the interaction term
zs = params[:, 3] / se
zs = zs.dropna()
zsa = np.abs(zs)

# P-values for the interaction term
pv = student_t.cdf(-np.abs(zs), xmat.shape[0] - xmat.shape[1])

# Bonferroni threshold
bt = norm.ppf(1 - 0.025 / zs.shape[0])

# Calculate the FDR for a range of threshold from 2 to 5.
fdr = []
n = len(zs)
for t in np.linspace(0, 6, 20):
    d = np.sum(zsa > t)
    f = 2 * n * norm.cdf(-t) / d
    fdr.append([t, f, d])
fdr = np.asarray(fdr)

# Plots relating to FDR
plt.clf()
plt.grid(True)
plt.plot(fdr[:, 0], fdr[:, 1], '-')
plt.xlabel("Z-score threshold", size=15)
Example #46
0
def betanorm(ms, xbars, alpha=0.05):
    """Return the power of a Z-test for given means and sample sizes."""
    za = norm.ppf(1 - alpha/2)
    sms = sqrt(ms)
    bn = norm.cdf(-za+sms*xbars) + norm.cdf(-za-sms*xbars)
    return bn
Example #47
0
    plt.clf()
    plt.grid(True)
    for j in range(xm.shape[1]):
        plt.plot(x, xm[:, j], '-')
    plt.xlabel("x", size=15)
    plt.ylabel("Spline value", size=15)
    plt.title(title)
    pdf.savefig()


x = np.linspace(-1, 1, n)
for k in range(3, 11):
    fml = "bs(x, %d)" % k
    title = "df=%d, uniform" % k
    plot(fml, x, title)

p = np.linspace(0.001, 0.999, n)
x = -np.log(1 - p)
for k in range(3, 11):
    fml = "bs(x, %d)" % k
    title = "df=%d, exponential" % k
    plot(fml, x, title)

x = norm.ppf(p)
for k in range(3, 11):
    fml = "bs(x, %d)" % k
    title = "df=%d, Gaussian" % k
    plot(fml, x, title)

pdf.close()
Example #48
0
 def F_inv(x):  # pylint: disable=invalid-name
     return norm.ppf(x)
Example #49
0
def sampleSize(stdev, tolerance, percentConfidence, printLatex = False):
    from scipy.stats.distributions import norm
    k = round(norm.ppf(0.5+percentConfidence/200., 0, 1)*100)/100. # 1.-(100-percentConfidence)/200.
    if printLatex:
        print('${0}^2\\frac{{{1}^2}}{{{2}^2}}$'.format(k, stdev, tolerance))
    return (k*stdev/tolerance)**2