Example 1
 def getCI_t(self):
     '''
     the t-method confidence bounds, more accurate than earlier methods
     '''
     SE = np.std(self.btheta, ddof = 1)
     self.CI_t = (self.theta - SE*t.ppf(0.975, self.n - 1), self.theta - SE*t.ppf(0.025, self.n - 1))
     return self.CI_t
Example 2
    def ppf(self, arg):
        """Inverse cumulative density function (ICDF).

        Parameters
        ----------
        arg : array
            Grid of points at which to evaluate the ICDF. Must lie in (0, 1).

        Returns
        -------
        array
            ICDF values. Same shape as the input.

        """
        arg = np.atleast_1d(arg)

        a = self.__const_a()
        b = self.__const_b()

        cond = arg < (1-self.lam)/2

        ppf1 = t.ppf(arg / (1-self.lam), self.eta)
        ppf2 = t.ppf(.5 + (arg - (1-self.lam)/2) / (1+self.lam), self.eta)
        # nan_to_num suppresses the NaNs produced by whichever branch does not apply at a point
        ppf = np.nan_to_num(ppf1) * cond \
            + np.nan_to_num(ppf2) * np.logical_not(cond)
        ppf = (ppf * (1+np.sign(arg-(1-self.lam)/2)*self.lam) \
            * (1-2/self.eta)**.5 - a)/b

        if ppf.shape == (1, ):
            return float(ppf)
        else:
            return ppf
Example 3
def GeneratePDF(Data, method = 'Robust_Student_t', lower_threshold = 0.15, upper_threshold = 0.85):
    
    '''Generate the pdf estimate of the data
    Input: /Data/   data to estimate the pdf on
           /method/ method of estimation.
                    Available methods: 'Robust_Student_t'; 'KDE'; 'Normal'
           /lower_threshold/ lower quantile, as a fraction in (0, 1)
           /upper_threshold/ upper quantile, as a fraction in (0, 1)
    Output: /x/     evaluation grid
            /pdf/   fitted pdf on the grid
            /cdf/   fitted cdf on the grid
            /lower/ value at the lower threshold
            /upper/ value at the upper threshold
    '''
    x = np.linspace(min(Data), max(Data), 100)
    if method == 'Robust_Student_t':
        nu, mu, sigma = uvtfit(Data)
        pdf = t.pdf(x, nu, mu, sigma)
        cdf = t.cdf(x, nu, mu, sigma)
        lower = t.ppf(lower_threshold, nu, mu, sigma)
        upper = t.ppf(upper_threshold, nu, mu, sigma)
        
    elif method == 'Normal':
        mu, sigma = norm.fit(Data)
        pdf = norm.pdf(x, mu, sigma)
        cdf = norm.cdf(x, mu, sigma)
        lower = norm.ppf(lower_threshold, mu, sigma)
        upper = norm.ppf(upper_threshold, mu, sigma)
        
    elif method == 'KDE':
        kernel = gaussian_kde(Data)
        pdf = kernel.evaluate(x)
        # cumulative integral of the KDE up to each grid point (one element shorter than x)
        cdf = np.array([kernel.integrate_box(x[0], x[i+1]) for i in range(len(x)-1)])
        # invert the empirical cdf so lower/upper are in data units, as in the other branches
        lower = np.interp(lower_threshold, cdf, x[1:])
        upper = np.interp(upper_threshold, cdf, x[1:])
        
    return x, pdf, cdf, lower, upper
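A minimal usage sketch (not part of the original source), assuming GeneratePDF is defined as above and that numpy and scipy.stats are imported as the snippet requires; the 'Normal' method is used so the external uvtfit helper is not needed:

import numpy as np
from scipy.stats import norm

data = np.random.normal(loc=10.0, scale=2.0, size=500)   # synthetic sample
x, pdf, cdf, lower, upper = GeneratePDF(data, method='Normal',
                                        lower_threshold=0.15, upper_threshold=0.85)
print(lower, upper)   # roughly the 15th and 85th percentiles of the fitted normal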
Example 4
 def solve(self):
     df = 51-1
     statistic = 1.1/(4.9/51**0.5)
     statistic = round(statistic,2)
     a = t.ppf(0.025, df)
     b = t.ppf(0.975, df)
     test = False
     if statistic>=a and statistic<=b:
         test = True
     return [df,statistic,test]
Example 5
 def solve(self):
     df = 20-1
     statistic = (4.6-5)/(2.2/20**0.5)
     statistic = round(statistic,2)
     a = t.ppf(0.025, df)
     b = t.ppf(0.975, df)
     test = False
     if statistic>=a and statistic<=b:
         test = True
     return [df,statistic,test]
Example 6
    def evaluateLogLikelihoodHessian(self, par):

        print(par)

        df = par[0]
        self.par[1] = par[1]
        self.par[2] = par[2]
        self.par[3] = par[3]

        # Extract the degrees of freedom and the dimension
        p = (self.uhat).shape[1]
        n = (self.uhat).shape[0]

        self.constructCorrelationMatrix(p)

        # Compute the percentile function on univariate t
        tppf_uhat = t.ppf(self.uhat, df)

        # Calculate the first part of the log-likelihood
        part1 = 0

        for ii in range(n):

            part1 += multiTLogPDF(tppf_uhat[ii, :], np.zeros(p), self.P, df, p)

        # Calculate the second part of the log-likelihood
        part2 = np.sum(t.logpdf(tppf_uhat, df))

        return part1 - part2
Example 7
 def solve(self):
     x = round((18.985+21.015)/2,2)
     n = 36
     df = n-1
     s = -(21.015-18.985)/2 * np.sqrt(n) / t.ppf(0.025, df)
     s = round(s,2)
     return [x,s]
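A quick numerical check of the same arithmetic (a sketch, not from the original): rebuilding the interval from the recovered mean and standard deviation should reproduce the stated bounds.

import numpy as np
from scipy.stats import t

n = 36
x = (18.985 + 21.015) / 2                       # midpoint of the interval
s = -(21.015 - 18.985) / 2 * np.sqrt(n) / t.ppf(0.025, n - 1)
half = -t.ppf(0.025, n - 1) * s / np.sqrt(n)    # CI half-width
print(round(x - half, 3), round(x + half, 3))   # ~ (18.985, 21.015)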
Example 8
 def solve(self):
     de=t.ppf(0.05,50)
     result=(1.1-0)/(4.9/(np.sqrt(51)))
     if de<=result:
         return [round(50,2),round(result,2),True]
     else:
         return [round(50,2),round(result,2),False]
Example 9
    def getConfidenceIntervals(variance_type, groups):
        """
        Expects a dictionary of endpoint groups and the endpoint variance-type.
        Appends results to the dictionary for each endpoint-group.

        Confidence interval calculated using a two-tailed t-test,
        assuming 95% confidence interval.
        """

        for grp in groups:
            lower_ci = grp.get('lower_ci')
            upper_ci = grp.get('upper_ci')
            n = grp.get('n')
            if (
                    lower_ci is None and
                    upper_ci is None and
                    n is not None and
                    grp['estimate'] is not None and
                    grp['variance'] is not None
               ):
                    est = grp['estimate']
                    var = grp['variance']
                    z = t.ppf(0.975, max(n-1, 1))
                    change = None

                    if variance_type == 'SD':
                        change = z * var / math.sqrt(n)
                    elif variance_type in ('SE', 'SEM'):
                        change = z * var

                    if change is not None:
                        lower_ci = round(est - change, 2)
                        upper_ci = round(est + change, 2)

                    grp.update(lower_ci=lower_ci, upper_ci=upper_ci)
Example 10
 def solve(self):
     de=t.ppf(0.05,19)
     result=(4.6-5)/(2.2/np.sqrt(20))
     if de<=result :
         return [round(19,2),round(result,2),True]
     else:
         return [round(19,2),round(result,2),False]
Example 11
def calc_stats(amostra):
    # two-sided 95% confidence interval for the mean
    tdist = t.ppf(0.975, len(amostra)-1)
    mean = numpy.mean(amostra)
    std = numpy.std(amostra, ddof=1)  # sample standard deviation
    error = tdist*(std/math.sqrt(len(amostra)))
    return mean, std, error
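A usage sketch under the same imports as the snippet (numpy imported as `numpy`, `math`, and `scipy.stats.t`); the sample values are made up for illustration:

amostra = [12.1, 11.8, 12.5, 12.0, 11.9, 12.3]
mean, std, error = calc_stats(amostra)
print(mean - error, mean + error)   # 95% confidence interval for the mean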
Example 12
def mu_intervall(sample, var, gamma):
    """
		calcuates the confidence intervall for the mean of a population.

		Parameters
		==========
		sample: array
			sample data
		var: 	float
			variance of sample. 0 if not known. will be calculated by an estimator
		gamme: 	float
			confidence

		Returns
		=======
		value :		tuple (a,b)
			confidence intervall as a tuple
	"""

    s_mean = np.array(sample).mean()

    if var == 0:
        std = _sample_std(sample)
        q = t.ppf((1 + gamma) / 2.0, len(sample) - 1)
    else:
        std = np.sqrt(var)
        q = norm.ppf((1 + gamma) / 2.0)

    c = q * std / np.sqrt(len(sample))
    c1 = s_mean - c
    c2 = s_mean + c
    return (c1, c2)
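A usage sketch (illustrative values, not from the original), assuming the snippet's own imports of numpy and scipy.stats are available; passing a known variance exercises the normal-quantile branch, so the _sample_std helper is not needed here.

import numpy as np
from scipy.stats import norm, t

sample = [4.9, 5.1, 5.0, 4.8, 5.2, 5.0]
print(mu_intervall(sample, var=0.04, gamma=0.95))   # (lower, upper) for the mean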
Example 13
 def different_stdev(self, alpha):
     t0 = (self.y1 - self.y2) / (np.sqrt(self.S1**2/self.n1 +
                                         self.S2**2/self.n2))
     # hypothesis tests
     n1, n2, y1, y2, S1, S2 = self.n1, self.n2, self.y1, self.y2, self.S1, self.S2
     df = int((S1**2/n1+S2**2/n2)**2/((S1**2/n1)**2/(n1-1)+(S2**2/n2)**2/(n2-1)))
     H1a = t.ppf(1 - alpha/2., df) < np.abs(t0)
     H1b = t.ppf(1 - alpha, df) < t0
     H1c = t.ppf(alpha, df) > t0
     # p-value
     p1a = t.sf(np.abs(t0), df) * 2
     p1b = t.sf(t0, df)
     p1c = t.cdf(t0, df)
     c1 = y1 - y2 - t.ppf(1 - alpha/2., df) * np.sqrt(S1**2/n1+S2**2/n2)        
     c2 = y1 - y2 + t.ppf(1 - alpha/2., df) * np.sqrt(S1**2/n1+S2**2/n2)
     return H1a, H1b, H1c, p1a, p1b, p1c, (c1,c2)
Example 14
 def solve(self):
     upper = 21.015
     lower = 18.985
     de = t.ppf(0.025, 35)
     mean = (upper + lower) / 2
     s = np.sqrt(36) * (upper - lower) / (2 * (-de))
     return [round(mean, 2), round(s, 2)]
Example 15
def t_test(df, modelDir, start_date, confidence=0.99, model="neuralNet"):
	"""Is given a dataframe of demand, temp, and dates (in that order)"""
	from scipy.stats import t
	df = df.copy()

	if model == "neuralNet":
		df["dates"] = pd.date_range(start_date, freq="H", periods = df.shape[0])
		df.columns = ["load", "tempc", "dates"]
		all_X = omf.loadForecast.makeUsefulDf(df)
		actual = df["load"].values
		pred, acc = omf.loadForecast.neural_net_predictions(all_X, actual)

	if model == "nextDayPeakKatrina":
		ppt, pred, act_time, actual = omf.loadForecast.nextDayPeakKatrinaForecast(
			df.values, start_date, modelDir, {}, returnActuals=True
		)

	diff = [p - a for p, a in zip(pred, actual[-8760:])]
	diff = np.asarray(diff)
	alpha = 1 - confidence
	twosigma = -1 * t.ppf(alpha / 2, len(diff)) * np.std(diff)
	diff = np.abs(diff)
	diff = diff > twosigma

	if model == "neuralNet":
		return diff, actual[-8760:], pred, pred - twosigma, pred + twosigma
	if model == "nextDayPeakKatrina":
		return diff, actual, act_time
Example 16
def conf_calc(x, y_err, c_limit=0.975):
    '''
    Calculates confidence interval of regression between x and y
    
    Parameters
    ----------
    x:       1D numpy array
    y_err:   1D numpy array of residuals (y - fit)
    c_limit: (optional) float number representing the area to the left
             of the critical value in the t-statistic table
             eg: for a 2 tailed 95% confidence interval (the default)
                    c_limit = 0.975

    Returns
    -------
    p_x:   1D numpy array of new x values spanning the range of x
    confs: 1D numpy array of confidence-interval half-widths at each p_x
    
    '''
    # Define the variables you need
    # to calculate the confidence interval
    mean_x = np.mean(x)			# mean of x
    n = len(x)                          # number of samples in original fit
    tstat = t.ppf(c_limit, n-2)         # critical t value; n-2 df for a two-parameter fit
    s_err = np.sum(np.power(y_err,2))	# sum of the squares of the residuals

    # create series of new test x-values to predict for
    p_x = np.linspace(np.min(x),np.max(x),50)

    confs = tstat * np.sqrt((s_err/(n-2))*(1.0/n + (np.power((p_x-mean_x),2)/
			((np.sum(np.power(x,2)))-n*(np.power(mean_x,2))))))

    return p_x, confs
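A usage sketch with synthetic data (an illustration, not part of the original): fit a line with np.polyfit, form the residuals, and pass them to conf_calc for the default 95% two-tailed band.

import numpy as np
from scipy.stats import t

rng = np.random.default_rng(0)
x = np.linspace(0.0, 10.0, 40)
y = 2.0 * x + 1.0 + rng.normal(0.0, 1.0, x.size)
slope, intercept = np.polyfit(x, y, 1)
y_err = y - (slope * x + intercept)      # residuals of the linear fit
p_x, confs = conf_calc(x, y_err)         # half-widths of the 95% confidence band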
Example 17
def t_student(n, alfa):
    '''
        Computes t_{alfa/2, n}: the two-sided critical value of the Student's t distribution.
    '''
    from scipy.stats import t

    return t.ppf(1 - 1.0 * alfa / 2, n)
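A quick check (illustrative only, not in the original listing): the two-sided critical value shrinks toward the normal quantile as the degrees of freedom grow.

print(t_student(10, 0.05))     # ~2.228
print(t_student(1000, 0.05))   # ~1.962, approaching norm.ppf(0.975) ~ 1.960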
Example 18
def _t(u, rho, nu):
    """ Generates values of the T copula
    
    Inputs:
    u -- u is an N-by-P matrix of values in [0,1], representing N
         points in the P-dimensional unit hypercube.  
    rho -- a P-by-P correlation matrix.
    nu  -- degrees of freedom for T Copula
    
    Outputs:
    y -- the value of the T Copula
    """
    n  = u.shape[0]
    p  = u.shape[1]
    loIntegrationVal = -40
    lo = np.full((1,p), loIntegrationVal)        # more accuracy, but slower :/
    hi = t.ppf(u, nu)
    
    mu = np.zeros(p)
    
    y = np.zeros(n)
    for ii in np.arange(n):
        x = hi[ii,:]
        x[x<-40] = -40
        p = mvt.mvstdtprob(lo[0], x, rho, nu)
        y[ii] = p
    
    return y
Example 19
    def predicate(cls, tasks, user_id, cost):
        if len(tasks) < 3:
            return None, None, None, None, None

        # use only same user tasks?
        same_user_tasks = filter_user_id(tasks, user_id)
        if len(same_user_tasks) > 3:
            tasks = same_user_tasks

        # use only same cost tasks?
        same_cost_tasks = filter_cost(tasks, cost)
        if len(same_cost_tasks) > 3:
            tasks = same_cost_tasks

        # use only last N tasks
        tasks = tasks[-8:]

        sample = np.array([x['actualWorkTime'] / x['cost'] for x in tasks])
        n = sample.size
        mu = np.mean(sample)
        s2 = np.var(sample, ddof=1)

        t45 = sci_t.ppf(0.95, n - 1)
        mlow, mhigh = mu + np.array([-t45, t45]) * (np.sqrt(s2) / np.sqrt(n))

        chi45a = sci_chi2.ppf(0.95, n - 1)
        shigh = np.sqrt((n - 1) * s2 / chi45a)

        low, high = mlow - shigh, mhigh + shigh

        return (mlow + mhigh) / 2 * cost, mlow * cost, mhigh * cost, low * cost, high * cost
Example 20
 def _interval(self, X, alpha, pred):
     """
     Helper for computing prediction/confidence intervals.
     """
     # Comments from QR decomposition solution to Ax = y:
     #
     #   Rather than A'A we have R from the QR decomposition of A, but
     #   R'R equals A'A.  Note that R is not upper triangular since we
     #   have already multiplied it by the permutation matrix, but it
     #   is invertible.  Rather than forming the product R'R which is
     #   ill-conditioned, we can rewrite x' inv(A'A) x as the equivalent
     #      x' inv(R) inv(R') x = t t', for t = x' inv(R)
     #
     # We have since switched to an SVD solver, which gives us
     #
     #    invC = A' A  = (USV')' USV' = VSU' USV' = V S S V'
     #    C = inv(A'A) = inv(VSSV') = inv(V') inv(S S) inv(V)
     #      = V inv(S S) V' = V inv(S) inv(S) V'
     #
     # Substituting, we get
     #
     #    x' inv(A'A) x = t t', for t = x' V inv(S)
     #
     # Since x is a vector, t t' is the inner product sum(t**2).
     # Note that LAPACK allows us to do this simultaneously for many
     # different x using sqrt(sum(T**2,axis=1)), with T = X' Vinv(S).
     #
     # Note: sqrt(F(1-a;1,df)) = T(1-a/2;df)
     #
     from scipy.stats import t  # lazy import in case scipy not present
     y = np.dot(X, self.x).ravel()
     s = t.ppf(1-alpha/2, self.DoF) * self.rnorm/np.sqrt(self.DoF)
     Tmat = np.dot(X, self._SVinv)   # renamed from `t` to avoid shadowing scipy.stats.t imported above
     dy = s * np.sqrt(pred + np.sum(Tmat**2, axis=1))
     return y, dy
Example 21
    def accept(self):
        self.con = float(self.con_edit.text())
        first_data = []
        second_data = []
        samples = self.appropriate[self.currentGroup]

        group_values, counts = self.dataset.GetNumericValues(self.currentVar)

        for i in range(len(group_values)):
            if self.dataset.GetValue(self.currentGroup, i+1) == samples[0]:
                element = group_values[i]
                first_data.append(element)
            elif self.dataset.GetValue(self.currentGroup, i+1) == samples[1]:
                element = group_values[i]
                second_data.append(element)

        self.t_score, self.pvalue = ttest_ind(first_data, second_data, equal_var=self.equal_variances)
        mean1 = sum(first_data)/len(first_data)
        mean2 = sum(second_data)/len(second_data)
        self.means = {samples[0]:mean1, samples[1]:mean2}

        if len(first_data) < len(second_data):
            self.df = len(first_data)-1
        else:
            self.df = len(second_data)-1

        if self.radio_noteq.isChecked():
            pass
        elif self.radio_greater.isChecked():
            self.pvalue /= 2
        elif self.radio_less.isChecked():
            self.pvalue /= 2

        self.P_obs = t.ppf(1-self.con, self.df)
Example 22
 def solve(self):
     de = t.ppf(0.05, 24)
     result = (7.73 - 8) / (0.77 / np.sqrt(25))
     if de <= result:
         return [24, round(result, 2), True]
     else:
         return [24, round(result, 2), False]
Example 23
File: 38.py Project: XNYu/Statistic
def sampling_distribution():
    fig, ax = plt.subplots(1, 1)
    #display the probability density function
    df = 10
    x=np.linspace(t.ppf(0.01, df), t.ppf(0.99, df), 100)
    ax.plot(x, t.pdf(x, df))
    
    #simulate the sampling distribution
    y = []
    for i in range(1000):
        r = norm.rvs(loc=5, scale=2, size=df+1)
        rt =(np.mean(r)-5)/np.sqrt(np.var(r)/df)
        y.append(rt)

    ax.hist(y, density=True, alpha=0.2)  # 'normed' was removed from Matplotlib; 'density' is the replacement
    plt.savefig('sampling_distribution.png')
Example 24
    def confidence_int(self, conf_level=95):
        """
        Calculate confidence interval of the mean
        time measured

        Parameters
        ----------
        conf_level: float
            confidence level desired for the confidence interval, in percent.
            this is transformed into the two-sided quantile used to look up the
            critical value of the t distribution.
            default is a 95% confidence interval

        Returns
        -------
        lower_mean : float
            lower confidence interval boundary
        mean : float
            mean value
        upper_mean : float
            upper confidence interval boundary

        """
        # calculate quantile from confidence level in percent
        t_quantile = 1 - (1 - conf_level / 100.0) / 2.0
        # get t value from distribution
        t_val = t.ppf(t_quantile, self.n - self.ddof)
        # calculate standard error for estimated values
        std_err = self.stdev / np.sqrt(self.n)
        lower_mean = self.mean - t_val * std_err
        upper_mean = self.mean + t_val * std_err
        return lower_mean, self.mean, upper_mean
Example 25
def confidence_interval(standard_deviation,observations,confidence):
    confidence_fraction = (1 - (100-float(confidence))/200)
    if observations > 30:
        total_length_of_confidence_interval = (standard_deviation*2*norm.ppf(confidence_fraction)/np.sqrt(observations))
    else:
        # t distribution with n-1 degrees of freedom for small samples
        total_length_of_confidence_interval = (standard_deviation*2*t.ppf(confidence_fraction,observations-1)/np.sqrt(observations))
    return total_length_of_confidence_interval
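A usage sketch (made-up numbers), assuming numpy and scipy.stats' norm and t are imported as the snippet requires; the function switches from the t quantile to the normal quantile once the sample size exceeds 30.

import numpy as np
from scipy.stats import norm, t

print(confidence_interval(2.0, 25, 95))    # small sample: t-based interval length
print(confidence_interval(2.0, 100, 95))   # large sample: normal-based interval length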
Example 26
File: gwr.py Project: ljwolf/pysal
    def filter_tvals(self, alpha):
        """
        Utility function to set tvalues with an absolute value smaller than the
        absolute value of the alpha (critical) value to 0

        Parameters
        ----------
        alpha           : scalar
                          critical value to determine which tvalues are
                          associated with statistically significant parameter
                          estimates

        Returns
        -------
        filtered       : array
                          n*k; new set of n tvalues for each of k variables
                          where absolute tvalues less than the absolute value of
                          alpha have been set to 0.
        """
        alpha = np.abs(alpha)/2.0
        n = self.n
        critical = t.ppf(1-alpha, n-1)
        subset = (self.tvalues < critical) & (self.tvalues > -1.0*critical)
        tvalues = self.tvalues.copy()
        tvalues[subset] = 0
        return tvalues
Example 27
    def regression_analysis(self, key, info):
        '''
        Calculates all the values we will need for simple linear regression 
        analysis, and does the analysis itself.
        '''
        # not the most efficient, but we want to keep these values
        # to calculate standard errors
        info = list(info)

        # calculate sums
        sumx, sumy, sumxx, sumyy, sumxy, n = (0.0, 0.0, 0.0, 0.0, 0.0, 0)
        for (x, y) in info:
            sumx += x
            sumy += y
            sumxx += x * x
            sumyy += y * y
            sumxy += x * y
            n += 1

        # calculate correlation
        corr = 0
        corr_denom = math.sqrt((n * sumxx - sumx**2) * (n * sumyy - sumy**2))
        if corr_denom < 0.0001:
            yield False, "Could not calculate coefficients"

        corr_num = n * sumxy - sumx * sumy 
        corr = corr_num / corr_denom

        if abs(corr) < 0.0001:
            yield False, "Could not calculate coefficients"

        # calculate regression coefficients
        beta1 = (sumxy - sumx * sumy / n) / (sumxx - sumx**2 / n)
        beta0 = (sumy - beta1 * sumx) / n

        # calculate standard errors
        y_reals = [y for (x, y) in info]
        y_hats = [beta0 + beta1 * x for (x, y) in info]   # fitted values from x, not from y
        s_num = sum([(y - yhat)**2 for (y, yhat) in zip(y_reals, y_hats)])
        s = math.sqrt(s_num / (n - 2))

        se_denom = n * sumxx - sumx**2
        se_beta0 = s * math.sqrt(sumxx / se_denom)
        se_beta1 = s * math.sqrt(n / se_denom)

        # calculate t-values
        t0 = beta0 / se_beta0
        t1 = beta1 / se_beta1

        # calculate 2-sided p-values
        alpha = 0.05
        t_stat = t.ppf(1 - alpha/2, n - 2)
        beta0_p_value = t.sf(abs(t0), n - 2) * 2
        beta1_p_value = t.sf(abs(t1), n - 2) * 2

        # output most important values in a human-readable format
        print("Correlation: {}".format(corr))
        print("Beta 0: {}, p-value: {}".format(beta0, beta0_p_value))
        print("Beta 1: {}, p-value: {}".format(beta1, beta1_p_value))
Example 28
def get_intervals(values,alpha):
    n = len(values)
    mean = sum(values) / n
    # sample standard deviation (n-1 in the denominator)
    stddev = sqrt(sum(map(lambda x: (x - mean)**2,values))/(n-1))
    delta = t.ppf(1.0 - (1.0 - alpha)/2,n-1) * stddev / sqrt(n)
    lower = mean - delta
    upper = mean + delta
    return lower,upper
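A usage sketch assuming the imports the snippet relies on (sqrt from math, t from scipy.stats); the sample values are illustrative.

from math import sqrt
from scipy.stats import t

values = [9.8, 10.2, 10.1, 9.9, 10.0, 10.3]
print(get_intervals(values, 0.95))   # (lower, upper) 95% bounds for the mean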
Example 29
    def conf_int(self, alpha=.05, cols=None, dispersion=None):
        '''
        Returns the confidence interval of the specified theta estimates.

        Parameters
        ----------
        alpha : float, optional
            The `alpha` level for the confidence interval.
            ie., `alpha` = .05 returns a 95% confidence interval.
        cols : tuple, optional
            `cols` specifies which confidence intervals to return
                
        Returns : array
            Each item contains [lower, upper]
        
        Example
        -------
        >>> import numpy as np
        >>> from numpy.random import standard_normal as stan
        >>> import nipy.fixes.scipy.stats.models as SSM
        >>> x = np.hstack((stan((30,1)),stan((30,1)),stan((30,1))))
        >>> beta = np.array([3.25, 1.5, 7.0])
        >>> y = np.dot(x,beta) + stan((30))
        >>> model = SSM.regression.OLSModel(x, hascons=False).fit(y)
        >>> model.conf_int(cols=(1,2))

        Notes
        -----
        TODO:
        tails : string, optional
            `tails` can be "two", "upper", or "lower"
        '''
        if cols is None:
            lower = self.theta - t.ppf(1-alpha/2,self.df_resid) *\
                    np.diag(np.sqrt(self.vcov(dispersion=dispersion)))
            upper = self.theta + t.ppf(1-alpha/2,self.df_resid) *\
                    np.diag(np.sqrt(self.vcov(dispersion=dispersion)))
        else:
            lower=[]
            upper=[]
            for i in cols:
                lower.append(self.theta[i] - t.ppf(1-alpha/2,self.df_resid) *\
                    np.diag(np.sqrt(self.vcov(dispersion=dispersion)))[i])
                upper.append(self.theta[i] + t.ppf(1-alpha/2,self.df_resid) *\
                    np.diag(np.sqrt(self.vcov(dispersion=dispersion)))[i])
        return np.asarray(list(zip(lower, upper)))  # materialize zip so NumPy builds a 2-D array on Python 3
Example 30
    def equal_stdev(self, alpha):
        n1, n2, y1, y2 = self.n1, self.n2, self.y1, self.y2
        Sp = np.sqrt( ((n1 - 1)*self.S1**2 +
                       (n2 - 1)*self.S2**2) / (n1 + n2 - 2) )
        t0 = (y1 - y2) / (Sp * np.sqrt(1./n1 + 1./n2))

        # hypothesis tests
        H1a = t.ppf(1 - alpha/2., n1 + n2 -2) < np.abs(t0)
        H1b = t.ppf(1 - alpha, n1 + n2 -2) < t0
        H1c = t.ppf(alpha, n1 + n2 -2) > t0
        # p-value
        p1a = t.sf(np.abs(t0), n1 + n2 -2) * 2
        p1b = t.sf(t0, n1 + n2 -2)
        p1c = t.cdf(t0, n1 + n2 -2)
        c1 = y1 - y2 - t.ppf(1 - alpha/2., n1 + n2 -2) * Sp * np.sqrt(1./n1+1./n2)        
        c2 = y1 - y2 + t.ppf(1 - alpha/2., n1 + n2 -2) * Sp * np.sqrt(1./n1+1./n2)
        return H1a, H1b, H1c, p1a, p1b, p1c, (c1,c2)
Example 31
# List 6-6  Test of the difference between population means (population variances unknown): Welch's approximate t-test
import math
import numpy as np
from scipy.stats import t

X = [75, 70, 89, 65, 95, 82, 62, 77, 90, 58]
Y = [58, 75, 80, 70, 66, 63, 70, 76, 82, 65]

m = len(X)
n = len(Y)
meanX = np.average(X)
meanY = np.average(Y)
sX = np.std(X, ddof=1)  # sample standard deviation of X
sY = np.std(Y, ddof=1)  # sample standard deviation of Y
# compute nu (the Welch-Satterthwaite degrees of freedom)
nu = (((sX**2) / m + (sY**2) / n)**2) / (((((sX**2) / m)**2) / (m - 1)) +
                                         ((((sY**2) / n)**2) / (n - 1)))
nuasta = round(nu)
tt = (meanX - meanY) / math.sqrt((sX**2) / m + (sY**2) / n)
t_lower = t.ppf(0.025, nuasta)  # 2.5th percentile of the t distribution with nu* degrees of freedom
t_upper = t.ppf(0.975, nuasta)  # 97.5th percentile of the t distribution with nu* degrees of freedom
print('t=', tt.round(4), 'reject=', (tt < t_lower) or (t_upper < tt))

# to compute the p-value for t:
p = t.cdf(-np.abs(tt), nuasta) * 2
print('p-value=', p.round(4))

# output:
# t= 1.2376 reject= False
# p-value= 0.2349
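A cross-check with SciPy's built-in Welch test (a sketch, not part of the original listing); SciPy uses the unrounded Welch degrees of freedom, so the results are close to, but not exactly, the values printed above.

from scipy.stats import ttest_ind

stat, pval = ttest_ind(X, Y, equal_var=False)   # Welch's t-test on the same samples
print(round(stat, 4), round(pval, 4))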
Example 32
def consumption_method_1_month_cycle(product):
    product_data = product.consumptiondata_set.all()
    arr = []
    for p in product_data:
        if p.consumptionQty is not None:
            arr.append(p.consumptionQty)

    data_np_arr = np.array(arr)
    code = product.code
    desc = product.description
    price = float(product.price)
    planned_qty_2_month_mean = float(product.planned_Qty_2Month_Mean)
    planned_qty_1_month_mean = planned_qty_2_month_mean / 2
    planned_qty_1_month_order_cost = price * planned_qty_1_month_mean
    planned_qty_1_month_order_cost_thousands = planned_qty_1_month_order_cost / 1000
    annual_plan_cost = planned_qty_1_month_order_cost * 12

    if arr != []:
        p_sum = data_np_arr.sum()
        p_mean = data_np_arr.mean()
        p_length = len(data_np_arr)
        x2 = []
        for value in data_np_arr:
            x2.append(pow(value, 2))

        x2_np_arr = np.array(x2)
        x2_sum = x2_np_arr.sum()
        square_n = (pow(p_sum, 2)) / (p_length)
        if p_length == 1:
            std_error = 0
        else:
            var = ((x2_sum) - (square_n)) / (p_length - 1)
            std = np.sqrt(var)
            std_error = float(std) / np.sqrt(p_length)  # np.float was removed from NumPy; use the builtin float

        pb_t_test = 0.95
        df = p_length - 1
        t_statistic = t.ppf(pb_t_test, df)
        confidence_interval = t_statistic * std_error
        deviation_from_mean = (confidence_interval / float(p_mean)) * 100

        amc = float(p_mean)
        amc_in_packs = amc
        no_of_stock_outs = 0
        amc_adjusted_for_stock_outs = amc_in_packs / (
            1 - (no_of_stock_outs / 30.5))
        percentage_change_in_consumption = deviation_from_mean
        min_amc = amc_in_packs * (
            (100 - percentage_change_in_consumption) / 100)
        max_amc = amc_in_packs * (
            (100 + percentage_change_in_consumption) / 100)
        poisson_mode_quantity = round(amc_in_packs)
        poisson_mode_qty_adjusted_for_changes_in_use = poisson_mode_quantity
        safety_stock = poisson_mode_qty_adjusted_for_changes_in_use * 0.5
        qty_to_procure = poisson_mode_qty_adjusted_for_changes_in_use * (
            0.5 + 1) + safety_stock
        eff_qty_to_procure = qty_to_procure
        calc_1_month_cycle_qty_to_procure_adjusted_for_losses = eff_qty_to_procure
        calculated_1_month_cycle_cost_of_procurement = price * calc_1_month_cycle_qty_to_procure_adjusted_for_losses
        calculated_1_month_cycle_cost_of_procurement_thousand = calculated_1_month_cycle_cost_of_procurement / 1000
        consumption_annual_procurement_cost = 12 * calculated_1_month_cycle_cost_of_procurement
        budget_deficit_in_plan = calculated_1_month_cycle_cost_of_procurement - float(
            planned_qty_1_month_order_cost)
        if calculated_1_month_cycle_cost_of_procurement != 0:
            percentage_available_funding = float(
                planned_qty_1_month_order_cost) / (
                    calculated_1_month_cycle_cost_of_procurement / 100)
        else:
            percentage_available_funding = np.nan
        return dict([('code', code), ('desc', desc), ('price', price),
                     ('planned_qty_1_month_mean', planned_qty_1_month_mean),
                     ('planned_qty_1_month_order_cost',
                      planned_qty_1_month_order_cost),
                     ('planned_qty_1_month_order_cost_thousands',
                      planned_qty_1_month_order_cost_thousands),
                     ('annual_plan_cost', annual_plan_cost),
                     ('amc_in_packs', np.round(amc_in_packs, 2)),
                     ('amc_adjusted_for_stock_outs',
                      np.round(amc_adjusted_for_stock_outs, 2)),
                     ('percentage_change_in_consumption',
                      np.round(percentage_change_in_consumption, 2)),
                     ('min_amc', np.round(min_amc, 2)),
                     ('max_amc', np.round(max_amc, 2)),
                     ('poisson_mode_quantity', poisson_mode_quantity),
                     ('safety_stock', safety_stock),
                     ('qty_to_procure', qty_to_procure),
                     ('calculated_1_month_cycle_cost_of_procurement',
                      calculated_1_month_cycle_cost_of_procurement),
                     ('calculated_1_month_cycle_cost_of_procurement_thousand',
                      calculated_1_month_cycle_cost_of_procurement_thousand),
                     ('consumption_annual_procurement_cost',
                      consumption_annual_procurement_cost),
                     ('budget_deficit_in_plan', budget_deficit_in_plan),
                     ('percentage_available_funding',
                      np.round(percentage_available_funding, 2))])
    else:
        return dict([
            ('code', code), ('desc', desc), ('price', price),
            ('planned_qty_1_month_mean', planned_qty_1_month_mean),
            ('planned_qty_1_month_order_cost', planned_qty_1_month_order_cost),
            ('planned_qty_1_month_order_cost_thousands',
             planned_qty_1_month_order_cost_thousands),
            ('annual_plan_cost', annual_plan_cost), ('amc_in_packs', None),
            ('amc_adjusted_for_stock_outs', None),
            ('percentage_change_in_consumption', None), ('min_amc', None),
            ('max_amc', None), ('poisson_mode_quantity', None),
            ('safety_stock', None), ('qty_to_procure', None),
            ('calculated_1_month_cycle_cost_of_procurement', None),
            ('calculated_1_month_cycle_cost_of_procurement_thousand', None),
            ('consumption_annual_procurement_cost', None),
            ('budget_deficit_in_plan', None),
            ('percentage_available_funding', None)
        ])
Example 33
    def evaluate(self, current_configuration: Configuration,
                 experiment: Experiment):
        """
        Return number of measurements to finish Configuration or 0 if it finished.
        In other case - compute result as average between all experiments.
        :param current_configuration: instance of Configuration class
        :param experiment: instance of 'experiment' is required for experiment-awareness.
        :return: int min_tasks_per_configuration if Configuration was not measured at all
                 or 1 if Configuration was not measured precisely or 0 if it finished
        """
        tasks_data = current_configuration.get_tasks()

        if len(tasks_data) == 0:
            return 1

        c_c_results = current_configuration.results
        c_s_results = experiment.get_current_solution().results
        c_c_results_l = []
        c_s_results_l = []
        for key in experiment.get_objectives():
            c_c_results_l.append(c_c_results[key])
            c_s_results_l.append(c_s_results[key])

        if len(tasks_data) < self.min_tasks_per_configuration:
            if self.is_experiment_aware:
                ratios = [
                    cur_config_dim / cur_solution_dim
                    for cur_config_dim, cur_solution_dim in zip(
                        c_c_results_l, c_s_results_l)
                ]
                if all([
                        ratio >= ratio_max
                        for ratio, ratio_max in zip(ratios, self.ratios_max)
                ]):
                    return 0
            return self.min_tasks_per_configuration - len(tasks_data)

        elif len(tasks_data) >= self.max_tasks_per_configuration:
            return 0
        else:
            # Calculating standard deviation
            all_dim_std = current_configuration.get_standard_deviation()

            # The number of Degrees of Freedom generally equals the number of observations (Tasks) minus
            # the number of estimated parameters.
            degrees_of_freedom = len(tasks_data) - len(c_c_results_l)

            # Calculate the critical t-student value from the t distribution
            student_coefficients = [
                t.ppf(c_l, df=degrees_of_freedom)
                for c_l in self.confidence_levels
            ]

            # Calculating confidence interval for each dimension, that contains a confidence intervals for
            # singular measurements and confidence intervals for multiple measurements.
            # First - singular measurements errors:
            conf_intervals_sm = []
            for c_l, d_s_a, d_a_c, avg in zip(self.confidence_levels,
                                              self.device_scale_accuracies,
                                              self.device_accuracy_classes,
                                              c_c_results_l):
                d = sqrt((c_l * d_s_a / 2)**2 + (d_a_c * avg / 100)**2)
                conf_intervals_sm.append(c_l * d)

            # Calculation of confidence interval for multiple measurements:
            conf_intervals_mm = []
            for student_coefficient, dim_skd in zip(student_coefficients,
                                                    all_dim_std):
                conf_intervals_mm.append(student_coefficient * dim_skd /
                                         sqrt(len(tasks_data)))

            # confidence interval, or in other words absolute error
            absolute_errors = []
            for c_i_ss, c_i_mm in zip(conf_intervals_sm, conf_intervals_mm):
                absolute_errors.append(sqrt(pow(c_i_ss, 2) + pow(c_i_mm, 2)))

            # Calculating relative error for each dimension
            relative_errors = []
            for interval, avg_res in zip(absolute_errors, c_c_results_l):
                if not avg_res:  # it is 0 or 0.0
                    # if new use-cases appear with the same behaviour.
                    if interval == 0:
                        avg_res = 1  # Anyway relative error will be 0 and avg will not be changed.
                    else:
                        return 1
                relative_errors.append(interval / avg_res * 100)

            # Thresholds for relative errors that should not be exceeded for accurate measurement.
            thresholds = []
            if self.is_experiment_aware:
                # We adapt thresholds
                objectives_minimization = experiment.get_objectives_minimization(
                )

                for i in range(len(objectives_minimization)):
                    if objectives_minimization[i]:
                        if not c_s_results_l[i]:
                            ratio = 1
                        else:
                            ratio = c_c_results_l[i] / c_s_results_l[i]
                    else:
                        if not c_c_results_l[i]:
                            ratio = 1
                        else:
                            ratio = c_s_results_l[i] / c_c_results_l[i]

                    adopted_threshold = \
                        self.base_acceptable_errors[i] \
                        + (self.max_acceptable_errors[i] - self.base_acceptable_errors[i]) \
                        / (1 + exp(- (10 / self.ratios_max[i]) * (ratio - self.ratios_max[i] / 2)))

                    thresholds.append(adopted_threshold)

            else:
                # Or we don't adapt thresholds
                for acceptable_error in self.base_acceptable_errors:
                    thresholds.append(acceptable_error)

            # Simple implementation of possible multi-dim Repeater decision making:
            # If any of resulting dimensions are not accurate - just terminate.
            for threshold, error in zip(thresholds, relative_errors):
                if error > threshold:
                    return 1
            return 0
Example 34
UR2_sim = np.random.normal(UR20, sig_UR2, N)
R1_sim = np.random.normal(R10, sig_R1, N)
R2_sim = np.random.normal(R20, sig_R2, N)

# Compute the target quantity and its statistical characteristics
gamma02_sim = gamma01_sim*(1+alpha0*(temp1_sim-T_0))/(1+alpha0*(temp2_sim-T_0))*\
    (UR2_sim/UR1_sim)*((U10*R1_sim)/(U20*R2_sim))

Gmean = np.mean(gamma02_sim)
Gstd = np.std(gamma02_sim, ddof=1)
Gplot = np.arange(1 - 0.05, 1 + 0.05, 0.001)
fsim = norm.pdf(Gplot, Gmean, Gstd)
Fsim = norm.cdf(Gplot, Gmean, Gstd)

# Tolerance as a prediction interval (mean and variance unknown)
c1 = t.ppf((1 - GAMMA) / 2, N - 1)
c2 = t.ppf((1 + GAMMA) / 2, N - 1)
TGMC1 = Gstd * np.sqrt(1 + 1 / N) * (c2 - c1)
print(' ')
print('Tolerance range from the Monte Carlo simulation with prediction interval: ',
      round(TGMC1, 4))
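# Sanity check (a sketch, not in the original script): because the t quantiles are symmetric,
# the prediction-interval width also equals 2 * t_{(1+GAMMA)/2, N-1} * s * sqrt(1 + 1/N).
TGMC1_check = 2 * t.ppf((1 + GAMMA) / 2, N - 1) * Gstd * np.sqrt(1 + 1 / N)
print('Check:', round(TGMC1_check, 4))   # should match the value printed above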
""" Grafische Darstellung der Simulation """
fig = plt.figure(3, figsize=(12, 4))
fig.suptitle('Results of the statistical simulation')
ax1, ax2 = fig.subplots(1, 2)
ax1.plot(gamma02_sim, 'r+')
#ax1.axis([0,N,2.35,2.65])
ax1.set_xlabel('Sample $n$')
ax1.set_ylabel('Output voltage $U$ / V')
ax1.grid(True)
ax2.hist(gamma02_sim, int(np.sqrt(N)), density=True, facecolor='b')
Example 35
def _two_sample_ttest_for_stacked_data(table, response_cols, factor_col, alternatives, first=None , second=None , hypo_diff=0, equal_vari='pooled', confi_level=0.95):

    if(type(table[factor_col][0]) != str):
        if(type(table[factor_col][0]) == bool):
            if(first != None):
                first = bool(first)
            if(second != None):
                second = bool(second)
        else:
            if(first != None):
                first = float(first)
            if(second != None):
                second = float(second)
    if(first == None or second == None):
        tmp_factors = []
        if(first != None):
            tmp_factors += [first]
        if(second != None):
            tmp_factors += [second]
        for i in range(len(table[factor_col])):
            if(table[factor_col][i] != None and table[factor_col][i] not in tmp_factors):
                if(len(tmp_factors) == 2):
                    raise Exception("There are more that 2 factors.")
                else:
                    tmp_factors += [table[factor_col][i]]
    if(first == None):    
        if(tmp_factors[0] != second):
            first = tmp_factors[0]
        else:
            first = tmp_factors[1]
    if(second == None):
        if(tmp_factors[0] != first):
            second = tmp_factors[0]
        else:
            second = tmp_factors[1]
    table_first = table[table[factor_col] == first]
    table_second = table[table[factor_col] == second]
    tmp_table = []

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    ## Two Sample T Test for Stacked Data Result
    | - Hypothesized mean = {hypo_diff}
    | - Confidence level = {confi_level}
    """.format(hypo_diff=hypo_diff, confi_level=confi_level)))
    
    for response_col in response_cols:
        tmp_model = []
        number1 = len(table_first[response_col])
        number2 = len(table_second[response_col])
        mean1 = (table_first[response_col]).mean()
        mean2 = (table_second[response_col]).mean()
        std1 = (table_first[response_col]).std()
        std2 = (table_second[response_col]).std()
        start_auto = 0
        if(equal_vari == 'auto'):
            start_auto = 1
            f_value = (std1 ** 2) / (std2 ** 2)
            f_test_p_value_tmp = stats.f.cdf(1 / f_value, number1 - 1, number2 - 1)
            if(f_test_p_value_tmp > 0.5):
                f_test_p_value = (1 - f_test_p_value_tmp) * 2
            else:
                f_test_p_value = f_test_p_value_tmp * 2
            if(f_test_p_value < 0.05):
                equal_vari = 'unequal'
            else:
                equal_vari = 'pooled'
        ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'larger', usevar=equal_vari, value=hypo_diff)
        
        if 'larger' in alternatives:
            ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'larger', usevar=equal_vari, value=hypo_diff)
            df = ttestresult[2]
            if(equal_vari == 'pooled'):    
                std_number1number2 = sqrt(((number1 - 1) * (std1) ** 2 + (number2 - 1) * (std2) ** 2) / (number1 + number2 - 2))
                margin = t.ppf((confi_level) , df) * std_number1number2 * sqrt(1 / number1 + 1 / number2)
            if(equal_vari == 'unequal'):
                margin = t.ppf((confi_level) , df) * sqrt(std1 ** 2 / (number1) + std2 ** 2 / (number2))
            tmp_model += [['true difference in means > 0.0'] + 
            [ttestresult[1]] + [(mean1 - mean2 - margin, math.inf)]]
            tmp_table += [['%s by %s(%s,%s)' % (response_col, factor_col, first, second)] + 
            ['true difference in means > 0.0'] + 
            ['t statistic, t distribution with %f degrees of freedom under the null hypothesis' % ttestresult[2]] + 
            [ttestresult[0]] + [ttestresult[1]] + [confi_level] + [mean1 - mean2 - margin] + [math.inf]]
            
        if 'smaller' in alternatives:
            ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'smaller', usevar=equal_vari, value=hypo_diff)
            df = ttestresult[2]
            if(equal_vari == 'pooled'):    
                std_number1number2 = sqrt(((number1 - 1) * (std1) ** 2 + (number2 - 1) * (std2) ** 2) / (number1 + number2 - 2))
                margin = t.ppf((confi_level) , df) * std_number1number2 * sqrt(1 / number1 + 1 / number2)
            if(equal_vari == 'unequal'):
                margin = t.ppf((confi_level) , df) * sqrt(std1 ** 2 / (number1) + std2 ** 2 / (number2))
            tmp_model += [['true difference in means < 0.0'] + 
            [ttestresult[1]] + [(-math.inf, mean1 - mean2 + margin)]] 
            tmp_table += [['%s by %s(%s,%s)' % (response_col, factor_col, first, second)] + 
            ['true difference in means < 0.0'] + 
            ['t statistic, t distribution with %f degrees of freedom under the null hypothesis' % ttestresult[2]] + 
            [ttestresult[0]] + [ttestresult[1]] + [confi_level] + [-math.inf] + [mean1 - mean2 + margin]] 
            
        if 'two-sided' in alternatives:
            ttestresult = ttest_ind(table_first[response_col], table_second[response_col], 'two-sided', usevar=equal_vari, value=hypo_diff)
            df = ttestresult[2]
            if(equal_vari == 'pooled'):    
                std_number1number2 = sqrt(((number1 - 1) * (std1) ** 2 + (number2 - 1) * (std2) ** 2) / (number1 + number2 - 2))
                margin = t.ppf((confi_level + 1) / 2 , df) * std_number1number2 * sqrt(1 / number1 + 1 / number2)
            if(equal_vari == 'unequal'):
                margin = t.ppf((confi_level + 1) / 2 , df) * sqrt(std1 ** 2 / (number1) + std2 ** 2 / (number2))
            tmp_model += [['true difference in means != 0.0'] + 
            [ttestresult[1]] + [(mean1 - mean2 - margin, mean1 - mean2 + margin)]]
            tmp_table += [['%s by %s(%s,%s)' % (response_col, factor_col, first, second)] + 
            ['true difference in means != 0.0'] + 
            ['t statistic, t distribution with %f degrees of freedom under the null hypothesis' % ttestresult[2]] + 
            [ttestresult[0]] + [ttestresult[1]] + [confi_level] + [mean1 - mean2 - margin] + [mean1 - mean2 + margin]]
            
        result_model = pd.DataFrame.from_records(tmp_model)
        result_model.columns = ['alternative hypothesis', 'p-value', '%g%% confidence interval' % (confi_level * 100)]
        rb.addMD(strip_margin("""
        | #### Data = {response_col} by {factor_col}({first},{second})
        
        | - Statistics = t statistic, t distribution with {ttestresult2} degrees of freedom under the null hypothesis
        | - t-value = {ttestresult0}
        |
        | {result_model}
        |
        """.format(ttestresult2=ttestresult[2], response_col=response_col, factor_col=factor_col, first=first, second=second, ttestresult0=ttestresult[0], result_model=pandasDF2MD(result_model))))
        if(start_auto == 1):
            equal_vari = 'auto'
    result = pd.DataFrame.from_records(tmp_table)
    result.columns = ['data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value', 'confidence_level', 'lower_confidence_interval', 'upper_confidence_interval']

    model = dict()
    model['_repr_brtc_'] = rb.get()    
    return {'out_table' : result, 'model' : model}
Example 36
        # ax.set(ylabel="Packets", xlabel='Consumers')
        # ax.grid(axis='y')
        # fig.xticks = 'Consumers'
        # fig.yticks = 'Hits'
        # plt.xticks(rotation=0)
        # ax2 = ax.twinx()
        # plot = ax2.plot(ax.get_xticks(), out_hits[['prodRec']], marker='.', markeredgecolor='black')
        # ax2.set_ylabel(r"Names")
        # plot[0].get_figure().savefig('data/' + output + '_content.png', bbox_inches='tight')

        # Average content received per consumer
        # Confidence content retrieval
        # 95 confidence interval bw simulations - hit_c[mode][simulations]
        confidence = 0.95
        hits_c[mode] = [
            sem(hit_c[mode][s]) * t.ppf((1 + confidence) / 2,
                                        len(hit_c[mode][s]) - 1)
            for s in range(simulations)
        ]

        # a_con = [hits[mode][i] / consum[i] for i in range(simulations)]
        # # Plot content retrieved
        # out_hits2 = pd.DataFrame({'hits': a_con}, index=consum)
        # out_hits2_c = pd.DataFrame({'hits': hits_c[mode]}, index=consum)
        # out_hits2 = out_hits2.sort_index()
        # out_hits2_c = out_hits2_c.sort_index()
        # out_hits2.to_csv('data/' + output + '_a_content.csv')
        # # Create figure and plot first axis
        # fig2 = plt.figure(figsize=[12, 8])
        # ax = out_hits2.plot.bar(yerr=out_hits2_c, title="Content retrieved per consumer", ax=fig2.add_subplot(111))
        # ax.set(ylabel="Packets", xlabel='Consumers')
        # ax.grid(axis='y')
Example 37
def coeffs_criterias(yi, x, y):
    global b, m
    k = len(x[0])
    mx = [[] for i in range(len(x) + 1)]
    mx[0].append(k)
    for i in range(1, len(x) + 1):
        suma = round(sum(x[i - 1]), 5)
        mx[0].append(suma)
        mx[i].append(suma)
        for j in range(0, len(x)):
            mx[i].append(
                round(sum([round(x[i - 1][l] * x[j][l], 5) for l in range(k)]),
                      5))

    det = numpy.linalg.det(mx)
    delta = round(det, 5)

    my = [round(sum(yi), 5)]
    for i in range(len(x)):
        my.append(round(sum([yi[j] * x[i][j] for j in range(k)]), 5))

    b = [copy.deepcopy(mx) for i in range(len(x) + 1)]
    for i in range(len(x) + 1):
        for j in range(len(x) + 1):
            b[i][j][i] = my[j]
        b[i] = round(numpy.linalg.det(b[i]) / delta, 5)
        print("b" + str(i) + ": " + str(b[i]))

    S2 = []
    for i in range(len(y)):
        S2.append(sum([(y[i][j] - yi[i])**2 for j in range(len(y[i]))]))
        S2[i] = round(S2[i] / len(y[i]), 3)
    print("S2: " + str(S2))

    Gp = round(max(S2) / sum(S2), 3)
    print("Gp: " + str(Gp))

    f1 = m - 1
    f2 = k

    print("f1:" + str(f1))
    print("f2:" + str(f2))

    alpha = 0.05

    Gcr = round(cochran(f1, f2, alpha), 4)
    print("Gcr: " + str(Gcr))
    if Gp < Gcr:
        print("Cochran's C: OK")
    else:
        print("Cochran's C: :(")
        m += 1
        return generate_y(x)

    S2v = sum(S2) / 4

    S2b = round(S2v / (4 * m), 3)
    Sb = round(math.sqrt(S2b), 3)

    f3 = f1 * f2
    print("f3: " + str(f3))
    tcr = round(t.ppf(1 - alpha / 2, df=f3), 3)
    print("t: " + str(tcr))
    bs = []
    ts = []
    d = 0
    bs.append(round(sum([yi[j] for j in range(len(yi))]) / len(yi), 3))

    ts.append(round(bs[0] / Sb, 3))
    if ts[0] < 0:
        ts[0] *= -1
    if ts[0] > tcr:
        ts[0] = True
        d += 1
    else:
        ts[0] = False
    for i in range(len(x)):
        bs.append(
            round(sum([yi[j] * x[i][j] for j in range(len(yi))]) / len(yi), 3))
        ts.append(round(bs[i + 1] / Sb, 3))
        if ts[i + 1] < 0:
            ts[i + 1] *= -1
        if ts[i + 1] > tcr:
            ts[i + 1] = True
            d += 1
        else:
            ts[i + 1] = False

    print("Чи значимі b: " + str(ts))

    f4 = k - d
    print("f4: " + str(f4))
    yj = []
    b0 = []
    for i in range(len(b)):
        if ts[i]:
            b0.append(b[i])
        else:
            b0.append(0)
    for j in range(k):
        yj.append(
            round(
                b0[0] + sum([x[i - 1][j] * b0[i] for i in range(1, len(b0))]),
                3))
    print("yj: " + str(yj))

    S2ad = round(m * sum([(yj[i] - yi[i])**2 for i in range(4)]) / f4, 3)
    Fp = round(S2ad / S2v, 3)
    print("Fp: " + str(Fp))
    Fcr = round(f.ppf(1 - alpha, f4, f3), 1)
    print("Fcr: " + str(Fcr))
    if Fp < Fcr:
        print("F-criteria: OK")
    else:
        print("F-criteria: :(")
        start(x)
Example 38
def main(n, m):
    x1min = -30
    x1max = 0
    x2min = 10
    x2max = 60
    x3min = 10
    x3max = 35

    x01 = (x1max + x1min) / 2
    x02 = (x2max + x2min) / 2
    x03 = (x3max + x3min) / 2
    deltax1 = x1max - x01
    deltax2 = x2max - x02
    deltax3 = x3max - x03

    xn = [[-1, -1, -1, +1, +1, +1, -1, +1, +1, +1],
          [-1, -1, +1, +1, -1, -1, +1, +1, +1, +1],
          [-1, +1, -1, -1, +1, -1, +1, +1, +1, +1],
          [-1, +1, +1, -1, -1, +1, -1, +1, +1, +1],
          [+1, -1, -1, -1, -1, +1, +1, +1, +1, +1],
          [+1, -1, +1, -1, +1, -1, -1, +1, +1, +1],
          [+1, +1, -1, +1, -1, -1, -1, +1, +1, +1],
          [+1, +1, +1, +1, +1, +1, +1, +1, +1, +1],
          [-1.73, 0, 0, 0, 0, 0, 0, 2.9929, 0, 0],
          [+1.73, 0, 0, 0, 0, 0, 0, 2.9929, 0, 0],
          [0, -1.73, 0, 0, 0, 0, 0, 0, 2.9929, 0],
          [0, +1.73, 0, 0, 0, 0, 0, 0, 2.9929, 0],
          [0, 0, -1.73, 0, 0, 0, 0, 0, 0, 2.9929],
          [0, 0, +1.73, 0, 0, 0, 0, 0, 0, 2.9929],
          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

    x1 = [x1min, x1min, x1min, x1min, x1max, x1max, x1max, x1max, -1.73 * deltax1 + x01, 1.73 * deltax1 + x01, x01, x01,
          x01, x01, x01]
    x2 = [x2min, x2min, x2max, x2max, x2min, x2min, x2max, x2max, x02, x02, -1.73 * deltax2 + x02, 1.73 * deltax2 + x02,
          x02, x02, x02]
    x3 = [x3min, x3max, x3min, x3max, x3min, x3max, x3min, x3max, x03, x03, x03, x03, -1.73 * deltax3 + x03,
          1.73 * deltax3 + x03, x03]

    x1x2 = [0] * 15
    x1x3 = [0] * 15
    x2x3 = [0] * 15
    x1x2x3 = [0] * 15
    x1kv = [0] * 15
    x2kv = [0] * 15
    x3kv = [0] * 15

    for i in range(15):
        x1x2[i] = x1[i] * x2[i]
        x1x3[i] = x1[i] * x3[i]
        x2x3[i] = x2[i] * x3[i]
        x1x2x3[i] = x1[i] * x2[i] * x3[i]
        x1kv[i] = x1[i] ** 2
        x2kv[i] = x2[i] ** 2
        x3kv[i] = x3[i] ** 2

    list_for_a = round_matrix(list(zip(x1, x2, x3, x1x2, x1x3, x2x3, x1x2x3, x1kv, x2kv, x3kv)))

    planning_matrix_with_naturalized_coeffs_x = PrettyTable()
    planning_matrix_with_naturalized_coeffs_x.title = 'Planning matrix with naturalized X coefficients'
    planning_matrix_with_naturalized_coeffs_x.field_names = ['X1', 'X2', 'X3', 'X1X2', 'X1X3', 'X2X3', 'X1X2X3', 'X1X1',
                                                             'X2X2', 'X3X3']
    planning_matrix_with_naturalized_coeffs_x.add_rows(list_for_a)
    print(planning_matrix_with_naturalized_coeffs_x)

    Y = round_matrix(
        [[function(list_for_a[j][0], list_for_a[j][1], list_for_a[j][2]) for i in range(m)] for j in range(15)])

    planning_matrix_y = PrettyTable()
    planning_matrix_y.title = 'Y planning matrix'
    planning_matrix_y.field_names = ['Y1', 'Y2', 'Y3']
    planning_matrix_y.add_rows(Y)
    print(planning_matrix_y)

    Y_average = []
    for i in range(len(Y)):
        Y_average.append(np.mean(Y[i], axis=0))
    print("Середні значення відгуку за рядками:")
    for i in range(15):
        print("{:.3f}".format(Y_average[i]), end=" ")

    dispersions = []
    for i in range(len(Y)):
        a = 0
        for k in Y[i]:
            a += (k - np.mean(Y[i], axis=0)) ** 2
        dispersions.append(a / len(Y[i]))

    def find_known(num):
        a = 0
        for j in range(15):
            a += Y_average[j] * list_for_a[j][num - 1] / 15
        return a

    def a(first, second):
        a = 0
        for j in range(15):
            a += list_for_a[j][first - 1] * list_for_a[j][second - 1] / 15
        return a

    my = sum(Y_average) / 15
    mx = []

    for i in range(10):
        number_lst = []
        for j in range(15):
            number_lst.append(list_for_a[j][i])
        mx.append(sum(number_lst) / len(number_lst))

    det1 = [
        [1, mx[0], mx[1], mx[2], mx[3], mx[4], mx[5], mx[6], mx[7], mx[8], mx[9]],
        [mx[0], a(1, 1), a(1, 2), a(1, 3), a(1, 4), a(1, 5), a(1, 6), a(1, 7), a(1, 8), a(1, 9), a(1, 10)],
        [mx[1], a(2, 1), a(2, 2), a(2, 3), a(2, 4), a(2, 5), a(2, 6), a(2, 7), a(2, 8), a(2, 9), a(2, 10)],
        [mx[2], a(3, 1), a(3, 2), a(3, 3), a(3, 4), a(3, 5), a(3, 6), a(3, 7), a(3, 8), a(3, 9), a(3, 10)],
        [mx[3], a(4, 1), a(4, 2), a(4, 3), a(4, 4), a(4, 5), a(4, 6), a(4, 7), a(4, 8), a(4, 9), a(4, 10)],
        [mx[4], a(5, 1), a(5, 2), a(5, 3), a(5, 4), a(5, 5), a(5, 6), a(5, 7), a(5, 8), a(5, 9), a(5, 10)],
        [mx[5], a(6, 1), a(6, 2), a(6, 3), a(6, 4), a(6, 5), a(6, 6), a(6, 7), a(6, 8), a(6, 9), a(6, 10)],
        [mx[6], a(7, 1), a(7, 2), a(7, 3), a(7, 4), a(7, 5), a(7, 6), a(7, 7), a(7, 8), a(7, 9), a(7, 10)],
        [mx[7], a(8, 1), a(8, 2), a(8, 3), a(8, 4), a(8, 5), a(8, 6), a(8, 7), a(8, 8), a(8, 9), a(8, 10)],
        [mx[8], a(9, 1), a(9, 2), a(9, 3), a(9, 4), a(9, 5), a(9, 6), a(9, 7), a(9, 8), a(9, 9), a(9, 10)],
        [mx[9], a(10, 1), a(10, 2), a(10, 3), a(10, 4), a(10, 5), a(10, 6), a(10, 7), a(10, 8), a(10, 9), a(10, 10)]]

    det2 = [my, find_known(1), find_known(2), find_known(3), find_known(4), find_known(5), find_known(6), find_known(7),
            find_known(8), find_known(9), find_known(10)]

    beta = solve(det1, det2)
    print("\nОтримане рівняння регресії:")
    print("{:.3f} + {:.3f} * X1 + {:.3f} * X2 + {:.3f} * X3 + {:.3f} * Х1X2 + {:.3f} * Х1X3 + {:.3f} * Х2X3"
          "+ {:.3f} * Х1Х2X3 + {:.3f} * X11^2 + {:.3f} * X22^2 + {:.3f} * X33^2 = ŷ"
          .format(beta[0], beta[1], beta[2], beta[3], beta[4], beta[5], beta[6], beta[7], beta[8], beta[9], beta[10]))
    y_i = [0] * 15
    print("Експериментальні значення:")
    for k in range(15):
        y_i[k] = beta[0] + beta[1] * list_for_a[k][0] + beta[2] * list_for_a[k][1] + beta[3] * list_for_a[k][2] + \
                 beta[4] * list_for_a[k][3] + beta[5] * list_for_a[k][4] + beta[6] * list_for_a[k][5] + beta[7] * \
                 list_for_a[k][6] + beta[8] * list_for_a[k][7] + beta[9] * list_for_a[k][8] + beta[10] * list_for_a[k][
                     9]
    for i in range(15):
        print("{:.3f}".format(y_i[i]), end=" ")

    start1 = time.time()
    print("\n\nПеревірка за критерієм Кохрена")
    Gp = max(dispersions) / sum(dispersions)
    Gt = 0.3346
    print("Gp =", Gp)
    if Gp < Gt:
        print("Дисперсія однорідна")
    else:
        print("Дисперсія неоднорідна")
    end1 = time.time()

    start2 = time.time()
    print("\nПеревірка значущості коефіцієнтів за критерієм Стьюдента")
    sb = sum(dispersions) / len(dispersions)
    sbs = (sb / (15 * m)) ** 0.5

    F3 = (m - 1) * n
    coefs1 = []
    coefs2 = []
    d = 11
    res = [0] * 11
    for j in range(11):
        t_pract = 0
        for i in range(15):
            if j == 0:
                t_pract += Y_average[i] / 15
            else:
                t_pract += Y_average[i] * xn[i][j - 1]
            res[j] = beta[j]
        if fabs(t_pract / sbs) < t.ppf(q=0.975, df=F3):
            coefs2.append(beta[j])
            res[j] = 0
            d -= 1
        else:
            coefs1.append(beta[j])
    print("Значущі коефіцієнти регресії:", [round(i, 3) for i in coefs1])
    print("Незначущі коефіцієнти регресії:", [round(i, 3) for i in coefs2])
    y_st = []
    for i in range(15):
        y_st.append(res[0] + res[1] * x1[i] + res[2] * x2[i] + res[3] * x3[i] + res[4] * x1x2[i] + res[5] *
                    x1x3[i] + res[6] * x2x3[i] + res[7] * x1x2x3[i] + res[8] * x1kv[i] + res[9] *
                    x2kv[i] + res[10] * x3kv[i])
    print("Значення з отриманими коефіцієнтами:")
    for i in range(15):
        print("{:.3f}".format(y_st[i]), end=" ")
    end2 = time.time()

    start3 = time.time()
    print("\n\nПеревірка адекватності за критерієм Фішера")
    Sad = m * sum([(y_st[i] - Y_average[i]) ** 2 for i in range(15)]) / (n - d)
    Fp = Sad / sb
    F4 = n - d
    print("Fp =", Fp)
    if Fp < f.ppf(q=0.95, dfn=F4, dfd=F3):
        print("Рівняння регресії адекватне при рівні значимості 0.05")
    else:
        print("Рівняння регресії неадекватне при рівні значимості 0.05")
    end3 = time.time()

    print('-----------------------------------------------------------------------------------------------------')
    time_cohren = end1 - start1
    time_student = end2 - start2
    time_fisher = end3 - start3
    print("Час початку перевірки за критерієм Кохрена", start1)
    print("Час закінчення перевірки за критерієм Кохрена", end1)
    print("--- Час виконання перевірки за критерієм Кохрена: %s seconds ---" % time_cohren)
    print()
    print("Час початку перевірки за критерієм Стьюдента", start2)
    print("Час закінчення перевірки за критерієм Стьюдента", end2)
    print("--- Час виконання перевірки за критерієм Стьюдента: %s seconds ---" % time_student)
    print()
    print("Час початку перевірки за критерієм Фішера", start3)
    print("Час закінчення перевірки за критерієм Фішера", end3)
    print("--- Час виконання перевірки за критерієм Фішера: %s seconds ---" % time_fisher)
    print()
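The tabulated Cochran value Gt = 0.3346 that is hardcoded above can also be derived from the F distribution. A minimal sketch, assuming q = 0.05, f1 = m - 1 degrees of freedom per series and N = 15 series; cochran_critical is an illustrative helper, not part of the original code:

from scipy.stats import f

def cochran_critical(q, f1, N):
    # Illustrative helper: upper critical value of Cochran's C statistic
    # for N variances, each estimated with f1 degrees of freedom.
    fisher = f.ppf(1 - q / N, f1, (N - 1) * f1)
    return fisher / (fisher + N - 1)

# cochran_critical(0.05, 2, 15) returns roughly 0.3346, matching Gt above for m = 3.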
report_path = sys.argv[2]
tables_path = sys.argv[3]

# read report files for each ml algorithm
df_dict = {'knn' : pd.read_csv(report_path+'precision_recall_knn_experiments.csv'),
           'dt' : pd.read_csv(report_path+'precision_recall_dt_experiments.csv'),
           'rf' : pd.read_csv(report_path+'precision_recall_rf_experiments.csv')}

# generate LaTeX code and save in text file
for ml in ['knn','dt','rf']:
    precision = df_dict[ml][[f'test{x} precision' for x in range(1,noOfTests+1)]]
    recall = df_dict[ml][[f'test{x} recall' for x in range(1,noOfTests+1)]]
    
    # calculate mean and t-based error margin (0.999 quantile of t) for precision and recall
    df_dict[ml]['precision mean'] = precision.mean(axis = 1)
    df_dict[ml]['precision error'] = t.ppf(.999, noOfTests-1) * ( precision.std(axis = 1) / np.sqrt(noOfTests))
    df_dict[ml]['recall mean'] = recall.mean(axis = 1)
    df_dict[ml]['recall error'] = t.ppf(.999, noOfTests-1) * ( recall.std(axis = 1) / np.sqrt(noOfTests))
    
    # LaTeX code generation
    head = '\\begin{table}[htpb]\n\\centering\n\\resizebox{\\textwidth}{!}{%\n'
    table = '\\begin{tabular}{l'+'c'*(noOfTests*2)+'cc}\n\\cline{2-'+str(noOfTests*2+3)+'}\n'
    title1 = '\\multicolumn{1}{c}{\\textbf{}} & '
    title2 = '\\textbf{Features}'
    for i in range(1,noOfTests+1):
        title1 += '\\multicolumn{2}{c}{\\textbf{Test '+str(i)+'}} & '
        title2 += ' & \\textbf{Precision} & \\textbf{Recall}'
    title1 += '\\multicolumn{2}{c}{\\textbf{\\begin{tabular}[c]{@{}c@{}}Confidence\\\\ Interval 99\\%\\end{tabular}}} \\\\ \\hline \n'
    title2 += ' & \\textbf{Precision} & \\textbf{Recall} \\\\ \\hline \\hline \n'
    
    body = ''
Esempio n. 40
0
                 (-1) + y8av8 * 1) / 8
        beta7 = (y1av1 * (-1) + y2av2 * 1 + y3av3 * 1 + y4av4 *
                 (-1) + y5av5 * 1 + y6av6 * (-1) + y7av7 *
                 (-1) + y8av8 * 1) / 8

        t0 = abs(beta0) / sbs
        t1 = abs(beta1) / sbs
        t2 = abs(beta2) / sbs
        t3 = abs(beta3) / sbs
        t4 = abs(beta4) / sbs
        t5 = abs(beta5) / sbs
        t6 = abs(beta6) / sbs
        t7 = abs(beta7) / sbs

        f3 = f1 * f2
        ttabl = round(abs(t.ppf(q / 2, f3)), 4)

        d = 8
        if t0 < ttabl:
            print("t0 < ttabl, b0 is not significant")
            b0 = 0
            d = d - 1
        if t1 < ttabl:
            print("t1 < ttabl, b1 is not significant")
            b1 = 0
            d = d - 1
        if t2 < ttabl:
            print("t2 < ttabl, b2 is not significant")
            b2 = 0
            d = d - 1
        if t3 < ttabl:
import numpy as np
from scipy.stats import sem, t


def mean_confidence_interval(data, confidence=0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), sem(a)  # standard error of the mean (sem defaults to ddof=1)
    h = se * t.ppf((1 + confidence) / 2., n - 1)
    return m, m - h, m + h
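A quick usage sketch for the helper above; the sample values are made up:

sample = [5.1, 4.8, 5.3, 5.0, 4.9, 5.2]          # illustrative data only
m, lo, hi = mean_confidence_interval(sample)
print("mean = {:.3f}, 95% CI = ({:.3f}, {:.3f})".format(m, lo, hi))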
            i += 1
            log[j * 50].append(cost)  #seconds
#%%
"""
        DATA MINING
"""
data = pd.DataFrame.from_dict(log).swapaxes(0, 1)
mean = data.mean(numeric_only=True, axis=1)
std = data.std(axis=1)
data["Mean (sec.)"] = mean
data["STD"] = std
data.reset_index(inplace=True)
data = data.rename(columns={'index': 'Size(D)'})
data = data[['Size(D)', 'Mean (sec.)', "STD"]]
data["Sm"] = (data["STD"] / np.sqrt((data['Size(D)'])))
data["h_95"] = (data["Sm"] * t.ppf((1 + 0.95) / 2, data["Size(D)"] - 1))
data["h_90"] = (data["Sm"] * t.ppf((1 + 0.90) / 2, data["Size(D)"] - 1))
data.plot.line(x="Size(D)", y="Mean (sec.)")

#%%
"""
        ISCOVER CHECKS IF
        IT IS COVER 
        FOR THE SAKE OF CORRECTNESS
        MAIN_TEST GUARANTEES CORRECT INPUT
"""


def isCover(universe, Cover):
    for subset in Cover:
        universe = universe - subset
Esempio n. 43
0
def get_student_value(f3, significance):
    from decimal import Decimal
    from scipy.stats import t
    return Decimal(abs(t.ppf(significance / 2,
                             f3))).quantize(Decimal('.0001')).__float__()
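For example, with f3 = 8 and a two-sided significance of 0.05 the helper returns the familiar table value:

t_table = get_student_value(f3=8, significance=0.05)
print(t_table)  # 2.306, the two-sided 95% Student critical value for 8 degrees of freedom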
Esempio n. 44
0
Sb = sum(syList) / N
S = math.sqrt(Sb / (N * m))

bettaList = [
    sum([syList[i] * normValuesOfX0[i] for i in range(N)]) / N,
    sum([syList[i] * normValuesOfX1[i] for i in range(N)]) / N,
    sum([syList[i] * normValuesOfX2[i] for i in range(N)]) / N,
    sum([syList[i] * normValuesOfX3[i] for i in range(N)]) / N
]
bettaList = [round(i, 2) for i in bettaList]

tList = [abs(bettaList[i]) / S for i in range(N)]  # t-values: |beta_i| / s_beta

for i in range(N):
    if tList[i] < t.ppf(
            q=0.975,
            df=f3):  # Student's criterion check using scipy
        bList[i] = 0
        d -= 1
        print('Excluding coefficient b' + str(i) + ' from the equation')

print("y = " + str(bList[0]) + ' + (' + str(bList[1]) + ") * x1 + (" +
      str(bList[2]) + ") * x2 + (" + str(bList[3]) + ") * x3")

# Fisher's criterion
print("=================Fisher's criterion=================")
f4 = N - d
S_ad = (m * sum([(bList[0] + bList[1] * x1List[i] + bList[2] * x2List[i] +
                  bList[3] * x3List[i] - avgYList[i])**2
                 for i in range(N)]) / f4)
Fp = S_ad / Sb
Esempio n. 45
0
#Get predictions from training data
Y_train_pred=[y(val[0]) for val in X_train]

#Get degrees of freedom
deg_f=len(Y_train_pred)-3

#Compute MSres from training data
pred_true_df=pd.DataFrame({'Pred.':Y_train_pred, 'True':[val[0] for val in Y_train]})
pred_true_df['Resid_sqr']=pred_true_df.apply(lambda row: (row['Pred.']-row['True'])**2, axis=1)
    
RSS=sum(pred_true_df['Resid_sqr'])
MSres=RSS/deg_f

#Get tc critical value from t distribution
t_c=t.ppf(.975, df=deg_f)  # positive two-sided 95% critical value

#Save training data for plotting             
Y_train_plot=[val[0] for val in Y_train]
X_train_plot=[val[0] for val in X_train]
    
#Prepend column of 1s to data array and interpret as matrix
X_train=np.asmatrix([[1,val[0],val[1]] for val in X_train])
X_train_T=X_train.transpose()
C=np.dot(X_train_T,X_train).I

#Upper confidence window for y
def y_up(x):
    A=np.asmatrix([1,x,x**2])
    se=math.sqrt(MSres*np.dot(np.dot(A,C),A.transpose()))
    return(y(x)+t_c*se)
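The matching lower band is not shown in the snippet; a sketch mirroring y_up with the same MSres, C and t_c would be:

#Lower confidence window for y (sketch mirroring y_up above)
def y_low(x):
    A=np.asmatrix([1,x,x**2])
    se=math.sqrt(MSres*np.dot(np.dot(A,C),A.transpose()))
    return(y(x)-t_c*se)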
Esempio n. 46
0
# grid for the degrees of freedom parameter
nu_vec_cop = np.arange(nu_min_copula, nu_max_copula + 1)
l_ = len(nu_vec_cop)

# initialize variables
rho2_copula_vec = np.zeros((i_, i_, l_))
llike_nu = np.zeros(l_)
epsi_tilde = np.zeros((t_, i_, l_))

db_estimation_copula = {}

for l in range(l_):
    # calculate standardized invariants
    for i in range(i_):
        epsi_tilde[:, i, l] = tstu.ppf(u[:, i], nu_vec_cop[l])

    # estimate copula parameters with maximum likelihood
    _, sig2 = \
        fit_locdisp_mlfp_difflength(epsi_tilde[:, :, l],
                                    p=p_copula,
                                    nu=nu_vec_cop[l],
                                    threshold=10 ** -3,
                                    maxiter=1000)

    # shrinkage: factor analysis
    beta, delta2 = factor_analysis_paf(sig2, k_)
    sig2_fa = beta @ beta.T + np.diag(delta2)

    # compute correlation matrix
    rho2_copula_vec[:, :, l], _ = cov_2_corr(sig2_fa)
Esempio n. 47
0
import numpy as np
from scipy.stats import t

melons = [7.72, 9.58, 12.38, 7.77, 11.27, 8.80, 11.10, 7.80, 10.17, 6.00]
melons = np.array(melons)
xbar = np.mean(melons)
s_x = np.std(melons, ddof=1)
alpha = 0.05
n = np.size(melons)

t_crit = t.ppf(alpha / 2.0, n - 1)   # lower-tail critical value; abs() is taken below
confid_lower = xbar - abs(t_crit) * s_x / np.sqrt(n)
confid_upper = xbar + abs(t_crit) * s_x / np.sqrt(n)

print("t critical value: ", t_crit)
print("Confid. interval: ", [confid_lower, confid_upper])
Esempio n. 48
0
# We compute the coeff.
beta_0, beta_1 = np.linalg.lstsq(XX, yy, rcond=None)[0]
beta = [beta_0, beta_1]

# Calculate the SSE
SSE = np.linalg.lstsq(XX, yy, rcond=None)[1]

# We get confidence interval
alpha = 0.05
x0 = np.linspace(7, 15, 50)
X0 = np.array([np.ones(len(x0)), x0]).T

aux_t_conf = np.sqrt(SSE / (n - p) *
                     (np.diag(X0 @ np.linalg.inv(XX.T @ XX) @ X0.T)))
yy0_hat = X0 @ np.array([beta_0, beta_1])
upp_conf = yy0_hat + t.ppf(1 - alpha / 2, n - p) * aux_t_conf
low_conf = yy0_hat - t.ppf(1 - alpha / 2, n - p) * aux_t_conf

# We get prediction interval
aux_t_pred = np.sqrt(SSE / (n - p) *
                     (1 + np.diag(X0 @ np.linalg.inv(XX.T @ XX) @ X0.T)))
yy0_hat = X0 @ np.array([beta_0, beta_1])
upp_pred = yy0_hat + t.ppf(1 - alpha / 2, n - p) * aux_t_pred
low_pred = yy0_hat - t.ppf(1 - alpha / 2, n - p) * aux_t_pred

plt.figure(figsize=(10, 5))
plt.plot(dat[:, 0], yy, 'o', label='Original data', markersize=5)
plt.plot(x0, beta_0 + beta_1 * x0, 'r', label='Fitted line')
plt.fill_between(x0,
                 low_pred,
                 upp_pred,
Esempio n. 49
0
def getStudentVal(f3, q):
    return Decimal(abs(t.ppf(q / 2,
                             f3))).quantize(Decimal('.0001')).__float__()
def CI_vs_samples(distributions, samples_list, samples_to_plot):

    avgs_nr = 10000
    fig, ax = plt.subplots(3, 3, figsize=(16, 15))

    results = []

    for idx, distribution in enumerate(distributions):

        CI_df_cols = [
            'n', 'Normal', 'Exp. Sigma', 'Sigma', 'CI_norm', 'CI_norm_score',
            'CI_t', 'CI_t_score'
        ]
        CI_df = pd.DataFrame(columns=CI_df_cols)

        real_avg = np.mean(random_data_avg(distribution, 1000, avgs_nr))

        # Repeat for each of the sample sizes
        for i, samples in enumerate(samples_list):

            data_avgs = []
            CI_norm_score = []
            CI_t_score = []

            # Repeat 10k times for each sample size
            for j in range(avgs_nr):

                # Generates random data
                data = random_data(distribution, samples)
                data_avg = np.mean(data)
                data_avgs.append(data_avg)

                # Computes the CI assuming normal
                data_std = np.std(data)
                CI = 1.96 * data_std / np.sqrt(samples)
                lower = data_avg - CI
                upper = data_avg + CI
                CI_norm_score.append(lower <= real_avg <= upper)

                # Computes the CI assuming t-distribution
                confidence = 0.95
                std_err = sem(data)
                h = std_err * t.ppf((1 + confidence) / 2, samples - 1)
                lower = data_avg - h
                upper = data_avg + h
                CI_t_score.append(lower <= real_avg <= upper)

            # Plots the histogram
            if samples in samples_to_plot:
                label = "n = {}".format(samples)
                color = color_lin_gradient(
                    np.array([1, 0, 0]), np.array([0.2, 0, 1]),
                    len(samples_to_plot))[samples_to_plot.index(samples)]
                ax[idx, 0].hist(data_avgs, bins=50, label=label, color=color)
                qqplot(np.array(data_avgs),
                       fit=True,
                       line='45',
                       ax=ax[idx, 1],
                       label=label,
                       color=color)

            # Computes the std deviation
            is_normal = normaltest(data_avgs)[-1] > 0.05
            data_std = np.std(random_data(distribution, samples))
            expected_avgs_std = data_std / np.sqrt(samples)
            real_avgs_std = np.std(data_avgs)

            # Update the series and add to the dataframe
            CI_series = pd.Series(index=CI_df_cols,
                                  data=[
                                      samples, is_normal, expected_avgs_std,
                                      real_avgs_std, 2 * CI,
                                      np.mean(CI_norm_score), 2 * h,
                                      np.mean(CI_t_score)
                                  ])
            # DataFrame.append was removed in pandas 2.0; concatenate the row instead
            CI_df = pd.concat([CI_df, CI_series.to_frame().T], ignore_index=True)

        # Plots the graphs
        ax[idx, 0].set_xlabel("Value")
        ax[idx, 0].set_ylabel("Count")
        ax[idx, 0].set_xlim(0, 100)
        ax[idx, 0].legend()
        ax[idx, 1].legend()
        if distribution == 'exponential':
            ax[idx, 1].set_xlim(-4, 4)
            ax[idx, 1].set_ylim(-4, 6)

        ax[idx, 2].set_xlabel("n")
        ax[idx, 2].set_ylabel("CI score")
        ax[idx, 2].plot(samples_list,
                        CI_df['CI_norm_score'].values,
                        'o',
                        label="Normal Approx (Eq.2)",
                        color="crimson")
        ax[idx, 2].plot(samples_list,
                        CI_df['CI_t_score'].values,
                        'o',
                        label="T-Distr Approx (Eq.3)",
                        color="blue")
        ax[idx, 2].plot(samples_list,
                        len(samples_list) * [0.95],
                        '--',
                        label='Theoretical CI',
                        color='black')
        ax[idx, 2].set_xscale('log')
        ax[idx, 2].legend()

        results.append(CI_df)

    return results
Esempio n. 51
0
def calculate_fdc(
    input_ts="-",
    columns=None,
    start_date=None,
    end_date=None,
    clean=False,
    skiprows=None,
    index_type="datetime",
    names=None,
    percent_point_function=None,
    plotting_position="weibull",
    source_units=None,
    target_units=None,
    sort_values="ascending",
    sort_index="ascending",
    add_index=False,
    include_sd=False,
    include_cl=False,
    ci=0.9,
):
    """Return the frequency distribution curve."""
    sort_values = bool(sort_values == "ascending")

    tsd = tsutils.common_kwds(
        tsutils.read_iso_ts(input_ts,
                            skiprows=skiprows,
                            names=names,
                            index_type=index_type),
        start_date=start_date,
        end_date=end_date,
        pick=columns,
        source_units=source_units,
        target_units=target_units,
        clean=clean,
    )

    ppf = tsutils.set_ppf(percent_point_function)
    newts = pd.DataFrame()
    for col in tsd:
        tmptsd = tsd[col].dropna()
        if len(tmptsd) > 1:
            xdat = ppf(
                tsutils.set_plotting_position(tmptsd.count(),
                                              plotting_position))
            tmptsd.sort_values(ascending=sort_values, inplace=True)
            tmptsd.index = xdat * 100
            tmptsd = pd.DataFrame(tmptsd)
            if include_sd is True or include_cl is True:
                sd = (xdat * (1 - xdat) / len(xdat))**0.5
            if include_sd is True:
                tmptsd[col + "_sd"] = sd
            if include_cl is True:
                tval = t.ppf(ci, df=len(xdat) - 1)
                ul = 2 * (1 - xdat) * tval * sd
                ll = 2 * xdat * tval * sd
                tmptsd[col + "_ul"] = (xdat + ul) * 100
                tmptsd[col + "_ll"] = (xdat - ll) * 100
                tmptsd[col + "_vul"] = tmptsd[col] + ul * tmptsd[col]
                tmptsd[col + "_vll"] = tmptsd[col] - ll * tmptsd[col]
        else:
            tmptsd = pd.DataFrame()
        newts = newts.join(tmptsd, how="outer")
    newts.index.name = "Plotting_position"
    newts = newts.groupby(newts.index).first()
    if sort_index == "descending":
        return newts.iloc[::-1]
    if add_index is True:
        newts.reset_index(inplace=True)
    return newts
Esempio n. 52
0
]) / N
b_1 = sum([
    globals()['y' + str(i + 1) + '_abs'] * matrix[i][1]
    for i in range(len(matrix))
]) / N
b_2 = sum([
    globals()['y' + str(i + 1) + '_abs'] * matrix[i][2]
    for i in range(len(matrix))
]) / N
b_3 = sum([
    globals()['y' + str(i + 1) + '_abs'] * matrix[i][3]
    for i in range(len(matrix))
]) / N

f3 = f1 * f2
t_kr = t.ppf(df=f3, q=(1 + 0.95) / 2)

d = 0
# use t_vals so that scipy's t distribution is not shadowed
t_vals = [abs(globals()['b_' + str(i)]) / s_beta for i in range(N)]

for i in range(len(t_vals)):
    if t_vals[i] < t_kr:
        t_vals[i] = 0
    else:
        t_vals[i] = 1
        d += 1

print("b{} is insignificant".format([i for i in range(len(t_vals)) if t_vals[i] == 0]))

# Equations with the significant coefficients
Esempio n. 53
0
def student_value(f3, significance):
    return Decimal(abs(t.ppf(significance / 2, f3))).quantize(Decimal('.0001')).__float__()
# We compute the coeff.
B, C, D = np.linalg.lstsq(XX, yy, rcond=None)[0]
params = [B, C, D]

# Calculate the SSE
SSE = np.linalg.lstsq(XX, yy, rcond=None)[1]

# We get confidence interval
alpha = 0.05
x0 = np.linspace(-1.5, 3.5, 50)
X0 = np.vstack([x0**2, x0, np.ones(len(x0))]).T

aux_t_conf = np.sqrt(SSE / (n - p) *
                     (np.diag(X0 @ np.linalg.inv(XX.T @ XX) @ X0.T)))
yy0_hat = X0 @ np.array(params)
upp_conf = yy0_hat + t.ppf(1 - alpha / 2, n - p) * aux_t_conf
low_conf = yy0_hat - t.ppf(1 - alpha / 2, n - p) * aux_t_conf

# We get prediction interval
aux_t_pred = np.sqrt(SSE / (n - p) *
                     (1 + np.diag(X0 @ np.linalg.inv(XX.T @ XX) @ X0.T)))
yy0_hat = X0 @ np.array(params)
upp_pred = yy0_hat + t.ppf(1 - alpha / 2, n - p) * aux_t_pred
low_pred = yy0_hat - t.ppf(1 - alpha / 2, n - p) * aux_t_pred

plt.figure(figsize=(7.5, 7.5))
plt.plot(points_x, points_y, 'o', label='Original data', markersize=5)
plt.plot(x0, B * x0**2 + C * x0 + D, 'r', label='Fitted parabola')
plt.fill_between(x0,
                 low_pred,
                 upp_pred,
Esempio n. 55
0
def find_critvals(n: int, r: int, alpha: float) -> list:
    """Computes critical values :math:`\lambda_i` for the 
    generalized extreme Studentized deviate (ESD) test.

    Parameters
    ----------
    n:
        Number of data points.
    r:
        Maximum number of outliers.
    alpha:
        Significance level for the statistical test.
    
    Returns
    -------
    :
        Critical values.

    Notes
    -----
    The :math:`\lambda_i` values are calculated as follows:

    .. math::

        \lambda_i = \\frac{ (n-i)\ t_{p, n-i-1} }{ \sqrt{(n-i-1+t_{p, n-i-1}^2)(n-i+1)} }
        \quad i \in \{1,2, \dots, r \}

    .. math::

        p = 1 - \\frac{\\alpha}{2(n-i+1)}

    Where

    - :math:`n`       : number of points in the array.
    - :math:`\\alpha` : significance level.
    - :math:`t_{p,v}` : percent point function of the t-distribution 
      at :math:`p` value and :math:`v` degrees of freedom.
    - :math:`r`               : maximum number of outliers.

    Example
    -------
    >>> from araucaria.stats import find_critvals
    >>> n     = 54    # number of points
    >>> r     = 5     # max number of outliers
    >>> alpha = 0.05  # significance level
    >>> lambd = find_critvals(n, r, alpha)
    >>> for val in lambd:
    ...     print('%1.3f' % val)
    3.159
    3.151
    3.144
    3.136
    3.128
    """
    critvals = []  # container for critical values
    for i in range(1, r + 1):
        p = 1 - (alpha / (2 * (n - i + 1)))

        # finds t value corresponding to probability that
        # sample within data set is itself an outlying point
        tval = t.ppf(p, n - i - 1)
        val = ((n - i) * tval) / (((n - i - 1 + (tval**2)) *
                                   (n - i + 1))**(1 / 2))
        critvals.append(val)
    return critvals
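These critical values are compared against the ESD test statistics R_i; a purely illustrative sketch (not part of araucaria) of how those statistics could be computed:

import numpy as np

def esd_statistics(x, r):
    """Return the r generalized ESD test statistics R_i for array x (illustrative)."""
    x = np.asarray(x, dtype=float).copy()
    stats = []
    for _ in range(r):
        z = np.abs(x - x.mean()) / x.std(ddof=1)   # studentized absolute deviations
        k = int(np.argmax(z))
        stats.append(z[k])                         # R_i = max |x_i - mean| / s
        x = np.delete(x, k)                        # drop the most extreme point and repeat
    return stats

# The estimated number of outliers is the largest i with R_i > lambda_i.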
Esempio n. 56
0
import numpy as np
from scipy.stats import norm, chi2, t

#print("Лабораторная работа №8; Выполнила: Фомина Дарья\n")

selections = [norm.rvs(size=20), norm.rvs(size=100)]
gamma = 0.95
for sel in selections:
    xm = np.mean(sel)
    s = np.sqrt(np.mean(sel*sel) - xm**2)
    n = len(sel)
    
    print(f'\n\tSize: {n}')
    ct = t.ppf((1 + gamma) / 2, n - 1)
    chi_low = chi2.ppf((1 + gamma) / 2, n - 1)
    chi_high = chi2.ppf((1 - gamma) / 2, n - 1)
    
    print('\n\tClassical interval estimates')
    dx = s*ct*(n - 1)**(-0.5)
    print(f'm in ({xm-dx}; {xm+dx})')
    print(f's in ({s*(n/chi_low)**(0.5)}; {s*(n/chi_high)**(0.5)})')
    
    print('\n\tAsymptotic interval estimates')
    cu = norm.ppf((1 + gamma) / 2)
    dx = s * cu *(n**(-0.5))
    m4 = np.mean((sel - xm)**4)
    e = m4/(s**4) - 3
    U = cu*np.sqrt((e+2)/n)
    print(f'm in ({xm-dx}; {xm+dx})')
    print(f's in ({s*(1+U)**(-0.5)}; {s*(1-U)**(-0.5)})')
Esempio n. 57
0
File: speu.py Project: hmedal/speu2
def get_tdist_hw(x):
    n = len(x)
    return tDist.ppf(1 - alpha / 2.0, n - 1) * np.std(x) / np.sqrt(n)
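A hedged usage sketch; alpha and tDist (scipy.stats.t) are assumed to be defined at module level in the original project, and the sample values are made up:

alpha = 0.05
x = [10.2, 9.8, 10.5, 10.1, 9.9]
print("mean +/- half-width:", np.mean(x), "+/-", get_tdist_hw(x))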
Esempio n. 58
0
def filter_cells(adata: AnnData,
                 device="cpu",
                 p_level=None,
                 subset=True,
                 plot=False,
                 copy=False):
    """\
    Filter cells based on the gene/molecule relationship.

    Code has been translated from pagoda2 R function gene.vs.molecule.cell.filter.


    Parameters
    ----------
    adata
        Annotated data matrix.
    device
        Run gene and molecule counting on either `cpu` or on `gpu`.
    p_level
        Statistical confidence level for deviation from the main trend, used for cell filtering (default=min(1e-3,1/adata.shape[0]))
    subset
        if False, add a column `outlier` in adata.obs, otherwise subset the adata.
    plot
        Plot the molecule distribution and the gene/molecule dependency fit.
    copy
        Return a copy instead of writing to adata.
    Returns
    -------

    adata : anndata.AnnData
        if `copy=True` and `subset=True` it returns subsetted (removing outliers) or else add fields to `adata`:

        `.obs['outlier']`
            whether a cell is an outlier.

    """

    adata = adata.copy() if copy else adata

    logg.info("Filtering cells", reset=True)
    X = adata.X.copy()

    logg.info("    obtaining gene and molecule counts")
    if device == "cpu":
        log1p_total_counts = np.log1p(np.array(X.sum(axis=1))).ravel()
        X.data = np.ones_like(X.data)
        log1p_n_genes_by_counts = np.log1p(np.array(X.sum(axis=1))).ravel()
    elif device == "gpu":
        import cupy as cp
        from cupyx.scipy.sparse import csr_matrix as csr_matrix_gpu

        X = csr_matrix_gpu(X)
        log1p_total_counts = cp.log1p(X.sum(axis=1)).get().ravel()
        X.data = cp.ones_like(X.data)
        log1p_n_genes_by_counts = cp.log1p(X.sum(axis=1)).get().ravel()

    df = pd.DataFrame(
        {
            "log1p_total_counts": log1p_total_counts,
            "log1p_n_genes_by_counts": log1p_n_genes_by_counts,
        },
        index=adata.obs_names,
    )

    logg.info("    fitting RLM")

    rlm_model = sm.RLM.from_formula(
        "log1p_n_genes_by_counts ~ log1p_total_counts",
        df,
    ).fit()

    p_level = min(1e-3, 1 / adata.shape[0]) if p_level is None else p_level

    SSE_line = ((df.log1p_n_genes_by_counts - rlm_model.predict())**2).sum()
    MSE = SSE_line / df.shape[0]
    z = t.ppf((p_level / 2, 1 - p_level / 2), df.shape[0])

    se = np.zeros(df.shape[0])
    get_SE(MSE, df.log1p_total_counts.values, se)
    pr = pd.DataFrame(
        {
            0: rlm_model.predict(),
            1: rlm_model.predict() + se * z[0],
            2: rlm_model.predict() + se * z[1],
        },
        index=adata.obs_names,
    )

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")

    outlier = (df.log1p_n_genes_by_counts <
               pr[1]) | (df.log1p_n_genes_by_counts > pr[2])

    if plot:
        fig, ax = plt.subplots()
        idx = df.sort_values("log1p_total_counts").index
        ax.fill_between(
            df.log1p_total_counts[[idx[0], idx[-1]]],
            pr[1][[idx[0], idx[-1]]],
            pr[2][[idx[0], idx[-1]]],
            color="yellow",
            alpha=0.3,
        )
        df.loc[~outlier].plot.scatter(x="log1p_total_counts",
                                      y="log1p_n_genes_by_counts",
                                      c="k",
                                      ax=ax,
                                      s=1)
        df.loc[outlier].plot.scatter(x="log1p_total_counts",
                                     y="log1p_n_genes_by_counts",
                                     c="grey",
                                     ax=ax,
                                     s=1)

    if subset:
        adata._inplace_subset_obs(adata.obs_names[~outlier])
        logg.hint("subsetted adata.")

    else:
        adata.obs["outlier"] = outlier
        logg.hint("added \n"
                  "    .obs['outlier'], boolean column indicating outliers.")

    return adata if copy else None
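A minimal usage sketch; loading the data with scanpy is an assumption and the file name is a placeholder:

import scanpy as sc

adata = sc.read_h5ad("raw_counts.h5ad")          # placeholder file name
filter_cells(adata, p_level=1e-3, subset=True, plot=True)
print(adata.shape)                               # outlier cells removed in place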
Esempio n. 59
0
fplot = norm.pdf(xplot,xquer,s)
fig = plt.figure(1, figsize=(12, 4))
ax1, ax2 = fig.subplots(1,2)
ax1.hist(X, 10, density=True, facecolor='b')
ax1.plot(xplot,fplot,'r')
ax1.set_xlabel('Volume flow Q / m³/h')
ax1.set_ylabel('Probability density')
ax1.grid(False)
ax1.axis([0.48,0.54,0,80])
ax2.boxplot(X)
ax2.set_ylabel('Volume flow Q / m³/h')

""" Berechnung und Ausgabe der Parameter mit Konfidenzbereich """

gamma = 0.95
c1 = t.ppf((1-gamma)/2,N-1)
c2 = t.ppf((1+gamma)/2,N-1)
mu = round(xquer,3)
muc1 = round(xquer - c2*s/np.sqrt(N),3)
muc2 = round(xquer - c1*s/np.sqrt(N),3)
c1 = chi2.ppf((1-gamma)/2,N-1)
c2 = chi2.ppf((1+gamma)/2,N-1)
sig = round(s,3)
sigc1 = round(s*np.sqrt(N/c2),3)
sigc2 = round(s*np.sqrt(N/c1),3)
print(' ')
print('Confidence intervals')
print('Mean                : ', muc1, '<=', mu, '<=', muc2)
print('Standard deviation  : ', sigc1, '<=', sig, '<=', sigc2)

""" Durchführung Hypothesentest """
Esempio n. 60
0
print("\t Standard Error of the Mean = {:.5f}".format(std_err1))

# standard error (SE) of mean of sample2
std_err2 = s2_stdv / np.sqrt(n2)

print("\nSample 2: \n\t Number of Observations = {} \n\t Mean = {:.5f}".format(
    n2, s2_mean))
print("\t Standard Deviation = {:.5f}".format(s2_stdv))
print("\t Standard Error of the Mean = {:.5f}".format(std_err2))

# calculation of t-statistic and degrees of freedom
tstatistic, dof, sp = ttest_and_variance(s1_stdv, s2_stdv)
print("\nt-statistic: {:.5f}".format(tstatistic))

# calculation of Critical values
tcritical_l = t.ppf(q=los / 2, df=dof)
tcritical_u = -tcritical_l
print("\nCritical values are {:.5f}, {:.5f}".format(tcritical_l, tcritical_u))

# decision making: t-statistic and Critical values
if tstatistic < tcritical_l or tstatistic > tcritical_u:
    print("Reject the Null hypothesis.")
else:
    print("Fail to reject the Null hypothesis.")

# calculation of p-value
pvalue = 2 * t.cdf(-abs(tstatistic), df=dof)  # two-sided p-value
print("\np-value: {:.5f}".format(pvalue))

# decision making: p-value and level of significance
if pvalue < los: print("Reject the Null hypothesis.")