def get_stats(values, intervals=True):
    stats = {}
    values_array = np.array(values, dtype=np.float64)
    # float() replaces np.asscalar(), which was removed in NumPy 1.23
    stats['min'] = float(np.amin(values_array))
    stats['max'] = float(np.amax(values_array))
    stats['mean'] = float(np.mean(values_array))
    stats['median'] = float(np.median(values_array))
    if values_array.size > 1:
        stats['std_dev'] = float(np.std(values_array, ddof=1))
    else:
        stats['std_dev'] = 0
    if intervals:
        stats['intervals'] = []
        loc = stats['mean']
        scale = stats['std_dev'] / sqrt(values_array.size)
        for alpha in (.95, .99, .90, .85, .80, .50):
            # normal approximation for large samples, Student's t otherwise
            if values_array.size > 30:
                interval = norm.interval(alpha, loc=loc, scale=scale)
            else:
                interval = t.interval(alpha, values_array.size - 1, loc, scale)
            stats['intervals'].append(
                {'confidence': alpha, 'interval': interval})
    return stats
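# A minimal standalone sketch (not part of the snippet above; the sample data
# is made up) of the two branches get_stats() switches between: Student's t
# for small samples and the normal approximation once n > 30.
import numpy as np
from math import sqrt
from scipy.stats import norm, t

sample = np.array([9.8, 10.4, 10.1, 9.6, 10.9, 10.2, 9.9, 10.5])
loc = sample.mean()
scale = sample.std(ddof=1) / sqrt(sample.size)

# With only 8 points the t interval is noticeably wider than the normal one
print(t.interval(0.95, sample.size - 1, loc=loc, scale=scale))
print(norm.interval(0.95, loc=loc, scale=scale))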
def t_fun():
    # accumulate probability from -infinity to 3.0777
    res = t.cdf(3.0777, df=1)
    print(res)
    # endpoints containing the middle 95% of the distribution
    a, b = t.interval(0.95, 1)
    print(a, b)
def get_t_distro_outlier_bound_estimation(array, background_std):
    narray = rm_nans(array)
    low, up = t.interval(0.95, narray.shape[0] - 1,
                         np.mean(narray),
                         np.sqrt(np.var(narray) + background_std ** 2))
    up, low = (up - np.mean(narray), np.mean(narray) - low)
    return max(up, low)
def first_order_uncer(reading, conf=0.95):
    """
    Calculate the first-order uncertainty of the mean of time-series data.

    Parameters:
    ===========
    reading: array
        readings of the measurement in the time series
    conf: float
        Confidence level. Default is 95%.

    Returns:
    ===========
    first-order uncertainty of the mean of the time-series data
    """
    # sample standard deviation
    num = len(reading)
    sample_sigma = np.std(reading, ddof=1)
    # standard deviation of the mean
    mean_sigma = sample_sigma / sqrt(num - 1)
    # two-sided t critical value
    k = t.interval(conf, num - 1)[1]
    return k * mean_sigma
def CV_experiment(features, values, log):
    indeces = np.array(range(len(features)))
    np.random.seed(1)
    np.random.shuffle(indeces)
    k_cv = 10
    # integer fold size (// keeps this working under Python 3)
    test_set_len = len(features) // k_cv
    R_sq_array = []
    for k in range(k_cv):
        train_i = indeces[list(range(0, k * test_set_len)) +
                          list(range((k + 1) * test_set_len, len(features)))]
        test_i = indeces[list(range(k * test_set_len, (k + 1) * test_set_len))]
        model = sm.OLS(values[train_i], features[train_i])
        results = model.fit()
        predicted_values = results.predict(features[test_i])
        r_sq = compute_r_squared(values[test_i], predicted_values)
        R_sq_array.append(r_sq)
        log.write(str(r_sq) + '\n')
    Rm = np.mean(R_sq_array)
    Rsig = np.std(R_sq_array)
    conf_interval = t.interval(.95, len(R_sq_array) - 1, loc=Rm, scale=Rsig)
    log.write("\nAverage R squared\n")
    log.write(str(Rm))
    log.write("\nR squared STD\n")
    log.write(str(Rsig))
    log.write("\nR squared 95% confidence interval:\n")
    log.write(str(conf_interval))
def lower_and_upper(means, stds, n):
    lower, upper = [0] * len(means), [0] * len(means)
    for i in range(len(means)):
        m = means[i]
        s = stds[i]
        lower[i], upper[i] = student_t.interval(0.95, n, loc=m, scale=s)
    return lower, upper
def t_check(l, mu, alpha=0.05):
    aver = average(l)
    n = len(l)
    ss = s_2(l)
    tt = (aver - mu) * sqrt(n) / sqrt(ss)
    # two-sided acceptance region at level alpha
    lo, hi = t.interval(1 - alpha, n - 1)
    if tt > hi or tt < lo:
        print('reject')
    else:
        print('accept')
def calc_scipy():
    # read data
    data = loadtxt(DATA_PATH, delimiter=",", skiprows=1, usecols=(1, 2))
    # calculation
    t_value, p_value = ttest_rel(data[:, 0], data[:, 1])
    # extra check against the t critical values
    df = data.shape[0] - 1
    t_dist = t.interval(0.95, df)
    t_dist_001 = t.interval(0.99, df)
    # output (print() keeps this runnable under Python 3)
    print('[Scipy]')
    print('t value:', t_value)
    print('p value:', p_value)
    print('t dist(0.05):', t_dist[1], abs(t_value) > t_dist[1])
    print('t dist(0.01):', t_dist_001[1], abs(t_value) > t_dist_001[1])
    print()
def query_metrics(self):
    # query the skewness metric from each vnf every 2 secs and keep a running
    # average over 5 samples; query the host_cpu metric every 2 secs and keep
    # a running average and confidence interval over 10 samples
    while not self.stop_event.is_set():
        # query host cpu
        try:
            ret = query_Prometheus(self.host_cpu_query)
            value = float(ret[1]) / self.num_cores
            self.host_cpu_values.append(value)
        except Exception as e:
            LOG.info('Prometheus query failed: {0} \nquery: {1}'.format(
                e, self.host_cpu_query))

        # query skewness cpu
        try:
            for vnf_name, query in self.skew_query_dict.items():
                ret = query_Prometheus(query)
                value = float(ret[1])
                self.skew_value_dict[vnf_name].append(value)
        except Exception as e:
            LOG.info('Prometheus query failed: {0} \nquery: {1}'.format(e, query))

        # check overload
        N = len(self.host_cpu_values)
        if N < 5:
            time.sleep(2)
            continue

        mu = np.mean(self.host_cpu_values)
        sigma = np.std(self.host_cpu_values)
        # 95% confidence interval of the mean host cpu load
        R = t.interval(0.95, N - 1, loc=mu, scale=sigma / np.sqrt(N))
        host_cpu_load = float(R[1])
        if host_cpu_load > 95:
            LOG.info("host cpu overload CI: {0}".format(R))

        skew_list = []
        for vnf_name, values in self.skew_value_dict.items():
            skew_avg = np.mean(values)
            skew_list.append(skew_avg)
            # LOG.info("{0} skewness avg: {1}".format(vnf_name, np.mean(values)))
            if skew_avg < 0:
                LOG.info("{0} skewness overload: {1}".format(vnf_name, skew_avg))

        negative_skews = [s for s in skew_list if s < 0]
        if (host_cpu_load > 95) or (len(negative_skews) > 0):
            self.overload_flag.set()
        else:
            self.overload_flag.clear()

        time.sleep(2)
def clean_tri_replicates(points, std_of_tools):
    """
    Deletes an element inside the triplicate if one of the points is strongly
    outlying compared to the other two.

    :param points: triplicate measurements
    :param std_of_tools: background standard deviation of the measurement tools
    :return: points, with the outlier (if any) replaced by np.nan
    """
    if all(np.isnan(points)):
        # early termination if all points are nan
        return points
    arr_of_interest = pdist(points[:, np.newaxis])
    _min, _max = (np.min(arr_of_interest), np.max(arr_of_interest))
    containment = t.interval(0.95, 1, scale=_min / 2)[1]
    if _max > containment:
        outlier = 2 - np.argmin(arr_of_interest)
        msk = np.array([True, True, True])
        msk[outlier] = False
        _mean, _std = (np.mean(points[msk]), np.std(points[msk]))
        containment_2 = t.interval(0.95, 1, loc=_mean,
                                   scale=np.sqrt(_std ** 2 + std_of_tools ** 2))
        if points[outlier] > containment_2[1] or points[outlier] < containment_2[0]:
            points[outlier] = np.nan
    return points
def addValue(self, value):
    self.last_value = value
    self.list_values.append(value)
    # update running average
    self.sum += value
    self.len += 1
    self.average = self.sum / self.len
    # update CI
    if self.len > 5:
        mu = self.average
        sigma = np.std(self.list_values)
        N = self.len
        if sigma > 0:
            R = t.interval(0.95, N - 1, loc=mu, scale=sigma / np.sqrt(N))
            self.CI = R
def main(N, md, sd, p):
    from scipy.stats import t
    from math import sqrt

    a = 1 - p
    # two-sided confidence limits of the t distribution
    ta = t.interval(p, N - 1)
    # print(ta)
    c = (ta[1] * sd) / sqrt(N)
    # print("c: {}".format(c))
    # print(md - c, md + c)
    return (md - c, md, md + c)
    # as long as md - c > 0, the interval excludes zero and we reject the null hypothesis
def confidence_interval(data, confidence=0.95):
    """Estimate the confidence interval using the t-distribution with n-1
    degrees of freedom, t(n-1). This is the way to go when the sample size is
    small (n < 30) and the standard deviation cannot be estimated accurately.
    For large datasets, the t-distribution approaches the normal distribution.

    Parameters
    ----------
    data : array-like
        the dataset
    confidence : float between 0 and 1, optional
        the confidence level, default = 0.95

    Assumptions
    -----------
    the data follows a normal distribution (when sample size is large)

    call_function(s)
    ----------------
    Scipy's t.interval

    Returns
    -------
    None
    """
    degrees_freedom = len(data) - 1
    sample_mean = np.mean(data)
    sd_err = sem(data)  # Standard error of the mean SD / sqrt(n)
    low, high = t.interval(confidence, degrees_freedom, sample_mean, sd_err)
    err = high - sample_mean
    print(' ')
    print('Confidence set at {} %'.format(confidence * 100))
    print('Mean = {mean} ± {err}'.format(mean=round(sample_mean, 2),
                                         err=round(err, 2)))
    print('Max / min = {max} / {min}'.format(max=round(high, 2),
                                             min=round(low, 2)))
    print('Coefficient of variation = {} %'.format(
        round(100 * err / sample_mean, 1)))
    return None
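# Standalone check (illustrative data, not from the function above) that the
# t.interval(confidence, df, loc, scale) call used here is equivalent to the
# textbook form mean ± t.ppf((1 + confidence) / 2, df) * SEM.
import numpy as np
from scipy.stats import sem, t

data = np.array([4.1, 3.9, 4.4, 4.0, 4.2, 3.8])
dof = len(data) - 1
mean, se = np.mean(data), sem(data)

low, high = t.interval(0.95, dof, mean, se)
crit = t.ppf(0.975, dof)
assert np.isclose(low, mean - crit * se)
assert np.isclose(high, mean + crit * se)
print('Mean = {:.2f} ± {:.2f}'.format(mean, high - mean))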
def fit(self):
    """
    Build a confidence interval for each of the bands or indexes to use
    them as a classification threshold.
    """
    response = {}
    columns = list(self.pixels_df.columns)
    for column in columns:
        data_column = self.pixels_df[column]
        degrees_freedom = data_column.size - 1
        mean = np.mean(data_column)
        standard_error = sem(data_column)
        confidence_interval = t.interval(
            self.confidence_lvl, degrees_freedom, mean, standard_error
        )
        response[column] = confidence_interval
    return response
def plot_objectivefunction(results, evaluation, limit=None, sort=True,
                           fig_name='objective_function.png'):
    """Example Plot as seen in the SPOTPY Documentation"""
    import matplotlib.pyplot as plt
    likes = calc_like(results, evaluation, spotpy.objectivefunctions.rmse)
    data = likes
    # Calc confidence interval
    mean = np.average(data)
    # evaluate sample variance by setting delta degrees of freedom (ddof) to
    # 1. The degree used in calculations is N - ddof
    stddev = np.std(data, ddof=1)
    from scipy.stats import t
    # Get the endpoints of the range that contains 99.9% of the distribution
    t_bounds = t.interval(0.999, len(data) - 1)
    # sum mean to the confidence interval
    ci = [mean + critval * stddev / np.sqrt(len(data)) for critval in t_bounds]
    value = "Mean: %f" % mean
    print(value)
    value = "Confidence Interval 99.9%%: %f, %f" % (ci[0], ci[1])
    print(value)
    threshold = ci[1]
    happend = None
    bestlike = [data[0]]
    for like in data:
        if like < bestlike[-1]:
            bestlike.append(like)
            if bestlike[-1] < threshold and not happend:
                thresholdpos = len(bestlike)
                happend = True
        else:
            bestlike.append(bestlike[-1])
    if limit:
        plt.plot(bestlike, 'k-')  # [0:limit])
        plt.axvline(x=thresholdpos, color='r')
        plt.plot(likes, 'b-')
        # plt.ylim(ymin=-1, ymax=1.39)
    else:
        plt.plot(bestlike)
    plt.savefig(fig_name)
def compute_prediction_interval(self, X, y=None, level=.95):
    from scipy.stats import t
    ypred = self.predict(X, y)
    self.prediction_se_ = self.compute_prediction_se(X, y)
    self.prediction_se_ = self.process_predictions(
        self.prediction_se_,
        Vx=X[self.vx_colname][self.prediction_se_mask_],
        inverse_transform_y=False)
    n, p = self.train_features_.shape
    lower_z, upper_z = t.interval(level, n - p)
    pred_interval = {
        'ypred': ypred,
        'lower': ypred + (lower_z * self.prediction_se_),
        'upper': ypred + (upper_z * self.prediction_se_)
    }
    return pred_interval
def conf_interval(arr, confidence=0.95):
    N = arr.size
    # critical value: Student's t for small samples, normal otherwise
    if N <= 30:
        z = t.interval(confidence, N - 1)[1]
    else:
        z = norm.interval(confidence)[1]
    s = arr.std()
    x_bar = arr.mean()
    return (x_bar - z * (s / np.sqrt(N)), x_bar + z * (s / np.sqrt(N)))
def conf_interval(data, confidence=0.95):
    """Estimate the confidence interval using the t-distribution with n-1
    degrees of freedom, t(n-1). This is the way to go when the sample size is
    small (n < 30) and the standard deviation cannot be estimated accurately.
    For large datasets, the t-distribution approaches the normal distribution.

    Parameters
    ----------
    data : array-like
        the dataset
    confidence : float between 0 and 1, optional
        the confidence level, default = 0.95

    Assumptions
    -----------
    the data follows a normal or symmetric distribution (when sample size is large)

    call_function(s)
    ----------------
    Scipy's t.interval

    Returns
    -------
    the arithmetic mean, the error, and the limits of the confidence interval
    """
    dof = len(data) - 1
    amean = np.mean(data)
    std_err = sem(data)  # Standard error of the mean SD / sqrt(n)
    low, high = t.interval(confidence, dof, amean, std_err)
    err = high - amean
    print(' ')
    print(f'Mean = {amean:0.2f} ± {err:0.2f}')
    print(f'Confidence set at {confidence * 100} %')
    print(f'Max / min = {high:0.2f} / {low:0.2f}')
    print(f'Coefficient of variation = ±{100 * err / amean:0.1f} %')
    return amean, err, (low, high)
def summary_exectime(self):
    if not scipy_loaded:
        return
    exectimes = []
    with open(self.exectime_path) as f:
        for line in f:
            exectimes.append(float(line))
    # calculate 0.95 confidence interval, assuming Student's t distribution
    exectimes_mean = average(exectimes)
    standard_deviation = std(exectimes, ddof=1)
    t_bounds = t.interval(0.95, len(exectimes) - 1)
    ci = [
        exectimes_mean + crit_val * standard_deviation / math.sqrt(len(exectimes))
        for crit_val in t_bounds
    ]
    self.log("Mean exec time: {0:.2f}".format(exectimes_mean))
    self.log(
        "0.95 confidence interval, assuming Student's t distribution: {0:.2f}, {1:.2f}\n"
        .format(ci[0], ci[1]))
def compute_confint(self):
    # compute confint from ob_data
    total = 0.0
    sdtotal = 0.0
    n = 0.0
    # compute eprice and evol, weighted based on time
    for v in self._ob_data:
        weight = self._TIME_DECAY_FACTOR ** (time.time() - v[0])
        n += weight
        total += v[1] * weight
        sdtotal += ((v[1] - self._mu_sum / self._obs) ** 2) * weight
    self._eprice = total / n
    self._evol = sdtotal / n
    # if evol is zero, no activity is occurring; discourage the bot from
    # trading due to lack of liquidity
    if self._evol < 1:
        self._evol = (self._MAX_MKT_PRICE - 1) ** 2
    # CI based on Student's t distribution
    return t.interval(1 - self._CONFIDENCE, int(round(n)), self._eprice,
                      self._evol ** 0.5)
def plot_objectivefunction(results, evaluation, limit=None, sort=True):
    """Example Plot as seen in the SPOTPY Documentation"""
    import matplotlib.pyplot as plt
    from matplotlib import colors
    cnames = list(colors.cnames)
    likes = calc_like(results, evaluation)
    data = likes
    # Calc confidence interval
    mean = np.average(data)
    # evaluate sample variance by setting delta degrees of freedom (ddof) to
    # 1. The degree used in calculations is N - ddof
    stddev = np.std(data, ddof=1)
    from scipy.stats import t
    # Get the endpoints of the range that contains 99.9% of the distribution
    t_bounds = t.interval(0.999, len(data) - 1)
    # sum mean to the confidence interval
    ci = [mean + critval * stddev / np.sqrt(len(data)) for critval in t_bounds]
    value = "Mean: %f" % mean
    print(value)
    value = "Confidence Interval 99.9%%: %f, %f" % (ci[0], ci[1])
    print(value)
    threshold = ci[1]
    happend = None
    bestlike = [data[0]]
    for like in data:
        if like < bestlike[-1]:
            bestlike.append(like)
            if bestlike[-1] < threshold and not happend:
                thresholdpos = len(bestlike)
                happend = True
        else:
            bestlike.append(bestlike[-1])
    if limit:
        plt.plot(bestlike, 'k-')  # [0:limit])
        plt.axvline(x=thresholdpos, color='r')
        plt.plot(likes, 'b-')
        # plt.ylim(ymin=-1, ymax=1.39)
    else:
        plt.plot(bestlike)
def uncertainty_q_random(discharges, prop):
    """Compute 95% random uncertainty for property of discharge.
    Uses simplified method for 2 transects.

    Parameters
    ----------
    discharges: list
        List of Discharge objects
    prop: str
        Attribute of Discharge objects

    Returns
    -------
    cov: float
        Coefficient of variation
    cov_95: float
        Coefficient of variation inflated to 95% value
    """
    n_max = len(discharges)
    if n_max > 0:
        # Create array of specified attribute
        data = Uncertainty.get_array_attr(discharges, prop)

        # Compute coefficient of variation
        cov = np.abs(np.nanstd(data, ddof=1) / np.nanmean(data)) * 100

        # Inflate the cov to the 95% value
        if n_max == 2:
            # Use the approximate method as taught in class to reduce the high
            # coverage factor for 2 transects and account for prior knowledge
            # related to 720 second duration analysis
            cov_95 = cov * 3.3
        else:
            # Use Student's t to inflate COV for n > 2
            cov_95 = t.interval(0.95, n_max - 1)[1] * cov / n_max ** 0.5
    else:
        cov = np.nan
        cov_95 = np.nan

    return cov, cov_95
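# Rough standalone illustration (made-up discharge values, no Discharge
# objects) of the inflation used above for n > 2: the coefficient of variation
# is scaled by the two-sided 95% t critical value and divided by sqrt(n).
import numpy as np
from scipy.stats import t

data = np.array([102.0, 98.5, 101.2, 99.7, 100.9])
n = data.size
cov = np.abs(np.nanstd(data, ddof=1) / np.nanmean(data)) * 100
cov_95 = t.interval(0.95, n - 1)[1] * cov / n ** 0.5
print(cov, cov_95)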
def jackknife_ci(ratings, sim, use_unweighted, use_weighted):
    if use_weighted == True:
        mu_weighted_ratings = sum(ratings * sim) / sum(abs(sim))
    if use_unweighted == True:
        mu_ratings = np.mean(ratings)
    mu_jk_samples, n = [], len(ratings)
    index = np.arange(n)
    for i in range(n):
        if use_unweighted == True:
            jk_sample = ratings[index != i]
            mu_jk_sample = np.mean(jk_sample)
            # print(mu_jk_sample)
            mu_jk_samples.append(mu_jk_sample)
        if use_weighted == True:
            jk_sample = ratings[index != i] * sim[index != i]
            mu_jk_sample = sum(jk_sample) / (sum(abs(sim[index != i])))
            # print(sum(jk_sample) / sum(abs(sim[index != i])))
            # print(mu_jk_sample)
            mu_jk_samples.append(mu_jk_sample)
    if use_unweighted == True:
        se_jk = np.sqrt(
            sum(pow((mu_ratings - mu_jk_samples), 2)) * (n - 1) / n)
        # print(se_jk)
    if use_weighted == True:
        se_jk = np.sqrt(
            sum(pow((mu_weighted_ratings - mu_jk_samples), 2)) * (n - 1) / n)
        # print(se_jk)
    if n >= 30:
        multi = 1.96
    else:
        # two-sided 95% t critical value, matching the 1.96 used for n >= 30
        multi = t.interval(0.95, df=n - 1)[1]
    return multi * se_jk
def fitting_data(x, y, fit_method, **bounds):
    func, bounds_, bound_name = fit_method_fetcher(fit_method, **bounds)
    p_0 = [0.5 * (i + j) for i, j in zip(bounds_[0], bounds_[1])]
    x, y, y_stdev, multi_set = convert_x_y(x, y)
    try:
        freedom = max(1, len(x) - len(bound_name))
        lower_CI = []
        upper_CI = []
        if multi_set:
            fit_result, corv_ = curve_fit(func, x, y, p0=p_0, sigma=y_stdev,
                                          bounds=bounds_, absolute_sigma=False)
        else:
            fit_result, corv_ = curve_fit(func, x, y, p0=p_0,
                                          bounds=bounds_, absolute_sigma=False)
        sigma = np.sqrt(np.diagonal(corv_))
        for i, j in zip(sigma, fit_result):
            C_interval = t.interval(0.95, freedom, j, i)
            lower_CI.append(C_interval[0])
            upper_CI.append(C_interval[1])
    except Exception as e:
        print(e)
        fit_result = [1] * len(bound_name)
        lower_CI = fit_result
        upper_CI = fit_result
    return (dict(zip(bound_name, fit_result)),
            dict(zip(bound_name, lower_CI)),
            dict(zip(bound_name, upper_CI)))
def confidential_interval(x, alpha=0.98):
    """
    Return a numpy array of column confidence intervals.

    Args:
        x: a numpy array
        alpha: confidence level of the interval

    Returns:
        A numpy array which indicates the difference from each sample average
        point to its confidence interval point
    """
    from scipy.stats import t
    if x.ndim == 1:
        return None
    # calculate degrees of freedom
    df = len(x[0]) - 1
    # calculate the positive critical value of Student's t distribution
    cv = t.interval(alpha, df)[1]
    # calculate the sample standard deviation (population form, ddof=0)
    std = np.std(x, axis=1)
    # std / sqrt(df) equals the ddof=1 std divided by sqrt(n), i.e. the
    # standard error of the mean, so the result is cv * SEM
    return std * cv / np.sqrt(df)
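# Quick standalone check (synthetic array, not from the snippet above) of the
# shortcut used there: the population std (ddof=0) divided by sqrt(n - 1)
# equals the sample std (ddof=1) divided by sqrt(n), i.e. the standard error
# of the mean.
import numpy as np

x = np.random.default_rng(0).normal(size=(4, 10))
n = x.shape[1]
assert np.allclose(np.std(x, axis=1) / np.sqrt(n - 1),
                   np.std(x, axis=1, ddof=1) / np.sqrt(n))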
def post_stratified_mean_var(self, domain, variable, confidence=0.95):
    # print(">>> In post_stratified_mean_var <<<")
    pv = {}
    pt = {}
    # print(domain, variable)
    self.get_strata_var(domain, variable)
    for h in self.s2:
        # print(">>> {0}, {1} <<<".format(self.weights[h], self.s2[h]))
        pv[h] = self.weights[h] * self.s2[h]
        pv[h] += ((1 - self.weights[h]) * self.s2[h]) / len(
            self.dtfr.Plot.unique())
        pv[h] /= len(self.dtfr.Plot.unique())
        pt[h] = self.stratum_mean(h, domain, variable) * self.areas[h]
    vartot = self.var_total(pv)
    std_err = (vartot / len(self.dtfr.Plot.unique())) ** 0.5
    mean = self.dtfr.loc[self.dtfr.Domain == domain, variable].sum() / float(
        len(self.dtfr.Plot.unique()))
    poptot = self.total(domain, variable)  # sum(self.areas.values()) * mean
    cv = vartot ** 0.5 / poptot * 100
    conf_inter = t.interval(confidence,
                            len(self.dtfr.Plot.unique()) - 1, poptot, std_err)
    out = {
        'Domain mean': mean,
        'Population total': poptot,
        'Coefficient of variation': cv,
        'Strata variances': pv,
        'Strata totals': pt,
        'Variance of the total': vartot,
        'Confidence interval': conf_inter
    }
    return out
chargers = Charger(NBSS)
listTime = []
listChargingRate = []

while time < SIM_TIME:
    (time, event_type, charger) = FES.get()
    if event_type == "arrival":
        arrival(time, FES, waitingLine)
    elif event_type == "batteryAvailable":
        batteryAvailable(time, FES, waitingLine, charger)
    elif event_type == "chargingRate_change":
        chargingRate_change(time, FES)
        listTime.append(time)
        listChargingRate.append(chargingRate)

confidence_int_wait = t.interval(0.999, len(data.waitingTime) - 1,
                                 np.mean(data.waitingTime), sem(data.waitingTime))
confidence_int_charge = t.interval(0.999, len(data.chargingTime) - 1,
                                   np.mean(data.chargingTime), sem(data.chargingTime))

print(f"Confidence interval Waiting Time: {confidence_int_wait}")
print(f"Confidence interval Charging Time: {confidence_int_charge}")
print(f"Number of arrivals: {data.arr}")
print(f"Number of departures: {data.dep}")
print(f"Number of losses: {len(data.loss)}")

plotCDF(data.loss, "", "", "test.pdf")
def ts_dispersion_uplot(self, **kwargs): ''' Plots dispersion timeseries in uplot plot Parameters ---------- channel: string Channel options: dict Options including data processing prior to plot. Defaults in config._plot_def_opt formatting: dict Formatting dict. Defaults in config._ts_plot_def_fmt Returns ------- Matplotlib figure ''' head_template = ''' <link rel="stylesheet" href="https://leeoniya.github.io/uPlot/dist/uPlot.min.css"> <script src="https://leeoniya.github.io/uPlot/dist/uPlot.iife.js"></script> <div style="text-align:center"> <h2 style="font-family: Roboto"> {{title}} </h2> </div> ''' uplot_template = ''' <div id="plot{{subplot}}"></div> <script> data = {{data}}; options = {{options}}; if (typeof options.scatter == 'undefined') { options.scatter = false } if (options.scatter) { for (i=1; i<data.length; i++) { options['series'][i]["paths"] = u => null; } } u = new uPlot(options, data, document.getElementById("plot{{subplot}}")) </script> ''' if 'channel' not in kwargs: std_out('Needs at least one channel to plot') return None else: channel = kwargs['channel'] if 'options' not in kwargs: std_out('Using default options') options = config._plot_def_opt else: options = dict_fmerge(config._plot_def_opt, kwargs['options']) if 'formatting' not in kwargs: std_out('Using default formatting') formatting = config._ts_plot_def_fmt['uplot'] else: formatting = dict_fmerge(config._ts_plot_def_fmt['uplot'], kwargs['formatting']) # Size sanity check if formatting['width'] < 100: std_out('Setting width to 800') formatting['width'] = 800 if formatting['height'] < 100: std_out('Reducing height to 600') formatting['height'] = 600 if 'html' not in options: options['html'] = False if self.dispersion_df is None: std_out('Perform dispersion analysis first!', 'ERROR') return None if self.common_channels == []: self.get_common_channels() if channel not in self.common_channels: std_out(f'Channel {channel} not in common_channels') return None if channel in config._dispersion['ignore_channels']: std_out(f'Channel {channel} ignored per config') return None if len(self.devices) > config._dispersion['nt_threshold']: distribution = 'normal' std_out('Using normal distribution') std_out(f"Using limit for sigma confidence:\ {config._dispersion['limit_confidence_sigma']}") else: distribution = 't-student' std_out(f'Using t-student distribution.') ch_index = self.common_channels.index(channel) + 1 total_number = len(self.common_channels) h = Template(head_template).render( title=f'({ch_index}/{total_number}) - {channel}') dispersion_avg = self._dispersion_summary[channel] if distribution == 'normal': limit_confidence = config._dispersion['limit_confidence_sigma'] # Calculate upper and lower bounds if (config._dispersion['instantatenous_dispersion']): # For sensors with high variability in the measurements, it's better to use this upper_bound = self.dispersion_df[channel + '_AVG']\ + limit_confidence * self.dispersion_df[channel + '_STD'] lower_bound = self.dispersion_df[channel + '_AVG']\ - abs(limit_confidence * self.dispersion_df[channel + '_STD']) else: upper_bound = self.dispersion_df[channel + '_AVG']\ + limit_confidence * dispersion_avg lower_bound = self.dispersion_df[channel + '_AVG']\ - abs(limit_confidence * dispersion_avg) else: limit_confidence = t.interval( config._dispersion['t_confidence_level'] / 100.0, len(self.devices), loc=self.dispersion_df[channel + '_AVG'], scale=dispersion_avg) upper_bound = limit_confidence[1] lower_bound = limit_confidence[0] udf = self.dispersion_df.copy() 
udf['upper_bound'] = upper_bound udf['lower_bound'] = lower_bound udf = udf.fillna('null') # List containing subplots. First list for TBR, second for OK subplots = [[], []] if formatting['join_sbplot']: n_subplots = 1 else: n_subplots = 2 udf.index = udf.index.astype(int) / 10**9 # Compose subplots lists for device in self.devices: ncol = channel + '-' + device if ncol in self.dispersion_df.columns: # Count how many times we go above the upper bound or below the lower one count_problems_up = self.dispersion_df[ncol] > upper_bound count_problems_down = self.dispersion_df[ncol] < lower_bound # Count them count_problems = [1 if (count_problems_up[i] or count_problems_down[i])\ else 0 for i in range(len(count_problems_up))] # Add the trace in either number_errors = np.sum(count_problems) max_number_errors = len(count_problems) # TBR if number_errors / max_number_errors > config._dispersion[ 'limit_errors'] / 100: std_out( f"Device {device} out of {config._dispersion['limit_errors']}% limit\ - {np.round(number_errors/max_number_errors*100, 1)}% out", 'WARNING') subplots[0].append(ncol) #OK else: subplots[n_subplots - 1].append(ncol) # Add upper and low bound bound to subplot 0 subplots[0].append(channel + '_AVG') subplots[0].append('upper_bound') subplots[0].append('lower_bound') if n_subplots > 1: # Add upper and low bound bound to subplot 1 subplots[n_subplots - 1].append(channel + '_AVG') subplots[n_subplots - 1].append('upper_bound') subplots[n_subplots - 1].append('lower_bound') ylabels = [channel + '_TBR', channel + '_OK'] else: ylabels = [channel] # Make subplots for isbplt in range(n_subplots): sdf = udf.loc[:, subplots[isbplt]] sdf = sdf.reset_index() data = sdf.values.T.tolist() labels = sdf.columns useries = [{'label': labels[0]}] ylabel = ylabels[isbplt] uaxes = [{ 'label': formatting['xlabel'], 'labelSize': formatting['fontsize'], }, { 'label': ylabel, 'labelSize': formatting['fontsize'] }] color_idx = 0 for label in labels: if label == labels[0]: continue if color_idx + 1 > len(colors): color_idx = 0 # Gray bounds and averages if '_bound' in label or '_AVG' in label: stroke = 'gray' point = {'space': 50, 'size': min([formatting['size'] - 2, 1])} else: stroke = colors[color_idx] point = {'space': 0, 'size': formatting['size']} nser = {'label': label, 'stroke': stroke, 'points': point} useries.append(nser) color_idx += 1 u_options = { 'width': formatting['width'], 'height': formatting['height'], 'legend': { 'isolate': True }, 'cursor': { 'lock': True, 'focus': { 'prox': 16, }, 'sync': { 'key': 'moo', 'setSeries': True, }, 'drag': { 'x': True, 'y': True, 'uni': 50, 'dist': 10, } }, 'scales': { 'x': { 'time': True }, 'y': { 'auto': True }, }, 'series': useries, 'axes': uaxes } h2 = Template(uplot_template).render(data=json.dumps(data), options=json.dumps(u_options), subplot=isbplt) h += h2 h = h.replace('"', "'") h = h.replace("'null'", "null") if options['html']: return h else: iframe = f'''<iframe srcdoc="{h}" src="" frameborder="0" width={formatting['width'] + formatting['padding-right']} height={formatting['height'] + formatting['padding-bottom']} sandbox="allow-scripts"> </iframe>''' return HTML(iframe)
def bias(df, dropna=True, alpha=0.05, flatten=True):
    """
    Calculates temporal mean biases and their confidence intervals based on
    Student's t-distribution, both with and without auto-correlation corrected
    sample size.

    Parameters
    ----------
    df : pd.DataFrame
        Data Frame whose k columns will be correlated
    dropna : boolean
        If false, temporal matching (dropna-based) will be done for each
        column-combination individually
    alpha : float [0,1]
        Significance level for the confidence intervals
    flatten : boolean
        If set, results are returned as pd.Series in case df only holds 2 columns

    Returns
    -------
    res : xr.DataArray (k x k x 7)
        Data Array holding the following statistics for each data set combination of df:
        bias : Temporal mean bias
        n, n_corr : original and auto-correlation corrected sample size
        CI_l, CI_l_corr, CI_u, CI_u_corr : lower and upper confidence limits
            with and without sample size correction
    res : pd.Series (if flatten is True and df contains only two columns)
        Series holding the above described statistics for the two input data sets.
    """
    if not isinstance(df, pd.DataFrame):
        print('Error: Input is no pd.DataFrame.')
        return None

    if dropna is True:
        df.dropna(inplace=True)
    df.sort_index(inplace=True)

    cols = df.columns.values
    stats = ['bias', 'n', 'CI_l', 'CI_u', 'n_corr', 'CI_l_corr', 'CI_u_corr']
    dummy = np.full((len(cols), len(cols), len(stats)), np.nan)

    res = xr.DataArray(dummy,
                       dims=['ds1', 'ds2', 'stats'],
                       coords={'ds1': cols, 'ds2': cols, 'stats': stats})

    for ds1 in cols:
        for ds2 in cols:
            if ds1 == ds2:
                continue

            # get sample size
            tmpdf = df[[ds1, ds2]].dropna()
            n = len(tmpdf)
            res.loc[ds1, ds2, 'n'] = n
            res.loc[ds2, ds1, 'n'] = n
            if n < 5:
                continue

            # Calculate bias & ubRMSD
            diff = tmpdf[ds1].values - tmpdf[ds2].values
            bias = diff.mean()
            ubRMSD = diff.std(ddof=1)

            # Confidence intervals with the original sample size
            t_l, t_u = t.interval(1 - alpha, n - 1)
            CI_l = bias + t_l * ubRMSD / np.sqrt(n)
            CI_u = bias + t_u * ubRMSD / np.sqrt(n)

            res.loc[ds1, ds2, 'bias'] = bias
            res.loc[ds1, ds2, 'CI_l'] = CI_l
            res.loc[ds1, ds2, 'CI_u'] = CI_u

            n_corr = correct_n(n, tmpdf)
            res.loc[ds1, ds2, 'n_corr'] = n_corr
            res.loc[ds2, ds1, 'n_corr'] = n_corr
            if n_corr < 5:
                continue

            # Confidence intervals with corrected sample size
            t_l, t_u = t.interval(1 - alpha, n_corr - 1)
            CI_l = bias + t_l * ubRMSD / np.sqrt(n_corr)
            CI_u = bias + t_u * ubRMSD / np.sqrt(n_corr)

            res.loc[ds1, ds2, 'CI_l_corr'] = CI_l
            res.loc[ds1, ds2, 'CI_u_corr'] = CI_u

    if flatten is True:
        if len(cols) == 2:
            res = pd.Series(res.loc[cols[0], cols[1], :],
                            index=stats, dtype='float32')

    return res
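# Minimal standalone sketch (fabricated series, no autocorrelation correction)
# of the uncorrected interval construction used in bias():
# CI = bias + t_{alpha/2, n-1} * ubRMSD / sqrt(n).
import numpy as np
from scipy.stats import t

rng = np.random.default_rng(1)
ds1 = rng.normal(10.0, 1.0, 50)
ds2 = ds1 + rng.normal(0.3, 0.5, 50)   # ds2 carries a bias of roughly 0.3

diff = ds1 - ds2
n = len(diff)
bias_est = diff.mean()
ubRMSD = diff.std(ddof=1)
t_l, t_u = t.interval(1 - 0.05, n - 1)
print(bias_est,
      bias_est + t_l * ubRMSD / np.sqrt(n),
      bias_est + t_u * ubRMSD / np.sqrt(n))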
def summary(self, regpyhdfe, yname=None, xname=None, title=None, alpha=.05): """ Summarize the Regression Results. Parameters ---------- yname : str, optional Name of endogenous (response) variable. The Default is `y`. xname : list[str], optional Names for the exogenous variables. Default is `var_##` for ## in the number of regressors. Must match the number of parameters in the model. title : str, optional Title for the top table. If not None, then this replaces the default title. alpha : float The significance level for the confidence intervals. Returns ------- Summary Instance holding the summary tables and text, which can be printed or converted to various output formats. See Also -------- statsmodels.iolib.summary.Summary : A class that holds summary results. """ ########################################################################################################## ########################################################################################################## # https://apithymaxim.wordpress.com/2020/03/16/clustering-standard-errors-by-hand-using-python/ # http://cameron.econ.ucdavis.edu/research/Cameron_Miller_JHR_2015_February.pdf #N,k,Nclusts = len(df.index),3,50 # Number of observations, right hand side columns counting constant, number of clusters #X = np.hstack( (np.random.random((N,k-1)), np.ones((N,1)) ) ) #X = get_np_columns(df, ['wks_ue', 'tenure'], intercept=True) X = regpyhdfe.data[:, 1:] #y = get_np_columns(df, ['ttl_exp']) y = np.expand_dims(regpyhdfe.data[:, 0], 1) # Calculate (X'X)^-1 and the vector of coefficients, beta XX_inv = np.linalg.inv(X.T.dot(X)) beta = (XX_inv).dot(X.T.dot(y)) resid = y - X.dot(beta) #ID = np.random.choice([x for x in range(Nclusts)],N) # Vector of cluster IDs #ID = np.squeeze(get_np_columns(df, ['delete_me'])) ID = np.squeeze(regpyhdfe.groups_np) c_list = np.unique(ID) # Get unique list of clusters N, k, Nclusts = X.shape[0], X.shape[1], int(c_list.shape[0]) sum_XuuTX = 0 for c in range(0, Nclusts): in_cluster = (ID == c_list[c]) # Indicator for given cluster value resid_c = resid[in_cluster] uuT = resid_c.dot(resid_c.T) Xc = X[in_cluster] XuuTX = Xc.T.dot(uuT).dot(Xc) sum_XuuTX += XuuTX adj = (Nclusts / (Nclusts - 1)) * ( (N - 1) / (N - k) ) # Degrees of freedom correction from https://www.stata.com/manuals13/u20.pdf p. 
54 # TODO: actually check if the fixed effects are nested df_a_nested = 1 adj = ((N - 1) / (N - df_a_nested - k)) * (Nclusts / (Nclusts - 1)) V_beta = adj * (XX_inv.dot(sum_XuuTX).dot(XX_inv)) se_beta = np.sqrt(np.diag(V_beta)) # Output data for Stata for_stata = pd.DataFrame(X) for_stata.columns = ["X" + str(i) for i in range(k)] for_stata['ID'] = ID for_stata['y'] = y ##for_stata.to_stata("resid_test.dta") print('B', beta, '\n SE: \n', se_beta) beta = np.squeeze(beta) t_values = beta / se_beta print('T values', t_values) from scipy.stats import t p_values = 2 * t.cdf(-np.abs(t_values), regpyhdfe.model.df_resid) # confidence interval size t_interval = np.asarray( t.interval(alpha=(1 - alpha), df=regpyhdfe.model.df_resid)) print("t_interval", t_interval) intervals = np.empty(shape=(beta.shape[0], 2)) # for each variables for i in range(0, intervals.shape[0]): intervals[i] = t_interval * se_beta[i] + beta[i] print('intervals', intervals) tmp1 = np.linalg.solve(V_beta, np.mat(beta).T) tmp2 = np.dot(np.mat(beta), tmp1) fvalue = tmp2[0, 0] / k import pdb pdb.set_trace() print('fvalue', fvalue) # from statsmodels.stats.stattools import ( # jarque_bera, omni_normtest, durbin_watson) # jb, jbpv, skew, kurtosis = jarque_bera(self.wresid) # omni, omnipv = omni_normtest(self.wresid) # eigvals = self.eigenvals # condno = self.condition_number # TODO: Avoid adding attributes in non-__init__ # self.diagn = dict(jb=jb, jbpv=jbpv, skew=skew, kurtosis=kurtosis, # omni=omni, omnipv=omnipv, condno=condno, # mineigval=eigvals[-1]) # TODO not used yet # diagn_left_header = ['Models stats'] # diagn_right_header = ['Residual stats'] # TODO: requiring list/iterable is a bit annoying # need more control over formatting # TODO: default do not work if it's not identically spelled top_left = [ ('Dep. Variable:', None), ('Model:', None), ('Method:', ['Least Squares']), ('Date:', None), ('Time:', None), ('No. Observations:', None), ('Df Residuals:', None), ('Df Model:', None), ] if hasattr(self, 'cov_type'): top_left.append(('Covariance Type:', [self.cov_type])) rsquared_type = '' if self.k_constant else ' (uncentered)' top_right = [ ('R-squared' + rsquared_type + ':', ["%#8.3f" % self.rsquared]), ('Adj. 
R-squared' + rsquared_type + ':', ["%#8.3f" % self.rsquared_adj]), ('F-statistic:', ["%#8.4g" % self.fvalue]), ('Prob (F-statistic):', ["%#6.3g" % self.f_pvalue]), ] # diagn_left = [('Omnibus:', ["%#6.3f" % omni]), # ('Prob(Omnibus):', ["%#6.3f" % omnipv]), # ('Skew:', ["%#6.3f" % skew]), # ('Kurtosis:', ["%#6.3f" % kurtosis]) # ] # # diagn_right = [('Durbin-Watson:', # ["%#8.3f" % durbin_watson(self.wresid)] # ), # ('Jarque-Bera (JB):', ["%#8.3f" % jb]), # ('Prob(JB):', ["%#8.3g" % jbpv]), # ] if title is None: title = self.model.__class__.__name__ + ' ' + "Regression Results" # create summary table instance from statsmodels.iolib.summary import Summary smry = Summary() smry.add_table_2cols(self, gleft=top_left, gright=top_right, yname=yname, xname=xname, title=title) smry.add_table_params(self, yname=yname, xname=xname, alpha=alpha, use_t=self.use_t) # smry.add_table_2cols(self, gleft=diagn_left, gright=diagn_right, # yname=yname, xname=xname, # title="") # add warnings/notes, added to text format only etext = [] if not self.k_constant: etext.append("R² is computed without centering (uncentered) since the " "model does not contain a constant.") if hasattr(self, 'cov_type'): etext.append(self.cov_kwds['description']) if self.model.exog.shape[0] < self.model.exog.shape[1]: wstr = "The input rank is higher than the number of observations." etext.append(wstr) # if eigvals[-1] < 1e-10: # wstr = "The smallest eigenvalue is %6.3g. This might indicate " # wstr += "that there are\n" # wstr += "strong multicollinearity problems or that the design " # wstr += "matrix is singular." # wstr = wstr % eigvals[-1] # etext.append(wstr) # elif condno > 1000: # TODO: what is recommended? # wstr = "The condition number is large, %6.3g. This might " # wstr += "indicate that there are\n" # wstr += "strong multicollinearity or other numerical " # wstr += "problems." # wstr = wstr % condno # etext.append(wstr) if etext: etext = [ "[{0}] {1}".format(i + 1, text) for i, text in enumerate(etext) ] etext.insert(0, "Notes:") smry.add_extra_txt(etext) return smry
xaxes = np.linspace(np.min(VA) * 0.9995, np.max(VA) * 1.001, 1000)
pdf = norm.pdf(xaxes, loc=VA_quer, scale=s_VA)
ax.plot(xaxes, pdf, 'r', label='Probability density of the population')
ax.set_xlabel(r'Heat in $\frac{\mathrm{cal}}{\mathrm{g}}$')
ax.set_ylabel(r'Probability')
ax.legend()

"d) Hypothesis test for identical means mu_VA and mu_VB"
# Sample analysis
VB_quer = np.mean(VB)
s_VB = np.std(VB, ddof=1)
N_VB = VB.size
# Pooled standard deviation of both samples
s_gesamt = np.sqrt(((N_VA - 1) * s_VA ** 2 + (N_VB - 1) * s_VB ** 2)
                   / (N_VA + N_VB - 2))

# Interval bounds of the t-distributed variable with N_VA + N_VB - 2 degrees of freedom
C = t.interval(gamma95, N_VA + N_VB - 2)

# Compute the acceptance region
Annnahme_delta_x_quer = np.array([
    C[0] * np.sqrt(1 / N_VA + 1 / N_VB) * s_gesamt,
    C[1] * np.sqrt(1 / N_VA + 1 / N_VB) * s_gesamt
])

# Compare with the sample
if (VA_quer - VB_quer) < Annnahme_delta_x_quer[0] or (
        VA_quer - VB_quer) >= Annnahme_delta_x_quer[1]:
    print("Hypothesis rejected")
else:
    print("Hypothesis accepted")
print("{:.4f} < {:.4f} <= {:.4f}".format(Annnahme_delta_x_quer[0],
                                         (VA_quer - VB_quer),
                                         Annnahme_delta_x_quer[1]))
skl_linmod = linear_model.LinearRegression()
skl_linmod.fit(X_scaled, Y)
coeff = skl_linmod.coef_
intercept = skl_linmod.intercept_
teta_scaled = [intercept, coeff[0], coeff[1], coeff[2], coeff[3], coeff[4]]
print(teta_scaled)

residual = Y - skl_linmod.predict(X_scaled)
norm_residual = LA.norm(residual)
# unbiased estimate of the residual variance
var_residual_estimated = (norm_residual ** 2) / (n - LA.matrix_rank(X_scaled))
print(var_residual_estimated)

# two-sided Student's t critical value at the 99% level
interval_student = t.interval(0.99, n - p - 1, loc=0, scale=1)
quantile = interval_student[1]

txx = np.dot(np.transpose(X_scaled), X_scaled)
txx_inv = LA.inv(txx)

# 99% confidence interval for each coefficient
intervalle = []
for i in range(5):
    intervalle.append(
        [
            coeff[i] - quantile * np.sqrt(var_residual_estimated * txx_inv[i][i]),
            coeff[i] + quantile * np.sqrt(var_residual_estimated * txx_inv[i][i]),
        ]
    )
print(intervalle)
print("95%信頼区間:", confint) p = sm.tsa.adfuller(arma_res.resid, regression='nc')[1] #[1]はp値の検定結果 p1 = sm.tsa.adfuller(arma_res.resid, regression='c')[1] #[1]はp値の検定結果 print("ドリフト無しランダムウォーク p値:", p) print("ドリフト付きランダムウォーク p値:", p1) from scipy.stats import t resid = arma_res.resid.iloc[1:] m = resid.mean() v = resid.std() resid_max = pd.Series.rolling(arma_res.resid, window=250).mean().max() resid_min = pd.Series.rolling(arma_res.resid, window=250).mean().min() print("平均: %2.5f" % m, "標準偏差: %2.4f" % v) print("250日平均の最大値: %2.5f" % resid_max, "250日平均の最小値: %2.5f" % resid_min) print("250日平均の95%の信頼区間: ", (t.interval(alpha=0.95, df=250, loc=0, scale=v))) from scipy.stats import chi2 resid = arma_res.resid.iloc[1:] m = resid.mean() v = resid.std() resid_max = pd.Series.rolling(arma_res.resid, window=250).std().max() resid_min = pd.Series.rolling(arma_res.resid, window=250).std().min() print("平均: %2.5f" % m, " 標準偏差: %2.5f" % v) print("250日標準偏差の最大値:%2.5f" % resid_max, "250日標準偏差の最小値:%2.5f" % resid_min) cint1, cint2 = chi2.interval(alpha=(0.95), df=249) bcs = [ "1949/5/16", "1954/12/1", "1972/1/1", "1986/12/1", "1986/12/1", "1993/11/1", "1999/2/1", "2002/2/1", "2009/4/1"
from scipy.stats import t
from numpy import average, std
from math import sqrt

if __name__ == '__main__':
    # data we want to evaluate: average height of 30 one year old male and
    # female toddlers. Interestingly, at this age height is not bimodal yet
    data = [63.5, 81.3, 88.9, 63.5, 76.2, 67.3, 66.0, 64.8, 74.9, 81.3,
            76.2, 72.4, 76.2, 81.3, 71.1, 80.0, 73.7, 74.9, 76.2, 86.4,
            73.7, 81.3, 68.6, 71.1, 83.8, 71.1, 68.6, 81.3, 73.7, 74.9]
    mean = average(data)
    # evaluate sample variance by setting delta degrees of freedom (ddof) to
    # 1. The degree used in calculations is N - ddof
    stddev = std(data, ddof=1)
    # Get the endpoints of the range that contains 95% of the distribution
    t_bounds = t.interval(0.95, len(data) - 1)
    # sum mean to the confidence interval
    ci = [mean + critval * stddev / sqrt(len(data)) for critval in t_bounds]
    print("Mean: %f" % mean)
    print("Confidence Interval 95%%: %f, %f" % (ci[0], ci[1]))

#%%
##%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
            'pre_score': [4, 24, 31, 2, 3],
            'mid_score': [25, 94, 57, 62, 70],
def fire_stats_by_year(pathway_set): """From a list of FireGirlPathway objects, return descriptive statistics of fires, by yearly_logging_totals Arguements pathway_set: A list of FireGirlPathway objects Returns a list with the following elements: -Element 0: A list containing various cells-burned stats ---Element 0: A list by year containing average number of cells burned in this years fires accross pathways ---Element 1: A list by year containing the smallest number of cells burned for any fire in a given year ---Element 2: A list by year containing the largest number of cells burned for any fire in a given year ---Element 3: A list, by year, containing standard deviations of cells burned for each year ---Element 4: A list, by year, containing upper confidence intervals on cells burned ---Element 5: A list, by year, containing lower confidence intervals on cells burned -Element 1: A list containing various timber-lost stats ---Element 0: A list, by year, containing the average timber lost to fire each year ---Element 1: A list, by year, containing the smallest timber lost to a fire in any pathway in a given year ---Element 2: A list, by year, containing the largest timber lost to a fire in any pathway in a given year ---Element 3: A list, by year, containing standard deviations of timber lost for each year ---Element 4: A list, by year, containing upper confidence intervals on timber lost ---Element 5: A list, by year, containing lower confidence intervals on timber lost -Element 2: A list, by year, of the number of pathways which suppressed their fires this year """ #checking for an empty input list if len(pathway_set) < 1: #it's empty, so return an equally empty result string return [[],[],[],[],[],[],[],[],[],[]] #things to compile cells_burned_ave = [] cells_burned_max = [] cells_burned_min = [] cells_burned_std = [] cells_burned_confidence_upper = [] cells_burned_confidence_lower = [] timber_lost_ave = [] timber_lost_max = [] timber_lost_min = [] timber_lost_std = [] timber_lost_confidence_upper = [] timber_lost_confidence_lower = [] suppress_decisions = [] #how many years are there in these pathways? # assuming they all have the same number of events, query the first pathway in the list, and get the lenght # of it's ignition_events list years = len(pathway_set[0].ignition_events) #get a new value for each of the above lists, for each year for y in range(years): this_years_cells_burned = [] this_years_timber_lost = [] this_years_suppress_decisions = 0 #look through each pathway and add their value for year=y to cells_burned, timber_lost, and supp_decisions for pw in pathway_set: #in a pathway's ignition_events list, the ignition records each have an "outcomes" member #an ignition_record.getOutcomes() call returns a list in the following format: # [timber_loss, cells_burned, sup_cost, end_time] outcomes = pw.ignition_events[y].getOutcomes() this_years_timber_lost.append( outcomes[0] ) this_years_cells_burned.append( outcomes[1] ) #likewise, calling an iginiton event object's .getChoice() method will return a True if the simulator # suppressed that fire, and a False if it did not. 
if pw.ignition_events[y].getChoice(): this_years_suppress_decisions += 1 #we've got all the cells_burned, timber_lost, and suppress decisions for each pathway for this year cells_burned_ave.append( mean(this_years_cells_burned) ) cells_burned_max.append( max(this_years_cells_burned) ) cells_burned_min.append( min(this_years_cells_burned) ) cells_burned_std.append( std(this_years_cells_burned) ) timber_lost_ave.append( mean(this_years_timber_lost) ) timber_lost_max.append( max(this_years_timber_lost) ) timber_lost_min.append( min(this_years_timber_lost) ) timber_lost_std.append( std(this_years_timber_lost) ) suppress_decisions.append( this_years_suppress_decisions ) #get the t-stat for a 95% confidence interval for sample of this size #this returns a list with the [lower , upper] stats, which are equal and opposite if centered # around the mean, as ours are. tstat = t.interval(0.95, len(this_years_cells_burned) ) #the upper and lower confidence intervals are calculated as # Upper = Mean + (tstat) * (standard error of the mean) # Lower = Mean - (tstat) * (standard error of the mean) cells_burned_upper_conf = cells_burned_ave[y] + ( tstat[0] * cells_burned_std[y] ) cells_burned_lower_conf = cells_burned_ave[y] - ( tstat[0] * cells_burned_std[y] ) timber_lost_upper_conf = timber_lost_ave[y] + ( tstat[0] * timber_lost_std[y] ) timber_lost_lower_conf = timber_lost_ave[y] - ( tstat[0] * timber_lost_std[y] ) cells_burned_confidence_upper.append( cells_burned_upper_conf ) cells_burned_confidence_lower.append( cells_burned_lower_conf ) timber_lost_confidence_upper.append( timber_lost_upper_conf ) timber_lost_confidence_lower.append( timber_lost_lower_conf ) #All Years are finished, so compile the return lists cells_burned_stats = [cells_burned_ave, cells_burned_min, cells_burned_max, cells_burned_std, timber_lost_confidence_lower, cells_burned_confidence_upper] timber_lost_stats = [timber_lost_ave, timber_lost_min, cells_burned_max, timber_lost_std, timber_lost_confidence_lower, timber_lost_confidence_upper] return [cells_burned_stats, timber_lost_stats, suppress_decisions]
def timber_harvest_stats_by_year(pathway_set):
    """From a list of FireGirlPathway objects, return summary statistics of harvest values by year

    Args
    pathway_set: a list of at least one FireGirlPathway object

    Returns
    A list with six elements:
    -Element 0: A list containing the yearly average harvest values over all pathways in the set.
        The first element of the list will be the average harvest value for the first year of
        EACH pathway, and so on.
    -Element 1: A list containing the minimum harvest value of any pathway during that year.
    -Element 2: A list containing the maximum harvest value of any pathway during that year.
    -Element 3: A list containing the standard deviation of the yearly harvest values
    -Element 4: A list containing the lower bound of the 95% confidence interval
    -Element 5: A list containing the upper bound of the 95% confidence interval
    """

    #checking for an empty input list
    if len(pathway_set) < 1:
        #it's empty, so return an equally empty result list
        return [[], [], [], [], [], []]

    #Get averages and standard errors for each year
    yearly_ave = []
    yearly_stdev = []
    yearly_max = []
    yearly_min = []
    yearly_confidence_upper = []
    yearly_confidence_lower = []

    #how many years are there in these pathways?
    # assuming they all have the same number of events, query the first pathway in the list,
    #  and get the length of its ignition_events list
    years = len(pathway_set[0].ignition_events)

    #for each year, look in each pathway and add its harvest value to a list
    for y in range(years):
        this_years_harvest = []
        for pw in pathway_set:
            #add this pathway's harvest value for this year to the list
            this_years_harvest.append(pw.yearly_logging_totals[y])

        #finished with all pathways at this year, so the list holds all year=y harvest values
        #add a new element to the _ave and _stdev lists and add this year's stat to each
        yearly_ave.append(mean(this_years_harvest))
        yearly_stdev.append(std(this_years_harvest))
        yearly_min.append(min(this_years_harvest))
        yearly_max.append(max(this_years_harvest))

        #get the t-stat for a 95% confidence interval for a sample of this size
        #t.interval returns the [lower, upper] critical values, which are equal and
        # opposite when centered around the mean, as ours are
        n = len(this_years_harvest)
        tstat = t.interval(0.95, n - 1)

        #the upper and lower confidence intervals are calculated as
        # Upper = Mean + (tstat) * (standard error of the mean)
        # Lower = Mean - (tstat) * (standard error of the mean)
        sem_y = yearly_stdev[y] / (n ** 0.5)
        upper_conf = yearly_ave[y] + (tstat[1] * sem_y)
        lower_conf = yearly_ave[y] - (tstat[1] * sem_y)

        #add them to the list
        yearly_confidence_upper.append(upper_conf)
        yearly_confidence_lower.append(lower_conf)

    #finished with ALL years

    #return a list with each list of stats
    return [yearly_ave, yearly_min, yearly_max, yearly_stdev,
            yearly_confidence_lower, yearly_confidence_upper]
def __init__(self, linear_regression, api=None): self.resource_id = None self.input_fields = [] self.term_forms = {} self.tag_clouds = {} self.term_analysis = {} self.items = {} self.item_analysis = {} self.categories = {} self.coefficients = [] self.data_field_types = {} self.field_codings = {} self.bias = None self.xtx_inverse = [] self.mean_squared_error = None self.number_of_parameters = None self.number_of_samples = None self.resource_id, linear_regression = get_resource_dict( \ linear_regression, "linearregression", api=api) if 'object' in linear_regression and \ isinstance(linear_regression['object'], dict): linear_regression = linear_regression['object'] try: self.input_fields = linear_regression.get("input_fields", []) self.dataset_field_types = linear_regression.get( "dataset_field_types", {}) self.weight_field = linear_regression.get("weight_field") objective_field = linear_regression['objective_fields'] if \ linear_regression['objective_fields'] else \ linear_regression['objective_field'] except KeyError: raise ValueError("Failed to find the linear regression expected " "JSON structure. Check your arguments.") if 'linear_regression' in linear_regression and \ isinstance(linear_regression['linear_regression'], dict): status = get_status(linear_regression) if 'code' in status and status['code'] == FINISHED: linear_regression_info = linear_regression[ \ 'linear_regression'] fields = linear_regression_info.get('fields', {}) if not self.input_fields: self.input_fields = [ \ field_id for field_id, _ in sorted(self.fields.items(), key=lambda x: x[1].get("column_number"))] self.coeff_ids = self.input_fields[:] self.coefficients = linear_regression_info.get( \ 'coefficients', []) self.bias = linear_regression_info.get('bias', True) self.field_codings = linear_regression_info.get( \ 'field_codings', {}) self.number_of_parameters = linear_regression_info.get( \ "number_of_parameters") objective_id = extract_objective(objective_field) ModelFields.__init__( self, fields, objective_id=objective_id, terms=True, categories=True, numerics=True) self.field_codings = linear_regression_info.get( \ 'field_codings', {}) self.format_field_codings() for field_id in self.field_codings: if field_id not in fields and \ field_id in self.inverted_fields: self.field_codings.update( \ {self.inverted_fields[field_id]: \ self.field_codings[field_id]}) del self.field_codings[field_id] stats = linear_regression_info["stats"] if stats is not None and stats.get("xtx_inverse") is not None: self.xtx_inverse = stats["xtx_inverse"][:] self.mean_squared_error = stats["mean_squared_error"] self.number_of_samples = stats["number_of_samples"] # to be used in predictions self.t_crit = student_t.interval( \ CONFIDENCE, self.number_of_samples - self.number_of_parameters)[1] self.xtx_inverse = list( \ np.linalg.inv(np.array(self.xtx_inverse))) else: raise Exception("The linear regression isn't finished yet") else: raise Exception("Cannot create the LinearRegression instance." " Could not find the 'linear_regression' key" " in the resource:\n\n%s" % linear_regression)
def __init__(self, linear_regression, api=None): self.resource_id = None self.input_fields = [] self.term_forms = {} self.tag_clouds = {} self.term_analysis = {} self.items = {} self.item_analysis = {} self.categories = {} self.coefficients = [] self.data_field_types = {} self.field_codings = {} self.bias = None self.xtx_inverse = [] self.mean_squared_error = None self.number_of_parameters = None self.number_of_samples = None self.resource_id, linear_regression = get_resource_dict( \ linear_regression, "linearregression", api=api) if 'object' in linear_regression and \ isinstance(linear_regression['object'], dict): linear_regression = linear_regression['object'] try: self.input_fields = linear_regression.get("input_fields", []) self.dataset_field_types = linear_regression.get( "dataset_field_types", {}) self.weight_field = linear_regression.get("weight_field") objective_field = linear_regression['objective_fields'] if \ linear_regression['objective_fields'] else \ linear_regression['objective_field'] except KeyError: raise ValueError("Failed to find the linear regression expected " "JSON structure. Check your arguments.") if 'linear_regression' in linear_regression and \ isinstance(linear_regression['linear_regression'], dict): status = get_status(linear_regression) if 'code' in status and status['code'] == FINISHED: linear_regression_info = linear_regression[ \ 'linear_regression'] fields = linear_regression_info.get('fields', {}) if not self.input_fields: self.input_fields = [ \ field_id for field_id, _ in sorted(fields.items(), key=lambda x: x[1].get("column_number"))] self.coeff_ids = self.input_fields[:] self.coefficients = linear_regression_info.get( \ 'coefficients', []) self.bias = linear_regression_info.get('bias', True) self.field_codings = linear_regression_info.get( \ 'field_codings', {}) self.number_of_parameters = linear_regression_info.get( \ "number_of_parameters") missing_tokens = linear_regression_info.get("missing_tokens") objective_id = extract_objective(objective_field) ModelFields.__init__(self, fields, objective_id=objective_id, terms=True, categories=True, numerics=True, missing_tokens=missing_tokens) self.field_codings = linear_regression_info.get( \ 'field_codings', {}) self.format_field_codings() for field_id in self.field_codings: if field_id not in fields and \ field_id in self.inverted_fields: self.field_codings.update( \ {self.inverted_fields[field_id]: \ self.field_codings[field_id]}) del self.field_codings[field_id] stats = linear_regression_info["stats"] if STATS and stats is not None and \ stats.get("xtx_inverse") is not None: self.xtx_inverse = stats["xtx_inverse"][:] self.mean_squared_error = stats["mean_squared_error"] self.number_of_samples = stats["number_of_samples"] # to be used in predictions self.t_crit = student_t.interval( \ CONFIDENCE, self.number_of_samples - self.number_of_parameters)[1] self.xtx_inverse = list( \ np.linalg.inv(np.array(self.xtx_inverse))) else: raise Exception("The linear regression isn't finished yet") else: raise Exception("Cannot create the LinearRegression instance." " Could not find the 'linear_regression' key" " in the resource:\n\n%s" % linear_regression)
def IC_reg(repartition, dfX, Y, path_rslt, suffix_table): u = pd.DataFrame(data=1, columns=['constante'], index=dfX.index) dfX = pd.concat([u, dfX], axis=1) result = pd.DataFrame(columns=[ 'Y_real', 'Y_pred', 'error', 'cluster', 'min_IC', 'max_IC', 'largeur', '% largeur' ], index=Y.index) result_test = pd.DataFrame(columns=[ 'Y_real', 'Y_pred', 'error', 'cluster', 'min_IC', 'max_IC', 'largeur', '% largeur' ], index=Y.index) for j in repartition.keys(): #try: df_cluster = repartition[j] #intialisation et repartition train et test #recuperation des indexes des revenus renseignés index = df_cluster["revenu"].index[~df_cluster["revenu"].apply(np.isnan )] dfX_train, dfX_test, Y_train, Y_test = train_test_split( dfX.loc[index], Y[index], test_size=0.4, random_state=44) index_test = Y_test.index index_train = Y_train.index #except: # print("Le programme plante au cluster " + str(j)) #condition sur les clusters if len(df_cluster) >= 40 and df_cluster["revenu"].isnull().sum() / len( df_cluster) < 1: df_cluster_index = df_cluster.index result_test.loc[index_test, 'cluster'] = j result.loc[df_cluster_index, 'cluster'] = j #calcul des fonctions de prévision et erreur result.loc[df_cluster.index, 'Y_real'] = df_cluster.loc[df_cluster.index, 'revenu'] result_test.loc[index_test, 'Y_real'] = df_cluster.loc[index_test, 'revenu'].astype(int) result_test.loc[index_test, 'Y_pred'], result.loc[df_cluster_index, 'Y_pred'] = predi_reg( dfX, Y, index_train, index_test, df_cluster_index) result_test.loc[index_test, 'error'] = result_test.loc[ index_test, 'Y_pred'] - result_test.loc[index_test, 'Y_real'] result.loc[df_cluster_index, 'error'] = result.loc[ df_cluster_index, 'Y_pred'] - result.loc[df_cluster_index, 'Y_real'] result['lib_segment'] = suffix_table n = len( df_cluster.loc[df_cluster_index]) #apprentisage ou test !!!! #np.linalg.matrix_rank(dfX.loc[index_train]) df = n - (len(dfX.columns) - 1) - 1 quantile = t.interval(0.85, df)[1] MSE = result_test.loc[ index_test, 'Y_pred'].std() #MSE de result_test ou de result !!!! X = np.dot(dfX.loc[index_test].T, dfX.loc[index_test]) if linalg.det(X) != 0: X = linalg.inv(X) for i in range(0, len(result_test.loc[index_test, 'Y_pred'])): a = np.matrix(dfX.loc[index_test][i:i + 1]) u = (a * X) * (a.T) #h=result.loc[index_test,'Y_pred'][i:i+1].index if (1 + u) > 0: result_test.loc[result_test.loc[index_test, 'Y_pred'][i:i + 1].index, 'min_IC'] = result_test.loc[ index_test, 'Y_pred'][i:i + 1] - ( quantile * MSE * sqrt(1 + u)) result_test.loc[result_test.loc[index_test, 'Y_pred'][i:i + 1].index, 'max_IC'] = result_test.loc[ index_test, 'Y_pred'][i:i + 1] + ( quantile * MSE * sqrt(1 + u)) else: print('cluster ' + str(j) + ' contient valeur negative') result_test.loc[result_test.loc[index_test, 'Y_pred'][i:i + 1].index, 'min_IC'] = 0 result_test.loc[result_test.loc[index_test, 'Y_pred'][i:i + 1].index, 'max_IC'] = 0 result_test.loc[index_test, 'largeur'] = result_test.loc[ index_test, 'max_IC'].astype(int) - result_test.loc[ index_test, 'min_IC'].astype(int) result_test.loc[index_test, '% largeur'] = result_test.loc[ index_test, 'largeur'].astype(int) / result_test.loc[ index_test, 'Y_pred'].astype(int) else: result_test.loc[index_test, 'min_IC'] = 'Inv' result_test.loc[index_test, 'max_IC'] = 'Inv' n = len( df_cluster.loc[df_cluster_index]) #apprentisage ou test !!!! 
#np.linalg.matrix_rank(dfX.loc[df_cluster_index]) df = n - (len(dfX.columns) - 1) - 1 quantile = t.interval(0.85, df)[1] MSE = result.loc[ df_cluster_index, 'Y_pred'].std() #MSE de result_test ou de result !!!! X = np.matrix(dfX.loc[df_cluster_index]).T * np.matrix( dfX.loc[df_cluster_index]) if linalg.det(X) != 0: X = linalg.inv(X) for i in range(0, len(result.loc[df_cluster_index, 'Y_pred'])): a = np.matrix(dfX.loc[df_cluster_index][i:i + 1]) u = (a * X) * (a.T) #h=result.loc[index_test,'Y_pred'][i:i+1].index if (1 + u) > 0: result.loc[result.loc[df_cluster_index, 'Y_pred'][i:i + 1].index, 'min_IC'] = (result.loc[df_cluster_index, 'Y_pred'][i:i + 1] - (quantile * MSE * sqrt(1 + u))).astype(int) result.loc[result.loc[df_cluster_index, 'Y_pred'][i:i + 1].index, 'max_IC'] = (result.loc[df_cluster_index, 'Y_pred'][i:i + 1] + (quantile * MSE * sqrt(1 + u))).astype(int) else: print('cluster ' + str(j) + ' contient valeur negative') result.loc[result.loc[df_cluster_index, 'Y_pred'][i:i + 1].index, 'min_IC'] = 0 result.loc[result.loc[df_cluster_index, 'Y_pred'][i:i + 1].index, 'max_IC'] = 0 result.loc[df_cluster_index, 'largeur'] = result.loc[ df_cluster_index, 'max_IC'].astype(int) - result.loc[df_cluster_index, 'min_IC'].astype(int) result.loc[df_cluster_index, '% largeur'] = result.loc[ df_cluster_index, 'largeur'].astype(int) / result.loc[df_cluster_index, 'Y_pred'].astype(int) else: result.loc[df_cluster_index, 'min_IC'] = 'Inv' result.loc[df_cluster_index, 'max_IC'] = 'Inv' else: print("cluster " + str(j) + " ne remplit pas les conditions") result_test = result_test[pd.notnull(result_test['cluster'])] result = result[pd.notnull(result['cluster'])] dfX.drop(['constante'], axis=1, inplace=True) #path="/mnt/smb/TAMPON/Igor/RFR/data_rslt/" #result_test.to_excel(path_rslt+"result_test" + suffix_table + ".xlsx",encoding="utf-8", index=True) #result.to_excel(path_rslt+"result" + suffix_table + ".xlsx",encoding="utf-8", index=True) result.to_csv(path_rslt + "result" + suffix_table + ".csv", sep=";", encoding="utf-8", index=True) return result, result_test
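# IC_reg reads the one-sided quantile out of the two-sided interval with
# t.interval(0.85, df)[1]; that is the same number as t.ppf((1 + 0.85) / 2, df),
# which can be easier to read. A quick check, with an arbitrary df chosen only
# for illustration:
from scipy.stats import t

df = 37  # arbitrary degrees of freedom, illustration only
upper_from_interval = t.interval(0.85, df)[1]
upper_from_ppf = t.ppf((1 + 0.85) / 2, df)
assert abs(upper_from_interval - upper_from_ppf) < 1e-12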
def excel_table_byname():
    data = open_excel(file)               # open the Excel workbook
    table = data.sheet_by_name(by_name)   # obtain the sheet in the Excel file by name
    book = xlwt.Workbook()                # create the output Excel file
    sheet1 = book.add_sheet('sheet1')
    col0 = table.col_values(0)
    for i in range(0, len(col0)):
        sheet1.write(i, 0, str(col0[i]))
    book.save('ideal_range.xls')
    row0 = ['lmin', 'lmax', 'hmin', 'hmax']
    for j in range(0, len(row0)):
        sheet1.write(0, j + 1, str(row0[j]))
    book.save('ideal_range.xls')

    # read the raw data into a matrix
    set_matrix = []
    for row in range(1, table.nrows):
        _row = []
        for col in range(1, table.ncols - 8):
            _row.append(table.cell_value(row, col + 1))
        set_matrix.append(_row)
    set_matrix_array = np.array(set_matrix)

    # sample mean
    mean = table.col_values(-6)
    mean.pop(0)
    # sample standard deviation (the divisor used is N - ddof); computed in Excel
    stddev = table.col_values(-4)
    stddev.pop(0)
    [h, l] = set_matrix_array.shape

    # due to the small sample size, use the t distribution; CI is the confidence level (95%)
    t_bounds = t.interval(CI, l - 1, mean, stddev)
    t_bounds = np.vstack(t_bounds)
    t_bounds = t_bounds.transpose()
    [a, b] = t_bounds.shape
    for i in range(a):
        for j in range(b):
            if t_bounds[i][j] <= 0:
                t_bounds[i][j] = 1
            if math.isnan(t_bounds[i][j]):
                t_bounds[i][j] = 1
    for m in range(a):
        for n in range(b):
            sheet1.write(m + 1, n + 1, t_bounds[m, n])
    book.save('ideal_range.xls')

    large_bounds = t.interval(0.95, l - 1, mean, stddev)
    large_bounds = np.vstack(large_bounds)
    large_bounds = large_bounds.transpose()
    [c, d] = large_bounds.shape
    for i in range(c):
        for j in range(d):
            if large_bounds[i][j] <= 0:
                large_bounds[i][j] = 1
            if math.isnan(large_bounds[i][j]):
                large_bounds[i][j] = 1
    for m in range(c):
        for n in range(d):
            sheet1.write(m + 1, n + 3, large_bounds[m, n])
    book.save('ideal_range.xls')
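# t.interval broadcasts over array-valued loc and scale, which is what makes the
# per-row bounds above possible without an explicit loop: it returns one array of
# lower and one array of upper bounds, and the vstack/transpose pair turns them
# into an (N, 2) table. A standalone sketch of the same pattern with made-up
# numbers:
import numpy as np
from scipy.stats import t

means = np.array([4.2, 5.1, 3.8])   # per-row sample means (made-up values)
stds = np.array([0.3, 0.5, 0.2])    # per-row standard deviations (made-up values)
dof = 9                             # shared degrees of freedom

lower, upper = t.interval(0.95, dof, loc=means, scale=stds)
bounds = np.vstack((lower, upper)).transpose()   # shape (3, 2): one (low, high) pair per row
print(bounds)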
lsq.fit(X=df[["frequency"]], y=df["power"])
df["lsq-estimated"] = lsq.predict(df[["frequency"]])
print('Least Squares: P={}·n + {}, Rsq={}'.format(
    lsq.coef_, lsq.intercept_,
    lsq.score(X=df[["frequency"]], y=df["power"])))
print(mean_squared_error(df["power"], df["lsq-estimated"]))

# Get confidence intervals
conf_max = []
conf_min = []
frequencydummy = []
for freq in df["frequency"].unique():
    if freq != 1150 / 60 and freq != 1250 / 60:
        serie = df[df["frequency"] == freq]["power"]
        mu = statistics.mean(serie)
        sigma = numpy.std(serie)
        gl = len(serie)
        conf_int = t.interval(0.90, gl, loc=mu, scale=sigma)
        conf_min.append(conf_int[0])
        conf_max.append(conf_int[1])
        frequencydummy.append(freq)

conf = pd.DataFrame({"freq": frequencydummy, "low": conf_min, "high": conf_max})
conf["ts"] = ts.predict(conf[["freq"]])
conf["lsq"] = lsq.predict(conf[["freq"]])
conf.to_csv("powerfreqmodel.csv")

matplotlib.rcParams.update({'font.size': 16})

# Plot the confidence intervals
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=[13, 8], dpi=200)
plt.plot(df["frequency"], df["lsq-estimated"])
def get_1p_bounds(mean, std, dof):
    # two-sided 99% bounds of a t distribution with `dof` degrees of freedom,
    # centred on `mean` and scaled by `std`
    return t.interval(0.99, dof, mean, std)
import numpy as np
from scipy.stats import norm, t, sem, moment
from math import sqrt

# from the SAT Score Question
scores = [560, 610, 500, 470, 660, 640]
p = 0.90

# Elephant Trunk
# scores = [5.62, 6.07, 6.64, 5.91, 6.30, 6.55, 6.19, 5.48]
# p = 0.95

n = len(scores)
mu = np.mean(scores)
var = np.var(scores, ddof=1)  # ddof=1 for the unbiased (Bessel-corrected) estimate

bounds = t.interval(p, n - 1, loc=np.mean(scores), scale=sem(scores))
critical_t = t.ppf((1 + p) / 2, n - 1)
sigma_est = sqrt(var)
std_error = critical_t * sigma_est / sqrt(n)  # margin of error (half-width of the interval)

print('Mean =', mu)
print('Critical T =', critical_t)
print('Unbiased Sample Variance (Bessel corrected) =', var)
print('Standard Deviation (Estimation) =', sigma_est)
print('Standard Error =', std_error)
print('Lower Bound =', bounds[0])
print('Upper Bound =', bounds[1])
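# The same bounds can be reproduced by hand: the quantity printed as the standard
# error above is really the margin of error critical_t * s / sqrt(n), so
# mean +/- that margin matches what t.interval returns. A short check with the
# same SAT scores:
import numpy as np
from math import sqrt
from scipy.stats import t, sem

scores = [560, 610, 500, 470, 660, 640]
p = 0.90
n = len(scores)
mu = np.mean(scores)

lower, upper = t.interval(p, n - 1, loc=mu, scale=sem(scores))
margin = t.ppf((1 + p) / 2, n - 1) * sqrt(np.var(scores, ddof=1)) / sqrt(n)

assert abs(lower - (mu - margin)) < 1e-9
assert abs(upper - (mu + margin)) < 1e-9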
def geometric_mean(p_series, df, cols): # Alternatively we can use scipy.stats.lognorm to fit a distribution # and provide the parameters if (len(p_series) > 3) & (p_series.quantile(0.5) > 0): # result = gmean(p_series.to_numpy()+1)-1 module_logger.debug( f"Calculating confidence interval for" f"{df.loc[p_series.index[0],groupby_cols].values}") module_logger.debug(f"{p_series.values}") with np.errstate(all='raise'): try: data = p_series.to_numpy() except (ArithmeticError, ValueError, FloatingPointError): module_logger.debug("Problem with input data") return None try: log_data = np.log(data) except (ArithmeticError, ValueError, FloatingPointError): module_logger.debug("Problem with log function") return None try: mean = np.mean(log_data) except (ArithmeticError, ValueError, FloatingPointError): module_logger.debug("Problem with mean function") return None l = len(data) try: sd = np.std(log_data) / np.sqrt(l) sd2 = sd**2 except (ArithmeticError, ValueError, FloatingPointError): module_logger.debug("Problem with std function") return None try: pi1, pi2 = t.interval(alpha=0.90, df=l - 2, loc=mean, scale=sd) except (ArithmeticError, ValueError, FloatingPointError): module_logger.debug("Problem with t function") return None try: upper_interval = np.max([ mean + sd2 / 2 + pi2 * np.sqrt(sd2 / l + sd2**2 / (2 * (l - 1))), mean + sd2 / 2 - pi2 * np.sqrt(sd2 / l + sd2**2 / (2 * (l - 1))), ]) except: module_logger.debug("Problem with interval function") return None try: result = (np.exp(mean), 0, np.exp(upper_interval)) except (ArithmeticError, ValueError, FloatingPointError): module_logger.debug("Unable to calculate geometric_mean") return None if result is not None: return result else: module_logger.debug( f"Problem generating uncertainty parameters \n" f"{df.loc[p_series.index[0],groupby_cols].values}\n" f"{p_series.values}" f"{p_series.values+1}") return None else: return None
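# geometric_mean works in log space: it fits a t interval to the log of the data
# and exponentiates a variance-corrected upper bound back to the original scale.
# Stripped of the error handling and the lognormal correction terms, the core
# idea looks roughly like the sketch below; the 90% level mirrors the code above,
# everything else (function name, ddof choice) is a simplifying assumption.
import numpy as np
from scipy.stats import t

def geometric_mean_ci_sketch(data, confidence=0.90):
    """Simplified sketch: CI of the geometric mean via the log-transformed data."""
    log_data = np.log(np.asarray(data, dtype=float))
    n = len(log_data)
    mean = np.mean(log_data)
    se = np.std(log_data, ddof=1) / np.sqrt(n)
    low, high = t.interval(confidence, n - 1, loc=mean, scale=se)
    return np.exp(mean), (np.exp(low), np.exp(high))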
def knn_ci(ratings):
    # input will be Train_data_matrix[neighborset]
    std_dev = np.std(ratings)
    n = len(ratings)
    multi = t.interval(alpha=0.975, df=((n - 2) / 2))[1]
    return multi * std_dev / math.sqrt(n)
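# A caveat on the alpha= keyword used above: recent SciPy releases renamed the
# first parameter of interval() (alpha became confidence), so keyword calls like
# this may warn or fail depending on the installed version. Passing the level
# positionally sidesteps the difference; n below is illustrative, not taken from
# the snippet.
from scipy.stats import t

n = 25  # illustrative neighbourhood size
multi = t.interval(0.975, (n - 2) / 2)[1]   # same value as the alpha= keyword form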
DI_chr = DI_chr.To(id, Do=_.Sum()) #efor DI_chr = DI_chr.Get(_.seqname, *[ x[0] for x in enumerate(DI_chr.Names[1:-1], 1) ]).ReplaceMissing() conditions = [ x for x in enumerate(DI_chr.Names[1:], 1)] condition_pairs = zip(conditions[:len(conditions)/2], conditions[len(conditions)/2:]) stats = []; for ((a_id_rep1, conda_rep1), (b_id_rep1, condb_rep1)), ((a_id_rep2, conda_rep2), (b_id_rep2, condb_rep2)) in zip(condition_pairs[0::2], condition_pairs[1::2]): cond = conda_rep1.split('|')[1] cond_stats = DI_chr.Get(_.seqname, _.Get(a_id_rep1).Cast(float) / _.Get(b_id_rep1).Cast(float), _.Get(a_id_rep2).Cast(float) / _.Get(b_id_rep2).Cast(float) ) / ('seqname', 'r1', 'r2'); cond_stats = cond_stats.Get(_.seqname, _.r1, _.r2, ((_.r1 + _.r2) / 2) / 'mean' ) cond_stats = cond_stats.Get(_.seqname, _.r1, _.r2, _.mean, ((_.r1 - _.mean) * (_.r1 - _.mean) + (_.r2 - _.mean) * (_.r2 - _.mean) / 2) / 'var') cond_stats = cond_stats.Get(_.seqname, _.r1, _.r2, _.mean, _.var.Each(lambda x: np.sqrt(x)).Cast(float) / 'sd', _.var ); cond_stats = cond_stats.Get(_.seqname, _.r1, _.r2, _.mean, _.sd, _.var, (_.sd / np.sqrt(2)) * max(t.interval(0.90, 1)) / 'confidence' ).Copy() stats.append(cond_stats.Get(_.seqname.Each(lambda x: cond).Cast(str) / 'cond', *cond_stats.Names).Copy()); #efor allstats = stats[0]; for s in stats[1:]: allstats = allstats | Stack | s; #efor # Add the data from ALL the chromosomes condlibratios = Read('%s/deseq_condlibratios.tsv' % output_dir).Detect() / ('cond', 'r1', 'r2', 'mean', 'sd', 'var'); condlibratios = condlibratios.Get(_.cond, _.r1, _.r2, _.mean, _.sd, _.var, (_.sd / np.sqrt(2)) * max(t.interval(0.90, 1)) / 'confidence') allstats = condlibratios.Get(_.cond, _.cond.Each(lambda x: 'all').Cast(str) / 'seqname', _.r1, _.r2, _.mean, _.sd, _.var, _.confidence) | Stack | allstats minval, maxval = allstats.Get(_.mean.Min(), _.mean.Max())()
# *******************************************************************************
if __name__ == '__main__':
    random.seed(RANDOM_SEED)

    # For a fixed pair of parameters we will repeat the simulation many times
    mu = 0.05            # 20 s per customer, thus 0.05 customer/s
    lambd = 1.5 * mu

    # ***************************************************************************
    # First case: 1 queue, 3 servers
    # ***************************************************************************
    y1 = simulate(1, 3)
    print(t.interval(0.99, SAMPLES - 1, np.mean(y1), sem(y1)))

    # ***************************************************************************
    # Second case: 3 queues, 1 server
    # ***************************************************************************
    y2 = simulate(3, 1)
    print(t.interval(0.99, SAMPLES - 1, np.mean(y2), sem(y2)))

    f1, ax1 = pyplot.subplots()
    ax1.plot(y1, 'ro', color='green')
    ax1.plot(y2, 'ro', color='red')
    ax1.set_xlabel("Simulation round")
    ax1.set_ylabel("E[T] - response time")
    ax1.grid(b=True, which='major', color='#CCCCCC', linestyle='-')
    pyplot.show()
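# The pattern in both cases is the same: repeat the simulation SAMPLES times,
# then report the mean response time with a t-based confidence interval built
# from the standard error of that mean. A self-contained sketch of the pattern
# with a dummy stand-in for simulate(), which is defined elsewhere in that
# script:
import numpy as np
from scipy.stats import t, sem

SAMPLES = 30
rng = np.random.default_rng(0)

def simulate_stub():
    # stand-in for the real simulate(queues, servers); returns one E[T] estimate
    return rng.exponential(scale=20.0)

y = np.array([simulate_stub() for _ in range(SAMPLES)])
ci_low, ci_high = t.interval(0.99, SAMPLES - 1, loc=np.mean(y), scale=sem(y))
print(f"E[T] = {np.mean(y):.2f} s, 99% CI = ({ci_low:.2f}, {ci_high:.2f})")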
def uncertainty(db, mean_gen, total_gen, total_facility_considered): # Troy Method # Creating copy of database by substitution the NA emissions with zero # db1 = db.fillna(value = 0) # Removing all rows here emissions are not reported for second dataframe # db2 = db.dropna() # frames = [db1,db2] # Here we doubled up the database by combining two databases together # data_1 = pd.concat(frames,axis = 0) data_1 = db df2 = pd.DataFrame([[0, 0]], columns=['Electricity', 'FlowAmount']) for i in range(len(data_1), total_facility_considered): data = data_1.append(df2, ignore_index=True) data_1 = data data = data_1 mean = np.mean(data.iloc[:, 1]) l, b = data.shape sd = np.std(data.iloc[:, 1]) / np.sqrt(l) # mean_gen = np.mean(data.iloc[:,0]) # obtaining the emissions factor from the weight based method ef = compilation(db, total_gen) # Endpoints of the range that contains alpha percent of the distribution pi1, pi2 = t.interval(alpha=0.90, df=l - 2, loc=mean, scale=sd) # Converting prediction interval to emission factors pi2 = pi2 / mean_gen pi1 = pi1 / mean_gen pi3 = (pi2 - ef) / ef x = var('x') if math.isnan(pi3) == True: return None, None elif math.isnan(pi3) == False: # This method will not work with the interval limits are more than 280% of the mean. if pi3 < 2.8: # sd1,sd2 = solve(0.5*x*x -(1.16308*np.sqrt(2))*x + (np.log(1+pi3)),x) a = 0.5 b = -(1.16308 * np.sqrt(2)) c = np.log(1 + pi3) sd1 = (-b + np.sqrt(b**2 - (4 * a * c))) / (2 * a) sd2 = (-b - np.sqrt(b**2 - (4 * a * c))) / (2 * a) else: # This is a wrong mathematical statement. However, we have to use it if something fails. sd1, sd2 = solve( 0.5 * x * x - (1.36 * np.sqrt(2)) * x + (np.log(1 + pi3)), x) # if type(sd1) != float or type(sd2) != float: # return 0,0 # always choose lower standard deviation from solving the square root equation. if sd1 < sd2: log_mean = np.log(ef) - 0.5 * (sd1**2) return round(log_mean, 12), round(sd1, 12) else: log_mean = np.log(ef) - 0.5 * (sd2**2) return round(log_mean, 12), round(sd2, 12)
for i in range(0, tam):
    os.system("./ejecutar 1 3 1 >> datos.txt")
    proceso1 = subprocess.Popen("./" + str(sys.argv[2]) + " 1",
                                stdout=subprocess.PIPE, shell=True)
    (out, err) = proceso1.communicate()
    print(out)
    valor = out.decode("utf-8")
    valor = float(valor.split(":")[0])
    datos_torm[i] = valor
    valor = os.popen('./' + sys.argv[2] + ' 1 3 0 1').read()
    valor = float(valor.split(":")[0])
    datos_vel[i] = valor
    print("Pass", i)"""  # end of a block commented out with a triple-quoted string in the source

datos_vel = np.genfromtxt("datosVel.txt", dtype=float)   # np.float was removed in recent NumPy
datos_torm = np.genfromtxt("datosTor.txt", dtype=float)
datos_vel = datos_vel[:int(sys.argv[1])]
datos_torm = datos_torm[:int(sys.argv[1])]

tam = np.size(datos_vel)
diferencia = datos_vel - datos_torm
media = np.mean(diferencia)
varianza = np.var(diferencia)

valores_student = t.interval(0.95, tam - 1)
# use a loop variable that does not shadow scipy.stats.t
intervalo = [media + t_val * np.sqrt(varianza / tam) for t_val in valores_student]

print("Interval =", intervalo, " mean =", media, " variance =", varianza)
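# Scaling the unit t quantiles by hand, as above, gives the same interval as
# passing the mean and the standard error straight to t.interval via loc and
# scale. A self-contained check with made-up paired differences (not the
# script's data):
import numpy as np
from scipy.stats import t

diferencia = np.array([0.8, -0.3, 1.1, 0.4, -0.2, 0.9])   # illustrative values
tam = diferencia.size
media = np.mean(diferencia)
varianza = np.var(diferencia)

q_low, q_high = t.interval(0.95, tam - 1)
manual = [media + q * np.sqrt(varianza / tam) for q in (q_low, q_high)]
direct = t.interval(0.95, tam - 1, loc=media, scale=np.sqrt(varianza / tam))
assert np.allclose(manual, direct)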
print("ドリフト無しランダムウォーク p値:",p) print("ドリフト付きランダムウォーク p値:",p1) # In[6]: from scipy.stats import t resid=arma_res.resid.iloc[1:] m=resid.mean() v=resid.std() resid_max=pd.Series.rolling(arma_res.resid,window=250).mean().max() resid_min=pd.Series.rolling(arma_res.resid,window=250).mean().min() print("平均: %2.5f"%m,"標準偏差: %2.4f"%v) print("250日平均の最大値: %2.5f"%resid_max,"250日平均の最小値: %2.5f"%resid_min) print("250日平均の95%の信頼区間: ",(t.interval(alpha=0.95, df=250, loc=0, scale=v))) # In[7]: pd.Series.rolling(arma_res.resid.iloc[1:],250).mean().plot(figsize=(6,4),color='hotpink') plt.ylabel('$\hat{z_t}$') # In[8]: from scipy.stats import chi2 resid=arma_res.resid.iloc[1:] m=resid.mean()
def cal_regression_power( t_evap, t_cond, uncer_t_evap, uncer_t_cond, rel_uncer_power, abs_uncer_power, para, full_output=False, dist_output=False, ): """ Estimate the compressor power and its uncertainty based on evaporating and condensing temperature Parameters: =========== t_evap: float Evaporating temperature in F t_cond: float Condensing temperature in F uncer_t_evap: float Uncertainty of evaporating temperature in F uncer_t_cond: float Uncertainty of condensing temperature in F rel_uncer_power: float Relative uncertainty of measured power consumption in % abs_uncer_power: float Absolute uncertainty of measured power consumption in W para: MAP_PARA() object Object containing the coefficients full_output: boolean Whether to output all other uncertainties. Default false. dist_output: boolean Whether to output all components of uncertainty from training data in a numpy array Returns: =========== power: float Estimated power in W uncer: float Uncertainty of the estimation in W uncer_input: float Uncertainty from inputs in W. Only output when full_output=True uncer_output: float Uncertainty from output in W. Only output when full_output=True uncer_train: float Uncertainty from training data in W. Only output when full_output=True uncer_dev: float Uncertainty from deviation in W. Only output when full_output=True uncer_cov: float Uncertainty from covariance in W. Only output when full_output=True uncer_train_dist: float Components of uncertainty from training datain W. Only output when dist_output=True """ # form x vector coeff = para.get_coeff() x = ( np.matrix( [ 1.0, t_evap, t_cond, t_evap ** 2, t_evap * t_cond, t_cond ** 2, t_evap ** 3, t_evap ** 2 * t_cond, t_evap * t_cond ** 2, t_cond ** 3, ] ) ).transpose() dyestdet = ( np.matrix( [0.0, 1.0, 0.0, 2.0 * t_evap, t_cond, 0.0, 3.0 * t_evap ** 2, 2.0 * t_evap * t_cond, t_cond ** 2, 0.0] ) ) * coeff dyestdct = ( np.matrix( [0.0, 0.0, 1.0, 0.0, t_evap, 2.0 * t_cond, 0.0, t_evap ** 2, 2.0 * t_cond * t_evap, 3.0 * t_cond ** 2] ) ) * coeff # estimate power power = (x.transpose() * coeff)[0, 0] # estimate uncer_input uncer_input = sqrt(((dyestdet * uncer_t_evap).sum()) ** 2 + ((dyestdct * uncer_t_cond).sum()) ** 2) # estimate uncer_output uncer_output = sqrt(abs_uncer_power ** 2 + (rel_uncer_power * power) ** 2) # estimate uncer_train train_x_entry = para.get_dBdXdeltaX() * x train_y_entry = para.get_dBdydeltay() * x uncer_train = sqrt( (np.multiply(train_x_entry, train_x_entry)).sum() + (np.multiply(train_y_entry, train_y_entry)).sum() ) if dist_output: uncer_train_comp = np.array( [ np.sqrt(qq) for qq in ( np.multiply(train_x_entry, train_x_entry).tolist() + np.multiply(train_y_entry, train_y_entry).tolist() ) ] ) # estimate uncer_dev m = len(para.get_y()) t_stat = t.interval(0.95, m - 10)[1] uncer_dev = t_stat * para.get_sigma() # estimate uncer_cov uncer_cov = t_stat * sqrt(x.transpose() * para.get_X_inverse_prod() * x) * para.get_sigma() # estimate uncer uncer = sqrt(uncer_input ** 2 + uncer_output ** 2 + uncer_train ** 2 + uncer_dev ** 2 + uncer_cov ** 2) if full_output: if dist_output: return power, uncer, uncer_input, uncer_output, uncer_train, uncer_dev, uncer_cov, uncer_train_comp else: return power, uncer, uncer_input, uncer_output, uncer_train, uncer_dev, uncer_cov else: if dist_output: return power, uncer, uncer_train_comp else: return power, uncer
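# uncer_dev and uncer_cov share the same multiplier: the upper end of the
# two-sided 95% t interval for m - 10 degrees of freedom, ten being the number of
# coefficients in the cubic map of (t_evap, t_cond). A tiny sketch of that
# multiplier in isolation, with an illustrative training-set size:
from scipy.stats import t

m = 60        # illustrative number of training points
n_coeff = 10  # cubic polynomial in two variables has 10 coefficients
t_stat = t.interval(0.95, m - n_coeff)[1]
print(t_stat)  # factor applied to sigma and to the covariance term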
def DeepDiscovery(Xval, Yval, classnames, n, modeldir=None, mname=None, model=None, net=None, arch=None): # LOADS IN MODEL if modeldir is not None: tf.reset_default_graph() if net == '3FCN': print('3-Hidden Layer Fully Connected Network Selected') network = input_data( shape=[None, Xval.shape[1], Xval.shape[2], Xval.shape[3]]) network = fully_connected(network, 2000, activation='tanh') network = dropout(network, 0.5) network = fully_connected(network, 2000, activation='tanh') network = dropout(network, 0.5) network = fully_connected(network, 2000, activation='tanh') network = dropout(network, 0.5) network = fully_connected(network, Yval.shape[1], activation='softmax') network = regression(network, optimizer='momentum', loss='categorical_crossentropy', learning_rate=0.001) if net == '5FCN': print('5-Hidden Layer Fully Connected Network Selected') network = input_data( shape=[None, Xval.shape[1], Xval.shape[2], Xval.shape[3]]) network = fully_connected(network, 2000, activation='tanh') network = dropout(network, 0.5) network = fully_connected(network, 2000, activation='tanh') network = dropout(network, 0.5) network = fully_connected(network, 2000, activation='tanh') network = dropout(network, 0.5) network = fully_connected(network, 2000, activation='tanh') network = dropout(network, 0.5) network = fully_connected(network, 2000, activation='tanh') network = dropout(network, 0.5) network = fully_connected(network, Yval.shape[1], activation='softmax') network = regression(network, optimizer='momentum', loss='categorical_crossentropy', learning_rate=0.001) if net == 'AlexNet': print('AlexNet selected') network = input_data( shape=[None, Xval.shape[1], Xval.shape[2], Xval.shape[3]]) network = conv_2d(network, 96, 11, strides=4, activation='relu') network = max_pool_2d(network, 3, strides=2) network = local_response_normalization(network) network = conv_2d(network, 256, 5, activation='relu') network = max_pool_2d(network, 3, strides=2) network = local_response_normalization(network) network = conv_2d(network, 384, 3, activation='relu') network = conv_2d(network, 384, 3, activation='relu') network = conv_2d(network, 256, 3, activation='relu') network = max_pool_2d(network, 3, strides=2) network = local_response_normalization(network) network = fully_connected(network, 4096, activation='tanh') network = dropout(network, 0.5) network = fully_connected(network, 4096, activation='tanh') network = dropout(network, 0.5) network = fully_connected(network, Yval.shape[1], activation='softmax') network = regression(network, optimizer='momentum', loss='categorical_crossentropy', learning_rate=0.001) if arch is not None: print('Different Architecture Provided') network = arch if modeldir is not None: os.chdir(modeldir) model = tflearn.DNN(network) model.load(model_file=mname) # SORTS THE INPUTS BY CLASS k = Yval.shape[1] # determining the number of classes, k nkarr = np.sum(Yval, axis=0) # checking to make sure the test set is balanced N = Yval.shape[0] # number of inputs if np.min(nkarr) == np.max(nkarr): nk = nkarr[0] nk = int( nk) # if it balanced, the rest of the if statement will continue tick = np.zeros([ k ]) # creates a counter, next input associated with class is free Xsort = np.zeros(Xval.shape) # for sorted data Ysort = np.zeros(Yval.shape) # for sorted data for i in range(0, N): c = np.argmax( Yval[i, ...]) # checks which class the input in Xval belongs to Xsort[int(c * nk + tick[c]):int(c * nk + tick[c] + 1), ...] = Xval[i:i + 1, ...] 
Ysort[int(c * nk + tick[c]):int(c * nk + tick[c] + 1), c] = 1 tick[c] = tick[c] + 1 # FORMATS Xval while len( Xsort.shape ) < 4: # checks to see if the 4th dimension was already added Xsort = Xsort[ ..., None] # if it wasn't, the 4th dimension is added, if it was nothing happens # CREATES ARRAYS FOR PREDICTIONS N = Ysort.shape[0] # number of inputs Lhat = np.zeros([N, k]) # creates an empty matrix # STORES PREDICTIONS for i in range(0, N): q = model.predict(Xsort[ i:(i + 1), ...]) # row vector of the confidences outputted by the model Lhat[i:( i + 1 ), :] = q # assigns confidence values to the correct row in Lhat # CALCULATING THE Lressum Lres = Ysort - Lhat # calculates the raw residuals (labels - confidences) Lressum = np.std(Lres, axis=1) # std devs across the rows Lressum = Lressum[..., None] # FORMATTING Lressum FOR PLOTTING ids = np.zeros([N, 1]) # creating an id column to attach to Lressum for i in range(0, N): # stupid for loop because it's not R ids[i, 0] = i Lressumid = np.append(ids, Lressum, axis=1) Ldf = pd.DataFrame( Lressumid) # converting to Pandas because we're tired of Python Ldf = Ldf.rename(index=str, columns={0: "id", 1: "res"}) # PLOTTING plt.figure(1) Ldf.plot.scatter(x='id', y='res', title='RMSE Across All Classes, Grouped by Class') plt.show() plt.figure(2) for j in range(0, k): Ldfa = Ldf[nk * j:nk * (j + 1)] Ldfa.plot.scatter(x='id', y='res', title='RMSE Across Class ' + classnames[j]) plt.show() nsamp = int(N / n) LressumSamp = np.zeros([nsamp, n, 1]) Minima = np.zeros([nsamp]) for i in range(0, nsamp): LressumSamp[i, ...] = Lressum[i * n:(i + 1) * n, :] Minima[i] = np.where( LressumSamp[i] == np.min(LressumSamp[i]))[0][0] # Calculating the Confidence Intervals of the Position of the Lowest Error Input per Sample tval1 = t.interval(.90, n - 1)[1] lowlim1 = np.mean(Minima) - tval1 * np.std(Minima) upplim1 = np.mean(Minima) + tval1 * np.std(Minima) tval2 = t.interval(.95, n - 1)[1] lowlim2 = np.mean(Minima) - tval2 * np.std(Minima) upplim2 = np.mean(Minima) + tval2 * np.std(Minima) tval3 = t.interval(.99, n - 1)[1] lowlim3 = np.mean(Minima) - tval3 * np.std(Minima) upplim3 = np.mean(Minima) + tval3 * np.std(Minima) # Finding Average Error per Position in Sample AveErr = np.zeros([n]) for i in range(0, n): AveErr[i] = np.mean(LressumSamp[:, i]) plt.figure(3) plt.scatter(ids[0:n], AveErr) plt.axvspan(lowlim1, upplim1, alpha=0.05, color='salmon') plt.axvspan(lowlim2, upplim2, alpha=0.1, color='salmon') plt.axvspan(lowlim3, upplim3, alpha=0.18, color='salmon') plt.suptitle('Average RMSE of Input Number', fontsize=12) plt.title('Confidence Intervals of Signal Position in Red') plt.xlabel('Input Number') plt.ylabel('Average RMSE of Input') print('90% Confidence Interval Limits: (', lowlim1, ',', upplim1, ')') print('95% Confidence Interval Limits: (', lowlim2, ',', upplim2, ')') print('99% Confidence Interval Limits: (', lowlim3, ',', upplim3, ')') plt.show() # CONFIDENCE MATRIX label = tf.argmax(Ysort, axis=1) # converts true labels to column vector predict = tf.argmax( Lhat, axis=1 ) # predicts binary labels from the confidences, stores vector confusion_matrix = tf.confusion_matrix(label, predict, k) with tf.Session() as sess: cm = confusion_matrix.eval() # creates the confusion matrix pdcm = pd.DataFrame( cm) # converts the confusion matrix to pandas for aesthetics p = ['Predicted'] * k # this is to make pretty row and column names list = [] for i in range(0, k): list.append(p[i] + ' ' + classnames[i]) pclassname = list a = ['Actual'] * k list = [] for i in range(0, 
k): list.append(a[i] + ' ' + classnames[i]) aclassname = list pdcm.columns = pclassname # renaming columns pdcm.index = aclassname # renaming rows confusion_mat = pdcm return (confusion_mat) else: print("Need Balanced Testing Set")
def ts_dispersion_plot(self, **kwargs): ''' Plots disperison timeseries in matplotlib plot Parameters ---------- channel: string Channel options: dict Options including data processing prior to plot. Defaults in config._plot_def_opt formatting: dict Formatting dict. Defaults in config._ts_plot_def_fmt Returns ------- Matplotlib figure ''' if 'channel' not in kwargs: std_out('Needs at least one channel to plot') return None else: channel = kwargs['channel'] if 'options' not in kwargs: std_out('Using default options') options = config._plot_def_opt else: options = dict_fmerge(config._plot_def_opt, kwargs['options']) if 'formatting' not in kwargs: std_out('Using default formatting') formatting = config._ts_plot_def_fmt['mpl'] else: formatting = dict_fmerge(config._ts_plot_def_fmt['mpl'], kwargs['formatting']) if self.dispersion_df is None: std_out('Perform dispersion analysis first!', 'ERROR') return None if self.common_channels == []: self.get_common_channels() if channel not in self.common_channels: std_out(f'Channel {channel} not in common_channels') return None if channel in config._dispersion['ignore_channels']: std_out(f'Channel {channel} ignored per config') return None if len(self.devices) > config._dispersion['nt_threshold']: distribution = 'normal' std_out('Using normal distribution') std_out( f"Using limit for sigma confidence: {config._dispersion['limit_confidence_sigma']}" ) else: distribution = 't-student' std_out(f'Using t-student distribution.') # Size sanity check if formatting['width'] > 50: std_out('Reducing width to 12') formatting['width'] = 12 if formatting['height'] > 50: std_out('Reducing height to 10') formatting['height'] = 10 # Make subplot figure, (ax_tbr, ax_ok) = plt.subplots(nrows=2, sharex=formatting['sharex'], figsize=(formatting['width'], formatting['height'])) # cmap = plt.cm.Reds norm = matplotlib.colors.Normalize( vmin=0, vmax=config._dispersion['limit_errors'] / 2) ch_index = self.common_channels.index(channel) + 1 # Style if formatting['style'] is not None: style.use(formatting['style']) else: style.use(config._plot_style) # Font size if formatting['fontsize'] is not None: rcParams.update({'font.size': formatting['fontsize']}) total_number = len(self.common_channels) dispersion_avg = self._dispersion_summary[channel] if distribution == 'normal': limit_confidence = config._dispersion['limit_confidence_sigma'] # Calculate upper and lower bounds if (config._dispersion['instantatenous_dispersion']): # For sensors with high variability in the measurements, it's better to use this upper_bound = self.dispersion_df[channel + '_AVG']\ + limit_confidence * self.dispersion_df[channel + '_STD'] lower_bound = self.dispersion_df[channel + '_AVG']\ - abs(limit_confidence * self.dispersion_df[channel + '_STD']) else: upper_bound = self.dispersion_df[channel + '_AVG']\ + limit_confidence * dispersion_avg lower_bound = self.dispersion_df[channel + '_AVG']\ - abs(limit_confidence * dispersion_avg) else: limit_confidence = t.interval( config._dispersion['t_confidence_level'] / 100.0, len(self.devices), loc=self.dispersion_df[channel + '_AVG'], scale=dispersion_avg) upper_bound = limit_confidence[1] lower_bound = limit_confidence[0] for device in self.devices: ncol = channel + '-' + device if ncol in self.dispersion_df.columns: # Count how many times we go above the upper bound or below the lower one count_problems_up = self.dispersion_df[ncol] > upper_bound count_problems_down = self.dispersion_df[ncol] < lower_bound # Count them count_problems = [1 if (count_problems_up[i] 
or count_problems_down[i])\ else 0 for i in range(len(count_problems_up))] # Add the trace in either number_errors = np.sum(count_problems) max_number_errors = len(count_problems) if number_errors / max_number_errors > config._dispersion[ 'limit_errors'] / 100: std_out( f"Device {device} out of {config._dispersion['limit_errors']}% limit\ - {np.round(number_errors/max_number_errors*100, 1)}% out", 'WARNING') alpha = 1 ax_tbr.plot(self.dispersion_df.index, self.dispersion_df[ncol], color='r', label=device, alpha=alpha) else: alpha = 1 color = 'g' ax_ok.plot(self.dispersion_df.index, self.dispersion_df[ncol], color=color, label=device, alpha=alpha) # Add upper and low bound bound to subplot 1 ax_tbr.plot(self.dispersion_df.index, self.dispersion_df[channel + '_AVG'], 'b', label='Average', alpha=0.6) ax_tbr.plot(self.dispersion_df.index, upper_bound, 'k', label='Upper-Bound', alpha=0.6) ax_tbr.plot(self.dispersion_df.index, lower_bound, 'k', label='Lower-Bound', alpha=0.6) # Format the legend lgd1 = ax_tbr.legend(bbox_to_anchor=(1, 0.5), fancybox=True, loc='center left', ncol=5) ax_tbr.grid(True) ax_tbr.set_ylabel(channel + ' TBR') ax_tbr.set_xlabel('Time') # Add upper and low bound bound to subplot 2 ax_ok.plot(self.dispersion_df.index, self.dispersion_df[channel + '_AVG'], 'b', label='Average', alpha=0.6) ax_ok.plot(self.dispersion_df.index, upper_bound, 'k', label='Upper-Bound', alpha=0.6) ax_ok.plot(self.dispersion_df.index, lower_bound, 'k', label='Lower-Bound', alpha=0.6) # Format the legend ax_ok.legend(bbox_to_anchor=(1, 0.5), fancybox=True, loc='center left', ncol=5) lgd2 = ax_ok.legend(bbox_to_anchor=(1, 0.5), fancybox=True, loc='center left', ncol=5) ax_ok.grid(True) ax_ok.set_ylabel(channel + ' OK') ax_ok.set_xlabel('Time') figure.suptitle(f'({ch_index}/{total_number}) - {channel}', fontsize=formatting['title_fontsize']) plt.subplots_adjust(top=formatting['suptitle_factor']) if options['show']: plt.show() return figure