def get_stats(values, intervals=True):
    stats = {}
    values_array = np.array(values, dtype=np.float64)
    # float() replaces np.asscalar(), which was removed in NumPy 1.23
    stats['min'] = float(np.amin(values_array))
    stats['max'] = float(np.amax(values_array))
    stats['mean'] = float(np.mean(values_array))
    stats['median'] = float(np.median(values_array))
    if values_array.size > 1:
        stats['std_dev'] = float(np.std(values_array, ddof=1))
    else:
        stats['std_dev'] = 0
    if intervals:
        stats['intervals'] = []
        loc = stats['mean']
        scale = stats['std_dev'] / sqrt(values_array.size)
        for alpha in (.95, .99, .90, .85, .80, .50):
            # normal approximation for large samples, Student's t otherwise
            if values_array.size > 30:
                interval = norm.interval(alpha, loc=loc, scale=scale)
            else:
                interval = t.interval(alpha, values_array.size - 1, loc, scale)
            stats['intervals'].append(
                {'confidence': alpha, 'interval': interval})
    return stats
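# A minimal standalone sketch (not part of the snippet above; the sample data
# is made up) of the two branches get_stats() switches between: Student's t
# for small samples and the normal approximation once n > 30.
import numpy as np
from math import sqrt
from scipy.stats import norm, t

sample = np.array([9.8, 10.4, 10.1, 9.6, 10.9, 10.2, 9.9, 10.5])
loc = sample.mean()
scale = sample.std(ddof=1) / sqrt(sample.size)

# With only 8 points the t interval is noticeably wider than the normal one
print(t.interval(0.95, sample.size - 1, loc=loc, scale=scale))
print(norm.interval(0.95, loc=loc, scale=scale))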
def t_fun():
    # accumulate probability from -infinity to 3.0777
    res = t.cdf(3.0777, df=1)
    print(res)
    # endpoints containing the middle 95% of the distribution
    a, b = t.interval(0.95, 1)
    print(a, b)
def get_t_distro_outlier_bound_estimation(array, background_std):
    narray = rm_nans(array)
    low, up = t.interval(0.95, narray.shape[0] - 1,
                         np.mean(narray),
                         np.sqrt(np.var(narray) + background_std ** 2))
    up, low = (up - np.mean(narray), np.mean(narray) - low)
    return max(up, low)
def first_order_uncer(reading, conf=0.95):
    """
    Calculate the first-order uncertainty of the mean of time-series data.

    Parameters:
    ===========
    reading: array
        readings of the measurement in the time series
    conf: float
        Confidence level. Default is 95%.

    Returns:
    ===========
    first-order uncertainty of the mean of the time-series data
    """
    # sample standard deviation
    num = len(reading)
    sample_sigma = np.std(reading, ddof=1)
    # standard deviation of the mean
    mean_sigma = sample_sigma / sqrt(num - 1)
    # two-sided t critical value
    k = t.interval(conf, num - 1)[1]
    return k * mean_sigma
def CV_experiment(features, values, log):
    indeces = np.array(range(len(features)))
    np.random.seed(1)
    np.random.shuffle(indeces)
    k_cv = 10
    # integer fold size (// keeps this working under Python 3)
    test_set_len = len(features) // k_cv
    R_sq_array = []
    for k in range(k_cv):
        train_i = indeces[list(range(0, k * test_set_len)) +
                          list(range((k + 1) * test_set_len, len(features)))]
        test_i = indeces[list(range(k * test_set_len, (k + 1) * test_set_len))]
        model = sm.OLS(values[train_i], features[train_i])
        results = model.fit()
        predicted_values = results.predict(features[test_i])
        r_sq = compute_r_squared(values[test_i], predicted_values)
        R_sq_array.append(r_sq)
        log.write(str(r_sq) + '\n')
    Rm = np.mean(R_sq_array)
    Rsig = np.std(R_sq_array)
    conf_interval = t.interval(.95, len(R_sq_array) - 1, loc=Rm, scale=Rsig)
    log.write("\nAverage R squared\n")
    log.write(str(Rm))
    log.write("\nR squared STD\n")
    log.write(str(Rsig))
    log.write("\nR squared 95% confidence interval:\n")
    log.write(str(conf_interval))
def lower_and_upper(means, stds, n):
    lower, upper = [0] * len(means), [0] * len(means)
    for i in range(len(means)):
        m = means[i]
        s = stds[i]
        lower[i], upper[i] = student_t.interval(0.95, n, loc=m, scale=s)
    return lower, upper
def t_check(l, mu, alpha=0.05):
    aver = average(l)
    n = len(l)
    ss = s_2(l)
    tt = (aver - mu) * sqrt(n) / sqrt(ss)
    # two-sided acceptance region at level alpha
    lo, hi = t.interval(1 - alpha, n - 1)
    if tt > hi or tt < lo:
        print('reject')
    else:
        print('accept')
def calc_scipy():
    # read data
    data = loadtxt(DATA_PATH, delimiter=",", skiprows=1, usecols=(1, 2))
    # calculation
    t_value, p_value = ttest_rel(data[:, 0], data[:, 1])
    # extra check against the t critical values
    df = data.shape[0] - 1
    t_dist = t.interval(0.95, df)
    t_dist_001 = t.interval(0.99, df)
    # output (print() keeps this runnable under Python 3)
    print('[Scipy]')
    print('t value:', t_value)
    print('p value:', p_value)
    print('t dist(0.05):', t_dist[1], abs(t_value) > t_dist[1])
    print('t dist(0.01):', t_dist_001[1], abs(t_value) > t_dist_001[1])
    print()
def query_metrics(self):
    # query the skewness metric from each vnf every 2 secs and keep a running
    # average over 5 samples; query the host_cpu metric every 2 secs and keep
    # a running average and confidence interval over 10 samples
    while not self.stop_event.is_set():
        # query host cpu
        try:
            ret = query_Prometheus(self.host_cpu_query)
            value = float(ret[1]) / self.num_cores
            self.host_cpu_values.append(value)
        except Exception as e:
            LOG.info('Prometheus query failed: {0} \nquery: {1}'.format(
                e, self.host_cpu_query))

        # query skewness cpu
        try:
            for vnf_name, query in self.skew_query_dict.items():
                ret = query_Prometheus(query)
                value = float(ret[1])
                self.skew_value_dict[vnf_name].append(value)
        except Exception as e:
            LOG.info('Prometheus query failed: {0} \nquery: {1}'.format(e, query))

        # check overload
        N = len(self.host_cpu_values)
        if N < 5:
            time.sleep(2)
            continue

        mu = np.mean(self.host_cpu_values)
        sigma = np.std(self.host_cpu_values)
        # 95% confidence interval of the mean host cpu load
        R = t.interval(0.95, N - 1, loc=mu, scale=sigma / np.sqrt(N))
        host_cpu_load = float(R[1])
        if host_cpu_load > 95:
            LOG.info("host cpu overload CI: {0}".format(R))

        skew_list = []
        for vnf_name, values in self.skew_value_dict.items():
            skew_avg = np.mean(values)
            skew_list.append(skew_avg)
            # LOG.info("{0} skewness avg: {1}".format(vnf_name, np.mean(values)))
            if skew_avg < 0:
                LOG.info("{0} skewness overload: {1}".format(vnf_name, skew_avg))

        negative_skews = [s for s in skew_list if s < 0]
        if (host_cpu_load > 95) or (len(negative_skews) > 0):
            self.overload_flag.set()
        else:
            self.overload_flag.clear()

        time.sleep(2)
def clean_tri_replicates(points, std_of_tools):
    """
    Deletes an element inside the triplicate if one of the points is strongly
    outlying compared to the other two.

    :param points: triplicate measurements
    :param std_of_tools: background standard deviation of the measurement tools
    :return: points, with the outlier (if any) replaced by np.nan
    """
    if all(np.isnan(points)):
        # early termination if all points are nan
        return points
    arr_of_interest = pdist(points[:, np.newaxis])
    _min, _max = (np.min(arr_of_interest), np.max(arr_of_interest))
    containment = t.interval(0.95, 1, scale=_min / 2)[1]
    if _max > containment:
        outlier = 2 - np.argmin(arr_of_interest)
        msk = np.array([True, True, True])
        msk[outlier] = False
        _mean, _std = (np.mean(points[msk]), np.std(points[msk]))
        containment_2 = t.interval(0.95, 1, loc=_mean,
                                   scale=np.sqrt(_std ** 2 + std_of_tools ** 2))
        if points[outlier] > containment_2[1] or points[outlier] < containment_2[0]:
            points[outlier] = np.nan
    return points
def addValue(self, value):
    self.last_value = value
    self.list_values.append(value)
    # update running average
    self.sum += value
    self.len += 1
    self.average = self.sum / self.len
    # update CI
    if self.len > 5:
        mu = self.average
        sigma = np.std(self.list_values)
        N = self.len
        if sigma > 0:
            R = t.interval(0.95, N - 1, loc=mu, scale=sigma / np.sqrt(N))
            self.CI = R
def main(N, md, sd, p):
    from scipy.stats import t
    from math import sqrt

    a = 1 - p
    # two-sided confidence limits of the t distribution
    ta = t.interval(p, N - 1)
    # print(ta)
    c = (ta[1] * sd) / sqrt(N)
    # print("c: {}".format(c))
    # print(md - c, md + c)
    return (md - c, md, md + c)
    # as long as md - c > 0, the interval excludes zero and we reject the null hypothesis
def confidence_interval(data, confidence=0.95):
    """Estimate the confidence interval using the t-distribution with n-1
    degrees of freedom, t(n-1). This is the way to go when the sample size is
    small (n < 30) and the standard deviation cannot be estimated accurately.
    For large datasets, the t-distribution approaches the normal distribution.

    Parameters
    ----------
    data : array-like
        the dataset
    confidence : float between 0 and 1, optional
        the confidence level, default = 0.95

    Assumptions
    -----------
    the data follows a normal distribution (when sample size is large)

    call_function(s)
    ----------------
    Scipy's t.interval

    Returns
    -------
    None
    """
    degrees_freedom = len(data) - 1
    sample_mean = np.mean(data)
    sd_err = sem(data)  # Standard error of the mean SD / sqrt(n)
    low, high = t.interval(confidence, degrees_freedom, sample_mean, sd_err)
    err = high - sample_mean
    print(' ')
    print('Confidence set at {} %'.format(confidence * 100))
    print('Mean = {mean} ± {err}'.format(mean=round(sample_mean, 2),
                                         err=round(err, 2)))
    print('Max / min = {max} / {min}'.format(max=round(high, 2),
                                             min=round(low, 2)))
    print('Coefficient of variation = {} %'.format(
        round(100 * err / sample_mean, 1)))
    return None
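# Standalone check (illustrative data, not from the function above) that the
# t.interval(confidence, df, loc, scale) call used here is equivalent to the
# textbook form mean ± t.ppf((1 + confidence) / 2, df) * SEM.
import numpy as np
from scipy.stats import sem, t

data = np.array([4.1, 3.9, 4.4, 4.0, 4.2, 3.8])
dof = len(data) - 1
mean, se = np.mean(data), sem(data)

low, high = t.interval(0.95, dof, mean, se)
crit = t.ppf(0.975, dof)
assert np.isclose(low, mean - crit * se)
assert np.isclose(high, mean + crit * se)
print('Mean = {:.2f} ± {:.2f}'.format(mean, high - mean))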
def fit(self):
    """
    Build a confidence interval for each of the bands or indexes to use
    them as a classification threshold.
    """
    response = {}
    columns = list(self.pixels_df.columns)
    for column in columns:
        data_column = self.pixels_df[column]
        degrees_freedom = data_column.size - 1
        mean = np.mean(data_column)
        standard_error = sem(data_column)
        confidence_interval = t.interval(
            self.confidence_lvl, degrees_freedom, mean, standard_error
        )
        response[column] = confidence_interval
    return response
def plot_objectivefunction(results, evaluation, limit=None, sort=True,
                           fig_name='objective_function.png'):
    """Example Plot as seen in the SPOTPY Documentation"""
    import matplotlib.pyplot as plt
    likes = calc_like(results, evaluation, spotpy.objectivefunctions.rmse)
    data = likes
    # Calc confidence interval
    mean = np.average(data)
    # evaluate sample variance by setting delta degrees of freedom (ddof) to
    # 1. The degree used in calculations is N - ddof
    stddev = np.std(data, ddof=1)
    from scipy.stats import t
    # Get the endpoints of the range that contains 99.9% of the distribution
    t_bounds = t.interval(0.999, len(data) - 1)
    # sum mean to the confidence interval
    ci = [mean + critval * stddev / np.sqrt(len(data)) for critval in t_bounds]
    value = "Mean: %f" % mean
    print(value)
    value = "Confidence Interval 99.9%%: %f, %f" % (ci[0], ci[1])
    print(value)
    threshold = ci[1]
    happend = None
    bestlike = [data[0]]
    for like in data:
        if like < bestlike[-1]:
            bestlike.append(like)
            if bestlike[-1] < threshold and not happend:
                thresholdpos = len(bestlike)
                happend = True
        else:
            bestlike.append(bestlike[-1])
    if limit:
        plt.plot(bestlike, 'k-')  # [0:limit])
        plt.axvline(x=thresholdpos, color='r')
        plt.plot(likes, 'b-')
        # plt.ylim(ymin=-1, ymax=1.39)
    else:
        plt.plot(bestlike)
    plt.savefig(fig_name)
def compute_prediction_interval(self, X, y=None, level=.95):
    from scipy.stats import t
    ypred = self.predict(X, y)
    self.prediction_se_ = self.compute_prediction_se(X, y)
    self.prediction_se_ = self.process_predictions(
        self.prediction_se_,
        Vx=X[self.vx_colname][self.prediction_se_mask_],
        inverse_transform_y=False)
    n, p = self.train_features_.shape
    lower_z, upper_z = t.interval(level, n - p)
    pred_interval = {
        'ypred': ypred,
        'lower': ypred + (lower_z * self.prediction_se_),
        'upper': ypred + (upper_z * self.prediction_se_)
    }
    return pred_interval
def conf_interval(arr, confidence=0.95):
    N = arr.size
    # critical value: Student's t for small samples, normal otherwise
    if N <= 30:
        z = t.interval(confidence, N - 1)[1]
    else:
        z = norm.interval(confidence)[1]
    s = arr.std()
    x_bar = arr.mean()
    return (x_bar - z * (s / np.sqrt(N)), x_bar + z * (s / np.sqrt(N)))
def conf_interval(data, confidence=0.95):
    """Estimate the confidence interval using the t-distribution with n-1
    degrees of freedom, t(n-1). This is the way to go when the sample size is
    small (n < 30) and the standard deviation cannot be estimated accurately.
    For large datasets, the t-distribution approaches the normal distribution.

    Parameters
    ----------
    data : array-like
        the dataset
    confidence : float between 0 and 1, optional
        the confidence level, default = 0.95

    Assumptions
    -----------
    the data follows a normal or symmetric distribution (when sample size is large)

    call_function(s)
    ----------------
    Scipy's t.interval

    Returns
    -------
    the arithmetic mean, the error, and the limits of the confidence interval
    """
    dof = len(data) - 1
    amean = np.mean(data)
    std_err = sem(data)  # Standard error of the mean SD / sqrt(n)
    low, high = t.interval(confidence, dof, amean, std_err)
    err = high - amean
    print(' ')
    print(f'Mean = {amean:0.2f} ± {err:0.2f}')
    print(f'Confidence set at {confidence * 100} %')
    print(f'Max / min = {high:0.2f} / {low:0.2f}')
    print(f'Coefficient of variation = ±{100 * err / amean:0.1f} %')
    return amean, err, (low, high)
def summary_exectime(self):
    if not scipy_loaded:
        return
    exectimes = []
    with open(self.exectime_path) as f:
        for line in f:
            exectimes.append(float(line))
    # calculate 0.95 confidence interval, assuming Student's t distribution
    exectimes_mean = average(exectimes)
    standard_deviation = std(exectimes, ddof=1)
    t_bounds = t.interval(0.95, len(exectimes) - 1)
    ci = [
        exectimes_mean + crit_val * standard_deviation / math.sqrt(len(exectimes))
        for crit_val in t_bounds
    ]
    self.log("Mean exec time: {0:.2f}".format(exectimes_mean))
    self.log(
        "0.95 confidence interval, assuming Student's t distribution: {0:.2f}, {1:.2f}\n"
        .format(ci[0], ci[1]))
def compute_confint(self):
    # compute confint from ob_data
    total = 0.0
    sdtotal = 0.0
    n = 0.0
    # compute eprice and evol, weighted based on time
    for v in self._ob_data:
        weight = self._TIME_DECAY_FACTOR ** (time.time() - v[0])
        n += weight
        total += v[1] * weight
        sdtotal += ((v[1] - self._mu_sum / self._obs) ** 2) * weight
    self._eprice = total / n
    self._evol = sdtotal / n
    # if evol is zero, no activity is occurring; discourage the bot from
    # trading due to lack of liquidity
    if self._evol < 1:
        self._evol = (self._MAX_MKT_PRICE - 1) ** 2
    # CI based on Student's t distribution
    return t.interval(1 - self._CONFIDENCE, int(round(n)), self._eprice,
                      self._evol ** 0.5)
def plot_objectivefunction(results, evaluation, limit=None, sort=True):
    """Example Plot as seen in the SPOTPY Documentation"""
    import matplotlib.pyplot as plt
    from matplotlib import colors
    cnames = list(colors.cnames)
    likes = calc_like(results, evaluation)
    data = likes
    # Calc confidence interval
    mean = np.average(data)
    # evaluate sample variance by setting delta degrees of freedom (ddof) to
    # 1. The degree used in calculations is N - ddof
    stddev = np.std(data, ddof=1)
    from scipy.stats import t
    # Get the endpoints of the range that contains 99.9% of the distribution
    t_bounds = t.interval(0.999, len(data) - 1)
    # sum mean to the confidence interval
    ci = [mean + critval * stddev / np.sqrt(len(data)) for critval in t_bounds]
    value = "Mean: %f" % mean
    print(value)
    value = "Confidence Interval 99.9%%: %f, %f" % (ci[0], ci[1])
    print(value)
    threshold = ci[1]
    happend = None
    bestlike = [data[0]]
    for like in data:
        if like < bestlike[-1]:
            bestlike.append(like)
            if bestlike[-1] < threshold and not happend:
                thresholdpos = len(bestlike)
                happend = True
        else:
            bestlike.append(bestlike[-1])
    if limit:
        plt.plot(bestlike, 'k-')  # [0:limit])
        plt.axvline(x=thresholdpos, color='r')
        plt.plot(likes, 'b-')
        # plt.ylim(ymin=-1, ymax=1.39)
    else:
        plt.plot(bestlike)
def uncertainty_q_random(discharges, prop):
    """Compute 95% random uncertainty for property of discharge.
    Uses simplified method for 2 transects.

    Parameters
    ----------
    discharges: list
        List of Discharge objects
    prop: str
        Attribute of Discharge objects

    Returns
    -------
    cov: float
        Coefficient of variation
    cov_95: float
        Coefficient of variation inflated to 95% value
    """
    n_max = len(discharges)
    if n_max > 0:
        # Create array of specified attribute
        data = Uncertainty.get_array_attr(discharges, prop)

        # Compute coefficient of variation
        cov = np.abs(np.nanstd(data, ddof=1) / np.nanmean(data)) * 100

        # Inflate the cov to the 95% value
        if n_max == 2:
            # Use the approximate method as taught in class to reduce the high
            # coverage factor for 2 transects and account for prior knowledge
            # related to 720 second duration analysis
            cov_95 = cov * 3.3
        else:
            # Use Student's t to inflate COV for n > 2
            cov_95 = t.interval(0.95, n_max - 1)[1] * cov / n_max ** 0.5
    else:
        cov = np.nan
        cov_95 = np.nan

    return cov, cov_95
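# Rough standalone illustration (made-up discharge values, no Discharge
# objects) of the inflation used above for n > 2: the coefficient of variation
# is scaled by the two-sided 95% t critical value and divided by sqrt(n).
import numpy as np
from scipy.stats import t

data = np.array([102.0, 98.5, 101.2, 99.7, 100.9])
n = data.size
cov = np.abs(np.nanstd(data, ddof=1) / np.nanmean(data)) * 100
cov_95 = t.interval(0.95, n - 1)[1] * cov / n ** 0.5
print(cov, cov_95)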
def jackknife_ci(ratings, sim, use_unweighted, use_weighted):
    if use_weighted == True:
        mu_weighted_ratings = sum(ratings * sim) / sum(abs(sim))
    if use_unweighted == True:
        mu_ratings = np.mean(ratings)
    mu_jk_samples, n = [], len(ratings)
    index = np.arange(n)
    for i in range(n):
        if use_unweighted == True:
            jk_sample = ratings[index != i]
            mu_jk_sample = np.mean(jk_sample)
            # print(mu_jk_sample)
            mu_jk_samples.append(mu_jk_sample)
        if use_weighted == True:
            jk_sample = ratings[index != i] * sim[index != i]
            mu_jk_sample = sum(jk_sample) / (sum(abs(sim[index != i])))
            # print(sum(jk_sample) / sum(abs(sim[index != i])))
            # print(mu_jk_sample)
            mu_jk_samples.append(mu_jk_sample)
    if use_unweighted == True:
        se_jk = np.sqrt(
            sum(pow((mu_ratings - mu_jk_samples), 2)) * (n - 1) / n)
        # print(se_jk)
    if use_weighted == True:
        se_jk = np.sqrt(
            sum(pow((mu_weighted_ratings - mu_jk_samples), 2)) * (n - 1) / n)
        # print(se_jk)
    if n >= 30:
        multi = 1.96
    else:
        # two-sided 95% t critical value, matching the 1.96 used for n >= 30
        multi = t.interval(0.95, df=n - 1)[1]
    return multi * se_jk
def fitting_data(x, y, fit_method, **bounds):
    func, bounds_, bound_name = fit_method_fetcher(fit_method, **bounds)
    p_0 = [0.5 * (i + j) for i, j in zip(bounds_[0], bounds_[1])]
    x, y, y_stdev, multi_set = convert_x_y(x, y)
    try:
        freedom = max(1, len(x) - len(bound_name))
        lower_CI = []
        upper_CI = []
        if multi_set:
            fit_result, corv_ = curve_fit(func, x, y, p0=p_0, sigma=y_stdev,
                                          bounds=bounds_, absolute_sigma=False)
        else:
            fit_result, corv_ = curve_fit(func, x, y, p0=p_0,
                                          bounds=bounds_, absolute_sigma=False)
        sigma = np.sqrt(np.diagonal(corv_))
        for i, j in zip(sigma, fit_result):
            C_interval = t.interval(0.95, freedom, j, i)
            lower_CI.append(C_interval[0])
            upper_CI.append(C_interval[1])
    except Exception as e:
        print(e)
        fit_result = [1] * len(bound_name)
        lower_CI = fit_result
        upper_CI = fit_result
    return (dict(zip(bound_name, fit_result)),
            dict(zip(bound_name, lower_CI)),
            dict(zip(bound_name, upper_CI)))
def confidential_interval(x, alpha=0.98):
    """
    Return a numpy array of column confidence intervals.

    Args:
        x: a numpy array
        alpha: confidence level of the interval

    Returns:
        A numpy array which indicates the difference from each sample average
        point to its confidence interval point
    """
    from scipy.stats import t
    if x.ndim == 1:
        return None
    # calculate degrees of freedom
    df = len(x[0]) - 1
    # calculate the positive critical value of Student's t distribution
    cv = t.interval(alpha, df)[1]
    # calculate the sample standard deviation (population form, ddof=0)
    std = np.std(x, axis=1)
    # std / sqrt(df) equals the ddof=1 std divided by sqrt(n), i.e. the
    # standard error of the mean, so the result is cv * SEM
    return std * cv / np.sqrt(df)
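# Quick standalone check (synthetic array, not from the snippet above) of the
# shortcut used there: the population std (ddof=0) divided by sqrt(n - 1)
# equals the sample std (ddof=1) divided by sqrt(n), i.e. the standard error
# of the mean.
import numpy as np

x = np.random.default_rng(0).normal(size=(4, 10))
n = x.shape[1]
assert np.allclose(np.std(x, axis=1) / np.sqrt(n - 1),
                   np.std(x, axis=1, ddof=1) / np.sqrt(n))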
def post_stratified_mean_var(self, domain, variable, confidence=0.95):
    # print(">>> In post_stratified_mean_var <<<")
    pv = {}
    pt = {}
    # print(domain, variable)
    self.get_strata_var(domain, variable)
    for h in self.s2:
        # print(">>> {0}, {1} <<<".format(self.weights[h], self.s2[h]))
        pv[h] = self.weights[h] * self.s2[h]
        pv[h] += ((1 - self.weights[h]) * self.s2[h]) / len(
            self.dtfr.Plot.unique())
        pv[h] /= len(self.dtfr.Plot.unique())
        pt[h] = self.stratum_mean(h, domain, variable) * self.areas[h]
    vartot = self.var_total(pv)
    std_err = (vartot / len(self.dtfr.Plot.unique())) ** 0.5
    mean = self.dtfr.loc[self.dtfr.Domain == domain, variable].sum() / float(
        len(self.dtfr.Plot.unique()))
    poptot = self.total(domain, variable)  # sum(self.areas.values()) * mean
    cv = vartot ** 0.5 / poptot * 100
    conf_inter = t.interval(confidence,
                            len(self.dtfr.Plot.unique()) - 1, poptot, std_err)
    out = {
        'Domain mean': mean,
        'Population total': poptot,
        'Coefficient of variation': cv,
        'Strata variances': pv,
        'Strata totals': pt,
        'Variance of the total': vartot,
        'Confidence interval': conf_inter
    }
    return out
chargers = Charger(NBSS)
listTime = []
listChargingRate = []

while time < SIM_TIME:
    (time, event_type, charger) = FES.get()
    if event_type == "arrival":
        arrival(time, FES, waitingLine)
    elif event_type == "batteryAvailable":
        batteryAvailable(time, FES, waitingLine, charger)
    elif event_type == "chargingRate_change":
        chargingRate_change(time, FES)
        listTime.append(time)
        listChargingRate.append(chargingRate)

confidence_int_wait = t.interval(0.999, len(data.waitingTime) - 1,
                                 np.mean(data.waitingTime), sem(data.waitingTime))
confidence_int_charge = t.interval(0.999, len(data.chargingTime) - 1,
                                   np.mean(data.chargingTime), sem(data.chargingTime))

print(f"Confidence interval Waiting Time: {confidence_int_wait}")
print(f"Confidence interval Charging Time: {confidence_int_charge}")
print(f"Number of arrivals: {data.arr}")
print(f"Number of departures: {data.dep}")
print(f"Number of losses: {len(data.loss)}")

plotCDF(data.loss, "", "", "test.pdf")
def ts_dispersion_uplot(self, **kwargs): ''' Plots dispersion timeseries in uplot plot Parameters ---------- channel: string Channel options: dict Options including data processing prior to plot. Defaults in config._plot_def_opt formatting: dict Formatting dict. Defaults in config._ts_plot_def_fmt Returns ------- Matplotlib figure ''' head_template = ''' <link rel="stylesheet" href="https://leeoniya.github.io/uPlot/dist/uPlot.min.css"> <script src="https://leeoniya.github.io/uPlot/dist/uPlot.iife.js"></script> <div style="text-align:center"> <h2 style="font-family: Roboto"> {{title}} </h2> </div> ''' uplot_template = ''' <div id="plot{{subplot}}"></div> <script> data = {{data}}; options = {{options}}; if (typeof options.scatter == 'undefined') { options.scatter = false } if (options.scatter) { for (i=1; i<data.length; i++) { options['series'][i]["paths"] = u => null; } } u = new uPlot(options, data, document.getElementById("plot{{subplot}}")) </script> ''' if 'channel' not in kwargs: std_out('Needs at least one channel to plot') return None else: channel = kwargs['channel'] if 'options' not in kwargs: std_out('Using default options') options = config._plot_def_opt else: options = dict_fmerge(config._plot_def_opt, kwargs['options']) if 'formatting' not in kwargs: std_out('Using default formatting') formatting = config._ts_plot_def_fmt['uplot'] else: formatting = dict_fmerge(config._ts_plot_def_fmt['uplot'], kwargs['formatting']) # Size sanity check if formatting['width'] < 100: std_out('Setting width to 800') formatting['width'] = 800 if formatting['height'] < 100: std_out('Reducing height to 600') formatting['height'] = 600 if 'html' not in options: options['html'] = False if self.dispersion_df is None: std_out('Perform dispersion analysis first!', 'ERROR') return None if self.common_channels == []: self.get_common_channels() if channel not in self.common_channels: std_out(f'Channel {channel} not in common_channels') return None if channel in config._dispersion['ignore_channels']: std_out(f'Channel {channel} ignored per config') return None if len(self.devices) > config._dispersion['nt_threshold']: distribution = 'normal' std_out('Using normal distribution') std_out(f"Using limit for sigma confidence:\ {config._dispersion['limit_confidence_sigma']}") else: distribution = 't-student' std_out(f'Using t-student distribution.') ch_index = self.common_channels.index(channel) + 1 total_number = len(self.common_channels) h = Template(head_template).render( title=f'({ch_index}/{total_number}) - {channel}') dispersion_avg = self._dispersion_summary[channel] if distribution == 'normal': limit_confidence = config._dispersion['limit_confidence_sigma'] # Calculate upper and lower bounds if (config._dispersion['instantatenous_dispersion']): # For sensors with high variability in the measurements, it's better to use this upper_bound = self.dispersion_df[channel + '_AVG']\ + limit_confidence * self.dispersion_df[channel + '_STD'] lower_bound = self.dispersion_df[channel + '_AVG']\ - abs(limit_confidence * self.dispersion_df[channel + '_STD']) else: upper_bound = self.dispersion_df[channel + '_AVG']\ + limit_confidence * dispersion_avg lower_bound = self.dispersion_df[channel + '_AVG']\ - abs(limit_confidence * dispersion_avg) else: limit_confidence = t.interval( config._dispersion['t_confidence_level'] / 100.0, len(self.devices), loc=self.dispersion_df[channel + '_AVG'], scale=dispersion_avg) upper_bound = limit_confidence[1] lower_bound = limit_confidence[0] udf = self.dispersion_df.copy() 
udf['upper_bound'] = upper_bound udf['lower_bound'] = lower_bound udf = udf.fillna('null') # List containing subplots. First list for TBR, second for OK subplots = [[], []] if formatting['join_sbplot']: n_subplots = 1 else: n_subplots = 2 udf.index = udf.index.astype(int) / 10**9 # Compose subplots lists for device in self.devices: ncol = channel + '-' + device if ncol in self.dispersion_df.columns: # Count how many times we go above the upper bound or below the lower one count_problems_up = self.dispersion_df[ncol] > upper_bound count_problems_down = self.dispersion_df[ncol] < lower_bound # Count them count_problems = [1 if (count_problems_up[i] or count_problems_down[i])\ else 0 for i in range(len(count_problems_up))] # Add the trace in either number_errors = np.sum(count_problems) max_number_errors = len(count_problems) # TBR if number_errors / max_number_errors > config._dispersion[ 'limit_errors'] / 100: std_out( f"Device {device} out of {config._dispersion['limit_errors']}% limit\ - {np.round(number_errors/max_number_errors*100, 1)}% out", 'WARNING') subplots[0].append(ncol) #OK else: subplots[n_subplots - 1].append(ncol) # Add upper and low bound bound to subplot 0 subplots[0].append(channel + '_AVG') subplots[0].append('upper_bound') subplots[0].append('lower_bound') if n_subplots > 1: # Add upper and low bound bound to subplot 1 subplots[n_subplots - 1].append(channel + '_AVG') subplots[n_subplots - 1].append('upper_bound') subplots[n_subplots - 1].append('lower_bound') ylabels = [channel + '_TBR', channel + '_OK'] else: ylabels = [channel] # Make subplots for isbplt in range(n_subplots): sdf = udf.loc[:, subplots[isbplt]] sdf = sdf.reset_index() data = sdf.values.T.tolist() labels = sdf.columns useries = [{'label': labels[0]}] ylabel = ylabels[isbplt] uaxes = [{ 'label': formatting['xlabel'], 'labelSize': formatting['fontsize'], }, { 'label': ylabel, 'labelSize': formatting['fontsize'] }] color_idx = 0 for label in labels: if label == labels[0]: continue if color_idx + 1 > len(colors): color_idx = 0 # Gray bounds and averages if '_bound' in label or '_AVG' in label: stroke = 'gray' point = {'space': 50, 'size': min([formatting['size'] - 2, 1])} else: stroke = colors[color_idx] point = {'space': 0, 'size': formatting['size']} nser = {'label': label, 'stroke': stroke, 'points': point} useries.append(nser) color_idx += 1 u_options = { 'width': formatting['width'], 'height': formatting['height'], 'legend': { 'isolate': True }, 'cursor': { 'lock': True, 'focus': { 'prox': 16, }, 'sync': { 'key': 'moo', 'setSeries': True, }, 'drag': { 'x': True, 'y': True, 'uni': 50, 'dist': 10, } }, 'scales': { 'x': { 'time': True }, 'y': { 'auto': True }, }, 'series': useries, 'axes': uaxes } h2 = Template(uplot_template).render(data=json.dumps(data), options=json.dumps(u_options), subplot=isbplt) h += h2 h = h.replace('"', "'") h = h.replace("'null'", "null") if options['html']: return h else: iframe = f'''<iframe srcdoc="{h}" src="" frameborder="0" width={formatting['width'] + formatting['padding-right']} height={formatting['height'] + formatting['padding-bottom']} sandbox="allow-scripts"> </iframe>''' return HTML(iframe)
def bias(df, dropna=True, alpha=0.05, flatten=True):
    """
    Calculates temporal mean biases and their confidence intervals based on
    Student's t-distribution, both with and without auto-correlation corrected
    sample size.

    Parameters
    ----------
    df : pd.DataFrame
        Data Frame whose k columns will be correlated
    dropna : boolean
        If false, temporal matching (dropna-based) will be done for each
        column-combination individually
    alpha : float [0,1]
        Significance level for the confidence intervals
    flatten : boolean
        If set, results are returned as pd.Series in case df only holds 2 columns

    Returns
    -------
    res : xr.DataArray (k x k x 7)
        Data Array holding the following statistics for each data set combination of df:
        bias : Temporal mean bias
        n, n_corr : original and auto-correlation corrected sample size
        CI_l, CI_l_corr, CI_u, CI_u_corr : lower and upper confidence limits
            with and without sample size correction
    res : pd.Series (if flatten is True and df contains only two columns)
        Series holding the above described statistics for the two input data sets.
    """
    if not isinstance(df, pd.DataFrame):
        print('Error: Input is no pd.DataFrame.')
        return None

    if dropna is True:
        df.dropna(inplace=True)
    df.sort_index(inplace=True)

    cols = df.columns.values
    stats = ['bias', 'n', 'CI_l', 'CI_u', 'n_corr', 'CI_l_corr', 'CI_u_corr']
    dummy = np.full((len(cols), len(cols), len(stats)), np.nan)

    res = xr.DataArray(dummy,
                       dims=['ds1', 'ds2', 'stats'],
                       coords={'ds1': cols, 'ds2': cols, 'stats': stats})

    for ds1 in cols:
        for ds2 in cols:
            if ds1 == ds2:
                continue

            # get sample size
            tmpdf = df[[ds1, ds2]].dropna()
            n = len(tmpdf)
            res.loc[ds1, ds2, 'n'] = n
            res.loc[ds2, ds1, 'n'] = n
            if n < 5:
                continue

            # Calculate bias & ubRMSD
            diff = tmpdf[ds1].values - tmpdf[ds2].values
            bias = diff.mean()
            ubRMSD = diff.std(ddof=1)

            # Confidence intervals with the original sample size
            t_l, t_u = t.interval(1 - alpha, n - 1)
            CI_l = bias + t_l * ubRMSD / np.sqrt(n)
            CI_u = bias + t_u * ubRMSD / np.sqrt(n)

            res.loc[ds1, ds2, 'bias'] = bias
            res.loc[ds1, ds2, 'CI_l'] = CI_l
            res.loc[ds1, ds2, 'CI_u'] = CI_u

            n_corr = correct_n(n, tmpdf)
            res.loc[ds1, ds2, 'n_corr'] = n_corr
            res.loc[ds2, ds1, 'n_corr'] = n_corr
            if n_corr < 5:
                continue

            # Confidence intervals with corrected sample size
            t_l, t_u = t.interval(1 - alpha, n_corr - 1)
            CI_l = bias + t_l * ubRMSD / np.sqrt(n_corr)
            CI_u = bias + t_u * ubRMSD / np.sqrt(n_corr)

            res.loc[ds1, ds2, 'CI_l_corr'] = CI_l
            res.loc[ds1, ds2, 'CI_u_corr'] = CI_u

    if flatten is True:
        if len(cols) == 2:
            res = pd.Series(res.loc[cols[0], cols[1], :],
                            index=stats, dtype='float32')

    return res
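# Minimal standalone sketch (fabricated series, no autocorrelation correction)
# of the uncorrected interval construction used in bias():
# CI = bias + t_{alpha/2, n-1} * ubRMSD / sqrt(n).
import numpy as np
from scipy.stats import t

rng = np.random.default_rng(1)
ds1 = rng.normal(10.0, 1.0, 50)
ds2 = ds1 + rng.normal(0.3, 0.5, 50)   # ds2 carries a bias of roughly 0.3

diff = ds1 - ds2
n = len(diff)
bias_est = diff.mean()
ubRMSD = diff.std(ddof=1)
t_l, t_u = t.interval(1 - 0.05, n - 1)
print(bias_est,
      bias_est + t_l * ubRMSD / np.sqrt(n),
      bias_est + t_u * ubRMSD / np.sqrt(n))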
def summary(self, regpyhdfe, yname=None, xname=None, title=None, alpha=.05): """ Summarize the Regression Results. Parameters ---------- yname : str, optional Name of endogenous (response) variable. The Default is `y`. xname : list[str], optional Names for the exogenous variables. Default is `var_##` for ## in the number of regressors. Must match the number of parameters in the model. title : str, optional Title for the top table. If not None, then this replaces the default title. alpha : float The significance level for the confidence intervals. Returns ------- Summary Instance holding the summary tables and text, which can be printed or converted to various output formats. See Also -------- statsmodels.iolib.summary.Summary : A class that holds summary results. """ ########################################################################################################## ########################################################################################################## # https://apithymaxim.wordpress.com/2020/03/16/clustering-standard-errors-by-hand-using-python/ # http://cameron.econ.ucdavis.edu/research/Cameron_Miller_JHR_2015_February.pdf #N,k,Nclusts = len(df.index),3,50 # Number of observations, right hand side columns counting constant, number of clusters #X = np.hstack( (np.random.random((N,k-1)), np.ones((N,1)) ) ) #X = get_np_columns(df, ['wks_ue', 'tenure'], intercept=True) X = regpyhdfe.data[:, 1:] #y = get_np_columns(df, ['ttl_exp']) y = np.expand_dims(regpyhdfe.data[:, 0], 1) # Calculate (X'X)^-1 and the vector of coefficients, beta XX_inv = np.linalg.inv(X.T.dot(X)) beta = (XX_inv).dot(X.T.dot(y)) resid = y - X.dot(beta) #ID = np.random.choice([x for x in range(Nclusts)],N) # Vector of cluster IDs #ID = np.squeeze(get_np_columns(df, ['delete_me'])) ID = np.squeeze(regpyhdfe.groups_np) c_list = np.unique(ID) # Get unique list of clusters N, k, Nclusts = X.shape[0], X.shape[1], int(c_list.shape[0]) sum_XuuTX = 0 for c in range(0, Nclusts): in_cluster = (ID == c_list[c]) # Indicator for given cluster value resid_c = resid[in_cluster] uuT = resid_c.dot(resid_c.T) Xc = X[in_cluster] XuuTX = Xc.T.dot(uuT).dot(Xc) sum_XuuTX += XuuTX adj = (Nclusts / (Nclusts - 1)) * ( (N - 1) / (N - k) ) # Degrees of freedom correction from https://www.stata.com/manuals13/u20.pdf p. 
54 # TODO: actually check if the fixed effects are nested df_a_nested = 1 adj = ((N - 1) / (N - df_a_nested - k)) * (Nclusts / (Nclusts - 1)) V_beta = adj * (XX_inv.dot(sum_XuuTX).dot(XX_inv)) se_beta = np.sqrt(np.diag(V_beta)) # Output data for Stata for_stata = pd.DataFrame(X) for_stata.columns = ["X" + str(i) for i in range(k)] for_stata['ID'] = ID for_stata['y'] = y ##for_stata.to_stata("resid_test.dta") print('B', beta, '\n SE: \n', se_beta) beta = np.squeeze(beta) t_values = beta / se_beta print('T values', t_values) from scipy.stats import t p_values = 2 * t.cdf(-np.abs(t_values), regpyhdfe.model.df_resid) # confidence interval size t_interval = np.asarray( t.interval(alpha=(1 - alpha), df=regpyhdfe.model.df_resid)) print("t_interval", t_interval) intervals = np.empty(shape=(beta.shape[0], 2)) # for each variables for i in range(0, intervals.shape[0]): intervals[i] = t_interval * se_beta[i] + beta[i] print('intervals', intervals) tmp1 = np.linalg.solve(V_beta, np.mat(beta).T) tmp2 = np.dot(np.mat(beta), tmp1) fvalue = tmp2[0, 0] / k import pdb pdb.set_trace() print('fvalue', fvalue) # from statsmodels.stats.stattools import ( # jarque_bera, omni_normtest, durbin_watson) # jb, jbpv, skew, kurtosis = jarque_bera(self.wresid) # omni, omnipv = omni_normtest(self.wresid) # eigvals = self.eigenvals # condno = self.condition_number # TODO: Avoid adding attributes in non-__init__ # self.diagn = dict(jb=jb, jbpv=jbpv, skew=skew, kurtosis=kurtosis, # omni=omni, omnipv=omnipv, condno=condno, # mineigval=eigvals[-1]) # TODO not used yet # diagn_left_header = ['Models stats'] # diagn_right_header = ['Residual stats'] # TODO: requiring list/iterable is a bit annoying # need more control over formatting # TODO: default do not work if it's not identically spelled top_left = [ ('Dep. Variable:', None), ('Model:', None), ('Method:', ['Least Squares']), ('Date:', None), ('Time:', None), ('No. Observations:', None), ('Df Residuals:', None), ('Df Model:', None), ] if hasattr(self, 'cov_type'): top_left.append(('Covariance Type:', [self.cov_type])) rsquared_type = '' if self.k_constant else ' (uncentered)' top_right = [ ('R-squared' + rsquared_type + ':', ["%#8.3f" % self.rsquared]), ('Adj. 
R-squared' + rsquared_type + ':', ["%#8.3f" % self.rsquared_adj]), ('F-statistic:', ["%#8.4g" % self.fvalue]), ('Prob (F-statistic):', ["%#6.3g" % self.f_pvalue]), ] # diagn_left = [('Omnibus:', ["%#6.3f" % omni]), # ('Prob(Omnibus):', ["%#6.3f" % omnipv]), # ('Skew:', ["%#6.3f" % skew]), # ('Kurtosis:', ["%#6.3f" % kurtosis]) # ] # # diagn_right = [('Durbin-Watson:', # ["%#8.3f" % durbin_watson(self.wresid)] # ), # ('Jarque-Bera (JB):', ["%#8.3f" % jb]), # ('Prob(JB):', ["%#8.3g" % jbpv]), # ] if title is None: title = self.model.__class__.__name__ + ' ' + "Regression Results" # create summary table instance from statsmodels.iolib.summary import Summary smry = Summary() smry.add_table_2cols(self, gleft=top_left, gright=top_right, yname=yname, xname=xname, title=title) smry.add_table_params(self, yname=yname, xname=xname, alpha=alpha, use_t=self.use_t) # smry.add_table_2cols(self, gleft=diagn_left, gright=diagn_right, # yname=yname, xname=xname, # title="") # add warnings/notes, added to text format only etext = [] if not self.k_constant: etext.append("R² is computed without centering (uncentered) since the " "model does not contain a constant.") if hasattr(self, 'cov_type'): etext.append(self.cov_kwds['description']) if self.model.exog.shape[0] < self.model.exog.shape[1]: wstr = "The input rank is higher than the number of observations." etext.append(wstr) # if eigvals[-1] < 1e-10: # wstr = "The smallest eigenvalue is %6.3g. This might indicate " # wstr += "that there are\n" # wstr += "strong multicollinearity problems or that the design " # wstr += "matrix is singular." # wstr = wstr % eigvals[-1] # etext.append(wstr) # elif condno > 1000: # TODO: what is recommended? # wstr = "The condition number is large, %6.3g. This might " # wstr += "indicate that there are\n" # wstr += "strong multicollinearity or other numerical " # wstr += "problems." # wstr = wstr % condno # etext.append(wstr) if etext: etext = [ "[{0}] {1}".format(i + 1, text) for i, text in enumerate(etext) ] etext.insert(0, "Notes:") smry.add_extra_txt(etext) return smry
xaxes = np.linspace(np.min(VA) * 0.9995, np.max(VA) * 1.001, 1000)
pdf = norm.pdf(xaxes, loc=VA_quer, scale=s_VA)
ax.plot(xaxes, pdf, 'r', label='Probability density of the population')
ax.set_xlabel(r'Heat in $\frac{\mathrm{cal}}{\mathrm{g}}$')
ax.set_ylabel(r'Probability')
ax.legend()

"d) Hypothesis test for identical means mu_VA and mu_VB"
# Sample analysis
VB_quer = np.mean(VB)
s_VB = np.std(VB, ddof=1)
N_VB = VB.size
# Pooled standard deviation of both samples
s_gesamt = np.sqrt(((N_VA - 1) * s_VA ** 2 + (N_VB - 1) * s_VB ** 2)
                   / (N_VA + N_VB - 2))

# Interval bounds of the t-distributed variable with N_VA + N_VB - 2 degrees of freedom
C = t.interval(gamma95, N_VA + N_VB - 2)

# Compute the acceptance region
Annnahme_delta_x_quer = np.array([
    C[0] * np.sqrt(1 / N_VA + 1 / N_VB) * s_gesamt,
    C[1] * np.sqrt(1 / N_VA + 1 / N_VB) * s_gesamt
])

# Compare with the sample
if (VA_quer - VB_quer) < Annnahme_delta_x_quer[0] or (
        VA_quer - VB_quer) >= Annnahme_delta_x_quer[1]:
    print("Hypothesis rejected")
else:
    print("Hypothesis accepted")
print("{:.4f} < {:.4f} <= {:.4f}".format(Annnahme_delta_x_quer[0],
                                         (VA_quer - VB_quer),
                                         Annnahme_delta_x_quer[1]))
skl_linmod = linear_model.LinearRegression()
skl_linmod.fit(X_scaled, Y)
coeff = skl_linmod.coef_
intercept = skl_linmod.intercept_
teta_scaled = [intercept, coeff[0], coeff[1], coeff[2], coeff[3], coeff[4]]
print(teta_scaled)

residual = Y - skl_linmod.predict(X_scaled)
norm_residual = LA.norm(residual)
# unbiased estimate of the residual variance
var_residual_estimated = (norm_residual ** 2) / (n - LA.matrix_rank(X_scaled))
print(var_residual_estimated)

# two-sided Student's t critical value at the 99% level
interval_student = t.interval(0.99, n - p - 1, loc=0, scale=1)
quantile = interval_student[1]

txx = np.dot(np.transpose(X_scaled), X_scaled)
txx_inv = LA.inv(txx)

# 99% confidence interval for each coefficient
intervalle = []
for i in range(5):
    intervalle.append(
        [
            coeff[i] - quantile * np.sqrt(var_residual_estimated * txx_inv[i][i]),
            coeff[i] + quantile * np.sqrt(var_residual_estimated * txx_inv[i][i]),
        ]
    )
print(intervalle)
print("95%信頼区間:", confint) p = sm.tsa.adfuller(arma_res.resid, regression='nc')[1] #[1]はp値の検定結果 p1 = sm.tsa.adfuller(arma_res.resid, regression='c')[1] #[1]はp値の検定結果 print("ドリフト無しランダムウォーク p値:", p) print("ドリフト付きランダムウォーク p値:", p1) from scipy.stats import t resid = arma_res.resid.iloc[1:] m = resid.mean() v = resid.std() resid_max = pd.Series.rolling(arma_res.resid, window=250).mean().max() resid_min = pd.Series.rolling(arma_res.resid, window=250).mean().min() print("平均: %2.5f" % m, "標準偏差: %2.4f" % v) print("250日平均の最大値: %2.5f" % resid_max, "250日平均の最小値: %2.5f" % resid_min) print("250日平均の95%の信頼区間: ", (t.interval(alpha=0.95, df=250, loc=0, scale=v))) from scipy.stats import chi2 resid = arma_res.resid.iloc[1:] m = resid.mean() v = resid.std() resid_max = pd.Series.rolling(arma_res.resid, window=250).std().max() resid_min = pd.Series.rolling(arma_res.resid, window=250).std().min() print("平均: %2.5f" % m, " 標準偏差: %2.5f" % v) print("250日標準偏差の最大値:%2.5f" % resid_max, "250日標準偏差の最小値:%2.5f" % resid_min) cint1, cint2 = chi2.interval(alpha=(0.95), df=249) bcs = [ "1949/5/16", "1954/12/1", "1972/1/1", "1986/12/1", "1986/12/1", "1993/11/1", "1999/2/1", "2002/2/1", "2009/4/1"
from scipy.stats import t
from numpy import average, std
from math import sqrt

if __name__ == '__main__':
    # data we want to evaluate: average height of 30 one year old male and
    # female toddlers. Interestingly, at this age height is not bimodal yet
    data = [63.5, 81.3, 88.9, 63.5, 76.2, 67.3, 66.0, 64.8, 74.9, 81.3,
            76.2, 72.4, 76.2, 81.3, 71.1, 80.0, 73.7, 74.9, 76.2, 86.4,
            73.7, 81.3, 68.6, 71.1, 83.8, 71.1, 68.6, 81.3, 73.7, 74.9]
    mean = average(data)
    # evaluate sample variance by setting delta degrees of freedom (ddof) to
    # 1. The degree used in calculations is N - ddof
    stddev = std(data, ddof=1)
    # Get the endpoints of the range that contains 95% of the distribution
    t_bounds = t.interval(0.95, len(data) - 1)
    # sum mean to the confidence interval
    ci = [mean + critval * stddev / sqrt(len(data)) for critval in t_bounds]
    print("Mean: %f" % mean)
    print("Confidence Interval 95%%: %f, %f" % (ci[0], ci[1]))

#%%
##%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
            'pre_score': [4, 24, 31, 2, 3],
            'mid_score': [25, 94, 57, 62, 70],
def fire_stats_by_year(pathway_set): """From a list of FireGirlPathway objects, return descriptive statistics of fires, by yearly_logging_totals Arguements pathway_set: A list of FireGirlPathway objects Returns a list with the following elements: -Element 0: A list containing various cells-burned stats ---Element 0: A list by year containing average number of cells burned in this years fires accross pathways ---Element 1: A list by year containing the smallest number of cells burned for any fire in a given year ---Element 2: A list by year containing the largest number of cells burned for any fire in a given year ---Element 3: A list, by year, containing standard deviations of cells burned for each year ---Element 4: A list, by year, containing upper confidence intervals on cells burned ---Element 5: A list, by year, containing lower confidence intervals on cells burned -Element 1: A list containing various timber-lost stats ---Element 0: A list, by year, containing the average timber lost to fire each year ---Element 1: A list, by year, containing the smallest timber lost to a fire in any pathway in a given year ---Element 2: A list, by year, containing the largest timber lost to a fire in any pathway in a given year ---Element 3: A list, by year, containing standard deviations of timber lost for each year ---Element 4: A list, by year, containing upper confidence intervals on timber lost ---Element 5: A list, by year, containing lower confidence intervals on timber lost -Element 2: A list, by year, of the number of pathways which suppressed their fires this year """ #checking for an empty input list if len(pathway_set) < 1: #it's empty, so return an equally empty result string return [[],[],[],[],[],[],[],[],[],[]] #things to compile cells_burned_ave = [] cells_burned_max = [] cells_burned_min = [] cells_burned_std = [] cells_burned_confidence_upper = [] cells_burned_confidence_lower = [] timber_lost_ave = [] timber_lost_max = [] timber_lost_min = [] timber_lost_std = [] timber_lost_confidence_upper = [] timber_lost_confidence_lower = [] suppress_decisions = [] #how many years are there in these pathways? # assuming they all have the same number of events, query the first pathway in the list, and get the lenght # of it's ignition_events list years = len(pathway_set[0].ignition_events) #get a new value for each of the above lists, for each year for y in range(years): this_years_cells_burned = [] this_years_timber_lost = [] this_years_suppress_decisions = 0 #look through each pathway and add their value for year=y to cells_burned, timber_lost, and supp_decisions for pw in pathway_set: #in a pathway's ignition_events list, the ignition records each have an "outcomes" member #an ignition_record.getOutcomes() call returns a list in the following format: # [timber_loss, cells_burned, sup_cost, end_time] outcomes = pw.ignition_events[y].getOutcomes() this_years_timber_lost.append( outcomes[0] ) this_years_cells_burned.append( outcomes[1] ) #likewise, calling an iginiton event object's .getChoice() method will return a True if the simulator # suppressed that fire, and a False if it did not. 
if pw.ignition_events[y].getChoice(): this_years_suppress_decisions += 1 #we've got all the cells_burned, timber_lost, and suppress decisions for each pathway for this year cells_burned_ave.append( mean(this_years_cells_burned) ) cells_burned_max.append( max(this_years_cells_burned) ) cells_burned_min.append( min(this_years_cells_burned) ) cells_burned_std.append( std(this_years_cells_burned) ) timber_lost_ave.append( mean(this_years_timber_lost) ) timber_lost_max.append( max(this_years_timber_lost) ) timber_lost_min.append( min(this_years_timber_lost) ) timber_lost_std.append( std(this_years_timber_lost) ) suppress_decisions.append( this_years_suppress_decisions ) #get the t-stat for a 95% confidence interval for sample of this size #this returns a list with the [lower , upper] stats, which are equal and opposite if centered # around the mean, as ours are. tstat = t.interval(0.95, len(this_years_cells_burned) ) #the upper and lower confidence intervals are calculated as # Upper = Mean + (tstat) * (standard error of the mean) # Lower = Mean - (tstat) * (standard error of the mean) cells_burned_upper_conf = cells_burned_ave[y] + ( tstat[0] * cells_burned_std[y] ) cells_burned_lower_conf = cells_burned_ave[y] - ( tstat[0] * cells_burned_std[y] ) timber_lost_upper_conf = timber_lost_ave[y] + ( tstat[0] * timber_lost_std[y] ) timber_lost_lower_conf = timber_lost_ave[y] - ( tstat[0] * timber_lost_std[y] ) cells_burned_confidence_upper.append( cells_burned_upper_conf ) cells_burned_confidence_lower.append( cells_burned_lower_conf ) timber_lost_confidence_upper.append( timber_lost_upper_conf ) timber_lost_confidence_lower.append( timber_lost_lower_conf ) #All Years are finished, so compile the return lists cells_burned_stats = [cells_burned_ave, cells_burned_min, cells_burned_max, cells_burned_std, timber_lost_confidence_lower, cells_burned_confidence_upper] timber_lost_stats = [timber_lost_ave, timber_lost_min, cells_burned_max, timber_lost_std, timber_lost_confidence_lower, timber_lost_confidence_upper] return [cells_burned_stats, timber_lost_stats, suppress_decisions]
def timber_harvest_stats_by_year(pathway_set):
    """From a list of FireGirlPathway objects, return summary statistics of harvest values by year

    Args
    pathway_set: a list of at least one FireGirlPathway object

    Returns
    A list with six elements:
    -Element 0: A list containing the yearly average harvest values over all pathways in the set.
        The first element of the list will be the average harvest value for the first year of
        EACH pathway, and so on.
    -Element 1: A list containing the minimum harvest value of any pathway during that year.
    -Element 2: A list containing the maximum harvest value of any pathway during that year.
    -Element 3: A list containing the standard deviation of the yearly harvest values
    -Element 4: A list containing the lower bound of the 95% confidence interval
    -Element 5: A list containing the upper bound of the 95% confidence interval
    """

    #checking for an empty input list
    if len(pathway_set) < 1:
        #it's empty, so return an equally empty result list
        return [[], [], [], [], [], []]

    #Get averages and standard errors for each year
    yearly_ave = []
    yearly_stdev = []
    yearly_max = []
    yearly_min = []
    yearly_confidence_upper = []
    yearly_confidence_lower = []

    #how many years are there in these pathways?
    # assuming they all have the same number of events, query the first pathway in the list,
    #  and get the length of its ignition_events list
    years = len(pathway_set[0].ignition_events)

    #for each year, look in each pathway and add its harvest value to a list
    for y in range(years):
        this_years_harvest = []
        for pw in pathway_set:
            #add this pathway's harvest value for this year to the list
            this_years_harvest.append(pw.yearly_logging_totals[y])

        #finished with all pathways at this year, so the list holds all year=y harvest values
        #add a new element to the _ave and _stdev lists and add this year's stat to each
        yearly_ave.append(mean(this_years_harvest))
        yearly_stdev.append(std(this_years_harvest))
        yearly_min.append(min(this_years_harvest))
        yearly_max.append(max(this_years_harvest))

        #get the t-stat for a 95% confidence interval for a sample of this size
        #t.interval returns the [lower, upper] critical values, which are equal and
        # opposite when centered around the mean, as ours are
        n = len(this_years_harvest)
        tstat = t.interval(0.95, n - 1)

        #the upper and lower confidence intervals are calculated as
        # Upper = Mean + (tstat) * (standard error of the mean)
        # Lower = Mean - (tstat) * (standard error of the mean)
        sem_y = yearly_stdev[y] / (n ** 0.5)
        upper_conf = yearly_ave[y] + (tstat[1] * sem_y)
        lower_conf = yearly_ave[y] - (tstat[1] * sem_y)

        #add them to the list
        yearly_confidence_upper.append(upper_conf)
        yearly_confidence_lower.append(lower_conf)

    #finished with ALL years

    #return a list with each list of stats
    return [yearly_ave, yearly_min, yearly_max, yearly_stdev,
            yearly_confidence_lower, yearly_confidence_upper]
def __init__(self, linear_regression, api=None): self.resource_id = None self.input_fields = [] self.term_forms = {} self.tag_clouds = {} self.term_analysis = {} self.items = {} self.item_analysis = {} self.categories = {} self.coefficients = [] self.data_field_types = {} self.field_codings = {} self.bias = None self.xtx_inverse = [] self.mean_squared_error = None self.number_of_parameters = None self.number_of_samples = None self.resource_id, linear_regression = get_resource_dict( \ linear_regression, "linearregression", api=api) if 'object' in linear_regression and \ isinstance(linear_regression['object'], dict): linear_regression = linear_regression['object'] try: self.input_fields = linear_regression.get("input_fields", []) self.dataset_field_types = linear_regression.get( "dataset_field_types", {}) self.weight_field = linear_regression.get("weight_field") objective_field = linear_regression['objective_fields'] if \ linear_regression['objective_fields'] else \ linear_regression['objective_field'] except KeyError: raise ValueError("Failed to find the linear regression expected " "JSON structure. Check your arguments.") if 'linear_regression' in linear_regression and \ isinstance(linear_regression['linear_regression'], dict): status = get_status(linear_regression) if 'code' in status and status['code'] == FINISHED: linear_regression_info = linear_regression[ \ 'linear_regression'] fields = linear_regression_info.get('fields', {}) if not self.input_fields: self.input_fields = [ \ field_id for field_id, _ in sorted(self.fields.items(), key=lambda x: x[1].get("column_number"))] self.coeff_ids = self.input_fields[:] self.coefficients = linear_regression_info.get( \ 'coefficients', []) self.bias = linear_regression_info.get('bias', True) self.field_codings = linear_regression_info.get( \ 'field_codings', {}) self.number_of_parameters = linear_regression_info.get( \ "number_of_parameters") objective_id = extract_objective(objective_field) ModelFields.__init__( self, fields, objective_id=objective_id, terms=True, categories=True, numerics=True) self.field_codings = linear_regression_info.get( \ 'field_codings', {}) self.format_field_codings() for field_id in self.field_codings: if field_id not in fields and \ field_id in self.inverted_fields: self.field_codings.update( \ {self.inverted_fields[field_id]: \ self.field_codings[field_id]}) del self.field_codings[field_id] stats = linear_regression_info["stats"] if stats is not None and stats.get("xtx_inverse") is not None: self.xtx_inverse = stats["xtx_inverse"][:] self.mean_squared_error = stats["mean_squared_error"] self.number_of_samples = stats["number_of_samples"] # to be used in predictions self.t_crit = student_t.interval( \ CONFIDENCE, self.number_of_samples - self.number_of_parameters)[1] self.xtx_inverse = list( \ np.linalg.inv(np.array(self.xtx_inverse))) else: raise Exception("The linear regression isn't finished yet") else: raise Exception("Cannot create the LinearRegression instance." " Could not find the 'linear_regression' key" " in the resource:\n\n%s" % linear_regression)
def __init__(self, linear_regression, api=None): self.resource_id = None self.input_fields = [] self.term_forms = {} self.tag_clouds = {} self.term_analysis = {} self.items = {} self.item_analysis = {} self.categories = {} self.coefficients = [] self.data_field_types = {} self.field_codings = {} self.bias = None self.xtx_inverse = [] self.mean_squared_error = None self.number_of_parameters = None self.number_of_samples = None self.resource_id, linear_regression = get_resource_dict( \ linear_regression, "linearregression", api=api) if 'object' in linear_regression and \ isinstance(linear_regression['object'], dict): linear_regression = linear_regression['object'] try: self.input_fields = linear_regression.get("input_fields", []) self.dataset_field_types = linear_regression.get( "dataset_field_types", {}) self.weight_field = linear_regression.get("weight_field") objective_field = linear_regression['objective_fields'] if \ linear_regression['objective_fields'] else \ linear_regression['objective_field'] except KeyError: raise ValueError("Failed to find the linear regression expected " "JSON structure. Check your arguments.") if 'linear_regression' in linear_regression and \ isinstance(linear_regression['linear_regression'], dict): status = get_status(linear_regression) if 'code' in status and status['code'] == FINISHED: linear_regression_info = linear_regression[ \ 'linear_regression'] fields = linear_regression_info.get('fields', {}) if not self.input_fields: self.input_fields = [ \ field_id for field_id, _ in sorted(fields.items(), key=lambda x: x[1].get("column_number"))] self.coeff_ids = self.input_fields[:] self.coefficients = linear_regression_info.get( \ 'coefficients', []) self.bias = linear_regression_info.get('bias', True) self.field_codings = linear_regression_info.get( \ 'field_codings', {}) self.number_of_parameters = linear_regression_info.get( \ "number_of_parameters") missing_tokens = linear_regression_info.get("missing_tokens") objective_id = extract_objective(objective_field) ModelFields.__init__(self, fields, objective_id=objective_id, terms=True, categories=True, numerics=True, missing_tokens=missing_tokens) self.field_codings = linear_regression_info.get( \ 'field_codings', {}) self.format_field_codings() for field_id in self.field_codings: if field_id not in fields and \ field_id in self.inverted_fields: self.field_codings.update( \ {self.inverted_fields[field_id]: \ self.field_codings[field_id]}) del self.field_codings[field_id] stats = linear_regression_info["stats"] if STATS and stats is not None and \ stats.get("xtx_inverse") is not None: self.xtx_inverse = stats["xtx_inverse"][:] self.mean_squared_error = stats["mean_squared_error"] self.number_of_samples = stats["number_of_samples"] # to be used in predictions self.t_crit = student_t.interval( \ CONFIDENCE, self.number_of_samples - self.number_of_parameters)[1] self.xtx_inverse = list( \ np.linalg.inv(np.array(self.xtx_inverse))) else: raise Exception("The linear regression isn't finished yet") else: raise Exception("Cannot create the LinearRegression instance." " Could not find the 'linear_regression' key" " in the resource:\n\n%s" % linear_regression)
def IC_reg(repartition, dfX, Y, path_rslt, suffix_table): u = pd.DataFrame(data=1, columns=['constante'], index=dfX.index) dfX = pd.concat([u, dfX], axis=1) result = pd.DataFrame(columns=[ 'Y_real', 'Y_pred', 'error', 'cluster', 'min_IC', 'max_IC', 'largeur', '% largeur' ], index=Y.index) result_test = pd.DataFrame(columns=[ 'Y_real', 'Y_pred', 'error', 'cluster', 'min_IC', 'max_IC', 'largeur', '% largeur' ], index=Y.index) for j in repartition.keys(): #try: df_cluster = repartition[j] #intialisation et repartition train et test #recuperation des indexes des revenus renseignés index = df_cluster["revenu"].index[~df_cluster["revenu"].apply(np.isnan )] dfX_train, dfX_test, Y_train, Y_test = train_test_split( dfX.loc[index], Y[index], test_size=0.4, random_state=44) index_test = Y_test.index index_train = Y_train.index #except: # print("Le programme plante au cluster " + str(j)) #condition sur les clusters if len(df_cluster) >= 40 and df_cluster["revenu"].isnull().sum() / len( df_cluster) < 1: df_cluster_index = df_cluster.index result_test.loc[index_test, 'cluster'] = j result.loc[df_cluster_index, 'cluster'] = j #calcul des fonctions de prévision et erreur result.loc[df_cluster.index, 'Y_real'] = df_cluster.loc[df_cluster.index, 'revenu'] result_test.loc[index_test, 'Y_real'] = df_cluster.loc[index_test, 'revenu'].astype(int) result_test.loc[index_test, 'Y_pred'], result.loc[df_cluster_index, 'Y_pred'] = predi_reg( dfX, Y, index_train, index_test, df_cluster_index) result_test.loc[index_test, 'error'] = result_test.loc[ index_test, 'Y_pred'] - result_test.loc[index_test, 'Y_real'] result.loc[df_cluster_index, 'error'] = result.loc[ df_cluster_index, 'Y_pred'] - result.loc[df_cluster_index, 'Y_real'] result['lib_segment'] = suffix_table n = len( df_cluster.loc[df_cluster_index]) #apprentisage ou test !!!! #np.linalg.matrix_rank(dfX.loc[index_train]) df = n - (len(dfX.columns) - 1) - 1 quantile = t.interval(0.85, df)[1] MSE = result_test.loc[ index_test, 'Y_pred'].std() #MSE de result_test ou de result !!!! X = np.dot(dfX.loc[index_test].T, dfX.loc[index_test]) if linalg.det(X) != 0: X = linalg.inv(X) for i in range(0, len(result_test.loc[index_test, 'Y_pred'])): a = np.matrix(dfX.loc[index_test][i:i + 1]) u = (a * X) * (a.T) #h=result.loc[index_test,'Y_pred'][i:i+1].index if (1 + u) > 0: result_test.loc[result_test.loc[index_test, 'Y_pred'][i:i + 1].index, 'min_IC'] = result_test.loc[ index_test, 'Y_pred'][i:i + 1] - ( quantile * MSE * sqrt(1 + u)) result_test.loc[result_test.loc[index_test, 'Y_pred'][i:i + 1].index, 'max_IC'] = result_test.loc[ index_test, 'Y_pred'][i:i + 1] + ( quantile * MSE * sqrt(1 + u)) else: print('cluster ' + str(j) + ' contient valeur negative') result_test.loc[result_test.loc[index_test, 'Y_pred'][i:i + 1].index, 'min_IC'] = 0 result_test.loc[result_test.loc[index_test, 'Y_pred'][i:i + 1].index, 'max_IC'] = 0 result_test.loc[index_test, 'largeur'] = result_test.loc[ index_test, 'max_IC'].astype(int) - result_test.loc[ index_test, 'min_IC'].astype(int) result_test.loc[index_test, '% largeur'] = result_test.loc[ index_test, 'largeur'].astype(int) / result_test.loc[ index_test, 'Y_pred'].astype(int) else: result_test.loc[index_test, 'min_IC'] = 'Inv' result_test.loc[index_test, 'max_IC'] = 'Inv' n = len( df_cluster.loc[df_cluster_index]) #apprentisage ou test !!!! 
#np.linalg.matrix_rank(dfX.loc[df_cluster_index]) df = n - (len(dfX.columns) - 1) - 1 quantile = t.interval(0.85, df)[1] MSE = result.loc[ df_cluster_index, 'Y_pred'].std() #MSE de result_test ou de result !!!! X = np.matrix(dfX.loc[df_cluster_index]).T * np.matrix( dfX.loc[df_cluster_index]) if linalg.det(X) != 0: X = linalg.inv(X) for i in range(0, len(result.loc[df_cluster_index, 'Y_pred'])): a = np.matrix(dfX.loc[df_cluster_index][i:i + 1]) u = (a * X) * (a.T) #h=result.loc[index_test,'Y_pred'][i:i+1].index if (1 + u) > 0: result.loc[result.loc[df_cluster_index, 'Y_pred'][i:i + 1].index, 'min_IC'] = (result.loc[df_cluster_index, 'Y_pred'][i:i + 1] - (quantile * MSE * sqrt(1 + u))).astype(int) result.loc[result.loc[df_cluster_index, 'Y_pred'][i:i + 1].index, 'max_IC'] = (result.loc[df_cluster_index, 'Y_pred'][i:i + 1] + (quantile * MSE * sqrt(1 + u))).astype(int) else: print('cluster ' + str(j) + ' contient valeur negative') result.loc[result.loc[df_cluster_index, 'Y_pred'][i:i + 1].index, 'min_IC'] = 0 result.loc[result.loc[df_cluster_index, 'Y_pred'][i:i + 1].index, 'max_IC'] = 0 result.loc[df_cluster_index, 'largeur'] = result.loc[ df_cluster_index, 'max_IC'].astype(int) - result.loc[df_cluster_index, 'min_IC'].astype(int) result.loc[df_cluster_index, '% largeur'] = result.loc[ df_cluster_index, 'largeur'].astype(int) / result.loc[df_cluster_index, 'Y_pred'].astype(int) else: result.loc[df_cluster_index, 'min_IC'] = 'Inv' result.loc[df_cluster_index, 'max_IC'] = 'Inv' else: print("cluster " + str(j) + " ne remplit pas les conditions") result_test = result_test[pd.notnull(result_test['cluster'])] result = result[pd.notnull(result['cluster'])] dfX.drop(['constante'], axis=1, inplace=True) #path="/mnt/smb/TAMPON/Igor/RFR/data_rslt/" #result_test.to_excel(path_rslt+"result_test" + suffix_table + ".xlsx",encoding="utf-8", index=True) #result.to_excel(path_rslt+"result" + suffix_table + ".xlsx",encoding="utf-8", index=True) result.to_csv(path_rslt + "result" + suffix_table + ".csv", sep=";", encoding="utf-8", index=True) return result, result_test
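# IC_reg reads the one-sided quantile out of the two-sided interval with
# t.interval(0.85, df)[1]; that is the same number as t.ppf((1 + 0.85) / 2, df),
# which can be easier to read. A quick check, with an arbitrary df chosen only
# for illustration:
from scipy.stats import t

df = 37  # arbitrary degrees of freedom, illustration only
upper_from_interval = t.interval(0.85, df)[1]
upper_from_ppf = t.ppf((1 + 0.85) / 2, df)
assert abs(upper_from_interval - upper_from_ppf) < 1e-12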
def excel_table_byname():
    data = open_excel(file)               # open the Excel workbook
    table = data.sheet_by_name(by_name)   # obtain the sheet in the Excel file by name
    book = xlwt.Workbook()                # create the output Excel file
    sheet1 = book.add_sheet('sheet1')
    col0 = table.col_values(0)
    for i in range(0, len(col0)):
        sheet1.write(i, 0, str(col0[i]))
    book.save('ideal_range.xls')
    row0 = ['lmin', 'lmax', 'hmin', 'hmax']
    for j in range(0, len(row0)):
        sheet1.write(0, j + 1, str(row0[j]))
    book.save('ideal_range.xls')

    # read the raw data into a matrix
    set_matrix = []
    for row in range(1, table.nrows):
        _row = []
        for col in range(1, table.ncols - 8):
            _row.append(table.cell_value(row, col + 1))
        set_matrix.append(_row)
    set_matrix_array = np.array(set_matrix)

    # sample mean
    mean = table.col_values(-6)
    mean.pop(0)
    # sample standard deviation (the divisor used is N - ddof); computed in Excel
    stddev = table.col_values(-4)
    stddev.pop(0)
    [h, l] = set_matrix_array.shape

    # due to the small sample size, use the t distribution; CI is the confidence level (95%)
    t_bounds = t.interval(CI, l - 1, mean, stddev)
    t_bounds = np.vstack(t_bounds)
    t_bounds = t_bounds.transpose()
    [a, b] = t_bounds.shape
    for i in range(a):
        for j in range(b):
            if t_bounds[i][j] <= 0:
                t_bounds[i][j] = 1
            if math.isnan(t_bounds[i][j]):
                t_bounds[i][j] = 1
    for m in range(a):
        for n in range(b):
            sheet1.write(m + 1, n + 1, t_bounds[m, n])
    book.save('ideal_range.xls')

    large_bounds = t.interval(0.95, l - 1, mean, stddev)
    large_bounds = np.vstack(large_bounds)
    large_bounds = large_bounds.transpose()
    [c, d] = large_bounds.shape
    for i in range(c):
        for j in range(d):
            if large_bounds[i][j] <= 0:
                large_bounds[i][j] = 1
            if math.isnan(large_bounds[i][j]):
                large_bounds[i][j] = 1
    for m in range(c):
        for n in range(d):
            sheet1.write(m + 1, n + 3, large_bounds[m, n])
    book.save('ideal_range.xls')
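# t.interval broadcasts over array-valued loc and scale, which is what makes the
# per-row bounds above possible without an explicit loop: it returns one array of
# lower and one array of upper bounds, and the vstack/transpose pair turns them
# into an (N, 2) table. A standalone sketch of the same pattern with made-up
# numbers:
import numpy as np
from scipy.stats import t

means = np.array([4.2, 5.1, 3.8])   # per-row sample means (made-up values)
stds = np.array([0.3, 0.5, 0.2])    # per-row standard deviations (made-up values)
dof = 9                             # shared degrees of freedom

lower, upper = t.interval(0.95, dof, loc=means, scale=stds)
bounds = np.vstack((lower, upper)).transpose()   # shape (3, 2): one (low, high) pair per row
print(bounds)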
lsq.fit(X=df[["frequency"]], y=df["power"])
df["lsq-estimated"] = lsq.predict(df[["frequency"]])
print('Least Squares: P={}·n + {}, Rsq={}'.format(
    lsq.coef_, lsq.intercept_,
    lsq.score(X=df[["frequency"]], y=df["power"])))
print(mean_squared_error(df["power"], df["lsq-estimated"]))

# Get confidence intervals
conf_max = []
conf_min = []
frequencydummy = []
for freq in df["frequency"].unique():
    if freq != 1150 / 60 and freq != 1250 / 60:
        serie = df[df["frequency"] == freq]["power"]
        mu = statistics.mean(serie)
        sigma = numpy.std(serie)
        gl = len(serie)
        conf_int = t.interval(0.90, gl, loc=mu, scale=sigma)
        conf_min.append(conf_int[0])
        conf_max.append(conf_int[1])
        frequencydummy.append(freq)

conf = pd.DataFrame({"freq": frequencydummy, "low": conf_min, "high": conf_max})
conf["ts"] = ts.predict(conf[["freq"]])
conf["lsq"] = lsq.predict(conf[["freq"]])
conf.to_csv("powerfreqmodel.csv")

matplotlib.rcParams.update({'font.size': 16})

# Plot the confidence intervals
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=[13, 8], dpi=200)
plt.plot(df["frequency"], df["lsq-estimated"])
def get_1p_bounds(mean, std, dof):
    # two-sided 99% bounds of a t distribution with `dof` degrees of freedom,
    # centred on `mean` and scaled by `std`
    return t.interval(0.99, dof, mean, std)
import numpy as np
from scipy.stats import norm, t, sem, moment
from math import sqrt

# from the SAT Score Question
scores = [560, 610, 500, 470, 660, 640]
p = 0.90

# Elephant Trunk
# scores = [5.62, 6.07, 6.64, 5.91, 6.30, 6.55, 6.19, 5.48]
# p = 0.95

n = len(scores)
mu = np.mean(scores)
var = np.var(scores, ddof=1)  # ddof=1 for the unbiased (Bessel-corrected) estimate

bounds = t.interval(p, n - 1, loc=np.mean(scores), scale=sem(scores))
critical_t = t.ppf((1 + p) / 2, n - 1)
sigma_est = sqrt(var)
std_error = critical_t * sigma_est / sqrt(n)  # margin of error (half-width of the interval)

print('Mean =', mu)
print('Critical T =', critical_t)
print('Unbiased Sample Variance (Bessel corrected) =', var)
print('Standard Deviation (Estimation) =', sigma_est)
print('Standard Error =', std_error)
print('Lower Bound =', bounds[0])
print('Upper Bound =', bounds[1])
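# The same bounds can be reproduced by hand: the quantity printed as the standard
# error above is really the margin of error critical_t * s / sqrt(n), so
# mean +/- that margin matches what t.interval returns. A short check with the
# same SAT scores:
import numpy as np
from math import sqrt
from scipy.stats import t, sem

scores = [560, 610, 500, 470, 660, 640]
p = 0.90
n = len(scores)
mu = np.mean(scores)

lower, upper = t.interval(p, n - 1, loc=mu, scale=sem(scores))
margin = t.ppf((1 + p) / 2, n - 1) * sqrt(np.var(scores, ddof=1)) / sqrt(n)

assert abs(lower - (mu - margin)) < 1e-9
assert abs(upper - (mu + margin)) < 1e-9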
def geometric_mean(p_series, df, cols): # Alternatively we can use scipy.stats.lognorm to fit a distribution # and provide the parameters if (len(p_series) > 3) & (p_series.quantile(0.5) > 0): # result = gmean(p_series.to_numpy()+1)-1 module_logger.debug( f"Calculating confidence interval for" f"{df.loc[p_series.index[0],groupby_cols].values}") module_logger.debug(f"{p_series.values}") with np.errstate(all='raise'): try: data = p_series.to_numpy() except (ArithmeticError, ValueError, FloatingPointError): module_logger.debug("Problem with input data") return None try: log_data = np.log(data) except (ArithmeticError, ValueError, FloatingPointError): module_logger.debug("Problem with log function") return None try: mean = np.mean(log_data) except (ArithmeticError, ValueError, FloatingPointError): module_logger.debug("Problem with mean function") return None l = len(data) try: sd = np.std(log_data) / np.sqrt(l) sd2 = sd**2 except (ArithmeticError, ValueError, FloatingPointError): module_logger.debug("Problem with std function") return None try: pi1, pi2 = t.interval(alpha=0.90, df=l - 2, loc=mean, scale=sd) except (ArithmeticError, ValueError, FloatingPointError): module_logger.debug("Problem with t function") return None try: upper_interval = np.max([ mean + sd2 / 2 + pi2 * np.sqrt(sd2 / l + sd2**2 / (2 * (l - 1))), mean + sd2 / 2 - pi2 * np.sqrt(sd2 / l + sd2**2 / (2 * (l - 1))), ]) except: module_logger.debug("Problem with interval function") return None try: result = (np.exp(mean), 0, np.exp(upper_interval)) except (ArithmeticError, ValueError, FloatingPointError): module_logger.debug("Unable to calculate geometric_mean") return None if result is not None: return result else: module_logger.debug( f"Problem generating uncertainty parameters \n" f"{df.loc[p_series.index[0],groupby_cols].values}\n" f"{p_series.values}" f"{p_series.values+1}") return None else: return None
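# geometric_mean works in log space: it fits a t interval to the log of the data
# and exponentiates a variance-corrected upper bound back to the original scale.
# Stripped of the error handling and the lognormal correction terms, the core
# idea looks roughly like the sketch below; the 90% level mirrors the code above,
# everything else (function name, ddof choice) is a simplifying assumption.
import numpy as np
from scipy.stats import t

def geometric_mean_ci_sketch(data, confidence=0.90):
    """Simplified sketch: CI of the geometric mean via the log-transformed data."""
    log_data = np.log(np.asarray(data, dtype=float))
    n = len(log_data)
    mean = np.mean(log_data)
    se = np.std(log_data, ddof=1) / np.sqrt(n)
    low, high = t.interval(confidence, n - 1, loc=mean, scale=se)
    return np.exp(mean), (np.exp(low), np.exp(high))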
def knn_ci(ratings):
    # input will be Train_data_matrix[neighborset]
    std_dev = np.std(ratings)
    n = len(ratings)
    multi = t.interval(alpha=0.975, df=((n - 2) / 2))[1]
    return multi * std_dev / math.sqrt(n)
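# A caveat on the alpha= keyword used above: recent SciPy releases renamed the
# first parameter of interval() (alpha became confidence), so keyword calls like
# this may warn or fail depending on the installed version. Passing the level
# positionally sidesteps the difference; n below is illustrative, not taken from
# the snippet.
from scipy.stats import t

n = 25  # illustrative neighbourhood size
multi = t.interval(0.975, (n - 2) / 2)[1]   # same value as the alpha= keyword form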
DI_chr = DI_chr.To(id, Do=_.Sum()) #efor DI_chr = DI_chr.Get(_.seqname, *[ x[0] for x in enumerate(DI_chr.Names[1:-1], 1) ]).ReplaceMissing() conditions = [ x for x in enumerate(DI_chr.Names[1:], 1)] condition_pairs = zip(conditions[:len(conditions)/2], conditions[len(conditions)/2:]) stats = []; for ((a_id_rep1, conda_rep1), (b_id_rep1, condb_rep1)), ((a_id_rep2, conda_rep2), (b_id_rep2, condb_rep2)) in zip(condition_pairs[0::2], condition_pairs[1::2]): cond = conda_rep1.split('|')[1] cond_stats = DI_chr.Get(_.seqname, _.Get(a_id_rep1).Cast(float) / _.Get(b_id_rep1).Cast(float), _.Get(a_id_rep2).Cast(float) / _.Get(b_id_rep2).Cast(float) ) / ('seqname', 'r1', 'r2'); cond_stats = cond_stats.Get(_.seqname, _.r1, _.r2, ((_.r1 + _.r2) / 2) / 'mean' ) cond_stats = cond_stats.Get(_.seqname, _.r1, _.r2, _.mean, ((_.r1 - _.mean) * (_.r1 - _.mean) + (_.r2 - _.mean) * (_.r2 - _.mean) / 2) / 'var') cond_stats = cond_stats.Get(_.seqname, _.r1, _.r2, _.mean, _.var.Each(lambda x: np.sqrt(x)).Cast(float) / 'sd', _.var ); cond_stats = cond_stats.Get(_.seqname, _.r1, _.r2, _.mean, _.sd, _.var, (_.sd / np.sqrt(2)) * max(t.interval(0.90, 1)) / 'confidence' ).Copy() stats.append(cond_stats.Get(_.seqname.Each(lambda x: cond).Cast(str) / 'cond', *cond_stats.Names).Copy()); #efor allstats = stats[0]; for s in stats[1:]: allstats = allstats | Stack | s; #efor # Add the data from ALL the chromosomes condlibratios = Read('%s/deseq_condlibratios.tsv' % output_dir).Detect() / ('cond', 'r1', 'r2', 'mean', 'sd', 'var'); condlibratios = condlibratios.Get(_.cond, _.r1, _.r2, _.mean, _.sd, _.var, (_.sd / np.sqrt(2)) * max(t.interval(0.90, 1)) / 'confidence') allstats = condlibratios.Get(_.cond, _.cond.Each(lambda x: 'all').Cast(str) / 'seqname', _.r1, _.r2, _.mean, _.sd, _.var, _.confidence) | Stack | allstats minval, maxval = allstats.Get(_.mean.Min(), _.mean.Max())()
# *******************************************************************************
if __name__ == '__main__':
    random.seed(RANDOM_SEED)

    # For a fixed pair of parameters we will repeat the simulation many times
    mu = 0.05            # 20 s per customer, thus 0.05 customer/s
    lambd = 1.5 * mu

    # ***************************************************************************
    # First case: 1 queue, 3 servers
    # ***************************************************************************
    y1 = simulate(1, 3)
    print(t.interval(0.99, SAMPLES - 1, np.mean(y1), sem(y1)))

    # ***************************************************************************
    # Second case: 3 queues, 1 server
    # ***************************************************************************
    y2 = simulate(3, 1)
    print(t.interval(0.99, SAMPLES - 1, np.mean(y2), sem(y2)))

    f1, ax1 = pyplot.subplots()
    ax1.plot(y1, 'ro', color='green')
    ax1.plot(y2, 'ro', color='red')
    ax1.set_xlabel("Simulation round")
    ax1.set_ylabel("E[T] - response time")
    ax1.grid(b=True, which='major', color='#CCCCCC', linestyle='-')
    pyplot.show()
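# The pattern in both cases is the same: repeat the simulation SAMPLES times,
# then report the mean response time with a t-based confidence interval built
# from the standard error of that mean. A self-contained sketch of the pattern
# with a dummy stand-in for simulate(), which is defined elsewhere in that
# script:
import numpy as np
from scipy.stats import t, sem

SAMPLES = 30
rng = np.random.default_rng(0)

def simulate_stub():
    # stand-in for the real simulate(queues, servers); returns one E[T] estimate
    return rng.exponential(scale=20.0)

y = np.array([simulate_stub() for _ in range(SAMPLES)])
ci_low, ci_high = t.interval(0.99, SAMPLES - 1, loc=np.mean(y), scale=sem(y))
print(f"E[T] = {np.mean(y):.2f} s, 99% CI = ({ci_low:.2f}, {ci_high:.2f})")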
def uncertainty(db, mean_gen, total_gen, total_facility_considered): # Troy Method # Creating copy of database by substitution the NA emissions with zero # db1 = db.fillna(value = 0) # Removing all rows here emissions are not reported for second dataframe # db2 = db.dropna() # frames = [db1,db2] # Here we doubled up the database by combining two databases together # data_1 = pd.concat(frames,axis = 0) data_1 = db df2 = pd.DataFrame([[0, 0]], columns=['Electricity', 'FlowAmount']) for i in range(len(data_1), total_facility_considered): data = data_1.append(df2, ignore_index=True) data_1 = data data = data_1 mean = np.mean(data.iloc[:, 1]) l, b = data.shape sd = np.std(data.iloc[:, 1]) / np.sqrt(l) # mean_gen = np.mean(data.iloc[:,0]) # obtaining the emissions factor from the weight based method ef = compilation(db, total_gen) # Endpoints of the range that contains alpha percent of the distribution pi1, pi2 = t.interval(alpha=0.90, df=l - 2, loc=mean, scale=sd) # Converting prediction interval to emission factors pi2 = pi2 / mean_gen pi1 = pi1 / mean_gen pi3 = (pi2 - ef) / ef x = var('x') if math.isnan(pi3) == True: return None, None elif math.isnan(pi3) == False: # This method will not work with the interval limits are more than 280% of the mean. if pi3 < 2.8: # sd1,sd2 = solve(0.5*x*x -(1.16308*np.sqrt(2))*x + (np.log(1+pi3)),x) a = 0.5 b = -(1.16308 * np.sqrt(2)) c = np.log(1 + pi3) sd1 = (-b + np.sqrt(b**2 - (4 * a * c))) / (2 * a) sd2 = (-b - np.sqrt(b**2 - (4 * a * c))) / (2 * a) else: # This is a wrong mathematical statement. However, we have to use it if something fails. sd1, sd2 = solve( 0.5 * x * x - (1.36 * np.sqrt(2)) * x + (np.log(1 + pi3)), x) # if type(sd1) != float or type(sd2) != float: # return 0,0 # always choose lower standard deviation from solving the square root equation. if sd1 < sd2: log_mean = np.log(ef) - 0.5 * (sd1**2) return round(log_mean, 12), round(sd1, 12) else: log_mean = np.log(ef) - 0.5 * (sd2**2) return round(log_mean, 12), round(sd2, 12)
for i in range(0, tam):
    os.system("./ejecutar 1 3 1 >> datos.txt")
    proceso1 = subprocess.Popen("./" + str(sys.argv[2]) + " 1",
                                stdout=subprocess.PIPE, shell=True)
    (out, err) = proceso1.communicate()
    print(out)
    valor = out.decode("utf-8")
    valor = float(valor.split(":")[0])
    datos_torm[i] = valor
    valor = os.popen('./' + sys.argv[2] + ' 1 3 0 1').read()
    valor = float(valor.split(":")[0])
    datos_vel[i] = valor
    print("Pass", i)"""  # end of a block commented out with a triple-quoted string in the source

datos_vel = np.genfromtxt("datosVel.txt", dtype=float)   # np.float was removed in recent NumPy
datos_torm = np.genfromtxt("datosTor.txt", dtype=float)
datos_vel = datos_vel[:int(sys.argv[1])]
datos_torm = datos_torm[:int(sys.argv[1])]

tam = np.size(datos_vel)
diferencia = datos_vel - datos_torm
media = np.mean(diferencia)
varianza = np.var(diferencia)

valores_student = t.interval(0.95, tam - 1)
# use a loop variable that does not shadow scipy.stats.t
intervalo = [media + t_val * np.sqrt(varianza / tam) for t_val in valores_student]

print("Interval =", intervalo, " mean =", media, " variance =", varianza)
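# Scaling the unit t quantiles by hand, as above, gives the same interval as
# passing the mean and the standard error straight to t.interval via loc and
# scale. A self-contained check with made-up paired differences (not the
# script's data):
import numpy as np
from scipy.stats import t

diferencia = np.array([0.8, -0.3, 1.1, 0.4, -0.2, 0.9])   # illustrative values
tam = diferencia.size
media = np.mean(diferencia)
varianza = np.var(diferencia)

q_low, q_high = t.interval(0.95, tam - 1)
manual = [media + q * np.sqrt(varianza / tam) for q in (q_low, q_high)]
direct = t.interval(0.95, tam - 1, loc=media, scale=np.sqrt(varianza / tam))
assert np.allclose(manual, direct)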
print("ドリフト無しランダムウォーク p値:",p) print("ドリフト付きランダムウォーク p値:",p1) # In[6]: from scipy.stats import t resid=arma_res.resid.iloc[1:] m=resid.mean() v=resid.std() resid_max=pd.Series.rolling(arma_res.resid,window=250).mean().max() resid_min=pd.Series.rolling(arma_res.resid,window=250).mean().min() print("平均: %2.5f"%m,"標準偏差: %2.4f"%v) print("250日平均の最大値: %2.5f"%resid_max,"250日平均の最小値: %2.5f"%resid_min) print("250日平均の95%の信頼区間: ",(t.interval(alpha=0.95, df=250, loc=0, scale=v))) # In[7]: pd.Series.rolling(arma_res.resid.iloc[1:],250).mean().plot(figsize=(6,4),color='hotpink') plt.ylabel('$\hat{z_t}$') # In[8]: from scipy.stats import chi2 resid=arma_res.resid.iloc[1:] m=resid.mean()
def cal_regression_power( t_evap, t_cond, uncer_t_evap, uncer_t_cond, rel_uncer_power, abs_uncer_power, para, full_output=False, dist_output=False, ): """ Estimate the compressor power and its uncertainty based on evaporating and condensing temperature Parameters: =========== t_evap: float Evaporating temperature in F t_cond: float Condensing temperature in F uncer_t_evap: float Uncertainty of evaporating temperature in F uncer_t_cond: float Uncertainty of condensing temperature in F rel_uncer_power: float Relative uncertainty of measured power consumption in % abs_uncer_power: float Absolute uncertainty of measured power consumption in W para: MAP_PARA() object Object containing the coefficients full_output: boolean Whether to output all other uncertainties. Default false. dist_output: boolean Whether to output all components of uncertainty from training data in a numpy array Returns: =========== power: float Estimated power in W uncer: float Uncertainty of the estimation in W uncer_input: float Uncertainty from inputs in W. Only output when full_output=True uncer_output: float Uncertainty from output in W. Only output when full_output=True uncer_train: float Uncertainty from training data in W. Only output when full_output=True uncer_dev: float Uncertainty from deviation in W. Only output when full_output=True uncer_cov: float Uncertainty from covariance in W. Only output when full_output=True uncer_train_dist: float Components of uncertainty from training datain W. Only output when dist_output=True """ # form x vector coeff = para.get_coeff() x = ( np.matrix( [ 1.0, t_evap, t_cond, t_evap ** 2, t_evap * t_cond, t_cond ** 2, t_evap ** 3, t_evap ** 2 * t_cond, t_evap * t_cond ** 2, t_cond ** 3, ] ) ).transpose() dyestdet = ( np.matrix( [0.0, 1.0, 0.0, 2.0 * t_evap, t_cond, 0.0, 3.0 * t_evap ** 2, 2.0 * t_evap * t_cond, t_cond ** 2, 0.0] ) ) * coeff dyestdct = ( np.matrix( [0.0, 0.0, 1.0, 0.0, t_evap, 2.0 * t_cond, 0.0, t_evap ** 2, 2.0 * t_cond * t_evap, 3.0 * t_cond ** 2] ) ) * coeff # estimate power power = (x.transpose() * coeff)[0, 0] # estimate uncer_input uncer_input = sqrt(((dyestdet * uncer_t_evap).sum()) ** 2 + ((dyestdct * uncer_t_cond).sum()) ** 2) # estimate uncer_output uncer_output = sqrt(abs_uncer_power ** 2 + (rel_uncer_power * power) ** 2) # estimate uncer_train train_x_entry = para.get_dBdXdeltaX() * x train_y_entry = para.get_dBdydeltay() * x uncer_train = sqrt( (np.multiply(train_x_entry, train_x_entry)).sum() + (np.multiply(train_y_entry, train_y_entry)).sum() ) if dist_output: uncer_train_comp = np.array( [ np.sqrt(qq) for qq in ( np.multiply(train_x_entry, train_x_entry).tolist() + np.multiply(train_y_entry, train_y_entry).tolist() ) ] ) # estimate uncer_dev m = len(para.get_y()) t_stat = t.interval(0.95, m - 10)[1] uncer_dev = t_stat * para.get_sigma() # estimate uncer_cov uncer_cov = t_stat * sqrt(x.transpose() * para.get_X_inverse_prod() * x) * para.get_sigma() # estimate uncer uncer = sqrt(uncer_input ** 2 + uncer_output ** 2 + uncer_train ** 2 + uncer_dev ** 2 + uncer_cov ** 2) if full_output: if dist_output: return power, uncer, uncer_input, uncer_output, uncer_train, uncer_dev, uncer_cov, uncer_train_comp else: return power, uncer, uncer_input, uncer_output, uncer_train, uncer_dev, uncer_cov else: if dist_output: return power, uncer, uncer_train_comp else: return power, uncer
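# uncer_dev and uncer_cov share the same multiplier: the upper end of the
# two-sided 95% t interval for m - 10 degrees of freedom, ten being the number of
# coefficients in the cubic map of (t_evap, t_cond). A tiny sketch of that
# multiplier in isolation, with an illustrative training-set size:
from scipy.stats import t

m = 60        # illustrative number of training points
n_coeff = 10  # cubic polynomial in two variables has 10 coefficients
t_stat = t.interval(0.95, m - n_coeff)[1]
print(t_stat)  # factor applied to sigma and to the covariance term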
def DeepDiscovery(Xval, Yval, classnames, n, modeldir=None, mname=None, model=None, net=None, arch=None): # LOADS IN MODEL if modeldir is not None: tf.reset_default_graph() if net == '3FCN': print('3-Hidden Layer Fully Connected Network Selected') network = input_data( shape=[None, Xval.shape[1], Xval.shape[2], Xval.shape[3]]) network = fully_connected(network, 2000, activation='tanh') network = dropout(network, 0.5) network = fully_connected(network, 2000, activation='tanh') network = dropout(network, 0.5) network = fully_connected(network, 2000, activation='tanh') network = dropout(network, 0.5) network = fully_connected(network, Yval.shape[1], activation='softmax') network = regression(network, optimizer='momentum', loss='categorical_crossentropy', learning_rate=0.001) if net == '5FCN': print('5-Hidden Layer Fully Connected Network Selected') network = input_data( shape=[None, Xval.shape[1], Xval.shape[2], Xval.shape[3]]) network = fully_connected(network, 2000, activation='tanh') network = dropout(network, 0.5) network = fully_connected(network, 2000, activation='tanh') network = dropout(network, 0.5) network = fully_connected(network, 2000, activation='tanh') network = dropout(network, 0.5) network = fully_connected(network, 2000, activation='tanh') network = dropout(network, 0.5) network = fully_connected(network, 2000, activation='tanh') network = dropout(network, 0.5) network = fully_connected(network, Yval.shape[1], activation='softmax') network = regression(network, optimizer='momentum', loss='categorical_crossentropy', learning_rate=0.001) if net == 'AlexNet': print('AlexNet selected') network = input_data( shape=[None, Xval.shape[1], Xval.shape[2], Xval.shape[3]]) network = conv_2d(network, 96, 11, strides=4, activation='relu') network = max_pool_2d(network, 3, strides=2) network = local_response_normalization(network) network = conv_2d(network, 256, 5, activation='relu') network = max_pool_2d(network, 3, strides=2) network = local_response_normalization(network) network = conv_2d(network, 384, 3, activation='relu') network = conv_2d(network, 384, 3, activation='relu') network = conv_2d(network, 256, 3, activation='relu') network = max_pool_2d(network, 3, strides=2) network = local_response_normalization(network) network = fully_connected(network, 4096, activation='tanh') network = dropout(network, 0.5) network = fully_connected(network, 4096, activation='tanh') network = dropout(network, 0.5) network = fully_connected(network, Yval.shape[1], activation='softmax') network = regression(network, optimizer='momentum', loss='categorical_crossentropy', learning_rate=0.001) if arch is not None: print('Different Architecture Provided') network = arch if modeldir is not None: os.chdir(modeldir) model = tflearn.DNN(network) model.load(model_file=mname) # SORTS THE INPUTS BY CLASS k = Yval.shape[1] # determining the number of classes, k nkarr = np.sum(Yval, axis=0) # checking to make sure the test set is balanced N = Yval.shape[0] # number of inputs if np.min(nkarr) == np.max(nkarr): nk = nkarr[0] nk = int( nk) # if it balanced, the rest of the if statement will continue tick = np.zeros([ k ]) # creates a counter, next input associated with class is free Xsort = np.zeros(Xval.shape) # for sorted data Ysort = np.zeros(Yval.shape) # for sorted data for i in range(0, N): c = np.argmax( Yval[i, ...]) # checks which class the input in Xval belongs to Xsort[int(c * nk + tick[c]):int(c * nk + tick[c] + 1), ...] = Xval[i:i + 1, ...] 
Ysort[int(c * nk + tick[c]):int(c * nk + tick[c] + 1), c] = 1 tick[c] = tick[c] + 1 # FORMATS Xval while len( Xsort.shape ) < 4: # checks to see if the 4th dimension was already added Xsort = Xsort[ ..., None] # if it wasn't, the 4th dimension is added, if it was nothing happens # CREATES ARRAYS FOR PREDICTIONS N = Ysort.shape[0] # number of inputs Lhat = np.zeros([N, k]) # creates an empty matrix # STORES PREDICTIONS for i in range(0, N): q = model.predict(Xsort[ i:(i + 1), ...]) # row vector of the confidences outputted by the model Lhat[i:( i + 1 ), :] = q # assigns confidence values to the correct row in Lhat # CALCULATING THE Lressum Lres = Ysort - Lhat # calculates the raw residuals (labels - confidences) Lressum = np.std(Lres, axis=1) # std devs across the rows Lressum = Lressum[..., None] # FORMATTING Lressum FOR PLOTTING ids = np.zeros([N, 1]) # creating an id column to attach to Lressum for i in range(0, N): # stupid for loop because it's not R ids[i, 0] = i Lressumid = np.append(ids, Lressum, axis=1) Ldf = pd.DataFrame( Lressumid) # converting to Pandas because we're tired of Python Ldf = Ldf.rename(index=str, columns={0: "id", 1: "res"}) # PLOTTING plt.figure(1) Ldf.plot.scatter(x='id', y='res', title='RMSE Across All Classes, Grouped by Class') plt.show() plt.figure(2) for j in range(0, k): Ldfa = Ldf[nk * j:nk * (j + 1)] Ldfa.plot.scatter(x='id', y='res', title='RMSE Across Class ' + classnames[j]) plt.show() nsamp = int(N / n) LressumSamp = np.zeros([nsamp, n, 1]) Minima = np.zeros([nsamp]) for i in range(0, nsamp): LressumSamp[i, ...] = Lressum[i * n:(i + 1) * n, :] Minima[i] = np.where( LressumSamp[i] == np.min(LressumSamp[i]))[0][0] # Calculating the Confidence Intervals of the Position of the Lowest Error Input per Sample tval1 = t.interval(.90, n - 1)[1] lowlim1 = np.mean(Minima) - tval1 * np.std(Minima) upplim1 = np.mean(Minima) + tval1 * np.std(Minima) tval2 = t.interval(.95, n - 1)[1] lowlim2 = np.mean(Minima) - tval2 * np.std(Minima) upplim2 = np.mean(Minima) + tval2 * np.std(Minima) tval3 = t.interval(.99, n - 1)[1] lowlim3 = np.mean(Minima) - tval3 * np.std(Minima) upplim3 = np.mean(Minima) + tval3 * np.std(Minima) # Finding Average Error per Position in Sample AveErr = np.zeros([n]) for i in range(0, n): AveErr[i] = np.mean(LressumSamp[:, i]) plt.figure(3) plt.scatter(ids[0:n], AveErr) plt.axvspan(lowlim1, upplim1, alpha=0.05, color='salmon') plt.axvspan(lowlim2, upplim2, alpha=0.1, color='salmon') plt.axvspan(lowlim3, upplim3, alpha=0.18, color='salmon') plt.suptitle('Average RMSE of Input Number', fontsize=12) plt.title('Confidence Intervals of Signal Position in Red') plt.xlabel('Input Number') plt.ylabel('Average RMSE of Input') print('90% Confidence Interval Limits: (', lowlim1, ',', upplim1, ')') print('95% Confidence Interval Limits: (', lowlim2, ',', upplim2, ')') print('99% Confidence Interval Limits: (', lowlim3, ',', upplim3, ')') plt.show() # CONFIDENCE MATRIX label = tf.argmax(Ysort, axis=1) # converts true labels to column vector predict = tf.argmax( Lhat, axis=1 ) # predicts binary labels from the confidences, stores vector confusion_matrix = tf.confusion_matrix(label, predict, k) with tf.Session() as sess: cm = confusion_matrix.eval() # creates the confusion matrix pdcm = pd.DataFrame( cm) # converts the confusion matrix to pandas for aesthetics p = ['Predicted'] * k # this is to make pretty row and column names list = [] for i in range(0, k): list.append(p[i] + ' ' + classnames[i]) pclassname = list a = ['Actual'] * k list = [] for i in range(0, 
k): list.append(a[i] + ' ' + classnames[i]) aclassname = list pdcm.columns = pclassname # renaming columns pdcm.index = aclassname # renaming rows confusion_mat = pdcm return (confusion_mat) else: print("Need Balanced Testing Set")
def ts_dispersion_plot(self, **kwargs): ''' Plots disperison timeseries in matplotlib plot Parameters ---------- channel: string Channel options: dict Options including data processing prior to plot. Defaults in config._plot_def_opt formatting: dict Formatting dict. Defaults in config._ts_plot_def_fmt Returns ------- Matplotlib figure ''' if 'channel' not in kwargs: std_out('Needs at least one channel to plot') return None else: channel = kwargs['channel'] if 'options' not in kwargs: std_out('Using default options') options = config._plot_def_opt else: options = dict_fmerge(config._plot_def_opt, kwargs['options']) if 'formatting' not in kwargs: std_out('Using default formatting') formatting = config._ts_plot_def_fmt['mpl'] else: formatting = dict_fmerge(config._ts_plot_def_fmt['mpl'], kwargs['formatting']) if self.dispersion_df is None: std_out('Perform dispersion analysis first!', 'ERROR') return None if self.common_channels == []: self.get_common_channels() if channel not in self.common_channels: std_out(f'Channel {channel} not in common_channels') return None if channel in config._dispersion['ignore_channels']: std_out(f'Channel {channel} ignored per config') return None if len(self.devices) > config._dispersion['nt_threshold']: distribution = 'normal' std_out('Using normal distribution') std_out( f"Using limit for sigma confidence: {config._dispersion['limit_confidence_sigma']}" ) else: distribution = 't-student' std_out(f'Using t-student distribution.') # Size sanity check if formatting['width'] > 50: std_out('Reducing width to 12') formatting['width'] = 12 if formatting['height'] > 50: std_out('Reducing height to 10') formatting['height'] = 10 # Make subplot figure, (ax_tbr, ax_ok) = plt.subplots(nrows=2, sharex=formatting['sharex'], figsize=(formatting['width'], formatting['height'])) # cmap = plt.cm.Reds norm = matplotlib.colors.Normalize( vmin=0, vmax=config._dispersion['limit_errors'] / 2) ch_index = self.common_channels.index(channel) + 1 # Style if formatting['style'] is not None: style.use(formatting['style']) else: style.use(config._plot_style) # Font size if formatting['fontsize'] is not None: rcParams.update({'font.size': formatting['fontsize']}) total_number = len(self.common_channels) dispersion_avg = self._dispersion_summary[channel] if distribution == 'normal': limit_confidence = config._dispersion['limit_confidence_sigma'] # Calculate upper and lower bounds if (config._dispersion['instantatenous_dispersion']): # For sensors with high variability in the measurements, it's better to use this upper_bound = self.dispersion_df[channel + '_AVG']\ + limit_confidence * self.dispersion_df[channel + '_STD'] lower_bound = self.dispersion_df[channel + '_AVG']\ - abs(limit_confidence * self.dispersion_df[channel + '_STD']) else: upper_bound = self.dispersion_df[channel + '_AVG']\ + limit_confidence * dispersion_avg lower_bound = self.dispersion_df[channel + '_AVG']\ - abs(limit_confidence * dispersion_avg) else: limit_confidence = t.interval( config._dispersion['t_confidence_level'] / 100.0, len(self.devices), loc=self.dispersion_df[channel + '_AVG'], scale=dispersion_avg) upper_bound = limit_confidence[1] lower_bound = limit_confidence[0] for device in self.devices: ncol = channel + '-' + device if ncol in self.dispersion_df.columns: # Count how many times we go above the upper bound or below the lower one count_problems_up = self.dispersion_df[ncol] > upper_bound count_problems_down = self.dispersion_df[ncol] < lower_bound # Count them count_problems = [1 if (count_problems_up[i] 
or count_problems_down[i])\ else 0 for i in range(len(count_problems_up))] # Add the trace in either number_errors = np.sum(count_problems) max_number_errors = len(count_problems) if number_errors / max_number_errors > config._dispersion[ 'limit_errors'] / 100: std_out( f"Device {device} out of {config._dispersion['limit_errors']}% limit\ - {np.round(number_errors/max_number_errors*100, 1)}% out", 'WARNING') alpha = 1 ax_tbr.plot(self.dispersion_df.index, self.dispersion_df[ncol], color='r', label=device, alpha=alpha) else: alpha = 1 color = 'g' ax_ok.plot(self.dispersion_df.index, self.dispersion_df[ncol], color=color, label=device, alpha=alpha) # Add upper and low bound bound to subplot 1 ax_tbr.plot(self.dispersion_df.index, self.dispersion_df[channel + '_AVG'], 'b', label='Average', alpha=0.6) ax_tbr.plot(self.dispersion_df.index, upper_bound, 'k', label='Upper-Bound', alpha=0.6) ax_tbr.plot(self.dispersion_df.index, lower_bound, 'k', label='Lower-Bound', alpha=0.6) # Format the legend lgd1 = ax_tbr.legend(bbox_to_anchor=(1, 0.5), fancybox=True, loc='center left', ncol=5) ax_tbr.grid(True) ax_tbr.set_ylabel(channel + ' TBR') ax_tbr.set_xlabel('Time') # Add upper and low bound bound to subplot 2 ax_ok.plot(self.dispersion_df.index, self.dispersion_df[channel + '_AVG'], 'b', label='Average', alpha=0.6) ax_ok.plot(self.dispersion_df.index, upper_bound, 'k', label='Upper-Bound', alpha=0.6) ax_ok.plot(self.dispersion_df.index, lower_bound, 'k', label='Lower-Bound', alpha=0.6) # Format the legend ax_ok.legend(bbox_to_anchor=(1, 0.5), fancybox=True, loc='center left', ncol=5) lgd2 = ax_ok.legend(bbox_to_anchor=(1, 0.5), fancybox=True, loc='center left', ncol=5) ax_ok.grid(True) ax_ok.set_ylabel(channel + ' OK') ax_ok.set_xlabel('Time') figure.suptitle(f'({ch_index}/{total_number}) - {channel}', fontsize=formatting['title_fontsize']) plt.subplots_adjust(top=formatting['suptitle_factor']) if options['show']: plt.show() return figure