def fit(self, df_agg):
    df = deepcopy(df_agg)
    for model in self.models:
        df['error'] = (df[model] - df[self.target_col]) / df[model]
        grouped = df.groupby(self.region_col).agg({
            'error': ['mean', 'std']
        }).reset_index()
        grouped.columns = [self.region_col, 'mean', 'std']
        self.mean[model] = {
            grouped[self.region_col].iloc[i]: grouped['mean'].iloc[i]
            for i in range(grouped.shape[0])
        }
        self.std[model] = {
            grouped[self.region_col].iloc[i]: grouped['std'].iloc[i]
            for i in range(grouped.shape[0])
        }
        self.ci[model] = {
            grouped[self.region_col].iloc[i]: norm.interval(
                self.ci_range,
                loc=grouped['mean'].iloc[i],
                scale=grouped['std'].iloc[i])
            for i in range(grouped.shape[0])
        }
def predict_dist(self, X, interval=0.95, *args, **kwargs):
    # Expectation and variance in latent space
    Ey_t, Vy_t, ql, qu = super().predict_dist(X, interval)

    # Save computation if identity transform
    if type(self.target_transform) is transforms.Identity:
        return Ey_t, Vy_t, ql, qu

    # Save computation if standardise transform
    elif type(self.target_transform) is transforms.Standardise:
        Ey = self.target_transform.itransform(Ey_t)
        Vy = Vy_t * self.target_transform.ystd ** 2
        ql, qu = norm.interval(interval, loc=Ey, scale=np.sqrt(Vy))
        return Ey, Vy, ql, qu

    # All other transforms require quadrature
    Ey = np.empty_like(Ey_t)
    Vy = np.empty_like(Vy_t)

    # Use fixed-order quadrature to transform prob. estimates
    for i, (Eyi, Vyi) in enumerate(zip(Ey_t, Vy_t)):
        # Establish bounds
        Syi = np.sqrt(Vyi)
        a, b = Eyi - 3 * Syi, Eyi + 3 * Syi  # approx 99% bounds

        # Quadrature
        Ey[i], _ = fixed_quad(self.__expec_int, a, b, n=QUADORDER,
                              args=(Eyi, Syi))
        Vy[i], _ = fixed_quad(self.__var_int, a, b, n=QUADORDER,
                              args=(Ey[i], Eyi, Syi))

    ql, qu = norm.interval(interval, loc=Ey, scale=np.sqrt(Vy))
    return Ey, Vy, ql, qu
def Tukey_outliers(set_of_means, FDR=0.005, supporting_interval=0.5, verbose=False):
    """
    Performs a Tukey quartile test for outliers from a normal distribution
    with a defined false discovery rate.

    :param set_of_means:
    :param FDR:
    :return:
    """
    # false discovery rate v.s. expected falses v.s. power
    q1_q3 = norm.interval(supporting_interval)
    FDR_q1_q3 = norm.interval(1 - FDR)
    # TODO: this is not necessary: we can perfectly well fit it with proper params to FDR
    multiplier = (FDR_q1_q3[1] - q1_q3[1]) / (q1_q3[1] - q1_q3[0])
    l_means = len(set_of_means)

    q1 = np.percentile(set_of_means, 50 * (1 - supporting_interval))
    q3 = np.percentile(set_of_means, 50 * (1 + supporting_interval))
    high_fence = q3 + multiplier * (q3 - q1)
    low_fence = q1 - multiplier * (q3 - q1)

    if verbose:
        print('FDR:', FDR)
        print('q1_q3', q1_q3)
        print('FDRq1_q3', FDR_q1_q3)
        print('q1, q3', q1, q3)
        print('fences', high_fence, low_fence)

    if verbose:
        print("FDR: %s %%, expected outliers: %s, outlier 5%% confidence interval: %s" %
              (FDR * 100, FDR * l_means, poisson.interval(0.95, FDR * l_means)))

    ho = (set_of_means < low_fence).nonzero()[0]
    lo = (set_of_means > high_fence).nonzero()[0]

    return lo, ho
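# Minimal usage sketch for Tukey_outliers (assumes numpy and the scipy.stats
# distributions are imported as the snippet above expects); the sample below
# plants two obvious outliers in otherwise standard-normal data.
import numpy as np

means = np.concatenate([np.random.normal(0, 1, 1000), [9.0, -9.0]])
above_idx, below_idx = Tukey_outliers(means, FDR=0.005)
print(above_idx, below_idx)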
def robustness(graphclusters, permutations):
    """
    Compares vectors of cluster assignments to estimate cluster-wise robustness
    and node-wise robustness. These are returned as dictionaries.

    Inspired by reliability scores as proposed by:
    Frantz, T. L., & Carley, K. M. (2017).
    Reporting a network's most-central actor with a confidence level.
    Computational and Mathematical Organization Theory, 23(2), 301-312.

    Because calculating the accuracy of a cluster assignment is not trivial,
    the function does not compare cluster labels directly.
    Instead, this function calculates the Jaccard similarity between cluster assignments.

    Parameters
    ----------
    :param graphclusters: Dictionary of original cluster assignments
    :param permutations: List of permuted cluster assignments (one dict per permutation).
    :return: Dictionaries of reliability scores (cluster-wise and node-wise) and node-wise CI widths.
    """
    rev_assignments = list()
    for assignment in permutations:
        subassignments = dict()
        for k, v in assignment.items():
            subassignments.setdefault(v, set()).add(k)
        rev_assignments.append(subassignments)
    revclusters = dict()
    for k, v in graphclusters.items():
        revclusters.setdefault(v, set()).add(k)
    # clusterwise jaccard
    clusjaccards = dict()
    for cluster in set(graphclusters.values()):
        true_composition = revclusters[cluster]
        jaccards = list()
        # keys don't have to match so both cluster assignments should be evaluated
        for rev_assignment in rev_assignments:
            scores = list()
            for key in rev_assignment:
                scores.append(jaccard_similarity_score(true_composition, rev_assignment[key]))
            bestmatch = np.max(scores)
            jaccards.append(bestmatch)
        clusjaccards[cluster] = np.round(norm.interval(0.95, np.mean(jaccards), np.std(jaccards)), 4)
    logger.info("Confidence intervals for Jaccard similarity of cluster assignments:")
    logger.info(str(clusjaccards))
    nodejaccards = dict.fromkeys(graphclusters.keys())
    ci_width = dict.fromkeys(graphclusters.keys())
    for node in nodejaccards:
        true_composition = revclusters[graphclusters[node]]
        jaccards = list()
        for i in range(len(permutations)):
            clusid = permutations[i][node]
            rev_assignment = rev_assignments[i][clusid]
            jaccards.append(jaccard_similarity_score(true_composition, rev_assignment))
        nodejaccards[node] = np.round(norm.interval(0.95, np.mean(jaccards), np.std(jaccards)), 4)
        ci_width[node] = np.round(nodejaccards[node][1] - nodejaccards[node][0], 4)
    return clusjaccards, nodejaccards, ci_width
def plot_samples(ax, data, colour="black"):
    samples = list(map(operator.itemgetter(0), data))
    Nx = np.array(list(map(operator.itemgetter(1), data)), dtype=int)
    Na = np.array(list(map(operator.itemgetter(2), data)), dtype=int)
    Rx = np.array(list(map(operator.itemgetter(5), data)), dtype=float)
    Rx_CI = list(map(operator.itemgetter(6), data))
    sex = list(map(operator.itemgetter(7), data))
    Elx = np.array(list(map(operator.itemgetter(8), data)), dtype=float)

    Rx_m = [x for x, sx in zip(Rx, sex) if sx == 'M']
    Rx_f = [x for x, sx in zip(Rx, sex) if sx == 'F']

    ax.vlines(0.5, -2, len(samples), linestyle=':')
    ax.vlines(1.0, -2, len(samples), linestyle=':')

    y_pos = np.arange(len(samples))
    ax.set_yticks(y_pos)
    ax.set_yticklabels(["{} ({})".format(s, x + a) for s, x, a in zip(samples, Nx, Na)])

    if len(Rx_m) > 1:
        ax.vlines(np.mean(Rx_m), -2, len(samples), linestyle='-', color='red')
        #ax.vlines(2*np.mean(Rx_m), -2, len(samples), linestyle='-.', color='blue')
        m_ci = norm.interval(0.99, np.mean(Rx_m), np.std(Rx_m))
        ax.fill_between(m_ci, -2, len(samples), alpha=0.2, color="red", edgecolor="none")
    else:
        ax.fill_between([0.4, 0.6], -2, len(samples), alpha=0.2, color="red", edgecolor="none")

    if len(Rx_f) > 1:
        ax.vlines(np.mean(Rx_f), -2, len(samples), linestyle='-', color='blue')
        f_ci = norm.interval(0.99, np.mean(Rx_f), np.std(Rx_f))
        ax.fill_between(f_ci, -2, len(samples), alpha=0.2, color="blue", edgecolor="none")
    else:
        ax.fill_between([0.8, 1.2], -2, len(samples), alpha=0.2, color="blue", edgecolor="none")

    sex_colour = {'M': "red", 'F': "blue", 'U': 'black'}
    ecol = [sex_colour[sx] for sx in sex]
    ax.scatter(Rx, y_pos, facecolor=ecol, edgecolors=colour, lw=0.5, s=60)

    err_low = Rx - np.array(list(map(operator.itemgetter(0), Rx_CI)))
    err_high = np.array(list(map(operator.itemgetter(1), Rx_CI))) - Rx
    ax.errorbar(Rx, y_pos, xerr=[err_low, err_high], ecolor=colour,
                marker="none", fmt="none", capsize=0)

    ax.set_ylim([-0.5, len(samples) - 0.5])
    ax.set_xlim([0, 1.5])
    ax.set_xlabel('Read dosage (X)', size=16)
    ax.set_ylabel('Sample (number of sequences)', size=16)
def temporal_prior(traces, actmn, actvar, fwhm, outliers=None):
    """
    Generate temporal-dependent priors using basis sets and mexican-hat functions.

    :param traces: matrix of traces, ncells by nframes
    :param actmn: mean activity
    :param actvar: variation above which we will consider it a guaranteed event
    :param fwhm: the full-width at half-maximum to use for the temporal prior
    :param outliers: cells with strongly outlying activity
    :return: prior vector
    """
    if outliers is None:
        outliers = np.zeros(np.shape(traces)[0]) > 1

    # Set the half-width of the convolution kernel
    xhalfwidth = 100

    # Determine a normal function sigma from the full-width at half-maximum
    def sigma(fwhm_):
        return fwhm_ / (2 * np.sqrt(2 * np.log(2)))

    # Generate the basis functions and correct population activity for baseline and variation
    basis = np.power(fwhm, np.arange(4) + 1)
    popact = (np.nanmean(traces[np.invert(outliers), :], axis=0) - actmn) / actvar
    fits = np.zeros((len(basis) - 1, len(popact)))

    # Get the first basis normal function
    defrange = int(norm.interval(0.99999, loc=0, scale=sigma(basis[0]))[1]) + 3
    defrange = min(xhalfwidth, defrange)
    b0 = np.zeros(2 * xhalfwidth + 1)
    b0[xhalfwidth - defrange:xhalfwidth + defrange + 1] = norm.pdf(
        range(-defrange, defrange + 1), loc=0, scale=sigma(basis[0]))

    # Generate the fits
    for b in range(1, len(basis)):
        defrange = int(norm.interval(0.99999, loc=0, scale=sigma(basis[b]))[1]) + 3
        defrange = min(xhalfwidth, defrange)
        bn = np.zeros(2 * xhalfwidth + 1)
        bn[xhalfwidth - defrange:xhalfwidth + defrange + 1] = norm.pdf(
            range(-defrange, defrange + 1), loc=0, scale=sigma(basis[b]))
        fits[b - 1, :] = np.convolve(popact, b0 - bn, 'same')

    # And return the weights fit to the narrowest basis function
    weights = np.clip(np.nanmin(fits, axis=0), 0, 1)
    return weights
def get_stats(values, intervals=True):
    stats = {}
    values_array = np.array(values, dtype=np.float64)
    stats['min'] = np.asscalar(np.amin(values_array))
    stats['max'] = np.asscalar(np.amax(values_array))
    stats['mean'] = np.asscalar(np.mean(values_array))
    stats['median'] = np.asscalar(np.median(values_array))
    if values_array.size > 1:
        stats['std_dev'] = np.asscalar(np.std(values_array, ddof=1))
    else:
        stats['std_dev'] = 0
    if intervals:
        stats['intervals'] = []
        loc = stats['mean']
        scale = stats['std_dev'] / sqrt(values_array.size)
        for alpha in (.95, .99, .90, .85, .80, .50):
            if values_array.size > 30:
                interval = norm.interval(alpha, loc=loc, scale=scale)
            else:
                interval = t.interval(alpha, values_array.size - 1, loc, scale)
            stats['intervals'].append(
                {'confidence': alpha, 'interval': interval})
    return stats
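# Minimal usage sketch for get_stats (assumes numpy, math.sqrt and the scipy.stats
# norm/t distributions are imported as the snippet above expects); values are made up.
summary = get_stats([1.2, 0.9, 1.1, 1.4, 1.0])
print(summary['mean'], summary['std_dev'])
for ci in summary['intervals']:
    print(ci['confidence'], ci['interval'])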
def test_find_confidence_interval(test_data):
    """ """
    # z test case
    test_statistic, standard_error = one_samp_z(test_data)
    ci = find_confidence_interval(
        se=standard_error,
        df=np.inf,
        alpha=0.05,
        tails=True,
    )
    ci_s = norm.interval(
        alpha=0.95,
        loc=np.mean(test_data),
        scale=sem(test_data),
    )
    ci_s = ci_s[1] - np.mean(test_data)
    assert np.abs(ci - ci_s) <= 1e-10

    # student t test case
    test_statistic, standard_error, degrees_freedom = one_samp_t(test_data)
    ci = find_confidence_interval(
        se=standard_error,
        df=degrees_freedom,
        alpha=0.05,
        tails=True,
    )
    ci_s = t.interval(alpha=0.95,
                      df=len(test_data) - 1,
                      loc=np.mean(test_data),
                      scale=sem(test_data))
    ci_s = ci_s[1] - np.mean(test_data)
    assert np.abs(ci - ci_s) <= 1e-10
def validate_area(ALPHA, areas, area, discarded_file, discarded_homographies):
    if len(areas) < 2:
        return True
    # returns two parameters (mean=parameters[0] and std=parameters[1])
    norm_areas_parameters = norm.fit(areas)
    areas_quantiles = norm.interval(ALPHA, norm_areas_parameters[0], norm_areas_parameters[1])
    ##print('-----')
    ##print('Area: '+str(areas))
    ##print('Mean: '+str(norm_areas_parameters[0]))
    ##print('Std: '+str(norm_areas_parameters[1]))
    ##print(str(areas_quantiles[0])+' < '+str(area)+' < '+str(areas_quantiles[1]))
    ##print('-----')
    if area >= areas_quantiles[0] and area <= areas_quantiles[1]:
        return True
    else:
        discarded_homographies[0] += 1
        discarded_file.write(
            "HOMOGRAPHY DISCARDED #" +
            str(discarded_homographies[0] + discarded_homographies[1] +
                discarded_homographies[2] + discarded_homographies[3]) +
            " (area too big)\n")
        discarded_file.write("Min bound: " + str(areas_quantiles[0]) +
                             "\nMax bound: " + str(areas_quantiles[1]) +
                             "\nArea: " + str(area) + "\n\n")
        return False
def fit_normal(signal, tag):
    mu, std = norm.fit(signal["mean"].values)
    confidence_interval = norm.interval(CONFIDENCE, loc=mu, scale=std)

    if PLOTTING:
        # Plot the histogram.
        plt.subplots()
        plt.hist(signal["mean"].values, bins=25, density=True, alpha=0.6, color="g")

        # Plot the PDF.
        xmin, xmax = plt.xlim()
        x = np.linspace(xmin, xmax, 100)
        p = norm.pdf(x, mu, std)
        plt.plot(x, p, "k", linewidth=2)
        title = "Fit results normal: mu = %.2f, std = %.2f" % (mu, std)
        plt.title(title)

        # Plot the confidence interval
        plt.axvline(x=confidence_interval[0])
        plt.axvline(x=confidence_interval[1])

        plt.savefig(f"analysis/images/{slugify(tag)}_fit_histogram.png", format="png")

    return {
        "distribution": "normal",
        "params": [{
            "mu": mu,
            "std": std
        }],
        "confidence": [confidence_interval],
    }
def sumStats(self, pelistgroup, bclass, confint):
    outputMatrix = [[
        "Boyce Class", "Mean", "Median", "Range", "Lower Bound", "Upper Bound"
    ]]
    convertMatrix = []
    header = [["Mean", "Median", "Range", "Lower Bound", "Upper Bound"]]
    for i in range(len(pelistgroup[0])):
        tempList = []
        for pelist in pelistgroup:
            tempList.append(pelist[i])
        convertMatrix.append(tempList)
    for i in range(len(convertMatrix)):
        average = np.mean(convertMatrix[i])
        median = np.median(convertMatrix[i])
        convertMatrix[i].sort()
        ran = convertMatrix[i][-1] - convertMatrix[i][0]
        sterr = np.std(convertMatrix[i]) / math.sqrt(len(convertMatrix[i]))
        alpha = float(confint) / 100
        lbound = stats.norm.interval(alpha, average, sterr)[0]
        if lbound < 0:
            lbound = 0
        ubound = norm.interval(alpha, average, sterr)[1]  # Edited 10/9/2013
        outputMatrix.append(
            [str(i + 1), average, median, ran, lbound, ubound])
    return convertMatrix, outputMatrix
def calculate_calibration_intervals(
    targets, predicted_means, predicted_stddevs, step=0.05, verbose=False
):
    """
    Computes the calibration curve: how theoretical confidence intervals
    correlate with empirical coverage (assuming the prediction is Normal).
    """
    real_errors = np.abs(targets - predicted_means)
    all_fractions = []
    q_list = np.arange(0.0, 1.0 + step, step)
    if verbose:
        q_list = tqdm(q_list)
    for q in q_list:
        predicted_error_bound = -predicted_means + \
            norm.interval(q, predicted_means, predicted_stddevs)[1]
        emp_fraction = np.mean((
            real_errors <= predicted_error_bound
        ).astype(float))
        all_fractions.append(emp_fraction)

    # Calculates area to diagonal (ideal calibration).
    c_auc_score = np.abs(
        np.array(all_fractions) - np.arange(0.0, 1.0 + step, step)
    ).sum() * step
    return all_fractions, c_auc_score
def is_in_confidence_region(self, x, alpha):
    """Check if sample is in alpha confidence region.

    Parameters
    ----------
    x : array, shape (n_features,)
        Sample

    alpha : float
        Value between 0 and 1 that defines the probability of the confidence
        region, e.g., 0.6827 for the 1-sigma confidence region or 0.9545 for
        the 2-sigma confidence region.

    Returns
    -------
    is_in_confidence_region : bool
        Is the sample in the alpha confidence region?
    """
    self._check_initialized()
    # we have one degree of freedom less than number of dimensions
    n_dof = len(x) - 1
    if n_dof >= 1:
        return self.squared_mahalanobis_distance(x) <= chi2(n_dof).ppf(alpha)
    else:  # 1D
        # norm.interval expects a standard deviation, i.e. the square root of
        # the variance stored in the covariance matrix
        lo, hi = norm.interval(
            alpha, loc=self.mean[0], scale=np.sqrt(self.covariance[0, 0]))
        return lo <= x[0] <= hi
def get_intervals(xi_i, p_xi, frequencies, alpha=0.95):
    N = xi_i / p_xi
    dist_xi = norm.interval(alpha, loc=xi_i, scale=math.sqrt(xi_i * (1 - p_xi)))
    dist_xi = (dist_xi[0] / N, dist_xi[1] / N)
    return dist_xi
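# Minimal usage sketch for get_intervals: a normal approximation to a binomial
# proportion interval. Here 120 successes were seen with success probability 0.3,
# so N = 120 / 0.3 = 400 trials; `frequencies` is unused by the function and
# passed as None. Assumes math and scipy.stats.norm are imported as above.
lo, hi = get_intervals(120, 0.3, None, alpha=0.95)
print(lo, hi)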
def calibration_test(self, x, y_norm):
    mean, var, shape, rate, mixture_var = self(x)
    y = y_norm * self.y_std + self.y_mean
    y_norm = y_norm.detach().numpy()
    y = y.detach().numpy()
    confidence_values = np.expand_dims(np.arange(0.1, 1, 0.1), axis=1)
    norm_lower, norm_upper = norm.interval(
        confidence_values,
        loc=mean.detach().numpy(),
        scale=np.sqrt(var.detach().numpy()),
    )
    gamma_lower, gamma_upper = gamma.interval(confidence_values,
                                              shape.detach().numpy(),
                                              scale=1 / rate.detach().numpy())
    output = torch.zeros_like(norm_upper)
    normal_check = np.logical_and(norm_lower < y_norm[np.newaxis, :],
                                  y_norm[np.newaxis, :] < norm_upper)
    gamma_check = np.logical_and(gamma_lower < y[np.newaxis, :],
                                 y[np.newaxis, :] < gamma_upper)
    output[mixture_var < 0.5] = normal_check
    output[mixture_var > 0.5] = gamma_check
    return output
def get_decision(self):
    if not self.fed_data:
        return None
    self.fed_data = False

    # keep default period until we collect more data points
    if self.points_observed < 10:
        return None

    # can at most increase monitoring period to the next one in the list
    curr_index = self.mon_periods.index(self.curr_period)
    max_index = min(curr_index + 1, len(self.mon_periods) - 1)

    # pick largest period which doesn't cross thresholds with 'confidence' probability
    for i in reversed(range(max_index + 1)):
        period = self.mon_periods[i]
        mean = self.latest_val
        std = max(0.01, np.sqrt(self.ewmv * (1 + self.weight * (period - 1))))
        interval = norm.interval(self.confidence, loc=mean, scale=std)
        #print(mean, std, interval)
        if interval[0] > self.ok_interval[0] and \
           interval[1] < self.ok_interval[1]:
            # period stays unchanged, so no decision to change
            if self.curr_period == period:
                return None
            self.curr_period = period
            return period

    if self.curr_period == self.mon_periods[0]:
        return None
    self.curr_period = self.mon_periods[0]
    return self.mon_periods[0]
def _std_tuple_of(var=None, std=None, interval=None):
    """
    Convenience function for plotting. Given one of var, standard deviation,
    or interval, return the std. Any of the three can be an iterable list.

    Examples
    --------
    >>> _std_tuple_of(var=[1, 3, 9])
    (1, 2, 3)
    """
    if std is not None:
        if np.isscalar(std):
            std = (std, )
        return std

    if interval is not None:
        if np.isscalar(interval):
            interval = (interval, )
        return norm.interval(interval)[1]

    if var is None:
        raise ValueError("no inputs were provided")

    if np.isscalar(var):
        var = (var, )
    return np.sqrt(var)
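# Minimal usage sketch for _std_tuple_of, showing the three input modes
# (variance, standard deviation, or a two-sided confidence level that is
# converted to the matching z value).
print(_std_tuple_of(var=[1, 4, 9]))    # -> array([1., 2., 3.])
print(_std_tuple_of(std=2.5))          # -> (2.5,)
print(_std_tuple_of(interval=0.6827))  # -> roughly array([1.])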
def success_count_trial_outliers(alpha, trials, p, n):
    μ, σ = params_binomal_to_normal(p, n)
    acceptence_interval = norm.interval(alpha, μ, σ)
    return [
        trial for trial in trials
        if trial < acceptence_interval[0] or trial > acceptence_interval[1]
    ]
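# Minimal usage sketch for success_count_trial_outliers, assuming
# params_binomal_to_normal returns the normal approximation of a Binomial(n, p),
# i.e. mean n*p and standard deviation sqrt(n*p*(1-p)); the trial counts are made up.
trials = [48, 52, 55, 75, 30, 50]
print(success_count_trial_outliers(0.95, trials, p=0.5, n=100))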
def load_data(data_path):
    data = np.load(data_path)
    T = data.shape[1]
    n = data.shape[0]
    #n_sample = data.shape[1]
    #n = bs#*n_sample
    means = []
    stds = []
    conf_intervals = []
    conf_intervals_max = []
    conf_intervals_min = []
    dof = n - 1
    for i in range(T):
        mean = np.mean(data[:, i])
        std = np.std(data[:, i])
        print(mean)
        print(std)
        std = std / math.sqrt(n)
        #conf_interval = ST.ppf(1-alpha/2., dof) * std*np.sqrt(1.+1./n)
        conf_interval = ST.interval(0.95, loc=mean, scale=std)
        print(conf_interval)
        means.append(mean)
        stds.append(std)
        conf_intervals.append(conf_interval)
        conf_intervals_max.append(mean + (conf_interval[1] - mean))
        conf_intervals_min.append(mean - (mean - conf_interval[0]))
    return means, conf_intervals_min, conf_intervals_max
def price(self, n, level=0.95):
    x = asset.simulate(n)
    y = self.payoff(x)
    delta = np.mean(y)
    Var = np.var(y)
    CI = [delta + q * sqrt(Var / n) for q in norm.interval(level)]
    return {"option price": delta, "confidence interval": CI, "variance": Var}
def get_PI(self):
    mean = self.latest_val
    std = max(
        0.01,
        np.sqrt(self.ewmv * (1 + self.weight * (self.curr_period - 1))))
    interval = norm.interval(self.confidence, loc=mean, scale=std)
    return interval
def make_vec(n):
    ranges = []
    for i in range(1, 2**n + 1):
        a, b = norm.interval(alpha=i / (2**n + 1), loc=0, scale=0.4)
        ranges.append(a)
        ranges.append(b)
    return sorted(ranges)
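# Minimal usage sketch for make_vec (assumes scipy.stats.norm is imported as above):
# for n = 2 it returns 2 * 2**n = 8 sorted, symmetric quantile boundaries of a
# zero-mean normal with scale 0.4.
print(make_vec(2))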
def predict_proba(self, x, interval=0.95, *args, **kwargs):
    # We can't make predictions until we have trained the model
    if not self._trained:
        print('Train first')
        return

    y_pred = np.zeros((x.shape[0], self.forests * self.n_estimators))
    for i in range(self.forests):
        if self.parallel:
            # used in training
            pk_f = join(self.temp_dir, 'rf_model_{}.pk'.format(i))
        else:
            # used when parallel is false, i.e., during x-val
            pk_f = join(self.temp_dir,
                        'rf_model_{}_{}.pk'.format(i, mpiops.chunk_index))
        with open(pk_f, 'rb') as fp:
            f = pickle.load(fp)
        for m, dt in enumerate(f.estimators_):
            y_pred[:, i * self.n_estimators + m] = \
                f.ytform.itransform(dt.predict(x))

    y_mean = np.mean(y_pred, axis=1)
    y_var = np.var(y_pred, axis=1)

    # Determine quantiles
    ql, qu = norm.interval(interval, loc=y_mean, scale=np.sqrt(y_var))

    return y_mean, y_var, ql, qu
def detect_signal(self, history_dict, pos=None, contrary=False):
    best_f = self.__lrfs.extract_best_feature(history_dict=history_dict)
    kfo = KalmanFilterOptimizer(y=best_f['series'], x0=self.__x0,
                                v0=self.__v0, pmv_ratio=self.__pmv_ratio)
    q, r = kfo.optimize()
    kf = KalmanFilter(x0=self.__x0, v0=self.__v0, q=q, r=r)
    kf_res = kf.fit(y=best_f['series']).iloc[-1].to_dict()
    self.__logger.debug(f'kf_res:\t{kf_res}')
    gauss_mu = kf_res['x']
    gauss_ci = np.asarray(
        norm.interval(alpha=self.__ci_level, loc=gauss_mu,
                      scale=np.sqrt(kf_res['v'] + q)))
    sig_side = 'short' if gauss_mu * [1, -1][int(contrary)] < 0 else 'long'
    if gauss_ci[1] < 0 or gauss_ci[0] > 0:
        sig_act = sig_side
    else:
        sig_act = None
    sig_log_str = '{:^40}|'.format(
        '{0:>3}[{1:>3}]:{2:>9}{3:>18}'.format(
            self.__lrfs.code, best_f['granularity_str'], f'{gauss_mu:.1g}',
            np.array2string(gauss_ci,
                            formatter={'float_kind': lambda f: f'{f:.1g}'})))
    return {
        'sig_act': sig_act,
        'granularity': best_f['granularity'],
        'sig_log_str': sig_log_str,
        'sig_mu': gauss_mu,
        'sig_cil': gauss_ci[0],
        'sig_ciu': gauss_ci[1]
    }
def predict_proba(self, X, interval=0.95, *args, **kwargs):
    """
    Predictive mean and variance for a probabilistic regressor.

    Parameters
    ----------
    X: ndarray
        (Ns, d) array query dataset (Ns samples, d dimensions).
    interval: float, optional
        The percentile confidence interval (e.g. 95%) to return.
    fields: dict, optional
        dictionary of fields parsed from the shape file.
        ``indicator_field`` should be a key in this dictionary. If this is
        not present, then a Gaussian likelihood will be used for all
        predictions. The only time this may be input is for cross validation.

    Returns
    -------
    Ey: ndarray
        The expected value of ys for the query inputs, X of shape (Ns,).
    Vy: ndarray
        The expected variance of ys (excluding likelihood noise terms) for
        the query inputs, X of shape (Ns,).
    ql: ndarray
        The lower end point of the interval with shape (Ns,)
    qu: ndarray
        The upper end point of the interval with shape (Ns,)
    """
    Ey, Vy = self.predict_moments(X, *args, **kwargs)
    ql, qu = norm.interval(interval, loc=Ey, scale=np.sqrt(Vy))

    return Ey, Vy, ql, qu
def get_decision(self):
    # keep default period until we collect more data points
    if len(self.val_window) < self.window_size:
        return None

    np_window = np.array(self.change_window.data)
    curr_val = self.val_window[0]
    change_mean = np.mean(np_window)
    change_var = np.var(np_window)

    # pick largest period which doesn't cross thresholds with 'confidence' probability
    for period in reversed(self.mon_periods):
        rnd_walk_std = np.sqrt(change_var * period)
        rnd_walk_std = max(0.01 * period, rnd_walk_std)
        change_interval = norm.interval(self.confidence, loc=change_mean,
                                        scale=rnd_walk_std)
        if curr_val + change_interval[0] > self.ok_interval[0] and \
           curr_val + change_interval[1] < self.ok_interval[1]:
            # period stays unchanged, so no decision to change
            if self.curr_period == period:
                return None
            self.curr_period = period
            return period

    if self.curr_period == self.mon_periods[0]:
        return None
    self.curr_period = self.mon_periods[0]
    return self.mon_periods[0]
def dPDF(pts, mu, sigma, distribuition, outlier=0, data=0, n=10, seed=None):
    import numpy as np
    from scipy.interpolate import interp1d
    from distAnalyze import dpdf, mediaMovel
    from scipy.stats import norm, lognorm

    eps = 5e-5
    ngrid = int(1e6)

    if distribuition == 'normal':
        outlier_inf = outlier_sup = outlier
        if not data:
            inf, sup = norm.interval(0.9999, loc=mu, scale=sigma)
            x = np.linspace(inf - outlier_inf, sup + outlier_sup, ngrid)
            y = dpdf(x, mu, sigma, distribuition)
        else:
            np.random.set_state(seed)
            d = np.random.normal(mu, sigma, data)
            inf, sup = min(d) - outlier_inf, max(d) + outlier_sup
            y, x = np.histogram(d, bins='fd', normed=True)
            x = np.mean(np.array([x[:-1], x[1:]]), 0)
            y = abs(np.diff(mediaMovel(y, n)))
            x = x[:-1] + np.diff(x)[0] / 2

    elif distribuition == 'lognormal':
        outlier_inf = 0
        outlier_sup = outlier
        if not data:
            inf, sup = lognorm.interval(0.9999, sigma, loc=0, scale=np.exp(mu))
            x = np.linspace(inf - outlier_inf, sup + outlier_sup, ngrid)
            y = dpdf(x, mu, sigma, distribuition)
        else:
            np.random.set_state(seed)
            d = np.random.lognormal(mu, sigma, data)
            inf, sup = min(d) - outlier_inf, max(d) + outlier_sup
            y, x = np.histogram(d, bins='fd', normed=True)
            x = np.mean(np.array([x[:-1], x[1:]]), 0)
            y = abs(np.diff(mediaMovel(y, n)))
            x = x[:-1] + np.diff(x)[0] / 2
            y = y / (np.diff(x)[0] * sum(y))

    #dy = lambda x,u,s : abs(1/(s**3*sqrt(2*pi))*(u-x)*np.exp(-0.5*((u-x)/s)**2))
    cdf = np.cumsum(y)
    #cdf = np.sum(np.tri(len(x))*y,1)
    #cdf = np.concatenate(cdf)
    cdf = cdf / max(cdf)
    #time.time()-t
    interp = interp1d(cdf, x, fill_value='extrapolate')
    Y = np.linspace(eps, 1 - eps, pts)
    X = interp(Y)

    return X, Y
def qqplot(data, labels, n_quantiles=100, alpha=0.95, error_type='theoretical',
           distribution='binomial', log10conv=True, color=['k', 'r', 'b'],
           fill_dens=[0.1, 0.1, 0.1], type='uniform', title='title'):
    '''
    Function for plotting Quantile Quantile (QQ) plots with confidence interval (CI)

    :param data: NumPy 1D array with data
    :param labels:
    :param type: type of the plot
    :param n_quantiles: number of quantiles to plot
    :param alpha: confidence interval
    :param log10conv: conversion to -log10(p) for the figure
    :return: nothing
    '''
    xmax = 0
    ymax = 0
    if type == 'uniform':
        # we expect distribution from 0 to 1
        for j in range(len(data)):
            # define quantiles positions:
            q_pos = np.concatenate([np.arange(99.) / len(data[j]),
                                    np.logspace(-np.log10(len(data[j])) + 2, 0, n_quantiles)])
            # define quantiles in data
            q_data = mquantiles(data[j], prob=q_pos, alphap=0, betap=1, limit=(0, 1))  # linear interpolation
            # define theoretical predictions
            q_th = q_pos.copy()
            # evaluate errors
            q_err = np.zeros([len(q_pos), 2])
            if np.sum(alpha) > 0:
                for i in range(0, len(q_pos)):
                    if distribution == 'binomial':
                        q_err[i, :] = binom.interval(alpha=alpha, n=len(data[j]), p=q_pos[i])
                    elif distribution == 'normal':
                        q_err[i, :] = norm.interval(alpha, len(data[j]) * q_pos[i],
                                                    np.sqrt(len(data[j]) * q_pos[i] * (1. - q_pos[i])))
                        q_err[i, q_err[i, :] < 0] = 1e-12
                    else:
                        print('Distribution is not defined!')
                q_err /= 1.0 * len(data[j])
                for i in range(0, 100):
                    q_err[i, :] += 1e-12
                # print(q_err[100:, :])
            slope, intercept, r_value, p_value, std_err = linregress(q_th, q_data)
            # print(labels[j], ' -- Slope: ', slope, " R-squared:", r_value**2)
            plt.plot(-np.log10(q_th[n_quantiles - 1:]),
                     -np.log10(q_data[n_quantiles - 1:]), '-', color=color[j])
            plt.plot(-np.log10(q_th[:n_quantiles]),
                     -np.log10(q_data[:n_quantiles]), '.', color=color[j], label=labels[j])
            xmax = np.max([xmax, -np.log10(q_th[1])])
            ymax = np.max([ymax, -np.log10(q_data[0])])
            # print(- np.log10(q_th[:]))
            if np.sum(alpha) > 0:
                if error_type == 'experimental':
                    plt.fill_between(-np.log10(q_th),
                                     -np.log10(q_data / q_th * q_err[:, 0]),
                                     -np.log10(q_data / q_th * q_err[:, 1]),
                                     color=color[j], alpha=fill_dens[j], label='%1.3f CI' % alpha)
            if np.sum(alpha) > 0:
                if error_type == 'theoretical':
                    plt.fill_between(-np.log10(q_th),
                                     -np.log10(q_err[:, 0]),
                                     -np.log10(q_err[:, 1]),
                                     color=color[j], alpha=fill_dens[j], label='%1.3f CI' % alpha)
    plt.legend(loc=4)
    plt.xlabel('Theoretical -log10')
    plt.ylabel('Experimental -log10')
    plt.plot([0, 100], [0, 100], '--k')
    # print(xmax,ymax)
    plt.xlim([0, np.ceil(xmax)])
    plt.ylim([0, np.ceil(ymax * 1.05)])
    plt.title(title)
    plt.tight_layout()
def MCerr(func, ins, params, errs, nums, conf, nproc=1):
    #func : function taking in parameters
    #ins : list of inputs to function
    #params : list of parameters to put into function
    #err : list of error associated with parameters
    #nums: list of number of trials to compute for each parameter

    #np.random.seed(0)
    from scipy.stats import norm

    #val = func(*(ins+params))
    n = len(params)
    val_errs = np.zeros(n)
    val_means = np.zeros(n)
    #for each parameter
    for i in range(n):
        #print "computing parameter "+str(i+1)+"/"+str(n)
        #perturb parameter N times by STD
        trials = np.random.normal(params[i], errs[i], nums[i])
        #confidence interval
        conf_int = norm.interval(conf, loc=params[i], scale=errs[i])
        trials = trials[np.logical_and(trials > conf_int[0], trials < conf_int[1])]
        if nproc > 1:
            from multiprocessing import Pool
            pool = Pool(nproc)
            procs = []
            #vals = np.zeros(nums[i])
            #for each perturbation
            for j in range(len(trials)):
                #calculate value using perturbed perameter
                trial_params = np.copy(params)
                trial_params[i] = trials[j]
                #perform processes in parallel
                #vals[j] = func(*(ins+trial_params))
                procs.append(pool.apply_async(func, ins + list(trial_params)))
            vals = np.array([proc.get(timeout=10) for proc in procs])
            pool.terminate()
        else:
            vals = np.zeros(len(trials))
            #for each perturbation
            for j in range(len(trials)):
                #calculate value using perturbed perameter
                trial_params = np.copy(params)
                trial_params[i] = trials[j]
                #perform process
                vals[j] = func(*(ins + list(trial_params)))
        #error associated with perturbation of parameter
        val_errs[i] = vals.std()
        val_means[i] = vals.mean()
    #total summed error associated with all perturbation
    val_err = np.sqrt(np.square(val_errs).sum())
    val = val_means.mean()
    #return value and error
    return val, val_err
def predict_interval(self, X, confidence):
    assert np.isscalar(confidence), "Confidence should be a scalar"
    ensemble_mean, std = self.predict(X, return_std=True)
    std += 1e-6  # Avoid NaNs. TODO: Better solution? How are std=0 produced??
    interval_tuple = norm.interval(confidence, loc=ensemble_mean, scale=std)
    intervals = np.concatenate((interval_tuple[0][:, np.newaxis],
                                interval_tuple[1][:, np.newaxis]), axis=1)
    return intervals
def make_normal(n):
    ranges = []
    for i in range(1, n + 1, 2):
        a, _ = norm.interval(alpha=i / (n + 2), loc=0, scale=1)
        ranges.append(a)
    ranges = np.asarray(ranges)
    ranges /= abs(max(ranges, key=abs))
    return np.sort(ranges) * random.uniform(0.1, 0.7)
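# Minimal usage sketch for make_normal (assumes numpy, random and scipy.stats.norm
# are imported as above): (n + 1) // 2 sorted negative quantile boundaries,
# rescaled into [-1, 0) and then shrunk by a random factor.
print(make_normal(5))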
def predict_proba(self, x, interval=0.95):
    """
    Predict the outputs and variances of the inputs

    This method predicts the output values that would correspond to
    each input in X. This method also returns the certainty of the
    model in each case, which is only sensible when the number of
    committee members is greater than one.

    This method also outputs quantile information along with the
    variance to establish the probability distribution clearly.

    Parameters
    ----------
    x: numpy.array
        The inputs for which the model should be evaluated
    interval: float
        The probability threshold for which the quantiles should be output.

    Returns
    -------
    y_mean: numpy.array
        An array of expected output values given the inputs
    y_var: numpy.array
        The variance of the outputs
    ql: numpy.array
        The lower quantiles for each input
    qu: numpy.array
        The upper quantiles for each input
    """
    n, m = x.shape

    # We can't make predictions until we have trained the model
    if not self._trained:
        print('Train first')
        return

    # Determine which rule to run on each row and then run the regression
    # on each row of x to get the regression output.
    y_pred = np.zeros((n, len(self.models)))
    for m, model in enumerate(self.models):
        for rule in model:
            # Determine which rows satisfy this rule
            mask = rule.satisfied(x)

            # Make the prediction for the whole matrix, and keep only the
            # rows that are correctly sized
            y_pred[mask, m] += rule.regress(x, mask)

    y_mean = np.mean(y_pred, axis=1)
    y_var = np.var(y_pred, axis=1)

    # Determine quantiles
    ql, qu = norm.interval(interval, loc=y_mean, scale=np.sqrt(y_var))

    # Convert the prediction to a numpy array and return it
    return y_mean, y_var, ql, qu
def print_and_plot_results(count, results, verbose, plot_file_name):
    print("RPS calculated as 95% confidence interval")

    rps_mean_ar = []
    low_ar = []
    high_ar = []
    test_name_ar = []

    for test_name in sorted(results):
        data = results[test_name]
        rps = count / array(data)
        rps_mean = tmean(rps)
        rps_var = tvar(rps)
        low, high = norm.interval(0.95, loc=rps_mean, scale=rps_var**0.5)
        times = array(data) * 1000000 / count
        times_mean = tmean(times)
        times_stdev = tstd(times)
        print('Results for', test_name)
        print('RPS: {:d}: [{:d}, {:d}],\tmean: {:.3f} μs,'
              '\tstandard deviation {:.3f} μs'
              .format(int(rps_mean), int(low), int(high),
                      times_mean, times_stdev))
        test_name_ar.append(test_name)
        rps_mean_ar.append(rps_mean)
        low_ar.append(low)
        high_ar.append(high)
        if verbose:
            print(' from', times)
        print()

    if plot_file_name is not None:
        import matplotlib.pyplot as plt
        from matplotlib import cm
        fig = plt.figure()
        ax = fig.add_subplot(111)
        L = len(rps_mean_ar)
        color = [cm.autumn(float(c) / (L - 1)) for c in arange(L)]
        bars = ax.bar(
            arange(L), rps_mean_ar, color=color,
            yerr=(low_ar, high_ar), ecolor='k')
        # order of legend is reversed for visual appeal
        ax.legend(
            reversed(bars), reversed(test_name_ar),
            loc='upper left')
        ax.get_xaxis().set_visible(False)
        plt.ylabel('Requests per Second', fontsize=16)
        print(plot_file_name)
        plt.savefig(plot_file_name, dpi=96)
        print("Plot is saved to {}".format(plot_file_name))
        if verbose:
            plt.show()
def theoretical_stddev_and_confidence_intervals(n):
    """ Returns an output helping to build the latex tables for the theoretical CIs """
    stddev = ((pi - 2) * 2 / (n * (pi ** 2))) ** 0.5
    output = ['{n}'.format(n=n), '{stddev:.4f}'.format(stddev=stddev)]
    for alpha in ALPHA_LEVELS:
        output.append(r'$\frac{2}{\pi} \pm ' +
                      '{x:.4f}'.format(x=(norm.interval(alpha)[1] * stddev)) + '$')
    return ' & '.join(output) + r' \\'
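# Minimal usage sketch (ALPHA_LEVELS is assumed to be a module-level constant; the
# values below are only an illustration; pi and norm imported as the snippet expects).
# Prints one LaTeX table row for n = 100.
ALPHA_LEVELS = (0.90, 0.95, 0.99)
print(theoretical_stddev_and_confidence_intervals(100))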
def confidence_interval(errors):
    # tvar is the sample variance
    from scipy.stats import norm, tvar
    import math
    mu = sum(errors) / float(len(errors))
    var = tvar(errors)
    std_dev = math.sqrt(var)
    std_error = std_dev / math.sqrt(len(errors))
    span_95 = norm.interval(0.95, loc=mu, scale=std_error)
    return span_95
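# Minimal usage sketch for confidence_interval: a 95% normal interval for the
# mean of a small, made-up error sample.
errors = [0.12, -0.05, 0.08, 0.01, -0.03, 0.07]
print(confidence_interval(errors))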
def _95p_center(means_accumulator, errs_accumulator):

    def helper(initial, bounds, target):
        shift = target - initial
        if np.abs(shift) < bounds:
            return target
        else:
            scale = bounds / np.abs(shift)
            return initial + shift * scale

    target_percentile = np.sum(np.logical_not(np.isnan(means_accumulator)), axis=0) - 1
    target_percentile = 1 - np.power(np.ones_like(target_percentile) * 0.05,
                                     1 / np.sqrt(target_percentile))
    _95p_constant = norm.interval(0.95)[1]
    contractor = np.vectorize(lambda x: _95p_constant / norm.interval(x)[1])
    contraction_interval = contractor(target_percentile)
    contraction_interval = errs_accumulator / contraction_interval[np.newaxis, :]
    contractor2 = np.vectorize(helper)
    new_means_accumulator = contractor2(means_accumulator, contraction_interval,
                                        np.nanmean(means_accumulator, axis=0)[np.newaxis, :])

    return new_means_accumulator
def exp_data_generator(
    self, exp_val, apparatus_uncer, first_abs=0.0, first_rel=0.0, conf=0.95
):
    """
    Predict experimental data according to the experimental method and apparatus
    uncertainty information, with the assumption that the apparatus uncertainty is
    described by a 95% confidence interval under a normal distribution.

    Parameters:
    ===========
    exp_val: float
        expected reading of the sensor
    apparatus_uncer: class exp_uncer.APPARATUS_UNCER
        uncertainty information of the measurement apparatus
    first_abs: float
        extra absolute uncertainty to the measurement due to noise,
        in the same engineering unit as the measurement
    first_rel: float
        extra relative uncertainty to the measurement due to noise
    conf: float, optional
        the confidence level of the apparatus uncertainty. Default 95%

    Returns:
    ===========
    data: numpy array
        readings in time-series
    """
    num = self.get_num()  # number of data points

    zero_order = apparatus_uncer.zero_order_uncer(exp_val)
    first_order = sqrt(first_abs**2 + (first_rel * exp_val)**2)
    norm_std = sqrt(zero_order**2 + first_order**2) / norm.interval(conf)[1]

    data = []
    for ii in range(num):
        data.append(normalvariate(exp_val, norm_std))

    return np.array(data)
def test_KL_divergence_for_normal_distributions(show_plot=True):
    mu_0 = 0
    sigma_0 = 1

    interval = norm.interval(.99, mu_0, sigma_0)
    support = numpy.linspace(interval[0], interval[1], num=2000)

    mus = numpy.linspace(0, 3, num=30)

    p_0 = norm.logpdf(support, mu_0, sigma_0)

    KL_inf = []
    KL_ana = []
    for mu in mus:
        p_1 = norm.logpdf(support, mu, sigma_0)
        kld = qtu.KL_divergence_arrays(support, p_0, p_1, False)
        KL_inf.append(float(kld))
        KL_ana.append(actual_KL(mu_0, sigma_0, mu, sigma_0))

    KL_inf = numpy.array(KL_inf)
    KL_ana = numpy.array(KL_ana)
    KL_diff = KL_ana - KL_inf

    if show_plot:
        pylab.subplot(1, 2, 1)
        pylab.plot(KL_inf, label='est')
        pylab.plot(KL_ana, label='analytical')
        pylab.title('estimated KL')
        pylab.legend()

        pylab.subplot(1, 2, 2)
        pylab.plot(KL_diff)
        pylab.title('KL error')

        pylab.show()

    _, p = pearsonr(KL_inf, KL_ana)

    return p
def conf_interval(arr, confidence=0.95):
    N = arr.size
    # use the Student t critical value for small samples, the normal one otherwise
    if N <= 30:
        z = t.interval(confidence, N - 1)[1]
    else:
        z = norm.interval(confidence)[1]
    s = arr.std()
    x_bar = arr.mean()
    return (x_bar - z * (s / np.sqrt(N)), x_bar + z * (s / np.sqrt(N)))
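# Minimal usage sketch for conf_interval (assumes numpy and the scipy.stats t/norm
# distributions are imported as above); with N = 5 the Student t branch is used.
import numpy as np

sample = np.array([4.1, 3.9, 4.3, 4.0, 4.2])
print(conf_interval(sample, confidence=0.95))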
def generate_discrete_support(params, support=0.95, nbins=100):
    """
    returns a set of intervals over which the component model pdf is supported.

    Inputs:
        params: a dict with entries 'mu' and 'rho'
        nbins: cardinality of the set or the number of grid points in the
            approximation
        support: a float in (0,1) that describes the amount of probability
            we want in the range of support
    """
    if type(nbins) is not int:
        raise TypeError("nbins should be an int")

    if nbins <= 0:
        raise ValueError("nbins should be greater than 0")

    support = check_type_force_float(support, "support")
    if support <= 0.0 or support >= 1.0:
        raise ValueError("support is a float st: 0 < support < 1")

    check_model_params_dict(params)

    mu = params['mu']
    sigma = (1.0 / params['rho'])**.5

    interval = norm.interval(support, mu, sigma)

    a = interval[0]
    b = interval[1]

    support_range = b - a
    support_bin_size = support_range / (nbins - 1.0)

    bins = [a + i * support_bin_size for i in range(nbins)]

    return bins
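# Minimal usage sketch for generate_discrete_support, assuming the helper
# check_model_params_dict accepts a params dict with 'mu' and 'rho' (a precision,
# so sigma = rho ** -0.5): five grid points covering 95% of the probability mass.
bins = generate_discrete_support({'mu': 0.0, 'rho': 1.0}, support=0.95, nbins=5)
print(bins)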
def guess_param(current_val, lo_bound, hi_bound, SEARCH_RATE=5, BACKTRACK_PROB=.1,
                hard_lower_bound=.0001, hard_upper_bound=10):
    lo_val = lo_bound
    hi_val = hi_bound

    # If we are missing either of the bounds, artificially create them using the search rate
    if lo_val is None:
        #lo_val = current_val / SEARCH_RATE
        if hi_val is not None:
            lo_val = hi_val / SEARCH_RATE
        else:
            lo_val = current_val / SEARCH_RATE
    if hi_val is None:
        #hi_val = current_val * SEARCH_RATE
        if lo_val is not None:
            hi_val = lo_val * SEARCH_RATE
        else:
            hi_val = current_val * SEARCH_RATE

    # Create a normal distribution centered between the two bounds, and
    # ensure that there is only a BACKTRACK_PROB probability of generating
    # a point outside of that range
    mean = (hi_val + lo_val) / 2
    (lo_conf_bound, hi_conf_bound) = norm.interval(1 - BACKTRACK_PROB)
    sd = (hi_val - lo_val) / (hi_conf_bound * 2)

    # Draw a random guess from the distribution and update bounds if necessary
    current_val = normalvariate(mean, sd)
    current_val = max(current_val, hard_lower_bound)
    current_val = min(current_val, hard_upper_bound)
    if lo_bound is not None:
        lo_bound = min(lo_bound, current_val)
    if hi_bound is not None:
        hi_bound = max(hi_bound, current_val)

    return current_val, lo_bound, hi_bound
def get_gaussian_model(residuals, n_bins=50):
    from scipy.stats import norm
    import matplotlib.mlab as mlab

    mu, std = norm.fit(residuals)
    plt.figure()
    plt.subplot(211)
    result = plt.hist(residuals, n_bins, histtype='bar')
    born_inf, born_supp = norm.interval(0.95, mu, std)
    x = numpy.linspace(min(residuals), max(residuals), 100)
    dx = result[1][1] - result[1][0]
    scale = len(residuals) * dx
    plt.plot(x, mlab.normpdf(x, mu, std) * scale, "r--", linewidth=2.0)
    plt.axvline(born_inf, color='g', linewidth=2.0)
    plt.axvline(born_supp, color='g', linewidth=2.0)
    plt.subplot(212)
    plt.plot(residuals)
    plt.show()
# calculate prob, disregard weights
lpr = log_multivariate_normal_density(input_df, g_1.means_, g_1.covars_, g_1.covariance_type)
logprob = logsumexp(lpr, axis=1)
responsibilities = np.exp(lpr - logprob[:, np.newaxis])
probs = pd.DataFrame(responsibilities)
probs.set_index(input_df.index, inplace=True)
probs.columns = ['prob_0', 'prob_1']
probs.loc[:, 'color'] = 'k'
probs.loc[probs.prob_0 >= 0.90, 'color'] = 'r'
probs.loc[probs.prob_1 >= 0.90, 'color'] = 'b'

# plot 1D GMM
delta = 0.0001
x = np.arange(0.5, 1.2, delta)
mu_1, sigma_1 = (g_1.means_[0][0], np.sqrt(g_1.covars_[0][0]))
mu_2, sigma_2 = (g_1.means_[1][0], np.sqrt(g_1.covars_[1][0]))
intervals_1 = norm.interval(0.95, loc=mu_1, scale=sigma_1)
intervals_2 = norm.interval(0.95, loc=mu_2, scale=sigma_2)
interval_1_x = np.arange(intervals_1[0][0], intervals_1[1][0], delta)
interval_2_x = np.arange(intervals_2[0][0], intervals_2[1][0], delta)
print(intervals_1)
print(intervals_2)
Z1 = mlab.normpdf(x, mu_1, sigma_1)
Z2 = mlab.normpdf(x, mu_2, sigma_2)
Z = (Z2 - Z1)
diffpts = zip(x, Z)
diffpts = [(a, b) for a, b in diffpts if a > 0.8 and a < 1.4]
zeropt = sorted(diffpts, key=lambda x: abs(x[1]))[0][0]
min_interval = min(intervals_1[1][0], intervals_2[1][0])
max_interval = max(intervals_1[0][0], intervals_2[0][0])
input_df.plot(kind='density')
plt.plot(x, Z1, label='gaussian one')
def bootstrap_residuals(data, model, num_samples=100, statistic=np.mean,
                        alpha=0.05, data_error=None, sigma=None, verbose=False):
    '''
    Bootstraps data with models a given number of times and calculates the
    Goodness of fit for each run. The standard deviation of the
    Goodness-of-fit values is then used to estimate the confidence interval.

    Parameters
    ----------
    data : array_like
        The observed data, must be the same size as the model
    model : array_like
        The model data, must be the same size as the observed data.
    num_samples : int, optional
        Number of runs in the bootstrapping.
    alpha : float, optional
        Significance of confidence interval.
    data_error : float, array_like, optional
        If unset, the error will be the standard deviation of the data. If
        an array, it must have the same dimensions as the observed data.
    sigma : float, optional
        If set, the confidence interval will be calculated using the number
        of standard deviations from the mean.
    verbose : bool, optional
        Print out progress?

    Returns
    -------
    out : list
        A list, [confidence interval, goodness of fit array]
    '''
    import numpy as np
    from scipy.stats import norm

    data_list = data.ravel()
    model_list = model.ravel()

    residuals = data - model

    length = len(data_list)

    # default data error per the docstring: the standard deviation of the data
    if data_error is None:
        data_error_list = data_list.std() * np.ones(data_list.shape)
    else:
        data_error_list = np.asarray(data_error).ravel()

    num_samples = int(num_samples)

    gofArray = np.zeros(num_samples)

    if verbose:
        print('Beginning bootstrapping')

    for i in range(num_samples):
        # randomly sample all values of data and model
        indices_sample = np.random.choice(length, size=length, replace=True)
        data_sample = data_list[indices_sample]
        model_sample = model_list[indices_sample]

        gofArray[i] = ((data_sample - model_sample)**2 /
                       data_error_list**2).sum()

        if verbose:
            if i % 10 == 0:
                print(str(i) + 'th run complete.')

    mean, std = gofArray.mean(), gofArray.std()

    if sigma is not None:
        alpha = 1 - norm.cdf(sigma)

    confid_int = norm.interval(1 - alpha, loc=mean, scale=std)

    return (confid_int, gofArray)
# because it's a discrete distribution)
low, high = binom.interval(alpha, N, p)
if p == 0:
    low = high = 0
elif p == 1:
    low = high = N
q = binom.cdf(low - 0.1, N, p) + binom.sf(high + 0.1, N, p)
low, high = binom.interval(alpha, num_Np_checks, q)
if q == 0:
    low = high = 0
if num_Np_fails < low or num_Np_fails > high:
    print('N=%d, p=%.3f failed %d of %d checks, outside range (%d, %d)' %
          (N, p, num_Np_fails, num_Np_checks, low, high))
    print()

failrate = float(numfails) / numchecks
low, high = norm.interval(alpha, loc=mu, scale=sqrt(sigma2))
print('%d/%d=%.2f%% failed at %d%%' %
      (numfails, numchecks, numfails * 100.0 / numchecks, 100 * alpha))
print('Expected mean=%d, std dev=%d (mean fail rate=%.2f%%)' %
      (mu, sqrt(sigma2), 100 * mu / numchecks))
if low <= numfails <= high:
    print('Overall passed at %d%%: within range (%d, %d)' % (alpha * 100, low, high))
else:
    print('Overall failed at %d%%: outside range (%d, %d)' % (alpha * 100, low, high))

figure(figsize=(10, 6))
plotnum = 0
for p in p_range:
    if p == 0 or p == 1:
        continue
    plotnum += 1
    subplot(2, 3, plotnum)
    n = arange(1, isi_max[p])
def cross_validate_disc_version(algorithm, tab_file, min_support=-30, sample_pct=0.1, iterations=1, only_interesting_triples=False, restricted_triples=None, extra_id=''): from subprocess import call from parsers import Borgelt cv_start = time() # Create work folder _id = str(time()).replace('.','') + '_' + extra_id path = '../tmp/cv_' + _id + '/' os.mkdir(path) print "\n### Running cross validation cv_{}###".format(_id) total_transactions = 0 for line in open(tab_file, 'rb'): total_transactions += 1 print 'Total total_transactions: ', total_transactions # Get the total observed triples borgelt_start = time() observed_file_name = path + 'observed_frequent_items.out' args = [algorithm, tab_file, observed_file_name, '-s' + str(min_support), '-n3'] # pro = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True, preexec_fn=os.setsid) # os.killpg(pro.pid, signal.SIGTERM) call(args) # sleep(20) print 'fpgrowth on all data done: {} secs'.format(time()-borgelt_start) freq = Borgelt.read_frequent_items(observed_file_name) # Create ds of all observed triplets # Saved as sorted keys for lookup, # and their frequency as value observed = {} count = 0 for item in freq: if len(item[0]) == 3: sorted_trip = triple_sort(item[0]) observed[sorted_trip] = item[1][0] print 'Total triplets observed:', len(observed) average_observed = sum(observed.values()) / float(len(observed)) print 'Baseline: ', average_observed del freq avg_errors = [] var_errors = [] avg_errors_ext = [] var_errors_ext = [] avg_errors_heu = [] var_errors_heu = [] avg_errors_ind = [] var_errors_ind = [] avg_errors_baseline = [] occurrences = [0 for i in range(100)] max_ent_acc_error = [0 for i in range(100)] ext_acc_error = [0 for i in range(100)] ind_acc_error = [0 for i in range(100)] heu_acc_error = [0 for i in range(100)] baseline_acc_error = [0 for i in range(100)] # Record trip counts for the best estimats max_ent_best = Counter() ext_best = Counter() ind_best = Counter() for index in range(iterations): # Create sample file sampling_start = time() if sample_pct > 0: sample_size= int(total_transactions*sample_pct) else: sample_size = abs(sample_pct) test_data_size = total_transactions - sample_size sample = random.sample(range(total_transactions), sample_size) assert len(sample) == sample_size, 'Sample size not equal to sample' sample.sort() sample_file_name = path + str(index) + '_sample.tab' with open(sample_file_name, 'a') as sample_file: sample_line = 0 for line_num, line in enumerate(open(tab_file, 'rb')): if line_num == sample[sample_line]: sample_file.write(line) sample_line += 1 if sample_line == sample_size: break del sample print 'Sample size: {} time: {}'.format(sample_size, time() - sampling_start) borgelt_start = time() sample_freq_name = path + str(index) + '_sample_frequent_items.out' args = [algorithm, sample_file_name, sample_freq_name, '-s-1', '-n3'] call(args) print 'fpgrowth on sample data done: {} secs'.format(time()-borgelt_start) # Check any frequent items were found if not os.path.exists(sample_freq_name): print 'No frequent items found' print 'args', args continue min_support_trips = min_supported_trips(min_support, test_data_size) print 'Forward min_support_trips set to: ', min_support_trips triangles_start = time() triangle_tree, sample_triples = Forward.forward_compact(sample_freq_name, min_support_trips, observed, only_interesting_triples, restricted_triples) print 'Found triangles done: {}'.format(time() - triangles_start) #del sample_freq estimates = [] extrapolations = [] independences = [] heurestics = 
[] baselines = [] observations = [] triplets = [] MAPE_errors = [] MAPE_errors_ext = [] MAPE_errors_ind = [] MAPE_errors_heu = [] MAPE_errors_baseline = [] true_errors = [] pair_triple_ratios = [] triangle_counts = [] # s1_list = [] # s2_list = [] # s3_list = [] # s12_list = [] # s13_list = [] # s23_list = [] # Recursion for estimate to converge req_depth = int(math.log(total_transactions, 2)) + 1 # DFS of the tree holding all triangles for n1 in triangle_tree.keys(): s1, s2_dict = triangle_tree[n1] for n2 in s2_dict.keys(): s2, s12, s3_dict = s2_dict[n2] for n3 in s3_dict.keys(): s3, s13, s23, s123 = s3_dict[n3] triangle_counts.append((s1, s2, s3, s12, s13, s23, s123)) triangle = (n1, n2, n3) pair_triple_ratio = s123 / float(min(s12, s13, s23)) pair_triple_ratios.append(pair_triple_ratio) # Get the obs (test data) frequency minus those found in the sample (training data) obs = 0 if triangle in observed: # (triples in data) - (triples in sample). Calculating the number of triples in test data. obs = observed[triangle] - s123 # maxent estimate est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13, float(sample_size), num=req_depth) * (test_data_size / float(sample_size)) if est < 0: print 'max ent below 0' print 's1 s2 s3 s12 s13 s23 s123', (s1, s2, s3, s12, s23, s13, s123) # extrapolation estimate est2 = s123 / float(sample_size) * test_data_size # independence estimat est3 = (s1 / float(sample_size)) * (s2 / float(sample_size)) * (s3 / float(sample_size)) * test_data_size # est3 = (s1*s2*s3)/float(sample_size*sample_size) * test_data_size/float(sample_size) # heurestic, use max_ent for 0 triple in sample est4 = s123 < 5 and est or est2 # base line estimat est5 = average_observed estimates.append(est) extrapolations.append(est2) independences.append(est3) heurestics.append(est4) baselines.append(est5) observations.append(obs) triplets.append(triangle) # TODO Do why save these? They already exist in the triangle tree (and take # up shit load of space..) 
                    # s1_list.append(s1)
                    # s2_list.append(s2)
                    # s3_list.append(s3)
                    # s12_list.append(s12)
                    # s13_list.append(s13)
                    # s23_list.append(s23)
                    # end TODO
                    # max-ent error (|obs - est| / sqrt(obs); the MAPE-style '* 100' factor is disabled)
                    error = abs(obs - est) / math.sqrt(obs)  # * 100
                    MAPE_errors.append(error)
                    true_errors.append(obs - est)
                    # extrapolation error
                    error2 = 0
                    if est2 > 0:
                        error2 = abs(obs - est2) / math.sqrt(obs)  # * 100
                    MAPE_errors_ext.append(error2)
                    # independence error
                    error3 = abs(obs - est3) / math.sqrt(obs)  # * 100
                    MAPE_errors_ind.append(error3)
                    # heuristic error
                    error4 = abs(obs - est4) / math.sqrt(obs)  # * 100
                    MAPE_errors_heu.append(error4)
                    # baseline error
                    error5 = abs(obs - est5) / math.sqrt(obs)  # * 100
                    MAPE_errors_baseline.append(error5)
                    # Record which estimate performed best (ties fall through to the independence count)
                    if error < error2 and error < error3:
                        max_ent_best[s123] += 1
                    elif error2 < error and error2 < error3:
                        ext_best[s123] += 1
                    else:
                        ind_best[s123] += 1
                    try:
                        occurrences[s123] += 1
                        max_ent_acc_error[s123] += error
                        ext_acc_error[s123] += error2
                        ind_acc_error[s123] += error3
                        heu_acc_error[s123] += error4
                        baseline_acc_error[s123] += error5
                    except IndexError, ie:
                        pass
        # print 'true errors: ', true_errors
        # print 'estimates: ', estimates
        # print 'observed: ', observed
        # print 'mape ', MAPE_errors
        del triangle_tree
        del sample_triples
        if len(MAPE_errors) > 0:  # TODO handle this, probably when nothing has been found
            min_error = min(MAPE_errors)
            max_error = max(MAPE_errors)
            # max ent error
            avg_error = sum(MAPE_errors) / float(len(MAPE_errors))
            avg_errors.append(avg_error)
            # extrapolation error
            avg_error_ext = sum(MAPE_errors_ext) / float(len(MAPE_errors_ext))
            avg_errors_ext.append(avg_error_ext)
            # independence error
            avg_error_ind = sum(MAPE_errors_ind) / float(len(MAPE_errors_ind))
            avg_errors_ind.append(avg_error_ind)
            # heuristic error
            avg_error_heu = sum(MAPE_errors_heu) / float(len(MAPE_errors_heu))
            avg_errors_heu.append(avg_error_heu)
            # baseline error
            avg_error_baseline = sum(MAPE_errors_baseline) / float(len(MAPE_errors_baseline))
            avg_errors_baseline.append(avg_error_baseline)
            var_error = 0
            var_error_ext = 0
            var_error_heu = 0
            var_error_ind = 0
            # variance
            if len(MAPE_errors) > 1:
                var_error = tvar(MAPE_errors)  # tvar is the sample variance
                var_error_ext = tvar(MAPE_errors_ext)
                var_error_heu = tvar(MAPE_errors_heu)
                var_error_ind = tvar(MAPE_errors_ind)
            # max_ent confidence interval
            std_dev = math.sqrt(var_error)
            std_error = std_dev / math.sqrt(sample_size)
            span_99 = norm.interval(0.99, avg_error, std_error)
            span_95 = norm.interval(0.95, avg_error, std_error)
            # ext confidence interval
            std_dev_ext = math.sqrt(var_error_ext)
            std_error_ext = std_dev_ext / math.sqrt(sample_size)
            span_99_ext = norm.interval(0.99, avg_error_ext, std_error_ext)
            span_95_ext = norm.interval(0.95, avg_error_ext, std_error_ext)
            # independence confidence interval
            std_dev_ind = math.sqrt(var_error_ind)
            std_error_ind = std_dev_ind / math.sqrt(sample_size)
            span_99_ind = norm.interval(0.99, avg_error_ind, std_error_ind)
            span_95_ind = norm.interval(0.95, avg_error_ind, std_error_ind)
            # heuristic confidence interval
            std_dev_heu = math.sqrt(var_error_heu)
            std_error_heu = std_dev_heu / math.sqrt(sample_size)
            span_99_heu = norm.interval(0.99, avg_error_heu, std_error_heu)
            span_95_heu = norm.interval(0.95, avg_error_heu, std_error_heu)
            var_errors.append(var_error)
            var_errors_ext.append(var_error_ext)
            var_errors_heu.append(var_error_heu)
            var_errors_ind.append(var_error_ind)
            res_string = "\nResult ({}):\nSample size:{} triangles:{} test_data:{}\n".format(
                index, sample_size, len(estimates), total_transactions - sample_size)
            # log max ent result
            res_string += "avg_error:{} var_error:{}\n".format(avg_error, var_error)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95))
            res_string += 'avg_error_ext:{} var_error_ext:{}\n'.format(avg_error_ext, var_error_ext)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ext))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ext))
            res_string += 'avg_error_ind:{} var_error_ind:{}\n'.format(avg_error_ind, var_error_ind)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ind))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ind))
            res_string += 'avg_error_heu:{} var_error_heu:{}\n'.format(avg_error_heu, var_error_heu)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_heu))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_heu))
            res_string += 'avg_error_baseline:{}\n'.format(avg_error_baseline)
            with open(path + str(index) + '_log.txt', 'a') as log_file:
                log_file.write(res_string)
            print res_string
            # Write result data
            with open(path + str(index) + '_data.json', 'w') as fd:
                # triplet_key = ['triple' for t in estimates]
                # est_key = ['est' for t in estimates]
                # obs_key = ['obs' for t in observations]
                fd.write(json.dumps(zip(triplets, zip(estimates, observations))))
            # Columns: est, obs, n1, n2, n3, pair_trip_ratio, s1, s2, s3, s12, s13, s23, s123
            header = 'est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n'
            with open(path + str(index) + '_data.tsv', 'w') as fd:
                fd.write(header)
                for _index, i in enumerate(estimates):
                    row = [estimates[_index], observations[_index]]
                    row += list(triplets[_index]) + [pair_triple_ratios[_index]]
                    row += list(triangle_counts[_index])
                    fd.write('\t'.join(str(v) for v in row) + '\n')
            with open(path + str(index) + '_data_extrapolation.tsv', 'w') as fd:
                fd.write(header)
                for _index, i in enumerate(estimates):
                    row = [extrapolations[_index], observations[_index]]
                    row += list(triplets[_index]) + [pair_triple_ratios[_index]]
                    row += list(triangle_counts[_index])
                    fd.write('\t'.join(str(v) for v in row) + '\n')
            with open(path + str(index) + '_data_heurestic.tsv', 'w') as fd:
                fd.write(header)
                for _index, i in enumerate(heurestics):
                    row = [heurestics[_index], observations[_index]]
                    row += list(triplets[_index]) + [pair_triple_ratios[_index]]
                    row += list(triangle_counts[_index])
                    fd.write('\t'.join(str(v) for v in row) + '\n')
            with open(path + str(index) + '_data_independece.tsv', 'w') as fd:
                fd.write(header)
                for _index, i in enumerate(independences):
                    row = [independences[_index], observations[_index]]
                    row += list(triplets[_index]) + [pair_triple_ratios[_index]]
                    row += list(triangle_counts[_index])
                    fd.write('\t'.join(str(v) for v in row) + '\n')
            # Save the errors
            with open(path + str(index) + '_MAPE_errors.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors, fd)
            with open(path + str(index) + '_MAPE_errors_ext.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_ext, fd)
            with open(path + str(index) + '_MAPE_errors_heu.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_heu, fd)
            with open(path + str(index) + '_MAPE_errors_ind.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_ind, fd)
            with open(path + str(index) + '_MAPE_errors_baseline.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_baseline, fd)
            # Saves counts of all subsets of triples.
            # TODO this code does not run!
            # with open(path + str(index) + '_data_correlations.tsv', 'w') as fd:
            #     fd.write('s1\ts2\ts3\ts12\ts13\ts23\n')
            #     for _index, i in enumerate(s123):
            #         fd.write(str(s1[_index]) + '\t' + str(s2[_index]) + '\t' + str(s3[_index]) + '\t' +
            #                  str(s12[_index]) + '\t' + str(s13[_index]) + '\t' + str(s23[_index]) + '\n')
            # Saves the independence estimate for all triples.
            # TODO Why s123[_index] in the denominator?
            # TODO What is a 'double independence estimate'?
            # TODO Why not calculate and save estimates in the same way as ext and max_ent?
            # with open(path + str(index) + '_independence_estimate.tsv', 'w') as fd:
            #     fd.write('single independence estimate\tdouble independence estimate\n')
            #     for _index, i in enumerate(s123):
            #         tempVal1 = sample_size / (s1[_index])
            #         tempVal2 = sample_size / (s2[_index])
            #         tempVal3 = sample_size / (s3[_index])
            #         tempVal12 = sample_size / (s12[_index])
            #         tempVal13 = sample_size / (s13[_index])
            #         tempVal23 = sample_size / (s23[_index])
            #         fd.write(str(s123[_index]/tempVal1*tempVal2*tempVal3*(total_transactions-sample_size) + '\t' + s123[_index]/tempVal12*tempVal13*tempVal23*(total_transactions-sample_size) + '\n'))
            del estimates
            del observations
            # remove tmp files
            # os.remove(sample_freq_name)
            # os.remove(sample_file_name)
        else:
            print 'No abs errors!'
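# NOTE (editor's sketch, not part of the original code): the four confidence-interval
# blocks above repeat the same mean / sample-variance / standard-error / norm.interval
# recipe for the max-ent, extrapolation, independence and heuristic errors. A minimal
# helper along these lines could remove that duplication; the name summarise_errors and
# the ci argument are illustrative assumptions.
import math

from scipy.stats import norm, tvar


def summarise_errors(errors, sample_size, ci=0.95):
    """Return (mean error, sample variance, normal CI) for a list of per-triple errors.

    Mirrors the recipe used above: the interval is centred on the mean error and
    scaled by std_dev / sqrt(sample_size), i.e. the same standard error the
    original code uses.
    """
    mean_error = sum(errors) / float(len(errors))
    var_error = tvar(errors) if len(errors) > 1 else 0.0
    std_error = math.sqrt(var_error) / math.sqrt(sample_size)
    return mean_error, var_error, norm.interval(ci, loc=mean_error, scale=std_error)

# Example usage, mirroring the max-ent block above:
# avg_error, var_error, span_95 = summarise_errors(MAPE_errors, sample_size, ci=0.95)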
def ci_norm(data, confidence):
    """Width of the two-sided normal confidence interval around the mean of data."""
    mean = np.mean(data)
    sigma = np.std(data)
    v1, v2 = norm.interval(confidence, loc=mean, scale=sigma)
    return v2 - v1
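# NOTE (editor's sketch, not part of the original code): because the normal interval is
# symmetric around the mean, ci_norm's width is simply 2 * z * sigma, where z is the
# two-sided critical value for the given confidence. An equivalent formulation via
# norm.ppf (the helper name ci_norm_width is illustrative):
import numpy as np
from scipy.stats import norm


def ci_norm_width(data, confidence):
    """Same value as ci_norm(data, confidence): the width of the interval."""
    sigma = np.std(data)
    z = norm.ppf(0.5 + confidence / 2.0)  # two-sided critical value
    return 2.0 * z * sigma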
def est_all_data_disc_version(algorithm, tab_file, min_support=-30, iterations=1,
                              only_interesting_triples=False, restricted_triples=None, extra_id=''):
    from subprocess import call
    from parsers import Borgelt

    cv_start = time()

    # Create work folder
    _id = str(time()).replace('.', '') + '_' + extra_id
    path = '../tmp/cv_' + _id + '/'
    os.mkdir(path)

    print "\n### Running cross validation on ALL DATA cv_{}###".format(_id)

    total_transactions = 0
    for line in open(tab_file, 'rb'):
        total_transactions += 1
    print 'Total total_transactions: ', total_transactions

    sample_size = total_transactions
    avg_errors = []
    var_errors = []
    avg_errors_ext = []
    var_errors_ext = []
    avg_errors_heu = []
    var_errors_heu = []

    for index in range(iterations):
        borgelt_start = time()
        sample_freq_name = path + str(index) + '_sample_frequent_items.out'
        args = [algorithm, tab_file, sample_freq_name, '-s' + str(min_support), '-n3']
        call(args)
        print 'fpgrowth on sample data (ALL DATA) done: {} secs'.format(time() - borgelt_start)

        freq = Borgelt.read_frequent_items(sample_freq_name)

        # Create ds of all observed triplets,
        # saved with sorted keys for lookup and their frequency as value
        observed = {}
        count = 0
        for item in freq:
            if len(item[0]) == 3:
                sorted_trip = triple_sort(item[0])
                # * 2 is a horrible hack to make Forward calculate the
                # observed frequency correctly.
                observed[sorted_trip] = item[1][0] * 2
        print 'Total triplets observed:', len(observed)

        # Check any frequent items were found
        if not os.path.exists(sample_freq_name):
            print 'No frequent items found'
            print 'args', args
            continue

        min_support_trips = min_supported_trips(min_support, total_transactions)
        print 'Forward min_support_trips set to: ', min_support_trips

        triangles_start = time()
        triangle_tree, sample_triples = Forward.forward_compact(
            sample_freq_name, min_support_trips, observed,
            only_interesting_triples, restricted_triples)
        print 'Found triangles done: {}'.format(time() - triangles_start)
        # del sample_freq

        estimates = []
        extrapolations = []
        heurestics = []
        observations = []
        triplets = []
        MAPE_errors = []
        MAPE_errors_ext = []
        triangle_counts = []
        pair_triple_ratios = []

        # Recursion depth for the estimate to converge
        req_depth = int(math.log(total_transactions, 2)) + 1

        # DFS of the tree holding all triangles
        for n1 in triangle_tree.keys():
            s1, s2_dict = triangle_tree[n1]
            for n2 in s2_dict.keys():
                s2, s12, s3_dict = s2_dict[n2]
                for n3 in s3_dict.keys():
                    s3, s13, s23, s123 = s3_dict[n3]
                    triangle = (n1, n2, n3)
                    triplets.append(triangle)
                    triangle_counts.append((s1, s2, s3, s12, s13, s23, s123))
                    pair_triple_ratio = s123 / float(min(s12, s13, s23))
                    pair_triple_ratios.append(pair_triple_ratio)

                    # Observed is the triple support, since the sample is all data
                    obs = s123
                    # maxent estimate
                    est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13,
                                              float(total_transactions), num=req_depth)
                    # extrapolation estimate, does not make sense for all data
                    est2 = s123 / float(sample_size) * (total_transactions)
                    # heuristic: use max_ent when the sample triple count is 0; does not make sense for all data
                    # est3 = s123 == 0 and est or est2
                    estimates.append(est)
                    # extrapolations.append(est2)
                    # heurestics.append(est3)
                    observations.append(obs)
                    # (triplets was appended a second time here in the original, which
                    # misaligned the output rows; the duplicate append has been removed)

                    # max-ent error
                    error = abs(obs - est) / math.sqrt(obs)
                    MAPE_errors.append(error)
                    # extrapolation error
                    error2 = abs(obs - est2) / math.sqrt(obs)
                    MAPE_errors_ext.append(error2)
                    # heuristic error
                    # error3 = abs(obs - est3) / float(obs) * 100
                    # MAPE_errors_heu.append(error3)

        del triangle_tree
        del sample_triples

        if len(MAPE_errors) > 0:  # TODO handle this, probably when nothing has been found
            min_error = min(MAPE_errors)
            max_error = max(MAPE_errors)
            # max ent error
            avg_error = sum(MAPE_errors) / float(len(MAPE_errors))
            avg_errors.append(avg_error)
            # extrapolation error
            # avg_error_ext = sum(MAPE_errors_ext) / float(len(MAPE_errors_ext))
            # avg_errors_ext.append(avg_error_ext)
            # heuristic error
            # avg_error_heu = sum(MAPE_errors_heu) / float(len(MAPE_errors_heu))
            # avg_errors_heu.append(avg_error_heu)
            # variance (tvar is the sample variance, as in the sampled-data run)
            var_error = 0
            if len(MAPE_errors) > 1:
                var_error = tvar(MAPE_errors)
            # var_error_ext = tvar(MAPE_errors_ext)
            # var_error_heu = tvar(MAPE_errors_heu)
            # max_ent confidence interval
            std_dev = math.sqrt(var_error)
            std_error = std_dev / math.sqrt(sample_size)
            span_99 = norm.interval(0.99, avg_error, std_error)
            span_95 = norm.interval(0.95, avg_error, std_error)
            # ext confidence interval
            # std_dev_ext = math.sqrt(var_error_ext)
            # std_error_ext = std_dev_ext / math.sqrt(sample_size)
            # span_99_ext = norm.interval(0.99, avg_error_ext, std_error_ext)
            # span_95_ext = norm.interval(0.95, avg_error_ext, std_error_ext)
            # heuristic confidence interval
            # std_dev_heu = math.sqrt(var_error_heu)
            # std_error_heu = std_dev_heu / math.sqrt(sample_size)
            # span_99_heu = norm.interval(0.99, avg_error_heu, std_error_heu)
            # span_95_heu = norm.interval(0.95, avg_error_heu, std_error_heu)
            var_errors.append(var_error)
            # var_errors_ext.append(var_error_ext)
            # var_errors_heu.append(var_error_heu)

            res_string = "\nResult ALL DATA({}):\nSample size:{} triangles:{} test_data:{}\n".format(
                index, sample_size, len(estimates), sample_size)
            # log max ent result
            res_string += "avg_error:{} var_error:{}\n".format(avg_error, var_error)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95))
            # The ext and heuristic summaries are disabled above (they are not meaningful
            # when the sample is all data), so they are not logged here either.
            # res_string += 'avg_error_ext:{} var_error_ext:{}\n'.format(avg_error_ext, var_error_ext)
            # res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ext))
            # res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ext))
            # res_string += 'avg_error_heu:{} var_error_heu:{}\n'.format(avg_error_heu, var_error_heu)
            # res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_heu))
            # res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_heu))
            with open(path + 'log.txt', 'a') as log_file:
                log_file.write(res_string)
            print res_string

            # Write result data
            with open(path + str(index) + '_data.json', 'w') as fd:
                # triplet_key = ['triple' for t in estimates]
                # est_key = ['est' for t in estimates]
                # obs_key = ['obs' for t in observations]
                fd.write(json.dumps(zip(triplets, zip(estimates, observations))))
            # Columns: est, obs, n1, n2, n3, pair_trip_ratio, s1, s2, s3, s12, s13, s23, s123
            header = 'est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n'
            with open(path + str(index) + '_data.tsv', 'w') as fd:
                fd.write(header)
                for _index, i in enumerate(estimates):
                    row = [estimates[_index], observations[_index]]
                    row += list(triplets[_index]) + [pair_triple_ratios[_index]]
                    row += list(triangle_counts[_index])
                    fd.write('\t'.join(str(v) for v in row) + '\n')
            with open(path + str(index) + '_data_extrapolation.tsv', 'w') as fd:
                fd.write(header)
                # extrapolations is not filled for the all-data run (see above), so only the
                # header is written; iterating over extrapolations avoids an IndexError
                for _index, i in enumerate(extrapolations):
                    row = [extrapolations[_index], observations[_index]]
                    row += list(triplets[_index]) + [pair_triple_ratios[_index]]
                    row += list(triangle_counts[_index])
                    fd.write('\t'.join(str(v) for v in row) + '\n')

            del estimates
            del observations
            # remove tmp files
            # os.remove(sample_freq_name)
            # os.remove(sample_file_name)
        else:
            print 'No abs errors!'

    print "Cross validation done!"
    print "time: ", (time() - cv_start)
    if len(avg_errors) > 0:
        total_avg_error = sum(avg_errors) / float(len(avg_errors))
        total_res_string = "Avg error:{}".format(total_avg_error)
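# NOTE (editor's sketch, not part of the original code): the per-estimator TSV writers
# above all emit the same 13 tab-separated columns and differ only in which estimate
# list supplies the first column. A shared writer could look like this; the name
# write_estimate_tsv is an illustrative assumption.
def write_estimate_tsv(out_path, values, observations, triplets, ratios, counts):
    """Write est, obs, n1, n2, n3, pair_trip_ratio, s1, s2, s3, s12, s13, s23, s123 rows."""
    header = 'est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n'
    with open(out_path, 'w') as fd:
        fd.write(header)
        for i, est in enumerate(values):
            row = [est, observations[i]] + list(triplets[i]) + [ratios[i]] + list(counts[i])
            fd.write('\t'.join(str(v) for v in row) + '\n')

# Example usage, mirroring the '_data.tsv' writer above:
# write_estimate_tsv(path + str(index) + '_data.tsv', estimates, observations,
#                    triplets, pair_triple_ratios, triangle_counts)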