def switching_BER(data, **kwargs):
    """ Process data for BER experiment. """
    count_mat, start_stt = count_matrices_ber(data, **kwargs)
    switched_stt = int(1 - start_stt)
    mean = beta.mean(1 + count_mat[start_stt, switched_stt],
                     1 + count_mat[start_stt, start_stt])
    limit = beta.mean(
        1 + count_mat[start_stt, switched_stt] + count_mat[start_stt, start_stt],
        1)
    ci68 = beta.interval(0.68, 1 + count_mat[start_stt, switched_stt],
                         1 + count_mat[start_stt, start_stt])
    ci95 = beta.interval(0.95, 1 + count_mat[start_stt, switched_stt],
                         1 + count_mat[start_stt, start_stt])
    return mean, limit, ci68, ci95
def get_answer(self):
    # At the beginning of every turn, compute the first, second, and third
    # top guesses related to the clue.
    if self.start_turn:
        embedding_distances = self.compute_distance(self.clue, self.words)
        sorted_words = [k for k, v in sorted(embedding_distances.items(),
                                             key=lambda item: item[1])]
        print("Words sorted by embedding distances: ", sorted_words)

        # First closest word.
        self.first = sorted_words[0]
        self.state[self.first] = (self.state[self.first][0] + 5,
                                  self.state[self.first][1])
        # Second closest word. Guaranteed to exist.
        self.second = sorted_words[1]
        self.state[self.second] = (self.state[self.second][0] + 3,
                                   self.state[self.second][1])
        # Third closest word. Usually exists.
        self.third = sorted_words[2]
        self.state[self.third] = (self.state[self.third][0] + 2,
                                  self.state[self.third][1])
        self.start_turn = False

    print("state is: ", self.state)
    print("\n")
    sorted_by_beta = [k for k, v in sorted(
        self.state.items(),
        key=lambda item: -beta.mean(item[1][0], item[1][1]))]
    print("sorted_by_beta: ", sorted_by_beta)
    print("\n")

    # Note this guess may or may not be related to the current clue.
    self.guess = sorted_by_beta[0]
    self.guess_index = self.words.index(self.guess)
    print("Guess is: ", self.guess)
    self.num -= 1
    return self.guess
def beta_estimation(_, rewards):
    global GLOBAL_CACHE
    key = (tuple(rewards), 'beta')
    if key in GLOBAL_CACHE:
        return GLOBAL_CACHE[key]
    trials = len(rewards)
    GLOBAL_CACHE[key] = (
        beta.mean(1 + sum(rewards), trials + 1 - sum(rewards)),
        beta.std(1 + sum(rewards), trials + 1 - sum(rewards)))
    return GLOBAL_CACHE[key]
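# For context, a minimal standalone sketch of the same Beta(1, 1)-prior
# estimate without the cache (GLOBAL_CACHE and the unused first argument
# above are specific to the surrounding codebase; this helper name is
# illustrative only):
from scipy.stats import beta

def beta_posterior_stats(rewards):
    # Posterior mean/std of a Bernoulli success rate under a uniform prior:
    # the posterior after `wins` successes and `losses` failures is
    # Beta(1 + wins, 1 + losses).
    wins = sum(rewards)
    losses = len(rewards) - wins
    return beta.mean(1 + wins, 1 + losses), beta.std(1 + wins, 1 + losses)

print(beta_posterior_stats([1, 0, 1, 1]))  # approx (0.667, 0.178)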
def run_beta_fit(cadd_trset, mnp_cadd_trset, gerp_trset):
    '''
    from scipy import stats
    import numpy as np
    import matplotlib.pylab as plt

    # create some normal random noisy data
    ser = 50*np.random.rand() * np.random.normal(10, 10, 100) + 20

    # plot normed histogram
    plt.hist(ser, normed=True)

    # find minimum and maximum of xticks, so we know
    # where we should compute theoretical distribution
    xt = plt.xticks()[0]
    xmin, xmax = min(xt), max(xt)
    lnspc = np.linspace(xmin, xmax, len(ser))

    ab, bb, cb, db = stats.beta.fit(ser)
    pdf_beta = stats.beta.pdf(lnspc, ab, bb, cb, db)
    plt.plot(lnspc, pdf_beta, label="Beta")
    plt.show()
    '''
    cadd_trset_param = {}
    for aaconv in cadd_trset.keys():
        a, b, loc2, scale2 = beta.fit(cadd_trset[aaconv])
        mean2 = beta.mean(a, b, loc2, scale2)
        cadd_trset_param[aaconv] = [a, b, loc2, scale2, mean2]

    mnp_cadd_trset_param = {}
    for aaconv in mnp_cadd_trset.keys():
        a, b, loc2, scale2 = beta.fit(mnp_cadd_trset[aaconv])
        mean2 = beta.mean(a, b, loc2, scale2)
        mnp_cadd_trset_param[aaconv] = [a, b, loc2, scale2, mean2]

    gerp_trset_param = {}
    for aaconv in gerp_trset.keys():
        a, b, loc2, scale2 = beta.fit(gerp_trset[aaconv])
        mean2 = beta.mean(a, b, loc2, scale2)
        gerp_trset_param[aaconv] = [a, b, loc2, scale2, mean2]

    return cadd_trset_param, mnp_cadd_trset_param, gerp_trset_param
def test_transformation_composition_II(self):
    num_vars = 2
    alpha_stat = 5
    beta_stat = 2

    def beta_cdf(x):
        return beta_rv.cdf(x, a=alpha_stat, b=beta_stat)

    def beta_icdf(x):
        return beta_rv.ppf(x, a=alpha_stat, b=beta_stat)

    x_marginal_cdfs = [beta_cdf]*num_vars
    x_marginal_inv_cdfs = [beta_icdf]*num_vars
    x_marginal_means = np.asarray(
        [beta_rv.mean(a=alpha_stat, b=beta_stat)]*num_vars)
    x_marginal_stdevs = np.asarray(
        [beta_rv.std(a=alpha_stat, b=beta_stat)]*num_vars)

    def beta_pdf(x):
        return beta_rv.pdf(x, a=alpha_stat, b=beta_stat)

    x_marginal_pdfs = [beta_pdf]*num_vars

    z_correlation = -0.9*np.ones((num_vars, num_vars))
    for ii in range(num_vars):
        z_correlation[ii, ii] = 1.
    x_correlation = gaussian_copula_compute_x_correlation_from_z_correlation(
        x_marginal_inv_cdfs, x_marginal_means, x_marginal_stdevs,
        z_correlation)
    x_covariance = correlation_to_covariance(x_correlation, x_marginal_stdevs)

    var_trans_1 = NatafTransformation(
        x_marginal_cdfs, x_marginal_inv_cdfs, x_marginal_pdfs,
        x_covariance, x_marginal_means)

    # rosenblatt maps to [0,1] but polynomials of bounded variables
    # are in [-1,1] so add second transformation for this second mapping
    def normal_cdf(x):
        return normal_rv.cdf(x)

    def normal_icdf(x):
        return normal_rv.ppf(x)

    std_normal_marginal_cdfs = [normal_cdf]*num_vars
    std_normal_marginal_inv_cdfs = [normal_icdf]*num_vars

    var_trans_2 = UniformMarginalTransformation(
        std_normal_marginal_cdfs, std_normal_marginal_inv_cdfs)
    var_trans = TransformationComposition([var_trans_1, var_trans_2])

    num_samples = 1000
    true_samples, true_canonical_samples = \
        generate_x_samples_using_gaussian_copula(
            num_vars, z_correlation, x_marginal_inv_cdfs, num_samples)
    true_canonical_samples = normal_rv.cdf(true_canonical_samples)

    samples = var_trans.map_from_canonical_space(true_canonical_samples)
    assert np.allclose(true_samples, samples)

    canonical_samples = var_trans.map_to_canonical_space(samples)
    assert np.allclose(true_canonical_samples, canonical_samples)
def setUp(self):
    uniform_var1 = {'var_type': 'uniform', 'range': [-1, 1]}
    uniform_var2 = {'var_type': 'uniform', 'range': [0, 1]}
    beta_var1 = {'var_type': 'beta', 'range': [-1, 1],
                 'alpha_stat': 1, 'beta_stat': 1}
    beta_var2 = {'var_type': 'beta', 'range': [-2, 1],
                 'alpha_stat': 2, 'beta_stat': 1}
    gaussian_var = {'var_type': 'gaussian', 'mean': -1., 'variance': 4.}

    # self.continuous_variables = [
    #     uniform_var1, beta_var1, gaussian_var, uniform_var2, uniform_var1,
    #     beta_var2]
    self.continuous_variables = [
        uniform(-1, 2), beta(1, 1, -1, 2), norm(-1, 2), uniform(),
        uniform(-1, 2), beta(2, 1, -2, 3)]

    self.continuous_mean = np.array(
        [0., 0., -1, 0.5, 0., beta.mean(a=2, b=1, loc=-2, scale=3)])

    nmasses1 = 10
    mass_locations1 = np.geomspace(1.0, 32.0, num=nmasses1)
    masses1 = np.ones(nmasses1, dtype=float) / nmasses1

    nmasses2 = 10
    mass_locations2 = np.arange(0, nmasses2)
    masses2 = np.geomspace(1.0, 32.0, num=nmasses2)
    masses2 /= masses2.sum()

    # second () is to freeze variable which creates var.dist member variable
    var1 = float_rv_discrete(
        name='var1', values=(mass_locations1, masses1))()
    var2 = float_rv_discrete(
        name='var2', values=(mass_locations2, masses2))()
    self.discrete_variables = [var1, var2]
    self.discrete_mean = np.empty(len(self.discrete_variables))
    for ii, var in enumerate(self.discrete_variables):
        self.discrete_mean[ii] = var.moment(1)
def shots_to_obs_moments(bitarray: np.ndarray, qubits: List[int],
                         observable: PauliTerm,
                         use_beta_dist_unbiased_prior: bool = False) \
        -> Tuple[float, float]:
    """
    Calculate the mean and variance of the given observable based on the
    bitarray of results.

    :param bitarray: results from running `qc.run`, a 2D num_shots by
        num_qubits array.
    :param qubits: list of qubits in order corresponding to the bitarray
        results.
    :param observable: the observable whose moments are calculated from the
        shot data
    :param use_beta_dist_unbiased_prior: if true then the mean and variance
        are estimated from a beta distribution that incorporates an unbiased
        Bayes prior. This precludes var = 0.
    :return: tuple specifying (mean, variance)
    """
    coeff = complex(observable.coefficient)
    if not np.isclose(coeff.imag, 0):
        raise ValueError("The coefficient of an observable should not be complex.")
    coeff = coeff.real

    obs_qubits = [q for q, _ in observable]
    # Identify classical register indices to select
    idxs = [idx for idx, q in enumerate(qubits) if q in obs_qubits]

    if len(idxs) == 0:  # identity term
        return coeff, 0

    assert bitarray.shape[1] == len(qubits), \
        'qubits should label each column of the bitarray'

    # Pick columns corresponding to qubits with a non-identity out_operation
    obs_strings = bitarray[:, idxs]
    # Transform bits to eigenvalues; ie (+1, -1)
    my_obs_strings = 1 - 2 * obs_strings
    # Multiply row-wise to get operator values.
    obs_vals = np.prod(my_obs_strings, axis=1)

    if use_beta_dist_unbiased_prior:
        # For binary classified data with N counts of + and M counts of -,
        # these can be estimated using the mean and variance of the beta
        # distribution beta(N+1, M+1) where the +1 is used to incorporate an
        # unbiased Bayes prior.
        plus_array = obs_vals == 1
        n_minus, n_plus = np.bincount(plus_array, minlength=2)
        bernoulli_mean = beta.mean(n_plus + 1, n_minus + 1)
        bernoulli_var = beta.var(n_plus + 1, n_minus + 1)
        obs_mean, obs_var = transform_bit_moments_to_pauli(
            bernoulli_mean, bernoulli_var)
        obs_mean *= coeff
        obs_var *= coeff**2
    else:
        obs_vals = coeff * obs_vals
        obs_mean = np.mean(obs_vals).item()
        obs_var = np.var(obs_vals).item() / len(bitarray)

    return obs_mean, obs_var
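# A minimal numeric check of the beta-prior branch above in isolation,
# using synthetic counts (the 2B - 1 moment mapping here mirrors what
# transform_bit_moments_to_pauli presumably does for a +1 coefficient):
import numpy as np
from scipy.stats import beta

obs_vals = np.array([1] * 70 + [-1] * 30)    # synthetic ±1 eigenvalues
n_minus, n_plus = np.bincount(obs_vals == 1, minlength=2)
p_mean = beta.mean(n_plus + 1, n_minus + 1)  # 71/102, never exactly 0 or 1
p_var = beta.var(n_plus + 1, n_minus + 1)    # never exactly 0
pauli_mean = 2 * p_mean - 1                  # X = 2B - 1 for a Bernoulli B
pauli_var = 4 * p_var
print(pauli_mean, pauli_var)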
def get_mean_accuracy(all_means, nbins=10):
    """
    Bins ancestors according to mean bootstrapped posterior probability, and
    then returns the mean accuracy for each bin
    """
    ## Add a column of bin assignments
    # bins = np.linspace(0, all_means['posterior'].max(), nbins)
    bins = np.linspace(0, 1, nbins)
    all_means['bin'] = np.digitize(all_means['posterior'], bins)

    ## Add upper bound to right-most bin
    all_means.replace(to_replace={'bin': {nbins: nbins-1}}, inplace=True)

    ## Bin ancestors by mean bootstrapped probability, adding columns for
    ## whether they were the true generating ancestor, and the number of
    ## ancestors in each bin
    bin_count = lambda x: len(x)
    binned = all_means[['generator', 'bin']].pivot_table(
        index='bin', aggfunc=[np.mean, bin_count], fill_value=0)
    binned.columns = [['observed_prob', 'bin_count']]
    binned['n_successes'] = binned['observed_prob'].values * \
        binned['bin_count'].values

    ## Estimate means and confidence intervals as sampling from a binomial
    ## distribution, with a uniform prior on success rates - done using
    ## a beta distribution
    binned['alpha'] = binned['n_successes'] + 1
    binned['beta'] = binned['bin_count'].values - \
        binned['n_successes'].values + 1
    beta_mean = lambda row: beta.mean(float(row['alpha']), float(row['beta']))
    binned['posterior_mean'] = binned.apply(beta_mean, axis=1)

    ## Add confidence intervals
    beta_025CI = lambda row: beta.ppf(
        0.025, float(row['alpha']), float(row['beta']))
    beta_975CI = lambda row: beta.ppf(
        0.975, float(row['alpha']), float(row['beta']))
    binned['CI2.5'] = binned.apply(beta_025CI, axis=1)
    binned['CI97.5'] = binned.apply(beta_975CI, axis=1)

    ## Convert to values relative to mean, to fit plotting convention
    binned['CI2.5'] = binned['posterior_mean'].values - binned['CI2.5'].values
    binned['CI97.5'] = binned['CI97.5'].values - \
        binned['posterior_mean'].values

    ## Add column with bin centre for plotting
    binned['bin_centre'] = all_means[['posterior', 'bin']].groupby(
        'bin').mean()

    return binned
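# A quick standalone check of the binomial-with-uniform-prior estimate used
# above, independent of the pandas machinery (the counts are hypothetical):
from scipy.stats import beta

n_successes, bin_count = 8, 10
a, b = n_successes + 1, bin_count - n_successes + 1
print(beta.mean(a, b))                  # posterior mean: 9/13 ~ 0.692
print(beta.ppf([0.025, 0.975], a, b))   # central 95% credible interval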
def test_nataf_transformation(self):
    num_vars = 2
    alpha_stat = 2
    beta_stat = 5
    bisection_opts = {'tol': 1e-10, 'max_iterations': 100}

    def beta_cdf(x):
        return beta_rv.cdf(x, a=alpha_stat, b=beta_stat)

    def beta_icdf(x):
        return beta_rv.ppf(x, a=alpha_stat, b=beta_stat)

    x_marginal_cdfs = [beta_cdf]*num_vars
    x_marginal_inv_cdfs = [beta_icdf]*num_vars
    x_marginal_means = np.asarray(
        [beta_rv.mean(a=alpha_stat, b=beta_stat)]*num_vars)
    x_marginal_stdevs = np.asarray(
        [beta_rv.std(a=alpha_stat, b=beta_stat)]*num_vars)

    def beta_pdf(x):
        return beta_rv.pdf(x, a=alpha_stat, b=beta_stat)

    x_marginal_pdfs = [beta_pdf]*num_vars

    z_correlation = np.array([[1, 0.7], [0.7, 1]])
    x_correlation = \
        gaussian_copula_compute_x_correlation_from_z_correlation(
            x_marginal_inv_cdfs, x_marginal_means, x_marginal_stdevs,
            z_correlation)
    x_covariance = correlation_to_covariance(x_correlation, x_marginal_stdevs)

    var_trans = NatafTransformation(
        x_marginal_cdfs, x_marginal_inv_cdfs, x_marginal_pdfs,
        x_covariance, x_marginal_means, bisection_opts)
    assert np.allclose(var_trans.z_correlation, z_correlation)

    num_samples = 1000
    true_samples, true_canonical_samples = \
        generate_x_samples_using_gaussian_copula(
            num_vars, z_correlation, x_marginal_inv_cdfs, num_samples)

    canonical_samples = var_trans.map_to_canonical_space(true_samples)
    assert np.allclose(true_canonical_samples, canonical_samples)

    samples = var_trans.map_from_canonical_space(true_canonical_samples)
    assert np.allclose(true_samples, samples)
def beta_posterior(self, a, b):
    """
    A beta(a, b) prior on the proportion of disease cases: theta.
    """
    theta_lower = 0
    theta_upper = 1
    theta_range = np.linspace(theta_lower, theta_upper, 100)
    beta_posterior_distribution = beta.pdf(
        theta_range,
        self.no_disease_occurances + a,
        self.sample_size + b - self.no_disease_occurances)
    beta_posterior_mean = beta.mean(
        self.no_disease_occurances + a,
        self.sample_size + b - self.no_disease_occurances)
    return theta_range, beta_posterior_distribution, beta_posterior_mean
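# A concrete instance of the update above, assuming a uniform Beta(1, 1)
# prior and hypothetical counts of 3 cases in 20 samples:
from scipy.stats import beta

cases, n = 3, 20
a, b = 1, 1
print(beta.mean(cases + a, n + b - cases))   # posterior mean 4/22 ~ 0.182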
def survival_statistics(bitstrings):
    """
    Calculate the mean and variance of the estimated probability of the
    ground state given shot data on one or more bits.

    For binary classified data with N counts of 1 and M counts of 0, these
    can be estimated using the mean and variance of the beta distribution
    beta(N+1, M+1) where the +1 is used to incorporate an unbiased Bayes
    prior.

    :param ndarray bitstrings: A 2D numpy array of repetitions x bit-arrays.
    :return: (survival mean, sqrt(survival variance))
    """
    survived = np.sum(bitstrings, axis=1) == 0

    # count occurrences of 000...0 and anything besides 000...0
    n_died, n_survived = np.bincount(survived, minlength=2)

    # mean and variance given by beta distribution with a uniform prior
    survival_mean = beta.mean(n_survived + 1, n_died + 1)
    survival_var = beta.var(n_survived + 1, n_died + 1)
    return survival_mean, np.sqrt(survival_var)
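# A small usage sketch with synthetic shot data: three repetitions on two
# bits, two shots of 00 and one of 01, giving a Beta(3, 2) posterior.
import numpy as np

bits = np.array([[0, 0], [0, 1], [0, 0]])
mean, err = survival_statistics(bits)
print(mean, err)   # 0.6, 0.2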
def player_abilities(decay, day_span):
    query = {'mw': decay, 'day_span': day_span}
    projection = {'_id': 0, 'mw': 0, 'day_span': 0}

    mongo_wrapper = mongo.Mongo()
    cursor = mongo_wrapper.find(mongo_wrapper.PLAYERS_BETA, query, projection)
    abilities_df = pd.DataFrame(list(cursor))

    df = pd.concat([abilities_df.drop(['player'], axis=1),
                    abilities_df['player'].apply(pd.Series)], axis=1)
    df['mean'] = beta.mean(df.a, df.b)
    return df
def switching_phase_diagram(buffers, durations, volts, num_clusters=2):
    # Get an idea of SNR: cluster all the data into states, with initial
    # guesses for the cluster centres taken from the edges of the range
    all_vals = buffers.flatten()
    all_vals.resize((all_vals.size, 1))
    init_guess = np.linspace(np.min(all_vals), np.max(all_vals), num_clusters)
    init_guess[[1, -1]] = init_guess[[-1, 1]]
    init_guess.resize((num_clusters, 1))
    clusterer = KMeans(init=init_guess, n_clusters=num_clusters)
    state = clusterer.fit_predict(all_vals)

    # Report initial state distributions
    print("Total initial state distribution:")
    init_state = state[::2]
    for ct in range(num_clusters):
        print("\tState {}: {:.2f}%".format(
            ct, 100 * np.sum(init_state == ct) / len(init_state)))

    # Approximate SNR from centre distance and variance
    std0 = np.std(all_vals[state == 0])
    std1 = np.std(all_vals[state == 1])
    mean_std = 0.5 * (std0 + std1)
    centre0 = clusterer.cluster_centers_[0, 0]
    centre1 = clusterer.cluster_centers_[1, 0]
    centre_dist = centre1 - centre0
    print("Centre distance = {:.3f} with widths = {:.4f} / {:.4f} "
          "gives SNR ratio {:.3}".format(
              centre_dist, std0, std1, centre_dist / mean_std))

    # Have a look at the distributions
    plt.figure()
    for ct in range(num_clusters):
        sns.distplot(all_vals[state == ct], kde=False, norm_hist=False)

    # Calculate the switching matrix for each amplitude:
    #   0->0  0->1
    #   1->0  1->1
    counts = []
    for buf in buffers:
        state = clusterer.predict(buf.reshape((len(buf), 1)))
        init_state = state[::2]
        final_state = state[1::2]
        switched = np.logical_xor(init_state, final_state)
        count_mat = np.zeros((2, 2), dtype=int)
        count_mat[0, 0] = np.sum(
            np.logical_and(init_state == 0, np.logical_not(switched)))
        count_mat[0, 1] = np.sum(np.logical_and(init_state == 0, switched))
        count_mat[1, 0] = np.sum(np.logical_and(init_state == 1, switched))
        count_mat[1, 1] = np.sum(
            np.logical_and(init_state == 1, np.logical_not(switched)))
        counts.append(count_mat)

    mean_PtoAP = np.array(
        [beta.mean(1 + c[0, 1], 1 + c[0, 0]) for c in counts])
    limit_PtoAP = np.array(
        [beta.mean(1 + c[0, 1] + c[0, 0], 1) for c in counts])
    mean_APtoP = np.array(
        [beta.mean(1 + c[1, 0], 1 + c[1, 1]) for c in counts])
    limit_APtoP = np.array(
        [beta.mean(1 + c[1, 0] + c[1, 1], 1) for c in counts])
    ci68_PtoAP = np.array(
        [beta.interval(0.68, 1 + c[0, 1], 1 + c[0, 0]) for c in counts])
    ci68_APtoP = np.array(
        [beta.interval(0.68, 1 + c[1, 0], 1 + c[1, 1]) for c in counts])
    ci95_PtoAP = np.array(
        [beta.interval(0.95, 1 + c[0, 1], 1 + c[0, 0]) for c in counts])
    ci95_APtoP = np.array(
        [beta.interval(0.95, 1 + c[1, 0], 1 + c[1, 1]) for c in counts])

    # import h5py
    # FID = h5py.File("data/CSHE-Switching-PhaseDiagramPtoAP.h5", "w")
    # FID.create_dataset("/buffer", data=buffers, compression="lzf")
    # FID.create_dataset("/durations", data=durations, compression="lzf")
    # FID.create_dataset("/volts", data=volts, compression="lzf")
    # FID.close()

    plt.figure()
    plt.title("Phase Diagram - P to AP", size=16)
    plt.xlabel("Pulse Duration (ns)", size=14)
    plt.ylabel("Pulse Amplitude (V)", size=14)
    means_diagram_PtoAP = mean_PtoAP.reshape(
        len(volts), len(durations), order='F')
    plt.pcolormesh(durations * 1e9, volts, means_diagram_PtoAP, cmap="RdGy")
    plt.colorbar()

    plt.figure()
    plt.title("Phase Diagram - AP to P", size=16)
    plt.xlabel("Pulse Duration (ns)", size=14)
    plt.ylabel("Pulse Amplitude (V)", size=14)
    means_diagram_APtoP = mean_APtoP.reshape(
        len(volts), len(durations), order='F')
    plt.pcolormesh(durations * 1e9, volts, means_diagram_APtoP, cmap="RdGy")
    plt.colorbar()

    print("Reached end")
    plt.show()
    count_mat[0, 1] = np.sum(np.logical_and(init_state == 0, switched))
    count_mat[1, 0] = np.sum(np.logical_and(init_state == 1, switched))
    count_mat[1, 1] = np.sum(
        np.logical_and(init_state == 1, np.logical_not(switched)))
    counts.append(count_mat)

import h5py
FID = h5py.File("data/nTron-AmpFall-PhaseDiagram-10V-52us-HighRes.h5", "w")
FID.create_dataset("/buffer", data=buffers, compression="lzf")
FID.create_dataset("/fall_times", data=fall_times, compression="lzf")
FID.create_dataset("/amplitudes", data=amplitudes, compression="lzf")
FID.close()

mean_PtoAP = np.array([beta.mean(1 + c[0, 1], 1 + c[0, 0]) for c in counts])
mean_APtoP = np.array([beta.mean(1 + c[1, 0], 1 + c[1, 1]) for c in counts])

plt.figure()
plt.title("P to AP")
plt.xlabel("Pulse Falltime (ns)")
plt.ylabel("Pulse Amplitude (Arb. Units)")
means_diagram_PtoAP = mean_PtoAP.reshape(
    len(amplitudes), len(fall_times), order='F')
plt.pcolormesh(fall_times * 1e9, amplitudes, means_diagram_PtoAP, cmap="RdGy")
plt.colorbar()

plt.figure()
def test_correlated_beta(self):
    num_vars = 2
    alpha_stat = 2
    beta_stat = 5
    bisection_opts = {'tol': 1e-10, 'max_iterations': 100}

    beta_cdf = lambda x: beta_rv.cdf(x, a=alpha_stat, b=beta_stat)
    beta_icdf = lambda x: beta_rv.ppf(x, a=alpha_stat, b=beta_stat)
    x_marginal_cdfs = [beta_cdf] * num_vars
    x_marginal_inv_cdfs = [beta_icdf] * num_vars
    x_marginal_means = np.asarray(
        [beta_rv.mean(a=alpha_stat, b=beta_stat)] * num_vars)
    x_marginal_stdevs = np.asarray(
        [beta_rv.std(a=alpha_stat, b=beta_stat)] * num_vars)
    beta_pdf = lambda x: beta_rv.pdf(x, a=alpha_stat, b=beta_stat)
    x_marginal_pdfs = [beta_pdf] * num_vars

    x_correlation = np.array([[1, 0.7], [0.7, 1]])
    quad_rule = gauss_hermite_pts_wts_1D(11)
    z_correlation = transform_correlations(
        x_correlation, x_marginal_inv_cdfs, x_marginal_means,
        x_marginal_stdevs, quad_rule, bisection_opts)
    assert np.allclose(z_correlation[0, 1], z_correlation[1, 0])

    x_correlation_recovered = \
        gaussian_copula_compute_x_correlation_from_z_correlation(
            x_marginal_inv_cdfs, x_marginal_means, x_marginal_stdevs,
            z_correlation)
    assert np.allclose(x_correlation, x_correlation_recovered)

    z_variable = multivariate_normal(
        mean=np.zeros((num_vars)), cov=z_correlation)
    z_joint_density = lambda x: z_variable.pdf(x.T)
    target_density = partial(
        nataf_joint_density, x_marginal_cdfs=x_marginal_cdfs,
        x_marginal_pdfs=x_marginal_pdfs, z_joint_density=z_joint_density)

    # all variances are the same so
    # true_x_covariance = x_correlation.copy()*x_marginal_stdevs[0]**2
    true_x_covariance = correlation_to_covariance(
        x_correlation, x_marginal_stdevs)

    def univariate_quad_rule(n):
        x, w = np.polynomial.legendre.leggauss(n)
        x = (x + 1.) / 2.
        w /= 2.
        return x, w

    x, w = get_tensor_product_quadrature_rule(
        100, num_vars, univariate_quad_rule)
    assert np.allclose(np.dot(target_density(x), w), 1.0)

    # test the covariance computed by applying quadrature to the joint density
    mean = np.dot(x * target_density(x), w)
    x_covariance = np.empty((num_vars, num_vars))
    x_covariance[0, 0] = np.dot(x[0, :]**2 * target_density(x), w) - mean[0]**2
    x_covariance[1, 1] = np.dot(x[1, :]**2 * target_density(x), w) - mean[1]**2
    x_covariance[0, 1] = np.dot(
        x[0, :] * x[1, :] * target_density(x), w) - mean[0] * mean[1]
    x_covariance[1, 0] = x_covariance[0, 1]
    # error is influenced by bisection_opts['tol']
    assert np.allclose(x_covariance, true_x_covariance,
                       atol=bisection_opts['tol'])

    # test samples generated using Gaussian copula are correct
    num_samples = 10000
    x_samples, true_u_samples = generate_x_samples_using_gaussian_copula(
        num_vars, z_correlation, x_marginal_inv_cdfs, num_samples)
    x_sample_covariance = np.cov(x_samples)
    assert np.allclose(true_x_covariance, x_sample_covariance, atol=1e-2)

    u_samples = nataf_transformation(
        x_samples, true_x_covariance, x_marginal_cdfs, x_marginal_inv_cdfs,
        x_marginal_means, x_marginal_stdevs, bisection_opts)
    assert np.allclose(u_samples, true_u_samples)

    trans_samples = inverse_nataf_transformation(
        u_samples, x_covariance, x_marginal_cdfs, x_marginal_inv_cdfs,
        x_marginal_means, x_marginal_stdevs, bisection_opts)
    assert np.allclose(x_samples, trans_samples)
def mean(self, n, p):
    # beta.mean is a module-level scipy function, not a method bound to this
    # class, so `self` must not be passed through to it.
    mu = beta.mean(n, p)
    return mu
def get_overall_acc(self, weight):
    return np.dot(beta.mean(self._params[:, 0], self._params[:, 1]), weight)
def generate_beta_distribution_mean(alpha_val, beta_val):
    """
    Generates the mean of the beta distribution for the given alpha and
    beta values.
    """
    return beta.mean(alpha_val, beta_val)
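# For reference, scipy.stats.beta.mean(a, b) evaluates the closed form
# a / (a + b); a quick check:
from scipy.stats import beta

print(beta.mean(2.0, 6.0), 2.0 / (2.0 + 6.0))   # both 0.25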
ax.fill_between(mu_test, prior_mu, color='green', alpha=0.3)
ax.set_xlabel('$\mu$')
ax.set_ylabel('$p(\mu|\mathbf{x})$')

# pick random uniform point and update assumption
points = []
index = np.random.permutation(X.shape[0])
for i in range(0, X.shape[0]):
    y, a_n, b_n = posterior(a, b, X[:index[i]])
    plt.plot(mu_test, y, 'r', alpha=0.3)
    print(a_n, b_n)
    post_mean = beta.mean(a_n, b_n)
    prior_mean = beta.mean(a, b)
    points.append(post_mean - prior_mean)

y, _, _ = posterior(a, b, X)
plt.plot(mu_test, y, 'b', linewidth=4.0)

# q3
ax = fig.add_subplot(212)
xx = [i for i in range(0, len(points))]
plt.plot(xx, points)

plt.tight_layout()
plt.show()
# plt.savefig(path, transparent=True)
def naiveProbability(self, questionNumber, idx):
    expectedPerformance = list()
    individualResponse = list()
    probabilities = list()
    human_accuracy = list()
    machine_accuracy = [None for _ in range(self.numAgents)]
    group_accuracy = 0

    # Save human expected performance and responses
    for i in range(0, self.teamSize):
        expectedPerformance.append(beta.mean(self.alphas[i], self.betas[i]))
        individualResponse.append(
            self.lastIndividualResponsesbyQNo[
                (self.lastIndividualResponsesbyQNo["questionNumber"] == questionNumber)
                & (self.lastIndividualResponsesbyQNo["sender"] == self.teamMember.iloc[i])
            ]["stringValue"].any())
        self.updateAlphaBeta(
            i,
            self.lastIndividualResponsesbyQNo[
                (self.lastIndividualResponsesbyQNo["questionNumber"] == questionNumber)
                & (self.lastIndividualResponsesbyQNo["sender"] == self.teamMember.iloc[i])
            ]["stringValue"].any(),
            self.correctAnswers[idx])

        ans = self.lastIndividualResponsesbyQNo[
            (self.lastIndividualResponsesbyQNo["questionNumber"] == questionNumber)
            & (self.lastIndividualResponsesbyQNo["sender"] == self.teamMember.iloc[i])
        ]["stringValue"].any()
        if ans == self.correctAnswers[idx]:
            human_accuracy.append(1)
        else:
            human_accuracy.append(0)

    if (self.groupSubmission["groupAnswer"].iloc[idx] == self.correctAnswers[idx]):
        group_accuracy = 1

    indxQ = -1
    anyMachineAsked = False
    if (int(float(questionNumber)) in self.machineAskedQuestions):
        indxQ = self.machineAskedQuestions.index(int(float(questionNumber)))
        sender = self.machineAsked['sender'].iloc[indxQ]
        k = self.teamArray.index(sender)
        anyMachineAsked = True

    # Add expected performance for agents
    for i in range(self.teamSize, self.teamSize + self.numAgents):
        expectedPerformance.append(beta.mean(self.alphas[i], self.betas[i]))

    # Update alpha/beta and accuracy for the machine that was asked
    if (anyMachineAsked):
        self.updateAlphaBeta(
            self.getAgentForHuman(k),
            self.machineAsked['event_content'].iloc[indxQ].split("||")[0].split(":")[2].replace('"', ''),
            self.correctAnswers[idx])
        self.machineUseCount[k] += 1
        machineAnswer = self.machineAsked['event_content'].iloc[indxQ].split("||")[0].split(":")[2].replace('"', '').split("_")[0]
        if self.firstMachineUsage[k] == -1:
            self.firstMachineUsage[k] = idx
        machine_accuracy[k] = 1

    # Conditional probability: do a Bayes update over the answer options
    denominator = 0
    numerator = [1. for _ in range(len(self.options[idx]))]
    prob_class = 0.25
    prob_resp = 0
    prob_class_responses = [None for _ in range(len(self.options[idx]))]
    prob_resp_given_class = [None for _ in range(len(self.options[idx]))]

    for opt_num in range(0, len(self.options[idx])):
        prob_resp = 0
        numerator = prob_class
        for person_num in range(0, self.teamSize):
            if individualResponse[person_num] == self.options[idx][opt_num]:
                numerator *= expectedPerformance[person_num]
            else:
                numerator *= (1 - expectedPerformance[person_num]) / 3
        prob_resp += numerator
        prob_resp_given_class[opt_num] = numerator

    prob_class_responses = [
        (prob_resp_given_class[i] / sum(prob_resp_given_class))
        for i in range(0, len(prob_resp_given_class))]

    # ANS: Is this updating agent probabilities?
    for i in range(self.teamSize):
        probabilities.append(expectedPerformance[self.teamSize + i])

    # 8 probability values returned
    # first set is for options (sums to 1)
    assert(sum(prob_class_responses) > 0.999
           and sum(prob_class_responses) < 1.001)
    # second set is for machines
    prob_all_class_responses = prob_class_responses + \
        [expectedPerformance[self.getAgentForHuman(k)]
         for k in range(self.teamSize)]
    return prob_all_class_responses, human_accuracy, group_accuracy, machine_accuracy
def run():
    if len(sys.argv) < 3:
        print 'usage: python %s methylation_reads_all.tsv loglike_threshold outfile [-stranded +|-] [-minAbsLogLike float] [-minAbsPValue float] [-BayesianIntegration window(bp) step alpha beta pseudosamplesize] [-N6mAweight pseudosamplesize genome.fa] [-saveNewSingleMoleculeFile filename]' % sys.argv[0]
        print '\tNote: the [-BayesianIntegration] option requires the [-minAbsPValue] option'
        print '\tNote: the [-saveNewSingleMoleculeFile] option requires the [-BayesianIntegration] option'
        print '\tNote: the [-N6mAweight] option only works together with the -BayesianIntegration option'
        sys.exit(1)

    reads = sys.argv[1]
    LLthreshold = float(sys.argv[2])
    outfilename = sys.argv[3]

    doStranded = False
    if '-stranded' in sys.argv:
        doStranded = True
        WantedStrand = sys.argv[sys.argv.index('-stranded') + 1]
        print 'will only output coverage from reads on the', WantedStrand, 'strand'

    minAbsLogLike = 0
    if '-minAbsLogLike' in sys.argv:
        minAbsLogLike = float(sys.argv[sys.argv.index('-minAbsLogLike') + 1])
        print 'will ignore bases with absolute loglikelihood values less than', minAbsLogLike

    doP = False
    minAbsPValue = 0
    if '-minAbsPValue' in sys.argv:
        doP = True
        minAbsPValue = float(sys.argv[sys.argv.index('-minAbsPValue') + 1])
        print 'will ignore bases with p-values higher than', minAbsPValue, 'and lower than', 1 - minAbsPValue

    doSaveNewFile = False
    doBI = False
    if '-BayesianIntegration' in sys.argv:
        if not doP:
            print 'data not specified to be in probability space, exiting'
            sys.exit(1)
        doBI = True
        window = int(sys.argv[sys.argv.index('-BayesianIntegration') + 1])
        step = int(sys.argv[sys.argv.index('-BayesianIntegration') + 2])
        alph = float(sys.argv[sys.argv.index('-BayesianIntegration') + 3])
        bet = float(sys.argv[sys.argv.index('-BayesianIntegration') + 4])
        PSS = int(sys.argv[sys.argv.index('-BayesianIntegration') + 5])
        print 'will integrate accessibility probabilities over windows of', window, 'bp in size, step size', step, 'bp, using (', alph, bet, ') as beta priors, and a pseudosample size of', PSS
        if '-saveNewSingleMoleculeFile' in sys.argv:
            doSaveNewFile = True
            NewFile = open(sys.argv[sys.argv.index('-saveNewSingleMoleculeFile') + 1], 'w')
            print 'will save integrated basepair accessibility probabilities into a new file:', sys.argv[sys.argv.index('-saveNewSingleMoleculeFile') + 1]

    doAweight = False
    if '-N6mAweight' in sys.argv:
        doAweight = True
        N6mAweight = int(sys.argv[sys.argv.index('-N6mAweight') + 1])
        genome_fasta = sys.argv[sys.argv.index('-N6mAweight') + 2]
        print 'will use a different weight for N6mA', sys.argv[sys.argv.index('-saveNewSingleMoleculeFile') + 1]
        GenomeDict = {}
        sequence = ''
        inputdatafile = open(genome_fasta)
        for line in inputdatafile:
            if line[0] == '>':
                if sequence != '':
                    GenomeDict[chr] = ''.join(sequence).upper()
                chr = line.strip().split('>')[1]
                sequence = []
                continue
            else:
                sequence.append(line.strip())
        GenomeDict[chr] = ''.join(sequence).upper()

    CoverageDict = {}

    if reads.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + reads
    elif reads.endswith('.gz') or reads.endswith('.bgz'):
        cmd = 'zcat ' + reads
    elif reads.endswith('.zip'):
        cmd = 'unzip -p ' + reads
    else:
        cmd = 'cat ' + reads

    RN = 0
    P = os.popen(cmd, "r")
    line = 'line'
    while line != '':
        line = P.readline().strip()
        if line == '':
            break
        if line.startswith('chromosome\tstart\tend\tread_name'):
            continue
        fields = line.strip().split('\t')
        if len(fields) < 8:
            print 'skipping:', fields
            continue
        RN += 1
        if RN % 10000 == 0:
            print RN, 'lines processed'
        chr = fields[0]
        strand = fields[3]
        if doStranded:
            if strand != WantedStrand:
                continue
        Ps = fields[6].split(',')
        LLs = fields[7].split(',')
        if CoverageDict.has_key(chr):
            pass
        else:
            CoverageDict[chr] = {}
        if doBI:
            PDict = {}
            for i in range(len(Ps)):
                pos = int(Ps[i])
                p = float(LLs[i])
                PDict[pos] = p
            positions = PDict.keys()
            minPos = min(positions)
            maxPos = max(positions)
            if doSaveNewFile:
                NewPos = []
                NewLLs = []
            for pos in range(minPos + window / 2, maxPos - window / 2, step):
                (A, B) = (alph, bet)
                for j in range(pos - window / 2, pos + window / 2):
                    if PDict.has_key(j):
                        p = PDict[j]
                        if doAweight:
                            if strand == '+' and GenomeDict[chr][j] == 'A':
                                Z = int(N6mAweight * p)
                                A = A + Z
                                B = B + N6mAweight - Z
                            elif strand == '-' and GenomeDict[chr][j] == 'T':
                                Z = int(N6mAweight * p)
                                A = A + Z
                                B = B + N6mAweight - Z
                            else:
                                Z = int(PSS * p)
                                A = A + Z
                                B = B + PSS - Z
                        else:
                            Z = int(PSS * p)
                            A = A + Z
                            B = B + PSS - Z
                final_p = beta.mean(A, B)
                newpos = pos - (pos % step)
                if doSaveNewFile:
                    NewPos.append(str(newpos))
                    NewLLs.append("{0:.2f}".format(final_p))
                if final_p > minAbsPValue and final_p < 1 - minAbsPValue:
                    continue
                else:
                    pass
                if CoverageDict[chr].has_key(newpos):
                    pass
                else:
                    CoverageDict[chr][newpos] = [0, 0]
                if final_p < LLthreshold:
                    CoverageDict[chr][newpos][1] += 1
                else:
                    CoverageDict[chr][newpos][0] += 1
            if doSaveNewFile:
                outline = fields[0] + '\t' + fields[1] + '\t' + fields[2] + '\t' + fields[3] + '\t' + fields[4] + '\t' + fields[5]
                outline = outline + '\t' + ','.join(NewPos) + '\t' + ','.join(NewLLs)
                NewFile.write(outline + '\n')
        else:
            try:
                for i in range(len(Ps)):
                    pos = int(Ps[i])
                    ll = float(LLs[i])
                    if math.fabs(ll) >= minAbsLogLike:
                        pass
                    else:
                        continue
                    if doP:
                        if ll > minAbsPValue and ll < 1 - minAbsPValue:
                            continue
                        else:
                            pass
                    if CoverageDict[chr].has_key(pos):
                        pass
                    else:
                        CoverageDict[chr][pos] = [0, 0]
                    if ll < LLthreshold:
                        CoverageDict[chr][pos][1] += 1
                    else:
                        CoverageDict[chr][pos][0] += 1
            except:
                print 'skipping read'
                print fields
                continue

    print 'finished inputting reads'

    if doSaveNewFile:
        NewFile.close()

    chromosomes = CoverageDict.keys()
    chromosomes.sort()

    outfile = open(outfilename, 'w')
    outline = '#chr\tstart\tend\tmeth\tunmeth\tcov'
    outfile.write(outline + '\n')

    K = 0
    for chr in chromosomes:
        print chr
        positions = CoverageDict[chr].keys()
        positions.sort()
        for pos in positions:
            outline = chr + '\t' + str(pos) + '\t' + str(pos + 1)
            outline = outline + '\t' + str(CoverageDict[chr][pos][1])
            outline = outline + '\t' + str(CoverageDict[chr][pos][0])
            outline = outline + '\t' + str(CoverageDict[chr][pos][0] + CoverageDict[chr][pos][1])
            outfile.write(outline + '\n')

    outfile.close()
def run():
    if len(sys.argv) < 9:
        print 'usage: python %s methylation_reads_all.tsv peaks chrFieldID leftFieldID rightFieldID minCoverage maxDist tabix_location outfile [-subsample N]' % sys.argv[0]
        print '\tNote: the script assumes Tombo 1.3 probabilities, a tabix indexed reads file, and uses a beta distribution prior of (10,10) by default'
        print '\tNote: the subsample option will sample the reads in all comparisons down to the minCoverage level; the N parameter indicates how many such subsamplings should be averaged for the final value'
        sys.exit(1)

    reads = sys.argv[1]
    peaks = sys.argv[2]
    chrFieldID = int(sys.argv[3])
    leftFieldID = int(sys.argv[4])
    rightFieldID = int(sys.argv[5])
    minCov = int(sys.argv[6])
    maxDist = int(sys.argv[7])
    tabix = sys.argv[8]
    outfilename = sys.argv[9]

    doSS = False
    if '-subsample' in sys.argv:
        SS = int(sys.argv[sys.argv.index('-subsample') + 1])
        doSS = True
        print 'will subsample all comparisons down to', minCov, 'reads'
        print 'will take the average outcome of', SS, 'subsamplings'

    alph = 10
    bet = 10
    PSS = 100

    PeakDict = {}
    if peaks.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + peaks
    elif peaks.endswith('.gz') or peaks.endswith('.bgz'):
        cmd = 'zcat ' + peaks
    elif peaks.endswith('.zip'):
        cmd = 'unzip -p ' + peaks
    else:
        cmd = 'cat ' + peaks

    RN = 0
    P = os.popen(cmd, "r")
    line = 'line'
    while line != '':
        line = P.readline().strip()
        if line == '':
            break
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        chr = fields[chrFieldID]
        RL = int(fields[leftFieldID])
        RR = int(fields[rightFieldID])
        if PeakDict.has_key(chr):
            pass
        else:
            PeakDict[chr] = []
        PeakDict[chr].append((RL, RR))

    print 'finished inputting peaks'

    outfile = open(outfilename, 'w')
    outline = '#chr\tpeak1_left\tpeak1_right\tpeak1_open\tpeak1_closed\tpeak1_fraction\tpeak2_left\tpeak2_right\tpeak2_open\tpeak2_closed\tpeak2_fraction\tp_val'
    outfile.write(outline + '\n')

    for chr in PeakDict.keys():
        PeakDict[chr].sort()
        for i in range(len(PeakDict[chr]) - 1):
            (RL1, RR1) = PeakDict[chr][i]
            for j in range(i + 1, len(PeakDict[chr])):
                (RL2, RR2) = PeakDict[chr][j]
                print chr, RL1, RR1, RL2, RR2,
                if RR2 - RL1 > maxDist:
                    break
                cmd = tabix + ' ' + reads + ' ' + chr + ':' + str(RL1) + '-' + str(RR2)
                p = os.popen(cmd, "r")
                RegionReads = []
                line = 'line'
                while line != '':
                    line = p.readline().strip()
                    if line == '':
                        break
                    fields = line.strip().split('\t')
                    chr = fields[0]
                    read_left = int(fields[1])
                    read_right = int(fields[2])
                    if read_left <= RL1 and read_right >= RR2:
                        pass
                    else:
                        continue
                    strand = fields[3]
                    read = fields[4]
                    cgs = fields[6].split(',')
                    loglike = fields[7].split(',')
                    RegionReads.append((cgs, loglike))
                print len(RegionReads)
                if len(RegionReads) < minCov:
                    continue
                if doSS:
                    p_av = 0.0
                    CR1_av = 0.0
                    OR1_av = 0.0
                    CR2_av = 0.0
                    OR2_av = 0.0
                    for S in range(SS):
                        RegionReadsSubSampled = random.sample(RegionReads, minCov)
                        OpenOrClosed = []
                        for (cgs, loglike) in RegionReadsSubSampled:
                            t = zip(cgs, loglike)
                            RD = dict((int(x), float(y)) for x, y in t)
                            (A, B) = (alph, bet)
                            for pos in range(RL1, RR1):
                                if RD.has_key(pos):
                                    p = RD[pos]
                                    Z = int(PSS * p)
                                    A = A + Z
                                    B = B + PSS - Z
                            if beta.mean(A, B) > 0.5:
                                final_p1 = 1
                            else:
                                final_p1 = 0
                            (A, B) = (alph, bet)
                            for pos in range(RL2, RR2):
                                if RD.has_key(pos):
                                    p = RD[pos]
                                    Z = int(PSS * p)
                                    A = A + Z
                                    B = B + PSS - Z
                            if beta.mean(A, B) > 0.5:
                                final_p2 = 1
                            else:
                                final_p2 = 0
                            OpenOrClosed.append((final_p1, final_p2))
                        C00 = OpenOrClosed.count((0, 0))
                        C01 = OpenOrClosed.count((0, 1))
                        C10 = OpenOrClosed.count((1, 0))
                        C11 = OpenOrClosed.count((1, 1))
                        CR1 = C01 + C00
                        OR1 = C10 + C11
                        CR2 = C10 + C00
                        OR2 = C01 + C11
                        oddsratio, pvalue = fisher_exact([[C00, C01], [C10, C11]])
                        logp = -math.log10(pvalue)
                        p_av += logp
                        CR1_av += CR1
                        OR1_av += OR1
                        CR2_av += CR2
                        OR2_av += OR2
                    (pvalue, CR1, OR1, CR2, OR2) = (p_av / SS, CR1_av / SS, OR1_av / SS, CR2_av / SS, OR2_av / SS)
                    outline = chr + '\t' + str(RL1) + '\t' + str(RR1)
                    outline = outline + '\t' + str(OR1) + '\t' + str(CR1) + '\t' + str(OR1 / (OR1 + CR1 + 0.0))
                    outline = outline + '\t' + str(RL2) + '\t' + str(RR2)
                    outline = outline + '\t' + str(OR2) + '\t' + str(CR2) + '\t' + str(OR2 / (OR2 + CR2 + 0.0)) + '\t' + str(pvalue)
                    outfile.write(outline + '\n')
                else:
                    OpenOrClosed = []
                    for (cgs, loglike) in RegionReads:
                        t = zip(cgs, loglike)
                        RD = dict((int(x), float(y)) for x, y in t)
                        (A, B) = (alph, bet)
                        for pos in range(RL1, RR1):
                            if RD.has_key(pos):
                                p = RD[pos]
                                Z = int(PSS * p)
                                A = A + Z
                                B = B + PSS - Z
                        if beta.mean(A, B) > 0.5:
                            final_p1 = 1
                        else:
                            final_p1 = 0
                        (A, B) = (alph, bet)
                        for pos in range(RL2, RR2):
                            if RD.has_key(pos):
                                p = RD[pos]
                                Z = int(PSS * p)
                                A = A + Z
                                B = B + PSS - Z
                        if beta.mean(A, B) > 0.5:
                            final_p2 = 1
                        else:
                            final_p2 = 0
                        OpenOrClosed.append((final_p1, final_p2))
                    C00 = OpenOrClosed.count((0, 0))
                    C01 = OpenOrClosed.count((0, 1))
                    C10 = OpenOrClosed.count((1, 0))
                    C11 = OpenOrClosed.count((1, 1))
                    OR1 = C01 + C00
                    CR1 = C10 + C11
                    OR2 = C10 + C00
                    CR2 = C01 + C11
                    oddsratio, pvalue = fisher_exact([[C00, C01], [C10, C11]])
                    outline = chr + '\t' + str(RL1) + '\t' + str(RR1)
                    outline = outline + '\t' + str(OR1) + '\t' + str(CR1) + '\t' + str(OR1 / (OR1 + CR1 + 0.0))
                    outline = outline + '\t' + str(RL2) + '\t' + str(RR2)
                    if pvalue == 0:
                        pvalue = 1e-300
                    outline = outline + '\t' + str(OR2) + '\t' + str(CR2) + '\t' + str(OR2 / (OR2 + CR2 + 0.0)) + '\t' + str(-math.log10(pvalue))
                    outfile.write(outline + '\n')

    outfile.close()
from scipy.stats import beta
from scipy import special
from matplotlib import pyplot
import numpy as np
import math

a_prior = 2
b_prior = 10
a = 46
b = 240

print('Prior mean: ', beta.mean(a_prior, b_prior))
print('Posterior mean: ', beta.mean(a, b))

x = np.arange(0.0, 1.01, 0.01)
y_prior = beta.pdf(x, a_prior, b_prior)
y = beta.pdf(x, a, b)

ax = pyplot.subplot(111)
ax.plot(x, y_prior)
ax.plot(x, y)
# pyplot.show()

sample_size = 1000
r = beta.rvs(a, b, size=sample_size)
new_mean = np.mean(r)
new_std = np.std(r)

"""
z_critical is the number of std you'd have to go from the mean to capture
the proportion of data associated with the confidence level
"""
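# A minimal sketch of how the comment above could be completed, using
# scipy.stats.norm to turn a confidence level into a z-score (the 95% level
# chosen here is an assumption):
from scipy.stats import norm

confidence_level = 0.95
z_critical = norm.ppf(1 - (1 - confidence_level) / 2)   # ~1.96 for 95%
margin = z_critical * new_std / math.sqrt(sample_size)
print('{:.0%} CI for the sample mean: ({:.4f}, {:.4f})'.format(
    confidence_level, new_mean - margin, new_mean + margin))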
    state = clusterer.predict(buf.reshape((2*reps, 1)))
    init_state = state[::2]
    final_state = state[1::2]
    switched = np.logical_xor(init_state, final_state)

    count_mat = np.zeros((2, 2), dtype=int)
    count_mat[0, 0] = np.sum(
        np.logical_and(init_state == 0, np.logical_not(switched)))
    count_mat[0, 1] = np.sum(np.logical_and(init_state == 0, switched))
    count_mat[1, 0] = np.sum(np.logical_and(init_state == 1, switched))
    count_mat[1, 1] = np.sum(
        np.logical_and(init_state == 1, np.logical_not(switched)))
    counts.append(count_mat)

plt.figure()
mean_PtoAP = [beta.mean(1+c[0, 1], 1+c[0, 0]) for c in counts]
mean_APtoP = [beta.mean(1+c[1, 0], 1+c[1, 1]) for c in counts]
ci68_PtoAP = [beta.interval(0.68, 1+c[0, 1], 1+c[0, 0]) for c in counts]
ci68_APtoP = [beta.interval(0.68, 1+c[1, 0], 1+c[1, 1]) for c in counts]
ci95_PtoAP = [beta.interval(0.95, 1+c[0, 1], 1+c[0, 0]) for c in counts]
ci95_APtoP = [beta.interval(0.95, 1+c[1, 0], 1+c[1, 1]) for c in counts]

current_palette = sns.color_palette()
plt.plot(fall_times, mean_PtoAP)
plt.fill_between(fall_times,
                 [ci[0] for ci in ci68_PtoAP],
                 [ci[1] for ci in ci68_PtoAP],
                 color=current_palette[0], alpha=0.2, edgecolor="none")
plt.fill_between(fall_times,
                 [ci[0] for ci in ci95_PtoAP],
                 [ci[1] for ci in ci95_PtoAP],
                 color=current_palette[0], alpha=0.2, edgecolor="none")
plt.plot(fall_times, mean_APtoP)
plt.fill_between(fall_times,
                 [ci[0] for ci in ci68_APtoP],
                 [ci[1] for ci in ci68_APtoP],
                 color=current_palette[1], alpha=0.2, edgecolor="none")
plt.fill_between(fall_times,
                 [ci[0] for ci in ci95_APtoP],
                 [ci[1] for ci in ci95_APtoP],
                 color=current_palette[1], alpha=0.2, edgecolor="none")
plt.xlabel("nTron Pulse Fall Time (s)")
plt.ylabel("Switching Probability")
plt.legend(("P->AP", "AP->P"))
def run():
    if len(sys.argv) < 10:
        print 'usage: python %s methylation_reads_all.tsv peaks chrFieldID leftFieldID rightFieldID minCoverage maxDist N_samplings tabix_location outfile [-subsample N] [-quantiles N]' % sys.argv[0]
        print '\tNote: the script assumes Tombo 1.3 probabilities, a tabix indexed reads file, and uses a beta distribution prior of (10,10) by default'
        print '\tNote: the subsample option will sample the reads in all comparisons down to the minCoverage level; the N parameter indicates how many such subsamplings should be averaged for the final value'
        print '\tNote: the subsample option IS REQUIRED AT THE MOMENT'
        sys.exit(1)

    reads = sys.argv[1]
    peaks = sys.argv[2]
    chrFieldID = int(sys.argv[3])
    leftFieldID = int(sys.argv[4])
    rightFieldID = int(sys.argv[5])
    minCov = int(sys.argv[6])
    maxDist = int(sys.argv[7])
    Nsamp = int(sys.argv[8])
    tabix = sys.argv[9]
    outfilename = sys.argv[10]

    QU = 5
    if '-quantiles' in sys.argv:
        QU = int(sys.argv[sys.argv.index('-quantiles') + 1])
        print 'will split reads into', QU, 'quantiles/bins instead of the default 5'

    doSS = False
    if '-subsample' in sys.argv:
        SS = int(sys.argv[sys.argv.index('-subsample') + 1])
        doSS = True
        print 'will subsample all comparisons down to', minCov, 'reads'
        print 'will take the average outcome of', SS, 'subsamplings'
        if minCov % QU != 0:
            print 'minCov value must be divisible by the number of quantiles, which is', QU, ', exiting'
            sys.exit(1)

    alph = 10
    bet = 10
    PSS = 100

    PeakDict = {}
    if peaks.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + peaks
    elif peaks.endswith('.gz') or peaks.endswith('.bgz'):
        cmd = 'zcat ' + peaks
    elif peaks.endswith('.zip'):
        cmd = 'unzip -p ' + peaks
    else:
        cmd = 'cat ' + peaks

    RN = 0
    P = os.popen(cmd, "r")
    line = 'line'
    while line != '':
        line = P.readline().strip()
        if line == '':
            break
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        chr = fields[chrFieldID]
        RL = int(fields[leftFieldID])
        RR = int(fields[rightFieldID])
        if PeakDict.has_key(chr):
            pass
        else:
            PeakDict[chr] = []
        PeakDict[chr].append((RL, RR))

    print 'finished inputting peaks'

    outfile = open(outfilename, 'w')
    outline = '#chr\tpeak1_left\tpeak1_right\tpeak1_open\tpeak1_closed\tpeak1_fraction\tpeak2_left\tpeak2_right\tpeak2_open\tpeak2_closed\tpeak2_fraction\tFisher_test_p_val\tEmpirical_p-val\tMax_upper_empirical_p-val\tMax_lower_empirical_p-val\tNMI'
    outfile.write(outline + '\n')

    for chr in PeakDict.keys():
        PeakDict[chr].sort()
        for i in range(len(PeakDict[chr]) - 1):
            (RL1, RR1) = PeakDict[chr][i]
            for j in range(i + 1, len(PeakDict[chr])):
                (RL2, RR2) = PeakDict[chr][j]
                print 'testing:', chr, RL1, RR1, RL2, RR2
                if RR2 - RL1 > maxDist:
                    break
                cmd = tabix + ' ' + reads + ' ' + chr + ':' + str(RL1) + '-' + str(RR2)
                p = os.popen(cmd, "r")
                RegionReads = []
                line = 'line'
                while line != '':
                    line = p.readline().strip()
                    if line == '':
                        break
                    fields = line.strip().split('\t')
                    chr = fields[0]
                    read_left = int(fields[1])
                    read_right = int(fields[2])
                    if read_left <= RL1 and read_right >= RR2:
                        pass
                    else:
                        continue
                    strand = fields[3]
                    read = fields[4]
                    cgs = fields[6].split(',')
                    loglike = fields[7].split(',')
                    LLs = []
                    NLLs = sum(float(L) > 0.5 for L in loglike) / (len(loglike) + 0.0)
                    RegionReads.append((NLLs, cgs, loglike))
                print 'found:', len(RegionReads), 'reads', 'needed:', minCov, 'reads'
                if len(RegionReads) < minCov:
                    continue
                if doSS:
                    emp_p_av = 0.0
                    max_possible_upper_emp_p_av = 0.0
                    max_possible_lower_emp_p_av = 0.0
                    NMI_av = 0.0
                    p_av = 0.0
                    CR1_av = 0.0
                    OR1_av = 0.0
                    CR2_av = 0.0
                    OR2_av = 0.0
                    for S in range(SS):
                        RegionReadsSubSampled = random.sample(RegionReads, minCov)
                        OpenOrClosed = []
                        for (NLLs, cgs, loglike) in RegionReadsSubSampled:
                            t = zip(cgs, loglike)
                            RD = dict((int(x), float(y)) for x, y in t)
                            (A, B) = (alph, bet)
                            for pos in range(RL1, RR1):
                                if RD.has_key(pos):
                                    p = RD[pos]
                                    Z = int(PSS * p)
                                    A = A + Z
                                    B = B + PSS - Z
                            if beta.mean(A, B) > 0.5:
                                final_p1 = 1
                            else:
                                final_p1 = 0
                            (A, B) = (alph, bet)
                            for pos in range(RL2, RR2):
                                if RD.has_key(pos):
                                    p = RD[pos]
                                    Z = int(PSS * p)
                                    A = A + Z
                                    B = B + PSS - Z
                            if beta.mean(A, B) > 0.5:
                                final_p2 = 1
                            else:
                                final_p2 = 0
                            OpenOrClosed.append((NLLs, final_p1, final_p2))

                        # (Z1,Z2,Z3) = zip(*OpenOrClosed)
                        # print 'before sorting:', Z1
                        # OpenOrClosed.sort()
                        # (Z1,Z2,Z3) = zip(*OpenOrClosed)
                        # print 'after sorting:', Z1

                        # get empirical sampling distribution of
                        # coaccessibility values:
                        OpenOrClosed.sort()
                        ObservedMatches = []
                        STEP = len(OpenOrClosed) / QU
                        OMlist = []
                        for i in range(QU):
                            tempOMlist = []
                            for j in range(i * STEP, (i + 1) * STEP):
                                tempOMlist.append(OpenOrClosed[j])
                            [Z, P, Q] = zip(*tempOMlist)
                            P = list(P)
                            Q = list(Q)
                            OMlist.append((P, Q))
                        for NS in range(Nsamp):
                            ObsMatchesInChunks = 0
                            for (P, Q) in OMlist:
                                Ps = np.array(P)
                                Qs = np.array(Q)
                                random.shuffle(Ps)
                                random.shuffle(Qs)
                                ObsMatchesInChunks += np.sum(Ps == Qs)
                            ObservedMatches.append(ObsMatchesInChunks)

                        # calculate normalized mutual information on
                        # coaccessibility:
                        [Z, P, Q] = zip(*OpenOrClosed)
                        OpenOrClosed = zip(P, Q)
                        P = list(P)
                        Q = list(Q)
                        P = np.array(P)
                        Q = np.array(Q)
                        NMI = NMIS(P, Q)

                        # calculate empirical p-values (assuming a normal
                        # distribution of sampling coaccessibilities):
                        matches = np.sum(P == Q)
                        if matches < 0.5 * len(P):
                            NMI = -NMI
                        ObservedMatches = np.array(ObservedMatches)
                        OMm = np.mean(ObservedMatches)
                        OMstd = np.std(ObservedMatches)
                        if OMstd == 0:
                            OMstd = 0.1
                        if matches < OMm:
                            emp_p_val = norm.cdf(matches, OMm, OMstd)
                            if emp_p_val == 0:
                                emp_p_val = -300
                            else:
                                emp_p_val = math.log10(emp_p_val)
                        else:
                            emp_p_val = 1 - norm.cdf(matches, OMm, OMstd)
                            if emp_p_val == 0:
                                emp_p_val = 300
                            else:
                                emp_p_val = -math.log10(emp_p_val)

                        # calculate maximum possible p-values:
                        max_possible_upper_emp_p = 1 - norm.cdf(len(P), OMm, OMstd)
                        if max_possible_upper_emp_p == 0:
                            max_possible_upper_emp_p = 300
                        else:
                            max_possible_upper_emp_p = -math.log10(max_possible_upper_emp_p)
                        max_possible_lower_emp_p = norm.cdf(0, OMm, OMstd)
                        if max_possible_lower_emp_p == 0:
                            max_possible_lower_emp_p = -300
                        else:
                            max_possible_lower_emp_p = math.log10(max_possible_lower_emp_p)

                        # print matches, len(P), OMm, OMstd, np.mean(P), np.mean(Q)
                        # print norm.cdf(len(P),OMm,OMstd), norm.cdf(0,OMm,OMstd)
                        # print norm.cdf(matches,OMm,OMstd), 1 - norm.cdf(matches,OMm,OMstd)

                        # calculate Fisher test p-value:
                        C00 = OpenOrClosed.count((0, 0))
                        C01 = OpenOrClosed.count((0, 1))
                        C10 = OpenOrClosed.count((1, 0))
                        C11 = OpenOrClosed.count((1, 1))
                        CR1 = C01 + C00
                        OR1 = C10 + C11
                        CR2 = C10 + C00
                        OR2 = C01 + C11
                        oddsratio, pvalue = fisher_exact([[C00, C01], [C10, C11]])
                        logp = -math.log10(pvalue)

                        emp_p_av += emp_p_val
                        max_possible_upper_emp_p_av += max_possible_upper_emp_p
                        max_possible_lower_emp_p_av += max_possible_lower_emp_p
                        NMI_av += NMI
                        p_av += logp
                        CR1_av += CR1
                        OR1_av += OR1
                        CR2_av += CR2
                        OR2_av += OR2

                    (emp_p, max_possible_upper_emp_p, max_possible_lower_emp_p,
                     NMI, pvalue, CR1, OR1, CR2, OR2) = (
                        emp_p_av / SS, max_possible_upper_emp_p_av / SS,
                        max_possible_lower_emp_p_av / SS, NMI_av / SS,
                        p_av / SS, CR1_av / SS, OR1_av / SS, CR2_av / SS,
                        OR2_av / SS)
                    if str(emp_p) == 'nan':
                        print emp_p, max_possible_upper_emp_p, max_possible_lower_emp_p, NMI, pvalue, CR1, OR1, CR2, OR2
                    outline = chr + '\t' + str(RL1) + '\t' + str(RR1)
                    outline = outline + '\t' + str(OR1) + '\t' + str(CR1) + '\t' + str(OR1 / (OR1 + CR1 + 0.0))
                    outline = outline + '\t' + str(RL2) + '\t' + str(RR2)
                    outline = outline + '\t' + str(OR2) + '\t' + str(CR2) + '\t' + str(OR2 / (OR2 + CR2 + 0.0)) + '\t' + str(pvalue)
                    outline = outline + '\t' + str(emp_p)
                    outline = outline + '\t' + str(max_possible_upper_emp_p)
                    outline = outline + '\t' + str(max_possible_lower_emp_p)
                    outline = outline + '\t' + str(NMI)
                    outfile.write(outline + '\n')
                    print outline

                # else:
                #     OpenOrClosed = []
                #     for (cgs, loglike) in RegionReads:
                #         t = zip(cgs, loglike)
                #         RD = dict((int(x), float(y)) for x, y in t)
                #         (A, B) = (alph, bet)
                #         for pos in range(RL1, RR1):
                #             if RD.has_key(pos):
                #                 p = RD[pos]
                #                 Z = int(PSS * p)
                #                 A = A + Z
                #                 B = B + PSS - Z
                #         if beta.mean(A, B) > 0.5:
                #             final_p1 = 1
                #         else:
                #             final_p1 = 0
                #         (A, B) = (alph, bet)
                #         for pos in range(RL2, RR2):
                #             if RD.has_key(pos):
                #                 p = RD[pos]
                #                 Z = int(PSS * p)
                #                 A = A + Z
                #                 B = B + PSS - Z
                #         if beta.mean(A, B) > 0.5:
                #             final_p2 = 1
                #         else:
                #             final_p2 = 0
                #         OpenOrClosed.append((final_p1, final_p2))
                #     C00 = OpenOrClosed.count((0, 0))
                #     C01 = OpenOrClosed.count((0, 1))
                #     C10 = OpenOrClosed.count((1, 0))
                #     C11 = OpenOrClosed.count((1, 1))
                #     OR1 = C01 + C00
                #     CR1 = C10 + C11
                #     OR2 = C10 + C00
                #     CR2 = C01 + C11
                #     oddsratio, pvalue = fisher_exact([[C00, C01], [C10, C11]])
                #     outline = chr + '\t' + str(RL1) + '\t' + str(RR1)
                #     outline = outline + '\t' + str(OR1) + '\t' + str(CR1) + '\t' + str(OR1 / (OR1 + CR1 + 0.0))
                #     outline = outline + '\t' + str(RL2) + '\t' + str(RR2)
                #     if pvalue == 0:
                #         pvalue = 1e-300
                #     outline = outline + '\t' + str(OR2) + '\t' + str(CR2) + '\t' + str(OR2 / (OR2 + CR2 + 0.0)) + '\t' + str(-math.log10(pvalue))
                #     outfile.write(outline + '\n')

    outfile.close()
def switching_plots(buffers, axis1, num_clusters=2):
    # Get an idea of SNR: cluster all the data into states, with initial
    # guesses for the cluster centres taken from the edges of the range
    all_vals = buffers.flatten()
    all_vals.resize((all_vals.size, 1))
    init_guess = np.linspace(np.min(all_vals), np.max(all_vals), num_clusters)
    init_guess[[1, -1]] = init_guess[[-1, 1]]
    init_guess.resize((num_clusters, 1))
    clusterer = KMeans(init=init_guess, n_clusters=num_clusters)
    state = clusterer.fit_predict(all_vals)

    # Report initial state distributions
    print("Total initial state distribution:")
    init_state = state[::2]
    for ct in range(num_clusters):
        print("\tState {}: {:.2f}%".format(
            ct, 100 * np.sum(init_state == ct) / len(init_state)))

    # Approximate SNR from centre distance and variance
    std0 = np.std(all_vals[state == 0])
    std1 = np.std(all_vals[state == 1])
    mean_std = 0.5 * (std0 + std1)
    centre0 = clusterer.cluster_centers_[0, 0]
    centre1 = clusterer.cluster_centers_[1, 0]
    centre_dist = centre1 - centre0
    print("Centre distance = {:.3f} with widths = {:.4f} / {:.4f} "
          "gives SNR ratio {:.3}".format(
              centre_dist, std0, std1, centre_dist / mean_std))

    # Have a look at the distributions
    plt.figure()
    for ct in range(num_clusters):
        sns.distplot(all_vals[state == ct], kde=False, norm_hist=False)

    # Calculate the switching matrix for each amplitude:
    #   0->0  0->1
    #   1->0  1->1
    counts = []
    for buf in buffers:
        state = clusterer.predict(buf.reshape((len(buf), 1)))
        init_state = state[::2]
        final_state = state[1::2]
        switched = np.logical_xor(init_state, final_state)
        count_mat = np.zeros((2, 2), dtype=int)
        count_mat[0, 0] = np.sum(
            np.logical_and(init_state == 0, np.logical_not(switched)))
        count_mat[0, 1] = np.sum(np.logical_and(init_state == 0, switched))
        count_mat[1, 0] = np.sum(np.logical_and(init_state == 1, switched))
        count_mat[1, 1] = np.sum(
            np.logical_and(init_state == 1, np.logical_not(switched)))
        counts.append(count_mat)

    mean_PtoAP = np.array(
        [beta.mean(1 + c[0, 1], 1 + c[0, 0]) for c in counts])
    limit_PtoAP = np.array(
        [beta.mean(1 + c[0, 1] + c[0, 0], 1) for c in counts])
    mean_APtoP = np.array(
        [beta.mean(1 + c[1, 0], 1 + c[1, 1]) for c in counts])
    limit_APtoP = np.array(
        [beta.mean(1 + c[1, 0] + c[1, 1], 1) for c in counts])
    ci68_PtoAP = np.array(
        [beta.interval(0.68, 1 + c[0, 1], 1 + c[0, 0]) for c in counts])
    ci68_APtoP = np.array(
        [beta.interval(0.68, 1 + c[1, 0], 1 + c[1, 1]) for c in counts])
    ci95_PtoAP = np.array(
        [beta.interval(0.95, 1 + c[0, 1], 1 + c[0, 0]) for c in counts])
    ci95_APtoP = np.array(
        [beta.interval(0.95, 1 + c[1, 0], 1 + c[1, 1]) for c in counts])

    plt.figure()
    # volts = 7.5*np.power(10, (-5+attens)/20)
    current_palette = sns.color_palette()
    plt.plot(axis1, mean_PtoAP)
    plt.fill_between(axis1,
                     [ci[0] for ci in ci68_PtoAP],
                     [ci[1] for ci in ci68_PtoAP],
                     color=current_palette[0], alpha=0.2, edgecolor="none")
    plt.fill_between(axis1,
                     [ci[0] for ci in ci95_PtoAP],
                     [ci[1] for ci in ci95_PtoAP],
                     color=current_palette[0], alpha=0.2, edgecolor="none")
    plt.plot(axis1, mean_APtoP)
    plt.fill_between(axis1,
                     [ci[0] for ci in ci68_APtoP],
                     [ci[1] for ci in ci68_APtoP],
                     color=current_palette[1], alpha=0.2, edgecolor="none")
    plt.fill_between(axis1,
                     [ci[0] for ci in ci95_APtoP],
                     [ci[1] for ci in ci95_APtoP],
                     color=current_palette[1], alpha=0.2, edgecolor="none")
    # plt.xlabel("Pulse Amp (V)")
    plt.xlabel("Pulse Duration (ns)")
    plt.ylabel("Estimated Switching Probability")

    # plt.title("P to AP")
    # means_diagram_PtoAP = mean_PtoAP.reshape(len(attens), len(durations), order='F')
    # plt.pcolormesh(axis1, volts, means_diagram_PtoAP, cmap="RdGy")
    # plt.colorbar()
    # plt.figure()
    # plt.title("AP to P")
    # means_diagram_APtoP = mean_APtoP.reshape(len(attens), len(durations), order='F')
    # plt.pcolormesh(axis1, volts, means_diagram_APtoP, cmap="RdGy")
    # plt.colorbar()

    plt.figure()
    plt.semilogy(axis1, 1 - mean_PtoAP)
    plt.semilogy(axis1, 1 - limit_PtoAP,
                 color=current_palette[0], linestyle="--")
    plt.semilogy(axis1, 1 - mean_APtoP)
    plt.semilogy(axis1, 1 - limit_APtoP,
                 color=current_palette[1], linestyle="--")
    plt.ylabel("Switching Error Rate")
    plt.xlabel("Pulse Duration (ns)")
    plt.show()
# The variance = E[X**2] - (E[X])**2
exp_x_squared = np.sum(np.square(p_grid) * posterior)
std = np.sqrt(exp_x_squared - mu**2)
print(f'posterior mean = {mu}, posterior standard deviation = {std}')
norm_approx_posterior = norm.pdf(p_grid, loc=mu, scale=std)

# The Beta dist. is the conjugate prior of the binomial dist.
# More specifically, if X_1, ..., X_n are iid random variables from a
# Binomial dist. with parameter p, and p ~ Beta(a, b), then the posterior
# distribution of p given X_1 = x_1, ..., X_n = x_n is
# Beta(a + sum(x_1, ..., x_n), b + n - sum(x_1, ..., x_n)).
# Since Uniform(0, 1) = Beta(1, 1), the hyper-parameter update rule after
# observing water W times and land L times is a = W + 1 and b = L + 1.
W = 6
L = 3
beta_data = beta.pdf(p_grid, W + 1, L + 1)
beta_mu = beta.mean(W + 1, L + 1)
beta_std = beta.std(W + 1, L + 1)
norm_approx = norm.pdf(p_grid, beta_mu, beta_std)

# Plot both the analytically obtained posterior and the normal approximation
plt.plot(p_grid, beta_data, 'bo-', label='beta')
plt.plot(p_grid, norm_approx, 'ro-', label='normal')
plt.xlabel('Fraction of water')
plt.ylabel('Beta(W=6, L=3)')
plt.title(f'Sample = WLWWWLWLW; number of grid points = {NUM_PTS}')
plt.legend()
plt.show()
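# A self-contained sanity check of the conjugacy claim above: the mean of a
# grid-approximated posterior under a flat prior matches the analytic
# Beta(W + 1, L + 1) mean (the grid size is arbitrary):
import numpy as np
from scipy.stats import beta, binom

W, L = 6, 3
grid = np.linspace(0, 1, 1000)
post = binom.pmf(W, W + L, grid)    # flat prior: posterior proportional to likelihood
post /= post.sum()
print(np.sum(grid * post), beta.mean(W + 1, L + 1))   # both ~ 0.636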
def true_mean_beta():
    return beta.mean(a=0.5, b=1, loc=self.left, scale=self.right)
def run():
    if len(sys.argv) < 10:
        print 'usage: python %s methylation_reads_all.tsv region.bed chrFieldID leftFieldID rightFieldID minCoverage windowsize stepsize tabix_location outfileprefix [-subsample N] [-expectedMaxDist bp]' % sys.argv[0]
        print '\tNote: the script assumes Tombo 1.3 probabilities, and a tabix indexed reads file'
        print '\tNote: the [-subsample] option will sample the reads in all comparisons down to the minCoverage level; the N parameter indicates how many such subsamplings should be averaged for the final value'
        print '\tNote: the [-expectedMaxDist] option will change the initial window over which the required minimum number of reads is to be searched for; default: 2kb'
        sys.exit(1)

    reads = sys.argv[1]
    peaks = sys.argv[2]
    chrFieldID = int(sys.argv[3])
    leftFieldID = int(sys.argv[4])
    rightFieldID = int(sys.argv[5])
    minCov = int(sys.argv[6])
    window = int(sys.argv[7])
    step = int(sys.argv[8])
    tabix = sys.argv[9]
    outprefix = sys.argv[10]

    alph = 10
    bet = 10
    PSS = 100

    SS = 1
    doSS = False
    if '-subsample' in sys.argv:
        SS = int(sys.argv[sys.argv.index('-subsample') + 1])
        doSS = True
        print 'will subsample all comparisons down to', minCov, 'reads'
        print 'will take the average outcome of', SS, 'subsamplings'

    EMD = 2000
    if '-expectedMaxDist' in sys.argv:
        EMD = int(sys.argv[sys.argv.index('-expectedMaxDist') + 1])
        print 'will use an expected maximum distance of', EMD

    PeakDict = {}
    if peaks.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + peaks
    elif peaks.endswith('.gz') or peaks.endswith('.bgz'):
        cmd = 'zcat ' + peaks
    elif peaks.endswith('.zip'):
        cmd = 'unzip -p ' + peaks
    else:
        cmd = 'cat ' + peaks

    RN = 0
    P = os.popen(cmd, "r")
    line = 'line'
    while line != '':
        line = P.readline().strip()
        if line == '':
            break
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        chr = fields[chrFieldID]
        RL = int(fields[leftFieldID])
        RR = int(fields[rightFieldID])
        if PeakDict.has_key(chr):
            pass
        else:
            PeakDict[chr] = []
        PeakDict[chr].append((RL, RR))

    print 'finished inputting peaks'

    for chr in PeakDict.keys():
        PeakDict[chr].sort()
        for (left, right) in PeakDict[chr]:
            print chr, left, right
            Matrix = {}
            outfile = open(outprefix + '.' + chr + '_' + str(left) + '_' + str(right), 'w')
            for i in range(left, right, step):
                RC = 0
                cmd = tabix + ' ' + reads + ' ' + chr + ':' + str(i) + '-' + str(min(right, i + EMD))
                p = os.popen(cmd, "r")
                line = 'line'
                while line != '':
                    line = p.readline().strip()
                    if line == '':
                        break
                    fields = line.strip().split('\t')
                    read_left = int(fields[1])
                    read_right = int(fields[2])
                    if read_left <= i and read_right >= min(right, i + EMD):
                        RC += 1
                    else:
                        continue
                if RC >= minCov:
                    RCC = minCov
                    LJ = min(right, i + EMD)
                    while RCC >= minCov and LJ < right:
                        LJ += window
                        RRR = 0
                        cmd = tabix + ' ' + reads + ' ' + chr + ':' + str(i) + '-' + str(LJ)
                        p = os.popen(cmd, "r")
                        line = 'line'
                        while line != '':
                            line = p.readline().strip()
                            if line == '':
                                break
                            fields = line.strip().split('\t')
                            read_left = int(fields[1])
                            read_right = int(fields[2])
                            if read_left <= i and read_right >= LJ:
                                RRR += 1
                            else:
                                continue
                        RCC = RRR
                    rightLimit = LJ - window
                else:
                    RCC = RC
                    LJ = min(right, i + EMD)
                    while RCC <= minCov and LJ > i:
                        LJ = LJ - window
                        RRR = 0
                        cmd = tabix + ' ' + reads + ' ' + chr + ':' + str(i) + '-' + str(LJ)
                        p = os.popen(cmd, "r")
                        line = 'line'
                        while line != '':
                            line = p.readline().strip()
                            if line == '':
                                break
                            fields = line.strip().split('\t')
                            read_left = int(fields[1])
                            read_right = int(fields[2])
                            if read_left <= i and read_right >= LJ:
                                RRR += 1
                            else:
                                continue
                        RCC = RRR
                    rightLimit = LJ
                rightLimit = min(right, rightLimit)
                print chr, i, rightLimit

                RegionReads = []
                cmd = tabix + ' ' + reads + ' ' + chr + ':' + str(i) + '-' + str(rightLimit)
                p = os.popen(cmd, "r")
                line = 'line'
                while line != '':
                    line = p.readline().strip()
                    if line == '':
                        break
                    fields = line.strip().split('\t')
                    chr = fields[0]
                    read_left = int(fields[1])
                    read_right = int(fields[2])
                    if read_left <= i and read_right >= rightLimit:
                        pass
                    else:
                        continue
                    strand = fields[3]
                    read = fields[4]
                    cgs = fields[6].split(',')
                    loglike = fields[7].split(',')
                    RegionReads.append((cgs, loglike))

                AccDict = {}
                for j in range(i, rightLimit, window):
                    AccDict[j] = []
                    Matrix[j] = {}
                    for k in range(j, rightLimit, window):
                        Matrix[j][k] = 0
                print len(RegionReads)
                if len(RegionReads) < minCov:
                    continue
                for S in range(SS):
                    if doSS:
                        RegionReadsSubSampled = random.sample(RegionReads, minCov)
                    else:
                        RegionReadsSubSampled = RegionReads
                    for (cgs, loglike) in RegionReadsSubSampled:
                        t = zip(cgs, loglike)
                        RD = dict((int(x), float(y)) for x, y in t)
                        for j in range(i, rightLimit, window):
                            (A, B) = (alph, bet)
                            for pos in range(j, j + window):
                                if RD.has_key(pos):
                                    p = RD[pos]
                                    Z = int(PSS * p)
                                    A = A + Z
                                    B = B + PSS - Z
                            if beta.mean(A, B) > 0.5:
                                final_p = 1
                            else:
                                final_p = 0
                            AccDict[j].append(final_p)
                    for j in range(i, rightLimit, window):
                        for k in range(j, rightLimit, window):
                            JSDvalue = JSD(AccDict[j], AccDict[k])
                            Matrix[j][k] += JSDvalue / SS

            outline = '#'
            for i in range(left, right, window):
                outline = outline + '\t' + str(i)
            outfile.write(outline + '\n')
            for i in range(left, right, window):
                outline = str(i)
                for j in range(left, right, window):
                    if Matrix.has_key(i):
                        if Matrix[i].has_key(j):
                            outline = outline + '\t' + "{0:.2f}".format(Matrix[i][j])
                        else:
                            outline = outline + '\t' + 'nan'
                    else:
                        outline = outline + '\t' + 'nan'
                outfile.write(outline + '\n')
            outfile.close()
from scipy.stats import beta

a = 7.2
b = 2.3

m, v = beta.stats(a, b, moments="mv")
mean = beta.mean(a=a, b=b)
var = beta.var(a=a, b=b)
print("The mean is " + str(m))
print("The variance is " + str(v))

# P(X > 0.90) for X ~ Beta(a, b), via the survival side of the CDF
prob = 1 - beta.cdf(a=a, b=b, x=.90)
print("The probability of a value over 0.90 is " + str(prob))
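# If a credible interval is also wanted, beta.interval gives it directly
# (a small addition using the same a and b as above):
lo, hi = beta.interval(0.95, a=a, b=b)
print("95% of the mass lies between {:.3f} and {:.3f}".format(lo, hi))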
def mean(self):
    return beta.mean(self.a, self.b)