def test_confint_multinomial_proportions_zeros():
    # test when a count is zero or close to zero
    # values from R MultinomialCI
    ci01 = np.array([
        0.09364718, 0.1898413, 0.00000000, 0.0483581, 0.13667426, 0.2328684,
        0.10124019, 0.1974343, 0.10883321, 0.2050273, 0.17210833, 0.2683024,
        0.09870919, 0.1949033]).reshape(-1, 2)
    ci0 = np.array([
        0.09620253, 0.19238867, 0.00000000, 0.05061652, 0.13924051, 0.23542664,
        0.10379747, 0.19998360, 0.11139241, 0.20757854, 0.17468354, 0.27086968,
        0.10126582, 0.19745196]).reshape(-1, 2)

    # the shifts are the differences between "LOWER(SG)" "UPPER(SG)" and
    # "LOWER(C+1)" "UPPER(C+1)" in verbose printout
    # ci01_shift = np.array([0.002531008, -0.002515122])  # not needed
    ci0_shift = np.array([0.002531642, 0.002515247])

    p = [56, 0.1, 73, 59, 62, 87, 58]
    ci_01 = smprop.multinomial_proportions_confint(p, 0.05,
                                                   method='sison_glaz')
    p = [56, 0, 73, 59, 62, 87, 58]
    ci_0 = smprop.multinomial_proportions_confint(p, 0.05,
                                                  method='sison_glaz')

    assert_allclose(ci_01, ci01, atol=1e-5)
    assert_allclose(ci_0, np.maximum(ci0 - ci0_shift, 0), atol=1e-5)
    assert_allclose(ci_01, ci_0, atol=5e-4)
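# Hedged usage sketch (not part of the original test module): a direct call to
# statsmodels.stats.proportion.multinomial_proportions_confint with illustrative
# counts, showing the two methods exercised by the tests above.
import numpy as np
from statsmodels.stats.proportion import multinomial_proportions_confint

counts = [56, 10, 73, 59, 62, 87, 58]  # 7 categories, every count >= 5
ci_sison = multinomial_proportions_confint(counts, alpha=0.05, method='sison_glaz')
ci_goodman = multinomial_proportions_confint(counts, alpha=0.05, method='goodman')
print(ci_sison.shape, ci_goodman.shape)  # each is a (7, 2) array of [lower, upper] bounds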
def compute_multinomial_confidence_intervals(trace):
    indices = pymbar.timeseries.subsampleCorrelatedData(trace[::10, 0])
    confint_trace_USE = trace[indices]

    trace_model_0 = []
    trace_model_1 = []
    trace_model_2 = []
    for i in range(np.size(confint_trace_USE, 0)):
        if confint_trace_USE[i, 0] == 0:
            trace_model_0.append(confint_trace_USE[i])
            # log_trace_0.append(logp_trace[i])
        elif confint_trace_USE[i, 0] == 1:
            trace_model_1.append(confint_trace_USE[i])
            # log_trace_1.append(logp_trace[i])
        elif confint_trace_USE[i, 0] == 2:
            trace_model_2.append(confint_trace_USE[i])
            # log_trace_2.append(logp_trace[i])

    trace_model_0 = np.asarray(trace_model_0)
    trace_model_1 = np.asarray(trace_model_1)
    trace_model_2 = np.asarray(trace_model_2)

    counts = np.asarray([len(trace_model_0), len(trace_model_1), len(trace_model_2)])
    prob_conf = multinomial_proportions_confint(counts)

    return prob_conf
def test_df_loc(self):
    # tests the df subsetting including subsetting the index
    subsample_index = 2
    nc = 4
    nr = subsample_index * 16
    alpha = 0.01
    a = np.arange(0, nr * nc).reshape(nr, nc)
    df_all = pd.DataFrame(a, columns=np.arange(0, nc))
    df_sub = df_all.loc[::subsample_index, :]

    in_values = df_sub.values
    out_values = np.zeros((nr // subsample_index, 2 * nc))
    for nl in range(0, in_values.shape[0]):
        ci = multinomial_proportions_confint(in_values[nl, :], alpha=alpha,
                                             method='goodman')
        out_values[nl, 0:nc] = ci[:, 0]
        out_values[nl, nc:] = ci[:, 1]
    output_sm = pd.DataFrame(out_values,
                             columns=(gen_ci_label(df_all.columns, 'lb') +
                                      gen_ci_label(df_all.columns, 'ub')))

    out_df = multinomial_proportions_confint_df(df_sub, alpha=alpha)
    assert_allclose(output_sm.values, out_df.values, rtol=1e-07,
                    err_msg="test_df_loc")
    self.assertSequenceEqual(list(output_sm.columns), list(out_df.columns),
                             'test_df_loc: different columns')
    self.assertSequenceEqual(list(output_sm.index), list(out_df.index),
                             'test_df_loc: different indices')
def certification(self, model, data, label, data_adv=None, mask=None):
    if mask is None:
        bone_length = self.preprocess(data.data)
        mask = bone_length.abs().max(2)[0]
        mask = torch.sign(torch.max(mask - 1e-5, torch.zeros_like(mask))).float()
        mask = mask.view(data.size(0), 1, data.size(2), 1, data.size(4))
        mask = mask.repeat(1, data.size(1), 1, data.size(3), 1).cuda()

    counts = self.counts(data, model, mask).numpy()
    # max_counts = np.max(counts, axis=1)
    pred_label = np.argmax(counts, axis=1)

    p1, p2 = [], []
    for i in range(counts.shape[0]):
        p = multinomial_proportions_confint(np.sort(counts[i, :])[::-1],
                                            alpha=self.fail_prob)
        p1.append(p[0, 0])
        p2.append(p[1, 1])
    radius = np.maximum(
        0.5 * self.sigma * (norm.ppf(np.array(p1)) - norm.ppf(np.array(p2))), 0.0)

    if data_adv is not None:
        l2_norm = self.L2_distance(data.data, data_adv.data).cpu().numpy()
        return np.logical_and(
            np.greater(radius, l2_norm),
            np.equal(pred_label, label.data.cpu().numpy()))
    return np.where(np.equal(pred_label, label.data.cpu().numpy()), radius, -1)
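# Hedged sketch of the certification arithmetic used above, outside the class:
# sort the per-class vote counts, take simultaneous confidence bounds, then plug
# the lower bound of the top class (p1) and the upper bound of the runner-up (p2)
# into radius = 0.5 * sigma * (Phi^{-1}(p1) - Phi^{-1}(p2)). The counts, sigma and
# fail_prob below are illustrative assumptions, not values from the original code.
import numpy as np
from scipy.stats import norm
from statsmodels.stats.proportion import multinomial_proportions_confint

sigma, fail_prob = 0.5, 0.001
votes = np.array([870, 90, 40])               # per-class counts from randomized smoothing
ci = multinomial_proportions_confint(np.sort(votes)[::-1], alpha=fail_prob)
p1, p2 = ci[0, 0], ci[1, 1]                   # lower bound of top class, upper bound of runner-up
radius = max(0.5 * sigma * (norm.ppf(p1) - norm.ppf(p2)), 0.0)
print(radius)                                 # certified L2 radius for this illustrative draw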
def test_confint_multinomial_proportions():
    from .results.results_multinomial_proportions import res_multinomial

    for ((method, description), values) in res_multinomial.items():
        cis = multinomial_proportions_confint(values.proportions, 0.05,
                                              method=method)
        assert_almost_equal(
            values.cis, cis, decimal=values.precision,
            err_msg='"%s" method, %s' % (method, description))
def calculate_expansion_proportion(tree):
    """ Detects clonal expansion of all children at depth 1.

    :param tree: ete3.TreeNode, the root of the tree
    :return: A mapping from child to confidence interval
    """

    N = len(tree.get_leaf_names())
    props = {}
    for c in tree.children:
        props[c] = len(c.get_leaves())

    ci = multinomial_proportions_confint(list(props.values()), alpha=0.05)
    props = {c: interval for c, interval in zip(props.keys(), ci)}
    return props
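# Possible usage of calculate_expansion_proportion on a toy tree (a sketch; the
# newick string and leaf counts are illustrative, and multinomial_proportions_confint
# is assumed to be imported from statsmodels.stats.proportion in this module).
from ete3 import Tree

toy = Tree("((A1,A2,A3,A4,A5,A6),(B1,B2,B3,B4,B5),(C1,C2,C3,C4,C5,C6,C7));")
for child, (lower, upper) in calculate_expansion_proportion(toy).items():
    print(len(child.get_leaves()), round(lower, 3), round(upper, 3))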
def fit(self, data):
    """
    Parameters
    ----------
    data : array-like
        The data to use for the estimation

    Returns
    -------
    matrix : estimated transition matrix
    confint_lower : lower confidence interval
    confint_upper : upper confidence interval

    Notes
    -----
    """
    # loop over data (id, state_in, state_out)
    # calculate population count N^i_k per state i
    # calculate migrations count N^{ij}_{kl} from i to j
    # calculate transition matrix as ratio T^{ij}_{kl} = N^{ij}_{kl} / N^i_k
    # In the simple estimator all events are part of the same cohort

    state_count = self.states.cardinality
    state_list = self.states.get_states()

    # storage of counts
    tm_count = np.ndarray(state_count)
    tmn_count = np.ndarray((state_count, state_count))
    tm_count.fill(0.0)
    tmn_count.fill(0.0)

    i = 0
    for row in data.itertuples():
        state_in = state_list.index(row[2])
        state_out = state_list.index(row[3])
        # print(row[2], row[3])
        # print(state_in, state_out)
        tm_count[state_in] += 1
        tmn_count[state_in, state_out] += 1
        i += 1
    self.counts = int(tm_count.sum())

    # Confidence intervals for multinomial proportions, via statsmodels
    # (http://www.statsmodels.org/devel/_modules/statsmodels/stats/proportion.html):
    #
    # Parameters
    # ----------
    # counts : array_like of int, 1-D
    #     Number of observations in each category.
    # alpha : float in (0, 1), optional
    #     Significance level, defaults to 0.05.
    # method : {'goodman', 'sison-glaz'}, optional
    #     Method to use to compute the confidence intervals; available methods are:
    #     - `goodman`: based on a chi-squared approximation, valid if all
    #       values in `counts` are greater or equal to 5 [2]_
    #     - `sison-glaz`: less conservative than `goodman`, but only valid if
    #       `counts` has 7 or more categories (``len(counts) >= 7``) [3]_
    #
    # Returns
    # -------
    # confint : ndarray, 2-D
    #     Array of [lower, upper] confidence levels for each category, such that
    #     overall coverage is (approximately) `1-alpha`.
    confint_lower = np.ndarray((state_count, state_count, 1))
    confint_upper = np.ndarray((state_count, state_count, 1))
    for s1 in range(state_count):
        intervals = st.multinomial_proportions_confint(tmn_count[s1, :],
                                                       alpha=self.ci_alpha,
                                                       method=self.ci_method)
        for s2 in range(state_count):
            confint_lower[s1, s2, 0] = intervals[s2][0]
            confint_upper[s1, s2, 0] = intervals[s2][1]
    self.confint_lower = confint_lower
    self.confint_upper = confint_upper

    # Normalization of counts to produce family of probability matrices
    for s1 in range(state_count):
        for s2 in range(state_count):
            if tm_count[s1] > 0:
                tmn_count[(s1, s2)] = tmn_count[(s1, s2)] / tm_count[s1]

    # for s1 in range(state_count):
    #     for s2 in range(state_count):
    #         print(confint_lower[s1, s2], tmn_count[(s1, s2)], confint_upper[s1, s2])

    self.matrix_set.append(tmn_count)
    return self.matrix_set
def fit(self, data, labels=None):
    """
    Parameters
    ----------
    data : array-like
        The data to use for the estimation
    labels : a dictionary for relabeling column names

    Returns
    -------
    matrix : estimated transition matrix
    confint_lower : lower confidence interval
    confint_upper : upper confidence interval

    Notes
    -----
    """
    # loop over data (id, period, state)
    # calculate population count N^i_k per state i per period k
    # calculate migrations count N^{ij}_{kl} from i to j from period k to period l
    # calculate transition matrix as ratio T^{ij}_{kl} = N^{ij}_{kl} / N^i_k

    if labels is not None:
        state_label = labels['State']
        timestep_label = labels['Timestamp']
        id_label = labels['ID']
    else:
        state_label = 'State'
        id_label = 'ID'
        timestep_label = 'Timestep'

    state_dim = self.states.cardinality
    cohort_labels = data[timestep_label].unique()
    cohort_dim = len(cohort_labels) - 1
    event_count = data[id_label].count()

    # store data in 1d arrays for fast processing
    # capture nan events for missing observations
    event_exists = np.empty(event_count, int)
    event_id = np.empty(event_count, int)
    event_state = np.empty(event_count, int)
    event_time = np.empty(event_count, int)
    nan_count = 0
    i = 0
    # TODO read data by labels, not column location
    for row in data.itertuples():
        try:
            event_state[i] = int(row[3])
            event_id[i] = row[1]
            event_time[i] = int(row[2])
            event_exists[i] = 1
        except ValueError:
            nan_count += 1
        i += 1
    self.nans = nan_count

    # storage of counts
    # number of entities observed in given state per period
    tm_count = np.ndarray((state_dim, cohort_dim), int)
    # number of entities observed to transition from state to state per period
    tmn_count = np.ndarray((state_dim, state_dim, cohort_dim), int)
    tm_count.fill(0)
    tmn_count.fill(0)
    # normalized frequencies
    tmn_values = np.ndarray((state_dim, state_dim, cohort_dim), float)
    tmn_values.fill(0.0)

    # TODO Capture case if entity with only one observation (hence no transition count)
    for i in range(1, event_count - 1):
        if event_exists[i] == 1:
            # while processing event data from same entity
            if event_id[i + 1] == event_id[i]:
                tm_count[(event_state[i], event_time[i])] += 1
                tmn_count[(event_state[i], event_state[i + 1], event_time[i])] += 1
            # last data point from entity data
            # elif event_id[i + 1] != event_id[i] and event_id[i] == event_id[i - 1]:
            #     tm_count[(event_state[i], event_time[i])] += 1
            #     tmn_count[(event_state[i - 1], event_state[i], event_time[i])] += 1
            # elif event_id[i + 1] != event_id[i] and event_id[i] != event_id[i - 1]:
            #     sys.exit("Isolated observation in data")

    # boundary cases: the first observation is handled explicitly
    i = 0
    if event_exists[i] == 1:
        if event_id[i + 1] == event_id[i]:
            tm_count[(event_state[i], event_time[i])] += 1
            tmn_count[(event_state[i], event_state[i + 1], event_time[i])] += 1

    # i = event_count - 1
    # if event_exists[i] == 1:
    #     if event_id[i] == event_id[i - 1]:
    #         tm_count[(event_state[i], event_time[i])] += 1
    #         tmn_count[(event_state[i - 1], event_state[i], event_time[i])] += 1

    self.counts = int(tm_count.sum())

    # Confidence Interval Estimation (Based on Counts)
    confint_lower = np.ndarray((state_dim, state_dim, cohort_dim))
    confint_upper = np.ndarray((state_dim, state_dim, cohort_dim))
    for k in range(cohort_dim):
        for s1 in range(state_dim):
            intervals = st.multinomial_proportions_confint(tmn_count[s1, :, k],
                                                           alpha=self.ci_alpha,
                                                           method=self.ci_method)
            for s2 in range(state_dim):
                confint_lower[s1, s2, k] = intervals[s2][0]
                confint_upper[s1, s2, k] = intervals[s2][1]
    self.confint_lower = confint_lower
    self.confint_upper = confint_upper

    # Normalization of counts to produce family of probability matrices
    for s1 in range(state_dim):
        for s2 in range(state_dim):
            for k in range(cohort_dim):
                if tm_count[(s1, k)] > 0:
                    tmn_values[(s1, s2, k)] = tmn_count[(s1, s2, k)] / tm_count[(s1, k)]
                # print(s1, s2, k, tmn_values[(s1, s2, k)], tmn_count[(s1, s2, k)], tm_count[(s1, k)])

    # Return a list of transition matrices
    for k in range(cohort_dim):
        self.matrix_set.append(tmn_values[:, :, k])
        self.count_set.append(tmn_count[:, :, k])
        self.count_normalization.append(tm_count[:, k])

    return self.matrix_set
def simultaneous_confint_from_cdf(alpha, n_samples, x, cdf):
    return binom.multinomial_proportions_confint(
        n_samples * np.diff(cdf),
        alpha=alpha,
        method='sison-glaz'
    )
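# Sketch of how the helper above might be called: `cdf` is a discrete CDF evaluated
# at the bin edges `x`, so n_samples * np.diff(cdf) recovers per-bin counts, which is
# what the sison-glaz interval needs (>= 7 bins). `binom` is whatever alias the
# surrounding code uses for statsmodels' proportion module; the numbers below are
# illustrative assumptions.
import numpy as np

x = np.linspace(0.0, 7.0, 8)                                      # 8 edges -> 7 bins
cdf = np.array([0.00, 0.10, 0.25, 0.45, 0.65, 0.80, 0.92, 1.00])  # illustrative CDF values
bands = simultaneous_confint_from_cdf(alpha=0.05, n_samples=500, x=x, cdf=cdf)
print(bands.shape)                                                # (7, 2): [lower, upper] per bin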
def fit(self, data, labels=None):
    """
    Parameters
    ----------
    data : dataframe
        The data to use for the estimation (sorted by ID, in compact format)
    labels : an optional dictionary for relabeling column names

    Returns
    -------
    matrix_set : An estimated transition matrix set

    Notes
    -----
    * loop over data rows (id, timepoint, state)
    * at least two distinct timepoints are required (initial and final)
    * calculate population count N^i_k per state i per timepoint k
    * calculate migrations count N^{ij}_{kl} from i to j from timepoint k to timepoint l
    * calculate transition matrix as ratio T^{ij}_{kl} = N^{ij}_{kl} / N^i_k
    * calculate also count-averaged matrix

    References
    ----------
    """
    # Allow for flexible labelling for dataframe columns
    if labels is not None:
        state_label = labels['State']
        timestep_label = labels['Time']
        id_label = labels['ID']
    else:
        state_label = 'State'
        id_label = 'ID'
        timestep_label = 'Time'

    # Old way of enumerating cohort intervals was using labels
    # cohort_labels = data[timestep_label].unique()
    # cohort_dim = len(cohort_labels) - 1

    # The size of the state space
    state_dim = self.states.cardinality
    # The number of cohorts is the number of intervals
    # Minimally two (initial and final)
    cohort_dim = len(self.cohort_bounds) - 1
    event_count = data[id_label].count()

    # store data in 1d arrays for faster processing
    # capture nan events for missing observations
    event_exists = np.empty(event_count, int)
    entity_id = np.empty(event_count, int)
    entity_state = np.empty(event_count, int)
    event_time = np.empty(event_count, int)
    nan_count = 0
    i = 0
    for index, row in data.iterrows():
        entity_id[i] = row[id_label]
        try:
            entity_state[i] = row[state_label]
            event_time[i] = row[timestep_label]
            event_exists[i] = 1  # indicates a valid (complete) data row
        except ValueError:
            entity_state[i] = -99999
            event_time[i] = -99999
            event_exists[i] = 0
            nan_count += 1
        i += 1
    self.nans = nan_count

    # store number of entities observed in given state per time step
    tm_count = np.ndarray((state_dim, cohort_dim + 1), int)
    # store number of entities observed to transition from state (From) to state (To) per period
    tmn_count = np.ndarray((state_dim, state_dim, cohort_dim), int)
    # store normalized frequencies
    tmn_values = np.ndarray((state_dim, state_dim, cohort_dim), float)
    # matrix to store average transitions
    tmn_average = np.ndarray((state_dim, state_dim), float)

    # initialize to zero (TODO ?)
    tm_count.fill(0)
    tmn_count.fill(0)
    tmn_values.fill(0)
    tmn_average.fill(0)

    # TODO Capture case if entity with only one observation (hence no transition count)
    # TODO Capture case with stale observations (no transitions)
    for i in range(0, event_count - 1):  # the last point handled separately
        if event_exists[i] == 1:
            # while processing valid event data from same entity
            # increment state count
            tm_count[(entity_state[i], event_time[i])] += 1
            if entity_id[i + 1] == entity_id[i]:
                # increment migration count if there is subsequent observation
                # NB: It does not have to be different
                tmn_count[(entity_state[i], entity_state[i + 1], event_time[i])] += 1

    # handle boundary cases
    # the last event must be evaluated in comparison with its previous one
    i = event_count - 1
    if event_exists[i] == 1:
        # ATTN we must shift the time index of the tm_count, tmn_count
        tm_count[(entity_state[i], event_time[i] - 1)] += 1
        if entity_id[i] == entity_id[i - 1]:
            tmn_count[(entity_state[i - 1], entity_state[i], event_time[i] - 1)] += 1

    # print(tm_count)
    # print(tm_count.sum())
    # print(tmn_count[:, :, 0])

    self.counts = int(tm_count.sum())

    # Normalization of counts to produce a family of probability matrices
    for s1 in range(state_dim):
        for s2 in range(state_dim):
            for k in range(cohort_dim):
                if tm_count[(s1, k)] > 0:
                    tmn_values[(s1, s2, k)] = tmn_count[(s1, s2, k)] / tm_count[(s1, k)]

    # for k in range(cohort_dim):
    #     m = transitionMatrix.TransitionMatrix(tmn_values[:, :, k])
    #     m.print_matrix(accuracy=3)

    # Average transition matrix (assuming temporal homogeneity)
    for s1 in range(state_dim):
        for s2 in range(state_dim):
            tm_total_count = 0
            for k in range(cohort_dim):
                tmn_average[(s1, s2)] += tmn_count[(s1, s2, k)]
                tm_total_count += tm_count[(s1, k)]
            if tm_total_count > 0:
                tmn_average[(s1, s2)] /= tm_total_count
    self.average_matrix = tmn_average

    # Confidence Interval Estimation (Based on Counts)
    confint_lower = np.ndarray((state_dim, state_dim, cohort_dim))
    confint_upper = np.ndarray((state_dim, state_dim, cohort_dim))
    for k in range(cohort_dim - 1):
        for s1 in range(state_dim):
            intervals = st.multinomial_proportions_confint(tmn_count[s1, :, k],
                                                           alpha=self.ci_alpha,
                                                           method=self.ci_method)
            for s2 in range(state_dim):
                confint_lower[s1, s2, k] = intervals[s2][0]
                confint_upper[s1, s2, k] = intervals[s2][1]
    self.confint_lower = confint_lower
    self.confint_upper = confint_upper

    # Return a list of transition matrices
    # Both absolute (frequency) and relative (probability) format
    for k in range(cohort_dim):
        self.matrix_set.append(tmn_values[:, :, k])
        self.count_set.append(tmn_count[:, :, k])
    # Return absolute counts at time points
    for k in range(cohort_dim + 1):
        self.count_normalization.append(tm_count[:, k])

    # print(self.count_normalization)

    return self.matrix_set
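# Standalone sketch of the row-wise arithmetic used by the cohort estimators above
# (illustrative counts, not the estimator class): each row of transition counts is
# divided by its row total, T^{ij} = N^{ij} / N^i, and Goodman intervals are computed
# row by row with statsmodels.
import numpy as np
from statsmodels.stats import proportion as st

tmn_count = np.array([[80., 15., 5.],
                      [10., 70., 20.],
                      [5., 10., 85.]])
tm_count = tmn_count.sum(axis=1)
tmn_values = tmn_count / tm_count[:, None]       # transition probabilities per row
intervals = np.stack([
    st.multinomial_proportions_confint(row, alpha=0.05, method='goodman')
    for row in tmn_count
])
print(tmn_values.round(3))
print(intervals.round(3))                        # shape (3, 3, 2): [lower, upper] per cell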