def get_predictive_percentile_calibration(runs, percentile, method='exact', max_Ntest=50):
    model = models.PoissonRegressionModel(None, None, example_weights=None, test_data=None)
    # central interval: e.g. percentile=90 -> [5th, 95th] predictive percentiles
    tail = (100 - percentile) / 2.0
    in_interval = []
    for run in runs:
        if method == 'exact':
            bootstrap_samples = run['multinomial']['bootstrap_params_exact']
        elif method == 'appx':
            bootstrap_samples = run['multinomial']['bootstrap_params_appx']
        test_data = run['test_data']
        test_data.Y = test_data.Y[:max_Ntest]
        test_data.X = test_data.X[:max_Ntest]
        test_data.N = test_data.X.shape[0]
        model.test_data = test_data
        Ntest = test_data.N
        in_interval_for_run = np.zeros(Ntest, dtype=bool)
        for n in range(Ntest):
            sampled_ys = model.get_predictive_distribution(bootstrap_samples,
                                                           model.test_data.X[n],
                                                           Nsamples=100)
            true_y = model.test_data.Y[n]
            lower = np.percentile(sampled_ys, tail)
            upper = np.percentile(sampled_ys, 100 - tail)
            in_interval_for_run[n] = (lower <= true_y) and (true_y <= upper)
        in_interval.append(in_interval_for_run)
    return in_interval
def get_distances(data, all_pairs):
    # Extracts the quantile distances between patients
    history = get_history_length(data)
    q0_set = np.percentile(all_pairs, 5, axis=1, keepdims=True)
    q1_set = np.percentile(all_pairs, 10, axis=1, keepdims=True)
    q2_set = np.percentile(all_pairs, 15, axis=1, keepdims=True)
    quantiles = np.hstack((history, q0_set, q1_set, q2_set))
    return quantiles
def get_orthogonality_score(C_matrix, verbose=True):
    """
    Gets the angle between each subspace and the other ones.
    Note that we leave the diagonal as zeros: the cosine of a column with itself
    is 1, so those angles are 0 anyway, and excluding them gives a more
    representative mean.
    """
    in_degree = True
    len_1, len_2 = C_matrix.shape
    orthogonality_matrix = np.zeros((len_2, len_2))
    for lat_i in range(0, len_2):
        for lat_j in range(lat_i + 1, len_2):
            angle = np.dot(C_matrix[:, lat_i], C_matrix[:, lat_j]) / (
                np.linalg.norm(C_matrix[:, lat_i]) * np.linalg.norm(C_matrix[:, lat_j]))
            orthogonality_matrix[lat_i, lat_j] = np.arccos(np.abs(angle))
            orthogonality_matrix[lat_j, lat_i] = np.arccos(np.abs(angle))
    if in_degree:
        orthogonality_matrix = 180 * orthogonality_matrix / np.pi
    mean_per_sub_space = np.sum(np.abs(orthogonality_matrix), 1) / (len_2 - 1)
    glob_mean = np.mean(mean_per_sub_space)
    try:
        all_non_diag = orthogonality_matrix.flatten()
        all_non_diag = all_non_diag[np.nonzero(all_non_diag)]
        # averages of the smallest (<= 25th percentile) and largest (>= 75th
        # percentile) off-diagonal angles
        lower_percentile = np.percentile(all_non_diag, 25)
        upper_percentile = np.percentile(all_non_diag, 75)
        small_avr = np.average(
            all_non_diag, weights=(all_non_diag <= lower_percentile).astype(int))
        high_avr = np.average(
            all_non_diag, weights=(all_non_diag >= upper_percentile).astype(int))
    except Exception:  # e.g. empty off-diagonal set or zero-sum weights
        small_avr = glob_mean
        high_avr = glob_mean
    if verbose:
        print(np.around(orthogonality_matrix, 2))
        print("Mean abs angle per subspace: ", mean_per_sub_space)
        print("Mean abs angle overall: ", glob_mean)
        # print("Std abs angle overall: ", np.std(mean_per_sub_space))
        # print(small_avr, high_avr)
    if len_2 <= 1:
        glob_mean = small_avr = high_avr = 0
    return glob_mean, small_avr, high_avr
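# --- Usage sketch (illustrative only): the random mixing matrix C below is a
# hypothetical stand-in for a fitted loading matrix, not part of the original code.
import numpy as np

C = np.random.randn(20, 3)  # 3 latent subspaces in a 20-dimensional observation space
glob_mean, small_avr, high_avr = get_orthogonality_score(C, verbose=False)
print(glob_mean, small_avr, high_avr)  # mean angles in degrees; 90 means orthogonal columns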
def plot_summary(x, s, interval=95, num_samples=100, sample_color='k',
                 sample_alpha=0.4, interval_alpha=0.25, color='r', legend=True,
                 title="", plot_mean=True, plot_median=False, label=""):
    b = 0.5 * (100 - interval)
    lower = np.percentile(s, b, axis=0).T
    upper = np.percentile(s, 100 - b, axis=0).T
    if plot_median:
        median = np.percentile(s, [50], axis=0).T
        lab = 'Median'
        if len(label) > 0:
            lab += " %s" % label
        plt.plot(x.ravel(), median, label=lab, color=color, linewidth=4)
    if plot_mean:
        mean = np.mean(s, axis=0).T
        lab = 'Mean'
        if len(label) > 0:
            lab += " %s" % label
        plt.plot(x.ravel(), mean, '--', label=lab, color=color, linewidth=4)
    plt.fill_between(x.ravel(), lower.ravel(), upper.ravel(), color=color,
                     alpha=interval_alpha, label='%d%% Interval' % interval)
    if num_samples > 0:
        idx_samples = np.random.choice(range(len(s)), size=num_samples, replace=False)
        plt.plot(x, s[idx_samples, :].T, color=sample_color, alpha=sample_alpha)
    if legend:
        plt.legend(loc='best')
    if len(title) > 0:
        plt.title(title, fontweight='bold')
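# --- Usage sketch (illustrative only): assumes x has shape (T,) and s holds
# posterior-style draws of shape (num_draws, T); the noisy sinusoid is synthetic.
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 1, 50)
s = np.sin(2 * np.pi * x)[None, :] + 0.3 * np.random.randn(200, 50)  # 200 fake draws
plot_summary(x, s, interval=95, num_samples=20, title="Synthetic summary")
plt.show()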
def callback(self, th, t, g, tskip=20, n_samps=10):
    """ custom callback --- prints statistics of all gradient comps"""
    if t % tskip == 0:
        fval = self.elbo_mc(th, n_samps=n_samps)
        gm, gv = np.abs(g[:self.D]), np.abs(g[self.D:])
        print("""
        iter {t}; val = {val},
          abs gm = {m} [{mlo}, {mhi}]
              gv = {v} [{vlo}, {vhi}]
        """.format(t=t, val="%2.4f" % fval,
                   m="%2.4f" % np.mean(gm),
                   mlo="%2.4f" % np.percentile(gm, 1.),
                   mhi="%2.4f" % np.percentile(gm, 99.),
                   v="%2.4f" % np.mean(gv),
                   vlo="%2.4f" % np.percentile(gv, 1.),
                   vhi="%2.4f" % np.percentile(gv, 99.)))
def post_plot(ax, dist):
    ax.hist(dist, 50, histtype="step")
    ylim = ax.get_ylim()
    low, mid, high = np.percentile(dist, [16, 50, 84])
    # draw on the passed-in axes (not the current pyplot axes)
    ax.plot([mid, mid], ylim, c='indianred')
    for lh in (low, high):
        ax.plot([lh, lh], ylim, ':', c='indianred')
    ax.set_xlabel('Rotation Period (days)', fontsize=14)
    ax.set_ylabel('Posterior Probability', fontsize=14)
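# --- Usage sketch (illustrative only): the lognormal "period" samples are a
# synthetic stand-in for real posterior draws.
import numpy as np
import matplotlib.pyplot as plt

samples = np.random.lognormal(mean=3.0, sigma=0.2, size=5000)  # fake posterior draws
fig, ax = plt.subplots()
post_plot(ax, samples)
plt.show()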
def get_likelihood_based_interval(thetas, interval_coverage, model):
    '''
    thetas should be a list of free parameters for the model.
    interval_coverage should be in [0, 100].

    Returns the objective values model.eval_objective(theta) that fall inside the
    central interval_coverage% interval.
    '''
    # fn_vals = [model.eval_objective(theta) for theta in thetas]
    # sorted_inds = np.array(np.argsort(fn_vals))
    # thresh = int(np.floor(thetas.shape[0]*(q/100.0)))
    # return thetas[sorted_inds[:thresh]], np.array(fn_vals)[sorted_inds]
    fn_vals = np.array([model.eval_objective(theta) for theta in thetas])
    upper = np.percentile(fn_vals, interval_coverage + (100 - interval_coverage) / 2)
    lower = np.percentile(fn_vals, (100 - interval_coverage) / 2)
    inds = np.where(np.logical_and(lower <= fn_vals, fn_vals <= upper))
    return fn_vals[inds]
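# --- Usage sketch (illustrative only): QuadraticModel is a hypothetical stand-in
# exposing eval_objective; it is not part of the original code.
import numpy as np

class QuadraticModel:
    """Hypothetical model whose objective is a simple quadratic loss."""
    def eval_objective(self, theta):
        return float(np.sum(theta ** 2))

thetas = np.random.randn(1000, 2)
vals_in_interval = get_likelihood_based_interval(thetas, 90, QuadraticModel())
print(len(vals_in_interval))  # roughly 900 of the 1000 objective values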
def initialize(self, base_model, datas, inputs=None, masks=None, tags=None,
               emission_optimizer="bfgs", num_optimizer_iters=1000):
    print("Initializing Emissions parameters...")
    if self.D == 1 and base_model.transitions.__class__.__name__ == "DDMTransitions":
        # if self.D == 0:
        d_init = np.mean([y[0:3] for y in datas], axis=(0, 1))
        u_sum = np.array([np.sum(u) for u in inputs])
        y_end = np.array([y[-3:] for y in datas])
        u_l, u_u = np.percentile(u_sum, [20, 80])  # use 20th and 80th percentile input
        y_U = y_end[np.where(u_sum >= u_u)]
        y_L = y_end[np.where(u_sum <= u_l)]
        C_init = (1.0 / 2.0) * np.mean(
            (np.mean(y_U, axis=0) - np.mean(y_L, axis=0)), axis=0)
        self.Cs = C_init.reshape([1, self.N, self.D]) / self.bin_size
        self.ds = d_init.reshape([1, self.N]) / self.bin_size
    else:
        datas = [
            interpolate_data(data, mask)
            for data, mask in zip(datas, masks)
        ]
        Td = sum([data.shape[0] for data in datas])
        xs = [
            base_model.sample(T=data.shape[0], input=input)[1]
            for data, input in zip(datas, inputs)
        ]

        def _objective(params, itr):
            self.params = params
            # self.Cs = params
            obj = 0
            obj += self.log_prior()
            for data, input, mask, tag, x in \
                    zip(datas, inputs, masks, tags, xs):
                obj += np.sum(self.log_likelihoods(data, input, mask, tag, x))
            return -obj / Td

        # Optimize emissions log-likelihood
        optimizer = dict(bfgs=bfgs, lbfgs=lbfgs)[emission_optimizer]
        self.params = \
            optimizer(_objective,
                      self.params,
                      num_iters=num_optimizer_iters,
                      full_output=False)
def callback(self, th, t, g, tskip=20, n_samps=100):
    """ custom callback --- prints statistics of all gradient comps"""
    if t % tskip == 0:
        fval = self.elbo_mc(th, n_samps=n_samps)
        gm, gv, gC = self.unpack(g)
        gm, gv, gC = np.abs(gm), np.abs(gv), np.abs(gC)
        m, v, C = self.unpack(th)
        Cmags = np.sqrt(np.sum(C**2, axis=0))
        if self.r > 0:
            # no trailing commas here --- they would turn these strings into tuples
            Cm = "%2.4f" % np.mean(gC)
            Clo = "%2.4f" % np.percentile(gC, 1.)
            Chi = "%2.4f" % np.percentile(gC, 99.)
        else:
            Cm, Clo, Chi = "na", "na", "na"
        print("""
        iter {t}; val = {val},
          abs gm = {m} [{mlo}, {mhi}]
              gv = {v} [{vlo}, {vhi}]
              gC ({D} x {r}) = {C} [{Clo}, {Chi}]
          Comp mags = {Cmags}
        """.format(t=t, val="%2.4f" % fval,
                   D="%d" % self.D, r="%d" % self.r,
                   m="%2.4f" % np.mean(gm),
                   mlo="%2.4f" % np.percentile(gm, 1.),
                   mhi="%2.4f" % np.percentile(gm, 99.),
                   v="%2.4f" % np.mean(gv),
                   vlo="%2.4f" % np.percentile(gv, 1.),
                   vhi="%2.4f" % np.percentile(gv, 99.),
                   C=Cm, Clo=Clo, Chi=Chi, Cmags=str(Cmags)))
def get_percentile_calibration(true_params, bs_runs, interval_coverage=90):
    '''
    Currently checks percentile estimates over each dimension of the parameters
    independently (so we only have to compute percentiles over 1-D things).

    true_params should be a D dimensional array.

    bs_runs should be a list of runs, each holding a B x D array of bootstrap
    samples under ['multinomial']['bootstrap_params_appx'], where B is the
    number of bootstrap samples.

    interval_coverage specifies the size of the interval around the median;
    i.e. 95 corresponds to the interval [2.5%, 97.5%].
    '''
    D = true_params.shape[0]
    nExp = len(bs_runs)
    in_range = np.zeros((nExp, D), dtype=bool)
    tail = (100 - interval_coverage) / 2.0
    for n in range(nExp):
        lower = np.percentile(bs_runs[n]['multinomial']['bootstrap_params_appx'], tail, axis=0)
        upper = np.percentile(bs_runs[n]['multinomial']['bootstrap_params_appx'], 100 - tail, axis=0)
        for d in range(D):
            in_range[n, d] = (lower[d] < true_params[d]) and (true_params[d] < upper[d])
    return in_range
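# --- Usage sketch (illustrative only): the run dictionaries below just mimic the
# expected ['multinomial']['bootstrap_params_appx'] layout with Gaussian noise;
# they are not real bootstrap output.
import numpy as np

true_params = np.array([1.0, -2.0, 0.5])
bs_runs = [
    {'multinomial': {'bootstrap_params_appx': true_params + 0.1 * np.random.randn(500, 3)}}
    for _ in range(30)
]
in_range = get_percentile_calibration(true_params, bs_runs, interval_coverage=90)
print(in_range.mean(axis=0))  # empirical coverage per dimension, should be near 0.9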
def __init__(self, data_obj, p=1, oversampled=0, t_offset=None, precomputed=None, pct_spike=95):
    # some fns. require 'precomputed', a dict with at least two keys: theta_star
    # (output of lbfgsb) and fn_obj used in the optimization of theta_star
    # t_offset: if oversampled, this is shape (N,), and gives the offset between
    # stim trigger and frame (nbefore).
    self.data_obj = data_obj
    self.F = data_obj.F
    self.nroi = self.F.shape[0]
    self.p = p
    self.b = np.zeros((self.nroi, 1, 1))
    self.g = np.zeros((self.nroi, self.p, 1))
    self.a = np.zeros((self.nroi, 1, 1))
    self.sn = np.zeros((self.nroi, 1, 1))
    fudge_factor = .97
    for i in range(self.nroi):
        _, s, self.b[i, 0, 0], gtemp, _ = deconvolve(
            data_obj.dfof[i].astype(np.float64), penalty=1, g=tuple([None] * self.p))
        self.g[i, :, 0] = np.array(gtemp)
        self.a[i] = np.percentile(s, pct_spike)
        est = estimate_parameters(data_obj.dfof[i].astype(np.float64),
                                  p=self.p, fudge_factor=fudge_factor)
        self.sn[i] = est[1]
    # if not type(g) is tuple:
    #     g = (g,)
    # self.g = np.array(g)
    # self.fn_obj = fn_obj
    # nangle = len(np.unique(data_obj.angle))
    self.noise = (self.sn**2 * (1 + (self.g**2).sum(1)[:, np.newaxis]))
    self.smax = 5
    # self.fn_obj.compute_helper_vars(data_obj, self)
    ## self.pFs = [self.p_F_given_s(s) for s in range(self.smax)]
    self.log_pFs = [self.log_p_F_given_s(s) for s in range(self.smax)]
    self.oversampled = oversampled
    if self.oversampled:
        self.sampwt = np.ones((self.oversampled, 1)) / self.oversampled
        self.sampmat = np.zeros(
            (self.oversampled * (self.F.shape[0] - 1), self.F.shape[1]), dtype='bool')
        dig = np.floor(self.oversampled * t_offset).astype('<i2')
        for i in range(self.sampmat.shape[1]):
            self.sampmat[dig::self.oversampled, i] = 1
    if precomputed:
        theta_star = precomputed['theta_star']
        fn_obj = precomputed['fn_obj']
        # one fewer time point required
        self.rpre = np.zeros(np.array(self.F.shape) + np.array((0, -1, 0)))
        for i in range(self.nroi):
            self.rpre[i] = fn_obj.rfunc(theta_star[i][0])
def getquantile(x, lower=0.025, upper=0.975, return_indices=False):
    """ Indicates which elements of `x` fall into a quantile range

    Arguments:

        x: `ndarray(nsamples)`
        lower: `0 <= float < upper`. Lower quantile
        upper: `lower < float <= 1`. Upper quantile
        return_indices: `bool`. If `False`, returns boolean array. If `True`
            returns indices for entries of `x` falling between `lower` and `upper`.

    Returns:

        `ndarray`. Dimensionality will depend on `return_indices`
    """
    lb, ub = np.percentile(x, [lower * 100, upper * 100])
    y = np.logical_and(np.greater_equal(x, lb), np.less(x, ub))
    if return_indices:
        y = np.arange(x.size)[y]
    return y
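# --- Usage sketch (illustrative only): standard-normal draws stand in for real samples.
import numpy as np

x = np.random.randn(10000)
mask = getquantile(x, lower=0.025, upper=0.975)                       # boolean mask, ~95% True
idx = getquantile(x, lower=0.025, upper=0.975, return_indices=True)   # integer indices instead
print(mask.mean(), idx.shape)  # roughly 0.95, (~9500,)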
            coef, intercept = baseRegression.adjust_coef(self, w)
        else:  # self.prob_func_ == "softmax"
            coef = np.divide(w[:-1].T, self.scaler_.scale_)
            intercept = w[-1] - np.sum(coef * self.scaler_.mean_)
        if self.penalty_ == "l1":
            # ===FIXME===
            # I don't know the condition to shrink the coef to 0
            coef = np.array([0.0 if abs(wi) < 0.1 else wi for wi in coef])
            intercept = 0.0 if abs(intercept) < 0.1 else intercept
        return coef, intercept

    def predict(self, x):
        if self.prob_func_ == "sigmoid":
            prob = (1.0 / (1.0 + np.exp(-np.dot(x, self.coef_) - self.intercept_)))[:, np.newaxis]
            prob = np.concatenate((1.0 - prob, prob), axis=1)
        else:  # self.prob_func_ == "softmax"
            prob = np.exp(np.dot(x, self.coef_.T) + self.intercept_)
            prob /= np.sum(prob, axis=1)[:, np.newaxis]
        return np.array([self.classes_[i] for i in np.argmax(prob, axis=1)])

    def score(self, x, y):
        return self.accuracy(x, y)


if __name__ == "__main__":
    from sklearn.datasets import make_regression

    x, y_orig = make_regression(n_samples=10, n_features=5, n_informative=5,
                                n_targets=1, noise=1.0, random_state=1)
    # y = np.array([1 if v >= np.mean(y_orig) else 0 for v in y_orig])
    y = np.array([0 if y < np.percentile(y_orig, 25) else
                  1 if y < np.percentile(y_orig, 50) else
                  2 if y < np.percentile(y_orig, 75) else
                  3 for y in y_orig])

    lr = LogisticRegression()
    lr.fit(x, y)
    print(lr.coef_, lr.intercept_)
    print(lr.score(x, y))
def fit(
    self,
    df,
    duration_col=None,
    event_col=None,
    show_progress=False,
    timeline=None,
    weights_col=None,
    robust=False,
    initial_point=None,
):
    """
    Fit the accelerated failure time model to a dataset.

    Parameters
    ----------
    df: DataFrame
        a Pandas DataFrame with necessary columns `duration_col` and
        `event_col` (see below), covariates columns, and special columns (weights).
        `duration_col` refers to the lifetimes of the subjects. `event_col`
        refers to whether the 'death' event was observed: 1 if observed, 0
        else (censored).

    duration_col: string
        the name of the column in DataFrame that contains the subjects'
        lifetimes.

    event_col: string, optional
        the name of the column in DataFrame that contains the subjects' death
        observation. If left as None, assume all individuals are uncensored.

    show_progress: boolean, optional (default=False)
        since the fitter is iterative, show convergence
        diagnostics. Useful if convergence is failing.

    timeline: array, optional
        Specify a timeline that will be used for plotting and prediction

    weights_col: string
        the column in df that specifies weights per observation.

    robust: boolean, optional (default=False)
        Compute the robust errors using the Huber sandwich estimator.

    initial_point: (d,) numpy array, optional
        initialize the starting point of the iterative
        algorithm. Default is the zero vector.

    Returns
    -------
    self: self with additional new properties: ``print_summary``, ``params_``, ``confidence_intervals_`` and more

    Examples
    --------
    TODO

    >>> from lifelines import WeibullAFTFitter
    >>>
    >>> df = pd.DataFrame({
    >>>     'T': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
    >>>     'E': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0],
    >>>     'var': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2],
    >>>     'age': [4, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
    >>> })
    >>>
    >>> aft = WeibullAFTFitter()
    >>> aft.fit(df, 'T', 'E')
    >>> aft.print_summary()
    >>> aft.predict_median(df)
    >>>
    >>> aft = WeibullAFTFitter()
    >>> aft.fit(df, 'T', 'E', ancillary_df=df)
    >>> aft.print_summary()
    >>> aft.predict_median(df)
    """
    if duration_col is None:
        raise TypeError("duration_col cannot be None.")

    self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + " UTC"
    self.duration_col = duration_col
    self.event_col = event_col
    self.weights_col = weights_col
    self._n_examples = df.shape[0]
    self.timeline = timeline
    self.robust = robust

    df = df.copy()

    T = pass_for_numeric_dtypes_or_raise_array(df.pop(duration_col)).astype(float)
    E = (
        pass_for_numeric_dtypes_or_raise_array(df.pop(self.event_col)).astype(bool)
        if (self.event_col is not None)
        else pd.Series(np.ones(self._n_examples, dtype=bool), index=df.index, name="E")
    )
    weights = (
        pass_for_numeric_dtypes_or_raise_array(df.pop(self.weights_col)).astype(float)
        if (self.weights_col is not None)
        else pd.Series(np.ones(self._n_examples, dtype=float), index=df.index, name="weights")
    )

    # check to make sure their weights are okay
    if self.weights_col:
        if (weights.astype(int) != weights).any() and not self.robust:
            warnings.warn(
                dedent(
                    """It appears your weights are not integers, possibly propensity or sampling scores then?
It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to
estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"""
                ),
                StatisticalWarning,
            )
        if (weights <= 0).any():
            raise ValueError("values in weight column %s must be positive." % self.weights_col)

    self.durations = T.copy()
    self.event_observed = E.copy()
    self.weights = weights.copy()

    if np.any(self.durations <= 0):
        raise ValueError(
            "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
        )

    df = df.astype(float)
    self._check_values(df, T, E, self.event_col)

    if self.fit_intercept:
        assert "_intercept" not in df
        df["_intercept"] = 1.0

    self._LOOKUP_SLICE = self._create_slicer(len(df.columns))

    # TODO
    _norm_std = df.std(0)
    self._norm_mean = df.mean(0)

    # if we included an intercept, we need to fix not divide by zero.
    if self.fit_intercept:
        _norm_std["_intercept"] = 1.0
    else:
        _norm_std[_norm_std < 1e-8] = 1.0

    _index = pd.MultiIndex.from_tuples(
        sum([[(name, c) for c in df.columns] for name in self._fitted_parameter_names], [])
    )
    self._norm_std = pd.Series(np.concatenate([_norm_std.values] * self.n_breakpoints), index=_index)

    _params, self._log_likelihood, self._hessian_ = self._fit_model(
        T.values,
        E.values,
        weights.values,
        normalize(df, 0, _norm_std).values,
        show_progress=show_progress,
        initial_point=initial_point,
    )
    self.params_ = _params / self._norm_std

    self.variance_matrix_ = self._compute_variance_matrix()
    self.standard_errors_ = self._compute_standard_errors(T.values, E.values, weights.values, df.values)
    self.confidence_intervals_ = self._compute_confidence_intervals()
    self._predicted_cumulative_hazard_ = self.predict_cumulative_hazard(df, times=[np.percentile(T, 75)]).T

    return self
def initialize(self, base_model, datas, inputs=None, masks=None, tags=None,
               num_em_iters=50, num_tr_iters=50):
    print("Initializing...")
    print("First with FA using {} steps of EM.".format(num_em_iters))
    fa, xhats, Cov_xhats, lls = factor_analysis_with_imputation(
        self.D, datas, masks=masks, num_iters=num_em_iters)

    if self.D == 1 and base_model.transitions.__class__.__name__ == "DDMTransitions":
        d_init = np.mean([y[0:3] for y in datas], axis=(0, 1))
        u_sum = np.array([np.sum(u) for u in inputs])
        y_end = np.array([y[-3:] for y in datas])
        u_l, u_u = np.percentile(u_sum, [20, 80])  # use 20th and 80th percentile input
        y_U = y_end[np.where(u_sum >= u_u)]
        y_L = y_end[np.where(u_sum <= u_l)]
        C_init = (1.0 / 2.0) * np.mean(
            (np.mean(y_U, axis=0) - np.mean(y_L, axis=0)), axis=0)
        self.Cs = C_init.reshape([1, self.N, self.D])
        self.ds = d_init.reshape([1, self.N])
        self.inv_etas = np.log(fa.sigmasq).reshape([1, self.N])
    else:
        # define objective
        Td = sum([x.shape[0] for x in xhats])

        def _objective(params, itr):
            new_datas = [np.dot(x, params[0].T) + params[1] for x in xhats]
            obj = base_model.log_likelihood(new_datas, inputs=inputs)
            return -obj / Td

        # initialize R and r
        R = 0.1 * np.random.randn(self.D, self.D)
        r = 0.01 * np.random.randn(self.D)
        params = [R, r]

        print("Next by transforming latents to match AR-HMM prior using {} steps of max log likelihood."
              .format(num_tr_iters))

        state = None
        lls = [-_objective(params, 0) * Td]
        pbar = trange(num_tr_iters)
        pbar.set_description("Epoch {} Itr {} LP: {:.1f}".format(0, 0, lls[-1]))
        for itr in pbar:
            params, val, g, state = sgd_step(value_and_grad(_objective), params, itr, state)
            lls.append(-val * Td)
            pbar.set_description("LP: {:.1f}".format(lls[-1]))
            pbar.update(1)

        R = params[0]
        r = params[1]

        # scale x's to be max at 1.1
        for d in range(self.D):
            x_transformed = [(np.dot(x, R.T) + r)[:, d] for x in xhats]
            max_x = np.max(x_transformed)
            R[d, :] *= 1.1 / max_x
            r[d] *= 1.1 / max_x

        self.Cs = (fa.W @ np.linalg.inv(R)).reshape([1, self.N, self.D])
        self.ds = fa.mean - fa.W @ np.linalg.inv(R) @ r
        self.inv_etas = np.log(fa.sigmasq).reshape([1, self.N])
def set_knots(self, T, E):
    self.knots = np.percentile(
        np.log(T[E.astype(bool).values]),
        np.linspace(5, 95, self.n_baseline_knots))
def offline_evaluation(self, metrics, mode, data, parameter, confidence=0.95, bootstrap=False, n_bootstrap=1):
    """ Performs offline evaluation

    Args:
        metrics (dic): metrics dictionary to be filled
        mode (str): train, valid or test split
        data (tuple): tuple of np.arrays with features, actions, rewards
        parameter (np.array): optimized parameter or any baseline parameter
        confidence (float): confidence level for the interval
        bootstrap (bool): choose whether to perform bootstrap or not
        n_bootstrap (int): number of bootstrap folds

    Note:
        Computes ips, snips scores. Also computes t-student test, std, bootstrap std
        on ips and snips, and importance sampling diagnostics

    Returns:
        metrics (dic): contains results information on the data split
    """
    features, actions, rewards, pi_logging = data
    rng_bootstrap = np.random.RandomState(1)

    bootstrap_ips_metric = []
    bootstrap_snips_metric = []
    bootstrap_delta_snips_metric = []
    bootstrap_t_h = []
    bootstrap_std_h = []
    bootstrap_em_diagnostic = []
    bootstrap_ess_diagnostic = []

    for n in range(n_bootstrap):
        idx = rng_bootstrap.choice(np.arange(features.shape[0]), size=features.shape[0], replace=bootstrap)
        ips_metric, snips_metric = self.get_ips_and_snips_metrics(parameter, features[idx], actions[idx],
                                                                  rewards[idx], pi_logging[idx])
        loss_logging = np.mean(-rewards[idx])
        bootstrap_ips_metric.append(ips_metric)
        bootstrap_snips_metric.append(snips_metric)
        bootstrap_delta_snips_metric.append(snips_metric - loss_logging)

        # Student-t distribution test
        n = self.impt_smplg_weight.shape[0]
        se = sp.stats.sem(self.impt_smplg_weight * rewards)
        t_h = se * sp.stats.t.ppf((1 + confidence) / 2., n - 1)
        # Gaussian distribution test
        std_h = np.std(self.impt_smplg_weight)
        bootstrap_t_h.append(t_h)
        bootstrap_std_h.append(std_h)

        # Diagnostics
        empirical_mean_diagnostic = np.mean(self.impt_smplg_weight)
        effective_sample_size_diagnostic = (np.sum(self.impt_smplg_weight)**2 /
                                            (np.sum(self.impt_smplg_weight**2) + EPS)) / n
        bootstrap_em_diagnostic.append(empirical_mean_diagnostic)
        bootstrap_ess_diagnostic.append(effective_sample_size_diagnostic)

    metrics['ips_{}'.format(mode)] = np.mean(bootstrap_ips_metric)
    metrics['snips_{}'.format(mode)] = np.mean(bootstrap_snips_metric)
    metrics['t_h_{}'.format(mode)] = np.mean(bootstrap_t_h)
    metrics['std_h_{}'.format(mode)] = np.mean(bootstrap_std_h)
    metrics['bootstrap_std_ips_{}'.format(mode)] = np.std(bootstrap_ips_metric)
    metrics['bootstrap_h25_ips_{}'.format(mode)] = np.percentile(bootstrap_ips_metric, 2.5)
    metrics['bootstrap_h975_ips_{}'.format(mode)] = np.percentile(bootstrap_ips_metric, 97.5)
    metrics['bootstrap_std_snips_{}'.format(mode)] = np.std(bootstrap_snips_metric)
    metrics['bootstrap_h25_snips_{}'.format(mode)] = np.percentile(bootstrap_snips_metric, 2.5)
    metrics['bootstrap_h975_snips_{}'.format(mode)] = np.percentile(bootstrap_snips_metric, 97.5)
    metrics['em_diagnostic_{}'.format(mode)] = np.mean(bootstrap_em_diagnostic)
    metrics['ess_diagnostic_{}'.format(mode)] = np.mean(bootstrap_ess_diagnostic)
    metrics['snips_delta_{}'.format(mode)] = np.mean(bootstrap_delta_snips_metric)
    metrics['bootstrap_delta_std_snips_{}'.format(mode)] = np.std(bootstrap_delta_snips_metric)
    metrics['bootstrap_delta_h25_snips_{}'.format(mode)] = np.percentile(bootstrap_delta_snips_metric, 2.5)
    metrics['bootstrap_delta_h975_snips_{}'.format(mode)] = np.percentile(bootstrap_delta_snips_metric, 97.5)
    return metrics
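# --- Context sketch (illustrative only): a minimal standalone version of the
# bootstrap percentile interval used above; the synthetic rewards and number of
# resampling folds are arbitrary choices, not taken from the original code.
import numpy as np

rng = np.random.RandomState(1)
rewards = rng.binomial(1, 0.3, size=1000).astype(float)  # synthetic rewards

boot_means = [rng.choice(rewards, size=rewards.size, replace=True).mean()
              for _ in range(200)]
lower, upper = np.percentile(boot_means, [2.5, 97.5])
print("95%% bootstrap interval for the mean reward: [%.3f, %.3f]" % (lower, upper))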
def _quantile_knots(low, high, x, num_bases, degree):
    num_interior_knots = num_bases - (degree + 1)
    clipped = x[(x >= low) & (x <= high)]
    knots = np.percentile(clipped, np.linspace(0, 100, num_interior_knots + 2))
    knots = [low] + list(knots[1:-1]) + [high]
    return np.asarray(knots)
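# --- Usage sketch (illustrative only): the lognormal sample and spline settings
# below are arbitrary, just to show how the knots are placed at data quantiles.
import numpy as np

x = np.random.lognormal(size=1000)
# 8 cubic-spline bases -> 8 - (3 + 1) = 4 interior knots at data quantiles
knots = _quantile_knots(low=0.1, high=5.0, x=x, num_bases=8, degree=3)
print(knots)  # endpoints 0.1 and 5.0 plus 4 interior quantile knots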
def fit(
    self,
    df,
    duration_col=None,
    event_col=None,
    show_progress=False,
    timeline=None,
    weights_col=None,
    robust=False,
    initial_point=None,
):
    """
    Fit the accelerated failure time model to a dataset.

    Parameters
    ----------
    df: DataFrame
        a Pandas DataFrame with necessary columns `duration_col` and
        `event_col` (see below), covariates columns, and special columns (weights).
        `duration_col` refers to the lifetimes of the subjects. `event_col`
        refers to whether the 'death' event was observed: 1 if observed, 0
        else (censored).

    duration_col: string
        the name of the column in DataFrame that contains the subjects'
        lifetimes.

    event_col: string, optional
        the name of the column in DataFrame that contains the subjects' death
        observation. If left as None, assume all individuals are uncensored.

    show_progress: boolean, optional (default=False)
        since the fitter is iterative, show convergence
        diagnostics. Useful if convergence is failing.

    timeline: array, optional
        Specify a timeline that will be used for plotting and prediction

    weights_col: string
        the column in df that specifies weights per observation.

    robust: boolean, optional (default=False)
        Compute the robust errors using the Huber sandwich estimator.

    initial_point: (d,) numpy array, optional
        initialize the starting point of the iterative
        algorithm. Default is the zero vector.

    Returns
    -------
    self: self with additional new properties: ``print_summary``, ``params_``, ``confidence_intervals_`` and more

    Examples
    --------
    >>> N, d = 80000, 2
    >>> # some numbers taken from http://statwonk.com/parametric-survival.html
    >>> breakpoints = (1, 31, 34, 62, 65)
    >>> betas = np.array(
    >>>     [
    >>>         [1.0, -0.2, np.log(15)],
    >>>         [5.0, -0.4, np.log(333)],
    >>>         [9.0, -0.6, np.log(18)],
    >>>         [5.0, -0.8, np.log(500)],
    >>>         [2.0, -1.0, np.log(20)],
    >>>         [1.0, -1.2, np.log(500)],
    >>>     ]
    >>> )
    >>> X = 0.1 * np.random.exponential(size=(N, d))
    >>> X = np.c_[X, np.ones(N)]
    >>> T = np.empty(N)
    >>> for i in range(N):
    >>>     lambdas = np.exp(-betas.dot(X[i, :]))
    >>>     T[i] = piecewise_exponential_survival_data(1, breakpoints, lambdas)[0]
    >>> T_censor = np.minimum(
    >>>     T.mean() * np.random.exponential(size=N), 110
    >>> )  # 110 is the end of observation, eg. current time.
    >>> df = pd.DataFrame(X[:, :-1], columns=["var1", "var2"])
    >>> df["T"] = np.round(np.maximum(np.minimum(T, T_censor), 0.1), 1)
    >>> df["E"] = T <= T_censor
    >>>
    >>> pew = PiecewiseExponentialRegressionFitter(breakpoints=breakpoints, penalizer=0.0001).fit(df, "T", "E")
    >>> pew.print_summary()
    >>> pew.plot()
    """
    if duration_col is None:
        raise TypeError("duration_col cannot be None.")

    self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + " UTC"
    self.duration_col = duration_col
    self.event_col = event_col
    self.weights_col = weights_col
    self._n_examples = df.shape[0]
    self.timeline = timeline
    self.robust = robust

    df = df.copy()

    T = pass_for_numeric_dtypes_or_raise_array(df.pop(duration_col)).astype(float)
    E = (
        pass_for_numeric_dtypes_or_raise_array(df.pop(self.event_col))
        if (self.event_col is not None)
        else pd.Series(np.ones(self._n_examples, dtype=bool), index=df.index, name="E")
    )
    weights = (
        pass_for_numeric_dtypes_or_raise_array(df.pop(self.weights_col)).astype(float)
        if (self.weights_col is not None)
        else pd.Series(np.ones(self._n_examples, dtype=float), index=df.index, name="weights")
    )

    # check to make sure their weights are okay
    if self.weights_col:
        if (weights.astype(int) != weights).any() and not self.robust:
            warnings.warn(
                dedent(
                    """It appears your weights are not integers, possibly propensity or sampling scores then?
It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to
estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"""
                ),
                StatisticalWarning,
            )
        if (weights <= 0).any():
            raise ValueError("values in weight column %s must be positive." % self.weights_col)

    df = df.astype(float)
    self._check_values(df, T, E, self.event_col)
    E = E.astype(bool)

    self.durations = T.copy()
    self.event_observed = E.copy()
    self.weights = weights.copy()

    if np.any(self.durations <= 0):
        raise ValueError(
            "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
        )

    if self.fit_intercept:
        assert "_intercept" not in df
        df["_intercept"] = 1.0

    self._LOOKUP_SLICE = self._create_slicer(len(df.columns))

    _norm_std = df.std(0)
    self._norm_mean = df.mean(0)

    # if we included an intercept, we need to fix not divide by zero.
    if self.fit_intercept:
        _norm_std["_intercept"] = 1.0
    else:
        _norm_std[_norm_std < 1e-8] = 1.0

    _index = pd.MultiIndex.from_tuples(
        sum([[(name, c) for c in df.columns] for name in self._fitted_parameter_names], [])
    )
    self._norm_std = pd.Series(np.concatenate([_norm_std.values] * self.n_breakpoints), index=_index)

    _params, self._log_likelihood, self._hessian_ = self._fit_model(
        T.values,
        E.values,
        weights.values,
        normalize(df, 0, _norm_std).values,
        show_progress=show_progress,
        initial_point=initial_point,
    )
    self.params_ = _params / self._norm_std

    self.variance_matrix_ = self._compute_variance_matrix()
    self.standard_errors_ = self._compute_standard_errors(T.values, E.values, weights.values, df.values)
    self.confidence_intervals_ = self._compute_confidence_intervals()
    self._predicted_cumulative_hazard_ = self.predict_cumulative_hazard(df, times=[np.percentile(T, 75)]).T

    return self
nsamps = 1000
z = mogsamples(nsamps, theta)
lls = moglogpdf(z, theta)
Hmc = -np.mean(lls)
Hmc_hi = Hmc + 3 * np.std(lls) / np.sqrt(nsamps)

# compute bound and store gap
Hbound = lower_bound_MoG(theta)
gaps[i] = Hmc - Hbound

# Hmc should be greater than Hbound
assert Hmc_hi > Hbound, "bound isn't lower ya dope (%2.3f not greater than %2.3f)" % (Hmc_hi, Hbound)

print("Gap percentiles [1, 50, 99] %s" % str(np.percentile(gaps, [1, 50, 99])))

#########################################
#  test per mu_n function and gradient  #
#########################################
n = 0
lbn, lbs = make_lower_bound_MoGn(theta, n, s2min=1e-7)
thn = theta[n, :D]
assert np.isclose(lower_bound_MoG(theta), lbn(thn)), "per n is bad"

from autograd.util import quick_grad_check, nd
quick_grad_check(lbn, thn)

print("Hessian diag, numeric hessian diag")
hlbn = hessian(lbn)
print(np.diag(hlbn(thn)))