def run_tests(n_cases, n_features, sparse, exposure_type, distribution,
              time_drift):
    """Simulate an SCCS dataset with the given settings and check shapes.

    Asserts that `SimuSCCS.simulate()` returns per-case feature matrices,
    labels, censoring and per-feature coefficient arrays of the expected
    sizes, and that no simulated feature matrix is entirely empty.
    """
    n_intervals = 5
    n_lags = np.repeat(2, n_features).astype('uint64')
    sim = SimuSCCS(n_cases, n_intervals, n_features, n_lags, time_drift,
                   exposure_type, distribution, sparse, verbose=False)
    X, X_c, y, c, coeffs = sim.simulate()
    self.assertEqual(len(X), n_cases)
    self.assertEqual(len(y), n_cases)
    self.assertEqual(X[0].shape, (n_intervals, n_features))
    self.assertEqual(y[0].shape, (n_intervals, ))
    self.assertEqual(c.shape, (n_cases, ))
    # Plain loop instead of a list comprehension used only for its
    # side effects: each feature has n_lags[i] + 1 coefficients.
    for i, co in enumerate(coeffs):
        self.assertEqual(co.shape, (int(n_lags[i] + 1), ))
    # No simulated case should have an all-zero exposure matrix,
    # neither before (X) nor after (X_c) censoring.
    self.assertEqual(np.sum([1 for f in X if f.sum() <= 0]), 0)
    self.assertEqual(np.sum([1 for f in X_c if f.sum() <= 0]), 0)
def _bootstrap(self, p_features, p_labels, p_censoring, coeffs, rep,
               confidence):
    """Percentile-bootstrap confidence intervals for the fitted coefficients.

    Parameters are already preprocessed `p_features`, `p_labels` and
    `p_censoring` (WARNING: do not pass raw data here). `coeffs` is
    assumed to be a flat array (the same object as ``self._coeffs``).
    `rep` is the number of bootstrap replicates; `confidence` is the
    confidence level in (0, 1).

    Returns a ``Confidence_intervals`` named structure holding the point
    estimate and the lower/upper bounds, each reformatted per feature.

    NOTE(review): `p_labels` is not used in this body — simulated labels
    replace the observed ones at each replicate; confirm it is kept only
    for signature symmetry with the other private methods.
    """
    if confidence <= 0 or confidence >= 1:
        raise ValueError("`confidence_level` should be in (0, 1)")
    # From here on `confidence` holds alpha = 1 - confidence_level,
    # so the quantile cuts below use alpha / 2 on each tail.
    confidence = 1 - confidence
    if not self._fitted:
        raise RuntimeError('You must fit the model first')
    bootstrap_coeffs = []
    sim = SimuSCCS(self.n_cases, self.n_intervals, self.n_features,
                   self.n_lags, coeffs=self._format_coeffs(coeffs))
    # TODO later: parallelize bootstrap (everything should be pickable...)
    for _ in range(rep):  # replicate index itself is not needed
        # Parametric bootstrap: resimulate outcomes under the fitted
        # coefficients, then refit on the simulated labels.
        y = sim._simulate_multinomial_outcomes(p_features, coeffs)
        self._model_obj.fit(p_features, y, p_censoring)
        bootstrap_coeffs.append(self._fit(True))
    # Sort replicates on the exp scale, take percentile bounds, then map
    # back to the log scale to match the coefficient parameterization.
    bootstrap_coeffs = np.exp(np.array(bootstrap_coeffs))
    bootstrap_coeffs.sort(axis=0)
    lower_bound = np.log(bootstrap_coeffs[int(
        np.floor(rep * confidence / 2))])
    upper_bound = np.log(bootstrap_coeffs[int(
        np.floor(rep * (1 - confidence / 2)))])
    return Confidence_intervals(
        self._format_coeffs(coeffs), self._format_coeffs(lower_bound),
        self._format_coeffs(upper_bound), confidence)
def test_simulated_features(self):
    """simulate_features returns one non-empty matrix per requested case."""
    n_features = 3
    n_lags = np.repeat(2, n_features)
    sim = SimuSCCS(100, 10, n_features, n_lags, None, 'multiple_exposures',
                   verbose=False)
    feat, _ = sim.simulate_features(100)
    self.assertEqual(100, len(feat))
    # Assert instead of the leftover debugging print: no simulated
    # feature matrix should be entirely zero.
    self.assertEqual(0, np.sum([1 for f in feat if f.sum() <= 0]))
def test_grad_loss_consistency(self):
    """Test longitudinal multinomial model gradient properties."""
    n_intervals = 16
    n_lags = 4
    sim = SimuSCCS(500, n_intervals, 3, n_lags, None, True, "infinite",
                   seed=42, verbose=False)
    X, y, censoring, coeffs = sim.simulate()
    X = LongitudinalFeaturesLagger(n_lags=n_lags) \
        .fit_transform(X, censoring)
    # Gradient must be consistent on the dense and the sparse
    # representation alike.
    for features in (X, [csr_matrix(x) for x in X]):
        model = ModelSCCS(n_intervals=n_intervals,
                          n_lags=n_lags).fit(features, y, censoring)
        self._test_grad(model, coeffs)
def test_convergence_with_lags(self):
    """Test longitudinal multinomial model convergence."""
    n_intervals = 10
    n_lags = 3
    n_samples = 1500
    n_features = 3
    sim = SimuSCCS(n_samples, n_intervals, n_features, n_lags, None, True,
                   "short", seed=42, verbose=False)
    X, y, censoring, coeffs = sim.simulate()
    # Lag the features before fitting, as the model expects.
    lagger = LongitudinalFeaturesLagger(n_lags=n_lags)
    X = lagger.fit_transform(X, censoring)
    model = ModelSCCS(n_intervals=n_intervals, n_lags=n_lags)
    model.fit(X, y, censoring)
    svrg = SVRG(max_iter=15, verbose=False)
    svrg.set_model(model).set_prox(ProxZero())
    estimated = svrg.solve(step=1 / model.get_lip_max())
    # SVRG without penalization should recover the simulated coefficients.
    np.testing.assert_almost_equal(coeffs, estimated, decimal=1)
def test_censoring(self):
    """_censor_array_list zeroes every row from the censoring index on."""
    censoring = np.arange(3)
    arrays = [np.ones((2, 3)) for _ in range(3)]
    # Expected result: the first `c` rows stay at one, the rest drop
    # to zero (c = 0 wipes the whole matrix).
    expected = []
    for c in censoring:
        exp = np.zeros((2, 3))
        exp[:c] += 1
        expected.append(exp)
    output = SimuSCCS._censor_array_list(arrays, censoring)
    for out, exp in zip(output, expected):
        np.testing.assert_equal(out, exp)
def test_LearnerSCCS_fit(self):
    """Unpenalized ConvSCCS should recover the simulated coefficients."""
    seed = 42
    n_lags = np.repeat(2, 2).astype('uint64')
    sim = SimuSCCS(n_cases=800, n_intervals=10, n_features=2,
                   n_lags=n_lags, verbose=False, seed=seed,
                   exposure_type='multiple_exposures')
    features, _, labels, censoring, coeffs = sim.simulate()
    learner = ConvSCCS(n_lags=n_lags, penalized_features=[], tol=0,
                       max_iter=10, random_state=seed)
    fitted_coeffs, _ = learner.fit(features, labels, censoring)
    # Compare the flattened coefficient vectors to one decimal place.
    np.testing.assert_almost_equal(
        np.hstack(fitted_coeffs), np.hstack(coeffs), decimal=1)
def setUp(self):
    """Simulate a small SCCS dataset shared by the tests of this case."""
    self.n_lags = np.repeat(1, 2).astype('uint64')
    self.seed = 42
    self.coeffs = [np.log(np.array([2.1, 2.5])),
                   np.log(np.array([.8, .5]))]
    self.n_features = len(self.n_lags)
    self.n_correlations = 2
    # Create data. Note: self.coeffs is replaced below by the
    # coefficients actually used in the simulation.
    sim = SimuSCCS(n_cases=500, n_intervals=10,
                   n_features=self.n_features, n_lags=self.n_lags,
                   verbose=False, seed=self.seed, coeffs=self.coeffs,
                   n_correlations=self.n_correlations)
    simulated = sim.simulate()
    (_, self.features, self.labels, self.censoring,
     self.coeffs) = simulated
def test_grad_loss_consistency(self):
    """Test longitudinal multinomial model gradient properties."""
    n_intervals = 36
    n_lags = np.repeat(9, 3).astype(dtype="uint64")
    sim = SimuSCCS(500, n_intervals, 3, n_lags, None, "single_exposure",
                   seed=42, verbose=False)
    _, X, y, censoring, coeffs = sim.simulate()
    coeffs = np.hstack(coeffs)
    X, _, _ = LongitudinalFeaturesLagger(n_lags=n_lags) \
        .fit_transform(X, censoring)
    # The gradient check must pass on the dense matrices and on their
    # CSR counterparts alike.
    for features in (X, [csr_matrix(x) for x in X]):
        model = ModelSCCS(n_intervals=n_intervals,
                          n_lags=n_lags).fit(features, y, censoring)
        self._test_grad(model, coeffs)
def run_tests(n_samples, n_features, sparse, exposure_type, distribution,
              first_tick_only, censoring):
    """Simulate with the given settings and check every output shape."""
    n_intervals = 5
    n_lags = 2
    sim = SimuSCCS(n_samples, n_intervals, n_features, n_lags, None,
                   sparse, exposure_type, distribution, first_tick_only,
                   censoring, seed=42, verbose=False)
    features, labels, censor, coefficients = sim.simulate()
    self.assertEqual(len(features), n_samples)
    self.assertEqual(len(labels), n_samples)
    self.assertEqual(features[0].shape, (n_intervals, n_features))
    self.assertEqual(labels[0].shape, (n_intervals, ))
    self.assertEqual(censor.shape, (n_samples, ))
    # Flat coefficient vector: (n_lags + 1) entries per feature.
    self.assertEqual(coefficients.shape, (n_features * (n_lags + 1), ))
def test_convergence_with_lags(self):
    """Test longitudinal multinomial model convergence."""
    n_intervals = 10
    n_samples = 800
    n_features = 2
    n_lags = np.repeat(2, n_features).astype(dtype="uint64")
    # verbose=False for consistency with the sibling simulation tests,
    # so the runner output stays clean.
    sim = SimuSCCS(n_samples, n_intervals, n_features, n_lags, None,
                   "multiple_exposures", seed=42, verbose=False)
    _, X, y, censoring, coeffs = sim.simulate()
    coeffs = np.hstack(coeffs)
    X, _, _ = LongitudinalFeaturesLagger(n_lags=n_lags) \
        .fit_transform(X, censoring)
    model = ModelSCCS(n_intervals=n_intervals,
                      n_lags=n_lags).fit(X, y, censoring)
    solver = SVRG(max_iter=15, verbose=False)
    solver.set_model(model).set_prox(ProxZero())
    coeffs_svrg = solver.solve(step=1 / model.get_lip_max())
    # Unpenalized SVRG should recover the simulated coefficients.
    np.testing.assert_almost_equal(coeffs, coeffs_svrg, decimal=1)
# NOTE(review): `exec` on compiled strings creates `sim_effects` and
# `time_drift` dynamically. Safe only as long as `effects_str` and
# `time_drift_str` are generated locally, never taken from untrusted input.
effects_compiled = compile(effects_str, "<string>", "exec")
exec(effects_compiled)  # create sim_effects
td_compiled = compile(time_drift_str, "<string>", "exec")
exec(td_compiled)  # create time_drift
n_features = len(sim_effects)
sim_effects = np.hstack(sim_effects)
# Coefficients are the log of the simulated relative incidences.
coeffs = np.log(sim_effects)
# Normalize the exponentiated drift over 750 intervals to sum to one.
# NOTE(review): `normalized_time_drift` is not used in this fragment —
# presumably consumed further down the script; confirm before removing.
normalized_time_drift = np.exp(time_drift(np.arange(750)))
normalized_time_drift /= normalized_time_drift.sum()
sim = SimuSCCS(
    int(n_cases),
    n_intervals,
    n_features,
    n_lags,
    time_drift=time_drift,
    n_correlations=n_features,
    coeffs=coeffs,
    seed=seed,
    verbose=False,
)
# `coeffs` is rebound here to the coefficients returned by the simulation.
features, censored_features, labels, censoring, coeffs = sim.simulate()
# Serialize the simulated Hawkes adjacency as raw bytes (for storage).
adjacency_matrix = sim.hawkes_exp_kernels.adjacency.tobytes()
# Convert to R format
df = to_nonparasccs(censored_features, labels, censoring, lags)
df["indiv"] = df.index
df = df.astype("int64")
# Custom lagged effects over lags + 1 coefficients (lag 0 .. lags).
# NOTE(review): `ce` is not referenced directly below — presumably used
# inside the exec'd `effects_str`; confirm before removing.
ce = CustomEffects(lags + 1)
# NOTE(review): `exec` on compiled strings creates `sim_effects` and
# `time_drift` dynamically. Safe only as long as `effects_str` and
# `time_drift_str` are generated locally, never taken from untrusted input.
effects_compiled = compile(effects_str, "<string>", "exec")
exec(effects_compiled)  # create sim_effects
td_compiled = compile(time_drift_str, "<string>", "exec")
exec(td_compiled)  # create time_drift
n_features = len(sim_effects)
sim_effects = np.hstack(sim_effects)
# Coefficients are the log of the simulated relative incidences.
coeffs = np.log(sim_effects)
# Normalize the exponentiated drift over 750 intervals to sum to one.
# NOTE(review): `normalized_time_drift` is not used in this fragment —
# presumably consumed further down the script; confirm before removing.
normalized_time_drift = np.exp(time_drift(np.arange(750)))
normalized_time_drift /= normalized_time_drift.sum()
# One extra feature beyond the simulated effects, same lag count each.
n_lags = np.repeat(lags, n_features + 1).astype("uint64")
sim = SimuSCCS(n_cases, n_intervals, n_features + 1, n_lags,
               time_drift=time_drift,
               exposure_type="multiple_exposures",
               n_correlations=n_corr, seed=seed)
# `coeffs` is rebound here to the coefficients returned by the simulation.
features, censored_features, labels, censoring, coeffs = sim.simulate()
# Serialize the simulated Hawkes adjacency as raw bytes (for storage).
adjacency_matrix = sim.hawkes_exp_kernels.adjacency.tobytes()
# Convert to R format
df = to_nonparasccs(censored_features, labels, censoring, lags)
df["indiv"] = df.index
df = df.astype("int64")
exposures_frequencies = df.drugid.value_counts()