def test_summary(self):
    # Smoke test: summary() must run for a plain fit, a stratified fit,
    # a fit with groups, and a fit with entry times.  The stratified
    # and entry-time summaries must also contain their notes.
    np.random.seed(34234)
    nobs = 200
    time = 50 * np.random.uniform(size=nobs)
    status = np.random.randint(0, 2, nobs).astype(np.float64)
    exog = np.random.normal(size=(nobs, 4))

    # Plain model: only check that the summary can be produced.
    PHReg(time, exog, status).fit().summary()

    # 50 strata of 4 consecutive observations each.
    strata = np.kron(np.arange(50), np.ones(4))
    strat_fit = PHReg(time, exog, status, strata=strata).fit()
    strat_text = str(strat_fit.summary())
    note = "3 strata dropped for having no events"
    assert_(note in strat_text)

    # 25 groups of 8 consecutive observations each, given to fit().
    groups = np.kron(np.arange(25), np.ones(8))
    PHReg(time, exog, status).fit(groups=groups).summary()

    # Every observation gets an entry time that is a fraction
    # (drawn from [0.1, 0.8)) of its event time.
    entry = np.random.uniform(0.1, 0.8, nobs) * time
    entry_fit = PHReg(time, exog, status, entry=entry).fit()
    entry_text = str(entry_fit.summary())
    note = "200 observations have positive entry times"
    assert_(note in entry_text)
def test_summary(self):
    # Smoke test of summary() across four model configurations; two of
    # them are additionally checked for an explanatory note in the text.
    np.random.seed(34234)
    nobs = 200
    time = 50 * np.random.uniform(size=nobs)
    status = np.random.randint(0, 2, nobs).astype(np.float64)
    exog = np.random.normal(size=(nobs, 4))

    # 1. No strata, groups or entry times.
    PHReg(time, exog, status).fit().summary()

    # 2. Strata: each label 0..49 repeated four times in a row.
    strata = np.kron(np.arange(50), np.ones(4))
    summary_text = str(
        PHReg(time, exog, status, strata=strata).fit().summary())
    expected_note = "3 strata dropped for having no events"
    assert_(expected_note in summary_text)

    # 3. Groups passed to fit(): each label 0..24 repeated eight times.
    groups = np.kron(np.arange(25), np.ones(8))
    PHReg(time, exog, status).fit(groups=groups).summary()

    # 4. Entry times: a random fraction in [0.1, 0.8) of each time.
    entry = np.random.uniform(0.1, 0.8, nobs) * time
    summary_text = str(
        PHReg(time, exog, status, entry=entry).fit().summary())
    expected_note = "200 observations have positive entry times"
    assert_(expected_note in summary_text)
def test_formula(self):
    # The formula interface must give the same parameters and standard
    # errors as the array interface, both when status/entry are passed
    # as arrays and when they are passed as data frame column names.
    np.random.seed(34234)
    nobs = 200
    time = 50 * np.random.uniform(size=nobs)
    status = np.random.randint(0, 2, nobs).astype(np.float64)
    exog = np.random.normal(size=(nobs, 4))

    # Only the first ten observations get a non-zero entry time.
    entry = np.zeros_like(time)
    entry[:10] = time[:10] / 2

    columns = {"time": time, "status": status}
    for j in range(4):
        columns["exog%d" % (j + 1)] = exog[:, j]
    columns["entry"] = entry
    df = pd.DataFrame(columns)

    array_fit = PHReg(time, exog, status, entry=entry).fit()

    fml = "time ~ 0 + exog1 + exog2 + exog3 + exog4"
    fit_from_arrays = PHReg.from_formula(
        fml, df, status=status, entry=entry).fit()
    fit_from_names = PHReg.from_formula(
        fml, df, status="status", entry="entry").fit()

    # Same comparison order as before: params first, then bse.
    for attr in ("params", "bse"):
        for formula_fit in (fit_from_arrays, fit_from_names):
            assert_allclose(getattr(array_fit, attr),
                            getattr(formula_fit, attr))
def test_offset(self):
    # Moving the first covariate into the offset, scaled by its fitted
    # coefficient, must reproduce the remaining coefficients.
    np.random.seed(34234)
    nobs = 200
    time = 50 * np.random.uniform(size=nobs)
    status = np.random.randint(0, 2, nobs).astype(np.float64)
    exog = np.random.normal(size=(nobs, 4))

    full_fit = PHReg(time, exog, status).fit()

    # The first column's contribution becomes a fixed offset and the
    # model is refit on the other columns only.
    fixed_part = exog[:, 0] * full_fit.params[0]
    reduced_exog = exog[:, 1:]
    reduced_fit = PHReg(
        time, reduced_exog, status, offset=fixed_part).fit()

    assert_allclose(reduced_fit.params, full_fit.params[1:])
def test_offset(self):
    # Fit with all covariates, then refit with the first covariate's
    # fitted contribution supplied as an offset; the coefficients of
    # the other covariates must agree between the two fits.
    np.random.seed(34234)
    nobs = 200
    time = 50 * np.random.uniform(size=nobs)
    status = np.random.randint(0, 2, nobs).astype(np.float64)
    exog = np.random.normal(size=(nobs, 4))

    baseline_fit = PHReg(time, exog, status).fit()
    baseline_params = baseline_fit.params

    offset = exog[:, 0] * baseline_params[0]
    offset_fit = PHReg(time, exog[:, 1:], status, offset=offset).fit()

    assert_allclose(offset_fit.params, baseline_params[1:])
def test_formula(self):
    # Array-based and formula-based construction must agree on params
    # and bse.  The formula model is built twice: once with status and
    # entry given as arrays, once as column names of the data frame.
    np.random.seed(34234)
    nobs = 200
    time = 50 * np.random.uniform(size=nobs)
    status = np.random.randint(0, 2, nobs).astype(np.float64)
    exog = np.random.normal(size=(nobs, 4))

    # Entry time is half the event time for the first ten rows and
    # zero everywhere else.
    entry = np.zeros_like(time)
    entry[:10] = time[:10] / 2

    frame_data = {"time": time, "status": status}
    frame_data.update(
        ("exog%d" % (k + 1), exog[:, k]) for k in range(4))
    frame_data["entry"] = entry
    df = pd.DataFrame(frame_data)

    reference = PHReg(time, exog, status, entry=entry).fit()

    fml = "time ~ 0 + exog1 + exog2 + exog3 + exog4"
    candidates = (
        PHReg.from_formula(fml, df, status=status, entry=entry).fit(),
        PHReg.from_formula(fml, df, status="status", entry="entry").fit(),
    )

    for candidate in candidates:
        assert_allclose(reference.params, candidate.params)
    for candidate in candidates:
        assert_allclose(reference.bse, candidate.bse)
def test_summary(self):
    # Smoke test: fitting a model and building its summary must not
    # raise.  Nothing about the summary content is checked.
    np.random.seed(34234)
    nobs = 200
    time = 50 * np.random.uniform(size=nobs)
    status = np.random.randint(0, 2, nobs).astype(np.float64)
    exog = np.random.normal(size=(nobs, 4))

    PHReg(time, exog, status).fit().summary()
def test_summary(self):
    # Smoke test only: summary() is called for its side effect of not
    # raising; the returned object is discarded.
    np.random.seed(34234)
    nobs = 200
    time = 50 * np.random.uniform(size=nobs)
    status = np.random.randint(0, 2, nobs).astype(np.float64)
    exog = np.random.normal(size=(nobs, 4))

    fitted = PHReg(time, exog, status).fit()
    fitted.summary()
def test_post_estimation(self):
    # All regression tests: each post-estimation quantity is reduced
    # to a few absolute-value summaries and compared with stored
    # numbers.
    np.random.seed(34234)
    nobs = 200
    time = 50 * np.random.uniform(size=nobs)
    status = np.random.randint(0, 2, nobs).astype(np.float64)
    exog = np.random.normal(size=(nobs, 4))

    result = PHReg(time, exog, status).fit()

    # Martingale residuals: sum of absolute values.
    abs_mart_total = np.abs(result.martingale_residuals).sum()
    assert_allclose(abs_mart_total, 120.72475743348433)

    # Weighted covariate averages: first element, column sums of
    # absolute values.
    covariate_avgs = result.weighted_covariate_averages
    assert_allclose(
        np.abs(covariate_avgs[0]).sum(0),
        np.r_[7.31008415, 9.77608674, 10.89515885, 13.1106801])

    # Baseline cumulative hazard: mean absolute value of each
    # component of the first element.
    baseline = result.baseline_cumulative_hazard
    observed = [np.mean(np.abs(part)) for part in baseline[0]]
    expected = np.r_[23.482841556421608, 0.44149255358417017,
                     0.68660114081275281]
    assert_allclose(observed, expected)

    # Score residuals: column means of absolute values.  The stored
    # numbers are deliberately kept as the first argument, as before.
    expected = np.r_[0.50924792, 0.4533952, 0.4876718, 0.5441128]
    observed = np.abs(result.score_residuals).mean(0)
    assert_allclose(expected, observed)

    # Refit with groups passed to fit() and check the parameter
    # covariance (stored numbers again first).
    groups = np.random.randint(0, 3, nobs)
    result = PHReg(time, exog, status).fit(groups=groups)
    expected = [0.00513432, 0.01278423, 0.00810427, 0.00293147]
    observed = np.abs(result.cov_params()).mean(0)
    assert_allclose(expected, observed, rtol=1e-6)

    # Schoenfeld residuals of the grouped fit: only rows that are
    # finite in every column enter the comparison.
    schoenfeld = result.schoenfeld_residuals
    finite_rows = np.flatnonzero(np.isfinite(schoenfeld).all(1))
    schoenfeld = schoenfeld[finite_rows, :]
    expected = np.r_[0.85154336, 0.72993748, 0.73758071, 0.78599333]
    assert_allclose(np.abs(schoenfeld).mean(0), expected)
def test_post_estimation(self):
    # All regression tests.  Residuals, hazards and covariances are
    # summarised through absolute values and checked against numbers
    # recorded in this test.
    np.random.seed(34234)
    nobs = 200
    time = 50 * np.random.uniform(size=nobs)
    status = np.random.randint(0, 2, nobs).astype(np.float64)
    exog = np.random.normal(size=(nobs, 4))

    plain_fit = PHReg(time, exog, status).fit()

    # Martingale residuals.
    martingale = plain_fit.martingale_residuals
    assert_allclose(np.abs(martingale).sum(), 120.72475743348433)

    # Weighted covariate averages (first element only).
    first_avgs = plain_fit.weighted_covariate_averages[0]
    stored = np.r_[7.31008415, 9.77608674, 10.89515885, 13.1106801]
    assert_allclose(np.abs(first_avgs).sum(0), stored)

    # Baseline cumulative hazard (first element only).
    first_hazard = plain_fit.baseline_cumulative_hazard[0]
    computed = [np.mean(np.abs(piece)) for piece in first_hazard]
    stored = np.r_[23.482841556421608, 0.44149255358417017,
                   0.68660114081275281]
    assert_allclose(computed, stored)

    # Score residuals.  Argument order (stored, computed) is unchanged
    # from the earlier form of this test.
    stored = np.r_[0.50924792, 0.4533952, 0.4876718, 0.5441128]
    computed = np.abs(plain_fit.score_residuals).mean(0)
    assert_allclose(stored, computed)

    # Parameter covariance when groups are supplied to fit().
    groups = np.random.randint(0, 3, nobs)
    grouped_fit = PHReg(time, exog, status).fit(groups=groups)
    stored = [0.00513432, 0.01278423, 0.00810427, 0.00293147]
    computed = np.abs(grouped_fit.cov_params()).mean(0)
    assert_allclose(stored, computed, rtol=1e-6)

    # Schoenfeld residuals of the grouped fit, restricted to rows in
    # which every entry is finite.
    s_resid = grouped_fit.schoenfeld_residuals
    keep = np.flatnonzero(np.isfinite(s_resid).all(1))
    s_resid = s_resid[keep, :]
    stored = np.r_[0.85154336, 0.72993748, 0.73758071, 0.78599333]
    assert_allclose(np.abs(s_resid).mean(0), stored)
def test_predict(self):
    # All smoke tests.  We should be able to convert the lhr and hr
    # tests into real tests against R.  There are many options to
    # this function that may interact in complicated ways.  Only a
    # few key combinations are tested here.
    np.random.seed(34234)
    nobs = 200
    endog = 50 * np.random.uniform(size=nobs)
    status = np.random.randint(0, 2, nobs).astype(np.float64)
    exog = np.random.normal(size=(nobs, 4))

    result = PHReg(endog, exog, status).fit()

    # Default arguments.
    result.predict()

    # For each prediction type: model data, new endog only, and new
    # endog together with new exog (first ten rows in both cases).
    head_endog = endog[0:10]
    head_exog = exog[0:10, :]
    for pred_type in ("lhr", "hr", "cumhaz", "surv"):
        result.predict(pred_type=pred_type)
        result.predict(endog=head_endog, pred_type=pred_type)
        result.predict(endog=head_endog, exog=head_exog,
                       pred_type=pred_type)
def do1(self, fname, ties, entry_f, strata_f):
    # Fit PHReg to one test data file in four configurations (plain,
    # entry times, strata, entry times plus strata) and compare the
    # coefficients and standard errors with the values returned by
    # get_results.
    #
    # fname: data file name; its third and fourth "_"-separated fields
    #     are parsed as integers n and p.  n is used as the sample
    #     size when building the strata; p is passed to get_results
    #     (presumably the number of covariates -- confirm).
    # ties: tie-handling option passed to PHReg; its first three
    #     characters select the reference results.
    # entry_f, strata_f: not used in this function; kept so existing
    #     callers keep working.
    #
    # NOTE(review): `args` is not defined here; presumably a
    # module-level dict of keyword arguments for fit().

    # Read the test data.
    time, status, entry, exog = self.load_file(fname)

    # Problem dimensions are taken from the file name.
    fields = fname.split("_")
    n = int(fields[2])
    p = int(fields[3].split(".")[0])
    ties1 = ties[0:3]

    # Needs to match the kronecker statement in survival.R
    strata = np.kron(range(5), np.ones(n // 5))

    # No stratification or entry times
    phrb = PHReg(time, exog, status, ties=ties).fit(**args)
    coef, se, _, _ = get_results(n, p, None, ties1)
    assert_allclose(phrb.params, coef, rtol=1e-3)
    assert_allclose(phrb.bse, se, rtol=1e-4)
    # Smoke test: the first baseline hazard entry has three parts.
    _, _, _ = phrb.baseline_cumulative_hazard[0]

    # Entry times but no stratification
    phrb = PHReg(time, exog, status, entry=entry,
                 ties=ties).fit(**args)
    coef, se, _, _ = get_results(n, p, "et", ties1)
    assert_allclose(phrb.params, coef, rtol=1e-3)
    assert_allclose(phrb.bse, se, rtol=1e-3)

    # Stratification but no entry times
    phrb = PHReg(time, exog, status, strata=strata,
                 ties=ties).fit(**args)
    coef, se, _, _ = get_results(n, p, "st", ties1)
    assert_allclose(phrb.params, coef, rtol=1e-4)
    assert_allclose(phrb.bse, se, rtol=1e-4)

    # Stratification and entry times
    phrb = PHReg(time, exog, status, entry=entry,
                 strata=strata, ties=ties).fit(**args)
    coef, se, _, _ = get_results(n, p, "et_st", ties1)
    assert_allclose(phrb.params, coef, rtol=1e-3)
    assert_allclose(phrb.bse, se, rtol=1e-4)

    # Smoke test
    _, _, _ = phrb.baseline_cumulative_hazard[0]
def do1(fname, ties, entry_f, strata_f):
    # Fit PHReg to one test data file in four configurations (plain,
    # entry times, strata, entry times plus strata) and compare the
    # coefficients and standard errors with the values returned by
    # get_results.
    #
    # fname: data file name; its third and fourth "_"-separated fields
    #     are parsed as integers n and p.  n is used as the sample
    #     size when building the strata; p is passed to get_results
    #     (presumably the number of covariates -- confirm).
    # ties: tie-handling option passed to PHReg; its first three
    #     characters select the reference results.
    # entry_f, strata_f: not used in this function; kept so existing
    #     callers keep working.
    #
    # NOTE(review): `args` is not defined here; presumably a
    # module-level dict of keyword arguments for fit().

    # Read the test data.
    time, status, entry, exog = TestPHReg.load_file(fname)

    # Problem dimensions are taken from the file name.
    fields = fname.split("_")
    n = int(fields[2])
    p = int(fields[3].split(".")[0])
    ties1 = ties[0:3]

    # Needs to match the kronecker statement in survival.R
    strata = np.kron(range(5), np.ones(n // 5))

    # No stratification or entry times
    phrb = PHReg(time, exog, status, ties=ties).fit(**args)
    coef, se, _, _ = get_results(n, p, None, ties1)
    assert_allclose(phrb.params, coef, rtol=1e-3)
    assert_allclose(phrb.bse, se, rtol=1e-4)
    # Smoke test: the first baseline hazard entry has three parts.
    _, _, _ = phrb.baseline_cumulative_hazard[0]

    # Entry times but no stratification
    phrb = PHReg(time, exog, status, entry=entry,
                 ties=ties).fit(**args)
    coef, se, _, _ = get_results(n, p, "et", ties1)
    assert_allclose(phrb.params, coef, rtol=1e-3)
    assert_allclose(phrb.bse, se, rtol=1e-3)

    # Stratification but no entry times
    phrb = PHReg(time, exog, status, strata=strata,
                 ties=ties).fit(**args)
    coef, se, _, _ = get_results(n, p, "st", ties1)
    assert_allclose(phrb.params, coef, rtol=1e-4)
    assert_allclose(phrb.bse, se, rtol=1e-4)

    # Stratification and entry times
    phrb = PHReg(time, exog, status, entry=entry,
                 strata=strata, ties=ties).fit(**args)
    coef, se, _, _ = get_results(n, p, "et_st", ties1)
    assert_allclose(phrb.params, coef, rtol=1e-3)
    assert_allclose(phrb.bse, se, rtol=1e-4)

    # Smoke test
    _, _, _ = phrb.baseline_cumulative_hazard[0]
def test_get_distribution(self):
    # Smoke test: the object returned by get_distribution() must
    # support mean(), var(), std() and rvs() without raising.  The
    # returned values are not checked.
    np.random.seed(34234)
    exog = np.random.normal(size=(200, 2))
    lin_pred = exog.sum(1)
    elin_pred = np.exp(-lin_pred)

    # -log(U) is a unit exponential draw, so each time is an
    # exponential draw with mean elin_pred.
    time = -elin_pred * np.log(np.random.uniform(size=200))

    mod = PHReg(time, exog)
    rslt = mod.fit()

    dist = rslt.get_distribution()

    dist.mean()
    dist.var()
    dist.std()
    dist.rvs()
def test_predict(self):
    # All smoke tests.  We should be able to convert the lhr and hr
    # tests into real tests against R.  There are many options to
    # this function that may interact in complicated ways.  Only a
    # few key combinations are tested here.
    np.random.seed(34234)
    nobs = 200
    endog = 50 * np.random.uniform(size=nobs)
    status = np.random.randint(0, 2, nobs).astype(np.float64)
    exog = np.random.normal(size=(nobs, 4))

    fitted = PHReg(endog, exog, status).fit()
    fitted.predict()

    # Three calls per prediction type: defaults, first ten endog
    # values, and first ten endog values with the matching exog rows.
    first_ten = slice(0, 10)
    for pred_type in ('lhr', 'hr', 'cumhaz', 'surv'):
        fitted.predict(pred_type=pred_type)
        fitted.predict(endog=endog[first_ten], pred_type=pred_type)
        fitted.predict(endog=endog[first_ten], exog=exog[first_ten, :],
                       pred_type=pred_type)
def test_get_distribution(self):
    # Smoke test: with status and strata supplied, the object returned
    # by get_distribution() must support mean(), var(), std() and
    # rvs() without raising.  The returned values are not checked.
    np.random.seed(34234)
    n = 200
    exog = np.random.normal(size=(n, 2))
    lin_pred = exog.sum(1)
    elin_pred = np.exp(-lin_pred)

    # -log(U) is a unit exponential draw, so each time is an
    # exponential draw with mean elin_pred.
    time = -elin_pred * np.log(np.random.uniform(size=n))

    # The first 20 observations get status 0 (presumably censored --
    # confirm against PHReg's status coding); all others get 1.
    status = np.ones(n)
    status[0:20] = 0

    # Five strata of n // 5 consecutive observations each.
    strata = np.kron(range(5), np.ones(n // 5))

    mod = PHReg(time, exog, status=status, strata=strata)
    rslt = mod.fit()

    dist = rslt.get_distribution()

    dist.mean()
    dist.var()
    dist.std()
    dist.rvs()
def core(X, Y1, Y2, Z=None):
    """
    Fit a Cox proportional hazards model and tabulate survival curves.

    Parameters
    ----------
    X : pd.DataFrame
        Covariates, categorical or quantitative (e.g. drug combination
        type, age).
    Y1 : pd.Series or np.ndarray
        Survival time (quantitative).
    Y2 : pd.Series or np.ndarray
        Survival status, restricted to 0 or 1.  It is passed to PHReg
        as ``status`` and to SurvfuncRight as its second argument.
        NOTE(review): the original note read "1 = alive, 0 = dead";
        statsmodels documents status=1 as "event occurred" -- confirm
        which coding callers actually use.
    Z : pd.Series or np.ndarray, optional
        Categorical grouping variable.  It is only used to split the
        survival curves; it is not passed to the Cox model.  Any other
        value (including None) puts all observations into one group
        labelled with the empty string.

    Returns
    -------
    dict
        Three DataFrames under the three keys of the return statement:
        the per-group survival probabilities indexed by time, the
        coefficient table from the model summary with an extra
        exp(params) column, and a one-row table built from
        ``res.df_model`` and ``res.llf``.
    """
    X = X.reset_index(drop=True)

    # Arrays are wrapped in a Series; anything else is assumed to be a
    # Series and is re-indexed from 0 so that the joins below align.
    if isinstance(Y1, np.ndarray):
        Y1 = pd.Series(Y1, name='futime')
    else:
        Y1 = Y1.reset_index(drop=True)
    if isinstance(Y2, np.ndarray):
        Y2 = pd.Series(Y2, name='death')
    else:
        Y2 = Y2.reset_index(drop=True)
    if isinstance(Z, np.ndarray):
        Z = pd.Series(Z, name='class')
    elif isinstance(Z, pd.Series):
        Z = Z.reset_index(drop=True)
    else:
        # No usable grouping: a single group with an empty label.
        Z = pd.Series([''] * len(Y1), name='class')

    mod = PHReg(Y1, X, status=Y2)
    res = mod.fit()

    # Coefficient table: second table of the model summary.
    tables = res.summary().tables
    dfinfo1 = tables[1]
    dfinfo1.index.name = '项'
    # NOTE(review): the list assignment on the next statement likely
    # replaces the column index and drops this name -- confirm.
    dfinfo1.columns.name = '参数类型'
    dfinfo1.columns = [
        '回归系数', '标准误差SE', '风险比HR', 'Z值', 'p值',
        '95%CI(下限)', '95%CI(上限)'
    ]
    dfinfo1['or值'] = np.exp(res.params)
    dfinfo1 = dfinfo1.round(3)

    # One-row table; the second entry is filled with res.llf.
    # NOTE(review): its label reads "likelihood-ratio chi-square" --
    # confirm that llf is the intended quantity.
    tb2 = {'df': res.df_model, '似然比卡方值': res.llf}
    dfinfo2 = pd.DataFrame([tb2]).round(3)
    dfinfo2 = dfinfo2.set_index('似然比卡方值')

    # Survival rate curves, one column per group of Z.
    D = Y1.to_frame(name='futime').join(Y2.to_frame(name='death')).join(
        Z.to_frame(name='class'))
    classes = []
    for label, group in D.groupby("class"):
        sf = sm.SurvfuncRight(group["futime"], group["death"]).summary()
        sl = sf['Surv prob']
        sl.index.name = '生存时间'
        sl.name = str(label) + '_生存率'
        classes.append(sl.to_frame())
    df_sl = pd.concat(classes, axis=1)

    rr = {'生存函数曲线': df_sl, 'Cox回归模型分析结果汇总': dfinfo1,
          'Cox回归模型似然比检验结果': dfinfo2}
    return rr