def test_bmt():
    # All tests against SAS
    # Results taken from here:
    # http://support.sas.com/documentation/cdl/en/statug/68162/HTML/default/viewer.htm#statug_lifetest_details03.htm

    # Confidence intervals for the 25% percentile of the survival
    # distribution (for "ALL" subjects), taken from the SAS web site
    cb = {"linear": [107, 276], "cloglog": [86, 230], "log": [107, 332],
          "asinsqrt": [104, 276], "logit": [104, 230]}

    dfa = bmt[bmt.Group == "ALL"]

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    fp = os.path.join(cur_dir, 'results', 'bmt_results.csv')
    rslt = pd.read_csv(fp)

    sf = SurvfuncRight(dfa["T"].values, dfa.Status.values)

    assert_allclose(sf.surv_times, rslt.t)
    assert_allclose(sf.surv_prob, rslt.s, atol=1e-4, rtol=1e-4)
    assert_allclose(sf.surv_prob_se, rslt.se, atol=1e-4, rtol=1e-4)

    for method in "linear", "cloglog", "log", "logit", "asinsqrt":
        lcb, ucb = sf.quantile_ci(0.25, method=method)
        assert_allclose(cb[method], np.r_[lcb, ucb])
def test_kernel_survfunc3():
    # Cases with tied times
    n = 100
    np.random.seed(3434)
    x = np.random.normal(size=(n, 3))
    time = np.random.randint(0, 10, size=n)
    status = np.random.randint(0, 2, size=n)
    SurvfuncRight(time, status, exog=x, bw_factor=10000)
    SurvfuncRight(time, status, exog=x, bw_factor=np.r_[10000, 10000])
def test_survfunc_entry_2():
    # entry = 0 is equivalent to no entry time
    times = np.r_[1, 3, 3, 5, 5, 7, 7, 8, 8, 9, 10, 10]
    status = np.r_[1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1]
    entry = np.r_[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

    sf = SurvfuncRight(times, status, entry=entry)
    sf0 = SurvfuncRight(times, status)

    assert_allclose(sf.n_risk, sf0.n_risk)
    assert_allclose(sf.surv_times, sf0.surv_times)
    assert_allclose(sf.surv_prob, sf0.surv_prob)
    assert_allclose(sf.surv_prob_se, sf0.surv_prob_se)
def test_kernel_survfunc2():
    # Check that when the bandwidth is very large, the kernel
    # procedure agrees with standard KM.  (Note: the results do not
    # agree perfectly when there are tied times.)
    n = 100
    np.random.seed(3434)
    x = np.random.normal(size=(n, 3))
    time = np.random.uniform(0, 10, size=n)
    status = np.random.randint(0, 2, size=n)

    resultkm = SurvfuncRight(time, status)
    result = SurvfuncRight(time, status, exog=x, bw_factor=10000)

    assert_allclose(resultkm.surv_times, result.surv_times)
    assert_allclose(resultkm.surv_prob, result.surv_prob,
                    rtol=1e-6, atol=1e-6)
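# Why the large-bandwidth check above works: bw_factor scales the covariate
# bandwidths used to form the kernel weights, so a very large factor gives
# every subject nearly equal weight and the conditional (kernel) estimator
# should collapse to the unweighted Kaplan-Meier curve, up to ties.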
def test_simultaneous_cb():
    # The exact numbers here are regression tests, but they are close
    # to page 103 of Klein and Moeschberger.

    df = bmt.loc[bmt["Group"] == "ALL", :]
    sf = SurvfuncRight(df["T"], df["Status"])
    lcb1, ucb1 = sf.simultaneous_cb(transform="log")
    lcb2, ucb2 = sf.simultaneous_cb(transform="arcsin")

    ti = sf.surv_times.tolist()
    ix = [ti.index(x) for x in (110, 122, 129, 172)]
    assert_allclose(lcb1[ix], np.r_[0.43590582, 0.42115592, 0.4035897,
                                    0.38785927])
    assert_allclose(ucb1[ix], np.r_[0.93491636, 0.89776803, 0.87922239,
                                    0.85894181])
    assert_allclose(lcb2[ix], np.r_[0.52115708, 0.48079378, 0.45595321,
                                    0.43341115])
    assert_allclose(ucb2[ix], np.r_[0.96465636, 0.92745068, 0.90885428,
                                    0.88796708])
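# Context for the transforms above: simultaneous_cb returns bands intended
# to cover the whole survival curve jointly (cf. Klein and Moeschberger,
# ch. 4), so they are wider than pointwise intervals; "log" and "arcsin"
# name the transformation applied to the survival probabilities before the
# band is constructed.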
def test_survfunc2():
    # Test where some times have no events.
    sr = SurvfuncRight(ti2, st2)
    assert_allclose(sr.surv_prob, surv_prob2, atol=1e-5, rtol=1e-5)
    assert_allclose(sr.surv_prob_se, surv_prob_se2, atol=1e-5, rtol=1e-5)
    assert_allclose(sr.surv_times, times2)
    assert_allclose(sr.n_risk, n_risk2)
    assert_allclose(sr.n_events, n_events2)
def test_survfunc1():
    # Test where all times have at least one event.
    sr = SurvfuncRight(ti1, st1)
    assert_allclose(sr.surv_prob, surv_prob1, atol=1e-5, rtol=1e-5)
    assert_allclose(sr.surv_prob_se, surv_prob_se1, atol=1e-5, rtol=1e-5)
    assert_allclose(sr.surv_times, times1)
    assert_allclose(sr.n_risk, n_risk1)
    assert_allclose(sr.n_events, n_events1)
def test_plot_km():

    if pdf_output:
        from matplotlib.backends.backend_pdf import PdfPages
        pdf = PdfPages("test_survfunc.pdf")
    else:
        pdf = None

    sr1 = SurvfuncRight(ti1, st1)
    sr2 = SurvfuncRight(ti2, st2)

    fig = plot_survfunc(sr1)
    close_or_save(pdf, fig)

    fig = plot_survfunc(sr2)
    close_or_save(pdf, fig)

    fig = plot_survfunc([sr1, sr2])
    close_or_save(pdf, fig)

    # Plot the SAS BMT data
    gb = bmt.groupby("Group")
    sv = []
    for g in gb:
        s0 = SurvfuncRight(g[1]["T"], g[1]["Status"], title=g[0])
        sv.append(s0)
    fig = plot_survfunc(sv)
    ax = fig.get_axes()[0]
    ax.set_position([0.1, 0.1, 0.64, 0.8])
    ha, lb = ax.get_legend_handles_labels()
    fig.legend([ha[k] for k in (0, 2, 4)],
               [lb[k] for k in (0, 2, 4)],
               'center right')
    close_or_save(pdf, fig)

    # Simultaneous CB for BMT data
    ii = bmt.Group == "ALL"
    sf = SurvfuncRight(bmt.loc[ii, "T"], bmt.loc[ii, "Status"])
    fig = sf.plot()
    ax = fig.get_axes()[0]
    ax.set_position([0.1, 0.1, 0.64, 0.8])
    ha, lb = ax.get_legend_handles_labels()
    lcb, ucb = sf.simultaneous_cb(transform="log")
    plt.fill_between(sf.surv_times, lcb, ucb, color="lightgrey")
    lcb, ucb = sf.simultaneous_cb(transform="arcsin")
    plt.plot(sf.surv_times, lcb, color="darkgrey")
    plt.plot(sf.surv_times, ucb, color="darkgrey")
    plt.plot(sf.surv_times, sf.surv_prob - 2*sf.surv_prob_se, color="red")
    plt.plot(sf.surv_times, sf.surv_prob + 2*sf.surv_prob_se, color="red")
    plt.xlim(100, 600)
    close_or_save(pdf, fig)

    if pdf_output:
        pdf.close()
def test_weights2():
    # tm = c(1, 3, 5, 6, 7, 2, 4, 6, 8, 10)
    # st = c(1, 1, 0, 1, 1, 1, 1, 0, 1, 1)
    # wt = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2)
    # library(survival)
    # sf = survfit(Surv(tm, st) ~ 1, weights=wt, err='tsiatis')
    tm = np.r_[1, 3, 5, 6, 7, 2, 4, 6, 8, 10]
    st = np.r_[1, 1, 0, 1, 1, 1, 1, 0, 1, 1]
    wt = np.r_[1, 1, 1, 1, 1, 2, 2, 2, 2, 2]

    tm0 = np.r_[1, 3, 5, 6, 7, 2, 4, 6, 8, 10, 2, 4, 6, 8, 10]
    st0 = np.r_[1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1]

    sf0 = SurvfuncRight(tm, st, freq_weights=wt)
    sf1 = SurvfuncRight(tm0, st0)

    assert_allclose(sf0.surv_times, sf1.surv_times)
    assert_allclose(sf0.surv_prob, sf1.surv_prob)

    assert_allclose(sf0.surv_prob_se,
                    np.r_[0.06666667, 0.1210311, 0.14694547, 0.19524829,
                          0.23183377, 0.30618115, 0.46770386, 0.84778942])
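# A minimal sketch of the same equivalence (a hypothetical companion test,
# not from the original suite): a frequency weight of w should behave
# exactly like repeating that observation w times, which is what the
# tm0/st0 comparison in test_weights2 verifies on a larger example.
def test_weights_equivalence_sketch():
    sf_w = SurvfuncRight(np.r_[2, 4], np.r_[1, 1], freq_weights=np.r_[1, 2])
    sf_r = SurvfuncRight(np.r_[2, 4, 4], np.r_[1, 1, 1])
    assert_allclose(sf_w.surv_times, sf_r.surv_times)
    assert_allclose(sf_w.surv_prob, sf_r.surv_prob)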
def test_kernel_survfunc1():
    # Regression test
    n = 100
    np.random.seed(3434)
    x = np.random.normal(size=(n, 3))
    time = np.random.uniform(size=n)
    status = np.random.randint(0, 2, size=n)

    result = SurvfuncRight(time, status, exog=x)

    timex = np.r_[0.30721103, 0.0515439, 0.69246897, 0.16446079,
                  0.31308528]
    sprob = np.r_[0.98948277, 0.98162275, 0.97129237, 0.96044668,
                  0.95030368]

    assert_allclose(result.time[0:5], timex)
    assert_allclose(result.surv_prob[0:5], sprob)
def test_incidence2():
    # Check that the cumulative incidence functions for all competing
    # risks sum to the complementary survival function.

    np.random.seed(2423)
    n = 200
    time = -np.log(np.random.uniform(size=n))
    status = np.random.randint(0, 3, size=n)
    ii = np.argsort(time)
    time = time[ii]
    status = status[ii]

    ci = CumIncidenceRight(time, status)

    statusa = 1 * (status >= 1)
    sf = SurvfuncRight(time, statusa)

    x = 1 - sf.surv_prob
    y = (ci.cinc[0] + ci.cinc[1])[np.flatnonzero(statusa)]
    assert_allclose(x, y)
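# The identity verified above: with competing risks coded 1..K (0 meaning
# censored), the cause-specific cumulative incidence functions satisfy
# CIF_1(t) + ... + CIF_K(t) = 1 - S(t), where S is the all-cause
# Kaplan-Meier survival function obtained by treating any event as a
# failure (the statusa recoding in the test).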
def test_survfunc_entry_3():
    # times = c(1, 2, 5, 6, 6, 6, 6, 6, 9)
    # status = c(0, 0, 1, 1, 1, 0, 1, 1, 0)
    # entry = c(0, 1, 1, 2, 2, 2, 3, 4, 4)
    # sv = Surv(entry, times, event=status)
    # sdf = survfit(coxph(sv ~ 1), type='kaplan-meier')
    times = np.r_[1, 2, 5, 6, 6, 6, 6, 6, 9]
    status = np.r_[0, 0, 1, 1, 1, 0, 1, 1, 0]
    entry = np.r_[0, 1, 1, 2, 2, 2, 3, 4, 4]

    sf = SurvfuncRight(times, status, entry=entry)

    assert_allclose(sf.n_risk, np.r_[7, 6])
    assert_allclose(sf.surv_times, np.r_[5, 6])
    assert_allclose(sf.surv_prob, np.r_[0.857143, 0.285714], atol=1e-5)
    assert_allclose(sf.surv_prob_se, np.r_[0.13226, 0.170747], atol=1e-5)
def test_weights1():
    # tm = c(1, 3, 5, 6, 7, 8, 8, 9, 3, 4, 1, 3, 2)
    # st = c(1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0)
    # wt = c(1, 2, 3, 2, 3, 1, 2, 1, 1, 2, 2, 3, 1)
    # library(survival)
    # sf = survfit(Surv(tm, st) ~ 1, weights=wt, err='tsiatis')
    tm = np.r_[1, 3, 5, 6, 7, 8, 8, 9, 3, 4, 1, 3, 2]
    st = np.r_[1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0]
    wt = np.r_[1, 2, 3, 2, 3, 1, 2, 1, 1, 2, 2, 3, 1]

    sf = SurvfuncRight(tm, st, freq_weights=wt)

    assert_allclose(sf.surv_times, np.r_[1, 3, 6, 7, 9])
    assert_allclose(sf.surv_prob,
                    np.r_[0.875, 0.65625, 0.51041667, 0.29166667, 0.])
    assert_allclose(sf.surv_prob_se,
                    np.r_[0.07216878, 0.13307266, 0.20591185, 0.3219071,
                          1.05053519])
def test_survfunc_entry_1():
    # times = c(1, 3, 3, 5, 5, 7, 7, 8, 8, 9, 10, 10)
    # status = c(1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1)
    # entry = c(0, 1, 1, 2, 2, 2, 3, 4, 4, 4, 4, 0)
    # sv = Surv(entry, times, event=status)
    # sdf = survfit(coxph(sv ~ 1), type='kaplan-meier')
    times = np.r_[1, 3, 3, 5, 5, 7, 7, 8, 8, 9, 10, 10]
    status = np.r_[1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1]
    entry = np.r_[0, 1, 1, 2, 2, 2, 3, 4, 4, 4, 4, 0]

    sf = SurvfuncRight(times, status, entry=entry)

    assert_allclose(sf.n_risk, np.r_[2, 6, 9, 7, 5, 3, 2])
    assert_allclose(sf.surv_times, np.r_[1, 3, 5, 7, 8, 9, 10])
    assert_allclose(sf.surv_prob, np.r_[
        0.5000, 0.4167, 0.3241, 0.2778, 0.2222, 0.1481, 0.0741], atol=1e-4)
    assert_allclose(sf.surv_prob_se, np.r_[
        0.3536, 0.3043, 0.2436, 0.2132, 0.1776, 0.1330, 0.0846], atol=1e-4)
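# Interpretation of the entry argument used above: entry times encode
# delayed entry (left truncation), so a subject joins the risk set only
# after its entry time has passed.  That is why n_risk starts at 2 here
# (only the two subjects with entry time 0 are at risk at t = 1) rather
# than at the full sample size of 12.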
def cli():
    parser = argparse.ArgumentParser(
        description='GAP - Git Activity Predictor')
    parser.add_argument('paths', metavar='PATH', type=str, nargs='*',
                        default=['.'],
                        help='Paths to one or more git repositories')
    parser.add_argument(
        '--date', type=lambda d: dateutil.parser.parse(d).date(),
        required=False, default=datetime.date.today(),
        help='Date used for predictions (defaults to current date)')
    parser.add_argument('--obs', type=int, required=False, default=20,
                        help='Number of observations to consider')
    parser.add_argument('--probs', metavar='PROB', type=float, nargs='*',
                        required=False, default=[0.5, 0.6, 0.7, 0.8, 0.9],
                        help='Probabilities to output, strictly in [0,1].')
    parser.add_argument(
        '--limit', type=int, required=False, default=30,
        help='Limit contributors to the ones that were active at least '
             'once during the last x days (default 30)')
    parser.add_argument(
        '--mapping', type=str, nargs='?',
        help='Mapping file to merge identities. This file must be a csv '
             'file where each line contains two values: the name to be '
             'merged, and the corresponding identity. Use "IGNORE" as '
             'identity to ignore specific names.')
    parser.add_argument('--branches', metavar='BRANCH', type=str, nargs='*',
                        default=list(),
                        help='Git branches to analyse (defaults to all).')
    parser.add_argument(
        '--as-dates', dest='as_dates', action='store_true',
        help='Express predictions using dates instead of time differences '
             'in days')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--text', action='store_true',
                       help='Print results as text.')
    group.add_argument('--csv', action='store_true',
                       help='Print results as csv.')
    group.add_argument('--json', action='store_true',
                       help='Print results as json.')
    group.add_argument(
        '--plot', nargs='?', const=True,
        help='Export results to a plot. Filepath can be optionally '
             'specified.')

    args = parser.parse_args()

    # Default plot location
    if args.plot is True:
        args.plot = str(args.date) + '.pdf'

    # Default to text if no other option is provided
    if not args.csv and not args.json and not args.plot:
        args.text = True

    # Identity mapping
    if args.mapping:
        d = pandas.read_csv(args.mapping, names=['source', 'target'])
        mapping = {r.source: r.target for r in d.itertuples()}
    else:
        mapping = {}

    raw_data = dict()  # author -> dates of activity

    # Get data from git
    for path in args.paths:
        try:
            repo = git.Repo(path)
        except Exception as e:  # Must be refined
            print('Unable to access repository {} ({}:{})'.format(
                path, e.__class__.__name__, e))
            sys.exit()

        # Default branches
        if len(args.branches) == 0:
            commits = repo.iter_commits('--all')
        else:
            commits = repo.iter_commits(' '.join(args.branches))

        for commit in commits:
            try:
                author = commit.author.name
                identity = mapping.get(author, author)
                if author.lower() != 'ignore' and identity.lower() == 'ignore':
                    continue
                date = datetime.date.fromtimestamp(commit.authored_date)
                raw_data.setdefault(identity, []).append(date)
            except Exception as e:
                print('Unable to read commit ({}: {}): {}'.format(
                    e.__class__.__name__, e, commit))

    # Compute durations and apply model
    data = []  # (author, past activities, predicted durations)
    for author, commits in raw_data.items():
        commits = sorted([e for e in commits if e <= args.date])
        durations = dates_to_duration(commits, window_size=args.obs)

        if len(durations) >= args.obs:
            # Currently implemented with no censoring
            surv = SurvfuncRight(durations, [1] * len(durations))
            predictions = [surv.quantile(p) for p in args.probs]

            last_day = commits[-1]
            if last_day >= args.date - datetime.timedelta(args.limit):
                data.append((author, commits, predictions))

    # Prepare dataframe
    df = pandas.DataFrame(index=set([a for a, c, p in data]),
                          columns=['last'] + args.probs)
    if len(df) == 0:
        print('No author has {} observations and was active at least once '
              'during the last {} days'.format(args.obs, args.limit))
        sys.exit()

    df.index.name = 'author'

    if not args.plot:
        for author, commits, predictions in data:
            last = commits[-1]
            if args.as_dates:
                df.at[author, 'last'] = last
            else:
                df.at[author, 'last'] = (last - args.date).days

            for prob, p in zip(args.probs, predictions):
                if args.as_dates:
                    df.at[author, prob] = last + datetime.timedelta(days=int(p))
                else:
                    df.at[author, prob] = (
                        last + datetime.timedelta(days=int(p)) - args.date).days

        df = df.sort_values(['last'] + args.probs,
                            ascending=[False] + [True] * len(args.probs))
        df = df.astype(str)

        if args.text:
            pandas.set_option('expand_frame_repr', False)
            pandas.set_option('display.max_columns', 999)
            print(df)
        elif args.csv:
            print(df.to_csv())
        elif args.json:
            print(df.to_json(orient='index'))
    else:
        # Because of plotnine's way of initializing matplotlib
        import warnings
        warnings.filterwarnings("ignore")

        VIEW_LIMIT = 28

        # List of (author, day) where day is a delta w.r.t. the given date
        activities = []
        # List of (author, from_day, to_day, p) where probability p applies
        # between from_day and to_day (deltas w.r.t. the given date)
        forecasts = []

        for author, commits, predictions in data:
            last = (commits[-1] - args.date).days
            for e in commits:
                activities.append((author, (e - args.date).days))

            previous = previous_previous = 0
            for d, p in zip(predictions, args.probs):
                if d > previous:
                    forecasts.append((author, last + previous, last + d, p))
                    previous_previous = previous
                    previous = d
                else:
                    forecasts.append(
                        (author, last + previous_previous, last + d, p))

        activities = pandas.DataFrame(columns=['author', 'day'],
                                      data=activities)
        forecasts = pandas.DataFrame(columns=['author', 'fromd', 'tod', 'p'],
                                     data=forecasts)

        plot = (
            p9.ggplot(p9.aes(y='author'))
            + p9.geom_segment(
                p9.aes('day - 0.5', 'author', xend='day + 0.5',
                       yend='author'),
                data=activities, size=4, color='orange')
            + p9.geom_segment(
                p9.aes('fromd + 0.5', 'author', xend='tod + 0.5',
                       yend='author', alpha='factor(p)'),
                data=forecasts.sort_values('p').drop_duplicates(
                    ['author', 'fromd', 'tod'], keep='last'),
                size=4, color='steelblue')
            + p9.geom_vline(xintercept=0, color='r', alpha=0.5,
                            linetype='dashed')
            + p9.scale_x_continuous(
                name=' << past days {:^20} future days >>'.format(
                    str(args.date)),
                breaks=range(-VIEW_LIMIT // 7 * 7,
                             (VIEW_LIMIT // 7 * 7) + 1, 7),
                minor_breaks=6)
            + p9.scale_y_discrete(
                name='',
                limits=activities.sort_values(
                    'day', ascending=False)['author'].unique())
            + p9.scale_alpha_discrete(range=(0.2, 1), name=' ')
            + p9.coord_cartesian(xlim=(-VIEW_LIMIT, VIEW_LIMIT))
            + p9.theme_matplotlib()
            + p9.theme(figure_size=(6, 4 * activities['author'].nunique() / 15))
        )

        fig = plot.draw()
        fig.savefig(args.plot, bbox_inches='tight')
        print('Plot exported to {}'.format(args.plot))
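# Example invocation (the script name gap.py is hypothetical; all flags are
# defined in cli() above):
#
#     python gap.py path/to/repo --obs 20 --probs 0.5 0.9 --limit 30 --text
#
# This prints, for each author active during the last 30 days with at least
# 20 observed inter-activity durations, the day of their last activity and
# the predicted delay (in days, relative to --date) until their next
# activity at probability levels 0.5 and 0.9.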