# Shared imports for the statsmodels survival-function test examples
# below.  The module-level fixtures the tests reference (bmt, ti1/st1,
# ti2/st2, surv_prob1/2, surv_prob_se1/2, times1/2, n_risk1/2,
# n_events1/2, pdf_output, close_or_save) are defined elsewhere in
# statsmodels' tests/test_survfunc.py and are not reproduced here.
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy.testing import assert_allclose

from statsmodels.duration.survfunc import (
    CumIncidenceRight, SurvfuncRight, plot_survfunc)


def test_bmt():
    # All tests against SAS
    # Results taken from here:
    # http://support.sas.com/documentation/cdl/en/statug/68162/HTML/default/viewer.htm#statug_lifetest_details03.htm

    # Confidence intervals for the 25th percentile of the survival
    # distribution (for "ALL" subjects), taken from the SAS web site
    cb = {"linear": [107, 276],
          "cloglog": [86, 230],
          "log": [107, 332],
          "asinsqrt": [104, 276],
          "logit": [104, 230]}

    dfa = bmt[bmt.Group == "ALL"]

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    fp = os.path.join(cur_dir, 'results', 'bmt_results.csv')
    rslt = pd.read_csv(fp)

    sf = SurvfuncRight(dfa["T"].values, dfa.Status.values)

    assert_allclose(sf.surv_times, rslt.t)
    assert_allclose(sf.surv_prob, rslt.s, atol=1e-4, rtol=1e-4)
    assert_allclose(sf.surv_prob_se, rslt.se, atol=1e-4, rtol=1e-4)

    for method in "linear", "cloglog", "log", "logit", "asinsqrt":
        lcb, ucb = sf.quantile_ci(0.25, method=method)
        assert_allclose(cb[method], np.r_[lcb, ucb])
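

# A minimal standalone sketch (illustration only, not part of the
# statsmodels test suite): fit a Kaplan-Meier curve on synthetic
# right-censored data and read off the median with a 95% CI.  The data
# and function name here are made up for illustration.
def _sketch_km_quantile():
    rng = np.random.default_rng(0)
    time = rng.exponential(scale=5.0, size=50)
    status = rng.integers(0, 2, size=50)  # 1 = event observed, 0 = censored

    sf = SurvfuncRight(time, status)
    median = sf.quantile(0.5)
    # quantile_ci may return nan bounds when a limit is not estimable
    lcb, ucb = sf.quantile_ci(0.5, alpha=0.05, method="cloglog")
    return median, lcb, ucb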
def test_kernel_survfunc3():
    # Smoke test: cases with tied times (scalar and vector bw_factor)

    n = 100
    np.random.seed(3434)
    x = np.random.normal(size=(n, 3))
    time = np.random.randint(0, 10, size=n)
    status = np.random.randint(0, 2, size=n)
    SurvfuncRight(time, status, exog=x, bw_factor=10000)
    SurvfuncRight(time, status, exog=x, bw_factor=np.r_[10000, 10000])
def test_survfunc_entry_2():
    # entry = 0 is equivalent to no entry time

    times = np.r_[1, 3, 3, 5, 5, 7, 7, 8, 8, 9, 10, 10]
    status = np.r_[1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1]
    entry = np.r_[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

    sf = SurvfuncRight(times, status, entry=entry)
    sf0 = SurvfuncRight(times, status)

    assert_allclose(sf.n_risk, sf0.n_risk)
    assert_allclose(sf.surv_times, sf0.surv_times)
    assert_allclose(sf.surv_prob, sf0.surv_prob)
    assert_allclose(sf.surv_prob_se, sf0.surv_prob_se)
def test_kernel_survfunc2():
    # Check that when bandwidth is very large, the kernel procedure
    # agrees with standard KM. (Note: the results do not agree
    # perfectly when there are tied times).

    n = 100
    np.random.seed(3434)
    x = np.random.normal(size=(n, 3))
    time = np.random.uniform(0, 10, size=n)
    status = np.random.randint(0, 2, size=n)

    resultkm = SurvfuncRight(time, status)
    result = SurvfuncRight(time, status, exog=x, bw_factor=10000)

    assert_allclose(resultkm.surv_times, result.surv_times)
    assert_allclose(resultkm.surv_prob, result.surv_prob, rtol=1e-6, atol=1e-6)
def test_survfunc2():
    # Test where some times have no events.

    sr = SurvfuncRight(ti2, st2)
    assert_allclose(sr.surv_prob, surv_prob2, atol=1e-5, rtol=1e-5)
    assert_allclose(sr.surv_prob_se, surv_prob_se2, atol=1e-5, rtol=1e-5)
    assert_allclose(sr.surv_times, times2)
    assert_allclose(sr.n_risk, n_risk2)
    assert_allclose(sr.n_events, n_events2)
def test_survfunc1():
    # Test where all times have at least 1 event.

    sr = SurvfuncRight(ti1, st1)
    assert_allclose(sr.surv_prob, surv_prob1, atol=1e-5, rtol=1e-5)
    assert_allclose(sr.surv_prob_se, surv_prob_se1, atol=1e-5, rtol=1e-5)
    assert_allclose(sr.surv_times, times1)
    assert_allclose(sr.n_risk, n_risk1)
    assert_allclose(sr.n_events, n_events1)
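

# Sketch: the quantities checked above satisfy the product-limit identity
# S(t) = prod_{t_i <= t} (1 - d_i / n_i), with d_i = n_events and
# n_i = n_risk at each event time.  A hand-rolled check on toy data
# (the ti1/st1 fixtures above are defined elsewhere).
def _sketch_product_limit_identity():
    time = np.r_[2.0, 3, 3, 5, 7, 8, 8, 9]
    status = np.r_[1, 1, 0, 1, 0, 1, 1, 1]

    sf = SurvfuncRight(time, status)
    # n_events and n_risk are aligned with surv_times
    surv = np.cumprod(1 - sf.n_events / sf.n_risk)
    assert_allclose(surv, sf.surv_prob)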
def test_plot_km():

    if pdf_output:
        from matplotlib.backends.backend_pdf import PdfPages
        pdf = PdfPages("test_survfunc.pdf")
    else:
        pdf = None

    sr1 = SurvfuncRight(ti1, st1)
    sr2 = SurvfuncRight(ti2, st2)

    fig = plot_survfunc(sr1)
    close_or_save(pdf, fig)

    fig = plot_survfunc(sr2)
    close_or_save(pdf, fig)

    fig = plot_survfunc([sr1, sr2])
    close_or_save(pdf, fig)

    # Plot the SAS BMT data
    gb = bmt.groupby("Group")
    sv = []
    for g in gb:
        s0 = SurvfuncRight(g[1]["T"], g[1]["Status"], title=g[0])
        sv.append(s0)
    fig = plot_survfunc(sv)
    ax = fig.get_axes()[0]
    ax.set_position([0.1, 0.1, 0.64, 0.8])
    ha, lb = ax.get_legend_handles_labels()
    fig.legend([ha[k] for k in (0, 2, 4)],
               [lb[k] for k in (0, 2, 4)],
               'center right')
    close_or_save(pdf, fig)

    # Simultaneous CB for BMT data
    ii = bmt.Group == "ALL"
    sf = SurvfuncRight(bmt.loc[ii, "T"], bmt.loc[ii, "Status"])
    fig = sf.plot()
    ax = fig.get_axes()[0]
    ax.set_position([0.1, 0.1, 0.64, 0.8])
    ha, lb = ax.get_legend_handles_labels()
    lcb, ucb = sf.simultaneous_cb(transform="log")
    plt.fill_between(sf.surv_times, lcb, ucb, color="lightgrey")
    lcb, ucb = sf.simultaneous_cb(transform="arcsin")
    plt.plot(sf.surv_times, lcb, color="darkgrey")
    plt.plot(sf.surv_times, ucb, color="darkgrey")
    plt.plot(sf.surv_times, sf.surv_prob - 2*sf.surv_prob_se, color="red")
    plt.plot(sf.surv_times, sf.surv_prob + 2*sf.surv_prob_se, color="red")
    plt.xlim(100, 600)
    close_or_save(pdf, fig)

    if pdf_output:
        pdf.close()
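

# Minimal plotting sketch outside the test harness (toy data; assumes a
# non-interactive matplotlib backend when run headless).
def _sketch_plot_km():
    sf = SurvfuncRight(np.r_[1.0, 2, 3, 4, 5], np.r_[1, 0, 1, 1, 0],
                       title="toy")
    fig = plot_survfunc(sf)
    fig.savefig("km_sketch.png")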
def test_simultaneous_cb():

    # The exact numbers here are regression tests, but they are close
    # to page 103 of Klein and Moeschberger.

    df = bmt.loc[bmt["Group"] == "ALL", :]
    sf = SurvfuncRight(df["T"], df["Status"])
    lcb1, ucb1 = sf.simultaneous_cb(transform="log")
    lcb2, ucb2 = sf.simultaneous_cb(transform="arcsin")

    ti = sf.surv_times.tolist()
    ix = [ti.index(x) for x in (110, 122, 129, 172)]
    assert_allclose(lcb1[ix], np.r_[0.43590582, 0.42115592, 0.4035897,
                                    0.38785927])
    assert_allclose(ucb1[ix], np.r_[0.93491636, 0.89776803, 0.87922239,
                                    0.85894181])

    assert_allclose(lcb2[ix], np.r_[0.52115708, 0.48079378, 0.45595321,
                                    0.43341115])
    assert_allclose(ucb2[ix], np.r_[0.96465636, 0.92745068, 0.90885428,
                                    0.88796708])
def test_weights2():
    # tm = c(1, 3, 5, 6, 7, 2, 4, 6, 8, 10)
    # st = c(1, 1, 0, 1, 1, 1, 1, 0, 1, 1)
    # wt = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2)
    # library(survival)
    # sf = survfit(Surv(tm, st) ~ 1, weights=wt, err='tsiatis')

    tm = np.r_[1, 3, 5, 6, 7, 2, 4, 6, 8, 10]
    st = np.r_[1, 1, 0, 1, 1, 1, 1, 0, 1, 1]
    wt = np.r_[1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
    tm0 = np.r_[1, 3, 5, 6, 7, 2, 4, 6, 8, 10, 2, 4, 6, 8, 10]
    st0 = np.r_[1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1]

    sf0 = SurvfuncRight(tm, st, freq_weights=wt)
    sf1 = SurvfuncRight(tm0, st0)

    assert_allclose(sf0.surv_times, sf1.surv_times)
    assert_allclose(sf0.surv_prob, sf1.surv_prob)

    assert_allclose(
        sf0.surv_prob_se,
        np.r_[0.06666667, 0.1210311, 0.14694547, 0.19524829, 0.23183377,
              0.30618115, 0.46770386, 0.84778942])
def test_kernel_survfunc1():
    # Regression test
    n = 100
    np.random.seed(3434)
    x = np.random.normal(size=(n, 3))
    time = np.random.uniform(size=n)
    status = np.random.randint(0, 2, size=n)

    result = SurvfuncRight(time, status, exog=x)

    timex = np.r_[0.30721103, 0.0515439, 0.69246897, 0.16446079, 0.31308528]
    sprob = np.r_[0.98948277, 0.98162275, 0.97129237, 0.96044668, 0.95030368]

    assert_allclose(result.time[0:5], timex)
    assert_allclose(result.surv_prob[0:5], sprob)
def test_incidence2():
    # Check that the cumulative incidence functions for all competing
    # risks sum to the complementary survival function.

    np.random.seed(2423)
    n = 200
    time = -np.log(np.random.uniform(size=n))
    status = np.random.randint(0, 3, size=n)
    ii = np.argsort(time)
    time = time[ii]
    status = status[ii]
    ci = CumIncidenceRight(time, status)
    statusa = 1 * (status >= 1)
    sf = SurvfuncRight(time, statusa)
    x = 1 - sf.surv_prob
    y = (ci.cinc[0] + ci.cinc[1])[np.flatnonzero(statusa)]
    assert_allclose(x, y)
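

# Sketch: CumIncidenceRight on a toy competing-risks sample.  Status 0 is
# censoring; 1 and 2 are competing event types.  cinc[k] holds the
# cumulative incidence curve for cause k+1, evaluated at ci.times.
def _sketch_cum_incidence():
    time = np.r_[1.0, 2, 3, 4, 5, 6]
    status = np.r_[1, 2, 0, 1, 2, 1]
    ci = CumIncidenceRight(time, status)
    return ci.times, ci.cinc[0], ci.cinc[1]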
def test_survfunc_entry_3():
    # times = c(1, 2, 5, 6, 6, 6, 6, 6, 9)
    # status = c(0, 0, 1, 1, 1, 0, 1, 1, 0)
    # entry = c(0, 1, 1, 2, 2, 2, 3, 4, 4)
    # sv = Surv(entry, times, event=status)
    # sdf = survfit(coxph(sv ~ 1), type='kaplan-meier')

    times = np.r_[1, 2, 5, 6, 6, 6, 6, 6, 9]
    status = np.r_[0, 0, 1, 1, 1, 0, 1, 1, 0]
    entry = np.r_[0, 1, 1, 2, 2, 2, 3, 4, 4]

    sf = SurvfuncRight(times, status, entry=entry)

    assert_allclose(sf.n_risk, np.r_[7, 6])
    assert_allclose(sf.surv_times, np.r_[5, 6])
    assert_allclose(sf.surv_prob, np.r_[0.857143, 0.285714], atol=1e-5)
    assert_allclose(sf.surv_prob_se, np.r_[0.13226, 0.170747], atol=1e-5)
def test_weights1():
    # tm = c(1, 3, 5, 6, 7, 8, 8, 9, 3, 4, 1, 3, 2)
    # st = c(1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0)
    # wt = c(1, 2, 3, 2, 3, 1, 2, 1, 1, 2, 2, 3, 1)
    # library(survival)
    # sf = survfit(Surv(tm, st) ~ 1, weights=wt, err='tsiatis')

    tm = np.r_[1, 3, 5, 6, 7, 8, 8, 9, 3, 4, 1, 3, 2]
    st = np.r_[1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0]
    wt = np.r_[1, 2, 3, 2, 3, 1, 2, 1, 1, 2, 2, 3, 1]

    sf = SurvfuncRight(tm, st, freq_weights=wt)
    assert_allclose(sf.surv_times, np.r_[1, 3, 6, 7, 9])
    assert_allclose(sf.surv_prob, np.r_[0.875, 0.65625, 0.51041667, 0.29166667,
                                        0.])
    assert_allclose(
        sf.surv_prob_se, np.r_[0.07216878, 0.13307266, 0.20591185, 0.3219071,
                               1.05053519])
def test_survfunc_entry_1():
    # times = c(1, 3, 3, 5, 5, 7, 7, 8, 8, 9, 10, 10)
    # status = c(1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1)
    # entry = c(0, 1, 1, 2, 2, 2, 3, 4, 4, 4, 4, 0)
    # sv = Surv(entry, times, event=status)
    # sdf = survfit(coxph(sv ~ 1), type='kaplan-meier')

    times = np.r_[1, 3, 3, 5, 5, 7, 7, 8, 8, 9, 10, 10]
    status = np.r_[1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1]
    entry = np.r_[0, 1, 1, 2, 2, 2, 3, 4, 4, 4, 4, 0]

    sf = SurvfuncRight(times, status, entry=entry)

    assert_allclose(sf.n_risk, np.r_[2, 6, 9, 7, 5, 3, 2])
    assert_allclose(sf.surv_times, np.r_[1, 3, 5, 7, 8, 9, 10])
    assert_allclose(sf.surv_prob, np.r_[
        0.5000, 0.4167, 0.3241, 0.2778, 0.2222, 0.1481, 0.0741],
        atol=1e-4)
    assert_allclose(sf.surv_prob_se, np.r_[
        0.3536, 0.3043, 0.2436, 0.2132, 0.1776, 0.1330, 0.0846],
        atol=1e-4)
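

# Sketch of left truncation: with nonzero entry times a subject joins the
# risk set only after its entry time, so early risk sets shrink relative
# to entry=0 (toy data, illustration only).
def _sketch_entry_times():
    times = np.r_[3.0, 4, 6, 8]
    status = np.r_[1, 1, 1, 1]
    entry = np.r_[0.0, 0, 3, 5]

    sf = SurvfuncRight(times, status, entry=entry)
    return sf.surv_times, sf.n_risk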

# Imports for the GAP (Git Activity Predictor) CLI below.  GitPython,
# python-dateutil, pandas and plotnine are external dependencies;
# dates_to_duration is GAP's own helper (assumption: the exact import
# path inside the gap package may differ).
import argparse
import datetime
import sys

import dateutil.parser
import git
import pandas
import plotnine as p9

from gap import dates_to_duration  # assumption: helper location


def cli():
    parser = argparse.ArgumentParser(
        description='GAP - Git Activity Predictor')
    parser.add_argument('paths',
                        metavar='PATH',
                        type=str,
                        nargs='*',
                        default=['.'],
                        help='Paths to one or more git repositories')
    parser.add_argument(
        '--date',
        type=lambda d: dateutil.parser.parse(d).date(),
        required=False,
        default=datetime.date.today(),
        help='Date used for predictions (defaults to the current date)')
    parser.add_argument('--obs',
                        type=int,
                        required=False,
                        default=20,
                        help='Number of observations to consider')
    parser.add_argument('--probs',
                        metavar='PROB',
                        type=float,
                        nargs='*',
                        required=False,
                        default=[0.5, 0.6, 0.7, 0.8, 0.9],
                        help='Probabilities to output, strictly between '
                             '0 and 1.')
    parser.add_argument(
        '--limit',
        type=int,
        required=False,
        default=30,
        help='Limit contributors to those who were active at least once '
             'during the last x days (default: 30)'
    )
    parser.add_argument(
        '--mapping',
        type=str,
        nargs='?',
        help='Mapping file to merge identities. This file must be a csv '
             'file where each line contains two values: the name to be '
             'merged, and the corresponding identity. Use "IGNORE" as '
             'identity to ignore specific names.'
    )
    parser.add_argument('--branches',
                        metavar='BRANCH',
                        type=str,
                        nargs='*',
                        default=list(),
                        help='Git branches to analyse (default to all).')
    parser.add_argument(
        '--as-dates',
        dest='as_dates',
        action='store_true',
        help='Express predictions using dates instead of '
             'time differences in days')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--text',
                       action='store_true',
                       help='Print results as text.')
    group.add_argument('--csv',
                       action='store_true',
                       help='Print results as csv.')
    group.add_argument('--json',
                       action='store_true',
                       help='Print results as json.')
    group.add_argument(
        '--plot',
        nargs='?',
        const=True,
        help='Export results to a plot. Filepath can be optionally '
             'specified.')

    args = parser.parse_args()

    # Default plot location
    if args.plot is True:
        args.plot = str(args.date) + '.pdf'

    # Default to text if no other option is provided
    if not args.csv and not args.json and not args.plot:
        args.text = True

    # Identity mapping
    if args.mapping:
        d = pandas.read_csv(args.mapping, names=['source', 'target'])
        mapping = {r.source: r.target for r in d.itertuples()}
    else:
        mapping = {}

    raw_data = dict()  # author -> dates of activity

    # Get data from git
    for path in args.paths:
        try:
            repo = git.Repo(path)
        except Exception as e:  # Must be refined
            print('Unable to access repository {} ({}:{})'.format(
                path, e.__class__.__name__, e))
            sys.exit()

        # Default branches
        if len(args.branches) == 0:
            commits = repo.iter_commits('--all')
        else:
            commits = repo.iter_commits(' '.join(args.branches))

        for commit in commits:
            try:
                author = commit.author.name
                identity = mapping.get(author, author)
                if author.lower() != 'ignore' and identity.lower() == 'ignore':
                    continue

                date = datetime.date.fromtimestamp(commit.authored_date)
                raw_data.setdefault(identity, []).append(date)
            except Exception as e:
                print('Unable to read commit ({}: {}): {}'.format(
                    e.__class__.__name__, e, commit))

    # Compute durations and apply model
    data = []  # (author, past activities, predicted durations)

    for author, commits in raw_data.items():
        commits = sorted([e for e in commits if e <= args.date])
        durations = dates_to_duration(commits, window_size=args.obs)

        if len(durations) >= args.obs:
            # Currently implemented with no censoring
            surv = SurvfuncRight(durations, [1] * len(durations))
            predictions = [surv.quantile(p) for p in args.probs]
            last_day = commits[-1]

            if last_day >= args.date - datetime.timedelta(args.limit):
                data.append((
                    author,
                    commits,
                    predictions,
                ))

    # Prepare dataframe
    df = pandas.DataFrame(index=set([a for a, c, p in data]),
                          columns=['last'] + args.probs)
    if len(df) == 0:
        print(
            'No author has {} observations and was active at least once during the last {} days'
            .format(args.obs, args.limit))
        sys.exit()

    df.index.name = 'author'

    if not args.plot:
        for author, commits, predictions in data:
            last = commits[-1]
            if args.as_dates:
                df.at[author, 'last'] = last
            else:
                df.at[author, 'last'] = (last - args.date).days

            for prob, p in zip(args.probs, predictions):
                if args.as_dates:
                    df.at[author,
                          prob] = last + datetime.timedelta(days=int(p))
                else:
                    df.at[author,
                          prob] = (last + datetime.timedelta(days=int(p)) -
                                   args.date).days

        df = df.sort_values(['last'] + args.probs,
                            ascending=[False] + [True] * len(args.probs))
        df = df.astype(str)

        if args.text:
            pandas.set_option('expand_frame_repr', False)
            pandas.set_option('display.max_columns', 999)
            print(df)
        elif args.csv:
            print(df.to_csv())
        elif args.json:
            print(df.to_json(orient='index'))
    else:
        # Because of plotnine's way of initializing matplotlib
        import warnings
        warnings.filterwarnings("ignore")

        VIEW_LIMIT = 28

        # List of (author, day) where day is a delta w.r.t. the given date
        activities = []
        # List of (author, from_day, to_day, p) where probability p applies
        # between from_day and to_day (deltas w.r.t. the given date)
        forecasts = []

        for author, commits, predictions in data:
            last = (commits[-1] - args.date).days
            for e in commits:
                activities.append((author, (e - args.date).days))

            previous = previous_previous = 0
            for d, p in zip(predictions, args.probs):
                if d > previous:
                    forecasts.append((author, last + previous, last + d, p))
                    previous_previous = previous
                    previous = d
                else:
                    forecasts.append(
                        (author, last + previous_previous, last + d, p))

        activities = pandas.DataFrame(columns=['author', 'day'],
                                      data=activities)
        forecasts = pandas.DataFrame(columns=['author', 'fromd', 'tod', 'p'],
                                     data=forecasts)

        plot = (
            p9.ggplot(p9.aes(y='author'))
            + p9.geom_segment(
                p9.aes('day - 0.5', 'author', xend='day + 0.5',
                       yend='author'),
                data=activities,
                size=4,
                color='orange')
            + p9.geom_segment(
                p9.aes('fromd + 0.5', 'author', xend='tod + 0.5',
                       yend='author', alpha='factor(p)'),
                data=forecasts.sort_values('p').drop_duplicates(
                    ['author', 'fromd', 'tod'], keep='last'),
                size=4,
                color='steelblue')
            + p9.geom_vline(xintercept=0, color='r', alpha=0.5,
                            linetype='dashed')
            + p9.scale_x_continuous(
                name='  <<  past days {:^20} future days  >>'.format(
                    str(args.date)),
                breaks=range(-VIEW_LIMIT // 7 * 7,
                             (VIEW_LIMIT // 7 * 7) + 1, 7),
                minor_breaks=6)
            + p9.scale_y_discrete(
                name='',
                limits=activities.sort_values(
                    'day', ascending=False)['author'].unique())
            + p9.scale_alpha_discrete(range=(0.2, 1), name=' ')
            + p9.coord_cartesian(xlim=(-VIEW_LIMIT, VIEW_LIMIT))
            + p9.theme_matplotlib()
            + p9.theme(figure_size=(6,
                                    4 * activities['author'].nunique() / 15))
        )

        fig = plot.draw()
        fig.savefig(args.plot, bbox_inches='tight')
        print('Plot exported to {}'.format(args.plot))
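

# Hypothetical invocation sketch: GAP is normally run from a shell, so
# the direct call below patches sys.argv first.  The flags mirror the
# argparse definitions above; the 'gap' program name is illustrative.
if __name__ == '__main__':
    sys.argv = ['gap', '.', '--obs', '20', '--probs', '0.5', '0.9', '--csv']
    cli()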