Code example #1
File: test_utils.py  Project: bashtage/statsmodels
def test_get_rdataset():
    test_url = "https://raw.githubusercontent.com/vincentarelbundock/" \
               "Rdatasets/master/csv/datasets/cars.csv"
    internet_available = check_internet(test_url)
    if not internet_available:
        pytest.skip('Unable to retrieve file - skipping test')
    try:
        duncan = get_rdataset("Duncan", "carData", cache=cur_dir)
    except (HTTPError, URLError, SSLError, timeout):
        pytest.skip('Failed with HTTPError or URLError, these are random')
    assert_(isinstance(duncan, utils.Dataset))
    duncan = get_rdataset("Duncan", "carData", cache=cur_dir)
    assert_(duncan.from_cache)
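The snippet above relies on module-level names (cur_dir, check_internet, the exception classes) that the listing omits. A minimal import block it assumes might look like the following; the exact layout of the original test module may differ:

import os

import pytest
from numpy.testing import assert_
from ssl import SSLError
from socket import timeout
from urllib.error import HTTPError, URLError

from statsmodels.datasets import get_rdataset, utils
from statsmodels.datasets.utils import check_internet

# Assumption: the cache directory is the directory containing the test file.
cur_dir = os.path.dirname(os.path.abspath(__file__))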
Code example #2
def test_get_rdataset():
    test_url = "https://raw.githubusercontent.com/vincentarelbundock/" \
               "Rdatasets/master/csv/datasets/cars.csv"
    internet_available = check_internet(test_url)
    if not internet_available:
        pytest.skip('Unable to retrieve file - skipping test')
    try:
        duncan = get_rdataset("Duncan", "carData", cache=cur_dir)
    except IGNORED_EXCEPTIONS:
        pytest.skip('Failed with HTTPError or URLError, these are random')
    assert_(isinstance(duncan, utils.Dataset))
    duncan = get_rdataset("Duncan", "carData", cache=cur_dir)
    assert_(duncan.from_cache)
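Example #2 is the same test after the exception tuple was factored out into a module-level constant. A plausible definition of IGNORED_EXCEPTIONS, reconstructed from the except clause in example #1 (an assumption, not verified against the source):

from ssl import SSLError
from socket import timeout
from urllib.error import HTTPError, URLError

# Hypothetical reconstruction: the network errors these tests treat as flaky.
IGNORED_EXCEPTIONS = (HTTPError, URLError, SSLError, timeout)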
Code example #3
File: test_utils.py  Project: bashtage/statsmodels
def test_get_rdataset_write_read_cache():
    # test writing and reading cache
    try:
        guerry = get_rdataset("Guerry", "HistData", cache=cur_dir)
    except (HTTPError, URLError, SSLError, timeout):
        pytest.skip('Failed with HTTPError or URLError, these are random')

    assert_(guerry.from_cache is False)
    guerry2 = get_rdataset("Guerry", "HistData", cache=cur_dir)
    assert_(guerry2.from_cache is True)
    fn = "raw.githubusercontent.com,vincentarelbundock,Rdatasets,master,csv," \
         "HistData,Guerry.csv.zip"
    os.remove(os.path.join(cur_dir, fn))
    fn = "raw.githubusercontent.com,vincentarelbundock,Rdatasets,master,doc," \
         "HistData,rst,Guerry.rst.zip"
    os.remove(os.path.join(cur_dir, fn))
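The cache filenames checked here encode the source URL: the scheme is dropped, path separators become commas, and a .zip suffix is appended. A rough sketch of that convention (illustrative only; statsmodels' actual cache-naming code may differ):

def cache_name(url):
    # Illustrative: strip the scheme, replace '/' with ',', append '.zip'.
    return url.split('//', 1)[1].replace('/', ',') + '.zip'

url = ("https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/"
       "master/csv/HistData/Guerry.csv")
assert cache_name(url) == ("raw.githubusercontent.com,vincentarelbundock,"
                           "Rdatasets,master,csv,HistData,Guerry.csv.zip")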
Code example #4
    def test_results_on_the_quakes_dataset(self):
        """
        R code:
        ------

        > data("quakes")
        > x = quakes[1:50, 1:3]
        > y = quakes[51:100, 1:3]
        > dcov.test(x, y, R=200)

            dCov independence test (permutation test)

        data:  index 1, replicates 200
        nV^2 = 45046, p-value = 0.4577
        sample estimates:
            dCov
        30.01526
        """
        quakes = get_rdataset("quakes").data.values[:, :3]
        x = quakes[:50]
        y = quakes[50:100]

        stats = ddm.distance_statistics(x, y)

        assert_almost_equal(np.round(stats.test_statistic), 45046, 0)
        assert_almost_equal(stats.distance_correlation, 0.1894193, 4)
        assert_almost_equal(stats.distance_covariance, 30.01526, 4)
        assert_almost_equal(stats.dvar_x, 170.1702, 4)
        assert_almost_equal(stats.dvar_y, 147.5545, 4)
        assert_almost_equal(stats.S, 52265, 0)

        test_statistic, _, method = ddm.distance_covariance_test(x, y, B=199)

        assert_almost_equal(np.round(test_statistic), 45046, 0)
        assert method == "emp"
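For reference, the sample distance covariance that dcov.test and distance_statistics report can be computed directly: double-center the pairwise Euclidean distance matrices of x and y, then take the square root of the mean of their elementwise product. A minimal NumPy sketch (illustrative, not the statsmodels implementation):

import numpy as np
from scipy.spatial.distance import cdist

def dcov(x, y):
    # Pairwise Euclidean distance matrices.
    a = cdist(x, x)
    b = cdist(y, y)
    # Double-centering: subtract row and column means, add back the grand mean.
    A = a - a.mean(axis=0) - a.mean(axis=1)[:, None] + a.mean()
    B = b - b.mean(axis=0) - b.mean(axis=1)[:, None] + b.mean()
    # Biased sample distance covariance V_n(x, y).
    return np.sqrt((A * B).mean())

On the 50-observation quakes slices above this should come out near R's 30.01526, and n * dcov(x, y) ** 2 recovers the nV^2 = 45046 statistic.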
Code example #5
    def test_results_on_the_iris_dataset(self):
        """
        R code example from the `energy` package documentation for
        `energy::distance_covariance.test`:

        > x <- iris[1:50, 1:4]
        > y <- iris[51:100, 1:4]
        > set.seed(1)
        > dcov.test(x, y, R=200)

            dCov independence test (permutation test)

        data:  index 1, replicates 200
        nV^2 = 0.5254, p-value = 0.9552
        sample estimates:
             dCov
        0.1025087
        """
        iris = get_rdataset("iris").data.values[:, :4]
        x = iris[:50]
        y = iris[50:100]

        stats = ddm.distance_statistics(x, y)

        assert_almost_equal(stats.test_statistic, 0.5254, 4)
        assert_almost_equal(stats.distance_correlation, 0.3060479, 4)
        assert_almost_equal(stats.distance_covariance, 0.1025087, 4)
        assert_almost_equal(stats.dvar_x, 0.2712927, 4)
        assert_almost_equal(stats.dvar_y, 0.4135274, 4)
        assert_almost_equal(stats.S, 0.667456, 4)

        test_statistic, _, method = ddm.distance_covariance_test(x, y, B=199)

        assert_almost_equal(test_statistic, 0.5254, 4)
        assert method == "emp"
Code example #6
def test_get_rdataset_write_read_cache():
    # test writing and reading cache
    try:
        guerry = get_rdataset("Guerry", "HistData", cache=cur_dir)
    except IGNORED_EXCEPTIONS:
        pytest.skip('Failed with HTTPError or URLError, these are random')

    assert_(guerry.from_cache is False)
    guerry2 = get_rdataset("Guerry", "HistData", cache=cur_dir)
    assert_(guerry2.from_cache is True)
    fn = "raw.githubusercontent.com,vincentarelbundock,Rdatasets,master,csv," \
         "HistData,Guerry.csv.zip"
    os.remove(os.path.join(cur_dir, fn))
    fn = "raw.githubusercontent.com,vincentarelbundock,Rdatasets,master,doc," \
         "HistData,rst,Guerry.rst.zip"
    os.remove(os.path.join(cur_dir, fn))
Code example #7
def pull_data_function():
    print("Initiating data pull...")
    data = dt.get_rdataset("Boston", "MASS").data
    print("Data pull complete...")
    print()
    print("Saving...")
    data.to_csv(path + "/data/raw/input_raw.csv", header=True)
    print("Save complete!")
    return data
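pull_data_function assumes some module-level setup; the dt alias and the path variable are inferred from the body and should be treated as assumptions:

import os
import statsmodels.datasets as dt

# Hypothetical: project root containing a data/raw/ output directory.
path = os.path.dirname(os.path.abspath(__file__))
os.makedirs(path + "/data/raw", exist_ok=True)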
Code example #8
def test_get_rdataset():
    # smoke test
    if not PY3:
        #NOTE: there's no way to test both since the cached files were
        #created with Python 2.x, they're strings, but Python 3 expects
        #bytes and the index file path is hard-coded so both can't live
        #side by side
        duncan = get_rdataset("Duncan", "car", cache=cur_dir)
        assert_(duncan.from_cache)
Code example #9
File: test_utils.py  Project: BranYang/statsmodels
def test_get_rdataset():
    # smoke test
    test_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/datasets/cars.csv"
    internet_available = check_internet(test_url)
    if not internet_available:
        raise SkipTest('Unable to retrieve file - skipping test')
    duncan = get_rdataset("Duncan", "car", cache=cur_dir)
    assert_(isinstance(duncan, utils.Dataset))
    assert_(duncan.from_cache)
Code example #10
def test_get_rdataset():
    # smoke test
    test_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/datasets/cars.csv"
    internet_available = check_internet(test_url)
    if not internet_available:
        raise SkipTest('Unable to retrieve file - skipping test')
    duncan = get_rdataset("Duncan", "car", cache=cur_dir)
    assert_(isinstance(duncan, utils.Dataset))
    assert_(duncan.from_cache)
Code example #11
File: test_utils.py  Project: Bonfils-ebu/statsmodels
def test_get_rdataset():
    # smoke test
    if not PY3:
        #NOTE: there's no way to test both since the cached files were
        #created with Python 2.x, they're strings, but Python 3 expects
        #bytes and the index file path is hard-coded so both can't live
        #side by side
        duncan = get_rdataset("Duncan", "car", cache=cur_dir)
        assert_(duncan.from_cache)
Code example #12
File: test_utils.py  Project: kasunsp/pinalpha_mvp
def test_get_rdataset():
    # smoke test
    test_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/datasets/cars.csv"
    internet_available = check_internet(test_url)
    if not internet_available:
        raise SkipTest('Unable to retrieve file - skipping test')
    duncan = get_rdataset("Duncan", "car", cache=cur_dir)
    assert_(isinstance(duncan, utils.Dataset))
    assert_(duncan.from_cache)

    # test writing and reading cache
    guerry = get_rdataset("Guerry", "HistData", cache=cur_dir)
    assert_(guerry.from_cache is False)
    guerry2 = get_rdataset("Guerry", "HistData", cache=cur_dir)
    assert_(guerry2.from_cache is True)
    fn = "raw.github.com,vincentarelbundock,Rdatasets,master,csv,HistData,Guerry.csv.zip"
    os.remove(os.path.join(cur_dir, fn))
    fn = "raw.github.com,vincentarelbundock,Rdatasets,master,doc,HistData,rst,Guerry.rst.zip"
    os.remove(os.path.join(cur_dir, fn))
Code example #13
File: test_utils.py  Project: bert9bert/statsmodels
def test_get_rdataset():
    # smoke test
    test_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/datasets/cars.csv"
    internet_available = check_internet(test_url)
    if not internet_available:
        raise SkipTest('Unable to retrieve file - skipping test')
    duncan = get_rdataset("Duncan", "car", cache=cur_dir)
    assert_(isinstance(duncan, utils.Dataset))
    if not PY3:
        #NOTE: there's no way to test both since the cached files were
        #created with Python 2.x, they're strings, but Python 3 expects
        #bytes and the index file path is hard-coded so both can't live
        #side by side
        assert_(duncan.from_cache)
Code example #14
def test_get_rdataset():
    # smoke test
    test_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/datasets/cars.csv"
    internet_available = check_internet(test_url)
    if not internet_available:
        raise SkipTest('Unable to retrieve file - skipping test')
    duncan = get_rdataset("Duncan", "car", cache=cur_dir)
    assert_(isinstance(duncan, utils.Dataset))
    if not PY3:
        #NOTE: there's no way to test both since the cached files were
        #created with Python 2.x, they're strings, but Python 3 expects
        #bytes and the index file path is hard-coded so both can't live
        #side by side
        assert_(duncan.from_cache)
Code example #15
    def test_results_on_the_quakes_dataset(self):
        """
        R code:
        ------

        > data("quakes")
        > x = quakes[1:50, 1:3]
        > y = quakes[51:100, 1:3]
        > dcov.test(x, y, R=200)

            dCov independence test (permutation test)

        data:  index 1, replicates 200
        nV^2 = 45046, p-value = 0.4577
        sample estimates:
            dCov
        30.01526
        """
        try:
            quakes = get_rdataset("quakes").data.values[:, :3]
        except IGNORED_EXCEPTIONS:
            pytest.skip('Failed with HTTPError or URLError, these are random')

        x = np.asarray(quakes[:50], dtype=float)
        y = np.asarray(quakes[50:100], dtype=float)

        stats = ddm.distance_statistics(x, y)

        assert_almost_equal(np.round(stats.test_statistic), 45046, 0)
        assert_almost_equal(stats.distance_correlation, 0.1894193, 4)
        assert_almost_equal(stats.distance_covariance, 30.01526, 4)
        assert_almost_equal(stats.dvar_x, 170.1702, 4)
        assert_almost_equal(stats.dvar_y, 147.5545, 4)
        assert_almost_equal(stats.S, 52265, 0)

        test_statistic, _, method = ddm.distance_covariance_test(x, y, B=199)

        assert_almost_equal(np.round(test_statistic), 45046, 0)
        assert method == "emp"
Code example #16
    def test_results_on_the_iris_dataset(self):
        """
        R code example from the `energy` package documentation for
        `energy::distance_covariance.test`:

        > x <- iris[1:50, 1:4]
        > y <- iris[51:100, 1:4]
        > set.seed(1)
        > dcov.test(x, y, R=200)

            dCov independence test (permutation test)

        data:  index 1, replicates 200
        nV^2 = 0.5254, p-value = 0.9552
        sample estimates:
             dCov
        0.1025087
        """
        try:
            iris = get_rdataset("iris").data.values[:, :4]
        except IGNORED_EXCEPTIONS:
            pytest.skip('Failed with HTTPError or URLError, these are random')

        x = np.asarray(iris[:50], dtype=float)
        y = np.asarray(iris[50:100], dtype=float)

        stats = ddm.distance_statistics(x, y)

        assert_almost_equal(stats.test_statistic, 0.5254, 4)
        assert_almost_equal(stats.distance_correlation, 0.3060479, 4)
        assert_almost_equal(stats.distance_covariance, 0.1025087, 4)
        assert_almost_equal(stats.dvar_x, 0.2712927, 4)
        assert_almost_equal(stats.dvar_y, 0.4135274, 4)
        assert_almost_equal(stats.S, 0.667456, 4)

        test_statistic, _, method = ddm.distance_covariance_test(x, y, B=199)

        assert_almost_equal(test_statistic, 0.5254, 4)
        assert method == "emp"
Code example #17
def main():

    #Load mastectomy dataset
    df = datasets.get_rdataset('mastectomy', 'HSAUR', cache=True).data
    #Change event to integer
    df.event = df.event.astype(np.int64)
    #Change metastized to integer (1 for yes, 0 for no)
    df.metastized = (df.metastized == 'yes').astype(np.int64)
    #Count the number of patients
    n_patients = df.shape[0]
    #Create array for each individual patient
    patients = np.arange(n_patients)

    #Censoring - we do not observe the death of every subject; some are still alive at the end of the observation period
    #1 - observation is not censored (death was observed)
    #0 - observation is censored (death was not observed)
    nonCensored = df.event.mean()

    #Create censoring plot
    fig, ax = plt.subplots(figsize=(8, 6))
    blue, _, red = sns.color_palette()[:3]
    #Create horizontal lines for censored observations
    ax.hlines(patients[df.event.values == 0],
              0,
              df[df.event.values == 0].time,
              color=blue,
              label='Censored')
    #Create horizontal red lines for uncensored observations
    ax.hlines(patients[df.event.values == 1],
              0,
              df[df.event.values == 1].time,
              color=red,
              label='Uncensored')
    #Create scatter points for metastized months
    ax.scatter(df[df.metastized.values == 1].time,
               patients[df.metastized.values == 1],
               color='k',
               zorder=10,
               label='Metastized')
    ax.set_xlim(left=0)
    ax.set_xlabel('Months since mastectomy')
    ax.set_yticks([])
    ax.set_ylabel('Subject')
    ax.set_ylim(-0.25, n_patients + 0.25)
    ax.legend(loc='center right')

    #To understand the impact of metastization on survival time, we use a risk regression model
    #Cox proportional hazards model
    #Make intervals 3 months long
    interval_length = 3
    interval_bounds = np.arange(0,
                                df.time.max() + interval_length + 1,
                                interval_length)
    n_intervals = interval_bounds.size - 1
    intervals = np.arange(n_intervals)
    #Check how deaths and censored observations are distributed in intervals
    fig, ax = plt.subplots(figsize=(8, 6))
    #Plot histogram of uncensored events
    ax.hist(df[df.event == 1].time.values,
            bins=interval_bounds,
            color=red,
            alpha=0.5,
            lw=0,
            label='Uncensored')
    #Plot histogram of censored events
    ax.hist(df[df.event == 0].time.values,
            bins=interval_bounds,
            color=blue,
            alpha=0.5,
            lw=0,
            label='Censored')
    ax.set_xlim(0, interval_bounds[-1])
    ax.set_xlabel('Months since mastectomy')
    ax.set_yticks([0, 1, 2, 3])
    ax.set_ylabel('Number of observations')
    ax.legend()

    #Calculates the last interval period when a subject was alive
    last_period = np.floor((df.time - 0.01) / interval_length).astype(int)
    #Creates an empty matrix to store deaths
    death = np.zeros((n_patients, n_intervals))
    #For each patient (row), create an event where the last interval period was observed (column)
    death[patients, last_period] = df.event

    #Create matrix of the amount of time a subject (row) was at risk in an interval (column)
    exposure = np.greater_equal.outer(df.time,
                                      interval_bounds[:-1]) * interval_length
    exposure[patients, last_period] = df.time - interval_bounds[last_period]

    #Define parameters for PyMC
    SEED = 5078864
    n_samples = 1000
    n_tune = 1000

    #Create PyMC model -> lambda(t) = lambda0(t) * e ^ (X*beta)
    with pm.Model() as model:
        #Define prior distribution of hazards as vague Gamma distribution
        lambda0 = pm.Gamma('lambda0', 0.01, 0.01, shape=n_intervals)

        #Define hazard regression coefficients (beta) for covariates X as a normal distribution
        beta = pm.Normal('beta', 0, sd=1000)

        #Create equation for lambda(t) as a deterministic node - record sampled values as part of output
        #T.outer = symbolic matrix, vector-vector outer product
        lambda_ = pm.Deterministic(
            'lambda_', T.outer(T.exp(beta * df.metastized), lambda0))
        #Mu is created from our lambda values (hazard) times patient exposure per interval
        mu = pm.Deterministic('mu', exposure * lambda_)

        #We model the posterior distribution as a Poisson distribution with mean Mu
        obs = pm.Poisson('obs', mu, observed=death)

    with model:
        trace = pm.sample(n_samples, tune=n_tune, random_seed=SEED)

    pm.traceplot(trace)

    #Calculate hazard rate for subjects with metastized cancer (based on regression coefficients)
    hazardRate = np.exp(trace['beta'].mean())
    pm.plot_posterior(trace, varnames=['beta'], color='#87ceeb')
    pm.autocorrplot(trace, varnames=['beta'])

    #Store base hazard as well as metastized hazard for each sample per interval
    #(sample x number of intervals)
    base_hazard = trace['lambda0']
    met_hazard = trace['lambda0'] * np.exp(np.atleast_2d(trace['beta']).T)

    #Calculate cumulative hazard
    def cum_hazard(hazard):
        return (interval_length * hazard).cumsum(axis=-1)

    #Calculate survival as e^(-cumulative hazard)
    def survival(hazard):
        return np.exp(-cum_hazard(hazard))

    #Plot highest posterior density
    def plot_with_hpd(x, hazard, f, ax, color=None, label=None, alpha=0.05):
        #Use function f on hazard mean
        mean = f(hazard.mean(axis=0))
        #Create confidence percentiles
        percentiles = 100 * np.array([alpha / 2., 1. - alpha / 2.])
        hpd = np.percentile(f(hazard), percentiles, axis=0)

        ax.fill_between(x, hpd[0], hpd[1], color=color, alpha=0.25)
        ax.step(x, mean, color=color, label=label)

    #Create figure
    fig, (hazard_ax, surv_ax) = plt.subplots(ncols=2,
                                             sharex=True,
                                             sharey=False,
                                             figsize=(16, 6))
    #Plot hazard with HPD up to the last interval for non-metastized cancer
    plot_with_hpd(interval_bounds[:-1],
                  base_hazard,
                  cum_hazard,
                  hazard_ax,
                  color=blue,
                  label='Had not metastized')
    #Plot hazard with HPD up to the last interval for metastized cancer
    plot_with_hpd(interval_bounds[:-1],
                  met_hazard,
                  cum_hazard,
                  hazard_ax,
                  color=red,
                  label='Metastized')
    hazard_ax.set_xlim(0, df.time.max())
    hazard_ax.set_xlabel('Months since mastectomy')
    hazard_ax.set_ylabel(r'Cumulative hazard $\Lambda(t)$')
    hazard_ax.legend(loc=2)
    #Plot survival with HPD up to the last interval for non-metastized cancer
    plot_with_hpd(interval_bounds[:-1],
                  base_hazard,
                  survival,
                  surv_ax,
                  color=blue)
    #Plot survival with HPD up to the last interval for metastized cancer
    plot_with_hpd(interval_bounds[:-1],
                  met_hazard,
                  survival,
                  surv_ax,
                  color=red)
    surv_ax.set_xlim(0, df.time.max())
    surv_ax.set_xlabel('Months since mastectomy')
    surv_ax.set_ylabel('Survival function $S(t)$')
    fig.suptitle('Bayesian survival model')

    #Consider time varying effects
    with pm.Model() as time_varying_model:
        lambda0 = pm.Gamma('lambda0', 0.01, 0.01, shape=n_intervals)
        #Beta is now modeled as a normal random walk instead of a normal distribution
        #This is due to the fact that the regression coefficients can vary over time
        beta = GaussianRandomWalk('beta', tau=1., shape=n_intervals)

        lambda_ = pm.Deterministic(
            'h', lambda0 * T.exp(T.outer(T.constant(df.metastized), beta)))
        mu = pm.Deterministic('mu', exposure * lambda_)

        obs = pm.Poisson('obs', mu, observed=death)

    with time_varying_model:
        time_varying_trace = pm.sample(n_samples,
                                       tune=n_tune,
                                       random_seed=SEED)

    pm.traceplot(time_varying_trace)
    pm.plot_posterior(time_varying_trace, varnames=['beta'], color='#87ceeb')
    pm.forestplot(time_varying_trace, varnames=['beta'])

    #Create plot to show the mean trace of beta
    fig, ax = plt.subplots(figsize=(8, 6))
    #Create percentiles of the new trace
    beta_hpd = np.percentile(time_varying_trace['beta'], [2.5, 97.5], axis=0)
    beta_low = beta_hpd[0]
    beta_high = beta_hpd[1]
    #Fill percentile interval
    ax.fill_between(interval_bounds[:-1],
                    beta_low,
                    beta_high,
                    color=blue,
                    alpha=0.25)
    #Create the mean estimate for beta from trace samples
    beta_hat = time_varying_trace['beta'].mean(axis=0)
    #Plot a stepwise line for beta_hat per interval
    ax.step(interval_bounds[:-1], beta_hat, color=blue)
    #Plot points where cancer metastized, distinguishing deaths from censored observations
    ax.scatter(interval_bounds[last_period[(df.event.values == 1)
                                           & (df.metastized == 1)]],
               beta_hat[last_period[(df.event.values == 1)
                                    & (df.metastized == 1)]],
               c=red,
               zorder=10,
               label='Died, cancer metastized')
    ax.scatter(interval_bounds[last_period[(df.event.values == 0)
                                           & (df.metastized == 1)]],
               beta_hat[last_period[(df.event.values == 0)
                                    & (df.metastized == 1)]],
               c=blue,
               zorder=10,
               label='Censored, cancer metastized')
    ax.set_xlim(0, df.time.max())
    ax.set_xlabel('Months since mastectomy')
    ax.set_ylabel(r'$\beta_j$')
    ax.legend()

    #Store time-varying model
    tv_base_hazard = time_varying_trace['lambda0']
    tv_met_hazard = time_varying_trace['lambda0'] * np.exp(
        np.atleast_2d(time_varying_trace['beta']))

    #Plot cumulative hazard functions with and without time-varying effect
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.step(interval_bounds[:-1],
            cum_hazard(base_hazard.mean(axis=0)),
            color=blue,
            label='Had not metastized')
    ax.step(interval_bounds[:-1],
            cum_hazard(met_hazard.mean(axis=0)),
            color=red,
            label='Metastized')
    ax.step(interval_bounds[:-1],
            cum_hazard(tv_base_hazard.mean(axis=0)),
            color=blue,
            linestyle='--',
            label='Had not metastized (time varying effect)')
    ax.step(interval_bounds[:-1],
            cum_hazard(tv_met_hazard.mean(axis=0)),
            color=red,
            linestyle='--',
            label='Metastized (time varying effect)')
    ax.set_xlim(0, df.time.max() - 4)
    ax.set_xlabel('Months since mastectomy')
    ax.set_ylim(0, 2)
    ax.set_ylabel(r'Cumulative hazard $\Lambda(t)$')
    ax.legend(loc=2)

    #Plot cumulative hazard and survival models with HPD
    fig, (hazard_ax, surv_ax) = plt.subplots(ncols=2,
                                             sharex=True,
                                             sharey=False,
                                             figsize=(16, 6))
    plot_with_hpd(interval_bounds[:-1],
                  tv_base_hazard,
                  cum_hazard,
                  hazard_ax,
                  color=blue,
                  label='Had not metastized')
    plot_with_hpd(interval_bounds[:-1],
                  tv_met_hazard,
                  cum_hazard,
                  hazard_ax,
                  color=red,
                  label='Metastized')
    hazard_ax.set_xlim(0, df.time.max())
    hazard_ax.set_xlabel('Months since mastectomy')
    hazard_ax.set_ylim(0, 2)
    hazard_ax.set_ylabel(r'Cumulative hazard $\Lambda(t)$')
    hazard_ax.legend(loc=2)
    plot_with_hpd(interval_bounds[:-1],
                  tv_base_hazard,
                  survival,
                  surv_ax,
                  color=blue)
    plot_with_hpd(interval_bounds[:-1],
                  tv_met_hazard,
                  survival,
                  surv_ax,
                  color=red)
    surv_ax.set_xlim(0, df.time.max())
    surv_ax.set_xlabel('Months since mastectomy')
    surv_ax.set_ylabel('Survival function $S(t)$')
    fig.suptitle('Bayesian survival model with time varying effects')

    plt.show()

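This script assumes a PyMC3-era stack; a plausible import block, reconstructed from the names used in the body (an assumption, not the original header):

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pymc3 as pm
import theano.tensor as T
from pymc3 import GaussianRandomWalk
from statsmodels import datasets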
Code example #18
# hittersRegTree.py
# Code to plot figures 8.1 and 8.2
# How to find region boundaries (e.g., 4.5, 117.5) from tree estimator?

from statsmodels import datasets
from sklearn.tree import DecisionTreeRegressor, plot_tree
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')

hitters = datasets.get_rdataset('Hitters', 'ISLR').data

hitters_use = hitters[['Hits', 'Years', 'Salary']].copy()
hitters_use.dropna(how='any', inplace=True)

tree = DecisionTreeRegressor(max_leaf_nodes=3)
X = hitters_use[['Hits', 'Years']]
y = np.log(hitters_use['Salary'])

tree.fit(X, y)

fig, ax = plt.subplots()
plot_tree(tree, feature_names=['Hits', 'Years'], ax=ax)

# Plot decision tree regions and training data
xx = np.linspace(hitters_use['Years'].min(), hitters_use['Years'].max())
yy = np.linspace(hitters_use['Hits'].min(), hitters_use['Hits'].max())
x_grid, y_grid = np.meshgrid(xx, yy)
zz = tree.predict(np.vstack((y_grid.ravel(), x_grid.ravel())).T)
z_grid = zz.reshape(x_grid.shape)
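The listing stops after building z_grid; continuing from the snippet above, one way to finish the figure 8.2 panel (a sketch, not the original continuation):

fig, ax = plt.subplots()
ax.contour(x_grid, y_grid, z_grid, colors='k')  # region boundaries
ax.scatter(hitters_use['Years'], hitters_use['Hits'], s=10, alpha=0.5)
ax.set_xlabel('Years')
ax.set_ylabel('Hits')
plt.show()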
Code example #19
# Figure 4.3

import matplotlib.pyplot as plt
from statsmodels import datasets
import pandas as pd
import numpy as np

default = datasets.get_rdataset('Default', 'ISLR').data
default['balance_grp'] = pd.cut(default['balance'],
                                bins=np.linspace(0, 2700, 10))

student_balance = default.loc[default['student'] == 'Yes',
                              ['balance', 'balance_grp']].groupby(
                                  'balance_grp').mean()
student_defrate = default.loc[default['student'] == 'Yes'].groupby(
    'balance_grp').apply(lambda x: np.sum(x['default'] == 'Yes') / x.shape[0])
student_defrate.name = 'default_rate'
student_data = pd.merge(student_balance,
                        student_defrate,
                        left_index=True,
                        right_index=True)

notstudent_balance = default.loc[default['student'] == 'No',
                                 ['balance', 'balance_grp']].groupby(
                                     'balance_grp').mean()
notstudent_defrate = default.loc[default['student'] == 'No'].groupby(
    'balance_grp').apply(lambda x: np.sum(x['default'] == 'Yes') / x.shape[0])
notstudent_defrate.name = 'default_rate'
notstudent_data = pd.merge(notstudent_balance,
                           notstudent_defrate,
                           left_index=True,
                           right_index=True)
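From here, figure 4.3's left panel plots the default rate against average balance for both groups; a sketch of that step (an assumed continuation, colors chosen arbitrarily):

fig, ax = plt.subplots()
ax.plot(student_data['balance'], student_data['default_rate'],
        c='orange', label='Student')
ax.plot(notstudent_data['balance'], notstudent_data['default_rate'],
        c='b', label='Not student')
ax.set_xlabel('Credit card balance')
ax.set_ylabel('Default rate')
ax.legend()
plt.show()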
Code example #20
# Plot figure 4.1

from statsmodels import datasets
import matplotlib.pyplot as plt
import numpy as np

credit = datasets.get_rdataset('Default', 'ISLR', cache=True)
credit_data = credit.data
credit_sample = credit_data.iloc[np.random.choice(credit_data.shape[0],
                                                  1000,
                                                  replace=False)]

fig = plt.figure(figsize=(8, 4))
ax1 = fig.add_subplot(121)
credit_sample.loc[credit_sample['default'] == 'No'].plot(x='balance',
                                                         y='income',
                                                         alpha=0.5,
                                                         kind='scatter',
                                                         ax=ax1)
credit_data.loc[credit_data['default'] == 'Yes'].plot(x='balance',
                                                      y='income',
                                                      marker='+',
                                                      color='brown',
                                                      kind='scatter',
                                                      alpha=0.9,
                                                      ax=ax1)

ax1.set_xlabel('Balance')
ax1.set_ylabel('Income')

ax2 = fig.add_subplot(143)
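The snippet breaks off as the right-hand panels are added; in ISLR figure 4.1 these are boxplots of balance and income by default status. One way to draw them (a sketch, assumed continuation):

credit_data.boxplot(column='balance', by='default', ax=ax2)
ax2.set_xlabel('Default')
ax2.set_ylabel('Balance')
ax3 = fig.add_subplot(144)
credit_data.boxplot(column='income', by='default', ax=ax3)
ax3.set_xlabel('Default')
ax3.set_ylabel('Income')
plt.show()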
Code example #21
# Figure 7.1
# Polynomial fit along with confidence intervals for Wage data
# Confidence intervals are drawn for ols regression only
# statsmodels does not provide confidence intervals for logistic regression

from statsmodels import datasets
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures

wage = datasets.get_rdataset('Wage', 'ISLR').data

wage_model = smf.ols('wage ~ age + I(age ** 2) + I(age ** 3) + I(age ** 4)',
                     data=wage)
wage_fit = wage_model.fit()

res_df = pd.DataFrame({
    'age': wage['age'],
    'wage_fit': wage_fit.fittedvalues,
    'wage_lower': wage_fit.get_prediction().conf_int()[:, 0],
    'wage_upper': wage_fit.get_prediction().conf_int()[:, 1]
})
res_df.sort_values('age', inplace=True)

fig = plt.figure(figsize=(7, 4))
ax1 = fig.add_subplot(121)
wage.plot(x='age', y='wage', kind='scatter', s=10, alpha=0.5, ax=ax1)
res_df.plot(x='age', y='wage_fit', c='k', ax=ax1)
res_df.plot(x='age', y='wage_lower', linestyle='--', c='r', ax=ax1)
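The listing cuts off before the upper confidence band; by symmetry with the line above, the next line would presumably be:

res_df.plot(x='age', y='wage_upper', linestyle='--', c='r', ax=ax1)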
Code example #22
# Plot figure 3.15

import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.datasets as datasets
import statsmodels.formula.api as smf
import numpy as np

credit = datasets.get_rdataset('Credit', 'ISLR').data


def calcRSS(beta1, beta2, var1, var2, yvar, df):
    # Solve for intercept
    alpha = np.mean(df[yvar] - beta1 * df[var1] - beta2 * df[var2])
    rss = np.sum((df[yvar] - alpha - beta1 * df[var1] - beta2 * df[var2])**2)
    return rss


age_limit_reg = smf.ols(formula='Balance ~ Limit + Age', data=credit)
age_limit_fit = age_limit_reg.fit()
age_limit_res = age_limit_fit.summary2().tables[1]
beta_limit = age_limit_fit.params['Limit']
beta_limit_se = age_limit_res.loc['Limit', 'Std.Err.']
beta_age = age_limit_fit.params['Age']
beta_age_se = age_limit_res.loc['Age', 'Std.Err.']

beta_limit_xx = np.linspace(beta_limit - 4 * beta_limit_se,
                            beta_limit + 4 * beta_limit_se)
beta_age_xx = np.linspace(beta_age - 4 * beta_age_se,
                          beta_age + 4 * beta_age_se)
beta_limit_grid, beta_age_grid = np.meshgrid(beta_limit_xx, beta_age_xx)
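The grids feed the RSS contour plot of figure 3.15; a sketch of the remaining step (an assumed continuation):

# Evaluate RSS over the coefficient grid and draw contours around the
# least squares estimate.
rss_grid = np.vectorize(
    lambda b1, b2: calcRSS(b1, b2, 'Limit', 'Age', 'Balance', credit)
)(beta_limit_grid, beta_age_grid)

fig, ax = plt.subplots()
ax.contour(beta_limit_grid, beta_age_grid, rss_grid)
ax.scatter(beta_limit, beta_age, c='k')
ax.set_xlabel(r'$\beta_{Limit}$')
ax.set_ylabel(r'$\beta_{Age}$')
plt.show()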
Code example #23
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.datasets as datasets

credit = datasets.get_rdataset('Credit', package='ISLR').data

axes = pd.plotting.scatter_matrix(credit[[
    'Balance', 'Age', 'Cards', 'Education', 'Income', 'Limit', 'Rating'
]],
                                  alpha=0.6,
                                  s=5,
                                  figsize=(6, 6))

[plt.setp(item.xaxis.get_label(), 'size', 7) for item in axes.ravel()]
[
    plt.setp(item.xaxis.get_majorticklabels(), 'size', 7)
    for item in axes.ravel()
]
[plt.setp(item.yaxis.get_label(), 'size', 7) for item in axes.ravel()]
[
    plt.setp(item.yaxis.get_majorticklabels(), 'size', 7)
    for item in axes.ravel()
]
plt.tight_layout()
Code example #24
def plantTraits():
    data = datasets.get_rdataset("plantTraits", "cluster", cache=True).data
    data.index.name = "ID"
    return data
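This helper assumes statsmodels' datasets module is already in scope, e.g.:

from statsmodels import datasets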
Code example #25
# Figure 5.2

import numpy as np
from statsmodels import datasets
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import polyfit, polyval

auto = datasets.get_rdataset('Auto', 'ISLR').data
n_obs = auto.shape[0]
n_train = int(n_obs / 2)
n_test = n_obs - n_train

np.random.seed(911)
train_ind = np.random.choice(range(n_obs), n_train, replace=False)
test_ind = set(range(n_obs)).difference(set(train_ind))
test_ind = list(test_ind)
auto_train = auto.iloc[train_ind]
auto_test = auto.iloc[test_ind]

poly_degree = []
poly_mse = []
for p in range(1, 11):
    poly_fit = polyfit(auto_train['horsepower'], auto_train['mpg'], deg=p)
    mpg_test = polyval(auto_test['horsepower'], poly_fit)
    mse_test = np.mean((mpg_test - auto_test['mpg'])**2)
    poly_degree.append(p)
    poly_mse.append(mse_test)

fig = plt.figure(figsize=(8, 4))
ax1 = fig.add_subplot(121)
ax1.plot(poly_degree, poly_mse, marker='o', c='r')
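The snippet ends mid-figure; labeling the validation-error panel would presumably finish along these lines (a sketch):

ax1.set_xlabel('Degree of polynomial')
ax1.set_ylabel('Mean squared error')
plt.show()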
Code example #26
    datasets_df = read_csv(documentation_file, usecols=datasets_usecols)
    logger.info(datasets_df.shape)
    packages = datasets_df['Package'].unique()
    for package in packages:
        if not exists(data_folder + package):
            logger.info('creating output folder {}'.format(data_folder + package))
            mkdir(data_folder + package)

    for index, row in datasets_df.iterrows():
        logger.info('loading {} / {} data'.format(row['Package'], row['Item']))
        current_pickle = data_folder + row['Package'] + '/' + row['Item'] + '.pkl'
        if exists(current_pickle):
            with open(current_pickle, 'rb') as current_fp:
                current_bundle = pickle.load(current_fp)
        else:
            current_bundle = get_rdataset(row['Item'], row['Package'])
            with open(current_pickle, 'wb') as current_fp:
                pickle.dump(current_bundle, current_fp)
        current_data = current_bundle.data
        logger.info('{} data has variables {}'.format(row['Item'], list(current_data)))
        if len(current_data.shape) > 1:
            logger.info('{} data has {} rows and {} variables'.format(row['Item'], current_data.shape[0],
                                                                      current_data.shape[1]))
        if 'title' in current_bundle.keys():
            current_title = current_bundle.title
            logger.info('{} data has title {}'.format(row['Item'], current_title))

    return_X_y = False

    if not exists(data_folder + 'statsmodels'):
        logger.info('creating output folder {}'.format(data_folder + 'statsmodels'))