Example #1
def test_qth_survival_time_with_cdf_instead_of_survival_function():
    cdf = np.linspace(0, 1, 50)
    assert utils.qth_survival_times(0.5, cdf, cdf=True) == 25
    assert utils.qth_survival_times(0.05, cdf, cdf=True) == 3

    cdf = np.linspace(0.1, 1, 50)
    assert utils.qth_survival_times(0.05, cdf, cdf=True) == -np.inf
    assert utils.qth_survival_times(0.50, cdf, cdf=True) == 22
Example #2
def test_qth_survival_times_with_multivariate_q():
    sf = np.linspace(1, 0, 50)
    sf_multi_df = pd.DataFrame({'sf': sf, 'sf**2': sf ** 2})

    assert_frame_equal(utils.qth_survival_times([0.2, 0.5], sf_multi_df), pd.DataFrame([[40, 25], [28, 15]], columns=[0.2, 0.5], index=['sf', 'sf**2']))
    assert_frame_equal(utils.qth_survival_times([0.2, 0.5], sf_multi_df['sf']), pd.DataFrame([[40, 25]], columns=[0.2, 0.5], index=['sf']))
    assert_frame_equal(utils.qth_survival_times(0.5, sf_multi_df), pd.DataFrame([[25], [15]], columns=[0.5], index=['sf', 'sf**2']))
    assert utils.qth_survival_times(0.5, sf_multi_df['sf']) == 25
Example #3
def test_qth_survival_times_with_multivariate_q():
    sf = np.linspace(1, 0, 50)
    sf_multi_df = pd.DataFrame({'sf': sf, 'sf**2': sf ** 2})

    assert_frame_equal(utils.qth_survival_times([0.2, 0.5], sf_multi_df), pd.DataFrame([[40, 28], [25, 15]], index=[0.2, 0.5], columns=['sf', 'sf**2']))
    assert_frame_equal(utils.qth_survival_times([0.2, 0.5], sf_multi_df['sf']), pd.DataFrame([40, 25], index=[0.2, 0.5], columns=['sf']))
    assert_frame_equal(utils.qth_survival_times(0.5, sf_multi_df), pd.DataFrame([[25, 15]], index=[0.5], columns=['sf', 'sf**2']))
    assert utils.qth_survival_times(0.5, sf_multi_df['sf']) == 25
Example #4
def test_qth_survival_times_with_duplicate_q_returns_valid_index_and_shape():
    sf = pd.DataFrame(np.linspace(1, 0, 50))

    q = pd.Series([0.5, 0.5, 0.2, 0.0, 0.0])
    actual = utils.qth_survival_times(q, sf)
    assert actual.shape[0] == len(q)
    npt.assert_almost_equal(actual.index.values, q.values)
Example #5
    def predict_percentile(self, X, p=0.5):
        """
        Returns the median lifetimes for the individuals, by default. If the survival curve of an
        individual does not cross 0.5 in the timeline (set in ``fit``), then the result is infinity.
        http://stats.stackexchange.com/questions/102986/percentile-loss-functions

        Parameters
        ----------
        X:  numpy array or DataFrame
            a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
            can be in any order. If a numpy array, columns must be in the
            same order as the training data.
        p: float, optional (default=0.5)
            the percentile, must be between 0 and 1.

        Returns
        -------
        percentiles: DataFrame

        See Also
        --------
        predict_median

        """
        subjects = _get_index(X)
        return qth_survival_times(p, self.predict_survival_function(X)[subjects]).T
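A minimal usage sketch (not part of the snippet above), assuming a CoxPHFitter fitted on the bundled rossi dataset from lifelines.datasets:

from lifelines import CoxPHFitter
from lifelines.datasets import load_rossi

rossi = load_rossi()
cph = CoxPHFitter().fit(rossi, duration_col="week", event_col="arrest")

# median predicted lifetime per subject; subjects whose survival curve never
# drops below 0.5 within the fitted timeline come back as inf
medians = cph.predict_percentile(rossi, p=0.5)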
Example #6
    def _conditional_time_to_event_(self):
        """
        Return a DataFrame, with index equal to survival_function_, that estimates the median
        duration remaining until the death event, given survival up until time t. For example, if an
        individual exists until age 1, their expected life remaining *given they lived to time 1*
        might be 9 years.

        Returns
        -------
        conditional_time_to_: DataFrame 
            with index equal to survival_function_

        """
        age = self.survival_function_.index.values[:, None]
        columns = ["%s - Conditional time remaining to event" % self._label]
        return (
            pd.DataFrame(
                qth_survival_times(self.survival_function_[self._label] * 0.5, self.survival_function_)
                .sort_index(ascending=False)
                .values,
                index=self.survival_function_.index,
                columns=columns,
            )
            - age
        )
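For context, a hedged sketch of the same idea on a fitted KaplanMeierFitter; it assumes lifelines exposes this computation through the public conditional_time_to_event_ property and uses the bundled Waltons dataset:

from lifelines import KaplanMeierFitter
from lifelines.datasets import load_waltons

waltons = load_waltons()
kmf = KaplanMeierFitter().fit(waltons["T"], waltons["E"])

# median remaining lifetime, given survival up to each time point on the index
print(kmf.conditional_time_to_event_.head())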
Example #7
 def predict_percentile(self, X, p=0.5):
     """
     X: a (n,d) covariate matrix
     Returns the median lifetimes for the individuals.
     http://stats.stackexchange.com/questions/102986/percentile-loss-functions
     """
     index = _get_index(X)
     return qth_survival_times(p, self.predict_survival_function(X)[index])
Example #9
 def predict_percentile(self, conditioned_sf, percentile):
     # Predict the month at which the customer's survival chance drops to the given percentile
     # This could also be written as predictions_50 = qth_survival_times(.50, conditioned_sf),
     # where the percentile can be adjusted as needed
     predictions = qth_survival_times(percentile, conditioned_sf)
     st.write(
         '### predictions\n Predicting the month at which the survival chance of the customer is ',
         percentile * 100, ' percentile')
     st.write(predictions[[self.customer]])
     return predictions
Example #10
    def predict_percentile(self, X, p=0.5):
        """
        X: a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
            can be in any order. If a numpy array, columns must be in the
            same order as the training data.

        By default, returns the median lifetimes for the individuals.
        http://stats.stackexchange.com/questions/102986/percentile-loss-functions
        """
        index = _get_index(X)
        return qth_survival_times(p, self.predict_survival_function(X)[index])
Example #12
    def predict(self, df, q=0.5):
        df, _ = self._prepare_df(df)

        if self.scaler is not None:
            X = df[self.feat_cols].values
            X_scaled = self.scaler.transform(X)
            df[self.feat_cols] = X_scaled
            if self.verbose:
                print("Scaled features based on training set results!")

        # the Cox model predicts log partial hazards and, from those, individual survival curves
        surv_funs = self.model.predict_survival_function(df)
        # we take the time point where the curve reaches 0.5 (the predicted median survival time)
        pred_time = qth_survival_times(q, surv_funs).squeeze()

        # print("head(df) =\n{}".format(df.head(5)))
        log_hazards = self.model.predict_log_partial_hazard(
            df).values.squeeze()
        # print("Log_hazards = {}".format(log_hazards.shape))
        # print("pred_time.shape", pred_time.shape)
        # print("ids.shape", df[self.id_col].values.shape)

        pred_df = pd.DataFrame({
            self.id_col: df.index.values,
            'pred_time': pred_time,
            'pred_per_pat(log_hazard)': log_hazards
        })

        perf_df = None
        # in case we have labels
        if self.time_col in df.columns and self.event_col in df.columns:
            # we can append some information to the pred_df as well
            true_time = df[self.time_col].values
            event_status = df[self.event_col].values
            diff = true_time - pred_time

            pred_df[self.event_col + "_truth"] = event_status
            pred_df[self.time_col + "_truth"] = true_time
            pred_df["error(time_prediction)"] = diff

            # only if we have the true label we can return performance
            # values
            perf_df = pd.DataFrame({
                'MSE': [np.mean(diff**2)],
                'MAE': [np.mean(np.abs(diff))],
                'C-index_time':
                [concordance_index(true_time, pred_time, event_status)],
                'C-index_log_hazard':
                [concordance_index(true_time, log_hazards, event_status)]
            })

        return pred_df, perf_df
Example #13
    def predict_percentile(self, X, p=0.5):
        """
        X: a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
            can be in any order. If a numpy array, columns must be in the
            same order as the training data.

        Returns the median lifetimes for the individuals, by default. If the survival curve of an
        individual does not cross 0.5, then the result is infinity.
        http://stats.stackexchange.com/questions/102986/percentile-loss-functions
        """
        subjects = _get_index(X)
        return qth_survival_times(
            p,
            self.predict_survival_function(X)[subjects]).T
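A tiny illustration of the infinity note above, on toy data not taken from the original: when the survival function never drops to the requested quantile, qth_survival_times can only return infinity.

import pandas as pd
from lifelines.utils import qth_survival_times

# a survival curve that plateaus at 0.7 and never reaches 0.5
sf = pd.Series([1.0, 0.9, 0.8, 0.7, 0.7], index=[0, 1, 2, 3, 4])

print(qth_survival_times(0.5, sf))   # inf: the curve never crosses 0.5
print(qth_survival_times(0.75, sf))  # 3: the first time S(t) <= 0.75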
Example #14
    def _conditional_time_to_event_(self):
        """
        Return a DataFrame, with index equal to survival_function_, that estimates the median
        duration remaining until the death event, given survival up until time t. For example, if an
        individual exists until age 1, their expected life remaining *given they lived to time 1*
        might be 9 years.

        Returns:
            conditional_time_to_: DataFrame, with index equal to survival_function_

        """
        age = self.survival_function_.index.values[:, None]
        columns = ['%s - Conditional time remaining to event' % self._label]
        return pd.DataFrame(qth_survival_times(self.survival_function_[self._label] * 0.5, self.survival_function_).T.sort_index(ascending=False).values,
                            index=self.survival_function_.index,
                            columns=columns) - age
Example #15
def test_qth_survival_times_with_varying_datatype_inputs():
    sf_list = [1.0, 0.75, 0.5, 0.25, 0.0]
    sf_array = np.array([1.0, 0.75, 0.5, 0.25, 0.0])
    sf_df_no_index = pd.DataFrame([1.0, 0.75, 0.5, 0.25, 0.0])
    sf_df_index = pd.DataFrame([1.0, 0.75, 0.5, 0.25, 0.0], index=[10, 20, 30, 40, 50])
    sf_series_index = pd.Series([1.0, 0.75, 0.5, 0.25, 0.0], index=[10, 20, 30, 40, 50])
    sf_series_no_index = pd.Series([1.0, 0.75, 0.5, 0.25, 0.0])

    q = 0.5

    assert utils.qth_survival_times(q, sf_list) == 2
    assert utils.qth_survival_times(q, sf_array) == 2
    assert utils.qth_survival_times(q, sf_df_no_index) == 2
    assert utils.qth_survival_times(q, sf_df_index) == 30
    assert utils.qth_survival_times(q, sf_series_index) == 30
    assert utils.qth_survival_times(q, sf_series_no_index) == 2
Example #17
def test_qth_survival_times_multi_dim_input():
    sf = np.linspace(1, 0, 50)
    sf_multi_df = pd.DataFrame({"sf": sf, "sf**2": sf ** 2})
    medians = utils.qth_survival_times(0.5, sf_multi_df)
    assert medians["sf"].loc[0.5] == 25
    assert medians["sf**2"].loc[0.5] == 15
Example #18
def qq_plot(model, ax=None, **plot_kwargs):
    """
    Produces a quantile-quantile plot of the empirical CDF against
    the fitted parametric CDF. Large deviances away from the line y=x
    can invalidate a model (though we expect some natural deviance in the tails).

    Parameters
    -----------
    model: obj
        A fitted lifelines univariate parametric model, like ``WeibullFitter``
    plot_kwargs:
        kwargs for the plot.

    Returns
    --------
    ax:
        The axes which was used.

    Examples
    ---------

    >>> from lifelines import *
    >>> from lifelines.plotting import qq_plot
    >>> from lifelines.datasets import load_rossi
    >>> df = load_rossi()
    >>> wf = WeibullFitter().fit(df['week'], df['arrest'])
    >>> qq_plot(wf)


    """
    from lifelines.utils import qth_survival_times
    from lifelines import KaplanMeierFitter

    if ax is None:
        ax = plt.gca()

    dist = get_distribution_name_of_lifelines_model(model)
    dist_object = create_scipy_stats_model_from_lifelines_model(model)

    COL_EMP = "empirical quantiles"
    COL_THEO = "fitted %s quantiles" % dist

    if CensoringType.is_left_censoring(model):
        kmf = KaplanMeierFitter().fit_left_censoring(model.durations, model.event_observed, label=COL_EMP)
    elif CensoringType.is_right_censoring(model):
        kmf = KaplanMeierFitter().fit_right_censoring(model.durations, model.event_observed, label=COL_EMP)
    elif CensoringType.is_interval_censoring(model):
        raise NotImplementedError("lifelines does not have a non-parametric interval model yet.")

    q = np.unique(kmf.cumulative_density_.values[:, 0])
    # this is equivalent to the old code `qth_survival_times(q, kmf.cumulative_density, cdf=True)`
    quantiles = qth_survival_times(1 - q, kmf.survival_function_)
    quantiles[COL_THEO] = dist_object.ppf(q)
    quantiles = quantiles.replace([-np.inf, 0, np.inf], np.nan).dropna()

    max_, min_ = quantiles[COL_EMP].max(), quantiles[COL_EMP].min()

    quantiles.plot.scatter(COL_THEO, COL_EMP, c="none", edgecolor="k", lw=0.5, ax=ax)
    ax.plot([min_, max_], [min_, max_], c="k", ls=":", lw=1.0)
    ax.set_ylim(min_, max_)
    ax.set_xlim(min_, max_)

    return ax
Example #19
def predictor(request):

    house_link = request.GET['weblink']

    print(request.GET['weblink'])

    # get training data
    feat = pd.read_csv('./survival_api_data/feat.csv')

    house_all = houseScraper(house_link)

    house_feat = [0]
    house_feat.append(house_all['days'])
    house_feat.append(house_all['discount'])
    house_feat.append(house_all['price'])
    house_feat.append(house_all['r2M'])
    house_feat.append(house_all['MonthList'])
    house_feat.append(house_all['MonthSold'])
    house_feat.append(house_all['NumList'])
    house_feat.append(house_all['NumPC'])
    house_feat.append(house_all['NumSold'])

    col_name = [
        'sold', 'days', 'discount', 'price', 'r2M', 'MonthList', 'MonthSold',
        'NumList', 'NumPC', 'NumSold'
    ]

    feat_df = pd.DataFrame([house_feat], columns=col_name)

    feat_all = pd.concat([feat, feat_df], ignore_index=True)

    censor = house_all['days'] // 30 + 3
    cph_time = get_time_model(censor, feat_all)
    pred_time_0 = cph_time.predict_survival_function(feat_df)
    pred_time = pred_time_0.apply(
        lambda c: (c / c.loc[feat_all.loc[c.name, 'days']]).clip_upper(1))
    pred_time_75 = qth_survival_times(0.25, pred_time)
    pred_time_50 = qth_survival_times(0.5, pred_time)

    pred_week = int((pred_time_50 - house_all['days']) // 7)

    cph_off = get_off_model(5, feat_all)
    pred_off_0 = cph_off.predict_survival_function(feat_df)
    pred_off = pred_off_0.apply(
        lambda c: (c / c.loc[feat_all.loc[c.name, 'discount']]).clip_upper(1))
    pred_off_75 = qth_survival_times(0.25, pred_off)
    pred_off_50 = qth_survival_times(0.5, pred_off)

    if pred_off_75 != float("inf"):
        off = pred_off_75
    elif pred_off_50 != float("inf"):
        off = pred_off_50

    else:
        off = 0

    off_str = "{:.1f}%".format(off)
    off = round(off / 100, 1)
    off_usd = int(round((1 - off) * house_all['price']))
    off_usd_str = '$' + format(off_usd, ',')

    dom_w = int(math.ceil(house_all['days'] / 7))
    dom_w_str = str(dom_w) + ' weeks'
    pred_week_str = str(pred_week) + ' weeks'

    response_dict = {
        "log": True,
        # "pred_time_75": pred_time_75,
        # "pred_off_75": pred_off_75,
        # "pred_off_50": pred_off_50,
        "prediction": {
            "address": house_all['address'],
            "listing_price": house_all['listingPrice'],
            "dom": dom_w_str,
            "pred_weeks": pred_week_str,
            "off_usd": off_usd_str,
            "off_pct": off_str
        }
    }

    response = json.dumps(response_dict)

    return HttpResponse(response)
Example #20
conditioned_sf = unconditioned_sf.apply(lambda c: (c/c.loc[data.loc[c.name, 'tenure']]).clip_upper(1))

# now we can investigate customers to see how the conditioning has affected their survival over the baseline rate
subject = 12
unconditioned_sf[subject].plot(ls="--", color="#A60628", label="unconditioned")
conditioned_sf[subject].plot(color="#A60628", label="conditioned on $T>58$")
plt.legend()
# we can see that customer 12 is still a customer after 58 months, which means customer 12's survival curve drops more
# slowly than the baseline for similar customers without that condition.

# predict_survival_function has created a matrix of survival probabilities for each remaining customer at each
# point in time. What we need to do now is use that to select a single value as a prediction for how long a customer
# will last. Let's use the median.
# predictions_50 = median_survival_times(conditioned_sf)
likelihood_cutoff = 0.5
predictions_50 = qth_survival_times(likelihood_cutoff, conditioned_sf) # same as above but specifying the %
# this gave us a single row with the month number where the customer has a 50% likelihood of churning.

# Let's join it to some data to investigate.
values = predictions_50.T.join(data[['MonthlyCharges', 'tenure']])
values['RemainingValue'] = values['MonthlyCharges'] * (values[likelihood_cutoff] - values['tenure'])
# now looking at the RemainingValue column we can see which customers would most affect our bottom line.
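# Illustrative follow-up (not part of the original walkthrough): rank customers by the
# revenue at risk, so retention effort goes to the most valuable at-risk accounts first.
values.sort_values('RemainingValue', ascending=False).head(10)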

# Great, so we know which customers have the highest risk of churn and when they are likely to churn, but what can we do?
# Let's take a look at our coefficients from earlier.
# We can see that the features that impact survival positively are 'Contract_One year', 'Contract_Two year',
# 'PaymentMethod_Bank transfer (automatic)', 'PaymentMethod_Credit card (automatic)'. Beyond these the results are
# insignificant. Let's compare customers with these features to understand the best place to spend money.
upgrades = ['Contract_One year',
            'Contract_Two year',
            'PaymentMethod_Bank transfer (automatic)',
Example #21
def qq_plot(model, ax=None, **plot_kwargs):
    """
    Produces a quantile-quantile plot of the empirical CDF against
    the fitted parametric CDF. Large deviances away from the line y=x
    can invalidate a model (though we expect some natural deviance in the tails).

    Parameters
    -----------
    model: obj
        A fitted lifelines univariate parametric model, like ``WeibullFitter``
    plot_kwargs:
        kwargs for the plot.

    Returns
    --------
    ax:
        The axes which was used.

    Examples
    ---------
    .. code:: python

        from lifelines import *
        from lifelines.plotting import qq_plot
        from lifelines.datasets import load_rossi
        df = load_rossi()
        wf = WeibullFitter().fit(df['week'], df['arrest'])
        qq_plot(wf)

    Notes
    ------
    The interval censoring case uses the mean between the upper and lower bounds.

    """
    from lifelines.utils import qth_survival_times
    from lifelines import KaplanMeierFitter

    if ax is None:
        ax = plt.gca()

    dist = get_distribution_name_of_lifelines_model(model)
    dist_object = create_scipy_stats_model_from_lifelines_model(model)

    COL_EMP = "empirical quantiles"
    COL_THEO = "fitted %s quantiles" % dist

    if CensoringType.is_left_censoring(model):
        kmf = KaplanMeierFitter().fit_left_censoring(
            model.durations, model.event_observed, label=COL_EMP, weights=model.weights, entry=model.entry
        )
        sf, cdf = kmf.survival_function_[COL_EMP], kmf.cumulative_density_[COL_EMP]
    elif CensoringType.is_right_censoring(model):
        kmf = KaplanMeierFitter().fit_right_censoring(
            model.durations, model.event_observed, label=COL_EMP, weights=model.weights, entry=model.entry
        )
        sf, cdf = kmf.survival_function_[COL_EMP], kmf.cumulative_density_[COL_EMP]

    elif CensoringType.is_interval_censoring(model):
        kmf = KaplanMeierFitter().fit_interval_censoring(
            model.lower_bound, model.upper_bound, label=COL_EMP, weights=model.weights, entry=model.entry
        )
        sf, cdf = kmf.survival_function_.mean(1), kmf.cumulative_density_[COL_EMP + "_lower"]

    q = np.unique(cdf.values)

    quantiles = qth_survival_times(1 - q, sf)
    quantiles[COL_THEO] = dist_object.ppf(q)
    quantiles = quantiles.replace([-np.inf, 0, np.inf], np.nan).dropna()

    max_, min_ = quantiles[COL_EMP].max(), quantiles[COL_EMP].min()

    quantiles.plot.scatter(COL_THEO, COL_EMP, c="none", edgecolor="k", lw=0.5, ax=ax)
    ax.plot([min_, max_], [min_, max_], c="k", ls=":", lw=1.0)
    ax.set_ylim(min_, max_)
    ax.set_xlim(min_, max_)

    return ax
Example #22
def plot_survival(unique_groups, grouped_data, analysis_type, censors, ci, showplot, stat_results, time='Months'):
	#plot survival curve
	kmf = KaplanMeierFitter()
	fig, ax = plt.subplots()
	n_in_groups = []

	f = open('Kaplan_%s.txt' % (analysis_type), 'a')
	f.write("\nPercent %s\n" % analysis_type)
	headers = "Group\t"
	for x in range(95,-1,-5):
		headers += str(x) + "%\t"
	f.write("%s\n" % headers)


	for i, group in enumerate(unique_groups):
		data = grouped_data.get_group(group)
		n_in_groups.append(len(data))
		# Adjust survival data from days to whatever form wanted
		if time.lower() == 'months':
			survival_time = (data['survival']/(365/12))
		elif time.lower() == 'years':
			survival_time = (data['survival']/(365))
		else:
			survival_time = data['survival']
		kmf.fit(survival_time, data['event'], label = group)
		# print(data[survival])

		# print(kmf.survival_function_)
		f.write("%s\t" % group)
		for x in range(95, -1, -5):
			f.write(str(qth_survival_times(x/100, kmf.survival_function_)) + "\t")
		f.write("\n")	

		kmf.plot(ax=ax, show_censors=censors, ci_show=ci, linewidth=2.5)

	# Make the graph pretty!
	textbox = dict(horizontalalignment = 'left', verticalalignment = 'bottom', fontname = 'Arial', fontsize = 18)
	labels = dict(horizontalalignment = 'center', verticalalignment = 'center', fontname = 'Arial', fontsize = 28)

	ax.grid(False)
	ax.set_ylim(0,1.05)
	ax.spines['left'].set_linewidth(2.5)
	ax.spines['right'].set_linewidth(2.5)
	ax.spines['top'].set_linewidth(2.5)
	ax.spines['bottom'].set_linewidth(2.5)
	ax.yaxis.set_tick_params(width=2.5)
	ax.xaxis.set_tick_params(width=2.5)
	ax.xaxis.set_ticks_position('bottom')
	ax.yaxis.set_ticks_position('left')

	# plt.title('%s' % (analysis_type), labels, y = 1.05)
	plt.xlabel('%s Post-Diagnosis' % time, labels, labelpad = 20)
	if analysis_type == 'survival':
		plt.ylabel('Overall Survival', labels, labelpad = 20)
	else:
		plt.ylabel('Relapse-Free Survival', labels, labelpad=20)
	plt.xticks(fontname = 'Arial', fontsize = 24)
	plt.yticks(fontname = 'Arial', fontsize = 24)
	ax.tick_params(axis='y', pad=10)
	ax.tick_params(axis='x', pad=10)


	legend = ax.legend(frameon=False,loc=3)
	counter=0
	for label in legend.get_texts():
		label.set_fontsize(20)
		label.set_text('%s   n=%d' % (unique_groups[counter], n_in_groups[counter]))
		counter += 1

	if len(unique_groups) == 2:	
		plt.text(0.95, 0.05, 'p = %.2g' % (stat_results.p_value), fontname='Arial', fontsize=20, ha='right', transform=ax.transAxes)

	plt.tight_layout()


	fig.savefig('Kaplan_%s.png' % analysis_type, transparent = True)
	fig.savefig('Kaplan_%s.eps' % analysis_type, transparent = True)
	if showplot == True:
		plt.show()
	plt.close(fig)
Example #23
def test_qth_survival_times_multi_dim_input():
    sf = np.linspace(1, 0, 50)
    sf_multi_df = pd.DataFrame({'sf': sf, 'sf**2': sf ** 2})
    medians = utils.qth_survival_times(0.5, sf_multi_df)
    assert medians.loc['sf', 0.5] == 25
    assert medians.loc['sf**2', 0.5] == 15
Example #25
 def predict_percentile(self, df, p=0.5):
     return qth_survival_times(p, self.predict_survival_function(df))
Example #26
def test_qth_survival_time_with_cdf_instead_of_survival_function():
    cdf = np.linspace(0, 1, 50)
    assert utils.qth_survival_times(0.5, cdf, cdf=True) == 25
Example #27
def qq_plot(model, **plot_kwargs):
    """
    Produces a quantile-quantile plot of the empirical CDF against
    the fitted parametric CDF. Large deviances away from the line y=x
    can invalidate a model (though we expect some natural deviance in the tails).

    Parameters
    -----------
    model: obj
        A fitted lifelines univariate parametric model, like ``WeibullFitter``
    plot_kwargs:
        kwargs for the plot.

    Returns
    --------
    ax: axis object

    Examples
    ---------

    >>> from lifelines import *
    >>> from lifelines.plotting import qq_plot
    >>> from lifelines.datasets import load_rossi
    >>> df = load_rossi()
    >>> wf = WeibullFitter().fit(df['week'], df['arrest'])
    >>> qq_plot(wf)


    """
    from lifelines.utils import qth_survival_times
    from lifelines import KaplanMeierFitter
    from lifelines.fitters import KnownModelParametericUnivariateFitter

    assert isinstance(model, KnownModelParametericUnivariateFitter)

    set_kwargs_ax(plot_kwargs)
    ax = plot_kwargs.pop("ax")

    dist = get_distribution_name_of_lifelines_model(model)
    dist_object = create_scipy_stats_model_from_lifelines_model(model)

    COL_EMP = "empirical quantiles"
    COL_THEO = "fitted %s quantiles" % dist

    kmf = KaplanMeierFitter().fit(model.durations,
                                  model.event_observed,
                                  left_censorship=model.left_censorship,
                                  label=COL_EMP)
    if model.left_censorship:
        q = np.unique(kmf.cumulative_density_.values[:, 0])
        quantiles = qth_survival_times(q, kmf.cumulative_density_, cdf=True)
    else:
        q = np.unique(1 - kmf.survival_function_.values[:, 0])
        quantiles = qth_survival_times(q, 1 - kmf.survival_function_, cdf=True)

    quantiles[COL_THEO] = dist_object.ppf(q)
    quantiles = quantiles.replace([-np.inf, 0, np.inf], np.nan).dropna()

    max_, min_ = quantiles[COL_EMP].max(), quantiles[COL_EMP].min()

    quantiles.plot.scatter(COL_THEO,
                           COL_EMP,
                           c="none",
                           edgecolor="k",
                           lw=0.5,
                           ax=ax)
    ax.plot([min_, max_], [min_, max_], c="k", ls=":", lw=1.0)
    ax.set_ylim(min_, max_)
    ax.set_xlim(min_, max_)

    return ax
Example #28
     [1000, 10000, 0, 0, 1, 0, 2], [1000, 10000, 0, 1, 1, 0, 2]]
num_d = len(d)

dfn = pd.DataFrame(
    d, columns=["ID", "KM", "DEAD", "ENGINE", "MOUNTAIN", "CITY", "MONDAY"])
print(dfn)
censored_subjects = censored_subjects.append(dfn, ignore_index=True)

print(censored_subjects)

unconditioned_sf = cph.predict_survival_function(censored_subjects)
print(unconditioned_sf)

from lifelines.utils import median_survival_times, qth_survival_times

predictions_75 = qth_survival_times(0.75, unconditioned_sf)
predictions_25 = qth_survival_times(0.25, unconditioned_sf)
predictions_50 = median_survival_times(unconditioned_sf)
print(predictions_50)

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 4))
for f in unconditioned_sf:
    ax.plot(unconditioned_sf[f], alpha=.5, label=f)
#ax.legend()

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 4))
for i, f in enumerate(reversed(unconditioned_sf.columns)):
    #print( i )
    if i < num_d:
        print(i, f)
        ax.plot(unconditioned_sf[f], alpha=1, label=f)