Example #1
    def predict_log_partial_hazard(self, X):
        r"""
        This is equivalent to R's linear.predictors.
        Returns the log of the partial hazard for the individuals, partial since the
        baseline hazard is not included. Equal to :math:`(x - \bar{x})'\beta`


        Parameters
        ----------
        X: numpy array or DataFrame
            a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
            can be in any order. If a numpy array, columns must be in the
            same order as the training data.

        Returns
        -------
        DataFrame

        Note
        -----
        If X is a DataFrame, the order of the columns does not matter. But
        if X is an array, the column ordering is assumed to be the
        same as in the training dataset.
        """
        if isinstance(X, pd.DataFrame):
            order = self.params_.index
            X = X[order]
            check_for_numeric_dtypes_or_raise(X)

        X = X.astype(float)
        index = _get_index(X)
        X = normalize(X, self._norm_mean.values, 1)
        return pd.DataFrame(np.dot(X, self.params_), index=index)
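A minimal usage sketch for the method above, assuming these snippets come from lifelines' CoxPHFitter and using the bundled rossi dataset (the exact return container, DataFrame vs. Series, varies by lifelines version):

from lifelines import CoxPHFitter
from lifelines.datasets import load_rossi

rossi = load_rossi()
cph = CoxPHFitter().fit(rossi, duration_col="week", event_col="arrest")

# Log partial hazard (x - x_bar)' * beta for each subject; exponentiating it
# recovers the values returned by predict_partial_hazard.
log_ph = cph.predict_log_partial_hazard(rossi)
print(log_ph.head())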
Example #2
    def predict_log_partial_hazard(self, X):
        r"""
        This is equivalent to R's linear.predictors.
        Returns the log of the partial hazard for the individuals, partial since the
        baseline hazard is not included. Equal to :math:`(x - \bar{x})'\beta`


        Parameters
        ----------
        X: numpy array or DataFrame
            a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
            can be in any order. If a numpy array, columns must be in the
            same order as the training data.

        Returns
        -------
        DataFrame

        Note
        -----
        If X is a DataFrame, the order of the columns does not matter. But
        if X is an array, the column ordering is assumed to be the
        same as in the training dataset.
        """
        if isinstance(X, pd.DataFrame):
            order = self.hazards_.index
            X = X[order]
            check_for_numeric_dtypes_or_raise(X)

        X = X.astype(float)
        index = _get_index(X)
        X = normalize(X, self._norm_mean.values, 1)
        return pd.DataFrame(np.dot(X, self.hazards_), index=index)
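The only difference from Example #1 is the name of the coefficient attribute, which appears to be `hazards_` in older lifelines releases and `params_` in newer ones. A version-agnostic lookup, as a sketch:

from lifelines import CoxPHFitter
from lifelines.datasets import load_rossi

cph = CoxPHFitter().fit(load_rossi(), duration_col="week", event_col="arrest")

# Use whichever coefficient attribute this lifelines version exposes.
coefs = getattr(cph, "params_", None)
if coefs is None:
    coefs = cph.hazards_
print(coefs)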
Example #3
    def _check_values(self, df, T, E, weights, entries):
        check_for_numeric_dtypes_or_raise(df)
        check_nans_or_infs(df)
        check_nans_or_infs(T)
        check_nans_or_infs(E)
        check_positivity(T)
        check_complete_separation(df, E, T, self.event_col)

        if self.weights_col:
            if (weights.astype(int) != weights).any() and not self.robust:
                warnings.warn(
                    dedent(
                        """It appears your weights are not integers, possibly propensity or sampling scores then?
                                        It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to
                                        estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"""
                    ),
                    StatisticalWarning,
                )
            if (weights <= 0).any():
                raise ValueError(
                    "values in weight column %s must be positive." %
                    self.weights_col)

        if self.entry_col:
            count_invalid_rows = (entries > T).sum()
            if count_invalid_rows:
                warnings.warn(
                    "There exist %d rows where entry > duration." % count_invalid_rows)
Example #4
    def predict_log_partial_hazard(self, X):
        r"""
        This is equivalent to R's linear.predictors.
        Returns the log of the partial hazard for the individuals, partial since the
        baseline hazard is not included. Equal to :math:`(x - \bar{x})'\beta`

        Parameters
        ----------
        X: numpy array or DataFrame
            a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
            can be in any order. If a numpy array, columns must be in the
            same order as the training data.

        Returns
        -------
        Series

        Note
        -----
        If X is a DataFrame, the order of the columns does not matter. But
        if X is an array, the column ordering is assumed to be the
        same as in the training dataset.
        """
        if isinstance(X, pd.DataFrame):
            check_for_numeric_dtypes_or_raise(X)

        X = X.astype(float)
        X = normalize(X, self._norm_mean.values, 1)
        X_pt = torch.tensor(X, dtype=self.type_pt)
        return pd.Series(self.net(X_pt).detach().numpy().ravel())
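The `normalize(X, self._norm_mean.values, 1)` call in each variant above just centers the covariates at the training means, which is where the :math:`(x - \bar{x})` in the docstrings comes from. A stand-in sketch of that step (for illustration, not lifelines' actual implementation):

import numpy as np

def center(X, mean):
    # stand-in for normalize(X, mean, 1): subtract the training means, divide by std = 1
    return (X - mean) / 1.0

X = np.array([[1.0, 10.0], [3.0, 30.0]])
print(center(X, X.mean(axis=0)))  # [[-1., -10.], [ 1.,  10.]]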
Example #5
    def _check_values(self, df, events, start, stop):
        # check_for_overlapping_intervals(df) # this is currently too slow for production.
        check_nans_or_infs(df)
        check_low_var(df)
        check_complete_separation_low_variance(df, events, self.event_col)
        check_for_numeric_dtypes_or_raise(df)
        check_for_immediate_deaths(events, start, stop)
        check_for_instantaneous_events(start, stop)
Example #6
    def _check_values(self, df, events, start, stop):
        # check_for_overlapping_intervals(df) # this is currently too slow for production.
        check_nans_or_infs(df)
        check_low_var(df)
        check_complete_separation_low_variance(df, events, self.event_col)
        check_for_numeric_dtypes_or_raise(df)
        check_for_immediate_deaths(events, start, stop)
        check_for_instantaneous_events(start, stop)
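The two identical `_check_values` variants above validate long-format data for a time-varying model, where each row is one `(start, stop]` interval per subject. A sketch of the kind of rows the last two checks flag (all column names here are illustrative):

import pandas as pd

long_df = pd.DataFrame({
    "id":    [1, 1, 2, 2],
    "start": [0, 3, 0, 5],
    "stop":  [3, 8, 5, 5],   # last row has start == stop: a zero-length interval
    "event": [0, 1, 0, 1],
    "x":     [0.5, 0.7, 1.2, 1.1],
})
# check_for_instantaneous_events flags intervals with start == stop, and
# check_for_immediate_deaths flags subjects whose event occurs in a
# zero-length interval right at entry.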
Example #7
    def _check_values(self, df, T, E, event_col):
        check_for_numeric_dtypes_or_raise(df)
        check_nans_or_infs(T)
        check_nans_or_infs(E)
        check_nans_or_infs(df)
        check_complete_separation(df, E, T, event_col)

        if self.fit_intercept:
            check_low_var(df)
Example #8
    def _check_values(self, X, T, E):
        check_for_numeric_dtypes_or_raise(X)
        check_nans_or_infs(T)
        check_nans_or_infs(X)
Example #9
    def _check_values(self, X, T, E):
        check_for_numeric_dtypes_or_raise(X)
        check_nans_or_infs(T)
        check_nans_or_infs(X)
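The validators shared across these `_check_values` variants come down to dtype and finiteness checks on the covariates, durations, and event indicators. Minimal stand-ins (assumptions for illustration, not the library's actual code) showing what they guard against:

import numpy as np
import pandas as pd

def check_for_numeric_dtypes_or_raise(df):
    # Reject object/string/categorical columns before fitting.
    nonnumeric = df.select_dtypes(exclude=[np.number, "bool"]).columns.tolist()
    if nonnumeric:
        raise TypeError("DataFrame contains non-numeric columns: %s" % nonnumeric)

def check_nans_or_infs(values):
    # Reject NaN/inf anywhere in durations, events, or covariates.
    arr = np.asarray(values, dtype=float)
    if not np.isfinite(arr).all():
        raise TypeError("Values contain NaNs or infs; clean or impute them before fitting.")

df = pd.DataFrame({"T": [1.0, 2.0], "x": [0.3, np.nan]})
check_for_numeric_dtypes_or_raise(df)   # passes: all columns are numeric
check_nans_or_infs(df["x"])             # raises TypeError because of the NaN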
Example #10
    def _fit(
        self,
        log_likelihood_function,
        df,
        Ts,
        regressors,
        event_col=None,
        show_progress=False,
        timeline=None,
        weights_col=None,
        robust=False,
        initial_point=None,
        entry_col=None,
    ):

        self._time_fit_was_called = datetime.utcnow().strftime(
            "%Y-%m-%d %H:%M:%S") + " UTC"
        self.weights_col = weights_col
        self.entry_col = entry_col
        self.event_col = event_col
        self._n_examples = df.shape[0]
        self.timeline = timeline
        self.robust = robust
        self.regressors = regressors  # TODO name

        if self.event_col is not None:
            E = pass_for_numeric_dtypes_or_raise_array(df.pop(self.event_col))
        else:
            E = pd.Series(np.ones(self._n_examples, dtype=bool), index=df.index, name="E")

        if self.weights_col is not None:
            weights = pass_for_numeric_dtypes_or_raise_array(df.pop(self.weights_col)).astype(float)
        else:
            weights = pd.Series(np.ones(self._n_examples, dtype=float), index=df.index, name="weights")

        if entry_col is not None:
            entries = pass_for_numeric_dtypes_or_raise_array(df.pop(entry_col)).astype(float)
        else:
            entries = pd.Series(np.zeros(self._n_examples, dtype=float), index=df.index, name="entry")

        check_nans_or_infs(E)
        E = E.astype(bool)
        self.event_observed = E.copy()
        self.entry = entries.copy()
        self.weights = weights.copy()

        df = df.astype(float)
        self._check_values(df, coalesce(Ts[1], Ts[0]), E, weights, entries)
        check_for_numeric_dtypes_or_raise(df)
        check_nans_or_infs(df)
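
        # The covariates are standardized below for numerical stability during optimization;
        # the fitted coefficients are mapped back to the original scale via params_ = _params / _norm_std.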

        _norm_std = df.std(0)
        _norm_std[_norm_std < 1e-8] = 1.0
        df_normalized = normalize(df, 0, _norm_std)

        Xs = self._create_Xs_dict(df_normalized)

        self._LOOKUP_SLICE = self._create_slicer(Xs)

        _index = pd.MultiIndex.from_tuples(
            sum(([(name, col) for col in columns]
                 for name, columns in regressors.items()), []))

        self._norm_std = pd.Series(
            [_norm_std.loc[variable_name] for _, variable_name in _index],
            index=_index)

        _params, self._log_likelihood, self._hessian_ = self._fit_model(
            log_likelihood_function,
            Ts,
            Xs,
            E.values,
            weights.values,
            entries.values,
            show_progress=show_progress,
            initial_point=initial_point,
        )
        self.params_ = _params / self._norm_std

        self.variance_matrix_ = self._compute_variance_matrix()
        self.standard_errors_ = self._compute_standard_errors(
            Ts, E.values, weights.values, entries.values, Xs)
        self.confidence_intervals_ = self._compute_confidence_intervals()
        self._predicted_median = self.predict_median(df)
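Since this `_fit` appears to be the shared fitting routine behind lifelines-style parametric regression models, exercising it just means calling the public `fit` of one of them. A sketch with `WeibullAFTFitter` and the bundled rossi data; the attributes printed are the ones set above:

from lifelines import WeibullAFTFitter
from lifelines.datasets import load_rossi

rossi = load_rossi()
aft = WeibullAFTFitter().fit(rossi, duration_col="week", event_col="arrest")

print(aft.params_)                 # coefficients, de-normalized by _norm_std as above
print(aft.standard_errors_)
print(aft.confidence_intervals_)
print(aft.predict_median(rossi))   # the quantity cached as _predicted_median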