Python pass_for_numeric_dtypes_or_raiseの例、lifelines.utils.pass_for_numeric_dtypes_or_raise Pythonの例

コード例 #1

0

ファイルを表示

 def _check_values(df, T, E):
     """Run the pre-fit validation helpers over the model inputs.

     ``df`` is checked for numeric dtypes first; ``T``, ``E`` and ``df`` are
     then each handed to ``check_nans_or_infs`` (in that order), followed by
     the low-variance check on ``df`` and the complete-separation check.
     """
     pass_for_numeric_dtypes_or_raise(df)
     # Same helper applied in the same order: T, then E, then df.
     for values in (T, E, df):
         check_nans_or_infs(values)
     check_low_var(df)
     check_complete_separation(df, E, T)

コード例 #2

0

ファイルを表示

ファイル: cox_time_varying_fitter.py プロジェクト: Path-AI/lifelines

    def predict_log_partial_hazard(self, X):
        r"""
        Compute the log of the partial hazard for the individuals in ``X``.

        This is equivalent to R's linear.predictors. The hazard is "partial"
        because the baseline hazard is not included. Equal to
        :math:`\beta (X - \bar{X})`

        Parameters
        ----------
        X: numpy array or DataFrame
            a (n,d) covariate numpy array or DataFrame. A DataFrame may have
            its columns in any order, since they are re-selected using the
            columns of ``self.hazards_``. A numpy array must already use the
            same column order as the training data.

        Returns
        -------
        DataFrame
            indexed by the value returned from ``_get_index`` for the input.
        """
        if isinstance(X, pd.DataFrame):
            # Align the columns with the fitted coefficients, then validate.
            X = X[self.hazards_.columns]
            pass_for_numeric_dtypes_or_raise(X)

        covariates = X.astype(float)
        row_index = _get_index(covariates)
        centered = normalize(covariates, self._norm_mean.values, 1)
        linear_predictor = np.dot(centered, self.hazards_.T)
        return pd.DataFrame(linear_predictor, index=row_index)

コード例 #3

0

ファイルを表示

 def _check_values(df, stop_times_events):
     """Run the validation helpers on ``df`` and ``stop_times_events``.

     The ``'event'`` entry of ``stop_times_events`` is passed to the
     complete-separation check; the whole ``stop_times_events`` object is
     passed to the immediate-death and instantaneous-event checks.
     """
     # NOTE: check_for_overlapping_intervals(df) is intentionally not run
     # here; it is currently too slow for production.
     check_low_var(df)
     event_flags = stop_times_events['event']
     check_complete_separation_low_variance(df, event_flags)
     pass_for_numeric_dtypes_or_raise(df)
     for interval_check in (check_for_immediate_deaths,
                            check_for_instantaneous_events):
         interval_check(stop_times_events)

コード例 #4

0

ファイルを表示

ファイル: cox_time_varying_fitter.py プロジェクト: Path-AI/lifelines

 def _check_values(df, events, start, stop, event_col):
     """Run every input validation helper, in a fixed order.

     Each helper is called with the arguments listed next to it below; the
     order of the table is the order in which the checks execute.
     """
     # NOTE: check_for_overlapping_intervals(df) is intentionally not run
     # here; it is currently too slow for production.
     validations = (
         (check_nans_or_infs, (df,)),
         (check_low_var, (df,)),
         (check_complete_separation_low_variance, (df, events, event_col)),
         (pass_for_numeric_dtypes_or_raise, (df,)),
         (check_for_immediate_deaths, (events, start, stop)),
         (check_for_instantaneous_events, (start, stop)),
     )
     for validate, arguments in validations:
         validate(*arguments)

コード例 #5

0

ファイルを表示

ファイル: coxph_fitter.py プロジェクト: nonabelian/lifelines

    def _check_values(self, X):
        """Warn about near-constant columns of ``X``, then validate dtypes.

        A ``RuntimeWarning`` listing the affected column names is emitted
        when any column's variance (``X.var(0)``) is below the threshold.
        ``X`` is then passed to ``pass_for_numeric_dtypes_or_raise``.
        """
        # The literal 10e-5 equals 1e-4.
        variance_threshold = 10e-5
        low_var = X.var(0) < variance_threshold
        if low_var.any():
            offending_columns = str(list(X.columns[low_var]))
            message_template = (
                "Column(s) %s have very low variance."
                " This may harm convergence. Try dropping this redundant column before fitting"
                " if convergence fails."
            )
            warnings.warn(message_template % offending_columns, RuntimeWarning)
        pass_for_numeric_dtypes_or_raise(X)

コード例 #6

0

ファイルを表示

    def predict_log_partial_hazard(self, X):
        r"""
        Compute the log of the partial hazard for the individuals in ``X``.

        This is equivalent to R's linear.predictors. The hazard is "partial"
        because the baseline hazard is not included. Equal to
        :math:`\beta (X - mean(X_{train}))`

        Parameters
        ----------
        X:  numpy array, DataFrame or Series
            a (n,d) covariate numpy array or DataFrame. A DataFrame may have
            its columns in any order, since they are re-selected using the
            columns of ``self.hazards_``. A numpy array must already use the
            same column order as the training data. A Series whose length
            equals the number of fitted columns (or that number plus 2) is
            treated as a single row; any other Series is only accepted when
            exactly one column was fitted.

        Returns
        -------
        log_partial_hazard: DataFrame
        """
        hazard_names = self.hazards_.columns

        if isinstance(X, pd.Series):
            n_fitted = len(hazard_names)
            if X.shape[0] in (n_fitted + 2, n_fitted):
                # The Series describes one individual: turn it into a
                # one-row frame and pick the fitted columns.
                X = X.to_frame().T[hazard_names]
            else:
                assert len(hazard_names) == 1, "Series not the correct arugment"
                X = pd.DataFrame(X)
            pass_for_numeric_dtypes_or_raise(X)
        elif isinstance(X, pd.DataFrame):
            X = X[hazard_names]
            pass_for_numeric_dtypes_or_raise(X)

        covariates = X.astype(float)
        row_index = _get_index(covariates)
        centered = normalize(covariates, self._norm_mean.values, 1)
        return pd.DataFrame(np.dot(centered, self.hazards_.T), index=row_index)

コード例 #7

0

ファイルを表示

ファイル: cox_time_varying_fitter.py プロジェクト: pzivich/lifelines

 def _check_values(df, E):
     """Run the validation helpers on ``df`` and ``E`` in a fixed order."""
     # NOTE: check_for_overlapping_intervals(df) is intentionally not run
     # here; it is currently too slow for production.
     validations = (
         (check_low_var, (df,)),
         (check_complete_separation_low_variance, (df, E)),
         (pass_for_numeric_dtypes_or_raise, (df,)),
     )
     for validate, arguments in validations:
         validate(*arguments)

コード例 #8

0

ファイルを表示

ファイル: coxph_fitter.py プロジェクト: kolesnykandrii/lifelines

 def _check_values(df, E):
     """Run the validation helpers on ``df`` and ``E`` in a fixed order.

     Order: low-variance check, complete-separation check, numeric-dtype
     check.
     """
     validations = (
         (check_low_var, (df,)),
         (check_complete_separation, (df, E)),
         (pass_for_numeric_dtypes_or_raise, (df,)),
     )
     for validate, arguments in validations:
         validate(*arguments)

コード例 #9

0

ファイルを表示

ファイル: aalen_additive_fitter.py プロジェクト: pzivich/lifelines

    def _fit_varying(self,
                     dataframe,
                     duration_col="T",
                     event_col="E",
                     id_col=None,
                     timeline=None,
                     show_progress=True):
        """
        Fit the model on data with several rows per individual.

        A copy of ``dataframe`` is indexed by ``[duration_col, id_col]``,
        converted to a panel, and one regression step (``lr``) is run for
        every uncensored (id, time) pair. Results are stored on ``self``.

        Parameters:
            dataframe: a pandas dataframe holding the covariates plus the
                       duration, id and (optionally) event columns. It is
                       copied, not modified.
            duration_col: name of the duration column (part of the index).
            event_col: name of the event column. If None, a column 'E'
                       filled with 1 is added, i.e. no censoring is assumed.
            id_col: name of the individual-id column (part of the index).
            timeline: if not None, the estimates are reindexed to this
                      timeline using forward fill.
            show_progress: if True, update the progress bar on every step and
                           print a newline at the end.

        Returns:
          None. Sets ``hazards_``, ``cumulative_hazards_``, ``variance_``,
          ``timeline``, ``durations`` and ``event_observed`` on ``self`` and
          calls ``self._compute_confidence_intervals()``.
        """
        # NOTE(review): this method relies on the pandas Panel API
        # (``to_panel``, ``minor_xs``) and on ``Series.iteritems``; newer
        # pandas releases no longer provide them -- confirm the supported
        # pandas version.
        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        # if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        # each individual should have an ID of time of leaving study
        df = df.set_index([duration_col, id_col])
        pass_for_numeric_dtypes_or_raise(df)

        # if no event_col is specified, assume all non-censorships
        if event_col is None:
            event_col = 'E'
            df[event_col] = 1

        C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
        C = C_panel.minor_xs(event_col).sum().astype(bool)
        T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

        del df[event_col]
        d = df.shape[1]

        # so this is a problem line. bfill performs a recursion which is
        # really not scalable. Plus even for modest datasets, this eats a lot of memory.
        # Plus is bfill the correct thing to choose? It's forward looking...
        wp = df.to_panel().bfill().fillna(0)

        # initialize dataframe to store estimates
        non_censorsed_times = list(T[C].iteritems())
        columns = wp.items
        hazards_ = pd.DataFrame(np.zeros((len(non_censorsed_times), d)),
                                columns=columns,
                                index=from_tuples(non_censorsed_times))

        variance_ = pd.DataFrame(np.zeros((len(non_censorsed_times), d)),
                                 columns=columns,
                                 index=from_tuples(non_censorsed_times))

        previous_hazard = np.zeros((d, ))
        ids = wp.minor_axis.values
        progress = progress_bar(len(non_censorsed_times))

        # this makes indexing times much faster
        wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

        for i, (individual_id, time) in enumerate(non_censorsed_times):

            relevant_individuals = (ids == individual_id)
            assert relevant_individuals.sum() == 1.

            # perform linear regression step.
            try:
                v, V = lr(wp[time].values,
                          relevant_individuals,
                          c1=self.coef_penalizer,
                          c2=self.smoothing_penalizer,
                          offset=previous_hazard)
            except LinAlgError:
                # Report the failure and leave this (id, time) row at its
                # initial zeros. Falling through would use an unbound v/V on
                # a first-step failure, or silently reuse the previous
                # step's values on a later one.
                print(
                    "Linear regression error. Try increasing the penalizer term."
                )
            else:
                hazards_.loc[individual_id, time] = v.T
                variance_.loc[individual_id, time] = \
                    V[:, relevant_individuals][:, 0]**2
                previous_hazard = v.T

            # update progress bar
            if show_progress:
                progress.update(i)

        # print a new line so the console displays well
        if show_progress:
            print()

        ordered_cols = df.columns  # to_panel() mixes up my columns

        # level 1 of the (id, time) index is the time
        self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
        self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
        self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(
                timeline, method='ffill')
            self.variance_ = self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()

        return

コード例 #10

0

ファイルを表示

ファイル: aalen_additive_fitter.py プロジェクト: pzivich/lifelines

    def _fit_static(self,
                    dataframe,
                    duration_col,
                    event_col=None,
                    timeline=None,
                    show_progress=True):
        """
        Perform inference on the coefficients of the Aalen additive model.

        Parameters:
            dataframe: a pandas dataframe, with covariates and a duration_col and a event_col.
                      one row per individual. duration_col refers to how long the individual was
                      observed for. event_col is a boolean: 1 if individual 'died', 0 else.
                      The dataframe is copied, not modified.

            duration_col: specify what the duration column is called in the dataframe
            event_col: specify what the event occurred column is called in the dataframe.
                      If falsy, every individual is treated as uncensored.
            timeline: reformat the estimates index to a new timeline (forward filled).
            show_progress: if True, update the progress bar after every
                      uncensored individual and print a newline at the end.

        Returns:
          None. Sets ``hazards_``, ``cumulative_hazards_``, ``variance_``,
          ``timeline``, ``durations``, ``event_observed`` and ``score_`` on
          ``self`` and calls ``self._compute_confidence_intervals()``.
        """
        # NOTE(review): ``Series.iteritems`` is not available in newer pandas
        # releases -- confirm the supported pandas version.
        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        # set unique ids for individuals
        id_col = 'id'
        ids = np.arange(df.shape[0])
        df[id_col] = ids

        # if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        # if no event_col is specified, assume all non-censorships
        if event_col:
            c = df[event_col].values
            del df[event_col]
        else:
            c = np.ones_like(ids)

        # each individual should have an ID of time of leaving study
        C = pd.Series(c, dtype=bool, index=ids)
        T = pd.Series(df[duration_col].values, index=ids)

        df = df.set_index(id_col)
        pass_for_numeric_dtypes_or_raise(df)

        # order individuals by duration
        ix = T.argsort()
        T, C = T.iloc[ix], C.iloc[ix]

        del df[duration_col]
        d = df.shape[1]
        columns = df.columns

        # initialize dataframe to store estimates
        non_censorsed_times = list(T[C].iteritems())
        n_deaths = len(non_censorsed_times)

        # tuples are (id, time); swaplevel makes the index (time, id)
        hazards_ = pd.DataFrame(
            np.zeros((n_deaths, d)),
            columns=columns,
            index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

        variance_ = pd.DataFrame(
            np.zeros((n_deaths, d)),
            columns=columns,
            index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

        # initialize loop variables.
        previous_hazard = np.zeros((d, ))
        progress = progress_bar(n_deaths)
        to_remove = []
        t = T.iloc[0]
        i = 0

        for individual_id, time in T.iteritems():  # should be sorted.

            if t != time:
                assert t < time
                # remove the individuals from the previous loop.
                # (ids are 0..n-1 in row order, so they double as positions)
                df.iloc[to_remove] = 0.
                to_remove = []
                t = time

            to_remove.append(individual_id)
            if C[individual_id] == 0:
                continue

            relevant_individuals = (ids == individual_id)
            assert relevant_individuals.sum() == 1.

            # perform linear regression step.
            try:
                v, V = lr(df.values,
                          relevant_individuals,
                          c1=self.coef_penalizer,
                          c2=self.smoothing_penalizer,
                          offset=previous_hazard)
            except LinAlgError:
                # Report the failure and leave this (time, id) row at its
                # initial zeros. Falling through would use an unbound v/V on
                # a first-step failure, or silently reuse the previous
                # step's values on a later one.
                print(
                    "Linear regression error. Try increasing the penalizer term."
                )
            else:
                hazards_.loc[time, individual_id] = v.T
                variance_.loc[time, individual_id] = \
                    V[:, relevant_individuals][:, 0]**2
                previous_hazard = v.T

            # update progress bar
            if show_progress:
                i += 1
                progress.update(i)

        # print a new line so the console displays well
        if show_progress:
            print()

        # not sure this is the correct thing to do.
        self.hazards_ = hazards_.groupby(level=0).sum()
        self.cumulative_hazards_ = self.hazards_.cumsum()
        self.variance_ = variance_.groupby(level=0).sum()

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(
                timeline, method='ffill')
            self.variance_ = self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()
        self.score_ = concordance_index(
            self.durations,
            self.predict_median(dataframe).values.ravel(), self.event_observed)
        return

コード例 #11

0

ファイルを表示

ファイル: aalen_additive_fitter.py プロジェクト: springcoil/lifelines

    def _fit_static(self, dataframe, duration_col, event_col=None,
                    timeline=None, show_progress=True):
        """
        Perform inference on the coefficients of the Aalen additive model.

        Parameters:
            dataframe: a pandas dataframe, with covariates and a duration_col and a event_col.
                      one row per individual. duration_col refers to how long the individual was
                      observed for. event_col is a boolean: 1 if individual 'died', 0 else.
                      The dataframe is copied, not modified.

            duration_col: specify what the duration column is called in the dataframe
            event_col: specify what the event occurred column is called in the dataframe.
                      If falsy, every individual is treated as uncensored.
            timeline: reformat the estimates index to a new timeline (forward filled).
            show_progress: if True, update the progress bar after every
                      uncensored individual and print a newline at the end.

        Returns:
          None. Sets ``hazards_``, ``cumulative_hazards_``, ``variance_``,
          ``timeline``, ``durations``, ``event_observed`` and ``score_`` on
          ``self`` and calls ``self._compute_confidence_intervals()``.
        """
        # NOTE(review): ``Series.iteritems`` is not available in newer pandas
        # releases -- confirm the supported pandas version.
        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        # set unique ids for individuals
        id_col = 'id'
        ids = np.arange(df.shape[0])
        df[id_col] = ids

        # if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        # if no event_col is specified, assume all non-censorships
        if event_col:
            c = df[event_col].values
            del df[event_col]
        else:
            c = np.ones_like(ids)

        # each individual should have an ID of time of leaving study
        C = pd.Series(c, dtype=bool, index=ids)
        T = pd.Series(df[duration_col].values, index=ids)

        df = df.set_index(id_col)
        pass_for_numeric_dtypes_or_raise(df)

        # order individuals by duration
        ix = T.argsort()
        T, C = T.iloc[ix], C.iloc[ix]

        del df[duration_col]
        d = df.shape[1]
        columns = df.columns

        # initialize dataframe to store estimates
        non_censorsed_times = list(T[C].iteritems())
        n_deaths = len(non_censorsed_times)

        # tuples are (id, time); swaplevel makes the index (time, id)
        hazards_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                                index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

        variance_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                                 index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

        # initialize loop variables.
        previous_hazard = np.zeros((d,))
        progress = progress_bar(n_deaths)
        to_remove = []
        t = T.iloc[0]
        i = 0

        for individual_id, time in T.iteritems():  # should be sorted.

            if t != time:
                assert t < time
                # remove the individuals from the previous loop.
                # (ids are 0..n-1 in row order, so they double as positions)
                df.iloc[to_remove] = 0.
                to_remove = []
                t = time

            to_remove.append(individual_id)
            if C[individual_id] == 0:
                continue

            relevant_individuals = (ids == individual_id)
            assert relevant_individuals.sum() == 1.

            # perform linear regression step.
            try:
                v, V = lr(df.values, relevant_individuals, c1=self.coef_penalizer, c2=self.smoothing_penalizer, offset=previous_hazard)
            except LinAlgError:
                # Report the failure and leave this (time, id) row at its
                # initial zeros. Falling through would use an unbound v/V on
                # a first-step failure, or silently reuse the previous
                # step's values on a later one.
                print("Linear regression error. Try increasing the penalizer term.")
            else:
                hazards_.loc[time, individual_id] = v.T
                variance_.loc[time, individual_id] = V[:, relevant_individuals][:, 0] ** 2
                previous_hazard = v.T

            # update progress bar
            if show_progress:
                i += 1
                progress.update(i)

        # print a new line so the console displays well
        if show_progress:
            print()

        # not sure this is the correct thing to do.
        self.hazards_ = hazards_.groupby(level=0).sum()
        self.cumulative_hazards_ = self.hazards_.cumsum()
        self.variance_ = variance_.groupby(level=0).sum()

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
            self.variance_ = self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()
        self.score_ = concordance_index(self.durations,
                                        self.predict_median(dataframe).values.ravel(),
                                        self.event_observed)
        return

コード例 #12

0

ファイルを表示

ファイル: coxph_fitter.py プロジェクト: stefgibson/lifelines

 def _check_values(df, E):
     """Run the validation helpers on ``df`` and ``E``.

     ``check_low_var`` is applied to all of ``df``, then separately to the
     rows where ``E == 1`` and to the remaining rows, each time with the
     complete-separation prefix and suffix texts. Finally ``df`` is passed
     to ``pass_for_numeric_dtypes_or_raise``.
     """
     separation_prefix = "Complete seperation possibly detected. "
     separation_suffix = " See https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faqwhat-is-complete-or-quasi-complete-separation-in-logisticprobit-regression-and-how-do-we-deal-with-them/"

     deaths = E == 1
     check_low_var(df)
     check_low_var(df.loc[deaths], separation_prefix, separation_suffix)
     check_low_var(df.loc[~deaths], separation_prefix, separation_suffix)
     pass_for_numeric_dtypes_or_raise(df)

コード例 #13

0

ファイルを表示

ファイル: aalen_additive_fitter.py プロジェクト: ycq091044/lifelines

 def _check_values(self, df, T, E):
     """Validate ``df`` for numeric dtypes, then check ``T`` and ``E`` for NaNs.

     ``check_nans`` is applied to ``T`` first and ``E`` second.
     """
     pass_for_numeric_dtypes_or_raise(df)
     for values in (T, E):
         check_nans(values)

コード例 #14

0

ファイルを表示

ファイル: aalen_additive_fitter.py プロジェクト: Path-AI/lifelines

 def _check_values(self, X, T, E):
     """Validate ``X`` for numeric dtypes, then run the NaN/inf check.

     ``check_nans_or_infs`` is applied to ``T``, ``E`` and ``X``, in that
     order.
     """
     pass_for_numeric_dtypes_or_raise(X)
     for values in (T, E, X):
         check_nans_or_infs(values)

コード例 #15

0

ファイルを表示

ファイル: coxph_fitter.py プロジェクト: springcoil/lifelines

 def _check_values(df, E):
     """Run the validation helpers on ``df`` and ``E`` in a fixed order.

     Order: low-variance check, complete-separation check, numeric-dtype
     check.
     """
     validations = (
         (check_low_var, (df,)),
         (check_complete_separation, (df, E)),
         (pass_for_numeric_dtypes_or_raise, (df,)),
     )
     for validate, arguments in validations:
         validate(*arguments)

コード例 #16

0

ファイルを表示

ファイル: cox_time_varying_fitter.py プロジェクト: springcoil/lifelines

 def _check_values(df, E):
     """Run the validation helpers on ``df`` and ``E`` in a fixed order.

     Order: low-variance check, complete-separation check, numeric-dtype
     check.
     """
     # NOTE: check_for_overlapping_intervals(df) is intentionally not run
     # here; it is currently too slow for production.
     validations = (
         (check_low_var, (df,)),
         (check_complete_separation, (df, E)),
         (pass_for_numeric_dtypes_or_raise, (df,)),
     )
     for validate, arguments in validations:
         validate(*arguments)

コード例 #17

0

ファイルを表示

ファイル: aalen_additive_fitter.py プロジェクト: springcoil/lifelines

    def _fit_varying(self, dataframe, duration_col="T", event_col="E",
                     id_col=None, timeline=None, show_progress=True):
        """
        Fit the model on data with several rows per individual.

        A copy of ``dataframe`` is indexed by ``[duration_col, id_col]``,
        converted to a panel, and one regression step (``lr``) is run for
        every uncensored (id, time) pair. Results are stored on ``self``.

        Parameters:
            dataframe: a pandas dataframe holding the covariates plus the
                       duration, id and (optionally) event columns. It is
                       copied, not modified.
            duration_col: name of the duration column (part of the index).
            event_col: name of the event column. If None, a column 'E'
                       filled with 1 is added, i.e. no censoring is assumed.
            id_col: name of the individual-id column (part of the index).
            timeline: if not None, the estimates are reindexed to this
                      timeline using forward fill.
            show_progress: if True, update the progress bar on every step and
                           print a newline at the end.

        Returns:
          None. Sets ``hazards_``, ``cumulative_hazards_``, ``variance_``,
          ``timeline``, ``durations`` and ``event_observed`` on ``self`` and
          calls ``self._compute_confidence_intervals()``.
        """
        # NOTE(review): this method relies on the pandas Panel API
        # (``to_panel``, ``minor_xs``) and on ``Series.iteritems``; newer
        # pandas releases no longer provide them -- confirm the supported
        # pandas version.
        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        # if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        # each individual should have an ID of time of leaving study
        df = df.set_index([duration_col, id_col])
        pass_for_numeric_dtypes_or_raise(df)

        # if no event_col is specified, assume all non-censorships
        if event_col is None:
            event_col = 'E'
            df[event_col] = 1

        C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
        C = C_panel.minor_xs(event_col).sum().astype(bool)
        T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

        del df[event_col]
        d = df.shape[1]

        # so this is a problem line. bfill performs a recursion which is
        # really not scalable. Plus even for modest datasets, this eats a lot of memory.
        # Plus is bfill the correct thing to choose? It's forward looking...
        wp = df.to_panel().bfill().fillna(0)

        # initialize dataframe to store estimates
        non_censorsed_times = list(T[C].iteritems())
        columns = wp.items
        hazards_ = pd.DataFrame(np.zeros((len(non_censorsed_times), d)),
                                columns=columns, index=from_tuples(non_censorsed_times))

        variance_ = pd.DataFrame(np.zeros((len(non_censorsed_times), d)),
                                 columns=columns, index=from_tuples(non_censorsed_times))

        previous_hazard = np.zeros((d,))
        ids = wp.minor_axis.values
        progress = progress_bar(len(non_censorsed_times))

        # this makes indexing times much faster
        wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

        for i, (individual_id, time) in enumerate(non_censorsed_times):

            relevant_individuals = (ids == individual_id)
            assert relevant_individuals.sum() == 1.

            # perform linear regression step.
            try:
                v, V = lr(wp[time].values, relevant_individuals, c1=self.coef_penalizer, c2=self.smoothing_penalizer, offset=previous_hazard)
            except LinAlgError:
                # Report the failure and leave this (id, time) row at its
                # initial zeros. Falling through would use an unbound v/V on
                # a first-step failure, or silently reuse the previous
                # step's values on a later one.
                print("Linear regression error. Try increasing the penalizer term.")
            else:
                hazards_.loc[individual_id, time] = v.T
                variance_.loc[individual_id, time] = V[:, relevant_individuals][:, 0] ** 2
                previous_hazard = v.T

            # update progress bar
            if show_progress:
                progress.update(i)

        # print a new line so the console displays well
        if show_progress:
            print()

        ordered_cols = df.columns  # to_panel() mixes up my columns

        # level 1 of the (id, time) index is the time
        self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
        self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
        self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
            self.variance_ = self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()

        return