def _fit_varying(self, dataframe, duration_col="T", event_col="E",
                     id_col=None, timeline=None, show_progress=True):

        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        # if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        # each individual should have an ID of time of leaving study
        df = df.set_index([duration_col, id_col])

        # if no event_col is specified, assume all non-censorships
        if event_col is None:
            event_col = 'E'
            df[event_col] = 1

        C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
        C = C_panel.minor_xs(event_col).sum().astype(bool)
        T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

        del df[event_col]
        n, d = df.shape

        # so this is a problem line. bfill performs a recursion which is
        # really not scalable. Plus even for modest datasets, this eats a lot of memory.
        # Plus is bfill the correct thing to choose? It's forward looking...
        wp = df.to_panel().bfill().fillna(0)

        # initialize dataframe to store estimates
        non_censorsed_times = list(T[C].iteritems())
        columns = wp.items
        hazards_ = pd.DataFrame(np.zeros((len(non_censorsed_times), d)),
                                columns=columns, index=from_tuples(non_censorsed_times))

        variance_ = pd.DataFrame(np.zeros((len(non_censorsed_times), d)),
                                 columns=columns, index=from_tuples(non_censorsed_times))

        previous_hazard = np.zeros((d,))
        ids = wp.minor_axis.values
        progress = progress_bar(len(non_censorsed_times))

        # this makes indexing times much faster
        wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

        for i, (id, time) in enumerate(non_censorsed_times):

            relevant_individuals = (ids == id)
            assert relevant_individuals.sum() == 1.

            # perform linear regression step.
            try:
                v, V = lr(wp[time].values, relevant_individuals, c1=self.coef_penalizer, c2=self.smoothing_penalizer, offset=previous_hazard)
            except LinAlgError:
                print("Linear regression error. Try increasing the penalizer term.")

            hazards_.ix[id, time] = v.T
            variance_.ix[id, time] = V[:, relevant_individuals][:, 0] ** 2
            previous_hazard = v.T

            # update progress bar
            if show_progress:
                progress.update(i)

        # print a new line so the console displays well
        if show_progress:
            print()

        ordered_cols = df.columns  # to_panel() mixes up my columns

        self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
        self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
        self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
            self.variance_ = self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.data = wp

        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()

        return
Exemple #2
0
    def _fit_varying(self,
                     dataframe,
                     duration_col="T",
                     event_col="E",
                     id_col=None,
                     timeline=None,
                     show_progress=True):

        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        # if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        # each individual should have an ID of time of leaving study
        df = df.set_index([duration_col, id_col])

        # if no event_col is specified, assume all non-censorships
        if event_col is None:
            event_col = 'E'
            df[event_col] = 1

        C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
        C = C_panel.minor_xs(event_col).sum().astype(bool)
        T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

        del df[event_col]
        n, d = df.shape

        # so this is a problem line. bfill performs a recursion which is
        # really not scalable. Plus even for modest datasets, this eats a lot of memory.
        # Plus is bfill the correct thing to choose? It's forward looking...
        wp = df.to_panel().bfill().fillna(0)

        # initialize dataframe to store estimates
        non_censorsed_times = list(T[C].iteritems())
        columns = wp.items
        hazards_ = pd.DataFrame(np.zeros((len(non_censorsed_times), d)),
                                columns=columns,
                                index=from_tuples(non_censorsed_times))

        variance_ = pd.DataFrame(np.zeros((len(non_censorsed_times), d)),
                                 columns=columns,
                                 index=from_tuples(non_censorsed_times))

        previous_hazard = np.zeros((d, ))
        ids = wp.minor_axis.values
        progress = progress_bar(len(non_censorsed_times))

        # this makes indexing times much faster
        wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

        for i, (id, time) in enumerate(non_censorsed_times):

            relevant_individuals = (ids == id)
            assert relevant_individuals.sum() == 1.

            # perform linear regression step.
            try:
                v, V = lr(wp[time].values,
                          relevant_individuals,
                          c1=self.coef_penalizer,
                          c2=self.smoothing_penalizer,
                          offset=previous_hazard)
            except LinAlgError:
                print(
                    "Linear regression error. Try increasing the penalizer term."
                )

            hazards_.ix[id, time] = v.T
            variance_.ix[id, time] = V[:, relevant_individuals][:, 0]**2
            previous_hazard = v.T

            # update progress bar
            if show_progress:
                progress.update(i)

        # print a new line so the console displays well
        if show_progress:
            print()

        ordered_cols = df.columns  # to_panel() mixes up my columns

        self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
        self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
        self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(
                timeline, method='ffill')
            self.variance_ = self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.data = wp

        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()

        return
    def _fit_static(self, dataframe, duration_col, event_col=None,
                    timeline=None, show_progress=True):
        """
        Perform inference on the coefficients of the Aalen additive model.

        Parameters:
            dataframe: a pandas dataframe, with covariates and a duration_col and a event_col.
                      one row per individual. duration_col refers to how long the individual was
                      observed for. event_col is a boolean: 1 if individual 'died', 0 else. id_col
                      should be left as None.

            duration_col: specify what the duration column is called in the dataframe
            event_col: specify what the event occurred column is called in the dataframe
            timeline: reformat the estimates index to a new timeline.
            progress_bar: include a fancy progress bar!

        Returns:
          self, with new methods like plot, smoothed_hazards_ and properties like cumulative_hazards_
        """

        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        # set unique ids for individuals
        id_col = 'id'
        ids = np.arange(df.shape[0])
        df[id_col] = ids

        # if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        # if no event_col is specified, assume all non-censorships
        if event_col:
            c = df[event_col].values
            del df[event_col]
        else:
            c = np.ones_like(ids)

        # each individual should have an ID of time of leaving study
        C = pd.Series(c, dtype=bool, index=ids)
        T = pd.Series(df[duration_col].values, index=ids)

        df = df.set_index(id_col)

        ix = T.argsort()
        T, C = T.iloc[ix], C.iloc[ix]

        del df[duration_col]
        n, d = df.shape
        columns = df.columns

        # initialize dataframe to store estimates
        non_censorsed_times = list(T[C].iteritems())
        n_deaths = len(non_censorsed_times)

        hazards_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                                index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

        variance_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                                 index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

        # initialize loop variables.
        previous_hazard = np.zeros((d,))
        progress = progress_bar(n_deaths)
        to_remove = []
        t = T.iloc[0]
        i = 0

        for id, time in T.iteritems():  # should be sorted.

            if t != time:
                assert t < time
                # remove the individuals from the previous loop.
                df.iloc[to_remove] = 0.
                to_remove = []
                t = time

            to_remove.append(id)
            if C[id] == 0:
                continue

            relevant_individuals = (ids == id)
            assert relevant_individuals.sum() == 1.

            # perform linear regression step.
            try:
                v, V = lr(df.values, relevant_individuals, c1=self.coef_penalizer, c2=self.smoothing_penalizer, offset=previous_hazard)
            except LinAlgError:
                print("Linear regression error. Try increasing the penalizer term.")

            hazards_.ix[time, id] = v.T
            variance_.ix[time, id] = V[:, relevant_individuals][:, 0] ** 2
            previous_hazard = v.T

            # update progress bar
            if show_progress:
                i += 1
                progress.update(i)

        # print a new line so the console displays well
        if show_progress:
            print()

        # not sure this is the correct thing to do.
        self.hazards_ = hazards_.groupby(level=0).sum()
        self.cumulative_hazards_ = self.hazards_.cumsum()
        self.variance_ = variance_.groupby(level=0).sum()

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
            self.variance_ = self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.data = dataframe
        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()

        return
Exemple #4
0
    def _fit_static(self,
                    dataframe,
                    duration_col,
                    event_col=None,
                    timeline=None,
                    show_progress=True):
        """
        Perform inference on the coefficients of the Aalen additive model.

        Parameters:
            dataframe: a pandas dataframe, with covariates and a duration_col and a event_col.
                      one row per individual. duration_col refers to how long the individual was
                      observed for. event_col is a boolean: 1 if individual 'died', 0 else. id_col
                      should be left as None.

            duration_col: specify what the duration column is called in the dataframe
            event_col: specify what the event occurred column is called in the dataframe
            timeline: reformat the estimates index to a new timeline.
            progress_bar: include a fancy progress bar!

        Returns:
          self, with new methods like plot, smoothed_hazards_ and properties like cumulative_hazards_
        """

        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        # set unique ids for individuals
        id_col = 'id'
        ids = np.arange(df.shape[0])
        df[id_col] = ids

        # if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        # if no event_col is specified, assume all non-censorships
        if event_col:
            c = df[event_col].values
            del df[event_col]
        else:
            c = np.ones_like(ids)

        # each individual should have an ID of time of leaving study
        C = pd.Series(c, dtype=bool, index=ids)
        T = pd.Series(df[duration_col].values, index=ids)

        df = df.set_index(id_col)

        ix = T.argsort()
        T, C = T.iloc[ix], C.iloc[ix]

        del df[duration_col]
        n, d = df.shape
        columns = df.columns

        # initialize dataframe to store estimates
        non_censorsed_times = list(T[C].iteritems())
        n_deaths = len(non_censorsed_times)

        hazards_ = pd.DataFrame(
            np.zeros((n_deaths, d)),
            columns=columns,
            index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

        variance_ = pd.DataFrame(
            np.zeros((n_deaths, d)),
            columns=columns,
            index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

        # initialize loop variables.
        previous_hazard = np.zeros((d, ))
        progress = progress_bar(n_deaths)
        to_remove = []
        t = T.iloc[0]
        i = 0

        for id, time in T.iteritems():  # should be sorted.

            if t != time:
                assert t < time
                # remove the individuals from the previous loop.
                df.iloc[to_remove] = 0.
                to_remove = []
                t = time

            to_remove.append(id)
            if C[id] == 0:
                continue

            relevant_individuals = (ids == id)
            assert relevant_individuals.sum() == 1.

            # perform linear regression step.
            try:
                v, V = lr(df.values,
                          relevant_individuals,
                          c1=self.coef_penalizer,
                          c2=self.smoothing_penalizer,
                          offset=previous_hazard)
            except LinAlgError:
                print(
                    "Linear regression error. Try increasing the penalizer term."
                )

            hazards_.ix[time, id] = v.T
            variance_.ix[time, id] = V[:, relevant_individuals][:, 0]**2
            previous_hazard = v.T

            # update progress bar
            if show_progress:
                i += 1
                progress.update(i)

        # print a new line so the console displays well
        if show_progress:
            print()

        # not sure this is the correct thing to do.
        self.hazards_ = hazards_.groupby(level=0).sum()
        self.cumulative_hazards_ = self.hazards_.cumsum()
        self.variance_ = variance_.groupby(level=0).sum()

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(
                timeline, method='ffill')
            self.variance_ = self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.data = dataframe
        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()

        return