Example #1
0
    def _fit_varying(self, dataframe, duration_col="T", event_col="E",
                     id_col=None, timeline=None, show_progress=True):
        """
        Fit the additive model on data with time-varying covariates.

        Parameters:
            dataframe: a pandas dataframe containing duration_col, event_col, id_col and
                      the covariates. A copy is taken; the caller's frame is not modified.
            duration_col: specify what the duration column is called in the dataframe
            event_col: specify what the event occurred column is called in the dataframe
            id_col: name of the column identifying each individual
            timeline: if given, the estimates are reindexed (forward-filled) onto it.
            show_progress: update the progress bar after every regression step.

        Returns:
          None. Results are stored on self: hazards_, cumulative_hazards_, variance_,
          timeline, data, durations, event_observed and plot.

        Raises:
          LinAlgError: re-raised (after printing a hint) when the penalized matrix
                       cannot be inverted.
        """

        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        # if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        # each individual should have an ID of time of leaving study
        df = df.set_index([duration_col, id_col])

        C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
        # C: per column of the cross-section, True when the event indicator sums to non-zero.
        C = C_panel.minor_xs(event_col).sum().astype(bool)
        # T: first index at which the running count of non-null rows peaks,
        # i.e. the last index with a non-null entry (presumably the exit time per id).
        T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

        del df[event_col]
        d = df.shape[1]

        # so this is a problem line. bfill performs a recursion which is
        # really not scalable. Plus even for modest datasets, this eats a lot of memory.
        wp = df.to_panel().bfill().fillna(0)

        # initialize dataframe to store estimates
        # list of (id, time) pairs; materialized because it is measured and iterated several times
        non_censorsed_times = list(T[C].iteritems())
        columns = wp.items
        hazards_ = pd.DataFrame(np.zeros((len(non_censorsed_times), d)),
                                columns=columns, index=from_tuples(non_censorsed_times))

        variance_ = pd.DataFrame(np.zeros((len(non_censorsed_times), d)),
                                 columns=columns, index=from_tuples(non_censorsed_times))

        # initializes the penalizer matrix
        penalizer = self.penalizer * np.eye(d)

        ids = wp.minor_axis.values
        progress = progress_bar(len(non_censorsed_times))

        # this makes indexing times much faster
        wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

        for i, (individual_id, time) in enumerate(non_censorsed_times):

            # boolean mask selecting the single individual who experienced the event
            relevant_individuals = (ids == individual_id)
            assert relevant_individuals.sum() == 1.

            X = wp[time].values

            # perform linear regression step.
            try:
                V = dot(inv(dot(X.T, X) + penalizer), X.T)
            except LinAlgError:
                print("Linear regression error. Try increasing the penalizer term.")
                # Falling through would either fail on an undefined V or silently
                # reuse the matrix of the previous iteration, so propagate the error.
                raise

            v = dot(V, 1.0 * relevant_individuals)

            hazards_.ix[individual_id, time] = v.T
            variance_.ix[individual_id, time] = V[:, relevant_individuals][:, 0] ** 2

            # update progress bar
            if show_progress:
                progress.update(i)

        # print a new line so the console displays well
        if show_progress:
            print()

        ordered_cols = df.columns  # to_panel() mixes up my columns
        # not sure this is the correct thing to do.
        self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
        self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
        self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
            self.variance_ = self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.data = wp

        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()
        self.plot = plot_regressions(self)

        return
Example #2
0
    def _fit_static(self, dataframe, duration_col="T", event_col="E",
                    timeline=None, show_progress=True):
        """
        Perform inference on the coefficients of the Aalen additive model
        for static (one row per individual) covariates.

        Parameters:
            dataframe: a pandas dataframe, with covariates and a duration_col and a event_col.
                      one row per individual. duration_col refers to how long the individual was
                      observed for. event_col is a boolean: 1 if individual 'died', 0 else.
                      The frame is copied; an 'id' column holding the row position is added
                      to the copy (overwriting any column of that name in the copy).

            duration_col: specify what the duration column is called in the dataframe
            event_col: specify what the event occurred column is called in the dataframe
            timeline: reformat the estimates index to a new timeline.
            show_progress: update the progress bar after each observed event.

        Returns:
          None. Results are stored on self: hazards_, cumulative_hazards_, variance_,
          timeline, data, durations, event_observed and plot.

        Raises:
          LinAlgError: re-raised (after printing a hint) when the penalized matrix
                       cannot be inverted.
        """

        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        # set unique ids for individuals
        id_col = 'id'
        ids = np.arange(df.shape[0])
        df[id_col] = ids

        # if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        # each individual should have an ID of time of leaving study
        C = pd.Series(df[event_col].values, dtype=bool, index=ids)
        T = pd.Series(df[duration_col].values, index=ids)

        df = df.set_index(id_col)

        # order individuals by duration so the loop below walks forward in time
        ix = T.argsort()
        T, C = T.iloc[ix], C.iloc[ix]

        del df[event_col]
        del df[duration_col]
        d = df.shape[1]
        columns = df.columns

        # initialize dataframe to store estimates
        # list of (id, time) pairs for individuals whose event was observed
        non_censorsed_times = list(T[C].iteritems())
        n_deaths = len(non_censorsed_times)

        hazards_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                                index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

        variance_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                                 index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

        # initializes the penalizer matrix
        penalizer = self.penalizer * np.eye(d)

        # initialize loop variables.
        progress = progress_bar(n_deaths)
        to_remove = []
        t = T.iloc[0]
        i = 0

        for individual_id, time in T.iteritems():  # should be sorted.

            if t != time:
                assert t < time
                # remove the individuals from the previous loop.
                # ids equal row positions, so positional indexing zeroes the right rows
                df.iloc[to_remove] = 0.
                to_remove = []
                t = time

            to_remove.append(individual_id)
            if C[individual_id] == 0:
                continue

            relevant_individuals = (ids == individual_id)
            assert relevant_individuals.sum() == 1.

            # perform linear regression step.
            X = df.values
            try:
                V = dot(inv(dot(X.T, X) + penalizer), X.T)
            except LinAlgError:
                print("Linear regression error. Try increasing the penalizer term.")
                # Falling through would either fail on an undefined V or silently
                # reuse the matrix of the previous iteration, so propagate the error.
                raise

            v = dot(V, 1.0 * relevant_individuals)

            hazards_.ix[time, individual_id] = v.T
            variance_.ix[time, individual_id] = V[:, relevant_individuals][:, 0] ** 2

            # update progress bar
            if show_progress:
                i += 1
                progress.update(i)

        # print a new line so the console displays well
        if show_progress:
            print()

        # not sure this is the correct thing to do.
        self.hazards_ = hazards_.groupby(level=0).sum()
        self.cumulative_hazards_ = self.hazards_.cumsum()
        self.variance_ = variance_.groupby(level=0).sum()

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
            self.variance_ = self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.data = dataframe
        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()
        self.plot = plot_regressions(self)

        return
Example #3
0
    def _fit_varying(self,
                     dataframe,
                     duration_col="T",
                     event_col="E",
                     id_col=None,
                     timeline=None,
                     show_progress=True):
        """
        Fit the additive model on data with time-varying covariates.

        Parameters:
            dataframe: a pandas dataframe containing duration_col, event_col, id_col and
                      the covariates. A copy is taken; the caller's frame is not modified.
            duration_col: specify what the duration column is called in the dataframe
            event_col: specify what the event occurred column is called in the dataframe
            id_col: name of the column identifying each individual
            timeline: if given, the estimates are reindexed (forward-filled) onto it.
            show_progress: update the progress bar after every regression step.

        Returns:
          None. Results are stored on self: hazards_, cumulative_hazards_, variance_,
          timeline, data, durations, event_observed and plot.

        Raises:
          LinAlgError: re-raised (after printing a hint) when the penalized matrix
                       cannot be inverted.
        """

        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        #if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        #each individual should have an ID of time of leaving study
        df = df.set_index([duration_col, id_col])

        C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
        C = C_panel.minor_xs(event_col).sum().astype(bool)
        T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

        del df[event_col]
        d = df.shape[1]

        #so this is a problem line. bfill performs a recursion which is
        #really not scalable. Plus even for modest datasets, this eats a lot of memory.
        wp = df.to_panel().bfill().fillna(0)

        #initialize dataframe to store estimates
        #materialize the (id, time) pairs: they are measured with len(), used to
        #build two indexes and then iterated, which a one-shot iterator cannot support.
        non_censorsed_times = list(T[C].iteritems())
        n_events = len(non_censorsed_times)
        columns = wp.items
        hazards_ = pd.DataFrame(np.zeros((n_events, d)),
                                columns=columns,
                                index=from_tuples(non_censorsed_times))

        variance_ = pd.DataFrame(np.zeros((n_events, d)),
                                 columns=columns,
                                 index=from_tuples(non_censorsed_times))

        #initializes the penalizer matrix
        penalizer = self.penalizer * np.eye(d)

        ids = wp.minor_axis.values
        progress = progress_bar(n_events)

        #this makes indexing times much faster
        wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

        for i, (individual_id, time) in enumerate(non_censorsed_times):

            relevant_individuals = (ids == individual_id)
            assert relevant_individuals.sum() == 1.

            X = wp[time].values

            #perform linear regression step.
            try:
                V = dot(inv(dot(X.T, X) + penalizer), X.T)
            except LinAlgError:
                print(
                    "Linear regression error. Try increasing the penalizer term."
                )
                #Falling through would either fail on an undefined V or silently
                #reuse the matrix of the previous iteration, so propagate the error.
                raise

            v = dot(V, 1.0 * relevant_individuals)

            hazards_.ix[individual_id, time] = v.T
            variance_.ix[individual_id, time] = V[:, relevant_individuals][:, 0]**2

            #update progress bar
            if show_progress:
                progress.update(i)

        #print a new line so the console displays well
        if show_progress:
            print()

        ordered_cols = df.columns  #to_panel() mixes up my columns
        #not sure this is the correct thing to do.
        self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
        self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
        self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(
                timeline, method='ffill')
            self.variance_ = self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.data = wp

        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()
        self.plot = plot_regressions(self)

        return
Example #4
0
    def _fit_static(self,
                    dataframe,
                    duration_col="T",
                    event_col="E",
                    timeline=None,
                    show_progress=True):
        """
        Perform inference on the coefficients of the Aalen additive model
        for static (one row per individual) covariates.

        Parameters:
            dataframe: a pandas dataframe, with covariates and a duration_col and a event_col.
                      one row per individual. duration_col refers to how long the individual was
                      observed for. event_col is a boolean: 1 if individual 'died', 0 else.
                      The frame is copied; an 'id' column holding the row position is added
                      to the copy (overwriting any column of that name in the copy).

            duration_col: specify what the duration column is called in the dataframe
            event_col: specify what the event occurred column is called in the dataframe
            timeline: reformat the estimates index to a new timeline.
            show_progress: update the progress bar after each observed event.

        Returns:
          None. Results are stored on self: hazards_, cumulative_hazards_, variance_,
          timeline, data, durations, event_observed and plot.

        Raises:
          LinAlgError: re-raised (after printing a hint) when the penalized matrix
                       cannot be inverted.
        """

        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        #set unique ids for individuals
        id_col = 'id'
        ids = np.arange(df.shape[0])
        df[id_col] = ids

        #if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        #each individual should have an ID of time of leaving study
        #C and T are keyed by the positional ids (not the caller's index) so that
        #the lookups and the `ids == ...` mask below refer to the same individual.
        C = pd.Series(df[event_col].astype(bool).values, index=ids)
        T = pd.Series(df[duration_col].values, index=ids)
        df = df.set_index([duration_col, id_col])

        #order individuals by duration so the loop below walks forward in time
        ix = T.argsort()
        T, C = T.iloc[ix], C.iloc[ix]

        del df[event_col]
        d = df.shape[1]
        columns = df.columns

        #initialize dataframe to store estimates
        #materialize the (id, time) pairs: they are measured with len() and used
        #to build two indexes, which a one-shot iterator cannot support.
        non_censorsed_times = list(T[C].iteritems())
        n_deaths = len(non_censorsed_times)

        hazards_ = pd.DataFrame(
            np.zeros((n_deaths, d)),
            columns=columns,
            index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

        variance_ = pd.DataFrame(
            np.zeros((n_deaths, d)),
            columns=columns,
            index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

        #initializes the penalizer matrix
        penalizer = self.penalizer * np.eye(d)

        #initialize loop variables.
        progress = progress_bar(n_deaths)
        to_remove = []
        t = T.iloc[0]
        i = 0

        for individual_id, time in T.iteritems():  #should be sorted.

            if t != time:
                assert t < time
                #remove the individuals from the previous loop.
                #ids are row positions, and df is indexed by (duration, id), so the
                #rows have to be addressed positionally rather than by label.
                df.iloc[to_remove] = 0.
                to_remove = []
                t = time

            to_remove.append(individual_id)
            if C[individual_id] == 0:
                continue

            relevant_individuals = (ids == individual_id)
            assert relevant_individuals.sum() == 1.

            #perform linear regression step.
            X = df.values
            try:
                V = dot(inv(dot(X.T, X) + penalizer), X.T)
            except LinAlgError:
                print(
                    "Linear regression error. Try increasing the penalizer term."
                )
                #Falling through would either fail on an undefined V or silently
                #reuse the matrix of the previous iteration, so propagate the error.
                raise

            v = dot(V, 1.0 * relevant_individuals)

            hazards_.ix[time, individual_id] = v.T
            variance_.ix[time, individual_id] = V[:, relevant_individuals][:, 0]**2

            #update progress bar
            if show_progress:
                i += 1
                progress.update(i)

        #print a new line so the console displays well
        if show_progress:
            print()

        #not sure this is the correct thing to do.
        self.hazards_ = hazards_.groupby(level=0).sum()
        self.cumulative_hazards_ = self.hazards_.cumsum()
        self.variance_ = variance_.groupby(level=0).sum()

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(
                timeline, method='ffill')
            self.variance_ = self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.data = dataframe
        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()
        self.plot = plot_regressions(self)

        return
    def _fit_varying(self, dataframe, duration_col="T", event_col="E", id_col=None, timeline=None, show_progress=True):
        """
        Fit the additive model on data with time-varying covariates.

        Each regression step is delegated to `lr`, which receives the coefficient
        penalizer, the smoothing penalizer and the previous step's estimate as offset.

        Parameters:
            dataframe: a pandas dataframe containing duration_col, id_col, the covariates
                      and (unless event_col is None) event_col. A copy is taken; the
                      caller's frame is not modified.
            duration_col: specify what the duration column is called in the dataframe
            event_col: specify what the event occurred column is called in the dataframe.
                      If None, an "E" column of ones is created, i.e. no censoring.
            id_col: name of the column identifying each individual
            timeline: if given, the estimates are reindexed (forward-filled) onto it.
            show_progress: update the progress bar after every regression step.

        Returns:
          None. Results are stored on self: hazards_, cumulative_hazards_, variance_,
          timeline, data, durations, event_observed and plot.

        Raises:
          LinAlgError: re-raised (after printing a hint) when the regression step fails.
        """

        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        # if the regression should fit an intercept
        if self.fit_intercept:
            df["baseline"] = 1.0

        # each individual should have an ID of time of leaving study
        df = df.set_index([duration_col, id_col])

        # if no event_col is specified, assume all non-censorships
        if event_col is None:
            event_col = "E"
            df[event_col] = 1

        C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
        C = C_panel.minor_xs(event_col).sum().astype(bool)
        T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

        del df[event_col]
        d = df.shape[1]

        # so this is a problem line. bfill performs a recursion which is
        # really not scalable. Plus even for modest datasets, this eats a lot of memory.
        # Plus is bfill the correct thing to choose? It's forward looking...
        wp = df.to_panel().bfill().fillna(0)

        # initialize dataframe to store estimates
        # list of (id, time) pairs; materialized because it is measured and iterated several times
        non_censorsed_times = list(T[C].iteritems())
        n_events = len(non_censorsed_times)
        columns = wp.items
        hazards_ = pd.DataFrame(
            np.zeros((n_events, d)), columns=columns, index=from_tuples(non_censorsed_times)
        )

        variance_ = pd.DataFrame(
            np.zeros((n_events, d)), columns=columns, index=from_tuples(non_censorsed_times)
        )

        # offset handed to lr; starts at zero and then tracks the latest estimate
        previous_hazard = np.zeros((d,))
        ids = wp.minor_axis.values
        progress = progress_bar(n_events)

        # this makes indexing times much faster
        wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

        for i, (individual_id, time) in enumerate(non_censorsed_times):

            relevant_individuals = ids == individual_id
            assert relevant_individuals.sum() == 1.0

            # perform linear regression step.
            try:
                v, V = lr(
                    wp[time].values,
                    relevant_individuals,
                    c1=self.coef_penalizer,
                    c2=self.smoothing_penalizer,
                    offset=previous_hazard,
                )
            except LinAlgError:
                print("Linear regression error. Try increasing the penalizer term.")
                # Falling through would either fail on undefined v/V or silently
                # record the previous iteration's estimate again, so propagate the error.
                raise

            hazards_.ix[individual_id, time] = v.T
            variance_.ix[individual_id, time] = V[:, relevant_individuals][:, 0] ** 2
            previous_hazard = v.T

            # update progress bar
            if show_progress:
                progress.update(i)

        # print a new line so the console displays well
        if show_progress:
            print()

        ordered_cols = df.columns  # to_panel() mixes up my columns

        self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
        self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
        self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method="ffill")
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method="ffill")
            self.variance_ = self.variance_.reindex(timeline, method="ffill")
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.data = wp

        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()
        self.plot = plot_regressions(self)

        return
Example #6
0
    def fit(self, dataframe, duration_col="T", event_col="E", timeline=None, id_col=None, show_progress=True):
        """
        Perform inference on the coefficients of the Aalen additive model.

        Parameters:
            dataframe: a pandas dataframe, with covariates and a duration_col and a event_col.

                static covariates:
                    one row per individual. duration_col refers to how long the individual was
                      observed for. event_col is a boolean: 1 if individual 'died', 0 else. id_col
                      should be left as None.

                time-varying covariates:
                    For time-varying covariates, an id_col is required to keep track of individuals'
                    changing covariates. Each individual should have a unique id. duration_col refers
                    to how long the individual has been observed up to that point. event_col refers
                    to whether the event (death) occurred in that period. Censored individuals will
                    not have a 1.
                    For example:

                        +----+---+---+------+------+
                        | id | T | E | var1 | var2 |
                        +----+---+---+------+------+
                        |  1 | 1 | 0 |    0 |    1 |
                        |  1 | 2 | 0 |    0 |    1 |
                        |  1 | 3 | 0 |    4 |    3 |
                        |  1 | 4 | 1 |    8 |    4 |
                        |  2 | 1 | 0 |    1 |    1 |
                        |  2 | 2 | 0 |    1 |    2 |
                        |  2 | 3 | 0 |    1 |    2 |
                        +----+---+---+------+------+

            duration_col: specify what the duration column is called in the dataframe
            event_col: specify what the event occurred column is called in the dataframe
            timeline: reformat the estimates index to a new timeline.
            id_col: (only for time-varying covariates) name of the id column in the dataframe
            show_progress: include a fancy progress bar!


        Returns:
          self, with new methods like plot, smoothed_hazards_ and properties like cumulative_hazards_

        Raises:
          LinAlgError: re-raised (after printing a hint) when the penalized matrix
                       cannot be inverted.
        """
        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        #only for time-indp. covariates
        if id_col is None:
            df['id'] = np.arange(df.shape[0])
            id_col = 'id'

        #if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        #each individual should have an ID of time of leaving study
        df = df.set_index([id_col, duration_col])

        C_panel = df[[event_col]].to_panel().transpose(1,2,0)
        C = C_panel.minor_xs(event_col).sum().astype(bool)
        T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

        del df[event_col]
        d = df.shape[1]

        wp = df.to_panel().transpose(1,2,0).bfill().fillna(0) #bfill will cause problems later, plus it is slow.

        #materialize the (id, time) pairs: they are measured with len(), used to
        #build two indexes and then iterated, which a one-shot iterator cannot support.
        non_censorsed_times = list(T[C].iteritems())
        n_events = len(non_censorsed_times)

        #initialize dataframe to store estimates
        hazards_ = pd.DataFrame( np.zeros((n_events,d)),
                        columns = df.columns, index = from_tuples(non_censorsed_times))

        variance_  = pd.DataFrame( np.zeros((n_events,d)),
                        columns = df.columns, index = from_tuples(non_censorsed_times))

        #initializes the penalizer matrix
        penalizer = self.penalizer*np.eye(d)
        ids = wp.items
        progress = progress_bar(n_events)

        for i,(individual_id, time) in enumerate(non_censorsed_times):

            relevant_individuals = (ids==individual_id)
            assert relevant_individuals.sum() == 1.

            X = wp.major_xs(time).values.T

            #perform linear regression step.
            try:
                V = dot(inv(dot(X.T, X) + penalizer), X.T)
            except LinAlgError:
                print("Linear regression error. Try increasing the penalizer term.")
                #Falling through would either fail on an undefined V or silently
                #reuse the matrix of the previous iteration, so propagate the error.
                raise

            v = dot(V, 1.0*relevant_individuals )

            hazards_.ix[individual_id, time]  = v.T
            variance_.ix[individual_id, time] = V[:, relevant_individuals][:,0]**2

            #update progress bar
            if show_progress:
                progress.update(i)

        #print a new line so the console displays well
        if show_progress:
            print()

        #not sure this is the correct thing to do.
        self.hazards_ = hazards_.groupby(level=1).sum()
        self.cumulative_hazards_= self.hazards_.cumsum()
        self.variance_ = variance_.groupby(level=1).sum()

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
            self.variance_= self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.data = wp

        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()
        self.plot = plot_regressions(self)

        return self