Ejemplo n.º 1
0
    def _fit_static(self, dataframe, duration_col="T", event_col="E",
                    timeline=None, show_progress=True):
        """
        Fit the Aalen additive model to static (one row per individual) data.

        Individuals are visited in order of increasing duration. At every
        observed event a penalized least-squares step is solved against the
        current covariate matrix; rows of individuals seen at an earlier
        duration are zeroed so they no longer contribute.

        Parameters:
            dataframe: a pandas dataframe, with covariates and a duration_col and a event_col.
                      One row per individual. duration_col refers to how long the individual was
                      observed for. event_col is a boolean: 1 if individual 'died', 0 else.

            duration_col: specify what the duration column is called in the dataframe
            event_col: specify what the event occurred column is called in the dataframe
            timeline: if given, the estimates are reindexed (forward-filled) onto this timeline.
            show_progress: update the progress bar after each observed event.

        Returns:
          None. The results are stored on self: hazards_, cumulative_hazards_,
          variance_, timeline, data, durations, event_observed and plot.
        """

        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        # set unique ids for individuals
        id_col = 'id'
        ids = np.arange(df.shape[0])
        df[id_col] = ids

        # if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        # key durations and event flags by the generated ids, so the labels
        # yielded while iterating T are also row positions in df
        C = pd.Series(df[event_col].values, dtype=bool, index=ids)
        T = pd.Series(df[duration_col].values, index=ids)

        df = df.set_index(id_col)

        ix = T.argsort()
        T, C = T.iloc[ix], C.iloc[ix]

        del df[event_col]
        del df[duration_col]
        d = df.shape[1]
        columns = df.columns

        # initialize dataframe to store estimates
        non_censorsed_times = list(T[C].iteritems())
        n_deaths = len(non_censorsed_times)

        hazards_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                                index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

        variance_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                                 index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

        # initializes the penalizer matrix
        penalizer = self.penalizer * np.eye(d)

        # initialize loop variables.
        progress = progress_bar(n_deaths)
        to_remove = []
        t = T.iloc[0]
        i = 0

        for individual_id, time in T.iteritems():  # should be sorted.

            if t != time:
                assert t < time
                # remove the individuals from the previous loop.
                df.iloc[to_remove] = 0.
                to_remove = []
                t = time

            to_remove.append(individual_id)
            if not C[individual_id]:
                # censored: no regression step for this individual
                continue

            relevant_individuals = (ids == individual_id)
            assert relevant_individuals.sum() == 1.

            # perform linear regression step.
            X = df.values
            gram = dot(X.T, X) + penalizer
            try:
                V = dot(inv(gram), X.T)
            except LinAlgError:
                print("Linear regression error. Try increasing the penalizer term.")
                # Fall back to the pseudo-inverse so V is always defined for
                # this event; previously V was unbound on a first failure
                # and silently stale on later ones.
                V = dot(np.linalg.pinv(gram), X.T)

            v = dot(V, 1.0 * relevant_individuals)

            hazards_.ix[time, individual_id] = v.T
            variance_.ix[time, individual_id] = V[:, relevant_individuals][:, 0] ** 2

            # update progress bar
            if show_progress:
                i += 1
                progress.update(i)

        # print a new line so the console displays well
        if show_progress:
            print()

        # not sure this is the correct thing to do.
        self.hazards_ = hazards_.groupby(level=0).sum()
        self.cumulative_hazards_ = self.hazards_.cumsum()
        self.variance_ = variance_.groupby(level=0).sum()

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
            self.variance_ = self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.data = dataframe
        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()
        self.plot = plot_regressions(self)

        return
Ejemplo n.º 2
0
    def _fit_varying(self, dataframe, duration_col="T", event_col="E",
                     id_col=None, timeline=None, show_progress=True):
        """
        Fit the Aalen additive model to time-varying covariate data.

        Parameters:
            dataframe: a pandas dataframe with covariates, a duration_col, an
                      event_col and an id_col; it is indexed by
                      (duration_col, id_col) before being pivoted to a panel.
            duration_col: specify what the duration column is called in the dataframe
            event_col: specify what the event occurred column is called in the dataframe
            id_col: specify what the individual-id column is called in the dataframe
            timeline: if given, the estimates are reindexed (forward-filled) onto this timeline.
            show_progress: update the progress bar after each observed event.

        Returns:
          None. The results are stored on self: hazards_, cumulative_hazards_,
          variance_, timeline, data, durations, event_observed and plot.
        """

        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        # if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        # each individual should have an ID of time of leaving study
        df = df.set_index([duration_col, id_col])

        C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
        C = C_panel.minor_xs(event_col).sum().astype(bool)
        T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

        del df[event_col]
        d = df.shape[1]

        # so this is a problem line. bfill performs a recursion which is
        # really not scalable. Plus even for modest datasets, this eats a lot of memory.
        wp = df.to_panel().bfill().fillna(0)

        # initialize dataframe to store estimates
        non_censorsed_times = list(T[C].iteritems())
        n_deaths = len(non_censorsed_times)
        columns = wp.items
        hazards_ = pd.DataFrame(np.zeros((n_deaths, d)),
                                columns=columns, index=from_tuples(non_censorsed_times))

        variance_ = pd.DataFrame(np.zeros((n_deaths, d)),
                                 columns=columns, index=from_tuples(non_censorsed_times))

        # initializes the penalizer matrix
        penalizer = self.penalizer * np.eye(d)

        ids = wp.minor_axis.values
        progress = progress_bar(n_deaths)

        # this makes indexing times much faster
        wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

        for i, (individual_id, time) in enumerate(non_censorsed_times):

            relevant_individuals = (ids == individual_id)
            assert relevant_individuals.sum() == 1.

            X = wp[time].values

            # perform linear regression step.
            gram = dot(X.T, X) + penalizer
            try:
                V = dot(inv(gram), X.T)
            except LinAlgError:
                print("Linear regression error. Try increasing the penalizer term.")
                # Fall back to the pseudo-inverse so V is always defined for
                # this event; previously V was unbound on a first failure
                # and silently stale on later ones.
                V = dot(np.linalg.pinv(gram), X.T)

            v = dot(V, 1.0 * relevant_individuals)

            hazards_.ix[individual_id, time] = v.T
            variance_.ix[individual_id, time] = V[:, relevant_individuals][:, 0] ** 2

            # update progress bar
            if show_progress:
                progress.update(i)

        # print a new line so the console displays well
        if show_progress:
            print()

        ordered_cols = df.columns  # to_panel() mixes up my columns
        # not sure this is the correct thing to do.
        self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
        self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
        self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
            self.variance_ = self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.data = wp

        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()
        self.plot = plot_regressions(self)

        return
Ejemplo n.º 3
0
    def _fit_varying(self,
                     dataframe,
                     duration_col="T",
                     event_col="E",
                     id_col=None,
                     timeline=None,
                     show_progress=True):
        """
        Fit the Aalen additive model to time-varying covariate data.

        Parameters:
            dataframe: a pandas dataframe with covariates, a duration_col, an
                      event_col and an id_col; it is indexed by
                      (duration_col, id_col) before being pivoted to a panel.
            duration_col: specify what the duration column is called in the dataframe
            event_col: specify what the event occurred column is called in the dataframe
            id_col: specify what the individual-id column is called in the dataframe
            timeline: if given, the estimates are reindexed (forward-filled) onto this timeline.
            show_progress: update the progress bar after each observed event.

        Returns:
          None. The results are stored on self: hazards_, cumulative_hazards_,
          variance_, timeline, data, durations, event_observed and plot.
        """

        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        # if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        # each individual should have an ID of time of leaving study
        df = df.set_index([duration_col, id_col])

        C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
        C = C_panel.minor_xs(event_col).sum().astype(bool)
        T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

        del df[event_col]
        d = df.shape[1]

        # so this is a problem line. bfill performs a recursion which is
        # really not scalable. Plus even for modest datasets, this eats a lot of memory.
        wp = df.to_panel().bfill().fillna(0)

        # initialize dataframe to store estimates.
        # The (id, time) pairs are materialised into a list: they are
        # measured with len(), used to build two indexes, and iterated
        # again in the main loop, none of which a one-shot iterator allows.
        non_censorsed_times = list(T[C].iteritems())
        n_deaths = len(non_censorsed_times)
        columns = wp.items
        hazards_ = pd.DataFrame(np.zeros((n_deaths, d)),
                                columns=columns,
                                index=from_tuples(non_censorsed_times))

        variance_ = pd.DataFrame(np.zeros((n_deaths, d)),
                                 columns=columns,
                                 index=from_tuples(non_censorsed_times))

        # initializes the penalizer matrix
        penalizer = self.penalizer * np.eye(d)

        ids = wp.minor_axis.values
        progress = progress_bar(n_deaths)

        # this makes indexing times much faster
        wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

        for i, (individual_id, time) in enumerate(non_censorsed_times):

            relevant_individuals = (ids == individual_id)
            assert relevant_individuals.sum() == 1.

            X = wp[time].values

            # perform linear regression step.
            gram = dot(X.T, X) + penalizer
            try:
                V = dot(inv(gram), X.T)
            except LinAlgError:
                print(
                    "Linear regression error. Try increasing the penalizer term."
                )
                # Fall back to the pseudo-inverse so V is always defined for
                # this event; previously V was unbound on a first failure
                # and silently stale on later ones.
                V = dot(np.linalg.pinv(gram), X.T)

            v = dot(V, 1.0 * relevant_individuals)

            hazards_.ix[individual_id, time] = v.T
            variance_.ix[individual_id, time] = V[:, relevant_individuals][:, 0]**2

            # update progress bar
            if show_progress:
                progress.update(i)

        # print a new line so the console displays well
        if show_progress:
            print()

        ordered_cols = df.columns  # to_panel() mixes up my columns
        # not sure this is the correct thing to do.
        self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
        self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
        self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(
                timeline, method='ffill')
            self.variance_ = self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.data = wp

        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()
        self.plot = plot_regressions(self)

        return
Ejemplo n.º 4
0
    def fit(self, event_times, X, timeline=None, censorship=None, columns=None):
        """Fit the additive hazards model; currently X is a static (n,d) array.

        event_times: (n,1) array of event times
        X: (n,d) the design matrix, either a numpy matrix or DataFrame.
        timeline: (t,1) timepoints in ascending order
        censorship: (n,1) boolean array of censorships: True if observed, False if right-censored.
                    By default, assuming all are observed.
        columns: optional names for the d covariates. Defaults to the
                 DataFrame's columns, or 0..d-1 for an array.

        Fits: self.cumulative_hazards_: a (t,d+1) dataframe of cumulative hazard coefficients
              self.hazards_: a (t,d+1) dataframe of hazard coefficients

        Returns self.
        """
        # deal with the covariate matrix. Check if it is a dataframe or numpy array
        n, d = X.shape
        if isinstance(X, pd.DataFrame):
            X_ = X.values.copy()
            if columns is None:
                columns = X.columns
        else:
            X_ = X.copy()

        # number of fitted coefficients: covariates plus the optional baseline
        n_coefs = d + self.fit_intercept

        # sort rows by event time and append a column of ones for the baseline hazard
        ix = event_times.argsort(0)[:, 0].copy()
        X_ = X_[ix, :].copy()
        if self.fit_intercept:
            X_ = np.c_[X_, np.ones((n, 1))]
        sorted_event_times = event_times[ix, 0].copy()

        # set the column's names of the dataframe. Always a real list, so
        # that "baseline" can be appended (a bare range cannot be extended).
        if columns is None:
            columns = list(range(d))
        else:
            columns = list(columns)

        if self.fit_intercept:
            columns += ["baseline"]

        # set the censorship events. 1 if the death was observed.
        if censorship is None:
            observed = np.ones(n, dtype=bool)
        else:
            observed = censorship[ix].reshape(n)

        # set the timeline -- this is used as DataFrame index in the results
        if timeline is None:
            timeline = sorted_event_times.copy()

        timeline = np.unique(timeline.astype(float))
        if timeline[0] > 0:
            timeline = np.insert(timeline, 0, 0.0)

        unique_times = np.unique(timeline)
        zeros = np.zeros((timeline.shape[0], n_coefs))
        self.cumulative_hazards_ = pd.DataFrame(zeros.copy(), index=unique_times, columns=columns)
        # NOTE(review): rows of hazards_ are filled in sorted-time order below,
        # while this index is the event_times column as passed in -- confirm
        # callers always pass sorted event times.
        self.hazards_ = pd.DataFrame(
            np.zeros((event_times.shape[0], n_coefs)), index=event_times[:, 0], columns=columns
        )
        self._variance = pd.DataFrame(zeros.copy(), index=unique_times, columns=columns)

        # create the penalizer matrix for L2 regression
        penalizer = self.penalizer * np.eye(n_coefs)

        t_0 = sorted_event_times[0]
        cum_v = np.zeros((n_coefs, 1))
        v = cum_v.copy()
        for i, time in enumerate(sorted_event_times):
            # timeline points in the half-open window (t_0, time]
            relevant_times = (t_0 < timeline) & (timeline <= time)
            if observed[i] == 0:
                X_[i, :] = 0
            gram = dot(X_.T, X_) + penalizer
            try:
                V = dot(inv(gram), X_.T)
            except LinAlgError:
                # if penalizer > 0, this should not occur. But sometimes it does...
                V = dot(pinv(gram), X_.T)

            v = dot(V, basis(n, i))
            cum_v = cum_v + v
            self.cumulative_hazards_.ix[relevant_times] = self.cumulative_hazards_.ix[relevant_times].values + cum_v.T
            self.hazards_.iloc[i] = self.hazards_.iloc[i].values + v.T
            self._variance.ix[relevant_times] = (
                self._variance.ix[relevant_times].values + dot(V[:, i][:, None], V[:, i][None, :]).diagonal()
            )
            t_0 = time
            # this individual takes no part in later regression steps
            X_[i, :] = 0

        # clean up last iteration
        relevant_times = timeline > time
        self.hazards_.iloc[i] = v.T
        try:
            self.cumulative_hazards_.ix[relevant_times] = cum_v.T
            self._variance.ix[relevant_times] = dot(V[:, i][:, None], V[:, i][None, :]).diagonal()
        except Exception:
            # Best-effort tail fill, kept silent as before; narrowed from a
            # bare except so KeyboardInterrupt/SystemExit still propagate.
            pass
        self.timeline = timeline
        self.X = X
        self.censorship = censorship
        self.event_times = event_times
        self._compute_confidence_intervals()
        self.plot = plot_regressions(self)
        return self
Ejemplo n.º 5
0
    def _fit_static(self,
                    dataframe,
                    duration_col="T",
                    event_col="E",
                    timeline=None,
                    show_progress=True):
        """
        Fit the Aalen additive model to static (one row per individual) data.

        Individuals are visited in order of increasing duration. At every
        observed event a penalized least-squares step is solved against the
        current covariate matrix; rows of individuals seen at an earlier
        duration are zeroed so they no longer contribute.

        Parameters:
            dataframe: a pandas dataframe, with covariates and a duration_col and a event_col.
                      One row per individual. duration_col refers to how long the individual was
                      observed for. event_col is a boolean: 1 if individual 'died', 0 else.

            duration_col: specify what the duration column is called in the dataframe
            event_col: specify what the event occurred column is called in the dataframe
            timeline: if given, the estimates are reindexed (forward-filled) onto this timeline.
            show_progress: update the progress bar after each observed event.

        Returns:
          None. The results are stored on self: hazards_, cumulative_hazards_,
          variance_, timeline, data, durations, event_observed and plot.
        """

        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        # set unique ids for individuals
        id_col = 'id'
        ids = np.arange(df.shape[0])
        df[id_col] = ids

        # if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        # each individual should have an ID of time of leaving study
        # NOTE(review): T and C keep the dataframe's own index, and the loop
        # below compares those labels against `ids` (0..n-1) -- this assumes
        # the dataframe carries a default 0..n-1 index; confirm with callers.
        C = df[event_col].astype(bool)
        T = df[duration_col]
        df = df.set_index([duration_col, id_col])

        ix = T.argsort()
        T, C = T.iloc[ix], C.iloc[ix]

        del df[event_col]
        d = df.shape[1]
        columns = df.columns

        # initialize dataframe to store estimates.
        # The (id, time) pairs are materialised into a list: they are
        # measured with len() and used to build two indexes, which a
        # one-shot iterator does not allow.
        non_censorsed_times = list(T[C].iteritems())
        n_deaths = len(non_censorsed_times)

        hazards_ = pd.DataFrame(
            np.zeros((n_deaths, d)),
            columns=columns,
            index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

        variance_ = pd.DataFrame(
            np.zeros((n_deaths, d)),
            columns=columns,
            index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

        # initializes the penalizer matrix
        penalizer = self.penalizer * np.eye(d)

        # initialize loop variables.
        progress = progress_bar(n_deaths)
        to_remove = []
        t = T.iloc[0]
        i = 0

        for individual_id, time in T.iteritems():  # should be sorted.

            if t != time:
                assert t < time
                # remove the individuals from the previous loop.
                df.ix[to_remove] = 0.
                to_remove = []
                t = time

            to_remove.append(individual_id)
            if not C[individual_id]:
                # censored: no regression step for this individual
                continue

            relevant_individuals = (ids == individual_id)
            assert relevant_individuals.sum() == 1.

            # perform linear regression step.
            X = df.values
            gram = dot(X.T, X) + penalizer
            try:
                V = dot(inv(gram), X.T)
            except LinAlgError:
                print(
                    "Linear regression error. Try increasing the penalizer term."
                )
                # Fall back to the pseudo-inverse so V is always defined for
                # this event; previously V was unbound on a first failure
                # and silently stale on later ones.
                V = dot(np.linalg.pinv(gram), X.T)

            v = dot(V, 1.0 * relevant_individuals)

            hazards_.ix[time, individual_id] = v.T
            variance_.ix[time, individual_id] = V[:, relevant_individuals][:, 0]**2

            # update progress bar
            if show_progress:
                i += 1
                progress.update(i)

        # print a new line so the console displays well
        if show_progress:
            print()

        # not sure this is the correct thing to do.
        self.hazards_ = hazards_.groupby(level=0).sum()
        self.cumulative_hazards_ = self.hazards_.cumsum()
        self.variance_ = variance_.groupby(level=0).sum()

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(
                timeline, method='ffill')
            self.variance_ = self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.data = dataframe
        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()
        self.plot = plot_regressions(self)

        return
Ejemplo n.º 6
0
    def fit(self, event_times, X, timeline=None, censorship=None, columns=None):
        """Fit the additive hazards model; currently X is a static (n,d) array.

        event_times: (n,1) array of event times
        X: (n,d) the design matrix, either a numpy matrix or DataFrame.
        timeline: (t,1) timepoints in ascending order
        censorship: (n,1) boolean array of censorships: True if observed, False if right-censored.
                    By default, assuming all are observed.
        columns: optional names for the d covariates. Defaults to the
                 DataFrame's columns, or 0..d-1 for an array.

        Fits: self.cumulative_hazards_: a (t,d+1) dataframe of cumulative hazard coefficients
              self.hazards_: a (t,d+1) dataframe of hazard coefficients

        Returns self.
        """
        # deal with the covariate matrix. Check if it is a dataframe or numpy array
        n, d = X.shape
        if isinstance(X, pd.DataFrame):
            X_ = X.values.copy()
            if columns is None:
                columns = X.columns
        else:
            X_ = X.copy()

        # number of fitted coefficients: covariates plus the optional baseline
        n_coefs = d + self.fit_intercept

        # sort rows by event time and append a column of ones for the baseline hazard
        ix = event_times.argsort(0)[:, 0].copy()
        X_ = X_[ix, :].copy()
        if self.fit_intercept:
            X_ = np.c_[X_, np.ones((n, 1))]
        sorted_event_times = event_times[ix, 0].copy()

        # set the column's names of the dataframe. Always a real list, so
        # that 'baseline' can be appended (a bare range cannot be extended).
        if columns is None:
            columns = list(range(d))
        else:
            columns = list(columns)

        if self.fit_intercept:
            columns += ['baseline']

        # set the censorship events. 1 if the death was observed.
        if censorship is None:
            observed = np.ones(n, dtype=bool)
        else:
            observed = censorship[ix].reshape(n)

        # set the timeline -- this is used as DataFrame index in the results
        if timeline is None:
            timeline = sorted_event_times.copy()

        timeline = np.unique(timeline.astype(float))
        if timeline[0] > 0:
            timeline = np.insert(timeline, 0, 0.)

        unique_times = np.unique(timeline)
        zeros = np.zeros((timeline.shape[0], n_coefs))
        self.cumulative_hazards_ = pd.DataFrame(zeros.copy(), index=unique_times, columns=columns)
        # NOTE(review): rows of hazards_ are filled in sorted-time order below,
        # while this index is the event_times column as passed in -- confirm
        # callers always pass sorted event times.
        self.hazards_ = pd.DataFrame(
            np.zeros((event_times.shape[0], n_coefs)), index=event_times[:, 0], columns=columns)
        self._variance = pd.DataFrame(zeros.copy(), index=unique_times, columns=columns)

        # create the penalizer matrix for L2 regression
        penalizer = self.penalizer * np.eye(n_coefs)

        t_0 = sorted_event_times[0]
        cum_v = np.zeros((n_coefs, 1))
        v = cum_v.copy()
        for i, time in enumerate(sorted_event_times):
            # timeline points in the half-open window (t_0, time]
            relevant_times = (t_0 < timeline) & (timeline <= time)
            if observed[i] == 0:
                X_[i, :] = 0
            gram = dot(X_.T, X_) + penalizer
            try:
                V = dot(inv(gram), X_.T)
            except LinAlgError:
                # if penalizer > 0, this should not occur. But sometimes it does...
                V = dot(pinv(gram), X_.T)

            v = dot(V, basis(n, i))
            cum_v = cum_v + v
            self.cumulative_hazards_.ix[relevant_times] = self.cumulative_hazards_.ix[relevant_times].values + cum_v.T
            self.hazards_.iloc[i] = self.hazards_.iloc[i].values + v.T
            self._variance.ix[relevant_times] = self._variance.ix[relevant_times].values + dot(V[:, i][:, None], V[:, i][None, :]).diagonal()
            t_0 = time
            # this individual takes no part in later regression steps
            X_[i, :] = 0

        # clean up last iteration
        relevant_times = (timeline > time)
        self.hazards_.iloc[i] = v.T
        try:
            self.cumulative_hazards_.ix[relevant_times] = cum_v.T
            self._variance.ix[relevant_times] = dot(V[:, i][:, None], V[:, i][None, :]).diagonal()
        except Exception:
            # Best-effort tail fill, kept silent as before; narrowed from a
            # bare except so KeyboardInterrupt/SystemExit still propagate.
            pass
        self.timeline = timeline
        self.X = X
        self.censorship = censorship
        self.event_times = event_times
        self._compute_confidence_intervals()
        self.plot = plot_regressions(self)
        return self
Ejemplo n.º 7
0
    def fit(self, event_times, X, timeline=None, censorship=None, columns=None, verbose=True, debug=False):
        """Fit the additive hazards model; currently X is a static (n,d) array.

        event_times: (n,1) array of event times
        X: (n,d) the design matrix, a numpy array (rows are selected with X[ix, :]).
        timeline: (t,1) timepoints in ascending order
        censorship: (n,1) boolean array of censorships: True if observed, False if right-censored.
                    By default, assuming all are observed.
        columns: optional names for the d covariates. Defaults to 0..d-1.
        verbose: write an "iteration i of n completed" line to stdout after each step.
        debug: drop into pdb at the start of every iteration.

        Fits: self.cumulative_hazards_: a (t,d+1) dataframe of cumulative hazard coefficients
              self.hazards_: a (t,d+1) dataframe of hazard coefficients

        Returns self.
        """
        n, d = X.shape

        # number of fitted coefficients: covariates plus the optional baseline
        n_coefs = d + self.fit_intercept

        # Sort rows by event time. The baseline column of ones is appended
        # only when an intercept is fitted, so the design matrix width always
        # matches the penalizer and the result columns.
        ix = event_times.argsort(0)[:, 0]
        design = X[ix, :]
        if self.fit_intercept:
            design = np.hstack([design, np.ones((n, 1))])
        sorted_event_times = event_times[ix, 0]

        # set the column's names of the dataframe. Always a real list, so
        # that 'baseline' can be appended (a bare range cannot be extended).
        if columns is None:
            columns = list(range(d))
        else:
            columns = list(columns)

        if self.fit_intercept:
            columns += ['baseline']

        # set the censorship events. 1 if the death was observed.
        if censorship is None:
            observed = np.ones(n, dtype=bool)
        else:
            observed = censorship[ix].reshape(n)

        # set the timeline -- this is used as DataFrame index in the results
        if timeline is None:
            timeline = sorted_event_times

        timeline = np.unique(timeline.astype(float))
        if timeline[0] > 0:
            timeline = np.insert(timeline, 0, 0.)

        unique_times = np.unique(timeline)
        zeros = np.zeros((timeline.shape[0], n_coefs))
        self.cumulative_hazards_ = pd.DataFrame(
            zeros.copy(), index=unique_times, columns=columns)
        # NOTE(review): rows of hazards_ are filled in sorted-time order below,
        # while this index is the event_times column as passed in -- confirm
        # callers always pass sorted event times.
        self.hazards_ = pd.DataFrame(
            np.zeros((event_times.shape[0], n_coefs)), index=event_times[:, 0], columns=columns)
        self._variance = pd.DataFrame(
            zeros.copy(), index=unique_times, columns=columns)

        # create the penalizer matrix for L2 regression
        penalizer = (self.penalizer * np.eye(n_coefs)).astype(
            np.float32, copy=False)

        t_0 = sorted_event_times[0]
        cum_v = np.zeros((n_coefs, 1))
        v = cum_v.copy()
        n_iters = len(sorted_event_times)
        for i, time in enumerate(sorted_event_times):
            if debug:
                pdb.set_trace()
            # timeline points in the half-open window (t_0, time]
            relevant_times = (t_0 < timeline) & (timeline <= time)
            if observed[i] == 0:
                design[i, :] = 0

            # Penalized least-squares step. The previous body called lr(...)
            # with names that do not exist in this function and never
            # assigned V, so every call failed; this restores the direct
            # solve, with a pseudo-inverse fallback for singular systems.
            gram = dot(design.T, design) + penalizer
            try:
                V = dot(np.linalg.inv(gram), design.T)
            except LinAlgError:
                # if penalizer > 0, this should not occur. But sometimes it does...
                V = dot(np.linalg.pinv(gram), design.T)

            v = dot(V, basis(n, i))
            cum_v = cum_v + v
            self.cumulative_hazards_.ix[relevant_times] = self.cumulative_hazards_.ix[
                relevant_times].values + cum_v.T
            self.hazards_.iloc[i] = self.hazards_.iloc[i].values + v.T
            self._variance.ix[relevant_times] = self._variance.ix[
                relevant_times].values + dot(V[:, i][:, None], V[:, i][None, :]).diagonal()
            t_0 = time
            # this individual takes no part in later regression steps
            design[i, :] = 0

            if verbose:
                sys.stdout.write("\r iteration %i of %i completed" % (i + 1, n_iters))
                sys.stdout.flush()

        # clean up last iteration
        relevant_times = (timeline > time)
        self.hazards_.iloc[i] = v.T
        try:
            self.cumulative_hazards_.ix[relevant_times] = cum_v.T
            self._variance.ix[relevant_times] = dot(V[:, i][:, None], V[:, i][None, :]).diagonal()
        except Exception:
            # Best-effort tail fill, kept silent as before; narrowed from a
            # bare except so KeyboardInterrupt/SystemExit still propagate.
            pass
        self.timeline = timeline
        # store the caller's design matrix, not the zeroed-out working copy
        self.X = X
        self.censorship = censorship
        self.event_times = event_times
        self._compute_confidence_intervals()
        self.plot = plot_regressions(self)
        return self
Ejemplo n.º 8
0
    def _fit_varying(self, dataframe, duration_col="T", event_col="E", id_col=None, timeline=None, show_progress=True):
        """
        Fit the Aalen additive model to time-varying covariate data.

        Parameters:
            dataframe: a pandas dataframe with covariates, a duration_col, an
                      id_col and optionally an event_col; it is indexed by
                      (duration_col, id_col) before being pivoted to a panel.
            duration_col: specify what the duration column is called in the dataframe
            event_col: specify what the event occurred column is called in the
                      dataframe. If None, an "E" column of ones is added, i.e.
                      every row is marked as an event.
            id_col: specify what the individual-id column is called in the dataframe
            timeline: if given, the estimates are reindexed (forward-filled) onto this timeline.
            show_progress: update the progress bar after each observed event.

        Returns:
          None. The results are stored on self: hazards_, cumulative_hazards_,
          variance_, timeline, data, durations, event_observed and plot.

        Raises:
          LinAlgError: re-raised from the regression step, after printing a hint.
        """

        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        # if the regression should fit an intercept
        if self.fit_intercept:
            df["baseline"] = 1.0

        # each individual should have an ID of time of leaving study
        df = df.set_index([duration_col, id_col])

        # if no event_col is specified, assume all non-censorships
        if event_col is None:
            event_col = "E"
            df[event_col] = 1

        C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
        C = C_panel.minor_xs(event_col).sum().astype(bool)
        T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

        del df[event_col]
        d = df.shape[1]

        # so this is a problem line. bfill performs a recursion which is
        # really not scalable. Plus even for modest datasets, this eats a lot of memory.
        # Plus is bfill the correct thing to choose? It's forward looking...
        wp = df.to_panel().bfill().fillna(0)

        # initialize dataframe to store estimates
        non_censorsed_times = list(T[C].iteritems())
        n_deaths = len(non_censorsed_times)
        columns = wp.items
        hazards_ = pd.DataFrame(
            np.zeros((n_deaths, d)), columns=columns, index=from_tuples(non_censorsed_times)
        )

        variance_ = pd.DataFrame(
            np.zeros((n_deaths, d)), columns=columns, index=from_tuples(non_censorsed_times)
        )

        # hazard from the previous event, passed to lr() as its offset
        previous_hazard = np.zeros((d,))
        ids = wp.minor_axis.values
        progress = progress_bar(n_deaths)

        # this makes indexing times much faster
        wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

        for i, (individual_id, time) in enumerate(non_censorsed_times):

            relevant_individuals = ids == individual_id
            assert relevant_individuals.sum() == 1.0

            # perform linear regression step.
            try:
                v, V = lr(
                    wp[time].values,
                    relevant_individuals,
                    c1=self.coef_penalizer,
                    c2=self.smoothing_penalizer,
                    offset=previous_hazard,
                )
            except LinAlgError:
                print("Linear regression error. Try increasing the penalizer term.")
                # There is no valid v/V for this event: carrying on would hit
                # unbound names on a first failure, or silently reuse the
                # previous event's result. Surface the error after the hint.
                raise

            hazards_.ix[individual_id, time] = v.T
            variance_.ix[individual_id, time] = V[:, relevant_individuals][:, 0] ** 2
            previous_hazard = v.T

            # update progress bar
            if show_progress:
                progress.update(i)

        # print a new line so the console displays well
        if show_progress:
            print()

        ordered_cols = df.columns  # to_panel() mixes up my columns

        self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
        self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
        self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method="ffill")
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method="ffill")
            self.variance_ = self.variance_.reindex(timeline, method="ffill")
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.data = wp

        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()
        self.plot = plot_regressions(self)

        return
Ejemplo n.º 9
0
    def fit(self, dataframe, duration_col="T", event_col="E", timeline=None, id_col=None, show_progress=True):
        """
        Perform inference on the coefficients of the Aalen additive model.

        Parameters:
            dataframe: a pandas dataframe, with covariates and a duration_col and a event_col.

                static covariates:
                    one row per individual. duration_col refers to how long the individual was
                      observed for. event_col is a boolean: 1 if individual 'died', 0 else. id_col
                      should be left as None.

                time-varying covariates:
                    For time-varying covariates, an id_col is required to keep track of individuals'
                    changing covariates. Each individual should have a unique id. duration_col refers
                    to how long the individual has been observed up to that point. event_col refers to
                    whether the event (death) occurred in that period. Censored individuals will not
                    have a 1.
                    For example:

                        +----+---+---+------+------+
                        | id | T | E | var1 | var2 |
                        +----+---+---+------+------+
                        |  1 | 1 | 0 |    0 |    1 |
                        |  1 | 2 | 0 |    0 |    1 |
                        |  1 | 3 | 0 |    4 |    3 |
                        |  1 | 4 | 1 |    8 |    4 |
                        |  2 | 1 | 0 |    1 |    1 |
                        |  2 | 2 | 0 |    1 |    2 |
                        |  2 | 3 | 0 |    1 |    2 |
                        +----+---+---+------+------+

            duration_col: specify what the duration column is called in the dataframe
            event_col: specify what the event occurred column is called in the dataframe
            timeline: reformat the estimates index to a new timeline.
            id_col: (only for time-varying covariates) name of the id column in the dataframe
            show_progress: include a fancy progress bar!

        Returns:
          self, with hazards_, cumulative_hazards_, variance_, timeline, data, durations,
          event_observed and plot populated.

        Raises:
          LinAlgError: if the penalized normal-equations matrix cannot be inverted at some
            event time. A hint to increase the penalizer is printed before re-raising.
        """
        from_tuples = pd.MultiIndex.from_tuples
        df = dataframe.copy()

        # only for time-independent covariates: every row is its own individual
        if id_col is None:
            df['id'] = np.arange(df.shape[0])
            id_col = 'id'

        # if the regression should fit an intercept
        if self.fit_intercept:
            df['baseline'] = 1.

        # each individual should have an ID of time of leaving study
        df = df.set_index([id_col, duration_col])

        C_panel = df[[event_col]].to_panel().transpose(1, 2, 0)
        # C: per individual, True if any of its rows recorded the event
        C = C_panel.minor_xs(event_col).sum().astype(bool)
        # T: per individual, the index label at which the running count of non-null rows
        # first reaches its maximum (presumably the last observed duration)
        T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

        del df[event_col]
        d = df.shape[1]

        # bfill will cause problems later, plus it is slow.
        wp = df.to_panel().transpose(1, 2, 0).bfill().fillna(0)

        # Materialize the (id, time) pairs: iteritems() may be a one-shot iterator, and the
        # pairs are needed for len(), for building the index, and again for the main loop.
        non_censorsed_times = list(T[C].iteritems())
        n_deaths = len(non_censorsed_times)

        # initialize dataframes to store estimates, one row per (id, death time)
        hazards_ = pd.DataFrame(np.zeros((n_deaths, d)),
                                columns=df.columns, index=from_tuples(non_censorsed_times))

        variance_ = pd.DataFrame(np.zeros((n_deaths, d)),
                                 columns=df.columns, index=from_tuples(non_censorsed_times))

        # initializes the penalizer matrix
        penalizer = self.penalizer * np.eye(d)
        ids = wp.items
        progress = progress_bar(n_deaths)

        for i, (individual_id, time) in enumerate(non_censorsed_times):

            # boolean mask selecting the single individual that died at this time
            relevant_individuals = (ids == individual_id)
            assert relevant_individuals.sum() == 1.

            X = wp.major_xs(time).values.T

            # perform linear regression step.
            try:
                V = dot(inv(dot(X.T, X) + penalizer), X.T)
            except LinAlgError:
                print("Linear regression error. Try increasing the penalizer term.")
                # Continuing would either hit an undefined V or silently reuse the V of the
                # previous event time, so propagate the failure instead.
                raise

            v = dot(V, 1.0 * relevant_individuals)

            hazards_.ix[individual_id, time] = v.T
            variance_.ix[individual_id, time] = V[:, relevant_individuals][:, 0] ** 2

            # update progress bar
            if show_progress:
                progress.update(i)

        # print a new line so the console displays well
        if show_progress:
            print()

        # collapse individuals that share an event time
        # (original author's note: not sure this is the correct thing to do.)
        self.hazards_ = hazards_.groupby(level=1).sum()
        self.cumulative_hazards_ = self.hazards_.cumsum()
        self.variance_ = variance_.groupby(level=1).sum()

        if timeline is not None:
            self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
            self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
            self.variance_ = self.variance_.reindex(timeline, method='ffill')
            self.timeline = timeline
        else:
            self.timeline = self.hazards_.index.values.astype(float)

        self.data = wp

        self.durations = T
        self.event_observed = C
        self._compute_confidence_intervals()
        self.plot = plot_regressions(self)

        return self