def _fit_static(self, dataframe, duration_col="T", event_col="E", timeline=None, show_progress=True):
    """
    Perform inference on the coefficients of the Aalen additive model.

    Parameters:
      dataframe: a pandas dataframe, with covariates and a duration_col and a event_col.
          One row per individual. duration_col refers to how long the individual
          was observed for. event_col is a boolean: 1 if individual 'died', 0 else.
      duration_col: specify what the duration column is called in the dataframe
      event_col: specify what the event occurred column is called in the dataframe
      timeline: reformat the estimates index to a new timeline.
      show_progress: include a fancy progress bar!

    Returns:
      None. The estimates are stored on self as hazards_, cumulative_hazards_ and
      variance_, together with timeline, data, durations, event_observed and plot.

    Raises:
      LinAlgError: if (X'X + penalizer) cannot be inverted at some event time.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # set unique (positional) ids for individuals
    id_col = 'id'
    ids = np.arange(df.shape[0])
    df[id_col] = ids

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # key the event flags and durations by the positional id, so that lookups
    # below do not depend on whatever index the caller's dataframe carries
    C = pd.Series(df[event_col].values, dtype=bool, index=ids)
    T = pd.Series(df[duration_col].values, index=ids)
    df = df.set_index(id_col)

    ix = T.argsort()
    T, C = T.iloc[ix], C.iloc[ix]

    del df[event_col]
    del df[duration_col]
    d = df.shape[1]
    columns = df.columns

    # initialize dataframes to store estimates, indexed by (time, id)
    non_censorsed_times = list(T[C].iteritems())
    n_deaths = len(non_censorsed_times)

    hazards_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                            index=from_tuples(non_censorsed_times)).swaplevel(1, 0)
    variance_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                             index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

    # initializes the penalizer matrix
    penalizer = self.penalizer * np.eye(d)

    # initialize loop variables.
    progress = progress_bar(n_deaths)
    to_remove = []
    t = T.iloc[0]
    i = 0

    for individual_id, time in T.iteritems():  # should be sorted.

        if t != time:
            assert t < time
            # remove the individuals from the previous loop: rows of df are
            # still in the original order, so the positional id is the row number.
            df.iloc[to_remove] = 0.
            to_remove = []
            t = time

        to_remove.append(individual_id)
        if C[individual_id] == 0:
            continue

        relevant_individuals = (ids == individual_id)
        assert relevant_individuals.sum() == 1.

        # perform linear regression step: V = (X'X + penalizer)^-1 X'
        X = df.values
        try:
            V = dot(inv(dot(X.T, X) + penalizer), X.T)
        except LinAlgError:
            print("Linear regression error. Try increasing the penalizer term.")
            # Re-raise: carrying on would reuse the V of an earlier event
            # (or hit a NameError on the very first one).
            raise

        v = dot(V, 1.0 * relevant_individuals)

        hazards_.ix[time, individual_id] = v.T
        variance_.ix[time, individual_id] = V[:, relevant_individuals][:, 0] ** 2

        # update progress bar
        if show_progress:
            i += 1
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    # collapse tied event times into a single row per time.
    # not sure this is the correct thing to do.
    self.hazards_ = hazards_.groupby(level=0).sum()
    self.cumulative_hazards_ = self.hazards_.cumsum()
    self.variance_ = variance_.groupby(level=0).sum()

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        self.timeline = self.hazards_.index.values.astype(float)

    self.data = dataframe
    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()
    self.plot = plot_regressions(self)

    return
def _fit_varying(self, dataframe, duration_col="T", event_col="E", id_col=None, timeline=None, show_progress=True):
    """
    Estimate the Aalen additive hazards from rows keyed by (duration, id).

    Parameters:
      dataframe: a pandas dataframe holding duration_col, id_col, event_col and
          the covariates. It is reshaped with to_panel(), so presumably one row
          per (duration, id) pair is expected -- confirm against the caller.
      duration_col: specify what the duration column is called in the dataframe
      event_col: specify what the event occurred column is called in the dataframe
      id_col: name of the column identifying each individual
      timeline: reformat the estimates index to a new timeline.
      show_progress: include a fancy progress bar!

    Returns:
      None. The estimates are stored on self as hazards_, cumulative_hazards_ and
      variance_, together with timeline, data, durations, event_observed and plot.

    Raises:
      LinAlgError: if (X'X + penalizer) cannot be inverted at some event time.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # each individual should have an ID of time of leaving study
    df = df.set_index([duration_col, id_col])

    # C: per id, whether any row carries an event.
    # T: per id, the duration of its last non-null row.
    C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
    C = C_panel.minor_xs(event_col).sum().astype(bool)
    T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

    del df[event_col]
    d = df.shape[1]

    # so this is a problem line. bfill performs a recursion which is
    # really not scalable. Plus even for modest datasets, this eats a lot of memory.
    wp = df.to_panel().bfill().fillna(0)

    # initialize dataframes to store estimates, indexed by (id, time)
    non_censorsed_times = list(T[C].iteritems())
    n_deaths = len(non_censorsed_times)
    columns = wp.items
    hazards_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                            index=from_tuples(non_censorsed_times))
    variance_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                             index=from_tuples(non_censorsed_times))

    # initializes the penalizer matrix
    penalizer = self.penalizer * np.eye(d)
    ids = wp.minor_axis.values
    progress = progress_bar(n_deaths)

    # this makes indexing times much faster
    wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

    for i, (individual_id, time) in enumerate(non_censorsed_times):

        relevant_individuals = (ids == individual_id)
        assert relevant_individuals.sum() == 1.

        X = wp[time].values

        # perform linear regression step: V = (X'X + penalizer)^-1 X'
        try:
            V = dot(inv(dot(X.T, X) + penalizer), X.T)
        except LinAlgError:
            print("Linear regression error. Try increasing the penalizer term.")
            # Re-raise: carrying on would reuse the V of an earlier event
            # (or hit a NameError on the very first one).
            raise

        v = dot(V, 1.0 * relevant_individuals)

        hazards_.ix[individual_id, time] = v.T
        variance_.ix[individual_id, time] = V[:, relevant_individuals][:, 0] ** 2

        # update progress bar
        if show_progress:
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    ordered_cols = df.columns  # to_panel() mixes up my columns

    # collapse tied event times into a single row per time.
    # not sure this is the correct thing to do.
    self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
    self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
    self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        self.timeline = self.hazards_.index.values.astype(float)

    self.data = wp
    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()
    self.plot = plot_regressions(self)

    return
def _fit_varying(self, dataframe, duration_col="T", event_col="E", id_col=None, timeline=None, show_progress=True):
    """
    Estimate the Aalen additive hazards from rows keyed by (duration, id).

    Parameters:
      dataframe: a pandas dataframe holding duration_col, id_col, event_col and
          the covariates. It is reshaped with to_panel(), so presumably one row
          per (duration, id) pair is expected -- confirm against the caller.
      duration_col: specify what the duration column is called in the dataframe
      event_col: specify what the event occurred column is called in the dataframe
      id_col: name of the column identifying each individual
      timeline: reformat the estimates index to a new timeline.
      show_progress: include a fancy progress bar!

    Returns:
      None. The estimates are stored on self as hazards_, cumulative_hazards_ and
      variance_, together with timeline, data, durations, event_observed and plot.

    Raises:
      LinAlgError: if (X'X + penalizer) cannot be inverted at some event time.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # each individual should have an ID of time of leaving study
    df = df.set_index([duration_col, id_col])

    # C: per id, whether any row carries an event.
    # T: per id, the duration of its last non-null row.
    C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
    C = C_panel.minor_xs(event_col).sum().astype(bool)
    T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

    del df[event_col]
    d = df.shape[1]

    # so this is a problem line. bfill performs a recursion which is
    # really not scalable. Plus even for modest datasets, this eats a lot of memory.
    wp = df.to_panel().bfill().fillna(0)

    # iteritems() may hand back a one-shot iterator; materialize it because the
    # pairs are counted, used for two indexes and then looped over.
    non_censorsed_times = list(T[C].iteritems())
    n_deaths = len(non_censorsed_times)

    # initialize dataframes to store estimates, indexed by (id, time)
    columns = wp.items
    hazards_ = pd.DataFrame(
        np.zeros((n_deaths, d)),
        columns=columns,
        index=from_tuples(non_censorsed_times))
    variance_ = pd.DataFrame(
        np.zeros((n_deaths, d)),
        columns=columns,
        index=from_tuples(non_censorsed_times))

    # initializes the penalizer matrix
    penalizer = self.penalizer * np.eye(d)
    ids = wp.minor_axis.values
    progress = progress_bar(n_deaths)

    # this makes indexing times much faster
    wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

    for i, (individual_id, time) in enumerate(non_censorsed_times):

        relevant_individuals = (ids == individual_id)
        assert relevant_individuals.sum() == 1.

        X = wp[time].values

        # perform linear regression step: V = (X'X + penalizer)^-1 X'
        try:
            V = dot(inv(dot(X.T, X) + penalizer), X.T)
        except LinAlgError:
            print(
                "Linear regression error. Try increasing the penalizer term."
            )
            # Re-raise: carrying on would reuse the V of an earlier event
            # (or hit a NameError on the very first one).
            raise

        v = dot(V, 1.0 * relevant_individuals)

        hazards_.ix[individual_id, time] = v.T
        variance_.ix[individual_id, time] = V[:, relevant_individuals][:, 0]**2

        # update progress bar
        if show_progress:
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    ordered_cols = df.columns  # to_panel() mixes up my columns

    # collapse tied event times into a single row per time.
    # not sure this is the correct thing to do.
    self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
    self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
    self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(
            timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        self.timeline = self.hazards_.index.values.astype(float)

    self.data = wp
    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()
    self.plot = plot_regressions(self)

    return
def fit(self, event_times, X, timeline=None, censorship=None, columns=None):
    """currently X is a static (n,d) array

    event_times: (n,1) array of event times
    X: (n,d) the design matrix, either a numpy matrix or DataFrame.
    timeline: (t,1) timepoints in ascending order
    censorship: (n,1) boolean array of censorships: True if observed, False if right-censored.
                By default, assuming all are observed.
    columns: optional names for the covariates. Defaults to X.columns when X is
             a DataFrame, otherwise to 0..d-1.

    Fits: self.cumulative_hazards_: a (t,d+1) dataframe of cumulative hazard coefficients
          self.hazards_: a (t,d+1) dataframe of hazard coefficients

    Returns: self
    """
    # deal with the covariate matrix. Check if it is a dataframe or numpy array
    n, d = X.shape
    if isinstance(X, pd.DataFrame):
        X_ = X.values.copy()
        if columns is None:
            columns = X.columns
    else:
        X_ = X.copy()

    n_coefs = d + self.fit_intercept

    # sort individuals by event time and, if requested, append a column of
    # ones for the baseline hazard
    ix = event_times.argsort(0)[:, 0].copy()
    X_ = X_[ix, :].copy()
    if self.fit_intercept:
        X_ = np.c_[X_, np.ones((n, 1))]
    sorted_event_times = event_times[ix, 0].copy()

    # set the column's names of the dataframe. Always build a real list:
    # a Python 3 range cannot be extended in place.
    columns = list(range(d)) if columns is None else list(columns)
    if self.fit_intercept:
        columns.append("baseline")

    # set the censorship events. 1 if the death was observed.
    if censorship is None:
        observed = np.ones(n, dtype=bool)
    else:
        observed = censorship[ix].reshape(n)

    # set the timeline -- this is used as DataFrame index in the results
    if timeline is None:
        timeline = sorted_event_times.copy()

    timeline = np.unique(timeline.astype(float))
    if timeline[0] > 0:
        timeline = np.insert(timeline, 0, 0.0)

    unique_times = np.unique(timeline)
    zeros = np.zeros((timeline.shape[0], n_coefs))
    self.cumulative_hazards_ = pd.DataFrame(zeros.copy(), index=unique_times, columns=columns)
    # Row i is filled below for the i-th individual in *sorted* order, so the
    # index must be the sorted event times to keep labels and values aligned.
    self.hazards_ = pd.DataFrame(
        np.zeros((event_times.shape[0], n_coefs)), index=sorted_event_times, columns=columns
    )
    self._variance = pd.DataFrame(zeros.copy(), index=unique_times, columns=columns)

    # create the penalizer matrix for L2 regression
    penalizer = self.penalizer * np.eye(n_coefs)

    t_0 = sorted_event_times[0]
    cum_v = np.zeros((n_coefs, 1))
    v = cum_v.copy()
    for i, time in enumerate(sorted_event_times):
        relevant_times = (t_0 < timeline) * (timeline <= time)
        if observed[i] == 0:
            # a censored individual contributes no increment
            X_[i, :] = 0
        try:
            V = dot(inv(dot(X_.T, X_) + penalizer), X_.T)
        except LinAlgError:
            # if penalizer > 0, this should not occur. But sometimes it does...
            V = dot(pinv(dot(X_.T, X_) + penalizer), X_.T)

        v = dot(V, basis(n, i))
        cum_v = cum_v + v
        # elementwise square of column i (the diagonal of its outer product)
        squared_weights = V[:, i] ** 2
        self.cumulative_hazards_.ix[relevant_times] = self.cumulative_hazards_.ix[relevant_times].values + cum_v.T
        self.hazards_.iloc[i] = self.hazards_.iloc[i].values + v.T
        self._variance.ix[relevant_times] = self._variance.ix[relevant_times].values + squared_weights
        t_0 = time
        # the individual has left the study: drop it from later regressions
        X_[i, :] = 0

    # clean up last iteration: carry the final values past the last event time
    relevant_times = timeline > time
    self.hazards_.iloc[i] = v.T
    try:
        self.cumulative_hazards_.ix[relevant_times] = cum_v.T
        self._variance.ix[relevant_times] = V[:, i] ** 2
    except Exception:
        # best effort only (e.g. nothing lies beyond the last event time);
        # KeyboardInterrupt / SystemExit are no longer swallowed here.
        pass

    self.timeline = timeline
    self.X = X
    self.censorship = censorship
    self.event_times = event_times
    self._compute_confidence_intervals()
    self.plot = plot_regressions(self)
    return self
def _fit_static(self, dataframe, duration_col="T", event_col="E", timeline=None, show_progress=True):
    """
    Perform inference on the coefficients of the Aalen additive model.

    Parameters:
      dataframe: a pandas dataframe, with covariates and a duration_col and a event_col.
          One row per individual. duration_col refers to how long the individual
          was observed for. event_col is a boolean: 1 if individual 'died', 0 else.
      duration_col: specify what the duration column is called in the dataframe
      event_col: specify what the event occurred column is called in the dataframe
      timeline: reformat the estimates index to a new timeline.
      show_progress: include a fancy progress bar!

    Returns:
      None. The estimates are stored on self as hazards_, cumulative_hazards_ and
      variance_, together with timeline, data, durations, event_observed and plot.

    Raises:
      LinAlgError: if (X'X + penalizer) cannot be inverted at some event time.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # set unique (positional) ids for individuals
    id_col = 'id'
    ids = np.arange(df.shape[0])
    df[id_col] = ids

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # key the event flags and durations by the positional id rather than by
    # the caller's index, so `C[individual_id]` and `ids == individual_id`
    # below always refer to the same individual
    C = pd.Series(df[event_col].values, dtype=bool, index=ids)
    T = pd.Series(df[duration_col].values, index=ids)
    df = df.set_index([duration_col, id_col])

    ix = T.argsort()
    T, C = T.iloc[ix], C.iloc[ix]

    del df[event_col]
    d = df.shape[1]
    columns = df.columns

    # iteritems() may hand back a one-shot iterator; materialize it because the
    # pairs are counted and then used to build two indexes.
    non_censorsed_times = list(T[C].iteritems())
    n_deaths = len(non_censorsed_times)

    # initialize dataframes to store estimates, indexed by (time, id)
    hazards_ = pd.DataFrame(
        np.zeros((n_deaths, d)),
        columns=columns,
        index=from_tuples(non_censorsed_times)).swaplevel(1, 0)
    variance_ = pd.DataFrame(
        np.zeros((n_deaths, d)),
        columns=columns,
        index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

    # initializes the penalizer matrix
    penalizer = self.penalizer * np.eye(d)

    # initialize loop variables.
    progress = progress_bar(n_deaths)
    to_remove = []
    t = T.iloc[0]
    i = 0

    for individual_id, time in T.iteritems():  # should be sorted.

        if t != time:
            assert t < time
            # remove the individuals from the previous loop. Rows of df are
            # still in the original order, so the positional id is the row
            # number; positional access avoids any label lookup on the
            # (duration, id) MultiIndex.
            df.iloc[to_remove] = 0.
            to_remove = []
            t = time

        to_remove.append(individual_id)
        if C[individual_id] == 0:
            continue

        relevant_individuals = (ids == individual_id)
        assert relevant_individuals.sum() == 1.

        # perform linear regression step: V = (X'X + penalizer)^-1 X'
        X = df.values
        try:
            V = dot(inv(dot(X.T, X) + penalizer), X.T)
        except LinAlgError:
            print(
                "Linear regression error. Try increasing the penalizer term."
            )
            # Re-raise: carrying on would reuse the V of an earlier event
            # (or hit a NameError on the very first one).
            raise

        v = dot(V, 1.0 * relevant_individuals)

        hazards_.ix[time, individual_id] = v.T
        variance_.ix[time, individual_id] = V[:, relevant_individuals][:, 0]**2

        # update progress bar
        if show_progress:
            i += 1
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    # collapse tied event times into a single row per time.
    # not sure this is the correct thing to do.
    self.hazards_ = hazards_.groupby(level=0).sum()
    self.cumulative_hazards_ = self.hazards_.cumsum()
    self.variance_ = variance_.groupby(level=0).sum()

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(
            timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        self.timeline = self.hazards_.index.values.astype(float)

    self.data = dataframe
    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()
    self.plot = plot_regressions(self)

    return
def fit(self, event_times, X, timeline=None, censorship=None, columns=None):
    """currently X is a static (n,d) array

    event_times: (n,1) array of event times
    X: (n,d) the design matrix, either a numpy matrix or DataFrame.
    timeline: (t,1) timepoints in ascending order
    censorship: (n,1) boolean array of censorships: True if observed, False if right-censored.
                By default, assuming all are observed.
    columns: optional names for the covariates. Defaults to X.columns when X is
             a DataFrame, otherwise to 0..d-1.

    Fits: self.cumulative_hazards_: a (t,d+1) dataframe of cumulative hazard coefficients
          self.hazards_: a (t,d+1) dataframe of hazard coefficients

    Returns: self
    """
    # deal with the covariate matrix. Check if it is a dataframe or numpy array
    n, d = X.shape
    if isinstance(X, pd.DataFrame):
        X_ = X.values.copy()
        if columns is None:
            columns = X.columns
    else:
        X_ = X.copy()

    width = d + self.fit_intercept

    # order the rows by event time; the baseline hazard, when fitted, is a
    # trailing column of ones
    ix = event_times.argsort(0)[:, 0].copy()
    X_ = X_[ix, :].copy()
    if self.fit_intercept:
        X_ = np.c_[X_, np.ones((n, 1))]
    sorted_event_times = event_times[ix, 0].copy()

    # set the column's names of the dataframe. A Python 3 range does not
    # support `+=` with a list, so build a list in both branches.
    if columns is None:
        columns = list(range(d))
    else:
        columns = list(columns)

    if self.fit_intercept:
        columns += ['baseline']

    # set the censorship events. 1 if the death was observed.
    if censorship is None:
        observed = np.ones(n, dtype=bool)
    else:
        observed = censorship[ix].reshape(n)

    # set the timeline -- this is used as DataFrame index in the results
    if timeline is None:
        timeline = sorted_event_times.copy()

    timeline = np.unique(timeline.astype(float))
    if timeline[0] > 0:
        timeline = np.insert(timeline, 0, 0.)

    unique_times = np.unique(timeline)
    zeros = np.zeros((timeline.shape[0], width))
    self.cumulative_hazards_ = pd.DataFrame(zeros.copy(), index=unique_times, columns=columns)
    # hazards_ rows are written positionally in sorted order inside the loop,
    # hence the sorted event times as labels.
    self.hazards_ = pd.DataFrame(
        np.zeros((event_times.shape[0], width)),
        index=sorted_event_times,
        columns=columns)
    self._variance = pd.DataFrame(zeros.copy(), index=unique_times, columns=columns)

    # create the penalizer matrix for L2 regression
    penalizer = self.penalizer * np.eye(width)

    t_0 = sorted_event_times[0]
    cum_v = np.zeros((width, 1))
    v = cum_v.copy()
    for i, time in enumerate(sorted_event_times):
        relevant_times = (t_0 < timeline) * (timeline <= time)
        if observed[i] == 0:
            # a censored individual contributes no increment
            X_[i, :] = 0

        gram = dot(X_.T, X_) + penalizer
        try:
            V = dot(inv(gram), X_.T)
        except LinAlgError:
            # if penalizer > 0, this should not occur. But sometimes it does...
            V = dot(pinv(gram), X_.T)

        v = dot(V, basis(n, i))
        cum_v = cum_v + v
        self.cumulative_hazards_.ix[relevant_times] = self.cumulative_hazards_.ix[relevant_times].values + cum_v.T
        self.hazards_.iloc[i] = self.hazards_.iloc[i].values + v.T
        # V[:, i] ** 2 equals the diagonal of the outer product of column i
        self._variance.ix[relevant_times] = self._variance.ix[relevant_times].values + V[:, i] ** 2
        t_0 = time
        # the individual has left the study: drop it from later regressions
        X_[i, :] = 0

    # clean up last iteration: carry the final values past the last event time
    relevant_times = (timeline > time)
    self.hazards_.iloc[i] = v.T
    try:
        self.cumulative_hazards_.ix[relevant_times] = cum_v.T
        self._variance.ix[relevant_times] = V[:, i] ** 2
    except Exception:
        # best effort only (e.g. nothing lies beyond the last event time);
        # KeyboardInterrupt / SystemExit are no longer swallowed here.
        pass

    self.timeline = timeline
    self.X = X
    self.censorship = censorship
    self.event_times = event_times
    self._compute_confidence_intervals()
    self.plot = plot_regressions(self)
    return self
def fit(self, event_times, X, timeline=None, censorship=None, columns=None, verbose=True, debug=False):
    """currently X is a static (n,d) array

    event_times: (n,1) array of event times
    X: (n,d) the design matrix, either a numpy matrix or DataFrame.
    timeline: (t,1) timepoints in ascending order
    censorship: (n,1) boolean array of censorships: True if observed, False if right-censored.
                By default, assuming all are observed.
    columns: optional names for the covariates. Defaults to X.columns when X is
             a DataFrame, otherwise to 0..d-1.
    verbose: write an "iteration i of n" progress line to stdout.
    debug: drop into pdb at the start of every iteration.

    Fits: self.cumulative_hazards_: a (t,d+1) dataframe of cumulative hazard coefficients
          self.hazards_: a (t,d+1) dataframe of hazard coefficients

    Returns: self
    """
    # deal with the covariate matrix. Check if it is a dataframe or numpy
    # array
    n, d = X.shape
    if isinstance(X, pd.DataFrame):
        design = X.values
        if columns is None:
            columns = X.columns
    else:
        design = X

    n_coefs = d + self.fit_intercept

    # Sort by event time (fancy indexing copies, so the caller's X is never
    # modified) and append a column of ones for the baseline hazard only when
    # an intercept is fitted, keeping the width equal to n_coefs.
    ix = event_times.argsort(0)[:, 0]
    design = design[ix, :]
    if self.fit_intercept:
        design = np.hstack([design, np.ones((n, 1))])
    sorted_event_times = event_times[ix, 0]

    # set the column's names of the dataframe. Always build a real list:
    # a Python 3 range cannot be extended in place.
    columns = list(range(d)) if columns is None else list(columns)
    if self.fit_intercept:
        columns += ['baseline']

    # set the censorship events. 1 if the death was observed.
    if censorship is None:
        observed = np.ones(n, dtype=bool)
    else:
        observed = censorship[ix].reshape(n)

    # set the timeline -- this is used as DataFrame index in the results
    if timeline is None:
        timeline = sorted_event_times

    timeline = np.unique(timeline.astype(float))
    if timeline[0] > 0:
        timeline = np.insert(timeline, 0, 0.)

    unique_times = np.unique(timeline)
    zeros = np.zeros((timeline.shape[0], n_coefs))
    self.cumulative_hazards_ = pd.DataFrame(
        zeros.copy(), index=unique_times, columns=columns)
    # Row i is filled below for the i-th individual in *sorted* order, so the
    # index must be the sorted event times to keep labels and values aligned.
    self.hazards_ = pd.DataFrame(
        np.zeros((event_times.shape[0], n_coefs)),
        index=sorted_event_times,
        columns=columns)
    self._variance = pd.DataFrame(
        zeros.copy(), index=unique_times, columns=columns)

    # create the penalizer matrix for L2 regression
    penalizer = (self.penalizer * np.eye(n_coefs)).astype(
        np.float32, copy=False)

    t_0 = sorted_event_times[0]
    cum_v = np.zeros((n_coefs, 1))
    v = cum_v.copy()
    n_iters = len(sorted_event_times)
    for i, time in enumerate(sorted_event_times):
        if debug:
            pdb.set_trace()
        relevant_times = (t_0 < timeline) * (timeline <= time)
        if observed[i] == 0:
            # a censored individual contributes no increment
            design[i, :] = 0

        # penalized least-squares operator V = (X'X + penalizer)^-1 X'
        gram = dot(design.T, design) + penalizer
        try:
            V = dot(np.linalg.inv(gram), design.T)
        except LinAlgError:
            # if penalizer > 0, this should not occur. But sometimes it does...
            V = dot(np.linalg.pinv(gram), design.T)

        v = dot(V, basis(n, i))
        cum_v = cum_v + v
        self.cumulative_hazards_.ix[relevant_times] = self.cumulative_hazards_.ix[
            relevant_times].values + cum_v.T
        self.hazards_.iloc[i] = self.hazards_.iloc[i].values + v.T
        # V[:, i] ** 2 equals the diagonal of the outer product of column i
        self._variance.ix[relevant_times] = self._variance.ix[
            relevant_times].values + V[:, i] ** 2
        t_0 = time
        # the individual has left the study: drop it from later regressions
        design[i, :] = 0

        if verbose:
            sys.stdout.write("\r iteration %i of %i completed" % (i + 1, n_iters))
            sys.stdout.flush()

    # clean up last iteration: carry the final values past the last event time
    relevant_times = (timeline > time)
    self.hazards_.iloc[i] = v.T
    try:
        self.cumulative_hazards_.ix[relevant_times] = cum_v.T
        self._variance.ix[relevant_times] = V[:, i] ** 2
    except Exception:
        # best effort only (e.g. nothing lies beyond the last event time);
        # KeyboardInterrupt / SystemExit are no longer swallowed here.
        pass

    self.timeline = timeline
    # keep the caller's design matrix, not the zeroed-out working copy
    self.X = X
    self.censorship = censorship
    self.event_times = event_times
    self._compute_confidence_intervals()
    self.plot = plot_regressions(self)
    return self
def _fit_varying(self, dataframe, duration_col="T", event_col="E", id_col=None, timeline=None, show_progress=True):
    """
    Estimate the Aalen additive hazards from rows keyed by (duration, id).

    Parameters:
      dataframe: a pandas dataframe holding duration_col, id_col, event_col and
          the covariates. It is reshaped with to_panel(), so presumably one row
          per (duration, id) pair is expected -- confirm against the caller.
      duration_col: specify what the duration column is called in the dataframe
      event_col: specify what the event occurred column is called in the
          dataframe. If None, an "E" column of ones is added, i.e. every row
          is flagged as an event.
      id_col: name of the column identifying each individual
      timeline: reformat the estimates index to a new timeline.
      show_progress: include a fancy progress bar!

    Returns:
      None. The estimates are stored on self as hazards_, cumulative_hazards_ and
      variance_, together with timeline, data, durations, event_observed and plot.

    Raises:
      LinAlgError: if the regression step fails at some event time.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # if the regression should fit an intercept
    if self.fit_intercept:
        df["baseline"] = 1.0

    # each individual should have an ID of time of leaving study
    df = df.set_index([duration_col, id_col])

    # if no event_col is specified, assume all non-censorships
    if event_col is None:
        event_col = "E"
        df[event_col] = 1

    # C: per id, whether any row carries an event.
    # T: per id, the duration of its last non-null row.
    C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
    C = C_panel.minor_xs(event_col).sum().astype(bool)
    T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

    del df[event_col]
    d = df.shape[1]

    # so this is a problem line. bfill performs a recursion which is
    # really not scalable. Plus even for modest datasets, this eats a lot of memory.
    # Plus is bfill the correct thing to choose? It's forward looking...
    wp = df.to_panel().bfill().fillna(0)

    # initialize dataframes to store estimates, indexed by (id, time)
    non_censorsed_times = list(T[C].iteritems())
    n_deaths = len(non_censorsed_times)
    columns = wp.items
    hazards_ = pd.DataFrame(
        np.zeros((n_deaths, d)), columns=columns, index=from_tuples(non_censorsed_times)
    )
    variance_ = pd.DataFrame(
        np.zeros((n_deaths, d)), columns=columns, index=from_tuples(non_censorsed_times)
    )

    # the previous event's estimate is handed to lr() as its offset
    previous_hazard = np.zeros((d,))
    ids = wp.minor_axis.values
    progress = progress_bar(n_deaths)

    # this makes indexing times much faster
    wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

    for i, (individual_id, time) in enumerate(non_censorsed_times):

        relevant_individuals = ids == individual_id
        assert relevant_individuals.sum() == 1.0

        # perform linear regression step.
        try:
            v, V = lr(
                wp[time].values,
                relevant_individuals,
                c1=self.coef_penalizer,
                c2=self.smoothing_penalizer,
                offset=previous_hazard,
            )
        except LinAlgError:
            print("Linear regression error. Try increasing the penalizer term.")
            # Re-raise: carrying on would reuse v and V of an earlier event
            # (or hit a NameError on the very first one).
            raise

        hazards_.ix[individual_id, time] = v.T
        variance_.ix[individual_id, time] = V[:, relevant_individuals][:, 0] ** 2
        previous_hazard = v.T

        # update progress bar
        if show_progress:
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    ordered_cols = df.columns  # to_panel() mixes up my columns

    # collapse tied event times into a single row per time
    self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
    self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
    self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method="ffill")
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method="ffill")
        self.variance_ = self.variance_.reindex(timeline, method="ffill")
        self.timeline = timeline
    else:
        self.timeline = self.hazards_.index.values.astype(float)

    self.data = wp
    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()
    self.plot = plot_regressions(self)

    return
def fit(self, dataframe, duration_col="T", event_col="E", timeline=None, id_col=None, show_progress=True):
    """
    Perform inference on the coefficients of the Aalen additive model.

    Parameters:
      dataframe: a pandas dataframe, with covariates and a duration_col and a event_col.

          static covariates:
              one row per individual. duration_col refers to how long the individual
              was observed for. event_col is a boolean: 1 if individual 'died', 0 else.
              id_col should be left as None.

          time-varying covariates:
              For time-varying covariates, an id_col is required to keep track of
              individuals' changing covariates. individual should have a unique id.
              duration_col refers to how long the individual has been observed to up
              to that point. event_col refers to if the event (death) occured in that
              period. Censored individuals will not have a 1.
              For example:

                  +----+---+---+------+------+
                  | id | T | E | var1 | var2 |
                  +----+---+---+------+------+
                  |  1 | 1 | 0 |    0 |    1 |
                  |  1 | 2 | 0 |    0 |    1 |
                  |  1 | 3 | 0 |    4 |    3 |
                  |  1 | 4 | 1 |    8 |    4 |
                  |  2 | 1 | 0 |    1 |    1 |
                  |  2 | 2 | 0 |    1 |    2 |
                  |  2 | 3 | 0 |    1 |    2 |
                  +----+---+---+------+------+

      duration_col: specify what the duration column is called in the dataframe
      event_col: specify what the event occurred column is called in the dataframe
      timeline: reformat the estimates index to a new timeline.
      id_col: (only for time-varying covariates) name of the id column in the dataframe
      show_progress: include a fancy progress bar!

    Returns:
      self, with new methods like plot, smoothed_hazards_ and properties like cumulative_hazards_

    Raises:
      LinAlgError: if (X'X + penalizer) cannot be inverted at some event time.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # only for time-indp. covariates: every row is its own individual
    if id_col is None:
        df['id'] = np.arange(df.shape[0])
        id_col = 'id'

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # each individual should have an ID of time of leaving study
    df = df.set_index([id_col, duration_col])

    # C: per id, whether any row carries an event.
    # T: per id, the duration of its last non-null row.
    C_panel = df[[event_col]].to_panel().transpose(1, 2, 0)
    C = C_panel.minor_xs(event_col).sum().astype(bool)
    T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

    del df[event_col]
    d = df.shape[1]

    # bfill will cause problems later, plus it is slow.
    wp = df.to_panel().transpose(1, 2, 0).bfill().fillna(0)

    # iteritems() may hand back a one-shot iterator; materialize it because the
    # pairs are counted, used for two indexes and then looped over.
    non_censorsed_times = list(T[C].iteritems())
    n_deaths = len(non_censorsed_times)

    # initialize dataframes to store estimates, indexed by (id, time)
    hazards_ = pd.DataFrame(np.zeros((n_deaths, d)),
                            columns=df.columns,
                            index=from_tuples(non_censorsed_times))
    variance_ = pd.DataFrame(np.zeros((n_deaths, d)),
                             columns=df.columns,
                             index=from_tuples(non_censorsed_times))

    # initializes the penalizer matrix
    penalizer = self.penalizer * np.eye(d)
    ids = wp.items
    progress = progress_bar(n_deaths)

    for i, (individual_id, time) in enumerate(non_censorsed_times):

        relevant_individuals = (ids == individual_id)
        assert relevant_individuals.sum() == 1.

        # cross-section at `time`, transposed to (individuals, covariates)
        X = wp.major_xs(time).values.T

        # perform linear regression step: V = (X'X + penalizer)^-1 X'
        try:
            V = dot(inv(dot(X.T, X) + penalizer), X.T)
        except LinAlgError:
            print("Linear regression error. Try increasing the penalizer term.")
            # Re-raise: carrying on would reuse the V of an earlier event
            # (or hit a NameError on the very first one).
            raise

        v = dot(V, 1.0 * relevant_individuals)

        hazards_.ix[individual_id, time] = v.T
        variance_.ix[individual_id, time] = V[:, relevant_individuals][:, 0] ** 2

        # update progress bar
        if show_progress:
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    # collapse tied event times into a single row per time.
    # not sure this is the correct thing to do.
    self.hazards_ = hazards_.groupby(level=1).sum()
    self.cumulative_hazards_ = self.hazards_.cumsum()
    self.variance_ = variance_.groupby(level=1).sum()

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        self.timeline = self.hazards_.index.values.astype(float)

    self.data = wp
    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()
    self.plot = plot_regressions(self)

    return self