def _fit_varying(self, dataframe, duration_col="T", event_col="E",
                 id_col=None, timeline=None, show_progress=True):
    """
    Fit the additive model on a dataframe that is indexed by
    (duration_col, id_col), i.e. potentially several rows per individual.

    Parameters:
      dataframe: a pandas dataframe with covariates, a duration_col, an
         id_col and (optionally) an event_col. It is copied, not modified.
      duration_col: name of the time column in the dataframe.
      event_col: name of the event-occurred column. If None, a column 'E'
         filled with 1 is created, i.e. every individual is treated as
         non-censored.
      id_col: name of the column identifying individuals. It is passed
         directly to set_index, so the default of None is presumably not
         usable here -- TODO confirm callers always supply it.
      timeline: if not None, the estimates are reindexed (forward filled)
         onto this timeline.
      show_progress: if True, update the progress bar on every step and
         print a trailing newline.

    Returns:
      None. Results are stored on self as hazards_, cumulative_hazards_,
      variance_, timeline, data, durations and event_observed, after which
      self._compute_confidence_intervals() is called.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # each individual should have an ID of time of leaving study
    df = df.set_index([duration_col, id_col])

    # if no event_col is specified, assume all non-censorships
    if event_col is None:
        event_col = 'E'
        df[event_col] = 1

    C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
    events = C_panel.minor_xs(event_col)
    # C: per individual, whether any event flag is set.
    C = events.sum().astype(bool)
    # T: per individual, the first index label at which the running count of
    # non-null event entries reaches its maximum (presumably the last
    # observed time -- verify against the panel layout).
    T = events.notnull().cumsum().idxmax()
    del df[event_col]
    d = df.shape[1]

    # so this is a problem line. bfill performs a recursion which is
    # really not scalable. Plus even for modest datasets, this eats a lot of memory.
    # Plus is bfill the correct thing to choose? It's forward looking...
    wp = df.to_panel().bfill().fillna(0)

    # initialize dataframes to store estimates, one row per (id, time) pair
    non_censored_times = list(T[C].iteritems())
    n_deaths = len(non_censored_times)
    columns = wp.items
    hazards_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                            index=from_tuples(non_censored_times))
    variance_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                             index=from_tuples(non_censored_times))

    previous_hazard = np.zeros((d,))
    ids = wp.minor_axis.values
    progress = progress_bar(n_deaths)

    # this makes indexing times much faster
    wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

    for i, (individual_id, time) in enumerate(non_censored_times):
        relevant_individuals = (ids == individual_id)
        assert relevant_individuals.sum() == 1.

        # perform linear regression step.
        try:
            v, V = lr(wp[time].values, relevant_individuals,
                      c1=self.coef_penalizer, c2=self.smoothing_penalizer,
                      offset=previous_hazard)
        except LinAlgError:
            # The step failed, so there is no estimate to record: leave this
            # row at its initial zeros and keep the previous offset. Using
            # v / V here would either be unbound (first step) or stale.
            print("Linear regression error. Try increasing the penalizer term.")
        else:
            hazards_.loc[individual_id, time] = v.T
            variance_.loc[individual_id, time] = V[:, relevant_individuals][:, 0] ** 2
            previous_hazard = v.T

        # update progress bar
        if show_progress:
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    ordered_cols = df.columns  # to_panel() mixes up my columns

    # level 1 of the (id, time) index is time: sum the rows sharing a time.
    self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
    self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
    self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        self.timeline = self.hazards_.index.values.astype(float)

    self.data = wp
    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()

    return
def _fit_varying(self, dataframe, duration_col="T", event_col="E",
                 id_col=None, timeline=None, show_progress=True):
    """
    Fit the additive model on a dataframe that is indexed by
    (duration_col, id_col), i.e. potentially several rows per individual.

    Parameters:
      dataframe: a pandas dataframe with covariates, a duration_col, an
         id_col and (optionally) an event_col. It is copied, not modified.
      duration_col: name of the time column in the dataframe.
      event_col: name of the event-occurred column. If None, a column 'E'
         filled with 1 is created, i.e. every individual is treated as
         non-censored.
      id_col: name of the column identifying individuals. It is passed
         directly to set_index, so the default of None is presumably not
         usable here -- TODO confirm callers always supply it.
      timeline: if not None, the estimates are reindexed (forward filled)
         onto this timeline.
      show_progress: if True, update the progress bar on every step and
         print a trailing newline.

    Returns:
      None. Results are stored on self as hazards_, cumulative_hazards_,
      variance_, timeline, data, durations and event_observed, after which
      self._compute_confidence_intervals() is called.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # each individual should have an ID of time of leaving study
    df = df.set_index([duration_col, id_col])

    # if no event_col is specified, assume all non-censorships
    if event_col is None:
        event_col = 'E'
        df[event_col] = 1

    C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
    event_flags = C_panel.minor_xs(event_col)
    # C: per individual, whether any event flag is set.
    C = event_flags.sum().astype(bool)
    # T: per individual, the first index label at which the running count of
    # non-null event entries reaches its maximum (presumably the last
    # observed time -- verify against the panel layout).
    T = event_flags.notnull().cumsum().idxmax()
    del df[event_col]
    d = df.shape[1]

    # so this is a problem line. bfill performs a recursion which is
    # really not scalable. Plus even for modest datasets, this eats a lot of memory.
    # Plus is bfill the correct thing to choose? It's forward looking...
    wp = df.to_panel().bfill().fillna(0)

    # initialize dataframes to store estimates, one row per (id, time) pair
    non_censored_times = list(T[C].iteritems())
    n_deaths = len(non_censored_times)
    columns = wp.items
    hazards_ = pd.DataFrame(
        np.zeros((n_deaths, d)),
        columns=columns,
        index=from_tuples(non_censored_times))
    variance_ = pd.DataFrame(
        np.zeros((n_deaths, d)),
        columns=columns,
        index=from_tuples(non_censored_times))

    previous_hazard = np.zeros((d, ))
    ids = wp.minor_axis.values
    progress = progress_bar(n_deaths)

    # this makes indexing times much faster
    wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

    for i, (individual_id, time) in enumerate(non_censored_times):
        relevant_individuals = (ids == individual_id)
        assert relevant_individuals.sum() == 1.

        # perform linear regression step.
        try:
            v, V = lr(
                wp[time].values,
                relevant_individuals,
                c1=self.coef_penalizer,
                c2=self.smoothing_penalizer,
                offset=previous_hazard)
        except LinAlgError:
            # The step failed, so there is no estimate to record: leave this
            # row at its initial zeros and keep the previous offset. Using
            # v / V here would either be unbound (first step) or stale.
            print(
                "Linear regression error. Try increasing the penalizer term."
            )
        else:
            hazards_.loc[individual_id, time] = v.T
            variance_.loc[individual_id, time] = (
                V[:, relevant_individuals][:, 0]**2)
            previous_hazard = v.T

        # update progress bar
        if show_progress:
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    ordered_cols = df.columns  # to_panel() mixes up my columns

    # level 1 of the (id, time) index is time: sum the rows sharing a time.
    self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
    self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
    self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(
            timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        self.timeline = self.hazards_.index.values.astype(float)

    self.data = wp
    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()

    return
def _fit_static(self, dataframe, duration_col, event_col=None,
                timeline=None, show_progress=True):
    """
    Perform inference on the coefficients of the Aalen additive model.

    Parameters:
      dataframe: a pandas dataframe, with covariates and a duration_col and
         (optionally) an event_col, one row per individual. It is copied,
         not modified. Each row is given a fresh integer id internally.
      duration_col: specify what the duration column is called in the
         dataframe; it holds how long the individual was observed for.
      event_col: specify what the event occurred column is called in the
         dataframe (1 if the individual 'died', 0 else). If falsy, every
         individual is treated as non-censored.
      timeline: reformat the estimates index to a new timeline (forward
         filled). If None, the observed event times are used.
      show_progress: if True, update the progress bar on every regression
         step and print a trailing newline.

    Returns:
      None. Results are stored on self as hazards_, cumulative_hazards_,
      variance_, timeline, data, durations and event_observed, after which
      self._compute_confidence_intervals() is called.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # set unique ids for individuals
    id_col = 'id'
    ids = np.arange(df.shape[0])
    df[id_col] = ids

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # if no event_col is specified, assume all non-censorships
    if event_col:
        c = df[event_col].values
        del df[event_col]
    else:
        c = np.ones_like(ids)

    # each individual should have an ID of time of leaving study
    C = pd.Series(c, dtype=bool, index=ids)
    T = pd.Series(df[duration_col].values, index=ids)

    df = df.set_index(id_col)

    # order durations (and the matching event flags) by increasing time
    ix = T.argsort()
    T, C = T.iloc[ix], C.iloc[ix]

    del df[duration_col]
    d = df.shape[1]
    columns = df.columns

    # initialize dataframes to store estimates; the (id, time) tuples are
    # swapped so that the index reads (time, id)
    non_censored_times = list(T[C].iteritems())
    n_deaths = len(non_censored_times)

    hazards_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                            index=from_tuples(non_censored_times)).swaplevel(1, 0)
    variance_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                             index=from_tuples(non_censored_times)).swaplevel(1, 0)

    # initialize loop variables.
    previous_hazard = np.zeros((d,))
    progress = progress_bar(n_deaths)
    to_remove = []
    t = T.iloc[0]
    i = 0

    for individual_id, time in T.iteritems():  # should be sorted.

        if t != time:
            assert t < time
            # remove the individuals from the previous loop.
            # ids equal row positions here (ids = arange, df keeps its
            # original row order), so positional indexing is valid.
            df.iloc[to_remove] = 0.
            to_remove = []
            t = time

        to_remove.append(individual_id)
        if C[individual_id] == 0:
            continue

        relevant_individuals = (ids == individual_id)
        assert relevant_individuals.sum() == 1.

        # perform linear regression step.
        try:
            v, V = lr(df.values, relevant_individuals,
                      c1=self.coef_penalizer, c2=self.smoothing_penalizer,
                      offset=previous_hazard)
        except LinAlgError:
            # The step failed, so there is no estimate to record: leave this
            # row at its initial zeros and keep the previous offset. Using
            # v / V here would either be unbound (first step) or stale.
            print("Linear regression error. Try increasing the penalizer term.")
        else:
            hazards_.loc[time, individual_id] = v.T
            variance_.loc[time, individual_id] = V[:, relevant_individuals][:, 0] ** 2
            previous_hazard = v.T

        # update progress bar
        if show_progress:
            i += 1
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    # not sure this is the correct thing to do.
    # level 0 of the (time, id) index is time: sum the rows sharing a time.
    self.hazards_ = hazards_.groupby(level=0).sum()
    self.cumulative_hazards_ = self.hazards_.cumsum()
    self.variance_ = variance_.groupby(level=0).sum()

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        self.timeline = self.hazards_.index.values.astype(float)

    self.data = dataframe
    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()

    return
def _fit_static(self, dataframe, duration_col, event_col=None,
                timeline=None, show_progress=True):
    """
    Perform inference on the coefficients of the Aalen additive model.

    Parameters:
      dataframe: a pandas dataframe, with covariates and a duration_col and
         (optionally) an event_col, one row per individual. It is copied,
         not modified. Each row is given a fresh integer id internally.
      duration_col: specify what the duration column is called in the
         dataframe; it holds how long the individual was observed for.
      event_col: specify what the event occurred column is called in the
         dataframe (1 if the individual 'died', 0 else). If falsy, every
         individual is treated as non-censored.
      timeline: reformat the estimates index to a new timeline (forward
         filled). If None, the observed event times are used.
      show_progress: if True, update the progress bar on every regression
         step and print a trailing newline.

    Returns:
      None. Results are stored on self as hazards_, cumulative_hazards_,
      variance_, timeline, data, durations and event_observed, after which
      self._compute_confidence_intervals() is called.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # set unique ids for individuals
    id_col = 'id'
    ids = np.arange(df.shape[0])
    df[id_col] = ids

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # if no event_col is specified, assume all non-censorships
    if event_col:
        c = df[event_col].values
        del df[event_col]
    else:
        c = np.ones_like(ids)

    # each individual should have an ID of time of leaving study
    C = pd.Series(c, dtype=bool, index=ids)
    T = pd.Series(df[duration_col].values, index=ids)

    df = df.set_index(id_col)

    # order durations (and the matching event flags) by increasing time
    ix = T.argsort()
    T, C = T.iloc[ix], C.iloc[ix]

    del df[duration_col]
    d = df.shape[1]
    columns = df.columns

    # initialize dataframes to store estimates; the (id, time) tuples are
    # swapped so that the index reads (time, id)
    non_censored_times = list(T[C].iteritems())
    n_deaths = len(non_censored_times)

    hazards_ = pd.DataFrame(
        np.zeros((n_deaths, d)),
        columns=columns,
        index=from_tuples(non_censored_times)).swaplevel(1, 0)
    variance_ = pd.DataFrame(
        np.zeros((n_deaths, d)),
        columns=columns,
        index=from_tuples(non_censored_times)).swaplevel(1, 0)

    # initialize loop variables.
    previous_hazard = np.zeros((d, ))
    progress = progress_bar(n_deaths)
    to_remove = []
    t = T.iloc[0]
    i = 0

    for individual_id, time in T.iteritems():  # should be sorted.

        if t != time:
            assert t < time
            # remove the individuals from the previous loop.
            # ids equal row positions here (ids = arange, df keeps its
            # original row order), so positional indexing is valid.
            df.iloc[to_remove] = 0.
            to_remove = []
            t = time

        to_remove.append(individual_id)
        if C[individual_id] == 0:
            continue

        relevant_individuals = (ids == individual_id)
        assert relevant_individuals.sum() == 1.

        # perform linear regression step.
        try:
            v, V = lr(
                df.values,
                relevant_individuals,
                c1=self.coef_penalizer,
                c2=self.smoothing_penalizer,
                offset=previous_hazard)
        except LinAlgError:
            # The step failed, so there is no estimate to record: leave this
            # row at its initial zeros and keep the previous offset. Using
            # v / V here would either be unbound (first step) or stale.
            print(
                "Linear regression error. Try increasing the penalizer term."
            )
        else:
            hazards_.loc[time, individual_id] = v.T
            variance_.loc[time, individual_id] = (
                V[:, relevant_individuals][:, 0]**2)
            previous_hazard = v.T

        # update progress bar
        if show_progress:
            i += 1
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    # not sure this is the correct thing to do.
    # level 0 of the (time, id) index is time: sum the rows sharing a time.
    self.hazards_ = hazards_.groupby(level=0).sum()
    self.cumulative_hazards_ = self.hazards_.cumsum()
    self.variance_ = variance_.groupby(level=0).sum()

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(
            timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        self.timeline = self.hazards_.index.values.astype(float)

    self.data = dataframe
    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()

    return