def _check_values(df, T, E):
    """Run the pre-fit validation helpers over ``df``, ``T`` and ``E``.

    The helpers are invoked in a fixed order: the numeric-dtype check on
    ``df``, the NaN/inf check on ``T``, ``E`` and ``df`` (in that order), the
    low-variance check on ``df`` and finally the complete-separation check.
    Anything those helpers raise or warn propagates to the caller.
    """
    pass_for_numeric_dtypes_or_raise(df)

    # Same NaN/inf helper applied to each input, order preserved.
    for values in (T, E, df):
        check_nans_or_infs(values)

    check_low_var(df)
    check_complete_separation(df, E, T)
def predict_log_partial_hazard(self, X):
    r"""
    Return the log partial hazard of each individual in ``X``.

    This is equivalent to R's linear.predictors. The hazard is "partial"
    because the baseline hazard is not included. The value is
    :math:`\beta (X - \bar{X})`.

    Parameters
    ----------
    X: numpy array or DataFrame
        a (n,d) covariate numpy array or DataFrame. A DataFrame is re-ordered
        to the column order of ``self.hazards_`` before use, so its columns
        may come in any order. A numpy array is used as given, so its columns
        must be in the same order as the training data.

    Returns
    -------
    DataFrame

    Note
    ----
    Column order does not matter for a DataFrame, but an array is assumed to
    follow the column ordering of the training dataset.
    """
    if isinstance(X, pd.DataFrame):
        # Align the frame to the fitted coefficient order, then validate it.
        X = X[self.hazards_.columns]
        pass_for_numeric_dtypes_or_raise(X)

    X = X.astype(float)
    row_labels = _get_index(X)
    centered = normalize(X, self._norm_mean.values, 1)
    linear_predictor = np.dot(centered, self.hazards_.T)
    return pd.DataFrame(linear_predictor, index=row_labels)
def _check_values(df, stop_times_events):
    """Run the pre-fit validation helpers on ``df`` and ``stop_times_events``.

    Order of execution: low-variance check on ``df``, complete-separation
    check using the ``'event'`` entry of ``stop_times_events``, numeric-dtype
    check on ``df``, then the immediate-death and instantaneous-event checks
    on ``stop_times_events``.
    """
    # NOTE: check_for_overlapping_intervals(df) is deliberately not run here;
    # it is currently too slow for production.
    check_low_var(df)

    event_flags = stop_times_events['event']
    check_complete_separation_low_variance(df, event_flags)

    pass_for_numeric_dtypes_or_raise(df)

    # Both interval checks take the same single argument.
    for interval_check in (check_for_immediate_deaths, check_for_instantaneous_events):
        interval_check(stop_times_events)
def _check_values(df, events, start, stop, event_col):
    """Run the pre-fit validation helpers in a fixed order.

    The helpers and the arguments each one receives are listed in the table
    below; they are executed top to bottom, and anything they raise or warn
    propagates to the caller.
    """
    # NOTE: check_for_overlapping_intervals(df) is deliberately not run here;
    # it is currently too slow for production.
    validation_steps = (
        (check_nans_or_infs, (df,)),
        (check_low_var, (df,)),
        (check_complete_separation_low_variance, (df, events, event_col)),
        (pass_for_numeric_dtypes_or_raise, (df,)),
        (check_for_immediate_deaths, (events, start, stop)),
        (check_for_instantaneous_events, (start, stop)),
    )
    for validate, arguments in validation_steps:
        validate(*arguments)
def _check_values(self, X):
    """Validate the covariate frame ``X`` before fitting.

    The numeric-dtype check runs first, so non-numeric input is rejected by
    ``pass_for_numeric_dtypes_or_raise`` rather than by an incidental failure
    in the variance computation or the boolean column indexing below.
    Afterwards a ``RuntimeWarning`` is issued naming every column whose
    variance (``X.var(0)``) is below ``10e-5``.

    Parameters
    ----------
    X: DataFrame
        covariates; must expose ``var`` and ``columns``.

    Returns
    -------
    None
    """
    pass_for_numeric_dtypes_or_raise(X)

    low_var = X.var(0) < 10e-5
    if low_var.any():
        cols = str(list(X.columns[low_var]))
        # Adjacent literals are concatenated at compile time, so the message
        # is one clean sentence run. The previous backslash continuations
        # inside the literal leaked stray characters into the text.
        warning_text = (
            "Column(s) %s have very low variance. "
            "This may harm convergence. "
            "Try dropping this redundant column before fitting "
            "if convergence fails."
        ) % cols
        warnings.warn(warning_text, RuntimeWarning)
def predict_log_partial_hazard(self, X):
    r"""
    Return the log partial hazard of each individual in ``X``.

    This is equivalent to R's linear.predictors. The hazard is "partial"
    because the baseline hazard is not included. The value is
    :math:`\beta (X - mean(X_{train}))`.

    Parameters
    ----------
    X: numpy array, DataFrame or Series
        a (n,d) covariate numpy array or DataFrame. A DataFrame is re-ordered
        to the column order of ``self.hazards_``, so its columns can be in any
        order. A numpy array is used as given, so its columns must be in the
        same order as the training data. A Series whose length equals the
        number of fitted covariates (or that number plus two) is treated as a
        single row; any other Series is accepted only when the model has
        exactly one covariate.

    Returns
    -------
    log_partial_hazard: DataFrame

    Notes
    -----
    Column order does not matter for a DataFrame, but an array is assumed to
    follow the column ordering of the training dataset.
    """
    hazard_names = self.hazards_.columns

    if isinstance(X, pd.DataFrame):
        X = X[hazard_names]
        pass_for_numeric_dtypes_or_raise(X)
    elif isinstance(X, pd.Series):
        n_covariates = len(hazard_names)
        if X.shape[0] == n_covariates + 2 or X.shape[0] == n_covariates:
            # One individual given as a Series: turn it into a one-row frame
            # and select the fitted covariates in training order.
            single_row = X.to_frame().T
            X = single_row[hazard_names]
        else:
            # Otherwise the Series is taken as one column of observations.
            assert len(hazard_names) == 1, "Series not the correct arugment"
            X = pd.DataFrame(X)
        pass_for_numeric_dtypes_or_raise(X)

    X = X.astype(float)
    row_labels = _get_index(X)
    centered = normalize(X, self._norm_mean.values, 1)
    linear_predictor = np.dot(centered, self.hazards_.T)
    return pd.DataFrame(linear_predictor, index=row_labels)
def _check_values(df, E):
    """Run the pre-fit validation helpers on ``df`` and ``E``.

    Executed in order: low-variance check, complete-separation check, then
    the numeric-dtype check. Anything the helpers raise or warn propagates.
    """
    # NOTE: check_for_overlapping_intervals(df) is deliberately not run here;
    # it is currently too slow for production.
    deferred_checks = (
        lambda: check_low_var(df),
        lambda: check_complete_separation_low_variance(df, E),
        lambda: pass_for_numeric_dtypes_or_raise(df),
    )
    for run_check in deferred_checks:
        run_check()
def _check_values(df, E):
    """Run the pre-fit validation helpers on ``df`` and ``E``.

    Every helper receives ``df`` as its first argument; the
    complete-separation check additionally receives ``E``. The helpers run in
    the order listed below.
    """
    ordered_checks = (
        (check_low_var, ()),
        (check_complete_separation, (E,)),
        (pass_for_numeric_dtypes_or_raise, ()),
    )
    for check, extra_args in ordered_checks:
        check(df, *extra_args)
def _fit_varying(self, dataframe, duration_col="T", event_col="E", id_col=None, timeline=None, show_progress=True):
    """
    Fit the model from a frame indexed by ``(duration_col, id_col)``.

    A copy of ``dataframe`` is indexed by duration and id, converted to a
    panel, and one penalized regression step (``lr``) is run per entry of
    ``non_censorsed_times``. Results are stored on ``self``; nothing is
    returned.

    Parameters
    ----------
    dataframe: DataFrame
        input data; it is copied, so the caller's frame is not modified.
    duration_col: str
        column used as the first index level.
    event_col: str or None
        event column. If None, a column ``'E'`` filled with 1 is added.
    id_col: str
        column used as the second index level.
        NOTE(review): the default ``None`` is passed straight to
        ``set_index`` -- confirm callers always supply a real column.
    timeline: optional
        if not None, ``hazards_``, ``cumulative_hazards_`` and ``variance_``
        are reindexed onto it with ``method='ffill'``.
    show_progress: bool
        if True, the progress bar is updated after every regression step and
        an empty line is printed once the loop finishes.

    Returns
    -------
    None
        Sets ``self.hazards_``, ``self.cumulative_hazards_``,
        ``self.variance_``, ``self.timeline``, ``self.durations`` and
        ``self.event_observed``, then calls
        ``self._compute_confidence_intervals()``.

    Notes
    -----
    NOTE(review): this relies on ``DataFrame.to_panel`` and
    ``Series.iteritems``; confirm they exist in the pandas version this
    project supports.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # each individual should have an ID of time of leaving study
    df = df.set_index([duration_col, id_col])
    pass_for_numeric_dtypes_or_raise(df)

    # if no event_col is specified, assume all non-censorships
    if event_col is None:
        event_col = 'E'
        df[event_col] = 1

    # C and T are derived from the event column only.
    # presumably C is one observed-event flag per individual and T the
    # matching time -- TODO confirm against the Panel axis layout.
    C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
    C = C_panel.minor_xs(event_col).sum().astype(bool)
    T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

    # the event column must not take part in the regression
    del df[event_col]
    n, d = df.shape

    # so this is a problem line. bfill performs a recursion which is
    # really not scalable. Plus even for modest datasets, this eats a lot of memory.
    # Plus is bfill the correct thing to choose? It's forward looking...
    wp = df.to_panel().bfill().fillna(0)

    # initialize dataframe to store estimates
    # (one zero-filled row per pair in non_censorsed_times, d columns)
    non_censorsed_times = list(T[C].iteritems())
    columns = wp.items
    hazards_ = pd.DataFrame(np.zeros((len(non_censorsed_times), d)),
                            columns=columns, index=from_tuples(non_censorsed_times))

    variance_ = pd.DataFrame(np.zeros((len(non_censorsed_times), d)),
                             columns=columns, index=from_tuples(non_censorsed_times))

    previous_hazard = np.zeros((d, ))
    ids = wp.minor_axis.values
    progress = progress_bar(len(non_censorsed_times))

    # this makes indexing times much faster
    wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

    for i, (id, time) in enumerate(non_censorsed_times):

        # boolean mask over ids; exactly one entry must match the current id
        relevant_individuals = (ids == id)
        assert relevant_individuals.sum() == 1.

        # perform linear regression step.
        # NOTE(review): when lr raises LinAlgError only the message below is
        # printed; execution then falls through to the assignments, which
        # read v and V. On the first step those names are not bound yet, and
        # on later steps they still hold the previous step's values.
        try:
            v, V = lr(wp[time].values, relevant_individuals, c1=self.coef_penalizer, c2=self.smoothing_penalizer, offset=previous_hazard)
        except LinAlgError:
            print(
                "Linear regression error. Try increasing the penalizer term."
            )

        hazards_.loc[id, time] = v.T
        variance_.loc[id, time] = V[:, relevant_individuals][:, 0]**2
        # the current estimate is the offset of the next regression step
        previous_hazard = v.T

        # update progress bar
        if show_progress:
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    ordered_cols = df.columns  # to_panel() mixes up my columns

    # level 1 of the (id, time) index is the time: sum the rows per time
    self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
    self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
    self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(
            timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        # no timeline given: use the times present in the estimates
        self.timeline = self.hazards_.index.values.astype(float)

    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()

    return
def _fit_static(self, dataframe, duration_col, event_col=None, timeline=None, show_progress=True):
    """
    Perform inference on the coefficients of the Aalen additive model.

    Works on a copy of ``dataframe`` with one row per individual. Individuals
    are visited in order of increasing duration; each one with an observed
    event triggers one penalized regression step (``lr``). Results are stored
    on ``self``; nothing is returned.

    Parameters
    ----------
    dataframe: DataFrame
        covariates plus a duration column and, optionally, an event column;
        one row per individual. It is copied, so the caller's frame is not
        modified. A column named ``'id'`` is written on the copy.
    duration_col: str
        name of the column holding how long each individual was observed.
    event_col: str, optional
        name of the event column (1 if the individual 'died', 0 else). If
        falsy, every individual is treated as having an observed event.
    timeline: optional
        if not None, ``hazards_``, ``cumulative_hazards_`` and ``variance_``
        are reindexed onto it with ``method='ffill'``.
    show_progress: bool
        if True, the progress bar is updated after every regression step and
        an empty line is printed once the loop finishes.

    Returns
    -------
    None
        Sets ``self.hazards_``, ``self.cumulative_hazards_``,
        ``self.variance_``, ``self.timeline``, ``self.durations``,
        ``self.event_observed`` and ``self.score_``, and calls
        ``self._compute_confidence_intervals()``.

    Notes
    -----
    NOTE(review): this relies on ``Series.iteritems``; confirm it exists in
    the pandas version this project supports.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # set unique ids for individuals
    # (0..n-1 in row order, so an id is also that row's position in df)
    id_col = 'id'
    ids = np.arange(df.shape[0])
    df[id_col] = ids

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # if no event_col is specified, assume all non-censorships
    if event_col:
        c = df[event_col].values
        del df[event_col]
    else:
        c = np.ones_like(ids)

    # each individual should have an ID of time of leaving study
    C = pd.Series(c, dtype=bool, index=ids)
    T = pd.Series(df[duration_col].values, index=ids)

    df = df.set_index(id_col)
    pass_for_numeric_dtypes_or_raise(df)

    # order durations and event flags by increasing duration
    ix = T.argsort()
    T, C = T.iloc[ix], C.iloc[ix]

    # the duration column must not take part in the regression
    del df[duration_col]
    n, d = df.shape
    columns = df.columns

    # initialize dataframe to store estimates
    # (one zero-filled row per observed event; swaplevel turns the (id, time)
    # tuples into a (time, id) index)
    non_censorsed_times = list(T[C].iteritems())
    n_deaths = len(non_censorsed_times)

    hazards_ = pd.DataFrame(
        np.zeros((n_deaths, d)),
        columns=columns,
        index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

    variance_ = pd.DataFrame(
        np.zeros((n_deaths, d)),
        columns=columns,
        index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

    # initialize loop variables.
    previous_hazard = np.zeros((d, ))
    progress = progress_bar(n_deaths)
    to_remove = []
    t = T.iloc[0]
    i = 0

    for id, time in T.iteritems():  # should be sorted.

        if t != time:
            assert t < time
            # remove the individuals from the previous loop.
            # (their covariate rows are zeroed once the time moves on)
            df.iloc[to_remove] = 0.
            to_remove = []
            t = time

        # every individual, censored or not, is queued for removal
        to_remove.append(id)
        if C[id] == 0:
            # no observed event: no regression step for this individual
            continue

        # boolean mask over ids; exactly one entry must match the current id
        relevant_individuals = (ids == id)
        assert relevant_individuals.sum() == 1.

        # perform linear regression step.
        # NOTE(review): when lr raises LinAlgError only the message below is
        # printed; execution then falls through to the assignments, which
        # read v and V. On the first step those names are not bound yet, and
        # on later steps they still hold the previous step's values.
        try:
            v, V = lr(df.values, relevant_individuals, c1=self.coef_penalizer, c2=self.smoothing_penalizer, offset=previous_hazard)
        except LinAlgError:
            print(
                "Linear regression error. Try increasing the penalizer term."
            )

        hazards_.loc[time, id] = v.T
        variance_.loc[time, id] = V[:, relevant_individuals][:, 0]**2
        # the current estimate is the offset of the next regression step
        previous_hazard = v.T

        # update progress bar
        if show_progress:
            i += 1
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    # not sure this is the correct thing to do.
    # (level 0 of the swapped index is the time: sum the rows per time)
    self.hazards_ = hazards_.groupby(level=0).sum()
    self.cumulative_hazards_ = self.hazards_.cumsum()
    self.variance_ = variance_.groupby(level=0).sum()

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(
            timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        # no timeline given: use the times present in the estimates
        self.timeline = self.hazards_.index.values.astype(float)

    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()
    # score the fit on the original (unmodified) input frame
    self.score_ = concordance_index(
        self.durations,
        self.predict_median(dataframe).values.ravel(), self.event_observed)
    return
def _fit_static(self, dataframe, duration_col, event_col=None, timeline=None, show_progress=True):
    """
    Perform inference on the coefficients of the Aalen additive model.

    Works on a copy of ``dataframe`` with one row per individual. Individuals
    are visited in order of increasing duration; each one with an observed
    event triggers one penalized regression step (``lr``). Results are stored
    on ``self``; nothing is returned.

    Parameters
    ----------
    dataframe: DataFrame
        covariates plus a duration column and, optionally, an event column;
        one row per individual. It is copied, so the caller's frame is not
        modified. A column named ``'id'`` is written on the copy.
    duration_col: str
        name of the column holding how long each individual was observed.
    event_col: str, optional
        name of the event column (1 if the individual 'died', 0 else). If
        falsy, every individual is treated as having an observed event.
    timeline: optional
        if not None, ``hazards_``, ``cumulative_hazards_`` and ``variance_``
        are reindexed onto it with ``method='ffill'``.
    show_progress: bool
        if True, the progress bar is updated after every regression step and
        an empty line is printed once the loop finishes.

    Returns
    -------
    None
        Sets ``self.hazards_``, ``self.cumulative_hazards_``,
        ``self.variance_``, ``self.timeline``, ``self.durations``,
        ``self.event_observed`` and ``self.score_``, and calls
        ``self._compute_confidence_intervals()``.

    Notes
    -----
    NOTE(review): this relies on ``Series.iteritems``; confirm it exists in
    the pandas version this project supports.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # set unique ids for individuals
    # (0..n-1 in row order, so an id is also that row's position in df)
    id_col = 'id'
    ids = np.arange(df.shape[0])
    df[id_col] = ids

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # if no event_col is specified, assume all non-censorships
    if event_col:
        c = df[event_col].values
        del df[event_col]
    else:
        c = np.ones_like(ids)

    # each individual should have an ID of time of leaving study
    C = pd.Series(c, dtype=bool, index=ids)
    T = pd.Series(df[duration_col].values, index=ids)

    df = df.set_index(id_col)
    pass_for_numeric_dtypes_or_raise(df)

    # order durations and event flags by increasing duration
    ix = T.argsort()
    T, C = T.iloc[ix], C.iloc[ix]

    # the duration column must not take part in the regression
    del df[duration_col]
    n, d = df.shape
    columns = df.columns

    # initialize dataframe to store estimates
    # (one zero-filled row per observed event; swaplevel turns the (id, time)
    # tuples into a (time, id) index)
    non_censorsed_times = list(T[C].iteritems())
    n_deaths = len(non_censorsed_times)

    hazards_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                            index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

    variance_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                             index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

    # initialize loop variables.
    previous_hazard = np.zeros((d,))
    progress = progress_bar(n_deaths)
    to_remove = []
    t = T.iloc[0]
    i = 0

    for id, time in T.iteritems():  # should be sorted.

        if t != time:
            assert t < time
            # remove the individuals from the previous loop.
            # (their covariate rows are zeroed once the time moves on)
            df.iloc[to_remove] = 0.
            to_remove = []
            t = time

        # every individual, censored or not, is queued for removal
        to_remove.append(id)
        if C[id] == 0:
            # no observed event: no regression step for this individual
            continue

        # boolean mask over ids; exactly one entry must match the current id
        relevant_individuals = (ids == id)
        assert relevant_individuals.sum() == 1.

        # perform linear regression step.
        # NOTE(review): when lr raises LinAlgError only the message below is
        # printed; execution then falls through to the assignments, which
        # read v and V. On the first step those names are not bound yet, and
        # on later steps they still hold the previous step's values.
        try:
            v, V = lr(df.values, relevant_individuals, c1=self.coef_penalizer, c2=self.smoothing_penalizer, offset=previous_hazard)
        except LinAlgError:
            print("Linear regression error. Try increasing the penalizer term.")

        hazards_.loc[time, id] = v.T
        variance_.loc[time, id] = V[:, relevant_individuals][:, 0] ** 2
        # the current estimate is the offset of the next regression step
        previous_hazard = v.T

        # update progress bar
        if show_progress:
            i += 1
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    # not sure this is the correct thing to do.
    # (level 0 of the swapped index is the time: sum the rows per time)
    self.hazards_ = hazards_.groupby(level=0).sum()
    self.cumulative_hazards_ = self.hazards_.cumsum()
    self.variance_ = variance_.groupby(level=0).sum()

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        # no timeline given: use the times present in the estimates
        self.timeline = self.hazards_.index.values.astype(float)

    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()
    # score the fit on the original (unmodified) input frame
    self.score_ = concordance_index(self.durations,
                                    self.predict_median(dataframe).values.ravel(),
                                    self.event_observed)
    return
def _check_values(df, E):
    """Run the low-variance and dtype checks on ``df``.

    ``check_low_var`` is applied three times: to the whole frame, to the rows
    where ``E == 1`` and to the remaining rows. The two subset calls pass the
    complete-separation prefix and suffix messages as extra arguments. The
    numeric-dtype check runs last.
    """
    separation_prefix = "Complete seperation possibly detected. "
    separation_suffix = " See https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faqwhat-is-complete-or-quasi-complete-separation-in-logisticprobit-regression-and-how-do-we-deal-with-them/"

    deaths = E == 1

    check_low_var(df)
    # First the rows flagged by ``deaths``, then the complementary rows.
    for subset_mask in (deaths, ~deaths):
        check_low_var(df.loc[subset_mask], separation_prefix, separation_suffix)

    pass_for_numeric_dtypes_or_raise(df)
def _check_values(self, df, T, E):
    """Validate the fitting inputs.

    Runs the numeric-dtype check on ``df``, then the NaN check on ``T`` and
    on ``E``, in that order. Anything the helpers raise propagates.
    """
    pass_for_numeric_dtypes_or_raise(df)
    for values in (T, E):
        check_nans(values)
def _check_values(self, X, T, E):
    """Validate the fitting inputs.

    Runs the numeric-dtype check on ``X``, then the NaN/inf check on ``T``,
    ``E`` and ``X``, in that order. Anything the helpers raise propagates.
    """
    pass_for_numeric_dtypes_or_raise(X)
    for values in (T, E, X):
        check_nans_or_infs(values)
def _check_values(df, E):
    """Run the pre-fit validation helpers on ``df`` and ``E``.

    Executed in order: low-variance check, complete-separation check, then
    the numeric-dtype check. Anything the helpers raise or warn propagates.
    """
    covariates, observed = df, E

    # Checks that may only warn come first; the dtype check runs last.
    check_low_var(covariates)
    check_complete_separation(covariates, observed)
    pass_for_numeric_dtypes_or_raise(covariates)
def _check_values(df, E):
    """Run the pre-fit validation helpers on ``df`` and ``E``.

    The helpers and the arguments each one receives are listed in the table
    below and are executed top to bottom.
    """
    # NOTE: check_for_overlapping_intervals(df) is deliberately not run here;
    # it is currently too slow for production.
    validation_steps = (
        (check_low_var, (df,)),
        (check_complete_separation, (df, E)),
        (pass_for_numeric_dtypes_or_raise, (df,)),
    )
    for validate, arguments in validation_steps:
        validate(*arguments)
def _fit_varying(self, dataframe, duration_col="T", event_col="E", id_col=None, timeline=None, show_progress=True):
    """
    Fit the model from a frame indexed by ``(duration_col, id_col)``.

    A copy of ``dataframe`` is indexed by duration and id, converted to a
    panel, and one penalized regression step (``lr``) is run per entry of
    ``non_censorsed_times``. Results are stored on ``self``; nothing is
    returned.

    Parameters
    ----------
    dataframe: DataFrame
        input data; it is copied, so the caller's frame is not modified.
    duration_col: str
        column used as the first index level.
    event_col: str or None
        event column. If None, a column ``'E'`` filled with 1 is added.
    id_col: str
        column used as the second index level.
        NOTE(review): the default ``None`` is passed straight to
        ``set_index`` -- confirm callers always supply a real column.
    timeline: optional
        if not None, ``hazards_``, ``cumulative_hazards_`` and ``variance_``
        are reindexed onto it with ``method='ffill'``.
    show_progress: bool
        if True, the progress bar is updated after every regression step and
        an empty line is printed once the loop finishes.

    Returns
    -------
    None
        Sets ``self.hazards_``, ``self.cumulative_hazards_``,
        ``self.variance_``, ``self.timeline``, ``self.durations`` and
        ``self.event_observed``, then calls
        ``self._compute_confidence_intervals()``.

    Notes
    -----
    NOTE(review): this relies on ``DataFrame.to_panel`` and
    ``Series.iteritems``; confirm they exist in the pandas version this
    project supports.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # each individual should have an ID of time of leaving study
    df = df.set_index([duration_col, id_col])
    pass_for_numeric_dtypes_or_raise(df)

    # if no event_col is specified, assume all non-censorships
    if event_col is None:
        event_col = 'E'
        df[event_col] = 1

    # C and T are derived from the event column only.
    # presumably C is one observed-event flag per individual and T the
    # matching time -- TODO confirm against the Panel axis layout.
    C_panel = df[[event_col]].to_panel().transpose(2, 1, 0)
    C = C_panel.minor_xs(event_col).sum().astype(bool)
    T = (C_panel.minor_xs(event_col).notnull()).cumsum().idxmax()

    # the event column must not take part in the regression
    del df[event_col]
    n, d = df.shape

    # so this is a problem line. bfill performs a recursion which is
    # really not scalable. Plus even for modest datasets, this eats a lot of memory.
    # Plus is bfill the correct thing to choose? It's forward looking...
    wp = df.to_panel().bfill().fillna(0)

    # initialize dataframe to store estimates
    # (one zero-filled row per pair in non_censorsed_times, d columns)
    non_censorsed_times = list(T[C].iteritems())
    columns = wp.items
    hazards_ = pd.DataFrame(np.zeros((len(non_censorsed_times), d)),
                            columns=columns, index=from_tuples(non_censorsed_times))

    variance_ = pd.DataFrame(np.zeros((len(non_censorsed_times), d)),
                             columns=columns, index=from_tuples(non_censorsed_times))

    previous_hazard = np.zeros((d,))
    ids = wp.minor_axis.values
    progress = progress_bar(len(non_censorsed_times))

    # this makes indexing times much faster
    wp = wp.swapaxes(0, 1, copy=False).swapaxes(1, 2, copy=False)

    for i, (id, time) in enumerate(non_censorsed_times):

        # boolean mask over ids; exactly one entry must match the current id
        relevant_individuals = (ids == id)
        assert relevant_individuals.sum() == 1.

        # perform linear regression step.
        # NOTE(review): when lr raises LinAlgError only the message below is
        # printed; execution then falls through to the assignments, which
        # read v and V. On the first step those names are not bound yet, and
        # on later steps they still hold the previous step's values.
        try:
            v, V = lr(wp[time].values, relevant_individuals, c1=self.coef_penalizer, c2=self.smoothing_penalizer, offset=previous_hazard)
        except LinAlgError:
            print("Linear regression error. Try increasing the penalizer term.")

        hazards_.loc[id, time] = v.T
        variance_.loc[id, time] = V[:, relevant_individuals][:, 0] ** 2
        # the current estimate is the offset of the next regression step
        previous_hazard = v.T

        # update progress bar
        if show_progress:
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    ordered_cols = df.columns  # to_panel() mixes up my columns

    # level 1 of the (id, time) index is the time: sum the rows per time
    self.hazards_ = hazards_.groupby(level=1).sum()[ordered_cols]
    self.cumulative_hazards_ = self.hazards_.cumsum()[ordered_cols]
    self.variance_ = variance_.groupby(level=1).sum()[ordered_cols]

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        # no timeline given: use the times present in the estimates
        self.timeline = self.hazards_.index.values.astype(float)

    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()

    return