def make_dataloader_predict(self, input, batch_size, shuffle=False, num_workers=0):
    input, durations = input
    input = tt.tuplefy(input)
    durations = tt.tuplefy(durations)
    new_input = input + durations
    dataloader = super().make_dataloader_predict(new_input, batch_size, shuffle, num_workers)
    return dataloader
def _sorted_input_target(input, target):
    durations, _ = target  #.to_numpy()
    idx_sort = np.argsort(durations)
    if (idx_sort == np.arange(0, len(idx_sort))).all():
        return input, target
    input = tuplefy(input).iloc[idx_sort]
    target = tuplefy(target).iloc[idx_sort]
    return input, target
def assert_survs(input, model):
    preds = model.predict_surv(input)
    assert type(preds) is type(input)
    assert preds.shape[0] == input.shape[0]
    surv_df = model.predict_surv_df(input)
    assert type(surv_df) is pd.DataFrame
    assert type(surv_df.values) is np.ndarray
    assert preds.shape[0] == surv_df.shape[1]
    assert preds.shape[1] == surv_df.shape[0]
    np_input = tt.tuplefy(input).to_numpy()[0]
    torch_input = tt.tuplefy(input).to_tensor()[0]
    np_preds = model.predict_surv(np_input)
    torch_preds = model.predict_surv(torch_input)
    assert (np_preds == torch_preds.numpy()).all()
def __getitem__(self, index):
    if not hasattr(index, '__iter__'):
        index = [index]
    data_array = []
    for i in index:
        data_path = self.data_path[i]
        vs = pd.read_csv(data_path)
        vs = vs.drop(vs.columns[[0, 1, 2]], axis=1)
        data = np.array(vs).astype('float32')
        data = torch.from_numpy(data)
        data_array.append(data)
    data_array = torch.stack(data_array)
    target = tt.tuplefy(self.time[index], self.event[index]).to_tensor()
    return tt.tuplefy(data_array, target)
def make_data(multi_in=False, multi_out=False, cl=tuple, dl=False, data_torch=False):
    input = torch.randn(5)
    if multi_in:
        input = (input, (input, input))
        input = cl(input)
    tensor = torch.randn(5)
    if multi_out:
        tensor = ((tensor, tensor), tensor)
        tensor = cl(tensor)
    if data_torch is False:
        input, tensor = tt.tuplefy(input, tensor).to_numpy()
    if dl is True:
        input = tt.tuplefy(input).make_dataloader(10, False)
    return input, tensor
def target_to_df(self, target):
    durations, events = tt.tuplefy(target).to_numpy()
    df = pd.DataFrame({self.duration_col: durations, self.event_col: events})
    return df
def _compute_baseline_hazards(self, input, df_train_target, max_duration, batch_size):
    if max_duration is None:
        max_duration = np.inf

    def compute_expg_at_risk(ix, t):
        sub = input.iloc[ix:]
        n = sub.lens().flatten().get_if_all_equal()
        t = np.repeat(t, n).reshape(-1, 1).astype('float32')
        return np.exp(self.predict((sub, t), batch_size)).flatten().sum()

    if not df_train_target[self.duration_col].is_monotonic_increasing:
        raise RuntimeError(f"Need 'df_train_target' to be sorted by {self.duration_col}")

    input = tuplefy(input)
    df = df_train_target.reset_index(drop=True)
    times = (df
             .loc[lambda x: x[self.event_col] != 0]
             [self.duration_col]
             .loc[lambda x: x <= max_duration]
             .drop_duplicates(keep='first'))
    at_risk_sum = (pd.Series([compute_expg_at_risk(ix, t) for ix, t in times.items()],
                             index=times.values)
                   .rename('at_risk_sum'))
    events = (df
              .groupby(self.duration_col)
              [[self.event_col]]
              .agg('sum')
              .loc[lambda x: x.index <= max_duration])
    base_haz = (events
                .join(at_risk_sum, how='left', sort=True)
                .pipe(lambda x: x[self.event_col] / x['at_risk_sum'])
                .fillna(0.)
                .rename('baseline_hazards'))
    return base_haz
def label_transfer(times, events):
    # times = [period if x == -1 else x for x in times]
    # times = [period if x > period else x for x in times]
    # events = [True if x == 'abnormal' else x for x in events]
    # events = [False if x == 'normal' else x for x in events]
    labels = tt.tuplefy(np.array(times), np.array(events))
    return labels
def __getitem__(self, index):
    batch = super().__getitem__(index)
    input, (duration, event) = batch
    idx_sort = duration.sort(descending=True)[1]
    event = event.float()
    batch = tuplefy(input, event).iloc[idx_sort]
    return batch
def compute_baseline_hazards(self, input=None, target=None, max_duration=None, sample=None, batch_size=8224, set_hazards=True, eval_=True, num_workers=0): """Computes the Breslow estimates form the data defined by `input` and `target` (if `None` use training data). Typically call model.compute_baseline_hazards() after fitting. Keyword Arguments: input -- Input data (train input) (default: {None}) target -- Target data (train target) (default: {None}) max_duration {float} -- Don't compute estimates for duration higher (default: {None}) sample {float or int} -- Compute estimates of subsample of data (default: {None}) batch_size {int} -- Batch size (default: {8224}) set_hazards {bool} -- Set hazards in model object, or just return hazards. (default: {True}) Returns: pd.Series -- Pandas series with baseline hazards. Index is duration_col. """ if (input is None) and (target is None): if not hasattr(self, 'training_data'): raise ValueError("Need to give a 'input' and 'target' to this function.") input, target = self.training_data df = self.target_to_df(target)#.sort_values(self.duration_col) if sample is not None: if sample >= 1: df = df.sample(n=sample) else: df = df.sample(frac=sample) input = tt.tuplefy(input).to_numpy().iloc[df.index.values] base_haz = self._compute_baseline_hazards(input, df, max_duration, batch_size, eval_=eval_, num_workers=num_workers) if set_hazards: self.compute_baseline_cumulative_hazards(set_hazards=True, baseline_hazards_=base_haz) return base_haz
def fit(self, input, target, batch_size=256, epochs=1, callbacks=None, verbose=True,
        num_workers=0, shuffle=True, metrics=None, val_data=None, val_batch_size=8224,
        **kwargs):
    """Fit model with inputs and targets, where 'input' is the covariates and
    'target' is a tuple with (durations, events).

    Arguments:
        input {np.array, tensor or tuple} -- Input x passed to net.
        target {np.array, tensor or tuple} -- Target [durations, events].

    Keyword Arguments:
        batch_size {int} -- Elements in each batch (default: {256})
        epochs {int} -- Number of epochs (default: {1})
        callbacks {list} -- List of callbacks (default: {None})
        verbose {bool} -- Print progress (default: {True})
        num_workers {int} -- Number of workers used in the dataloader (default: {0})
        shuffle {bool} -- If we should shuffle the order of the dataset (default: {True})
        **kwargs are passed to the 'make_dataloader' method.

    Returns:
        TrainingLogger -- Training log
    """
    self.training_data = tt.tuplefy(input, target)
    return super().fit(input, target, batch_size, epochs, callbacks, verbose, num_workers,
                       shuffle, metrics, val_data, val_batch_size, **kwargs)
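# Hedged end-to-end sketch (not part of the source): fitting a CoxTime model on simulated
# data, then computing the Breslow baseline hazards from the training data stored by `fit`.
# Assumptions: pycox and torchtuples are installed; the data, network size, and number of
# epochs are purely illustrative.
import numpy as np
from pycox.models import CoxTime
from pycox.models.cox_time import MLPVanillaCoxTime

np.random.seed(0)
x_train = np.random.randn(200, 5).astype('float32')
durations = np.random.exponential(1., 200).astype('float32')
events = np.random.binomial(1, 0.7, 200).astype('float32')

labtrans = CoxTime.label_transform()
y_train = labtrans.fit_transform(durations, events)

net = MLPVanillaCoxTime(5, [16], False)
model = CoxTime(net)
model.fit(x_train, y_train, batch_size=64, epochs=1, verbose=False)

base_haz = model.compute_baseline_hazards()  # uses the stored training data
surv = model.predict_surv_df(x_train)        # survival curves indexed by duration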
def compute_baseline_hazards(self, input=None, target=None, max_duration=None, sample=None,
                             batch_size=8224, set_hazards=True, eval_=True, num_workers=0):
    if (input is None) and (target is None):
        if not hasattr(self, 'training_data'):
            raise ValueError("Need to fit, or supply an input and target to this function.")
        input, target = self.training_data
    df = self.target_to_df(target)
    if sample is not None:
        if sample >= 1:
            df = df.sample(n=sample)
        else:
            df = df.sample(frac=sample)
        df = df.sort_values(self.duration_col)
    input = tt.tuplefy(input).to_numpy().iloc[df.index.values]
    base_haz = self._compute_baseline_hazards(input, df, max_duration, batch_size, eval_,
                                              num_workers)
    if set_hazards:
        self.compute_baseline_cumulative_hazards(set_hazards=True, baseline_hazards_=base_haz)
    return base_haz
def setup(self):
    torch.manual_seed(1234)
    self.net = torch.nn.Linear(5, 3)
    self.prednet = _PredSigmoidNet(self.net)
    x = torch.randn(4, 5)
    self.data = x
    self.dataloader = tuplefy(x).make_dataloader(5, False)
    self.net_dropout = nn.Sequential(self.net, nn.Dropout(0.5))
def test_array_or_tensor_type_numpy(numpy, multi_in, multi_out, cl, dl, data_torch):
    input, tensor = make_data(multi_in, multi_out, cl, dl, data_torch)
    out = array_or_tensor(tensor, numpy, input)
    if multi_out is True:
        assert type(out) is tt.TupleTree
    else:
        assert type(out) in [np.ndarray, torch.Tensor]
    assert numpy is (tt.tuplefy(out).type() is np.ndarray)
def __init__(self, input, durations, events, n_control=1):
    df_train_target = pd.DataFrame(dict(duration=durations, event=events))
    self.durations = df_train_target.loc[lambda x: x['event'] == 1]['duration']
    self.at_risk_dict = make_at_risk_dict(durations)
    self.input = tt.tuplefy(input)
    assert type(self.durations) is pd.Series
    self.n_control = n_control
def __getitem__(self, index):
    if (not hasattr(index, '__iter__')) and (type(index) is not slice):
        index = [index]
    fails = self.durations.iloc[index]
    x_case = self.input.iloc[fails.index]
    control_idx = sample_alive_from_dates(fails.values, self.at_risk_dict, self.n_control)
    x_control = tt.TupleTree(self.input.iloc[idx] for idx in control_idx.transpose())
    return tt.tuplefy(x_case, x_control).to_tensor()
def array_or_tensor(tensor, numpy, input): """Returs a tensor if numpy is False or input is tensor. Else it returns numpy array. """ if numpy is False: return tensor if (numpy is True) or (tt.tuplefy(input).type() is np.ndarray): tensor = tensor.cpu().numpy() return tensor
def expg_at_time(t):
    t = np.repeat(t, n_cols).reshape(-1, 1).astype('float32')
    if tt.tuplefy(input).type() is torch.Tensor:
        t = torch.from_numpy(t)
    return np.exp(self.predict((input, t), batch_size, True, eval_,
                               num_workers=num_workers)).flatten()
def test_array_or_tensor_type_none(multi_in, multi_out, cl, dl, data_torch):
    numpy = None
    input, tensor = make_data(multi_in, multi_out, cl, dl, data_torch)
    out = array_or_tensor(tensor, numpy, input)
    if multi_out is True:
        assert type(out) is tt.TupleTree
    else:
        assert type(out) in [np.ndarray, torch.Tensor]
    correct_type = np.ndarray if (dl is True) or (data_torch is False) else torch.Tensor
    assert tt.tuplefy(out).type() is correct_type
def assert_survs(input, model, with_dl=True):
    preds = model.predict_surv(input)
    assert type(preds) is type(input)
    assert preds.shape[0] == input.shape[0]
    surv_df = model.predict_surv_df(input)
    assert type(surv_df) is pd.DataFrame
    assert type(surv_df.values) is np.ndarray
    assert preds.shape[0] == surv_df.shape[1]
    assert preds.shape[1] == surv_df.shape[0]
    np_input = tt.tuplefy(input).to_numpy()[0]
    torch_input = tt.tuplefy(input).to_tensor()[0]
    np_preds = model.predict_surv(np_input)
    torch_preds = model.predict_surv(torch_input)
    assert (np_preds == torch_preds.cpu().numpy()).all()
    if with_dl:
        dl_input = tt.tuplefy(input).make_dataloader(512, False)
        dl_preds = model.predict_surv(dl_input)
        assert type(np_preds) is type(dl_preds), f"got {type(np_preds)}, and, {type(dl_preds)}"
        assert (np_preds == dl_preds).all()
def test_cox_time_runs(numpy):
    input, target = make_dataset(False).apply(lambda x: x.float()).to_numpy()
    labtrans = CoxTime.label_transform()
    target = labtrans.fit_transform(*target)
    data = tt.tuplefy(input, target)
    if not numpy:
        data = data.to_tensor()
    net = MLPVanillaCoxTime(data[0].shape[1], [4], False)
    model = CoxTime(net)
    fit_model(data, model)
    model.compute_baseline_hazards()
    assert_survs(data[0], model)
def predict_survival_function(self, input, batch_size=8224, eval_=True, to_cpu=False,
                              num_workers=0):
    """Might need to set `to_cpu` to True if the dataset is too large."""
    pmf = self.predict_pmf(input, batch_size, eval_, to_cpu, num_workers, False)
    surv = 1 - pmf.cumsum(0)
    if tuplefy(input).type() is np.ndarray:
        surv = surv.cpu().numpy()
    return surv
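# Numeric sketch (illustrative only): the survival estimate above is one minus the running
# sum of the predicted PMF over the duration grid, i.e. S(t_k) = 1 - sum_{j <= k} pmf(t_j).
import numpy as np

pmf = np.array([[0.1, 0.2],
                [0.3, 0.1],
                [0.2, 0.3]])  # shape (n_durations, n_subjects)
surv = 1 - pmf.cumsum(0)      # column j holds the survival curve of subject j
print(surv)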
def simulate_from_weights(self, weights, surv_df=False):
    logit_haz = self.logit_haz(self.times[1:], *weights)
    durations = self.sample_event_times(logit_haz)  #.astype('float32')
    is_nan = np.isnan(durations)
    events = np.ones_like(durations)
    events[is_nan] = 0.
    durations[is_nan] = self.times[-1]
    covs = self.sample_covs(weights)
    covs = tt.tuplefy(covs).flatten()
    covs = np.concatenate(covs, axis=1)  #.astype('float32')
    surv = self.surv_df(logit_haz) if surv_df is True else None
    return dict(covs=covs, durations=durations, events=events, weights=weights, surv_df=surv)
def _process_for_pycox(self):
    def _get_data(df):
        return df[df.columns[2:]].values.astype('float32')

    def _get_target(df):
        return (df['time'].values.astype('float32'), df['event'].values.astype('float32'))

    x = {group: _get_data(self.data[group]) for group in self.data}
    y = {group: _get_target(self.data[group]) for group in self.data}
    val = tt.tuplefy(x['val'], y['val'])
    return x, y, val
def drop_outliers(self, min_time, max_time):
    # Select the outliers and reset self.x, t, d and n
    # Element-wise OR over the numpy arrays (a plain `or` would not broadcast).
    outlier_mask = (self.t > max_time) | (self.t < min_time)
    self.t = self.t[~outlier_mask]
    self.d = self.d[~outlier_mask]
    if isinstance(self.x, tuple):
        self.x = tt.tuplefy((self.x[0][~outlier_mask], self.x[1][~outlier_mask]))
    else:
        self.x = self.x[~outlier_mask]
    self.n = len(self.t)
    return None
def partial_log_likelihood(self, input, target, batch_size=8224, eval_=True, num_workers=0):
    def expg_sum(t, i):
        sub = input_sorted.iloc[i:]
        n = sub.lens().flatten().get_if_all_equal()
        t = np.repeat(t, n).reshape(-1, 1).astype('float32')
        return np.exp(self.predict((sub, t), batch_size, True, eval_,
                                   num_workers=num_workers)).flatten().sum()

    durations, events = target
    df = pd.DataFrame({self.duration_col: durations, self.event_col: events})
    df = df.sort_values(self.duration_col)
    input = tt.tuplefy(input)
    input_sorted = input.iloc[df.index.values]

    times = (df
             .assign(_idx=np.arange(len(df)))
             .loc[lambda x: x[self.event_col] == True]
             .drop_duplicates(self.duration_col, keep='first')
             .assign(_expg_sum=lambda x: [expg_sum(t, i)
                                          for t, i in zip(x[self.duration_col], x['_idx'])])
             .drop([self.event_col, '_idx'], axis=1))

    idx_name_old = df.index.name
    idx_name = '__' + idx_name_old if idx_name_old else '__index'
    df.index.name = idx_name

    pll = df.loc[lambda x: x[self.event_col] == True]
    input_event = input.iloc[pll.index.values]
    durations_event = pll[self.duration_col].values.reshape(-1, 1)
    g_preds = self.predict((input_event, durations_event), batch_size, True, eval_,
                           num_workers=num_workers).flatten()
    pll = (pll.assign(_g_preds=g_preds)
           .reset_index()
           .merge(times, on=self.duration_col)
           .set_index(idx_name)
           .assign(pll=lambda x: x['_g_preds'] - np.log(x['_expg_sum']))
           ['pll'])

    pll.index.name = idx_name_old
    return pll
def array_or_tensor(tensor, numpy, input): """Returs a tensor if numpy is False or input is tensor. Else it returns numpy array, even if input is a DataLoader. """ is_tensor = None if numpy is False: is_tensor = True elif (numpy is True) or is_dl(input): is_tensor = False elif not (is_data(input) or is_dl(input)): raise ValueError(f"Do not understand type of `input`: {type(input)}") elif tuplefy(input).type() is torch.Tensor: is_tensor = True elif tuplefy(input).type() is np.ndarray: is_tensor = False else: raise ValueError("Something wrong") if is_tensor: tensor = tuplefy(tensor).to_tensor().val_if_single() else: tensor = tuplefy(tensor).to_numpy().val_if_single() return tensor
def make_dataset(numpy):
    n_events = 2
    n_frac = 4
    m = 10
    n = m * n_frac * n_events
    p = 5
    input = torch.randn((n, p))
    durations = torch.arange(m).repeat(int(n / m))
    events = torch.arange(n_events).repeat(int(n / n_events)).float()
    target = (durations, events)
    data = tt.tuplefy(input, target)
    if numpy:
        data = data.to_numpy()
    return data
def __init__(self, in_features, num_nodes, out_features, batch_norm=True, dropout=None,
             activation=nn.ReLU, output_activation=None, output_bias=True,
             w_init_=lambda w: nn.init.kaiming_normal_(w, nonlinearity='relu')):
    super().__init__()
    num_nodes = tuplefy(in_features, num_nodes).flatten()
    if not hasattr(dropout, '__iter__'):
        dropout = [dropout for _ in range(len(num_nodes) - 1)]
    net = []
    for n_in, n_out, p in zip(num_nodes[:-1], num_nodes[1:], dropout):
        net.append(DenseVanillaBlock(n_in, n_out, True, batch_norm, p, activation, w_init_))
    net.append(nn.Linear(num_nodes[-1], out_features, output_bias))
    if output_activation:
        net.append(output_activation)
    self.net = nn.Sequential(*net)
    self.bn = torch.nn.BatchNorm1d(in_features)
def test_pmf_runs(numpy, num_durations):
    data = make_dataset(True)
    input, target = data
    labtrans = PMF.label_transform(num_durations)
    target = labtrans.fit_transform(*target)
    data = tt.tuplefy(input, target)
    if not numpy:
        data = data.to_tensor()
    net = tt.practical.MLPVanilla(input.shape[1], [4], labtrans.out_features)
    model = PMF(net)
    fit_model(data, model)
    assert_survs(input, model)
    model.duration_index = labtrans.cuts
    assert_survs(input, model)
    cdi = model.interpolate(3, 'const_pdf')
    assert_survs(input, cdi)