Example #1
 def make_dataloader_predict(self, input, batch_size, shuffle=False, num_workers=0):
     input, durations = input
     input = tt.tuplefy(input)
     durations = tt.tuplefy(durations)
     new_input = input + durations 
     dataloader = super().make_dataloader_predict(new_input, batch_size, shuffle, num_workers)
     return dataloader
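A hypothetical call of the override above (`model`, `x`, and `durations` are assumptions, not from the source):

dl = model.make_dataloader_predict((x, durations), batch_size=256)
# each batch now carries the durations alongside the covariates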
Example #2
 def _sorted_input_target(input, target):
     durations, _ = target  #.to_numpy()
     idx_sort = np.argsort(durations)
     if (idx_sort == np.arange(0, len(idx_sort))).all():
         return input, target
     input = tuplefy(input).iloc[idx_sort]
     target = tuplefy(target).iloc[idx_sort]
     return input, target
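A minimal sketch of the sorting behavior (assumes numpy and `tuplefy` from torchtuples are imported, as the helper requires):

import numpy as np

x = np.arange(4, dtype='float32').reshape(-1, 1)
durations = np.array([2., 0., 3., 1.])
x_sorted, (dur_sorted, _) = _sorted_input_target(x, (durations, np.ones(4)))
assert (np.diff(dur_sorted) >= 0).all()  # durations are now increasing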
Example #3
def assert_survs(input, model):
    preds = model.predict_surv(input)
    assert type(preds) is type(input)
    assert preds.shape[0] == input.shape[0]
    surv_df = model.predict_surv_df(input)
    assert type(surv_df) is pd.DataFrame
    assert type(surv_df.values) is np.ndarray
    assert preds.shape[0] == surv_df.shape[1]
    assert preds.shape[1] == surv_df.shape[0]
    np_input = tt.tuplefy(input).to_numpy()[0]
    torch_input = tt.tuplefy(input).to_tensor()[0]
    np_preds = model.predict_surv(np_input)
    torch_preds = model.predict_surv(torch_input)
    assert (np_preds == torch_preds.numpy()).all()
Example #4
 def __getitem__(self, index):
     if not hasattr(index, '__iter__'):
         index = [index]
     data_array = []
     for i in index:
         data_path = self.data_path[i]
         vs = pd.read_csv(data_path)
         vs = vs.drop(vs.columns[[0, 1, 2]], axis=1)
         data = np.array(vs).astype('float32')
         data = torch.from_numpy(data)
         data_array.append(data)
     data_array = torch.stack(data_array)
     target = tt.tuplefy(self.time[index], self.event[index]).to_tensor()
     return tt.tuplefy(data_array, target)
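Hypothetical indexing (assuming `ds` is an instance of the Dataset subclass above):

batch = ds[[0, 1]]             # a single int index is wrapped into a list
data, (time, event) = batch    # a TupleTree of torch tensors
assert data.shape[0] == time.shape[0] == 2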
Example #5
def make_data(multi_in=False, multi_out=False, cl=tuple, dl=False, data_torch=False):
    input = torch.randn(5)
    if multi_in:
        input = (input, (input, input))
        input = cl(input)
    tensor = torch.randn(5)
    if multi_out:
        tensor = ((tensor, tensor), tensor)
        tensor = cl(tensor)
    if data_torch is False:
        input, tensor = tt.tuplefy(input, tensor).to_numpy()
    if dl is True:
        input = tt.tuplefy(input).make_dataloader(10, False)
    return input, tensor
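For instance (a sketch; the defaults produce numpy data):

input, tensor = make_data(multi_in=True)     # input: (array, (array, array))
input, tensor = make_data(data_torch=True)   # both stay torch tensors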
Example #6
 def target_to_df(self, target):
     durations, events = tt.tuplefy(target).to_numpy()
     df = pd.DataFrame({
         self.duration_col: durations,
         self.event_col: events
     })
     return df
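A minimal sketch, assuming `model` exposes the method above with `duration_col='duration'` and `event_col='event'` (the column names are assumptions):

import numpy as np

df = model.target_to_df((np.array([1., 5., 3.]), np.array([1., 0., 1.])))
# df has one row per individual and columns ['duration', 'event']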
Example #7
    def _compute_baseline_hazards(self, input, df_train_target, max_duration,
                                  batch_size):
        if max_duration is None:
            max_duration = np.inf

        def compute_expg_at_risk(ix, t):
            sub = input.iloc[ix:]
            n = sub.lens().flatten().get_if_all_equal()
            t = np.repeat(t, n).reshape(-1, 1).astype('float32')
            return np.exp(self.predict((sub, t), batch_size)).flatten().sum()

        if not df_train_target[self.duration_col].is_monotonic_increasing:
            raise RuntimeError(
                f"Need 'df_train_target' to be sorted by {self.duration_col}")
        input = tuplefy(input)
        df = df_train_target.reset_index(drop=True)
        times = (df.loc[lambda x: x[self.event_col] != 0][self.duration_col].
                 loc[lambda x: x <= max_duration].drop_duplicates(
                     keep='first'))
        at_risk_sum = (pd.Series(
            [compute_expg_at_risk(ix, t) for ix, t in times.items()],  # .iteritems() was removed in pandas 2.0
            index=times.values).rename('at_risk_sum'))
        events = (df.groupby(self.duration_col)[[
            self.event_col
        ]].agg('sum').loc[lambda x: x.index <= max_duration])
        base_haz = (events.join(
            at_risk_sum, how='left',
            sort=True).pipe(lambda x: x[self.event_col] / x['at_risk_sum']).
                    fillna(0.).rename('baseline_hazards'))
        return base_haz
Example #8
def label_transfer(times, events):
    # times = [period if x == -1 else x for x in times]
    # times = [period if x > period else x for x in times]
    # events = [True if x == 'abnormal' else x for x in events]
    # events = [False if x == 'normal' else x for x in events]
    labels = tt.tuplefy(np.array(times), np.array(events))
    return labels
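A small usage sketch (the raw label values are hypothetical):

labels = label_transfer([12, 30, 7], [1, 0, 1])
# labels is a TupleTree: (np.array of times, np.array of events)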
Example #9
 def __getitem__(self, index):
     batch = super().__getitem__(index)
     input, (duration, event) = batch
     idx_sort = duration.sort(descending=True)[1]
     event = event.float()
     batch = tuplefy(input, event).iloc[idx_sort]
     return batch
Example #10
    def compute_baseline_hazards(self, input=None, target=None, max_duration=None, sample=None, batch_size=8224,
                                set_hazards=True, eval_=True, num_workers=0):
        """Computes the Breslow estimates form the data defined by `input` and `target`
        (if `None` use training data).

        Typically call
        model.compute_baseline_hazards() after fitting.

        Keyword Arguments:
            input  -- Input data (train input) (default: {None})
            target  -- Target data (train target) (default: {None})
            max_duration {float} -- Don't compute estimates for duration higher (default: {None})
            sample {float or int} -- Compute estimates of subsample of data (default: {None})
            batch_size {int} -- Batch size (default: {8224})
            set_hazards {bool} -- Set hazards in model object, or just return hazards. (default: {True})

        Returns:
            pd.Series -- Pandas series with baseline hazards. Index is duration_col.
        """
        if (input is None) and (target is None):
            if not hasattr(self, 'training_data'):
                raise ValueError("Need to give a 'input' and 'target' to this function.")
            input, target = self.training_data
        df = self.target_to_df(target)#.sort_values(self.duration_col)
        if sample is not None:
            if sample >= 1:
                df = df.sample(n=sample)
            else:
                df = df.sample(frac=sample)
        input = tt.tuplefy(input).to_numpy().iloc[df.index.values]
        base_haz = self._compute_baseline_hazards(input, df, max_duration, batch_size,
                                                  eval_=eval_, num_workers=num_workers)
        if set_hazards:
            self.compute_baseline_cumulative_hazards(set_hazards=True, baseline_hazards_=base_haz)
        return base_haz
Example #11
    def fit(self, input, target, batch_size=256, epochs=1, callbacks=None, verbose=True,
            num_workers=0, shuffle=True, metrics=None, val_data=None, val_batch_size=8224,
            **kwargs):
        """Fit  model with inputs and targets. Where 'input' is the covariates, and
        'target' is a tuple with (durations, events).

        Arguments:
            input {np.array, tensor or tuple} -- Input x passed to net.
            target {np.array, tensor or tuple} -- Target [durations, events].

        Keyword Arguments:
            batch_size {int} -- Elements in each batch (default: {256})
            epochs {int} -- Number of epochs (default: {1})
            callbacks {list} -- list of callbacks (default: {None})
            verbose {bool} -- Print progress (default: {True})
            num_workers {int} -- Number of workers used in the dataloader (default: {0})
            shuffle {bool} -- If we should shuffle the order of the dataset (default: {True})
            **kwargs are passed to 'make_dataloader' method.

        Returns:
            TrainingLogger -- Training log
        """
        self.training_data = tt.tuplefy(input, target)
        return super().fit(input, target, batch_size, epochs, callbacks, verbose,
                           num_workers, shuffle, metrics, val_data, val_batch_size,
                           **kwargs)
Example #12
 def compute_baseline_hazards(self,
                              input=None,
                              target=None,
                              max_duration=None,
                              sample=None,
                              batch_size=8224,
                              set_hazards=True,
                              eval_=True,
                              num_workers=0):
     if (input is None) and (target is None):
         if not hasattr(self, 'training_data'):
             raise ValueError(
                  'Need to fit, or supply an input and target to this function.'
             )
         input, target = self.training_data
     df = self.target_to_df(target)
     if sample is not None:
         if sample >= 1:
             df = df.sample(n=sample)
         else:
             df = df.sample(frac=sample)
         df = df.sort_values(self.duration_col)
     input = tt.tuplefy(input).to_numpy().iloc[df.index.values]
     base_haz = self._compute_baseline_hazards(input, df, max_duration,
                                               batch_size, eval_,
                                               num_workers)
     if set_hazards:
         self.compute_baseline_cumulative_hazards(
             set_hazards=True, baseline_hazards_=base_haz)
     return base_haz
Example #13
 def setup(self):
     torch.manual_seed(1234)
     self.net = torch.nn.Linear(5, 3)
     self.prednet = _PredSigmoidNet(self.net)
     x = torch.randn(4, 5)
     self.data = x
     self.dataloader = tuplefy(x).make_dataloader(5, False)
     self.net_dropout = nn.Sequential(self.net, nn.Dropout(0.5))
Example #14
def test_array_or_tensor_type_numpy(numpy, multi_in, multi_out, cl, dl, data_torch):
    input, tensor = make_data(multi_in, multi_out, cl, dl, data_torch)
    out = array_or_tensor(tensor, numpy, input)
    if multi_out is True:
        assert type(out) is tt.TupleTree
    else:
        assert type(out) in [np.ndarray, torch.Tensor]
    assert numpy is (tt.tuplefy(out).type() is np.ndarray)
Example #15
    def __init__(self, input, durations, events, n_control=1):
        df_train_target = pd.DataFrame(dict(duration=durations, event=events))
        self.durations = df_train_target.loc[lambda x: x['event'] == 1]['duration']
        self.at_risk_dict = make_at_risk_dict(durations)

        self.input = tt.tuplefy(input)
        assert type(self.durations) is pd.Series
        self.n_control = n_control
Example #16
 def __getitem__(self, index):
     if (not hasattr(index, '__iter__')) and (type(index) is not slice):
         index = [index]
     fails = self.durations.iloc[index]
     x_case = self.input.iloc[fails.index]
     control_idx = sample_alive_from_dates(fails.values, self.at_risk_dict, self.n_control)
     x_control = tt.TupleTree(self.input.iloc[idx] for idx in control_idx.transpose())
     return tt.tuplefy(x_case, x_control).to_tensor()
Example #17
def array_or_tensor(tensor, numpy, input):
    """Returs a tensor if numpy is False or input is tensor.
    Else it returns numpy array.
    """
    if numpy is False:
        return tensor
    if (numpy is True) or (tt.tuplefy(input).type() is np.ndarray):
        tensor = tensor.cpu().numpy()
    return tensor
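A quick check of the dispatch rules (sketch; assumes numpy, torch, and torchtuples as tt are imported):

t = torch.randn(3)
assert type(array_or_tensor(t, numpy=False, input=np.zeros(3))) is torch.Tensor
assert type(array_or_tensor(t, numpy=True, input=t)) is np.ndarray
assert type(array_or_tensor(t, numpy=None, input=np.zeros(3))) is np.ndarray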
Example #18
 def expg_at_time(t):
     t = np.repeat(t, n_cols).reshape(-1, 1).astype('float32')
     if tt.tuplefy(input).type() is torch.Tensor:
         t = torch.from_numpy(t)
     return np.exp(
         self.predict((input, t),
                      batch_size,
                      True,
                      eval_,
                      num_workers=num_workers)).flatten()
Example #19
def test_array_or_tensor_type_none(multi_in, multi_out, cl, dl, data_torch):
    numpy = None
    input, tensor = make_data(multi_in, multi_out, cl, dl, data_torch)
    out = array_or_tensor(tensor, numpy, input)
    if multi_out is True:
        assert type(out) is tt.TupleTree
    else:
        assert type(out) in [np.ndarray, torch.Tensor]
    correct_type = np.ndarray if (dl is True) or (data_torch is False) else torch.Tensor
    assert tt.tuplefy(out).type() is correct_type
Example #20
def assert_survs(input, model, with_dl=True):
    preds = model.predict_surv(input)
    assert type(preds) is type(input)
    assert preds.shape[0] == input.shape[0]
    surv_df = model.predict_surv_df(input)
    assert type(surv_df) is pd.DataFrame
    assert type(surv_df.values) is np.ndarray
    assert preds.shape[0] == surv_df.shape[1]
    assert preds.shape[1] == surv_df.shape[0]
    np_input = tt.tuplefy(input).to_numpy()[0]
    torch_input = tt.tuplefy(input).to_tensor()[0]
    np_preds = model.predict_surv(np_input)
    torch_preds = model.predict_surv(torch_input)
    assert (np_preds == torch_preds.cpu().numpy()).all()
    if with_dl:
        dl_input = tt.tuplefy(input).make_dataloader(512, False)
        dl_preds = model.predict_surv(dl_input)
        assert type(np_preds) is type(dl_preds), f"got {type(np_preds)}, and, {type(dl_preds)}"
        assert (np_preds == dl_preds).all()
Example #21
def test_cox_time_runs(numpy):
    input, target = make_dataset(False).apply(lambda x: x.float()).to_numpy()
    labtrans = CoxTime.label_transform()
    target = labtrans.fit_transform(*target)
    data = tt.tuplefy(input, target)
    if not numpy:
        data = data.to_tensor()
    net = MLPVanillaCoxTime(data[0].shape[1], [4], False)
    model = CoxTime(net)
    fit_model(data, model)
    model.compute_baseline_hazards()
    assert_survs(data[0], model)
Example #22
 def predict_survival_function(self,
                               input,
                               batch_size=8224,
                               eval_=True,
                               to_cpu=False,
                               num_workers=0):
     """Might need to set to_cpu to true if too large dataset."""
     pmf = self.predict_pmf(input, batch_size, eval_, to_cpu, num_workers,
                            False)
     surv = 1 - pmf.cumsum(0)
     if tuplefy(input).type() is np.ndarray:
         surv = surv.cpu().numpy()
     return surv
Example #23
 def simulate_from_weights(self, weights, surv_df=False):
     logit_haz = self.logit_haz(self.times[1:], *weights)
     durations = self.sample_event_times(logit_haz)#.astype('float32')
     is_nan = np.isnan(durations)
     events = np.ones_like(durations)
     events[is_nan] = 0.
     durations[is_nan] = self.times[-1]
     covs = self.sample_covs(weights)
     covs = tt.tuplefy(covs).flatten()
     covs = np.concatenate(covs, axis=1)#.astype('float32')
     surv = self.surv_df(logit_haz) if surv_df is True else None
     return dict(covs=covs, durations=durations, events=events, weights=weights,
                 surv_df=surv)
Example #24
    def _process_for_pycox(self):
        def _get_data(df):
            return df[df.columns[2:]].values.astype('float32')

        def _get_target(df):
            return (df['time'].values.astype('float32'),
                    df['event'].values.astype('float32'))

        x = {group: _get_data(self.data[group]) for group in self.data}
        y = {group: _get_target(self.data[group]) for group in self.data}
        val = tt.tuplefy(x['val'], y['val'])

        return x, y, val
Example #25
    def drop_outliers(self, min_time, max_time):

        # Select the outliers and reset self.x, t, d and n
        outlier_mask = (self.t > max_time) | (self.t < min_time)  # elementwise OR; `or` fails on arrays
        self.t = self.t[~outlier_mask]
        self.d = self.d[~outlier_mask]
        if isinstance(self.x, tuple):
            self.x = tt.tuplefy((self.x[0][~outlier_mask],self.x[1][~outlier_mask]))
        else:
            self.x = self.x[~outlier_mask]
        self.n = len(self.t)

        return None
Example #26
    def partial_log_likelihood(self,
                               input,
                               target,
                               batch_size=8224,
                               eval_=True,
                               num_workers=0):
        def expg_sum(t, i):
            sub = input_sorted.iloc[i:]
            n = sub.lens().flatten().get_if_all_equal()
            t = np.repeat(t, n).reshape(-1, 1).astype('float32')
            return np.exp(
                self.predict((sub, t),
                             batch_size,
                             True,
                             eval_,
                             num_workers=num_workers)).flatten().sum()

        durations, events = target
        df = pd.DataFrame({
            self.duration_col: durations,
            self.event_col: events
        })
        df = df.sort_values(self.duration_col)
        input = tt.tuplefy(input)
        input_sorted = input.iloc[df.index.values]

        times = (df.assign(
            _idx=np.arange(len(df))
        ).loc[lambda x: x[self.event_col] == True].drop_duplicates(
            self.duration_col, keep='first').assign(_expg_sum=lambda x: [
                expg_sum(t, i) for t, i in zip(x[self.duration_col], x['_idx'])
            ]).drop([self.event_col, '_idx'], axis=1))

        idx_name_old = df.index.name
        idx_name = '__' + idx_name_old if idx_name_old else '__index'
        df.index.name = idx_name

        pll = df.loc[lambda x: x[self.event_col] == True]
        input_event = input.iloc[pll.index.values]
        durations_event = pll[self.duration_col].values.reshape(-1, 1)
        g_preds = self.predict((input_event, durations_event),
                               batch_size,
                               True,
                               eval_,
                               num_workers=num_workers).flatten()
        pll = (pll.assign(_g_preds=g_preds).reset_index().merge(
            times, on=self.duration_col).set_index(idx_name).assign(
                pll=lambda x: x['_g_preds'] - np.log(x['_expg_sum']))['pll'])

        pll.index.name = idx_name_old
        return pll
Example #27
def array_or_tensor(tensor, numpy, input):
    """Returs a tensor if numpy is False or input is tensor.
    Else it returns numpy array, even if input is a DataLoader.
    """
    is_tensor = None
    if numpy is False:
        is_tensor = True
    elif (numpy is True) or is_dl(input):
        is_tensor = False
    elif not (is_data(input) or is_dl(input)):
        raise ValueError(f"Do not understand type of `input`: {type(input)}")
    elif tuplefy(input).type() is torch.Tensor:
        is_tensor = True
    elif tuplefy(input).type() is np.ndarray:
        is_tensor = False
    else:
        raise ValueError("Something wrong")

    if is_tensor:
        tensor = tuplefy(tensor).to_tensor().val_if_single()
    else:
        tensor = tuplefy(tensor).to_numpy().val_if_single()
    return tensor
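The DataLoader branch can be sketched like this (assumes the function above plus `tuplefy` and torch are in scope):

dl = tuplefy(torch.randn(6, 2)).make_dataloader(3, False)
out = array_or_tensor(torch.randn(6), None, dl)
assert type(out) is np.ndarray   # DataLoader input resolves to numpy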
Example #28
def make_dataset(numpy):
    n_events = 2
    n_frac = 4
    m = 10
    n = m * n_frac * n_events
    p = 5
    input = torch.randn((n, p))
    durations = torch.arange(m).repeat(int(n / m))
    events = torch.arange(n_events).repeat(int(n / n_events)).float()
    target = (durations, events)
    data = tt.tuplefy(input, target)
    if numpy:
        data = data.to_numpy()
    return data
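The shapes follow directly from the constants above:

input, (durations, events) = make_dataset(numpy=True)
assert input.shape == (80, 5)              # n = 10 * 4 * 2
assert durations.shape == events.shape == (80,)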
Example #29
 def __init__(self, in_features, num_nodes, out_features, batch_norm=True, dropout=None, activation=nn.ReLU,
              output_activation=None, output_bias=True,
              w_init_=lambda w: nn.init.kaiming_normal_(w, nonlinearity='relu')):
     super().__init__()
     num_nodes = tuplefy(in_features, num_nodes).flatten()
     if not hasattr(dropout, '__iter__'):
         dropout = [dropout for _ in range(len(num_nodes)-1)]
     net = []
     for n_in, n_out, p in zip(num_nodes[:-1], num_nodes[1:], dropout):
         net.append(DenseVanillaBlock(n_in, n_out, True, batch_norm, p, activation, w_init_))
     net.append(nn.Linear(num_nodes[-1], out_features, output_bias))
     if output_activation:
         net.append(output_activation)
     self.net = nn.Sequential(*net)
     self.bn = torch.nn.BatchNorm1d(in_features)
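A hypothetical instantiation, assuming the `__init__` above belongs to an nn.Module subclass named `MLP` (the class name is an assumption) and that `DenseVanillaBlock` is in scope from torchtuples.practical:

net = MLP(in_features=5, num_nodes=[16, 16], out_features=1, dropout=0.1)
out = net.net(torch.randn(4, 5))   # forward through the stack: shape (4, 1)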
Example #30
def test_pmf_runs(numpy, num_durations):
    data = make_dataset(True)
    input, target = data
    labtrans = PMF.label_transform(num_durations)
    target = labtrans.fit_transform(*target)
    data = tt.tuplefy(input, target)
    if not numpy:
        data = data.to_tensor()
    net = tt.practical.MLPVanilla(input.shape[1], [4], labtrans.out_features)
    model = PMF(net)
    fit_model(data, model)
    assert_survs(input, model)
    model.duration_index = labtrans.cuts
    assert_survs(input, model)
    cdi = model.interpolate(3, 'const_pdf')
    assert_survs(input, cdi)