Exemple #1
0
def Coxnnet_pipeline(mod1, mod2, x1, x2, y1, y2, hyperparameters, save, path):
    """Train a Cox-nnet survival model and optionally save its weights.

    Args:
        mod1: name of the first data modality (bookkeeping only).
        mod2: name of a second modality, or the string 'None' for a
            single-modality run (only 'None' is supported).
        x1, x2: feature arrays per modality (x2 unused when mod2 is 'None').
        y1, y2: survival targets (durations, events) as expected by pycox.
        hyperparameters: dict with keys 'Dense size', 'Dropout', 'batch_size',
            'Epoch' and 'Learning rate'.
        save: when truthy, write the trained network weights to disk.
        path: filename stem for the saved state dict.

    Returns:
        (model, log): the fitted pycox CoxPH wrapper and its training log.

    Raises:
        NotImplementedError: when mod2 != 'None'. The original code silently
            fell through (``pass``) and crashed later with a NameError on
            ``net``; this raises a clear error up front instead.
    """
    dense_size = hyperparameters['Dense size']  # nodes per dense layer
    dropout_p = hyperparameters['Dropout']

    in_features_one = x1.shape[1]
    if mod2 != 'None':
        raise NotImplementedError(
            "Coxnnet_pipeline only supports a single modality (mod2 == 'None')")
    net = Coxnnet(in_features_one, dense_size, dropout_p).to(device)
    net.train()
    model = CoxPH(net, tt.optim.SGD)

    batch_size = hyperparameters['batch_size']
    epochs = hyperparameters['Epoch']
    verbose = True
    lr = hyperparameters['Learning rate']
    model.optimizer.set_lr(lr)

    # Single-modality fit; (y1, y2) is the (durations, events) target pair.
    log = model.fit(x1, (y1, y2), batch_size, epochs, verbose=verbose)
    net.eval()
    if save:
        # NOTE(review): the original also built an elaborate, unused PATH
        # string from the hyperparameters; the actual save location has
        # always been SAVE_FOLDER + path + ".pt", kept unchanged here.
        torch.save(net.state_dict(), SAVE_FOLDER + path + ".pt")
    return model, log
Exemple #2
0
    def fit(self, X, y, column_names):
        """Fit the model on (X, y), holding out 20% of rows for validation."""
        self.column_names = column_names
        frame = self._format_to_pycox(X, y, self.column_names)
        holdout = frame.sample(frac=0.2)
        remainder = frame.drop(holdout.index)
        train_x, train_y = self._standardize_df(remainder, "train")
        val_x, val_y = self._standardize_df(holdout, "val")

        # Build the MLP now that the input dimensionality is known.
        self.in_features = train_x.shape[1]
        mlp = tt.practical.MLPVanilla(in_features=self.in_features,
                                      num_nodes=self.num_nodes,
                                      out_features=self.out_features,
                                      batch_norm=self.batch_norm,
                                      dropout=self.dropout,
                                      activation=self.activation,
                                      output_bias=self.output_bias)
        self.model = CoxPH(
            mlp, tt.optim.Adam(lr=self.lr, weight_decay=self.weight_decay))

        # A trailing batch of size 1 breaks batch norm; grow the batch size
        # until no remainder batch of one sample can occur.
        n_train = train_x.shape[0]
        while n_train % self.batch_size == 1:
            self.batch_size += 1

        self.model.fit(train_x,
                       train_y,
                       self.batch_size,
                       self.epochs,
                       self.callbacks,
                       verbose=True,
                       val_data=(val_x, val_y),
                       val_batch_size=self.batch_size,
                       num_workers=self.num_workers)
        self.model.compute_baseline_hazards()
Exemple #3
0
def test_cox_cc_runs(numpy):
    """Smoke test: CoxPH fits, computes baseline hazards, and predicts."""
    data = make_dataset(False).apply(lambda col: col.float()).to_numpy()
    if not numpy:
        data = data.to_tensor()
    n_features = data[0].shape[1]
    net = tt.practical.MLPVanilla(n_features, [4], 1, False, output_bias=False)
    model = CoxPH(net)
    fit_model(data, model)
    model.compute_baseline_hazards()
    assert_survs(data[0], model)
Exemple #4
0
def VAESurv_pipeline(mod1, mod2, x1, x2, y1, y2, hyperparameters, save, path):
    """Fine-tune the survival head of a pre-trained VAESurv model.

    Loads VAE weights from 'State file path', freezes every parameter except
    those of ``surv_net``, then trains a CoxPH head on the two modalities.

    Args:
        mod1, mod2: modality names; mod2 == 'None' means single-modality mode
            (not supported, see Raises).
        x1, x2: input feature arrays per modality.
        y1, y2: survival targets (durations, events) as expected by pycox.
        hyperparameters: dict with 'D dims', 'Dense size', 'Latent size',
            'Neuron size', 'Dropout', 'State file path', 'batch_size',
            'Epoch', 'Learning rate' and 'L2 reg'.
        save: when truthy, save the fine-tuned weights under SAVE_FOLDER.
        path: filename suffix for the saved state dict.

    Returns:
        (model, log): the fitted pycox CoxPH wrapper and its training log.

    Raises:
        NotImplementedError: when mod2 == 'None'. The original code left
            ``net`` undefined on that path and crashed with a NameError;
            this raises a clear error up front instead.
    """
    d_dims = hyperparameters['D dims']
    dense_size = hyperparameters['Dense size']  # nodes in dense layers
    latent_size = hyperparameters['Latent size']  # dimensionality of encoding
    neuron_size = hyperparameters['Neuron size']  # survival-network width
    dropout_p = hyperparameters['Dropout']

    in_features_one = x1.shape[1]
    if mod2 == 'None':
        raise NotImplementedError(
            "VAESurv_pipeline requires two modalities (mod2 != 'None')")
    in_features_two = x2.shape[1]
    net = VAESurv(in_features_one, in_features_two, d_dims, dense_size,
                  latent_size, dropout_p, neuron_size, device).to(device)

    # Load pre-trained VAE weights; strict=False because the survival head
    # is not present in the saved state dict.
    PATH = hyperparameters['State file path'] + mod1 + '+' + mod2 + '.pt'
    net.load_state_dict(torch.load(PATH), strict=False)
    net.eval()

    # Freeze everything except the survival head so only it is fine-tuned.
    for name, param in net.named_parameters():
        if not ('surv_net' in name):
            param.requires_grad = False

    net.train()
    model = CoxPH(net, tt.optim.Adam)

    batch_size = hyperparameters['batch_size']
    epochs = hyperparameters['Epoch']
    verbose = True
    model.optimizer.set_lr(hyperparameters['Learning rate'])
    model.optimizer.set('weight_decay', hyperparameters['L2 reg'])
    log = model.fit((x1, x2), (y1, y2), batch_size, epochs, verbose=verbose)
    net.eval()
    if save:
        PATH = SAVE_FOLDER + "VAESurv_" + path
        torch.save(net.state_dict(), PATH)
    return model, log
Exemple #5
0
def load_model(model_file, data, clinical, surv_time, edge_index):
    """Load a saved CoxPH/MyNet model; return survival predictions + features.

    Args:
        model_file: path to the saved pycox network.
        data: numpy input array fed to the network.
        clinical, surv_time: accepted for interface compatibility; unused here.
        edge_index: graph edge index used to construct MyNet.

    Returns:
        (prediction, fs): predicted survival DataFrame and extracted features.
    """
    network = MyNet(edge_index).to(device)
    model = CoxPH(network, tt.optim.Adam(0.0001))
    model.load_net(model_file)
    prediction = model.predict_surv_df(data)
    fs = features(model.net, torch.from_numpy(data).to(device))
    return prediction, fs
Exemple #6
0
    def _model_factory(self,
                       n_trees=None,
                       n_input_features=None,
                       n_neurons=None):
        """Build an untrained survival model for ``self.algorithm``.

        Args:
            n_trees: number of trees (RSF only).
            n_input_features: input dimensionality (pycox networks only).
            n_neurons: hidden-layer sizes (pycox networks only).

        Returns:
            A ready-to-fit model instance.

        Raises:
            Exception: if the algorithm name is unrecognized, or if a method
                listed in ``self._pycox_methods`` has no branch here (the
                original code silently returned None in that case).
        """
        if self.algorithm == 'CPH':
            return CoxPHFitter()
        if self.algorithm == 'RSF':
            return RandomSurvivalForestModel(num_trees=n_trees)
        if self.algorithm in self._pycox_methods:
            net_args = {
                'in_features': n_input_features,
                'num_nodes': n_neurons,
                'batch_norm': True,
                'dropout': 0.1,
            }

            if self.algorithm == 'DeepSurv':
                net = tt.practical.MLPVanilla(out_features=1,
                                              output_bias=False,
                                              **net_args)
                return CoxPH(net, tt.optim.Adam)
            if self.algorithm == 'CoxTime':
                net = MLPVanillaCoxTime(**net_args)
                return CoxTime(net, tt.optim.Adam)
            if self.algorithm in self._discrete_time_methods:
                # All discrete-time methods share the same equidistant grid.
                num_durations = 30
                print(f'   {num_durations} equidistant intervals')
            if self.algorithm == 'DeepHit':
                labtrans = DeepHitSingle.label_transform(num_durations)
                net = self._get_discrete_time_net(labtrans, net_args)
                return DeepHitSingle(net,
                                     tt.optim.Adam,
                                     alpha=0.2,
                                     sigma=0.1,
                                     duration_index=labtrans.cuts)
            if self.algorithm == 'MTLR':
                labtrans = MTLR.label_transform(num_durations)
                net = self._get_discrete_time_net(labtrans, net_args)
                return MTLR(net, tt.optim.Adam, duration_index=labtrans.cuts)
            if self.algorithm == 'Nnet-survival':
                labtrans = LogisticHazard.label_transform(num_durations)
                net = self._get_discrete_time_net(labtrans, net_args)
                return LogisticHazard(net,
                                      tt.optim.Adam(0.01),
                                      duration_index=labtrans.cuts)
            # Previously fell through and returned None; fail loudly instead.
            raise Exception(f'Unhandled pycox method: {self.algorithm}')
        raise Exception('Unrecognized model.')
Exemple #7
0
def train_LSTMCox(data_df, r_splits):
  """Cross-validated training and evaluation of an LSTM-based Cox model.

  For each split, trains an LSTMCox net under a pycox CoxPH wrapper and
  collects concordance indices and time-dependent AUCs.

  Args:
    data_df: full dataset frame consumed by the data-prep helpers.
    r_splits: list of per-iteration index splits; r_splits[i] holds
      (test, val, train) index collections (passed in reverse order to
      prepare_datasets, matching its signature).

  Returns:
    (c_index_at, c_index_30, time_auc_30, time_auc_60, time_auc_365):
    per-iteration metric lists.
  """
  epochs = 100

  c_index_at = []
  c_index_30 = []

  # Per-horizon time-dependent AUCs, keyed by day. Replaces the original
  # eval("time_auc_" + str(time_x)) lookup, which was fragile and unsafe.
  time_aucs = {30: [], 60: [], 365: []}

  for i in range(len(r_splits)):
    print("\nIteration %s"%(i))

    #DATA PREP
    df_train, df_val, df_test, df_test_30 = prepare_datasets(data_df, r_splits[i][2], r_splits[i][1], r_splits[i][0])

    x_train = np.array(df_train["x0"].tolist()).astype("float32")
    x_val = np.array(df_val["x0"].tolist()).astype("float32")
    x_test = np.array(df_test["x0"].tolist()).astype("float32")
    x_test_30 = np.array(df_test_30["x0"].tolist()).astype("float32")

    labtrans = CoxTime.label_transform()
    get_target = lambda df: (df['duration'].values, df['event'].values)
    y_train = labtrans.fit_transform(*get_target(df_train))
    y_val = labtrans.transform(*get_target(df_val))

    durations_test, events_test = get_target(df_test)
    durations_test_30, events_test_30 = get_target(df_test_30)
    val = tt.tuplefy(x_val, y_val)

    (train_x, train_y), (val_x, val_y), (test_x, test_y), _ = df2array(data_df, df_train, df_val, df_test, df_test_30)

    #MODEL
    callbacks = [tt.callbacks.EarlyStopping()]

    net = LSTMCox(768, 32, 1, 1)

    model = CoxPH(net, tt.optim.Adam)
    model.optimizer.set_lr(0.0001)

    # Avoid a trailing batch of size 1 (problematic for batch norm).
    if x_train.shape[0] % 2:
      batch_size = 255
    else:
      batch_size = 256

    log = model.fit(x_train, y_train, batch_size, epochs, callbacks, val_data=val, val_batch_size=batch_size)

    model.compute_baseline_hazards()

    surv = model.predict_surv_df(x_test)
    ev = EvalSurv(surv, durations_test, events_test, censor_surv='km')
    c_index_at.append(ev.concordance_td())

    surv_30 = model.predict_surv_df(x_test_30)
    ev_30 = EvalSurv(surv_30, durations_test_30, events_test_30, censor_surv='km')
    c_index_30.append(ev_30.concordance_td())

    for time_x in (30, 60, 365):
      va_auc, va_mean_auc = cumulative_dynamic_auc(train_y, test_y, model.predict(x_test).flatten(), time_x)
      time_aucs[time_x].append(va_auc[0])

    print("C-index_30:", c_index_30[i])
    print("C-index_AT:", c_index_at[i])

    print("time_auc_30", time_aucs[30][i])
    print("time_auc_60", time_aucs[60][i])
    print("time_auc_365", time_aucs[365][i])

  return c_index_at, c_index_30, time_aucs[30], time_aucs[60], time_aucs[365]
Exemple #8
0
# Seed NumPy so any stochastic steps below are reproducible.
np.random.seed(method_random_seed)

# MLP hyperparameters for the DeepSurv-style network.
batch_norm = True
dropout = 0.0
output_bias = False

# n_layers hidden layers of n_nodes each; single risk-score output (1).
net = tt.practical.MLPVanilla(X_train_std.shape[1],
                              [n_nodes for layer_idx in range(n_layers)],
                              1,
                              batch_norm,
                              dropout,
                              output_bias=output_bias)

optimizer = tt.optim.Adam(lr=lr)

surv_model = CoxPH(net, optimizer)

# Path of the previously-trained model; filename encodes the experiment
# configuration (dataset, batch size, epochs, layers, nodes, learning rate).
model_filename = \
    os.path.join(output_dir, 'models',
                 '%s_%s_exp%d_%s_bs%d_nep%d_nla%d_nno%d_lr%f_test.pt'
                 % (survival_estimator_name, dataset, experiment_idx,
                    val_string, batch_size, n_epochs, n_layers, n_nodes, lr))
# NOTE(review): assert is stripped under `python -O`; an explicit check with
# a clear error message would be more robust here.
assert os.path.isfile(model_filename)
print('*** Loading ***', flush=True)
surv_model.load_net(model_filename)

# Predict survival curves for the test set, then transpose so each row
# corresponds to one test subject (presumably — confirm against pycox docs).
surv_df = surv_model.predict_surv_df(X_test_std)
surv = surv_df.to_numpy().T

print()
print('[Test data statistics]')
def _train_dcph(x, t, e, folds):
  """Train one deep Cox (DeepSurv, Faraggi-Simon) model per CV fold.

  Args:
    x: numpy array of input features (training data).
    t: numpy vector of event times (training data).
    e: numpy vector of event indicators (1 if the event occurred, 0 otherwise).
    folds: vector assigning each training row to a cross-validation fold.

  Returns:
    Dict mapping each fold id to its trained pycox CoxPH model.
  """
  in_features = x.shape[1]
  num_nodes = [100, 100]
  out_features = 1
  batch_norm = False
  dropout = 0.0
  output_bias = False

  fold_model = {}

  for fold in set(folds):
    keep = folds != fold
    x_fold = x[keep]
    t_fold = t[keep]
    e_fold = e[keep]

    # Hold out 15% of this fold's training rows for validation.
    n_fold = len(x_fold)
    chosen = sorted(
        np.random.choice(n_fold, size=(int(0.15 * n_fold)), replace=False))
    is_val = np.array([False] * n_fold)
    is_val[chosen] = True

    net = ttup.practical.MLPVanilla(
        in_features,
        num_nodes,
        out_features,
        batch_norm,
        dropout,
        output_bias=output_bias).double()

    model = CoxPH(net, torch.optim.Adam)

    train_target = (t_fold[~is_val], e_fold[~is_val])
    val_target = (t_fold[is_val], e_fold[is_val])
    val = x_fold[is_val], val_target

    batch_size = 256
    model.optimizer.set_lr(0.001)
    epochs = 20
    callbacks = [ttup.callbacks.EarlyStopping()]

    model.fit(
        x_fold[~is_val],
        train_target,
        batch_size,
        epochs,
        callbacks,
        True,
        val_data=val,
        val_batch_size=batch_size)
    model.compute_baseline_hazards()

    fold_model[fold] = model

  return fold_model
Exemple #10
0
                    np.random.seed(method_random_seed)

                    batch_norm = True
                    dropout = 0.
                    output_bias = False

                    optimizer = tt.optim.Adam(lr=lr)
                    net = tt.practical.MLPVanilla(
                        fold_X_train_std.shape[1],
                        [n_nodes for layer_idx in range(n_layers)],
                        1,
                        batch_norm,
                        dropout,
                        output_bias=output_bias)

                    surv_model = CoxPH(net, optimizer)

                    model_filename = \
                        os.path.join(output_dir, 'models',
                                     '%s_%s_exp%d_bs%d_nep%d_nla%d_nno%d_'
                                     % (survival_estimator_name, dataset,
                                        experiment_idx, batch_size, n_epochs,
                                        n_layers, n_nodes)
                                     +
                                     'lr%f_cv%d.pt'
                                     % (lr, cross_val_idx))
                    time_elapsed_filename = model_filename[:-3] + '_time.txt'
                    if not os.path.isfile(model_filename):
                        # print('*** Fitting with hyperparam:', hyperparam,
                        #       '-- cross val index:', cross_val_idx, flush=True)
                        surv_model.fit(
# Apply the already-fitted mapper to the test frame (float32 for torch).
x_test = x_mapper.transform(df_test).astype('float32')

# Extract (durations, events) target pairs in pycox's expected format.
get_target = lambda df: (df['duration'].values, df['event'].values)
y_train = get_target(df_train)
y_val = get_target(df_val)
durations_test, events_test = get_target(df_test)
val = x_val, y_val

# DeepSurv-style MLP configuration.
in_features = x_train.shape[1]
num_nodes = [32, 32]
out_features = 1
batch_norm = True
dropout = 0.1
output_bias = False

net = tt.practical.MLPVanilla(in_features,
                              num_nodes,
                              out_features,
                              batch_norm,
                              dropout,
                              output_bias=output_bias)

model = CoxPH(net, tt.optim.Adam)

# Sweep learning rates and plot the loss curve to pick one manually.
batch_size = 256
lrfinder = model.lr_finder(x_train, y_train, batch_size, tolerance=10)
_ = lrfinder.plot()

plt.show()

print()
Exemple #12
0
def pycox_deep(filename, Y_train, Y_test, opt, choice):
    """Train and evaluate a DeepSurv-style CoxPH model on autoencoder features.

    Encodes inputs with a pre-trained autoencoder, fits a pycox CoxPH model
    with a stratified dataloader, saves the model artifacts, and appends
    evaluation metrics (td-concordance, sksurv c-index, AUC) to a CSV.

    Args:
        filename: source file passed to enc_using_trained_ae.
        Y_train, Y_test: frames with 'SVDTEPC_G' (duration) and 'PC_YN'
            (event) columns.
        opt: target/gender tag used in log messages and output filenames.
        choice: hyperparameter dict — see the comment below for keys.

    Returns:
        surv_final: list of per-subject survival probabilities at each
        subject's own observed duration.

    NOTE(review): `index` (used in the output filenames below) is not defined
    in this function and is not a parameter — unless a module-level `index`
    exists, saving will raise NameError. Also, `lr_best` is computed but
    never used; `df` is presumably an alias for pandas.DataFrame — confirm.
    """
    # choice = {'lr_rate': l, 'batch': b, 'decay': 0, 'weighted_decay': wd, 'net': net, 'index': index}
    X_train, X_test = enc_using_trained_ae(filename, TARGET=opt, ALPHA=0.01, N_ITER=100, L1R=-9999)
    path = './models/analysis/'
    check = 0
    savename = 'model_check_autoen_m5_test_batch+dropout+wd.csv'
    # r=root, d=directories, f = files
    # Detect whether the metrics CSV already exists so we append rather
    # than overwrite.
    for r, d, f in os.walk(path):
        for file in f:
            if savename in file:
                check = 1

    # X_train = X_train.drop('UR_SG3', axis=1)
    # X_test = X_test.drop('UR_SG3', axis=1)

    # NOTE(review): these are aliases, not copies — adding the label columns
    # below also mutates X_train/X_test in place.
    x_train = X_train
    x_test = X_test
    x_train['SVDTEPC_G'] = Y_train['SVDTEPC_G']
    x_train['PC_YN'] = Y_train['PC_YN']
    x_test['SVDTEPC_G'] = Y_test['SVDTEPC_G']
    x_test['PC_YN'] = Y_test['PC_YN']

    ## DataFrameMapper ##
    # Pass every feature column through unchanged (None = no transformer);
    # the two label columns are excluded from the feature matrix.
    cols_standardize = list(X_train.columns)
    cols_standardize.remove('SVDTEPC_G')
    cols_standardize.remove('PC_YN')

    standardize = [(col, None) for col in cols_standardize]

    x_mapper = DataFrameMapper(standardize)

    _ = x_mapper.fit_transform(X_train).astype('float32')
    X_train = x_mapper.transform(X_train).astype('float32')
    X_test = x_mapper.transform(X_test).astype('float32')

    # (durations, events) target pairs in pycox's expected format.
    get_target = lambda df: (df['SVDTEPC_G'].values, df['PC_YN'].values)
    y_train = get_target(x_train)
    durations_test, events_test = get_target(x_test)
    in_features = X_train.shape[1]
    print(in_features)
    num_nodes = choice['nodes']
    out_features = 1
    batch_norm = True  # False for batch_normalization
    dropout = 0.01
    output_bias = False
    # net = choice['net']
    net = tt.practical.MLPVanilla(in_features, num_nodes, out_features, batch_norm, dropout, output_bias=output_bias)


    print("training")
    model = CoxPH(net, tt.optim.Adam)
    # lrfinder = model.lr_finder(X_train, y_train, batch_size)

    # lr_best = lrfinder.get_best_lr()
    # NOTE(review): lr_best is never used — the learning rate actually set
    # is choice['lr_rate'] on the next line.
    lr_best = 0.0001
    model.optimizer.set_lr(choice['lr_rate'])

    weighted_decay = choice['weighted_decay']
    verbose = True

    batch_size = choice['batch']
    epochs = 100

    # Either plain early stopping or decoupled weight decay, not both.
    if weighted_decay == 0:
        callbacks = [tt.callbacks.EarlyStopping(patience=epochs)]
        # model.fit(X_train, y_train, batch_size, epochs, callbacks, verbose=verbose)
    else:
        callbacks = [tt.callbacks.DecoupledWeightDecay(weight_decay=choice['decay'])]
        # model.fit(X_train, y_train, batch_size, epochs, callbacks, verbose)

    ''''''
    # Build a custom dataloader with a stratified sampler instead of using
    # model.fit's default shuffling.
    # dataloader = model.make_dataloader(tt.tuplefy(X_train, y_train),batch_size,True)
    datas = tt.tuplefy(X_train, y_train).to_tensor()
    print(datas)
    make_dataset = tt.data.DatasetTuple;
    DataLoader = tt.data.DataLoaderBatch
    dataset = make_dataset(*datas)
    dataloader = DataLoader(dataset, batch_size, False, sampler=StratifiedSampler(datas, batch_size))
    # dataloader = DataLoader(dataset,batch_size, True)
    model.fit_dataloader(dataloader, epochs, callbacks, verbose)
    # model.fit(X_train, y_train, batch_size, epochs, callbacks, verbose)
    # model.partial_log_likelihood(*val).mean()

    print("predicting")
    baseline_hazards = model.compute_baseline_hazards(datas[0], datas[1])
    baseline_hazards = df(baseline_hazards)

    # NOTE(review): surv is converted to 1 - S(t) before EvalSurv — verify
    # this is intentional for the concordance computed below.
    surv = model.predict_surv_df(X_test)
    surv = 1 - surv
    ev = EvalSurv(surv, durations_test, events_test, censor_surv='km')

    print("scoring")
    c_index = ev.concordance_td()
    print("c-index(", opt, "): ", c_index)

    # Filename prefixes differ for degenerate (c-index < 0.1) runs.
    if int(c_index * 10) == 0:
        hazardname = 'pycox_model_hazard_m5_v2_' + opt + '_0'
        netname = 'pycox_model_net_m5_v2_' + opt + '_0'
        weightname = 'pycox_model_weight_m5_v2_' + opt + '_0'
    else:
        hazardname = 'pycox_model_hazard_m5_' + opt + '_'
        netname = 'pycox_model_net_m5_' + opt + '_'
        weightname = 'pycox_model_weight_m5_' + opt + '_'

    # NOTE(review): `index` is undefined here — see docstring.
    baseline_hazards.to_csv('./test/'+hazardname + str(int(c_index * 100)) + '_' + str(index) + '.csv', index=False)
    netname = netname + str(int(c_index * 100)) + '_' + str(index) + '.sav'
    weightname = weightname + str(int(c_index * 100)) + '_' + str(index) + '.sav'
    model.save_net('./test/' + netname)
    model.save_model_weights('./test/' + weightname)

    # Evaluate each subject's predicted probability at their own duration.
    pred = df(surv)
    pred = pred.transpose()
    surv_final = []
    pred_final = []

    for i in range(len(pred)):
        pred_final.append(float(1-pred[Y_test['SVDTEPC_G'][i]][i]))
        surv_final.append(float(pred[Y_test['SVDTEPC_G'][i]][i]))

    Y_test_cox = CoxformY(Y_test)
    #print(surv_final)
    c_cox, concordant, discordant,_,_ = concordance_index_censored(Y_test_cox['PC_YN'], Y_test_cox['SVDTEPC_G'], surv_final)
    c_cox_pred = concordance_index_censored(Y_test_cox['PC_YN'], Y_test_cox['SVDTEPC_G'], pred_final)[0]
    print("c-index(", opt, ") - sksurv: ", round(c_cox, 4))
    print("cox-concordant(", opt, ") - sksurv: ", concordant)
    print("cox-disconcordant(", opt, ") - sksurv: ", discordant)
    print("c-index_pred(", opt, ") - sksurv: ", round(c_cox_pred, 4))

    fpr, tpr, _ = metrics.roc_curve(Y_test['PC_YN'], pred_final)
    auc = metrics.auc(fpr, tpr)
    print("auc(", opt, "): ", round(auc, 4))

    # Append this run's metrics to the tracking CSV.
    # NOTE(review): DataFrame.append is removed in pandas >= 2.0; pd.concat
    # would be needed there.
    if check == 1:
        model_check = pd.read_csv(path+savename)
    else:
        model_check = df(columns=['option', 'gender', 'c-td', 'c-index', 'auc'])
    line_append = {'option':str(choice), 'gender':opt, 'c-td':round(c_index,4), 'c-index':round(c_cox_pred,4), 'auc':round(auc,4)}
    model_check = model_check.append(line_append, ignore_index=True)
    model_check.to_csv(path+savename, index=False)

    del X_train
    del X_test

    return surv_final
Exemple #13
0
def main():
    """End-to-end training pipeline for a ResNet-18 survival model.

    Parses CLI arguments, splits the signal dataset into train/val/test,
    trains a CoxPH-wrapped resnet18 via dataloaders, then saves the model,
    loss log, test-set survival predictions, and concordance index under
    args.save_path/args.model_name.
    """
    parser = setup_parser()
    args = parser.parse_args()

    # Restrict visible GPUs if requested.
    if args.which_gpu != 'none':
        os.environ["CUDA_VISIBLE_DEVICES"] = args.which_gpu

    # save setting
    if not os.path.exists(os.path.join(args.save_path, args.model_name)):
        os.mkdir(os.path.join(args.save_path, args.model_name))

    # data reading seeting
    singnal_data_path = args.signal_dataset_path
    table_path = args.table_path
    time_col = 'SurvivalDays'
    event_col = 'Mortality'

    # dataset
    data_pathes, times, events = read_dataset(singnal_data_path, table_path,
                                              time_col, event_col,
                                              args.sample_ratio)

    # Fixed random_state keeps the 70/30 (then 80/20) splits reproducible.
    data_pathes_train, data_pathes_test, times_train, times_test, events_train, events_test = train_test_split(
        data_pathes, times, events, test_size=0.3, random_state=369)
    data_pathes_train, data_pathes_val, times_train, times_val, events_train, events_val = train_test_split(
        data_pathes_train,
        times_train,
        events_train,
        test_size=0.2,
        random_state=369)

    # Wrap each split in a batch dataloader for pycox's fit_dataloader.
    labels_train = label_transfer(times_train, events_train)
    dataset_train = VsDatasetBatch(data_pathes_train, *labels_train)
    dl_train = tt.data.DataLoaderBatch(dataset_train,
                                       args.train_batch_size,
                                       shuffle=True)

    labels_val = label_transfer(times_val, events_val)
    dataset_val = VsDatasetBatch(data_pathes_val, *labels_val)
    dl_val = tt.data.DataLoaderBatch(dataset_val,
                                     args.train_batch_size,
                                     shuffle=True)

    # Test set is inputs only; targets are kept separately for evaluation.
    labels_test = label_transfer(times_test, events_test)
    dataset_test_x = VsTestInput(data_pathes_test)
    dl_test_x = DataLoader(dataset_test_x, args.test_batch_size, shuffle=False)

    net = resnet18(args)
    model = CoxPH(
        net,
        tt.optim.Adam(lr=args.lr,
                      betas=(0.9, 0.999),
                      eps=1e-08,
                      weight_decay=5e-4,
                      amsgrad=False))
    # callbacks = [tt.cb.EarlyStopping(patience=15)]
    # Keep the best weights seen during training on disk.
    callbacks = [
        tt.cb.BestWeights(file_path=os.path.join(
            args.save_path, args.model_name, args.model_name + '_bestWeight'),
                          rm_file=False)
    ]
    verbose = True
    model_log = model.fit_dataloader(dl_train,
                                     args.epochs,
                                     callbacks,
                                     verbose,
                                     val_dataloader=dl_val)

    # Persist run configuration and the per-epoch loss curve.
    save_args(os.path.join(args.save_path, args.model_name), args)
    model_log.to_pandas().to_csv(os.path.join(args.save_path, args.model_name,
                                              'loss.csv'),
                                 index=False)

    # Baseline hazards are required before predict_surv_df can be called.
    _ = model.compute_baseline_hazards(
        get_vs_data(dataset_train), (dataset_train.time, dataset_train.event))
    model.save_net(
        path=os.path.join(args.save_path, args.model_name, args.model_name +
                          '_final'))
    surv = model.predict_surv_df(dl_test_x)
    surv.to_csv(os.path.join(args.save_path, args.model_name,
                             'test_sur_df.csv'),
                index=False)
    # Evaluate time-dependent concordance with KM-censoring.
    ev = EvalSurv(surv, np.array(labels_test[0]), np.array(labels_test[1]),
                  'km')
    print(ev.concordance_td())
    save_cindex(os.path.join(args.save_path, args.model_name),
                ev.concordance_td())
    print('done')
Exemple #14
0
# Combine the standardized and pass-through column specs into one mapper.
x_mapper = DataFrameMapper(standardize + leave)


# Fit-transform train data back into a DataFrame, preserving the index.
X_train = pd.DataFrame(data=x_mapper.fit_transform(X_train),
                       columns=numerical_columns+categorical_columns,
                       index=X_train.index)
# x_test = x_mapper.transform(X_test)




# DeepSurv-style MLP configuration.
in_features = X_train.shape[1]
num_nodes = [32, 16, 8]
out_features = 1
batch_norm = True
dropout = 0.1
output_bias = False

net = tt.practical.MLPVanilla(in_features, num_nodes, out_features, batch_norm,
                              dropout, output_bias=output_bias)

from pycox.models import CoxPH
model = CoxPH(net, tt.optim.Adam)
batch_size = 256
# Sweep learning rates and plot the loss curve to pick one manually.
lrfinder = model.lr_finder(X_train.values.astype('float32'), y_train, batch_size, tolerance=10)
_ = lrfinder.plot()
# NOTE(review): this EarlyStopping instance is created but never registered
# with a model — it has no effect as written.
tt.callbacks.EarlyStopping(patience=20)
plt.show()

print()
Exemple #15
0
class DeepSurv_pycox():
    """DeepSurv wrapper around pycox's CoxPH with built-in standardization.

    Formats numpy inputs into pycox-style frames, standardizes non-binary
    columns, fits an MLP CoxPH model with early stopping, and predicts
    median survival times.
    """

    def __init__(self,
                 layers,
                 nodes_per_layer,
                 dropout,
                 weight_decay,
                 batch_size,
                 lr=0.01,
                 seed=47):
        """Store hyperparameters and seed numpy/torch RNGs.

        Args:
            layers: number of hidden layers.
            nodes_per_layer: width of each hidden layer.
            dropout: dropout probability.
            weight_decay: Adam weight decay (L2).
            batch_size: training batch size (may be nudged upward in fit()).
            lr: learning rate.
            seed: RNG seed for reproducibility.
        """
        # set seed
        np.random.seed(seed)
        _ = torch.manual_seed(seed)
        # Fitted DataFrameMapper, set on the first "train" standardization.
        self.standardalizer = None
        self.standardize_data = True

        self._duration_col = "duration"
        self._event_col = "event"

        # Fixed network/training configuration.
        self.in_features = None
        self.out_features = 1
        self.batch_norm = True
        self.output_bias = False
        self.activation = torch.nn.ReLU
        self.epochs = 512
        self.num_workers = 2
        self.callbacks = [tt.callbacks.EarlyStopping()]

        # parameters tuned
        self.num_nodes = [int(nodes_per_layer) for _ in range(int(layers))]
        self.dropout = dropout
        self.weight_decay = weight_decay
        self.lr = lr
        self.batch_size = int(batch_size)

    def set_standardize(self, standardize_bool):
        """Enable/disable input standardization (must stay True; see below)."""
        self.standardize_data = standardize_bool

    def _format_to_pycox(self, X, Y, F):
        """Build a DataFrame from features X (columns F), appending duration
        and event columns from Y when Y is provided (Y[:, 0]=duration,
        Y[:, 1]=event)."""
        # from numpy to pandas df
        df = pd.DataFrame(data=X, columns=F)
        if Y is not None:
            df[self._duration_col] = Y[:, 0]
            df[self._event_col] = Y[:, 1]
        return df

    def _standardize_df(self, df, flag):
        """Standardize features and split out targets.

        Args:
            df: frame from _format_to_pycox.
            flag: "train" fits the mapper, "val"/"test" reuse it;
                "test" frames carry no target columns.

        Returns:
            (x, y): float32 feature matrix and (durations, events) tuple,
            y is None for flag == "test".

        Raises:
            NotImplementedError: unknown flag, or standardization disabled.
        """
        # if flag = test, the df passed in does not contain Y labels
        if self.standardize_data:
            df_x = df if flag == 'test' else df.drop(
                columns=[self._duration_col, self._event_col])
            if flag == "train":
                # Leave strictly-binary {0,1} columns as-is; standardize the
                # rest. NOTE(review): a column that is constant 0 or constant
                # 1 in the training split will be standardized, not left.
                cols_leave = []
                cols_standardize = []
                for column in df_x.columns:
                    if set(pd.unique(df[column])) == set([0, 1]):
                        cols_leave.append(column)
                    else:
                        cols_standardize.append(column)
                standardize = [([col], StandardScaler())
                               for col in cols_standardize]
                leave = [(col, None) for col in cols_leave]
                self.standardalizer = DataFrameMapper(standardize + leave)

                x = self.standardalizer.fit_transform(df_x).astype('float32')
                y = (df[self._duration_col].values.astype('float32'),
                     df[self._event_col].values.astype('float32'))

            elif flag == "val":
                x = self.standardalizer.transform(df_x).astype('float32')
                y = (df[self._duration_col].values.astype('float32'),
                     df[self._event_col].values.astype('float32'))

            elif flag == "test":
                x = self.standardalizer.transform(df_x).astype('float32')
                y = None

            else:
                raise NotImplementedError

            return x, y
        else:
            raise NotImplementedError

    def fit(self, X, y, column_names):
        """Fit the CoxPH model on (X, y); 20% of rows are held out for
        validation/early stopping. Computes baseline hazards afterwards."""
        # format data
        self.column_names = column_names
        full_df = self._format_to_pycox(X, y, self.column_names)
        val_df = full_df.sample(frac=0.2)
        train_df = full_df.drop(val_df.index)
        train_x, train_y = self._standardize_df(train_df, "train")
        val_x, val_y = self._standardize_df(val_df, "val")
        # configure model
        self.in_features = train_x.shape[1]
        net = tt.practical.MLPVanilla(in_features=self.in_features,
                                      num_nodes=self.num_nodes,
                                      out_features=self.out_features,
                                      batch_norm=self.batch_norm,
                                      dropout=self.dropout,
                                      activation=self.activation,
                                      output_bias=self.output_bias)
        self.model = CoxPH(
            net, tt.optim.Adam(lr=self.lr, weight_decay=self.weight_decay))
        # self.model.optimizer.set_lr(self.lr)

        n_train = train_x.shape[0]
        while n_train % self.batch_size == 1:  # this will cause issues in batch norm
            self.batch_size += 1

        self.model.fit(train_x,
                       train_y,
                       self.batch_size,
                       self.epochs,
                       self.callbacks,
                       verbose=True,
                       val_data=(val_x, val_y),
                       val_batch_size=self.batch_size,
                       num_workers=self.num_workers)
        self.model.compute_baseline_hazards()

    def predict(self, test_x, time_list):
        """Predict median survival times for each test row.

        Args:
            test_x: feature matrix (numpy), same columns as training.
            time_list: time grid corresponding to the model's output columns.

        Returns:
            (pred_medians, proba_matrix_): array of per-subject median
            survival times and the raw survival-probability DataFrame.
        """
        # format data
        test_df = self._format_to_pycox(test_x, None, self.column_names)
        test_x, _ = self._standardize_df(test_df, "test")

        proba_matrix_ = self.model.predict_surv_df(test_x)
        proba_matrix = np.transpose(proba_matrix_.values)
        pred_medians = []
        median_time = max(time_list)
        # if the predicted proba never goes below 0.5, predict the largest seen value
        for test_idx, survival_proba in enumerate(proba_matrix):
            # the survival_proba is in descending order
            for col, proba in enumerate(survival_proba):
                if proba > 0.5:
                    continue
                # Exact 0.5 (or an immediate drop at the first column) maps
                # to that time; otherwise interpolate between neighbors.
                if proba == 0.5 or col == 0:
                    median_time = time_list[col]
                else:
                    median_time = (time_list[col - 1] + time_list[col]) / 2
                break
            pred_medians.append(median_time)

        return np.array(pred_medians), proba_matrix_
Exemple #16
0
def train_deepsurv(data_df, r_splits):
    """Train a DeepSurv model (CoxPH with an MLP) on each resampling split.

    For every split the data is standardized, a fresh MLP is trained with
    early stopping, and three groups of metrics are collected: the
    time-dependent concordance on the full test set and on the 30-day test
    set, and the cumulative/dynamic AUC at 30/60/365 days.

    Args:
        data_df: full dataframe containing 'duration', 'event' (and
            optionally 'subject_id') columns plus covariates.
        r_splits: list of splits; r_splits[i][2], [1], [0] are forwarded to
            ``prepare_datasets`` as the train/val/test index sets.

    Returns:
        Five lists with one entry per split:
        (c_index_at, c_index_30, time_auc_30, time_auc_60, time_auc_365).
    """
    epochs = 100

    # Fixed MLP hyper-parameters.
    num_nodes = [32]
    out_features = 1
    batch_norm = True
    dropout = 0.6
    output_bias = False

    c_index_at = []
    c_index_30 = []
    # Collect the AUC at each horizon in a dict keyed by the horizon instead
    # of the original eval("time_auc_" + str(time_x)) hack.
    time_auc = {30: [], 60: [], 365: []}

    for i in range(len(r_splits)):
        print("\nIteration %s" % (i))

        # DATA PREP
        df_train, df_val, df_test, df_test_30 = prepare_datasets(
            data_df, r_splits[i][2], r_splits[i][1], r_splits[i][0])

        # Standardize every covariate column (label/id columns excluded).
        xcols = list(df_train.columns)
        for col_name in ["subject_id", "event", "duration"]:
            if col_name in xcols:
                xcols.remove(col_name)
        standardize = [([col], StandardScaler()) for col in xcols]
        x_mapper = DataFrameMapper(standardize)

        x_train = x_mapper.fit_transform(df_train).astype('float32')
        x_val = x_mapper.transform(df_val).astype('float32')
        x_test = x_mapper.transform(df_test).astype('float32')
        x_test_30 = x_mapper.transform(df_test_30).astype('float32')

        labtrans = CoxTime.label_transform()
        get_target = lambda df: (df['duration'].values, df['event'].values)
        y_train = labtrans.fit_transform(*get_target(df_train))
        y_val = labtrans.transform(*get_target(df_val))

        durations_test, events_test = get_target(df_test)
        durations_test_30, events_test_30 = get_target(df_test_30)
        val = tt.tuplefy(x_val, y_val)

        # Structured arrays for the cumulative/dynamic AUC computation.
        (train_x, train_y), (val_x, val_y), (test_x, test_y), _ = df2array(
            data_df, df_train, df_val, df_test, df_test_30)

        # MODEL
        in_features = x_train.shape[1]
        callbacks = [tt.callbacks.EarlyStopping()]
        net = tt.practical.MLPVanilla(in_features, num_nodes, out_features,
                                      batch_norm, dropout,
                                      output_bias=output_bias)
        model = CoxPH(net, tt.optim.Adam)
        model.optimizer.set_lr(0.0001)

        # Shrink the batch size for odd-sized training sets — presumably to
        # avoid a trailing batch of size 1, which breaks batch norm
        # (NOTE(review): only n % 256 == 1 actually produces such a batch;
        # this check is cruder but kept for behavior compatibility).
        if x_train.shape[0] % 2:
            batch_size = 255
        else:
            batch_size = 256

        model.fit(x_train, y_train, batch_size, epochs, callbacks,
                  val_data=val, val_batch_size=batch_size)
        model.compute_baseline_hazards()

        surv = model.predict_surv_df(x_test)
        ev = EvalSurv(surv, durations_test, events_test, censor_surv='km')
        c_index_at.append(ev.concordance_td())

        surv_30 = model.predict_surv_df(x_test_30)
        ev_30 = EvalSurv(surv_30, durations_test_30, events_test_30,
                         censor_surv='km')
        c_index_30.append(ev_30.concordance_td())

        # Risk scores are loop-invariant across horizons: compute them once.
        risk_scores = model.predict(x_test).flatten()
        for time_x in (30, 60, 365):
            va_auc, va_mean_auc = cumulative_dynamic_auc(
                train_y, test_y, risk_scores, time_x)
            time_auc[time_x].append(va_auc[0])

        print("C-index_30:", c_index_30[i])
        print("C-index_AT:", c_index_at[i])

        print("time_auc_30", time_auc[30][i])
        print("time_auc_60", time_auc[60][i])
        print("time_auc_365", time_auc[365][i])

    return c_index_at, c_index_30, time_auc[30], time_auc[60], time_auc[365]
            out_features = 1
            batch_norm = args.use_BN
            dropout = args.dropout
            output_bias = args.use_output_bias
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

            if len(cols_categorical)>0:
                net = MixedInputMLP(in_features, num_embeddings, embedding_dims, num_nodes, out_features, batch_norm, dropout, output_bias=output_bias)
                # net = Transformer(in_features, num_embeddings, num_nodes, out_features, batch_norm, dropout, output_bias=output_bias)
            else:
                net = MLPVanilla(in_features, num_nodes, out_features, batch_norm,
                                            dropout, output_bias=output_bias)
            net = net.to(device)

            if args.optimizer == 'AdamWR':
                model = CoxPH(net, optimizer=tt.optim.AdamWR(lr=args.lr, decoupled_weight_decay=args.weight_decay),device=device)
            elif args.optimizer=='AdamW':
                model = CoxPH(net, optimizer=tt.optim.AdamW(lr=args.lr, decoupled_weight_decay=args.weight_decay),device=device)
            elif args.optimizer =='Adam':
                model = CoxPH(net, optimizer=tt.optim.Adam(lr=args.lr, weight_decay=args.weight_decay),device=device)
    

            wandb.init(project='icml_new_'+args.dataset, 
                    group=f'fold{fold}_'+args.loss+args.optimizer,
                    name=f'L{args.num_layers}N{args.num_nodes}D{args.dropout}W{args.weight_decay}B{args.batch_size}',
                    config=args)

            wandb.watch(net)

            # Loss configuration ============================================================
# Example #18 (score: 0)
    in_features = x_train.shape[1]
    num_nodes = [args.num_nodes] * args.num_layers
    out_features = 1
    batch_norm = args.use_BN
    dropout = args.dropout
    output_bias = args.use_output_bias
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    net = tt.practical.MLPVanilla(in_features,
                                  num_nodes,
                                  out_features,
                                  batch_norm,
                                  dropout,
                                  output_bias=output_bias)
    net = net.to(device)
    model = CoxPH(net, tt.optim.Adam(weight_decay=args.weight_decay))

    wandb.init(
        project=args.dataset,
        group=args.loss,
        name=
        f'L{args.num_layers}N{args.num_nodes}D{args.dropout}W{args.weight_decay}B{args.batch_size}',
        config=args)

    wandb.watch(net)

    # Loss configuration ============================================================
    if args.loss == 'rank':
        model.loss = DSAFTRankLoss()
    elif args.loss == 'mae':
        model.loss = DSAFTMAELoss()
# Example #19 (score: 0)
    # net = MixedInputMLPCoxTime(in_features, num_embeddings, embedding_dims, num_nodes, batch_norm, dropout)

    net = MixedInputMLP(in_features,
                        num_embeddings,
                        embedding_dims,
                        num_nodes,
                        out_features,
                        batch_norm=batch_norm,
                        dropout=dropout,
                        output_bias=output_bias)
    net = net.to(device)

    if args.optimizer == 'AdamWR':
        model = CoxPH(net,
                      optimizer=tt.optim.AdamWR(
                          lr=args.lr,
                          decoupled_weight_decay=args.weight_decay,
                          cycle_eta_multiplier=0.8))
    lrfinder = model.lr_finder(x_train, y_train, batch_size, tolerance=10)
    lr = lrfinder.get_best_lr()
    model.optimizer.set_lr(lr)

    wandb.init(
        project=args.dataset + '_baseline',
        group='deepsurv' + '_' + args.optimizer,
        name=
        f'L{args.num_layers}N{args.num_nodes}D{args.dropout}W{args.weight_decay}B{args.batch_size}',
        config=args)

    wandb.watch(net)
def _cv_split(n, fold):
    """Deterministically shuffle range(n) and return one fold of a 5-fold
    split as (train_positions, valid_positions, test_positions).

    The last fold absorbs the remainder; 10% of the training positions are
    carved off as validation. Seeding with 0 before the shuffle reproduces
    the original split exactly.
    """
    index = np.arange(n)
    i = n // 5
    np.random.seed(seed=0)
    np.random.shuffle(index)
    if fold < 4:
        test_idx = index[fold * i:fold * i + i]
        # BUG FIX: the original used `+` on two NumPy arrays, which is
        # element-wise addition (and fails on mismatched shapes), not the
        # intended concatenation of the two index ranges.
        train_idx = np.concatenate([index[:fold * i], index[fold * i + i:]])
    else:
        test_idx = index[fold * i:]
        train_idx = index[:fold * i]
    n_valid = len(train_idx) // 10
    return train_idx[n_valid:], train_idx[:n_valid], test_idx


def main(data_root, cancer_type, anatomical_location, fold):
    """Train and evaluate a CoxPH survival model for one cancer type and
    cross-validation fold.

    Args:
        data_root: root directory of the data files. NOTE(review): this
            argument is overwritten with a hard-coded path below — confirm
            whether the parameter should actually be honored.
        cancer_type: integer index into the 33 TCGA cancer types.
        anatomical_location: integer index into the 52 anatomical locations.
        fold: cross-validation fold in [0, 4].

    Side effects: reads pickled PPI/mapping data and per-patient files from
    disk, trains the model, plots the training log, and prints the survival
    matrix and the time-dependent concordance index.
    """
    # Import the RDF graph for the PPI network: node set and edge index.
    with open('seen.pkl', 'rb') as f:
        seen = pickle.load(f)
    with open('ei.pkl', 'rb') as f:
        ei = pickle.load(f)

    global device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # One-hot context vectors describing the tumour.
    cancer_type_vector = np.zeros((33, ), dtype=np.float32)
    cancer_type_vector[cancer_type] = 1

    cancer_subtype_vector = np.zeros((25, ), dtype=np.float32)
    for i in CANCER_SUBTYPES[cancer_type]:
        cancer_subtype_vector[i] = 1

    anatomical_location_vector = np.zeros((52, ), dtype=np.float32)
    anatomical_location_vector[anatomical_location] = 1
    cell_type_vector = np.zeros((10, ), dtype=np.float32)
    cell_type_vector[CELL_TYPES[cancer_type]] = 1

    # NOTE(review): these device tensors appear unused in this function
    # (only edge_index is consumed, by MyNet); kept for compatibility.
    pt_tensor_cancer_type = torch.FloatTensor(cancer_type_vector).to(device)
    pt_tensor_cancer_subtype = torch.FloatTensor(cancer_subtype_vector).to(
        device)
    pt_tensor_anatomical_location = torch.FloatTensor(
        anatomical_location_vector).to(device)
    pt_tensor_cell_type = torch.FloatTensor(cell_type_vector).to(device)
    edge_index = torch.LongTensor(ei).to(device)

    # Dictionary that maps proteins to their corresponding genes (Ensembl);
    # inverted here into gene -> {protein: 1}.
    with open('ens_dic.pkl', 'rb') as f:
        dicty = pickle.load(f)
    dic = {}
    for prot in dicty:
        dic.setdefault(dicty[prot], {})[prot] = 1

    # Dictionary from ENSG -> ENST.
    d = {}
    with open('data1/prot_names1.txt') as f:
        for line in f:
            tok = line.split()
            d[tok[1]] = tok[0]

    # clin: survival time per patient (days to death for dead patients,
    # days to last follow-up for alive patients).
    # suv_time: event indicator per patient (0 dead / 1 alive).
    # feat_vecs: one row per patient with 6 concatenated feature groups
    # (expression, diff expression, methylation, diff methylation, VCF, CNV).
    clin = []
    suv_time = []
    feat_vecs = []
    can_types = ["BRCA_v2"]
    # NOTE(review): hard-coded path overrides the data_root parameter.
    data_root = '/ibex/scratch/projects/c2014/sara/'
    for i in range(len(can_types)):
        # Index file: patient IDs with the names of their 6 data files.
        with open(data_root + can_types[i] + '.txt') as f:
            lines = f.read().splitlines()
        lines = lines[1:]  # drop header
        feat_vecs = np.zeros((len(lines), 17186 * 6), dtype=np.float32)
        # NOTE(review): `i` is reused as the patient row counter, shadowing
        # the outer loop variable — harmless while can_types has one entry.
        i = 0
        for l in tqdm(lines):
            l = l.split('\t')
            clinical_file = l[6]
            surv_file = l[2]
            myth_file = 'myth/' + l[3]
            diff_myth_file = 'diff_myth/' + l[1]
            exp_norm_file = 'exp_count/' + l[-1]
            diff_exp_norm_file = 'diff_exp/' + l[0]
            cnv_file = 'cnv/' + l[4] + '.txt'
            vcf_file = 'vcf/' + 'OutputAnnoFile_' + l[
                5] + '.hg38_multianno.txt.dat'
            # All 6 files must exist (some patients lack reported survival).
            all_files = [
                myth_file, diff_exp_norm_file, diff_myth_file, exp_norm_file,
                cnv_file, vcf_file
            ]
            for fname in all_files:
                if not os.path.exists(fname):
                    print('File ' + fname + ' does not exist!')
                    sys.exit(1)
            clin.append(clinical_file)
            suv_time.append(surv_file)
            temp_myth = myth_data(myth_file, seen, d, dic)
            vec = np.array(get_data(exp_norm_file, diff_exp_norm_file,
                                    diff_myth_file, cnv_file, vcf_file,
                                    temp_myth, seen, dic),
                           dtype=np.float32)
            feat_vecs[i, :] = vec.flatten()
            i += 1

    min_max_scaler = MinMaxScaler(clip=True)
    labels_days = []
    labels_surv = []
    for days, surv in zip(clin, suv_time):
        labels_days.append(float(days))
        labels_surv.append(float(surv))

    dataset = feat_vecs
    labels_days = np.array(labels_days)
    labels_surv = np.array(labels_surv)

    # Split patient rows into censored (event == 1) and uncensored.
    censored_index = np.array(
        [i for i in range(len(dataset)) if labels_surv[i] == 1], dtype=int)
    uncensored_index = np.array(
        [i for i in range(len(dataset)) if labels_surv[i] != 1], dtype=int)

    model = CoxPH(MyNet(edge_index).to(device), tt.optim.Adam(0.0001))

    # 5-fold split applied independently to the censored and uncensored
    # populations so both are represented in every subset.
    ctrain_idx, cvalid_idx, ctest_idx = _cv_split(len(censored_index), fold)
    utrain_idx, uvalid_idx, utest_idx = _cv_split(len(uncensored_index), fold)

    def _merge(c_pos, u_pos):
        """Map fold positions back to dataset rows and shuffle (seed 0)."""
        # BUG FIX: np.concatenate requires a sequence of arrays; the
        # original np.concatenate(a, b) passed `b` as the axis argument,
        # which raises a TypeError.
        idx = np.concatenate((censored_index[c_pos], uncensored_index[u_pos]))
        np.random.seed(seed=0)
        np.random.shuffle(idx)
        return idx

    train_idx = _merge(ctrain_idx, utrain_idx)
    valid_idx = _merge(cvalid_idx, uvalid_idx)
    test_idx = _merge(ctest_idx, utest_idx)

    # Scale features to [0, 1] using statistics of the training split only.
    train_data = min_max_scaler.fit_transform(dataset[train_idx])
    train_labels = (labels_days[train_idx], labels_surv[train_idx])

    val_data = min_max_scaler.transform(dataset[valid_idx])
    val_labels = (labels_days[valid_idx], labels_surv[valid_idx])

    test_data = min_max_scaler.transform(dataset[test_idx])
    test_labels_days = labels_days[test_idx]
    test_labels_surv = labels_surv[test_idx]
    print(val_labels)

    callbacks = [tt.callbacks.EarlyStopping()]
    batch_size = 16
    epochs = 100
    val = (val_data, val_labels)
    log = model.fit(train_data,
                    train_labels,
                    batch_size,
                    epochs,
                    callbacks,
                    True,
                    val_data=val,
                    val_batch_size=batch_size)
    log.plot()
    plt.show()

    # Compute the evaluation measurements.
    train = train_data, train_labels
    model.compute_baseline_hazards(*train)
    surv = model.predict_surv_df(test_data)
    print(surv)
    ev = EvalSurv(surv, test_labels_days, test_labels_surv)
    print(ev.concordance_td())