def test_concordance_index_returns_same_after_shifting():
    """Shifting either input by a constant must leave the concordance index unchanged."""
    actual = np.array([1, 2, 3, 4, 5, 6])
    predicted = np.array([2, 1, 4, 6, 5, 3])

    unshifted = utils.concordance_index(actual, predicted)
    both_shifted = utils.concordance_index(actual - 5, predicted - 5)
    predicted_shifted = utils.concordance_index(actual, predicted - 5)
    actual_shifted = utils.concordance_index(actual - 5, predicted)

    # Same comparisons as the original chained equality, spelled out pairwise.
    assert unshifted == both_shifted
    assert both_shifted == predicted_shifted
    assert predicted_shifted == actual_shifted
def test_data_normalization(self, data_pred2):
    """Check that prediction on new data normalizes exactly like the training data.

    During fit, CoxPH copies the training data and normalizes it. Later
    calls should be normalized in the same way, and the internal training
    set should not be stored in a normalized state.
    """
    fitter = CoxPHFitter(normalize=True)
    fitter.fit(data_pred2, duration_col='t', event_col='E')

    # Concordance computed from the fitter's own copy of the training set.
    internal_scores = -fitter.predict_partial_hazard(fitter.data).values
    ci_trn = concordance_index(fitter.durations, internal_scores, fitter.event_observed)

    # Concordance computed from the caller's (un-normalized) covariates.
    covariates = data_pred2[['x1', 'x2']]
    external_scores = -fitter.predict_partial_hazard(covariates).values
    ci_org = concordance_index(data_pred2['t'], external_scores, data_pred2['E'])

    assert ci_org == ci_trn
def test_cox_ph_prediction_monotonicity(self, data_pred2):
    """All prediction methods should rank subjects identically.

    Concordance-wise, every prediction method should be a monotonic
    version of the others, unless numerical factors interfere.
    """
    durations = data_pred2['t']
    events = data_pred2['E']
    covariates = data_pred2[['x1', 'x2']]

    for normalize in (True, False):
        qualifier = '' if normalize else 'not '
        msg = ("Predict methods should get the same concordance"
               " when {}normalizing").format(qualifier)

        fitter = CoxPHFitter(normalize=normalize)
        fitter.fit(data_pred2, duration_col='t', event_col='E')

        # The partial hazard is the reference every other method is compared to.
        partial_hazard = fitter.predict_partial_hazard(covariates).values
        ci_ph = concordance_index(durations, -partial_hazard, events)

        median = fitter.predict_median(covariates).ravel()
        ci_med = concordance_index(durations, median, events)
        assert ci_ph == ci_med, msg

        expectation = fitter.predict_expectation(covariates).ravel()
        ci_exp = concordance_index(durations, expectation, events)
        assert ci_ph == ci_exp, msg
def score_(self):
    """
    The concordance score (also known as the c-index) of the fit.

    The c-index generalizes the ROC AUC to survival data, including
    censorships. Here it measures the predictive accuracy of the fitted
    model on the training dataset, analogous to R^2 in linear models.

    The score is computed lazily: if ``_predicted_hazards_`` is still
    present it is consumed (and deleted) to compute and cache the score;
    otherwise the cached value is returned.
    """
    # pylint: disable=access-member-before-definition
    if not hasattr(self, "_predicted_hazards_"):
        return self._concordance_score_

    self._concordance_score_ = concordance_index(
        self.durations, -self._predicted_hazards_, self.event_observed
    )
    del self._predicted_hazards_
    return self._concordance_score_
def get_concordance_index(self, x, t, e, **kwargs):
    """
    Compute the concordance index (C-index) of the model on (x, t, e).

    Compiles a theano function that maps ``self.X`` to the negated
    ``self.partial_hazard``, evaluates it on ``x``, and passes the result
    to ``concordance_index`` (described in the original docstring as
    taken from ``lifelines.utils``).

    Parameters:
        x: (n, d) numpy array of observations.
        t: (n) numpy array representing observed time events.
        e: (n) numpy array representing time indicators.
        **kwargs: accepted but not used by this method.

    Returns:
        concordance_index: calculated using lifelines.utils.concordance_index

    Notes (from the lifelines.utils.concordance_index docstring):
        The concordance index compares two series of event times: the
        real survival times from the experimental data and the survival
        times predicted by a model. It lies between 0 and 1, where 0.5 is
        the expected result from random predictions, 1.0 is perfect
        concordance and 0.0 is perfect anti-concordance (multiply
        predictions with -1 to get 1.0). Score is usually 0.6-0.7 for
        survival models.

    See:
        Harrell FE, Lee KL, Mark DB. Multivariable prognostic models:
        issues in developing models, evaluating assumptions and adequacy,
        and measuring and reducing errors. Statistics in Medicine
        1996;15(4):361-87.
    """
    negated_hazard_fn = theano.function(inputs=[self.X], outputs=-self.partial_hazard)
    return concordance_index(t, negated_hazard_fn(x), e)
def print_summary(self):
    """
    Print summary statistics describing the fit.

    Prints the sample size and number of events, the summary table with a
    trailing significance-code column, the legend for those codes, and
    the concordance of the model on its stored training data.
    """
    def _scientific(value):
        return '{:.3e}'.format(value)

    table = self.summary

    # Significance codes go in the last column.
    table[''] = list(map(significance_code, table['p']))

    # Information about the data comes first.
    n_subjects = self.data.shape[0]
    n_events = np.where(self.event_observed)[0].shape[0]
    print('n={}, number of events={}'.format(n_subjects, n_events), end='\n\n')

    print(table.to_string(float_format=_scientific))

    # Legend for the significance codes.
    print('---')
    print("Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ",
          end='\n\n')

    negated_hazards = -self.predict_partial_hazard(self.data).values.ravel()
    concordance = concordance_index(self.durations, negated_hazards, self.event_observed)
    print("Concordance = {:.3f}".format(concordance))
def test_concordance_index():
    """Sanity-check concordance_index on constant, identical and random predictions."""
    n_samples = 1000
    durations = np.random.normal(size=n_samples)
    random_predictions = np.random.normal(size=n_samples)
    observed = np.random.choice([0, 1], size=n_samples)
    constant_predictions = np.zeros_like(durations)

    # Each check runs first without, then with, the event indicator.
    indicator_variants = ((), (observed,))

    # Zeros is exactly random
    for extra in indicator_variants:
        assert utils.concordance_index(durations, constant_predictions, *extra) == 0.5

    # Itself is 1
    for extra in indicator_variants:
        assert utils.concordance_index(durations, durations, *extra) == 1.0

    # Random is close to 0.5
    for extra in indicator_variants:
        score = utils.concordance_index(durations, random_predictions, *extra)
        assert abs(score - 0.5) < 0.05
def training_epoch_end(self, outputs):
    """Compute performance metrics on the training dataset.

    This method is called automatically by pytorch-lightning.

    Parameters
    ----------
    outputs : list of dict
        Per-step outputs. Each entry is read for the keys ``"pred_surv"``,
        ``"true_binary"``, ``"true_time"``, ``"true_event"`` and ``"log"``.

    Returns
    -------
    dict
        ``{"loss": <mean "training/total/loss">, "log": <metrics dict>}``
        where the log holds the 2-year ROC AUC, the concordance index and
        the mean of every logged key containing ``"/loss"``.
    """
    pred_surv = torch.cat([x["pred_surv"] for x in outputs]).cpu()
    true_binary = torch.cat([x["true_binary"] for x in outputs]).cpu().numpy()
    true_time = torch.cat([x["true_time"] for x in outputs]).cpu().numpy()
    # Builtin `bool` instead of `np.bool`: the alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    true_event = torch.cat([x["true_event"] for x in outputs]).cpu().numpy().astype(bool)

    two_year_bin = np.digitize(24, self.time_bins)
    # NOTE(review): `survival_fn` is not used below; the call is kept in
    # case it validates its inputs - confirm and remove if it is pure.
    survival_fn = mtlr_survival_at_times(pred_surv,
                                         np.pad(self.time_bins, (1, 0)),
                                         self.eval_times)

    pred_binary = 1 - mtlr_survival(pred_surv)[:, two_year_bin]
    roc_auc = roc_auc_score(true_binary, pred_binary)

    pred_risk = mtlr_risk(pred_surv).numpy()
    # Risk is negated before being handed to concordance_index.
    ci = concordance_index(true_time, -pred_risk, event_observed=true_event)

    log = {
        "training/surv/roc_auc_at_2yrs": roc_auc,
        "training/surv/ci": ci,
    }

    # log loss and metrics to Tensorboard
    loss_keys = [k for k in outputs[0]["log"].keys() if "/loss" in k]
    log.update({
        k: torch.stack([x["log"][k] for x in outputs]).mean()
        for k in loss_keys
    })
    return {"loss": log["training/total/loss"], "log": log}
def score(self, df: pd.DataFrame, scoring_method: str = "log_likelihood") -> float:
    """
    Score the data in df on the fitted model.

    Only the concordance index is currently implemented. The default
    ``scoring_method="log_likelihood"`` raises ``NotImplementedError``;
    any other value is scored with the concordance index.

    Parameters
    ----------
    df: DataFrame
        the dataframe with duration col, event col, etc. It is not
        modified: the method works on a copy.
    scoring_method: str
        one of {'log_likelihood', 'concordance_index'}

        log_likelihood: not implemented, raises ``NotImplementedError``.
        concordance_index: returns the concordance-index

    Returns
    -------
    float
        the concordance index between the observed durations and the
        predicted medians.

    Raises
    ------
    NotImplementedError
        if ``scoring_method == "log_likelihood"``.
    """
    if scoring_method == "log_likelihood":
        raise NotImplementedError("Only concordance_index is available")

    # `pop` below removes columns in place; copy first so the caller's
    # DataFrame keeps its duration and event columns.
    df = df.copy()
    T = df.pop(self.duration_col).astype(float)
    E = df.pop(self.event_col).astype(bool)

    predictions = self.predict_median(df)
    return concordance_index(T, predictions, event_observed=E)
def get_concordance_index(self, x, t, e, **kwargs):
    """
    Return the concordance index (C-index) of the model's predictions.

    The negated partial hazard is evaluated on ``x`` through a compiled
    theano function and scored against ``t`` and ``e`` with
    ``concordance_index`` (described in the original docstring as taken
    from the lifelines.utils package).

    Parameters:
        x: (n, d) numpy array of observations.
        t: (n) numpy array representing observed time events.
        e: (n) numpy array representing time indicators.
        **kwargs: accepted but ignored.

    Returns:
        concordance_index: calculated using lifelines.utils.concordance_index

    Background (lifelines.utils.concordance_index docstring):
        Calculates the concordance index between two series of event
        times: the real survival times from the experimental data and the
        predicted survival times from a model of some kind. The value is
        between 0 and 1, where 0.5 is the expected result from random
        predictions, 1.0 is perfect concordance and 0.0 is perfect
        anti-concordance (multiply predictions with -1 to get 1.0).
        Score is usually 0.6-0.7 for survival models.

    See:
        Harrell FE, Lee KL, Mark DB. Multivariable prognostic models:
        issues in developing models, evaluating assumptions and adequacy,
        and measuring and reducing errors. Statistics in Medicine
        1996;15(4):361-87.
    """
    hazard_inputs = [self.X]
    negated_partial_hazard = -self.partial_hazard
    evaluate = theano.function(inputs=hazard_inputs, outputs=negated_partial_hazard)
    scores = evaluate(x)
    return concordance_index(t, scores, e)
def train_neural_network(self):
    """Run the training loop and return ``(best_validation_epoch, epochs)``.

    Initializes the TensorFlow variables, then for up to
    ``self.num_iterations`` iterations fetches a batch, runs one
    optimizer step and computes the batch concordance index. At every
    epoch boundary (and on the last iteration) it records training
    metrics, evaluates the validation set through
    ``self.predict_concordance_index``, saves a checkpoint when the
    validation CI improves, and stops early when there has been no
    improvement for ``self.require_improvement`` iterations, the
    validation cost is NaN, or ``self.max_epochs`` is reached. Finally
    it closes the queue and stops the coordinator threads.
    """
    train_print = "Training Deep Regularized AFT Model:"
    params_print = "Parameters: l2_reg:{}, learning_rate:{}," \
                   " momentum: beta1={} beta2={}, batch_size:{}, batch_norm:{}," \
                   " hidden_dim:{}, latent_dim:{}, num_of_batches:{}, keep_prob:{}" \
        .format(self.l2_reg, self.learning_rate, self.beta1, self.beta2, self.batch_size,
                self.batch_norm, self.hidden_dim, self.latent_dim, self.num_batches, self.keep_prob)
    print(train_print)
    print(params_print)
    logging.debug(train_print)
    logging.debug(params_print)
    self.session.run(tf.global_variables_initializer())

    # Early-stopping bookkeeping.
    best_ci = 0
    best_validation_epoch = 0
    last_improvement = 0

    start_time = time.time()
    epochs = 0
    show_all_variables()
    # j counts examples consumed in the current epoch (advanced by self.batch_size).
    j = 0

    for i in range(self.num_iterations):
        # Batch Training
        run_options = tf.RunOptions(timeout_in_ms=4000)
        x_batch, t_batch, e_batch = self.session.run(
            [self.x_batch, self.t_batch, self.e_batch], options=run_options)
        risk_batch = risk_set(data_t=t_batch)
        batch_impute_mask = get_missing_mask(x_batch, self.imputation_values)
        batch_size = len(t_batch)
        # Rows with e == 1 feed the *_lab placeholders, the rest the *_unlab ones.
        idx_observed = e_batch == 1
        # TODO simplify batch processing
        feed_dict_train = {
            self.x: x_batch,
            self.x_lab: x_batch[idx_observed],
            self.x_unlab: x_batch[np.logical_not(idx_observed)],
            self.impute_mask: batch_impute_mask,
            self.t: t_batch,
            self.t_lab: t_batch[idx_observed],
            self.t_unlab: t_batch[np.logical_not(idx_observed)],
            self.e: e_batch,
            self.risk_set: risk_batch,
            self.batch_size_tensor: batch_size,
            self.is_training: True
        }
        # One optimizer step; the metric tensors are fetched in the same run.
        summary, train_time, train_cost, train_ranking, train_rae, train_reg, train_lik, train_recon, \
        train_obs_lik, train_censo_lik, _ = self.session.run(
            [self.merged, self.predicted_time, self.cost, self.ranking_partial_lik, self.total_rae,
             self.reg_loss, self.neg_log_lik, self.total_t_recon_loss, self.observed_neg_lik,
             self.censored_neg_lik, self.optimizer],
            feed_dict=feed_dict_train)
        train_ci = concordance_index(
            event_times=t_batch, predicted_event_times=train_time.reshape(t_batch.shape),
            event_observed=e_batch)
        # NOTE(review): the value returned by verify_tensor_all_finite is
        # discarded and never run, so this presumably checks nothing - confirm.
        tf.verify_tensor_all_finite(train_cost, "Training Cost has Nan or Infinite")
        # An epoch ends once j has reached self.num_examples.
        if j >= self.num_examples:
            epochs += 1
            is_epoch = True
            # idx = 0
            j = 0
        else:
            # idx = j
            j += self.batch_size
            is_epoch = False
        # Progress line every 100 iterations.
        if i % 100 == 0:
            train_print = "it:{}, trainCI:{}, train_ranking:{}, train_RAE:{}, train_lik:{}, train_obs_lik:{}, " \
                          "train_cens_lik:{}, train_reg:{}".format(i, train_ci, train_ranking, train_rae,
                                                                    train_lik, train_obs_lik,
                                                                    train_censo_lik, train_reg)
            print(train_print)
            logging.debug(train_print)
        if is_epoch or (i == (self.num_iterations - 1)):
            improved_str = ''
            # Record the training metrics of the current batch, then
            # compute the validation CI.
            self.train_ci.append(train_ci)
            self.train_cost.append(train_cost)
            self.train_t_rae.append(train_rae)
            self.train_log_lik.append(train_lik)
            self.train_ranking.append(train_ranking)
            self.train_recon.append(train_recon)
            self.train_writer.add_summary(summary, i)
            valid_ci, valid_cost, valid_rae, valid_ranking, valid_lik, valid_reg, valid_log_var, valid_recon = self.predict_concordance_index(
                x=self.valid_x, e=self.valid_e, t=self.valid_t)
            self.valid_cost.append(valid_cost)
            self.valid_ci.append(valid_ci)
            self.valid_t_rae.append(valid_rae)
            self.valid_log_lik.append(valid_lik)
            self.valid_ranking.append(valid_ranking)
            self.valid_recon.append(valid_recon)
            # NOTE(review): same concern as above - the returned value is unused.
            tf.verify_tensor_all_finite(
                valid_cost, "Validation Cost has Nan or Infinite")
            if valid_ci > best_ci:
                # Save the variables of the best-performing model so far
                # and update the best validation CI.
                self.saver.save(sess=self.session, save_path=self.save_path)
                best_validation_epoch = epochs
                best_ci = valid_ci
                print("valid_ci:{}".format(valid_ci))
                last_improvement = i
                improved_str = '*'
            optimization_print = "Iteration: {} epochs:{}, Training: RAE:{}, Loss: {}," \
                                 " Ranking:{}, Reg:{}, Lik:{}, T_Recon:{}, CI:{}" \
                                 " Validation RAE:{} Loss:{}, Ranking:{}, Reg:{}, Lik:{}, T_Recon:{}, CI:{}, {}" \
                .format(i + 1, epochs, train_rae, train_cost, train_ranking, train_reg, train_lik,
                        train_recon, train_ci, valid_rae, valid_cost, valid_ranking, valid_reg,
                        valid_lik, valid_recon, valid_ci, improved_str)
            print(optimization_print)
            logging.debug(optimization_print)
            # Stop on stalled validation CI, NaN validation cost, or the
            # epoch cap; the printed message is the same in all three cases.
            if i - last_improvement > self.require_improvement or math.isnan(
                    valid_cost) or epochs >= self.max_epochs:
                print(
                    "No improvement found in a while, stopping optimization."
                )
                # Break out from the for-loop.
                break
    # Ending time.
    end_time = time.time()
    time_dif = end_time - start_time
    time_dif_print = "Time usage: " + str(
        timedelta(seconds=int(round(time_dif))))
    print(time_dif_print)
    logging.debug(time_dif_print)
    # shutdown everything to avoid zombies
    self.session.run(self.queue.close(cancel_pending_enqueues=True))
    self.coord.request_stop()
    self.coord.join(self.threads)
    return best_validation_epoch, epochs
def run_comb(i):
    """Train a model with one modality optionally removed and log C-indices.

    Splits the module-level ``clinical``, ``rnaseq``, ``mirna`` and
    ``target`` data, trains a ``Model`` on the training part, then for
    every cancer type in ``patient_cancer_type`` computes three
    concordance indices and appends them to ``table2_<modalities>.txt``.

    Parameters:
        i: name of the modality to drop (``"gene_expression"``,
            ``"mirna"`` or ``"clinical"``), or ``""`` to keep all three.

    Returns:
        None. Results are printed and appended to the output file.
    """
    clinical_train, _, rnaseq_train, _, mirna_train, _, target_train, target_test = train_test_split(
        clinical, rnaseq, mirna, target, test_size=0.15, stratify=patient_cancer_type)
    Mo = Model(clinical_input=clinical_input,
               gene_expression_input=gene_expression_input,
               mirna_input=mirna_input)
    device = "cpu"
    # list(range(...)) instead of a comprehension that reuses the name `i`.
    target_train.index = list(range(len(target_train)))
    data = {
        "gene_expression": torch.tensor(rnaseq_train, device=device),
        "mirna": torch.tensor(mirna_train, device=device),
        "clinical": torch.tensor(clinical_train, device=device),
    }
    if i != "":
        del data[i]

    # Loop invariants: full-cohort arrays and the test targets.
    rnaseq_all = np.array(rnaseq)
    mirna_all = np.array(mirna)
    clinical_all = np.array(clinical)
    days_to_death = target_test["days_to_death"].values
    vital_status = target_test["vital_status"].values

    # `with` guarantees the file is closed even if training or scoring raises.
    with open("table2_{0}.txt".format(data.keys()), "a") as f:
        f.write("\nFiles used: {0}".format(ld.files))
        f.write("{0}".format(data.keys()))
        now = time.time()
        Mo.train(data, target_train, n_batches=10)
        took = time.time() - now
        print("Train time:", took)
        f.write("Took {0}".format(took))

        for cancer_type in set(patient_cancer_type):
            indexes = [
                index for index, value in enumerate(patient_cancer_type)
                if value == cancer_type
            ]
            type_rnaseq = [list(row) for row in rnaseq_all[indexes]]
            type_mirna = [list(row) for row in mirna_all[indexes]]
            type_clinical = [list(row) for row in clinical_all[indexes]]
            print("\nTesting Data---for", cancer_type)
            f.write("\nTesting Data---for{0}--{1}".format(cancer_type, i))
            data = {
                "gene_expression": torch.tensor(type_rnaseq),
                "mirna": torch.tensor(type_mirna),
                "clinical": torch.tensor(type_clinical),
            }
            if i != "":
                del data[i]
            hazard = Mo(data)["hazard"].detach()
            # NOTE(review): `hazard` covers every patient of this cancer
            # type while `days_to_death` comes from `target_test`; the
            # lengths presumably differ, in which case every score below
            # is written as "None" - confirm the intended evaluation set.
            c_index_1 = _c_index_or_none(days_to_death, -hazard)
            c_index_2 = _c_index_or_none(days_to_death, hazard)
            c_index_3 = _c_index_or_none(days_to_death, -hazard,
                                         np.logical_not(vital_status))
            write = "\nC_index:{0} {1} {2}".format(c_index_1, c_index_2, c_index_3)
            f.write(write)


def _c_index_or_none(*args):
    """Return ``concordance_index(*args)``, or the string "None" if it fails."""
    try:
        return concordance_index(*args)
    except Exception:
        # Best effort by design; unlike a bare `except:` this lets
        # KeyboardInterrupt / SystemExit propagate.
        return "None"
def CIndex_lifeline(hazards, labels, survtime_all):
    """Return the concordance index of negated ``hazards`` against ``survtime_all``.

    ``labels`` is passed through as the third argument of
    ``concordance_index``.
    """
    negated_hazards = -hazards
    return concordance_index(survtime_all, negated_hazards, labels)
def _fit_static(self, dataframe, duration_col, event_col=None,
                timeline=None, show_progress=True):
    """
    Perform inference on the coefficients of the Aalen additive model.

    Parameters:
        dataframe: a pandas dataframe, with covariates and a duration_col
            and a event_col. One row per individual. duration_col refers
            to how long the individual was observed for. event_col is a
            boolean: 1 if individual 'died', 0 else.
        duration_col: specify what the duration column is called in the
            dataframe
        event_col: specify what the event occurred column is called in
            the dataframe. If falsy, every individual is treated as
            non-censored.
        timeline: reformat the estimates index to a new timeline.
        show_progress: include a fancy progress bar!

    Returns:
        None. The fit is stored on ``self``: ``hazards_``,
        ``cumulative_hazards_``, ``variance_``, ``timeline``,
        ``durations``, ``event_observed`` and ``score_``.
    """
    from_tuples = pd.MultiIndex.from_tuples
    df = dataframe.copy()

    # set unique ids for individuals
    id_col = 'id'
    ids = np.arange(df.shape[0])
    df[id_col] = ids

    # if the regression should fit an intercept
    if self.fit_intercept:
        df['baseline'] = 1.

    # if no event_col is specified, assume all non-censorships
    if event_col:
        c = df[event_col].values
        del df[event_col]
    else:
        c = np.ones_like(ids)

    # each individual should have an ID of time of leaving study
    C = pd.Series(c, dtype=bool, index=ids)
    T = pd.Series(df[duration_col].values, index=ids)

    df = df.set_index(id_col)
    pass_for_numeric_dtypes_or_raise(df)

    ix = T.argsort()
    T, C = T.iloc[ix], C.iloc[ix]

    del df[duration_col]
    n, d = df.shape
    columns = df.columns

    # initialize dataframe to store estimates
    # `.items()` replaces `.iteritems()`, which was removed in pandas 2.0.
    non_censorsed_times = list(T[C].items())
    n_deaths = len(non_censorsed_times)

    hazards_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                            index=from_tuples(non_censorsed_times)).swaplevel(1, 0)
    variance_ = pd.DataFrame(np.zeros((n_deaths, d)), columns=columns,
                             index=from_tuples(non_censorsed_times)).swaplevel(1, 0)

    # initialize loop variables.
    previous_hazard = np.zeros((d,))
    progress = progress_bar(n_deaths)
    to_remove = []
    t = T.iloc[0]
    i = 0

    # Loop names avoid shadowing the builtin `id` and the `time` module.
    for individual_id, event_time in T.items():  # should be sorted.

        if t != event_time:
            assert t < event_time
            # remove the individuals from the previous loop.
            df.iloc[to_remove] = 0.
            to_remove = []
            t = event_time

        to_remove.append(individual_id)
        if C[individual_id] == 0:
            continue

        relevant_individuals = (ids == individual_id)
        assert relevant_individuals.sum() == 1.

        # perform linear regression step.
        try:
            v, V = lr(df.values, relevant_individuals, c1=self.coef_penalizer,
                      c2=self.smoothing_penalizer, offset=previous_hazard)
        except LinAlgError:
            # NOTE(review): execution continues after this message, so
            # `v`/`V` are either unbound (first failure) or stale from
            # the previous step - confirm the intended recovery.
            print("Linear regression error. Try increasing the penalizer term.")

        hazards_.loc[event_time, individual_id] = v.T
        variance_.loc[event_time, individual_id] = V[:, relevant_individuals][:, 0] ** 2
        previous_hazard = v.T

        # update progress bar
        if show_progress:
            i += 1
            progress.update(i)

    # print a new line so the console displays well
    if show_progress:
        print()

    # not sure this is the correct thing to do.
    self.hazards_ = hazards_.groupby(level=0).sum()
    self.cumulative_hazards_ = self.hazards_.cumsum()
    self.variance_ = variance_.groupby(level=0).sum()

    if timeline is not None:
        self.hazards_ = self.hazards_.reindex(timeline, method='ffill')
        self.cumulative_hazards_ = self.cumulative_hazards_.reindex(timeline, method='ffill')
        self.variance_ = self.variance_.reindex(timeline, method='ffill')
        self.timeline = timeline
    else:
        self.timeline = self.hazards_.index.values.astype(float)

    self.durations = T
    self.event_observed = C
    self._compute_confidence_intervals()
    self.score_ = concordance_index(self.durations,
                                    self.predict_median(dataframe).values.ravel(),
                                    self.event_observed)
    return
def trainCox(dataroot='./data/TCGA_GBMLGG/',
             ckpt_name='./checkpoints/surv_15_cox/',
             model='cox_omic',
             penalizer=1e-4):
    """Fit a Cox model on every PNAS split and pickle predictions and C-indices.

    Parameters:
        dataroot: directory prefix containing ``pnas_splits.csv``; also
            forwarded to ``getCleanAllDataset``.
        ckpt_name: root checkpoint directory; outputs go to
            ``<ckpt_name>/<model>/``.
        model: key selecting the feature set from ``model_feats``.
        penalizer: penalizer passed to ``CoxPHFitter``.

    Returns:
        None. For each split it writes ``<model>_<k>_pred_train.pkl`` and
        ``<model>_<k>_pred_test.pkl``, then ``<model>_results.pkl`` with
        the per-split C-indices, and prints the results.
    """
    ### Creates Checkpoint Directory
    # makedirs creates intermediate directories, so one call covers both levels.
    out_dir = os.path.join(ckpt_name, model)
    os.makedirs(out_dir, exist_ok=True)

    ### Load PNAS Splits
    pnas_splits = pd.read_csv(dataroot + 'pnas_splits.csv')
    pnas_splits.columns = ['TCGA ID'] + [str(k) for k in range(1, 16)]
    pnas_splits.index = pnas_splits['TCGA ID']
    pnas_splits = pnas_splits.drop(['TCGA ID'], axis=1)

    ### Loads Data
    ignore_missing_moltype = model in [
        'cox_omic', 'cox_moltype', 'cox_grade+moltype', 'all'
    ]
    ignore_missing_histype = model in [
        'cox_histype', 'cox_grade', 'cox_grade+moltype', 'all'
    ]
    all_dataset = getCleanAllDataset(
        dataroot=dataroot,
        ignore_missing_moltype=ignore_missing_moltype,
        ignore_missing_histype=ignore_missing_histype)[1]

    # For 'cox_omic' and 'cox_all' the list names columns to EXCLUDE; for
    # the other models it names the columns to use.
    # NOTE(review): the flags above test for 'all' but this dict is keyed
    # 'cox_all' - confirm which spelling callers use.
    model_feats = {
        'cox_omic': [
            'TCGA ID', 'Histology', 'Grade', 'Molecular subtype',
            'Histomolecular subtype'
        ],
        'cox_moltype':
        ['Survival months', 'censored', 'codeletion', 'idh mutation'],
        'cox_histype': ['Survival months', 'censored', 'Histology'],
        'cox_grade': ['Survival months', 'censored', 'Grade'],
        'cox_grade+moltype':
        ['Survival months', 'censored', 'codeletion', 'idh mutation', 'Grade'],
        'cox_all': ['TCGA ID', 'Histomolecular subtype']
    }

    # The feature set does not depend on the split, so compute it once.
    if model in ('cox_omic', 'cox_all'):
        feats = all_dataset.columns.drop(model_feats[model])
    else:
        feats = model_feats[model]

    cv_results = []
    for k in pnas_splits.columns:
        pat_train = list(
            set(pnas_splits.index[pnas_splits[k] == 'Train']).intersection(
                all_dataset.index))
        pat_test = list(
            set(pnas_splits.index[pnas_splits[k] == 'Test']).intersection(
                all_dataset.index))
        train = all_dataset.loc[pat_train]
        test = all_dataset.loc[pat_test]

        cph = CoxPHFitter(penalizer=penalizer)
        cph.fit(train[feats],
                duration_col='Survival months',
                event_col='censored',
                show_progress=False)
        cin = concordance_index(test['Survival months'],
                                -cph.predict_partial_hazard(test[feats]),
                                test['censored'])
        cv_results.append(cin)

        train.insert(loc=0, column='Hazard',
                     value=-cph.predict_partial_hazard(train))
        test.insert(loc=0, column='Hazard',
                    value=-cph.predict_partial_hazard(test))
        _dump_pickle(train, os.path.join(out_dir, '%s_%s_pred_train.pkl' % (model, k)))
        _dump_pickle(test, os.path.join(out_dir, '%s_%s_pred_test.pkl' % (model, k)))

    _dump_pickle(cv_results, os.path.join(out_dir, '%s_results.pkl' % model))
    print("C-Indices across Splits", cv_results)
    print("Average C-Index: %f" % CI_pm(cv_results))


def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing the file even if dumping fails."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)
def fit(self, df, duration_col, event_col=None, show_progress=False, initial_beta=None, strata=None, step_size=None, weights_col=None):
    """
    Fit the Cox Proportional Hazard model to a dataset. Tied survival times
    are handled using Efron's tie-method.

    Parameters:
      df: a Pandas dataframe with necessary columns `duration_col` and
         `event_col`, plus other covariates. `duration_col` refers to
         the lifetimes of the subjects. `event_col` refers to whether
         the 'death' events was observed: 1 if observed, 0 else (censored).
         The dataframe is copied, so the caller's object is not modified.
      duration_col: the column in dataframe that contains the subjects'
         lifetimes.
      event_col: the column in dataframe that contains the subjects' death
         observation. If left as None, assume all individuals are non-censored.
      weights_col: an optional column in the dataframe that denotes the weight per subject.
         This column is expelled and not used as a covariate, but as a weight in the
         final regression. Default weight is 1.
      show_progress: since the fitter is iterative, show convergence
         diagnostics.
      initial_beta: initialize the starting point of the iterative
         algorithm. Default is the zero vector.
      strata: specify a list of columns to use in stratification. This is useful if a
         categorical covariate does not obey the proportional hazard assumption. This
         is used similar to the `strata` expression in R.
         See http://courses.washington.edu/b515/l17.pdf.
      step_size: forwarded unchanged to the Newton-Raphson routine.

    Returns:
        self, with additional properties: hazards_
    """
    # Work on a copy: columns are deleted/popped from `df` below.
    df = df.copy()
    # Sort on time
    df = df.sort_values(by=duration_col)
    self._n_examples = df.shape[0]
    # An explicit `strata` argument is coalesced with the instance-level setting.
    self.strata = coalesce(strata, self.strata)
    if self.strata is not None:
        # Keep the pre-stratification index so the stored durations and
        # events can be re-indexed with it further down.
        original_index = df.index.copy()
        df = df.set_index(self.strata)
    # Extract time and event
    T = df[duration_col]
    del df[duration_col]
    if event_col is None:
        # No event column: every subject is treated as observed.
        E = pd.Series(np.ones(df.shape[0]), index=df.index)
    else:
        E = df[event_col]
        del df[event_col]
    if weights_col:
        weights = df.pop(weights_col)
        # Non-integer weights only trigger a warning; fitting continues.
        if (weights.astype(int) != weights).any():
            warnings.warn(
                """It looks like your weights are not integers, possibly propensity scores then? It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis" """,
                RuntimeWarning)
    else:
        # Default: unit weight per subject.
        weights = pd.DataFrame(np.ones((self._n_examples, 1)), index=df.index)
    self._check_values(df, T, E)
    df = df.astype(float)
    # save fitting data for later
    self.durations = T.copy()
    self.event_observed = E.copy()
    if self.strata is not None:
        self.durations.index = original_index
        self.event_observed.index = original_index
    self.event_observed = self.event_observed.astype(bool)
    # Normalization statistics of the covariates; the fit runs on the
    # normalized frame and the coefficients are divided by the std below.
    self._norm_mean = df.mean(0)
    self._norm_std = df.std(0)
    E = E.astype(bool)
    hazards_ = self._newton_rhaphson(normalize(df, self._norm_mean, self._norm_std), T, E, weights=weights, initial_beta=initial_beta, show_progress=show_progress, step_size=step_size)
    self.hazards_ = pd.DataFrame(
        hazards_.T, columns=df.columns, index=['coef']) / self._norm_std
    self.confidence_intervals_ = self._compute_confidence_intervals()
    self.baseline_hazard_ = self._compute_baseline_hazards(df, T, E)
    self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard()
    self.baseline_survival_ = self._compute_baseline_survival()
    # Training concordance, computed on the negated partial hazard.
    self.score_ = concordance_index(
        self.durations,
        -self.predict_partial_hazard(df).values.ravel(),
        self.event_observed)
    # Log partial hazard evaluated at the covariate means.
    self._train_log_partial_hazard = self.predict_log_partial_hazard(
        self._norm_mean.to_frame().T)
    return self
def eval(model, x, y, e):
    """Return the concordance index of ``model`` on ``(x, y, e)``.

    The model output is exponentiated and negated before being passed to
    ``concordance_index`` together with ``y`` and ``e``.
    """
    exponentiated = np.exp(model.predict(x))
    return concordance_index(y, -exponentiated, e)
def surv_coxph(data_train, x_cols, duration_col, event_col,
               data_test=None, pt=None, show_extra=True):
    """Integrate functions that include modeling using Cox Regression and evaluating

    Parameters
    ----------
    data_train : pandas.DataFrame
        Full survival data for train.
    x_cols : list of str
        Name of column indicating variables.
    duration_col : str
        Name of column indicating time.
    event_col : str
        Name of column indicating event.
    data_test : pandas.DataFrame
        Full survival data for test, default None.
    pt : float
        Predicted time for AUC.
    show_extra : bool
        When True (default), also print the coefficients, p-values and
        95% confidence bounds from the model summary.

    Returns
    -------
    object
        Object of cox model in `lifelines.CoxPHFitter`.

    Examples
    --------
    >>> surv_coxph(train_data, ['x1', 'x2'], 'T', 'E', test_data, pt=5*12)
    """
    y_cols = [event_col, duration_col]
    cph = CoxPHFitter()
    cph.fit(data_train[x_cols + y_cols], duration_col=duration_col,
            event_col=event_col, show_progress=True)

    # CI of train
    pred_X_train = cph.predict_partial_hazard(data_train[x_cols])
    pred_X_train.rename(columns={0: 'X'}, inplace=True)
    ci_train = concordance_index(data_train[duration_col], -pred_X_train,
                                 data_train[event_col])
    # AUC of train at pt
    df = pd.concat([data_train[y_cols], pred_X_train], axis=1)
    roc_train = surv_roc(df, 'X', duration_col, event_col, pt=pt)

    if data_test is not None:
        # CI of test
        pred_X_test = cph.predict_partial_hazard(data_test[x_cols])
        pred_X_test.rename(columns={0: 'X'}, inplace=True)
        ci_test = concordance_index(data_test[duration_col], -pred_X_test,
                                    data_test[event_col])
        # AUC of test at pt
        df = pd.concat([data_test[y_cols], pred_X_test], axis=1)
        roc_test = surv_roc(df, 'X', duration_col, event_col, pt=pt)

    # Print Summary of CPH
    cph.print_summary()

    # Every print takes exactly one pre-formatted string, so the output is
    # the same under Python 2 (print statement) and Python 3 (function).
    print("__________Metrics CI__________")
    print("CI of train: %.4f" % ci_train)
    if data_test is not None:
        print("CI of test : %.4f" % ci_test)
    print("__________Metrics AUC__________")
    print("AUC of train: %.4f" % roc_train['AUC'])
    if data_test is not None:
        print("AUC of test : %.4f" % roc_test['AUC'])

    if not show_extra:
        return cph

    # Print Coefficients
    print("__________Summary of Coefficients in CPH__________")
    cols = ['coef', 'p', 'lower 0.95', 'upper 0.95']
    # Fetch the summary once instead of once per printed row.
    summary = cph.summary
    print("%s :" % cols[0])
    for i in summary.index:
        print("%.4f" % (summary.loc[i, cols[0]]))
    print("__________")
    print("%s :" % cols[1])
    for i in summary.index:
        print("%.4f" % (summary.loc[i, cols[1]]))
    print("__________")
    print("95% CI :")
    for i in summary.index:
        print("[%.4f, %.4f]" % (summary.loc[i, cols[2]], summary.loc[i, cols[3]]))
    return cph
def test_concordance_index_returns_same_after_shifting():
    """The concordance index must be identical whichever inputs are shifted by 5."""
    T = np.array([1, 2, 3, 4, 5, 6])
    T_ = np.array([2, 1, 4, 6, 5, 3])

    # Evaluated in the same order as the original chained comparison.
    plain = utils.concordance_index(T, T_)
    shifted_both = utils.concordance_index(T - 5, T_ - 5)
    shifted_second = utils.concordance_index(T, T_ - 5)
    shifted_first = utils.concordance_index(T - 5, T_)

    scores = [plain, shifted_both, shifted_second, shifted_first]
    assert all(left == right for left, right in zip(scores, scores[1:]))
def main():
    """Run the cross-validated Cox experiment described by the config file.

    Loads the pickled inputs, then for each outer fold: searches
    hyperparameters on the training part, trains a Cox model with the
    best penalty, computes training and validation concordance indices
    and saves them. Finally plots the per-fold results and prints the
    mean and variance of the validation index.
    """
    args = parse_args()
    conf_path = DEFAULT_CONF_PATH if args.conf_path is None else Path(args.conf_path)

    exp_config = CoxExperimentConfig.from_conf(conf_path)
    exp_config.output_dir.mkdir(parents=True, exist_ok=True)
    hypersearch_config = HypersearchConfig.from_conf(conf_path)
    # Keep a copy of the configuration next to the results.
    shutil.copy(str(conf_path), str(exp_config.output_dir.joinpath("cox.conf")))

    # import input data: y_full=censoring status and survival times for patients,
    # x_full=input data for patients (i.e. motion descriptors [11,514-element vector])
    with open(str(exp_config.data_path), 'rb') as f:
        payload = pickle.load(f)
    x_full, y_full = payload[0], payload[1]
    print(x_full.shape, y_full.shape)
    del payload

    # Per-fold concordance indices
    c_vals = []
    c_trains = []

    splitter = KFold(n_splits=exp_config.n_folds)
    for fold, (train_indices, test_indices) in enumerate(splitter.split(x_full)):
        print(train_indices.shape, test_indices.shape)
        x_train, y_train = x_full[train_indices], y_full[train_indices]
        x_val, y_val = x_full[test_indices], y_full[test_indices]

        # STEP 1: find optimal hyperparameters using CV
        print("Step 1a")
        opars, osummary = hypersearch_cox(
            x_data=x_train,
            y_data=y_train,
            method=exp_config.search_method,
            nfolds=exp_config.n_folds,
            nevals=exp_config.n_evals,
            penalty_range=hypersearch_config.penalty_exp)
        print("Step b")

        # (1b) using optimal hyperparameters, train a model and test its
        # performance on the holdout validation set.
        olog = train_cox_reg(
            xtr=x_train,
            ytr=y_train,
            penalty=10**opars['penalty'],
        )

        # (1c) Compute Harrell's Concordance index
        # (column 1 of y is passed as the times, column 0 as the event indicator)
        pred_val = olog.predict_partial_hazard(x_val)
        c_val = concordance_index(y_val[:, 1], -pred_val, y_val[:, 0])

        pred_train = olog.predict_partial_hazard(x_train)
        c_train = concordance_index(y_train[:, 1], -pred_train, y_train[:, 0])

        c_vals.append(c_val)
        c_trains.append(c_train)
        save_params(opars,
                    osummary,
                    "cv_{}".format(fold),
                    exp_config.output_dir,
                    c_val=c_val,
                    c_train=c_train,
                    c_val_mean=np.mean(c_vals),
                    c_val_var=np.var(c_vals),
                    c_train_mean=np.mean(c_trains),
                    c_train_var=np.var(c_trains))
        print('Validation concordance index = {0:.4f}'.format(c_val))

    plot_cs(c_trains, c_vals, exp_config.output_dir)
    print('Mean Validation concordance index = {0:.4f}'.format(np.mean(c_vals)))
    print('Variance = {0:.4f}'.format(np.var(c_vals)))
def concordance_index(self):
    """Return the concordance index for the rows of ``self.df``.

    Passes the ``'TIME'`` column, the negated ``'LPH'`` column and the
    ``'EVENT'`` column to ``concordance_index``.
    """
    times = self.df['TIME']
    negated_lph = -self.df['LPH']
    events = self.df['EVENT']
    return concordance_index(times, negated_lph, events)
# Fragment: evaluates one fitted model (`model_cv`) on its training and
# validation data. `w`, `key`, `dataTrain`, `dataValid`, `interval_l`, the
# accumulators and the result dicts are defined outside this excerpt
# (presumably by an enclosing cross-validation loop - confirm).
print(w)
# Pull the duration and event arrays out of a dataframe.
get_target = lambda df: (df['time'].values, df['dead'].values)
time_valid, dead_valid = get_target(dataValid)
ypred_train_NN = model_cv.predict_proba(x_train_NN)
ypred_test_NN = model_cv.predict_proba(x_valid_NN)
# One row per subject; the remaining axis is inferred.
ypred_surv_train_NN = ypred_train_NN.reshape([dataTrain.shape[0], -1])
ypred_surv_valid_NN = ypred_test_NN.reshape([dataValid.shape[0], -1])
# Cumulative product of (1 - prediction) along axis 1
# (assumes the predictions are per-interval hazards - TODO confirm).
y_pred_valid_surv = np.cumprod((1 - ypred_surv_valid_NN), axis=1)
y_pred_train_surv = np.cumprod((1 - ypred_surv_train_NN), axis=1)
# Column 50 is taken as the score; the names suggest it is the one-year
# mark - verify against `interval_l`.
oneyr_surv_train = y_pred_train_surv[:, 50]
oneyr_surv_valid = y_pred_valid_surv[:, 50]
# Transposed frames indexed by `interval_l`, the layout handed to EvalSurv.
surv_valid = pd.DataFrame(np.transpose(y_pred_valid_surv))
surv_valid.index = interval_l
surv_train = pd.DataFrame(np.transpose(y_pred_train_surv))
surv_train.index = interval_l
# Concordance without an event indicator (no third argument is passed).
dict_cv_cindex_train[key] = concordance_index(dataTrain.time, oneyr_surv_train)
dict_cv_cindex_valid[key] = concordance_index(dataValid.time, oneyr_surv_valid)
#scores_train += concordance_index(dataTrain.time,oneyr_surv_train)#,data_train.dead)
#scores_test += concordance_index(dataValid.time,oneyr_surv_valid)
#cta.append(concordance_index(dataTrain.time,oneyr_surv_train))
#cte.append(concordance_index(dataValid.time,oneyr_surv_valid))
# concordance_td() from EvalSurv is what gets accumulated into the scores.
ev_valid = EvalSurv(surv_valid, time_valid, dead_valid, censor_surv='km')
scores_test += ev_valid.concordance_td()
ev_train = EvalSurv(surv_train, dataTrain['time'].values, dataTrain['dead'].values, censor_surv='km')
scores_train += ev_train.concordance_td()
# NOTE(review): these recompute the same values already stored in
# dict_cv_cindex_train[key] / dict_cv_cindex_valid[key] above.
cta.append(concordance_index(dataTrain.time, oneyr_surv_train))
cte.append(concordance_index(dataValid.time, oneyr_surv_valid))
def test_(self, data_generator, model, repurposing_mode = False, test = False):
    """Run ``model`` over ``data_generator`` and return predictions/metrics.

    Parameters:
        data_generator: iterable yielding ``(v_d, v_p, label)`` batches.
        model: the network to evaluate. It is switched to eval mode for the
            pass and back to train mode afterwards.
        repurposing_mode: when true, return only the list of predictions.
        test: when true (binary task only), also write ROC and PR curves to
            ``self.result_folder``.

    Returns:
        * ``repurposing_mode``: ``y_pred`` (list of floats).
        * binary task: ``(roc_auc, average_precision, f1, log_loss, y_pred)``.
        * otherwise: ``(mse, pearson_r, pearson_p, concordance_index, y_pred)``.
    """
    y_pred = []
    y_label = []
    model.eval()
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for v_d, v_p, label in data_generator:
            # MPNN / Transformer encodings are passed through untouched;
            # every other encoding is cast to float and moved to the device.
            if self.drug_encoding not in ("MPNN", "Transformer"):
                v_d = v_d.float().to(self.device)
            if self.target_encoding != 'Transformer':
                v_p = v_p.float().to(self.device)

            # Evaluate the model that was handed in (and put in eval mode
            # above), not self.model, so callers can score e.g. a saved
            # best-epoch copy.
            score = model(v_d, v_p)
            if self.binary:
                score = torch.sigmoid(score)
            logits = torch.squeeze(score).detach().cpu().numpy()

            y_label.extend(label.to('cpu').numpy().flatten().tolist())
            y_pred.extend(logits.flatten().tolist())
    model.train()

    if repurposing_mode:
        return y_pred

    if self.binary:
        ## ROC-AUC curve
        if test:
            curve_name = self.drug_encoding + '_' + self.target_encoding
            roc_auc_file = os.path.join(self.result_folder, "roc-auc.jpg")
            plt.figure(0)
            roc_curve(y_pred, y_label, roc_auc_file, curve_name)
            plt.figure(1)
            pr_auc_file = os.path.join(self.result_folder, "pr-auc.jpg")
            prauc_curve(y_pred, y_label, pr_auc_file, curve_name)

        # Hard 0/1 decisions at the 0.5 threshold, computed once.
        outputs = (np.asarray(y_pred) >= 0.5).astype(int)
        return (roc_auc_score(y_label, y_pred),
                average_precision_score(y_label, y_pred),
                f1_score(y_label, outputs),
                log_loss(y_label, outputs),
                y_pred)

    mse = mean_squared_error(y_label, y_pred)
    pearson_r, pearson_p = pearsonr(y_label, y_pred)
    return mse, pearson_r, pearson_p, concordance_index(y_label, y_pred), y_pred
lrfinder = model.lr_finder(x_train, y_train, batch_size, tolerance=10) best = lrfinder.get_best_lr() model.optimizer.set_lr(best) epochs = args.epochs callbacks = [tt.callbacks.EarlyStopping(patience=patience)] verbose = True log = model.fit(x_train, y_train_transformed, batch_size, epochs, callbacks, verbose, val_data = val_transformed, val_batch_size = batch_size) # Evaluation =================================================================== val_loss = min(log.monitors['val_'].scores['loss']['score']) # get Ctd ctd = concordance_index(event_times = durations_test_transformed, predicted_scores = model.predict(x_test).reshape(-1), event_observed = events_test) # set time grid for numerical integration to get IBS and IBLL if durations_test.min()>0: time_grid = np.linspace(durations_test.min(), durations_test.max(), 100) else: durations_test_copy = durations_test.copy() durations_test_copy.sort() time_grid = np.linspace(durations_test_copy[1], durations_test.max(), 100) # time_grid = np.linspace(durations_test.min(), durations_test.max(), 100) # transform time grid into DSAFT scale for fair comparison # pdb.set_trace() time_grid = np.exp(scaler_train.transform(np.log(time_grid.reshape(-1, 1)))).reshape(-1) # grid interval for numerical integration ds = np.array(time_grid - np.array([0.0] + time_grid[:-1].tolist()))
presorted_times=True, kernel_function=kernel_func) # --------------------------------------------------------- # compute c-index # if cindex_method == 'cum_haz': cum_haz = \ surv_model.predict_cum_haz( fold_X_val_standardized, sorted_fold_y_val, presorted_times=True, kernel_function=kernel_func) cum_hazard_scores = cum_haz.sum(axis=1) cindex = concordance_index(fold_y_val[:, 0], -cum_hazard_scores, fold_y_val[:, 1]) elif cindex_method == 'cum_haz_from_surv': surv_thresholded = np.maximum(surv, np.finfo(float).eps) cum_haz = -np.log(surv_thresholded) cum_hazard_scores = cum_haz.sum(axis=1) cindex = concordance_index(fold_y_val[:, 0], -cum_hazard_scores, fold_y_val[:, 1]) elif cindex_method == 'median': predicted_medians = \ np.array([compute_median_survival_time(mesh_points, surv_row) for surv_row in surv]) cindex = concordance_index(fold_y_val[:, 0],
else: count+=meta_parameters_dictionary['test_labels_{}'.format(f)].shape[1] clinical_features_size=count model = get_model(cube_size, clinical_features_size,kernel_size = (3,3,3)) model= load_model(model_filename,custom_objects = {'cox_regression_loss':cox_regression_loss}) # model.load_weights(checkpoint_dir+'cyclic_{}_{}.h5'.format(test_cohort,i)) test_generator = data(meta_parameters_dictionary,batch_size,False,False) preds = model.predict_generator(test_generator,verbose=1) preds = np.squeeze(preds) df = pd.DataFrame() df = df.assign(hazard = preds[0]) df = df.assign(pred_class = (preds[1])) df = df.assign(actual_months = meta_parameters_dictionary['test_labels_months']) df = df.assign(is_dead = meta_parameters_dictionary['test_labels_is_dead']) df.to_csv(test_cohort+'_preds.csv') print(concordance_index(df.actual_months,-df.hazard,df.is_dead)) fpr,tpr,thresholds = roc_curve(meta_parameters_dictionary['test_labels'],np.array(df.pred_class)) opt_threshold = thresholds[np.argmax(tpr - fpr)] class_pred = np.zeros_like(np.array(df.pred_class)) class_pred[np.where(np.array(df.pred_class)>opt_threshold)]=1 df = df.assign(class_preds=class_pred) T = df['actual_months'] E = df['is_dead'] # ix = (df.class_preds==1) thres = np.median(df.hazard) ix = df.hazard < thres kmf = KaplanMeierFitter() kmf.fit(T[~ix],E[~ix],label='high-risk') ax = kmf.plot() kmf.fit(T[ix],E[ix],label='low-risk') kmf.plot(ax=ax)
def CIndex_lifeline(hazards, labels, survtime_all):
    """Compute the concordance index from torch tensors.

    Parameters:
        hazards: tensor of predicted hazards; flattened to 1-D and negated
            before scoring.
        labels: tensor passed as the event-observed argument.
        survtime_all: event times, passed through unchanged.

    Returns:
        The value returned by ``concordance_index``.
    """
    # detach() first: calling .numpy() on a tensor that still requires grad
    # raises a RuntimeError.
    observed = labels.detach().cpu().numpy()
    flat_hazards = hazards.detach().cpu().numpy().reshape(-1)
    return concordance_index(survtime_all, -flat_hazards, observed)
def train(self, trainingData, validationData=None, validation_freq=10):
    """Train the network and collect per-epoch loss / concordance statistics.

    Parameters:
        trainingData: mapping with keys ``'x'`` (covariates), ``'e'`` (event
            indicators) and ``'t'`` (times).
        validationData: optional mapping with the same keys. When given, the
            validation loss and concordance index are recorded every
            ``validation_freq`` epochs and drive the patience-based stopping.
        validation_freq: number of epochs between validation checks.

    Returns:
        ``self.trainingStats`` with a ``"training"`` entry and, if validation
        data was supplied, a ``"validation"`` entry.

    Raises:
        AssertionError: if ``self.params.modelPath`` is falsy, the number of
            covariates differs from ``self.params.n_in``, or ``e`` / ``t``
            are not one-dimensional.
    """
    # Rows are ordered by descending time; only tdata is needed for that.
    xdata, edata, tdata = trainingData['x'], trainingData['e'], trainingData['t']
    sort_idx = numpy.argsort(tdata)[::-1]
    xdata = xdata[sort_idx]
    edata = edata[sort_idx].astype(numpy.float32)
    tdata = tdata[sort_idx]

    if validationData:
        xdata_valid, edata_valid, tdata_valid = (
            validationData['x'], validationData['e'], validationData['t'])
        sort_idx = numpy.argsort(tdata_valid)[::-1]
        xdata_valid = xdata_valid[sort_idx]
        edata_valid = edata_valid[sort_idx].astype(numpy.float32)
        tdata_valid = tdata_valid[sort_idx]

    ##TODO : cache
    if self.params.standardize:
        mean, std = xdata.mean(axis=0), xdata.std(axis=0)
        xdata = (xdata - mean) / std
        # Validation data reuses the training statistics. It is only touched
        # when it exists; previously this line ran unconditionally and raised
        # UnboundLocalError for training-only runs.
        if validationData:
            xdata_valid = (xdata_valid - mean) / std

    assert self.params.modelPath
    assert xdata.shape[1] == self.params.n_in, "invalid number of covariates"
    assert (edata.ndim == 1) and (tdata.ndim == 1)  ##sanity check

    train_losses, train_ci, train_index = [], [], []
    validation_losses, validation_ci, validation_index = [], [], []
    best_validation_loss = numpy.inf
    best_params_idx = -1

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())  ##init graph with given initializers

        ##start training
        for epoch in range(self.params.n_epochs):
            loss, risk, _ = sess.run(
                [self.loss, self.risk, self.grad_step],
                feed_dict={self.x: xdata, self.e: edata})
            train_losses.append(loss)
            train_ci.append(
                concordance_index(tdata, -numpy.exp(risk.ravel()), edata))
            train_index.append(epoch)

            ##frequently check metrics on validation data
            if validationData and (epoch % validation_freq == 0):
                vloss, vrisk = sess.run(
                    [self.loss, self.risk],
                    feed_dict={self.x: xdata_valid, self.e: edata_valid})
                validation_losses.append(vloss)
                validation_ci.append(
                    concordance_index(tdata_valid,
                                      -numpy.exp(vrisk.ravel()),
                                      edata_valid))
                validation_index.append(epoch)

                # improve patience if loss improves enough
                if vloss < best_validation_loss * self.params.improvement_threshold:
                    self.params.patience = max(
                        self.params.patience,
                        epoch * self.params.patience_increase)
                    best_params_idx = epoch
                    best_validation_loss = vloss

            if self.params.patience <= epoch:
                break

        print("Training done")
        print("Best epoch", best_params_idx)
        print("Best loss", best_validation_loss)

        ##save model
        saver = tf.train.Saver()
        saver.save(sess, self.params.modelPath)

    self.trainingStats["training"] = {
        "loss": train_losses,
        "ci": train_ci,
        "epochs": train_index,
        "type": "training"
    }
    if validationData:
        self.trainingStats["validation"] = {
            "loss": validation_losses,
            "ci": validation_ci,
            "epochs": validation_index,
            "type": "validation"
        }
    return self.trainingStats
def fit(self, df, duration_col, event_col=None, show_progress=False,
        initial_beta=None, strata=None, step_size=None, weights_col=None):
    """
    Fit the Cox Proportional Hazard model to a dataset. Tied survival times
    are handled using Efron's tie-method.

    Parameters:
      df: a Pandas dataframe with necessary columns `duration_col` and
         `event_col`, plus other covariates. `duration_col` refers to
         the lifetimes of the subjects. `event_col` refers to whether
         the 'death' event was observed: 1 if observed, 0 else (censored).
      duration_col: the column in dataframe that contains the subjects'
         lifetimes.
      event_col: the column in dataframe that contains the subjects' death
         observation. If left as None, assume all individuals are non-censored.
      weights_col: an optional column in the dataframe that denotes the weight
         per subject. This column is removed and not used as a covariate, but
         as a weight in the final regression. Default weight is 1.
      show_progress: since the fitter is iterative, show convergence
         diagnostics.
      initial_beta: initialize the starting point of the iterative
         algorithm. Default is the zero vector.
      strata: specify a list of columns to use in stratification. This is
         useful if a categorical covariate does not obey the proportional
         hazard assumption. This is used similar to the `strata` expression
         in R. See http://courses.washington.edu/b515/l17.pdf.

    Returns:
        self, with additional properties: hazards_
    """
    # Work on a private copy, ordered by duration.
    covariates = df.copy().sort_values(by=duration_col)
    self._n_examples = covariates.shape[0]

    self.strata = coalesce(strata, self.strata)
    stratified = self.strata is not None
    if stratified:
        # Remember the row labels: set_index replaces them with the strata.
        original_index = covariates.index.copy()
        covariates = covariates.set_index(self.strata)

    # Split durations / events / weights off the covariate matrix.
    T = covariates.pop(duration_col)
    if event_col is None:
        E = pd.Series(np.ones(covariates.shape[0]), index=covariates.index)
    else:
        E = covariates.pop(event_col)
    weights = (covariates.pop(weights_col).values if weights_col
               else np.ones(self._n_examples))

    self._check_values(covariates, E)
    covariates = covariates.astype(float)

    # Keep the fitting data for later, labelled with the caller's index.
    durations = T.copy()
    observed = E.copy()
    if stratified:
        durations.index = original_index
        observed.index = original_index
    self.durations = durations
    self.event_observed = observed.astype(bool)

    self._norm_mean = covariates.mean(0)
    self._norm_std = covariates.std(0)

    E = E.astype(bool)
    normalized = normalize(covariates, self._norm_mean, self._norm_std)
    hazards_ = self._newton_rhaphson(normalized, T, E,
                                     weights=weights,
                                     initial_beta=initial_beta,
                                     show_progress=show_progress,
                                     step_size=step_size)

    # Coefficients were estimated on normalized covariates; divide by the
    # standard deviations to express them on the original scale.
    coefficients = pd.DataFrame(hazards_.T, columns=covariates.columns,
                                index=['coef'])
    self.hazards_ = coefficients / self._norm_std
    self.confidence_intervals_ = self._compute_confidence_intervals()

    self.baseline_hazard_ = self._compute_baseline_hazards(covariates, T, E)
    self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard()
    self.baseline_survival_ = self._compute_baseline_survival()

    partial_hazards = self.predict_partial_hazard(covariates).values.ravel()
    self.score_ = concordance_index(self.durations, -partial_hazards,
                                    self.event_observed)
    self._train_log_partial_hazard = self.predict_log_partial_hazard(
        self._norm_mean.to_frame().T)
    return self
def train(self, x, c, s, names, fold, n_feature=50):
    """Grid-search the learning rate and return the best dev-set c-index.

    The first three quarters of the rows are used for training and the rest
    as a development set. For every (dropout, learning rate) pair the model
    is compiled with a Cox negative log partial likelihood, fitted
    full-batch, reloaded from the path handed to ``MyCallback`` and scored
    on the development set with ``concordance_index``.

    Side effects: sets ``self.model``, ``self.bestmodelpath`` and
    ``self.cindex_dev_max``.

    Returns:
        The highest development-set concordance index found.
    """
    #learning_ratio = 1e-3
    split = x.shape[0] * 3 // 4
    x = self.preprocess(x, c, s, names, fold, n_feature, split)

    x_trn, x_dev = x[:split], x[split:]
    # The indicator is flipped (1 - c) for both partitions.
    c_trn, c_dev = 1 - c[:split], 1 - c[split:]
    s_trn, s_dev = s[:split], s[split:]

    # Training rows are ordered by descending s, so the cumulative sum in
    # the loss below runs over that ordering.
    order = np.argsort(s_trn)[::-1]
    x_trn, s_trn, c_trn = x_trn[order], s_trn[order], c_trn[order]

    def nll(E, NUM_E):
        """Build a Keras loss closed over the indicator vector and its sum."""
        def loss(y_true, y_pred):
            log_risk = K.log(K.cumsum(K.exp(y_pred)))
            per_sample = (K.transpose(y_pred) - log_risk) * E
            return -K.sum(per_sample) / NUM_E
        return loss

    input_size = len(x[0])
    scores_by_path = {}
    # for dropout in [0.0, 0.5]:
    for dropout in [0.0]:
        self.model = self.get_model(input_size, dropout)
        for lr in [0.1, 0.01, 0.001, 0.0001]:
            print('############## Run at ', fold, dropout, lr)
            self.model.compile(loss=[nll(c_trn, np.sum(c_trn))],
                               optimizer=optimizers.Adam(lr=lr))

            modelpath = self.out_folder + '/%s/%s_(%d)_%0.1f_%0.5f.hdf5' % (
                self.model_name, self.cancer, fold, dropout, lr)
            checkpoint = MyCallback(
                modelpath, (x_trn, c_trn, s_trn, x_dev, c_dev, s_dev))
            self.model.fit(x_trn, s_trn,
                           epochs=self.epochs,
                           batch_size=len(x_trn),
                           verbose=0,
                           shuffle=False,
                           callbacks=[checkpoint])

            self.model.load_weights(modelpath)
            raw_dev = self.model.predict(x_dev, batch_size=1, verbose=1)
            scores_by_path[modelpath] = concordance_index(
                s_dev, -np.exp(raw_dev), c_dev)
            self.reset_weights()

    # First path with the highest score wins.
    self.bestmodelpath = max(scores_by_path, key=scores_by_path.get)
    self.cindex_dev_max = scores_by_path[self.bestmodelpath]
    return self.cindex_dev_max
def get_concordance_index(self, xdata, edata, tdata):
    """Score ``self.predict(xdata)`` against the observed outcomes.

    The prediction is exponentiated and negated before being passed to
    ``concordance_index`` together with the times ``tdata`` and the event
    indicators ``edata``.
    """
    negated_hazards = -numpy.exp(self.predict(xdata))
    return concordance_index(tdata, negated_hazards, edata)
#Initialize lists to store predictions preds_bootfull = [] inds_inbag = [] Cb_opts = [] #STEP 1 #(1a) find optimal hyperparameters opars, osummary = hypersearch_DL(x_data=x_full, y_data=y_full, method='particle swarm', nfolds=6, nevals=50, lrexp_range=[-6.,-4.5], l1rexp_range=[-7,-4], dro_range=[.1,.9], units1_range=[75,250], units2_range=[5,20], alpha_range=[0.3, 0.7], batch_size=16, num_epochs=100) #(1b) using optimal hyperparameters, train a model on full sample olog = DL_single_run(xtr=x_full, ytr=y_full, units1=opars['units1'], units2=opars['units2'], dro=opars['dro'], lr=10**opars['lrexp'], l1r=10**opars['l1rexp'], alpha=opars['alpha'], batchsize=16, numepochs=100) #(1c) Compute Harrell's Concordance index predfull = olog.model.predict(x_full, batch_size=1)[1] C_app = concordance_index(y_full[:,1], -predfull, y_full[:,0]) print('Apparent concordance index = {0:.4f}'.format(C_app)) #BOOTSTRAP SAMPLING #define useful variables nsmp = len(x_full) rowids = [_ for _ in range(nsmp)] B = 100 for b in range(B): print('Current bootstrap sample:', b, 'of', B-1) print('-------------------------------------')
def _pairwise_concordance(frame, size):
    """Return a size x size nested list of concordance indices between rows of ``frame``."""
    return [[concordance_index(frame.iloc[a, :].values, frame.iloc[b, :].values)
             for b in range(size)]
            for a in range(size)]


def get_rf_comparison(basedir):
    """Plot random-forest results per taxonomic level and normalisation.

    For each level, reads ``<basedir>random_forest/<norm>/<level>_overall.csv``
    for the four normalisations and draws four panels in one figure row: a
    heatmap of the scores (x100), their per-normalisation mean, the pairwise
    concordance of the scores, and the pairwise concordance of the mean
    feature values. The figure is saved to
    ``<basedir>/figures/RF_compare<ext>``.

    Relies on the module-level names ``norm_names``, ``rename_plots``,
    ``fs_main``, ``fs_title``, ``ext`` and ``annotate_heatmap``.
    """
    norm = ['rare', 'rel_abun', 'log', 'clr']
    levels = ['phyla', 'classes', 'orders', 'families', 'genera', 'species']
    order_cols, order_rows = [], []
    plt.figure(figsize=(13, 10))
    for l, level in enumerate(levels):
        dfs, dfs_tax = [], []
        for n in norm:
            df = pd.read_csv(basedir+'random_forest/'+n+'/'+level+'_overall.csv', header=0, index_col=0)
            # Some result files have no 'OOB_score' row; drop() signals a
            # missing label with KeyError, and only that case falls back.
            try:
                df_tax = df.drop(['Score', 'OOB_score'], axis=0)
            except KeyError:
                df_tax = df.drop(['Score'], axis=0)
            df_tax['Mean'] = df_tax.mean(axis=1)
            df_tax = pd.DataFrame(df_tax.loc[:, 'Mean'])
            df = pd.DataFrame(df.loc['Score', :])
            df = df.transpose()
            df.rename(index={'Score':n}, inplace=True)
            dfs.append(df)
            df_tax = df_tax.transpose()
            df_tax.rename(index={'Mean':n}, inplace=True)
            dfs_tax.append(df_tax)
        dfs = pd.concat(dfs)*100
        dfs_tax = pd.concat(dfs_tax).fillna(value=0)
        # Two passes: the first sorts the normalisation rows by their mean
        # (and keeps that mean), the second sorts the transposed frame in
        # descending order; each pass ends by transposing, so the frame is
        # back in its starting orientation afterwards.
        for a in range(2):
            dfs['Mean'] = dfs.mean(axis=1)
            dfs = dfs.sort_values(by='Mean', axis=0, ascending=True)
            if a == 1:
                dfs = dfs.sort_values(by='Mean', axis=0, ascending=False)
            else:
                df_mean = pd.DataFrame(dfs.loc[:, 'Mean'])
            dfs.drop('Mean', axis=1, inplace=True)
            dfs = dfs.transpose()
        dfs.rename(index=norm_names, columns=rename_plots, inplace=True)
        df_mean.rename(index=norm_names, inplace=True)
        order_rows = ['Rarefied', 'Relative\nabundance', 'Log', 'CLR']
        # The column order of the first level is reused for all later levels.
        if order_cols == []:
            order_cols = list(dfs.columns)
        else:
            dfs = dfs.loc[:, order_cols]
        dfs = dfs.loc[order_rows, :]
        df_mean = df_mean.loc[order_rows, :]
        ax = plt.subplot2grid((6,120), (l, 5), colspan=76)
        ax_mean = plt.subplot2grid((6, 120), (l, 0), colspan=4)
        ax_con = plt.subplot2grid((6,6), (l, 4))
        ax_con_tax = plt.subplot2grid((6,6), (l, 5))
        # Tick labels only on the bottom row.
        xtcks = False
        if l == 5:
            xtcks = True
        annotate_heatmap(ax, dfs, xticks=xtcks, yticks=False)
        annotate_heatmap(ax_mean, df_mean, xticks=xtcks)
        ax_mean.set_ylabel(level.capitalize(), fontsize=fs_main, fontweight='bold')
        concs = _pairwise_concordance(dfs, 4)
        concs = pd.DataFrame(concs, index=dfs.index.values, columns=dfs.index.values)
        annotate_heatmap(ax_con, concs, cmap='Blues', rnd=2, yticks=False, xticks=xtcks, vmin=0.75)
        concs_tax = _pairwise_concordance(dfs_tax, 4)
        concs_tax = pd.DataFrame(concs_tax, index=dfs_tax.index.values, columns=dfs_tax.index.values)
        concs_tax.rename(columns=norm_names, inplace=True)
        annotate_heatmap(ax_con_tax, concs_tax, cmap='Purples', rnd=2, yticks=False, xticks=xtcks, vmin=0.75)
        if l == 0:
            ax.set_title('Classification accuracy (%)', fontsize=fs_title, fontweight='bold')
            ax_con.set_title('Concordance in\nclassification accuracy', fontsize=fs_main, fontweight='bold')
            ax_con_tax.set_title('Concordance in\nfeature importance', fontsize=fs_main, fontweight='bold')
    plt.savefig(basedir+'/figures/RF_compare'+ext, dpi=600, bbox_inches='tight')
    plt.close()
    return
optimizer=tf.keras.optimizers.Adam( learning_rate=0.001, )) callbacks = [ tf.keras.callbacks.EarlyStopping(monitor='val_coxph', min_delta=0.0001, patience=20, mode='min', restore_best_weights=True) ] history = mil.model.fit(ds_train, steps_per_epoch=4, validation_data=ds_valid, epochs=10000, callbacks=callbacks) y_pred_all = mil.model.predict(ds_all) if concordance_index(samples['times'], np.exp( -1 * y_pred_all[:, 0]), samples['event']) > .52: X = True evaluation = mil.model.evaluate(ds_test) histories.append(history.history) evaluations.append(evaluation) weights.append(mil.model.get_weights()) except: pass ##get ranks per cancer for index, cancer in enumerate(['NA']): mask = np.where(cancer_strat == index)[0] cancer_test_indexes[cancer] = cancer_test_indexes.get( cancer, []) + [mask[np.isin(mask, idx_test, assume_unique=True)]] temp = np.exp(-y_pred_all[mask, 0]).argsort() ranks = np.empty_like(temp) ranks[temp] = np.arange(len(mask))
def c_index3(month, risk, status):
    """Concordance index of negated ``risk`` against ``month`` and ``status``.

    All three inputs are flattened to one dimension first; ``risk`` is
    negated after flattening.
    """
    flat_month = np.reshape(month, -1)
    flat_risk = np.reshape(risk, -1)
    flat_status = np.reshape(status, -1)
    return concordance_index(flat_month, -flat_risk, flat_status)
#def get_bi_lstm_model():
def predict_concordance_index(self, x, t, e, outcomes=None):
    """Evaluate the model batch-wise and return the c-index plus mean losses.

    For each batch the loss tensors are evaluated once, then
    ``self.predicted_time`` is sampled ``self.sample_size`` times and the
    per-subject median is stored as the predicted event time.

    Parameters:
        x, t, e: covariates, times and event indicators; ``x.shape[0]``
            gives the number of subjects.
        outcomes: forwarded unchanged to ``self.batch_feed_dict``.

    Returns:
        A 10-tuple ``(ci, cost, rae, ranking, gen_loss, reg, disc_loss,
        layer_one_recon, t_reg_loss, mse)``. Every entry but the first is the
        summed per-batch value divided by ``input_size / self.batch_size``.
    """
    input_size = x.shape[0]
    i = 0
    # True division: a fractional batch count when the sizes do not divide.
    num_batches = input_size / self.batch_size
    # Builtin int replaces the np.int alias, which NumPy 1.24 removed.
    # NOTE(review): the integer dtype truncates the sampled medians below --
    # confirm that this is intended.
    predicted_time = np.zeros(shape=input_size, dtype=int)
    total_ranking = 0.0
    total_rae = 0.0
    total_cost = 0.0
    total_gen_loss = 0.0
    total_disc_loss = 0.0
    total_layer_one_recon = 0.0
    total_t_reg_loss = 0.0
    total_reg = 0.0
    total_mse = 0.0
    while i < input_size:
        # The ending index for the next batch is denoted j.
        j = min(i + self.batch_size, input_size)
        feed_dict = self.batch_feed_dict(e=e, i=i, j=j, t=t, x=x, outcomes=outcomes)
        cost, ranking, gen_loss, rae, reg, disc_loss, layer_one_recon, t_reg_loss, t_mse = self.session.run(
            [
                self.cost, self.ranking_partial_lik, self.gen_one_loss,
                self.total_rae, self.reg_loss, self.disc_one_loss,
                self.layer_one_recon, self.t_regularization_loss, self.t_mse
            ],
            feed_dict=feed_dict)

        # Draw sample_size predictions and keep the per-subject median.
        temp_pred_time = np.array([
            self.session.run(self.predicted_time, feed_dict=feed_dict)
            for _ in range(self.sample_size)
        ])
        # print("temp_pred_time:{}".format(temp_pred_time.shape))
        predicted_time[i:j] = np.median(temp_pred_time, axis=0)

        total_ranking += ranking
        total_cost += cost
        total_rae += rae
        total_gen_loss += gen_loss
        total_reg += reg
        total_layer_one_recon += layer_one_recon
        total_disc_loss += disc_loss
        total_t_reg_loss += t_reg_loss
        total_mse += t_mse
        i = j

    predicted_event_times = predicted_time.reshape(input_size)
    ci_index = concordance_index(
        event_times=t,
        predicted_scores=np.nan_to_num(predicted_event_times).tolist(),
        event_observed=e)

    def batch_average(total):
        return total / num_batches

    return (ci_index,
            batch_average(total_cost),
            batch_average(total_rae),
            batch_average(total_ranking),
            batch_average(total_gen_loss),
            batch_average(total_reg),
            batch_average(total_disc_loss),
            batch_average(total_layer_one_recon),
            batch_average(total_t_reg_loss),
            batch_average(total_mse))
def train_neural_network(self):
    """Run the adversarial training loop with validation-based checkpointing.

    Each iteration pulls one batch from the input queue, runs
    ``self.disc_updates`` discriminator steps and ``self.gen_updates``
    generator steps, then evaluates the training metrics on that batch. At
    every epoch boundary (and on the last iteration) the validation set is
    scored through ``self.predict_concordance_index`` and the graph is saved
    whenever the validation ``t_reg`` value improves.

    Training stops early when no improvement was seen for more than
    ``self.require_improvement`` epochs, when the training cost is NaN, or
    when ``self.max_epochs`` is reached.

    Returns:
        ``(best_validation_epoch, epochs)``.
    """
    # Both strings are only consumed by the commented-out print/logging
    # calls below.
    train_print = "Training {0} Model:".format(self.model)
    params_print = "Parameters:, l2_reg:{}, learning_rate:{}," \
                   " momentum: beta1={} beta2={}, batch_size:{}, batch_norm:{}," \
                   " hidden_dim:{}, latent_dim:{}, num_of_batches:{}, keep_prob:{}, disc_update:{}" \
        .format(self.l2_reg, self.learning_rate, self.beta1, self.beta2, self.batch_size,
                self.batch_norm, self.hidden_dim, self.latent_dim, self.num_batches, self.keep_prob,
                self.disc_updates)
    # print(train_print)
    # print(params_print)
    # logging.debug(train_print)
    # logging.debug(params_print)
    self.session.run(tf.global_variables_initializer())

    best_ci = 0  # never updated in this method
    best_t_reg = np.inf
    best_validation_epoch = 0
    last_improvement = 0

    start_time = time.time()
    epochs = 0
    # show_all_variables()
    j = 0  # running example counter used to detect epoch boundaries

    for i in range(self.num_iterations):
        # Batch Training
        run_options = tf.RunOptions(timeout_in_ms=4000)
        x_batch, t_batch, e_batch = self.session.run(
            [self.x_batch, self.t_batch, self.e_batch], options=run_options)
        risk_batch = risk_set(data_t=t_batch)
        batch_impute_mask = get_missing_mask(x_batch, self.imputation_values)
        batch_size = len(t_batch)
        idx_observed = e_batch == 1
        # TODO simplify batch processing
        feed_dict_train = {
            self.x: x_batch,
            self.impute_mask: batch_impute_mask,
            self.t: t_batch,
            self.t_lab: t_batch[idx_observed],
            self.e: e_batch,
            self.risk_set: risk_batch,
            self.batch_size_tensor: batch_size,
            self.is_training: True,
            self.noise_alpha: np.ones(shape=self.noise_dim)
        }
        # Discriminator steps first, then generator steps, on the same batch.
        for k in range(self.disc_updates):
            _ = self.session.run([self.disc_solver], feed_dict=feed_dict_train)
        for m in range(self.gen_updates):
            _ = self.session.run([self.gen_solver], feed_dict=feed_dict_train)
        # Training metrics are evaluated after the updates above.
        summary, train_time, train_cost, train_ranking, train_rae, train_reg, train_gen, train_layer_one_recon, \
        train_t_reg, train_t_mse, train_disc = self.session.run(
            [self.merged, self.predicted_time, self.cost, self.ranking_partial_lik, self.total_rae,
             self.reg_loss, self.gen_one_loss, self.layer_one_recon,
             self.t_regularization_loss, self.t_mse, self.disc_one_loss],
            feed_dict=feed_dict_train)
        try:
            train_ci = concordance_index(
                event_times=t_batch,
                predicted_scores=np.nan_to_num(train_time).reshape(t_batch.shape),
                event_observed=e_batch)
        except IndexError:
            # Fall back to 0.0 instead of aborting when the c-index cannot
            # be computed for this batch.
            train_ci = 0.0
            print("C-Index IndexError")

        ##### ibs / ibll #####
        # 100-point grid over the batch's time range; train_ds holds the
        # spacing between consecutive grid points (the first is measured
        # from 0.0).
        train_time_grid = np.linspace(t_batch.min(), t_batch.max(), 100)
        train_ds = np.array(train_time_grid - np.array([0.0] + train_time_grid[:-1].tolist()))
        # print(t_batch)
        # print(e_batch)
        train_bs, train_bll = get_scores(
            y_train=t_batch,
            delta_train=e_batch,
            y_test=t_batch,
            delta_test=e_batch,
            pred_train=train_time.reshape(t_batch.shape),
            pred_test=train_time.reshape(t_batch.shape),
            time_grid=train_time_grid,
            surv_residual=False,
            cens_residual=False)
        # Grid-weighted sums normalised by the width of the time range.
        train_ibs = sum(train_bs * train_ds) / (train_time_grid.max() - train_time_grid.min())
        train_ibll = sum(train_bll * train_ds) / (train_time_grid.max() - train_time_grid.min())
        ######################

        # NOTE(review): the result of this call is discarded and never run
        # in the session, so it presumably performs no check here -- confirm.
        tf.verify_tensor_all_finite(train_cost, "Training Cost has Nan or Infinite")
        if j >= self.num_examples:
            epochs += 1
            is_epoch = True
            # idx = 0
            j = 0
        else:
            # idx = j
            j += self.batch_size
            is_epoch = False

        if i % 100 == 0:
            # Only consumed by the commented-out print/logging calls below.
            train_print = "it:{}, trainCI:{}, train_ranking:{}, train_RAE:{},  train_Gen:{}, train_Disc:{}, " \
                          "train_reg:{}, train_t_reg:{}, train_t_mse:{}, train_layer_one_recon:{}".format(
                i, train_ci, train_ranking, train_rae, train_gen, train_disc, train_reg, train_t_reg,
                train_t_mse, train_layer_one_recon)
            # print(train_print)
            # logging.debug(train_print)

        if is_epoch or (i == (self.num_iterations - 1)):
            improved_str = ''
            # Record training metrics, then compute the validation CI.
            self.train_ci.append(train_ci)
            self.train_cost.append(train_cost)
            self.train_t_rae.append(train_rae)
            self.train_gen.append(train_gen)
            self.train_disc.append(train_disc)
            self.train_ranking.append(train_ranking)
            self.train_layer_one_recon.append(train_layer_one_recon)

            self.train_writer.add_summary(summary, i)
            valid_ci, valid_cost, valid_rae, valid_ranking, \
            valid_gen, valid_reg, valid_disc, valid_layer_one_recon, valid_t_reg, valid_t_mse = self.predict_concordance_index(
                x=self.valid_x,
                e=self.valid_e,
                t=self.valid_t)
            self.valid_cost.append(valid_cost)
            self.valid_ci.append(valid_ci)
            self.valid_t_rae.append(valid_rae)
            self.valid_gen.append(valid_gen)
            self.valid_disc.append(valid_disc)
            self.valid_ranking.append(valid_ranking)
            self.valid_layer_one_recon.append(valid_layer_one_recon)
            # NOTE(review): as above, the returned op is discarded.
            tf.verify_tensor_all_finite(
                valid_cost, "Validation Cost has Nan or Infinite")

            # Model selection is driven by validation t_reg (lower is
            # better), not by the validation concordance index.
            if valid_t_reg < best_t_reg:
                self.saver.save(sess=self.session, save_path=self.save_path)
                best_validation_epoch = epochs
                best_t_reg = valid_t_reg
                last_improvement = epochs
                improved_str = '*'
                # Save the best-performing variables of the TensorFlow graph to file.
            # update best validation accuracy
            # optimization_print = "Iteration: {} epochs:{}, Training: RAE:{}, Loss: {}," \
            #                      " Ranking:{}, Reg:{}, Gen:{}, Disc:{}, Recon_One:{}, T_Reg:{},T_MSE:{},  CI:{}" \
            #                      " Validation RAE:{} Loss:{}, Ranking:{}, Reg:{}, Gen:{}, Disc:{}, " \
            #                      "Recon_One:{}, T_Reg:{}, T_MSE:{}, CI:{}, {}" \
            #     .format(i + 1, epochs, train_rae, train_cost, train_ranking, train_reg, train_gen,
            #             train_disc, train_layer_one_recon, train_t_reg, train_t_mse,
            #             train_ci, valid_rae, valid_cost, valid_ranking, valid_reg, valid_gen, valid_disc,
            #             valid_layer_one_recon, valid_t_reg, valid_t_mse, valid_ci, improved_str)
            optimization_print = "Iteration: {} || TRAIN loss: {}, CI: {}, IBS: {}, IBLL: {} || VAL loss: {}, CI:{}, improved: {}".format(
                i + 1, np.round(train_cost, 4), np.round(train_ci, 4),
                np.round(train_ibs, 4), np.round(train_ibll, 4),
                np.round(valid_cost, 4), np.round(valid_ci, 4), improved_str)
            if (i + 1) % 50 == 0:
                print(optimization_print)
            # logging.debug(optimization_print)
        # Early stopping: too many epochs without improvement, a NaN
        # training cost, or the epoch budget is exhausted.
        if epochs - last_improvement > self.require_improvement or math.isnan(
                train_cost) or epochs >= self.max_epochs:
            # if i - last_improvement > self.require_improvement:
            print(
                "No improvement found in a while, stopping optimization."
            )
            # Break out from the for-loop.
            break
    # Ending time.
    end_time = time.time()
    time_dif = end_time - start_time
    # Only consumed by the commented-out print/logging calls below.
    time_dif_print = "Time usage: " + str(
        timedelta(seconds=int(round(time_dif))))
    # print(time_dif_print)
    # logging.debug(time_dif_print)
    # shutdown everything to avoid zombies
    self.session.run(self.queue.close(cancel_pending_enqueues=True))
    self.coord.request_stop()
    self.coord.join(self.threads)
    return best_validation_epoch, epochs
def main():
    """Build the two-branch survival network, train it and report c-indices.

    The graph has a shared fully connected layer feeding a prognosis branch
    (``output1``) and a treatment-interaction branch (``output2``). Only the
    prognosis Cox loss and the regularisation loss enter the optimised
    total. Every second epoch the concordance index is printed for the
    training, validation and test cohorts, using the negated maximum of
    ``output1`` as the score.

    Interactive ``pdb`` breakpoints left over from debugging were removed so
    the script can run unattended; the validation loop now evaluates each
    subject once instead of four times.
    """
    # NOTE(review): the flattened source does not show how far the device
    # scope extended; only the placeholders are pinned here. The session
    # uses allow_soft_placement, so placement still falls back gracefully.
    with tf.device('/gpu:0'):
        x = tf.placeholder(tf.float32, [None, clinic_num], name = 'input')
        s_time = tf.placeholder(tf.float32, [None,num_event], name = 'surv_time')
        s_event = tf.placeholder(tf.float32, [None,num_event], name = 'surv_event')
        Pat_ind = tf.placeholder(tf.int32, [None,num_event], name = 'Pat_ind')
        keep_prob = tf.placeholder(tf.float32, name = 'keep_rate')
        treatment = tf.placeholder(tf.float32, [None, dim_interact_feature], name = 'treatment')
        global_step = tf.placeholder(tf.int32, [])

    # model
    fc = _create_fc_layer(x, 3*clinic_num, 'relu', 'shared_layer', keep_prob, w_reg = reg_W)
    fc1_2 = _create_fc_layer(fc, 1*clinic_num, 'relu', 'specific_layer1_2', keep_prob, w_reg = reg_W)
    output1 = _create_fc_layer(fc1_2, num_event-1, 'tanh', 'output_1', use_bias= False, w_reg = reg_W_out)

    fc2_1 = tf.multiply(treatment, fc)
    fc2_2 = _create_fc_layer(fc2_1, 1*clinic_num, 'relu', 'specific_layer2_2', keep_prob, w_reg = reg_W)
    output2 = _create_fc_layer(fc2_2, 1, 'tanh', 'output_2', use_bias= False, w_reg = reg_W_out)

    # loss
    loss_cox_prog, loss_rank_prog = Get_loss(output1, s_time, s_event, Pat_ind)
    pred_DFS = tf.reduce_max(output1,axis=1)
    # Evaluated for logging only; not part of loss_total.
    loss_cox_pred = DeepSurv_loss(s_time[:,3], s_event[:,3], Pat_ind[:,3], output2)
    loss_reg = tf.losses.get_regularization_loss()
    loss_total = intra_loss_weight[0]*loss_cox_prog + reg_factor*loss_reg

    learning_rate = exponential_decay_with_warmup(warmup_step,learning_rate_base,global_step,learning_rate_step,learning_rate_decay,staircase=True)
    optimizer = tf.train.MomentumOptimizer(learning_rate = learning_rate, momentum = momentum, use_nesterov = True)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_step = optimizer.minimize(loss_total)

    restore_var = list(tf.trainable_variables())
    print(restore_var)
    saver = tf.train.Saver(max_to_keep = 10)

    # Start Tensorflow session; let the process use up to 90% of GPU memory.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.9)
    config = tf.ConfigProto(gpu_options = gpu_options, allow_soft_placement = True)
    with tf.Session(config = config) as sess:
        # Initialize all variables
        sess.run(tf.global_variables_initializer())
        gsp = 0  # global step fed to the learning-rate schedule

        # Loop over number of epochs
        for epoch in range(num_epochs):
            np.random.shuffle(ind_0)
            np.random.shuffle(ind_1)

            train_risk = 0.0
            prog_risk = 0.0
            pred_risk = 0.0
            reg_risk = 0.0
            for i in range(num_batchs):
                gsp += 1
                # Each batch takes r0 indices from ind_0 and r1 from ind_1.
                ind0 = ind_0[i*r0:(i+1)*r0]
                ind1 = ind_1[i*r1:(i+1)*r1]
                treat, input_x1, input_time, input_event, input_idx = GetData(ind0,ind1)
                _, reg_ls, prog_ls, pred_ls, total_ls, now_lr = sess.run(
                    [train_step, loss_reg, loss_cox_prog, loss_cox_pred, loss_total, learning_rate],
                    feed_dict = {global_step:gsp, treatment: treat, x: input_x1, s_time: input_time, s_event: input_event, Pat_ind: input_idx, keep_prob: keep_prob_rate})
                reg_risk += reg_ls
                train_risk += total_ls
                prog_risk += prog_ls
                pred_risk += pred_ls

            reg_risk /= num_batchs
            train_risk /= num_batchs
            prog_risk /= num_batchs
            pred_risk /= num_batchs
            line = 'epoch: %d, learning rate: %.5f, tatol_loss: %.4f, reg_loss: %.4f, prognosis-cox loss: %.4f, predict-cox loss: %.4f' % (epoch + 1, now_lr, train_risk, reg_risk, prog_risk, pred_risk)
            print(line)
            with open(log_path, 'a') as f:
                f.write(line + '\n')

            if (epoch+1)%2 == 0:
                # Subjects are scored one at a time with dropout disabled.
                tra_pred = []
                for i in range(len(tra_treat)):
                    xd = tra_treat[i]
                    treat = np.array([xd]*dim_interact_feature)
                    opt1, Pat_pred = sess.run([output1,pred_DFS], feed_dict = {x: clinic_factors[i,:].reshape(1,clinic_num), keep_prob: 1.0, treatment: treat.reshape(1,dim_interact_feature)})
                    if np.max(Pat_pred)>1.0:
                        # Was an interactive breakpoint; report and carry on.
                        print('warning: pred_DFS above 1.0 for training sample %d' % i)
                    tra_pred.append(-Pat_pred)
                tra_pred = np.array(tra_pred, np.float32)
                tra_ci_value = concordance_index(tra_FFS_time, tra_pred, tra_FFS_event)
                line = 'train cohort, CI: %.4f, epoch: %d' % (tra_ci_value, epoch)
                print(line)

                val_pred = []
                for i in range(len(val_treat)):
                    xd = val_treat[i]
                    treat = np.array([xd]*dim_interact_feature)
                    opt1, Pat_pred = sess.run([output1,pred_DFS], feed_dict = {x: clinic_factors_val[i,:].reshape(1,clinic_num), keep_prob: 1.0, treatment: treat.reshape(1,dim_interact_feature)})
                    print(opt1)
                    print(Pat_pred)
                    val_pred.append(-Pat_pred)
                val_pred = np.array(val_pred, np.float32)
                val_ci_value = concordance_index(val_FFS_time, val_pred, val_FFS_event)
                line = 'validation cohort, CI: %.4f, epoch: %d' % (val_ci_value, epoch)
                print(line)

                test_pred = []
                for i in range(len(test_treat)):
                    xd = test_treat[i]
                    treat = np.array([xd]*dim_interact_feature)
                    opt, Pat_pred = sess.run([output1,pred_DFS], feed_dict = {x: clinic_factors_test[i,:].reshape(1,clinic_num), keep_prob: 1.0, treatment: treat.reshape(1,dim_interact_feature)})
                    test_pred.append(-Pat_pred[0])
                test_pred = np.array(test_pred, np.float32)
                test_ci_value = concordance_index(test_FFS_time, test_pred, test_FFS_event)
                line = 'test cohort, CI: %.4f, epoch: %d' % (test_ci_value, epoch)
                print(line)
def cph_ci(x, t, e, **kwargs):
    """Concordance index of ``model``'s partial hazards on ``(x, t, e)``.

    ``model`` is looked up in the enclosing scope when the function is
    called. The partial hazard is negated before scoring. Extra keyword
    arguments are accepted and ignored.
    """
    # Positional call: the second parameter of lifelines'
    # concordance_index is named ``predicted_scores`` in current releases,
    # so the old ``predicted_event_times=`` keyword raises TypeError there.
    return concordance_index(t, -model.predict_partial_hazard(x), e)