def prepare_data(clinical_data, display_cmat=True, VIF=10, skip_variable_reduction=False):
    """Split clinical data into features, time and event, then prune the features.

    Args:
        clinical_data: Raw input, handed unchanged to
            ``extract_time_event_and_features``.
        display_cmat: When truthy, plot the correlation matrix of the
            retained features.
        VIF: Threshold forwarded to ``calculate_vif_``. Per the original
            author's note, a VIF of 10 means that all features with an
            R^2 above 0.9 will be removed (VIF = 1 / (1 - R^2)).
        skip_variable_reduction: When truthy, bypass the VIF-based reduction.

    Returns:
        Tuple ``(features, time, event)`` where ``features`` is the output of
        ``remove_nan_correlations``.
    """
    time, event, features = extract_time_event_and_features(clinical_data)

    # The VIF-based pruning is optional; the NaN-correlation cleanup is
    # applied to whichever feature set comes out of this step.
    reduced = features if skip_variable_reduction else calculate_vif_(features, VIF)
    cleaned = remove_nan_correlations(reduced, time)

    if display_cmat:
        # Imported here so pysurvival is only required when plotting.
        from pysurvival.utils.display import correlation_matrix

        correlation_matrix(cleaned, figure_size=(30, 15), text_fontsize=10)

    return cleaned, time, event
cph = CoxPHFitter() data = train[albumin_change] cph.fit(data, 'PFS', event_col='disease_progress') cph.print_summary() cph = CoxPHFitter() data = train[haemoglobin_change] cph.fit(data, 'PFS', event_col='disease_progress') cph.print_summary() # In[ ]: # In[34]: correlation_matrix(df[features], figure_size=(30, 15), text_fontsize=10) # In[ ]: # In[35]: def compute_scores(model, table, timepoints, variables): c_indexes = [] for i in timepoints: table.loc[:, 'disease_progress_temp'] = table['disease_progress'] table.loc[:, 'PFS_temp'] = table['PFS'] table.loc[table.PFS > i, 'disease_progress_temp'] = 0 table.loc[table.PFS > i, 'PFS_temp'] = i c_indexes.append( concordance_index(model, table[variables], table['PFS_temp'],
# Remove duplicated rows (keeping the first occurrence) and report how many
# were found in the raw dataset.
N_dupli = df.duplicated(keep='first').sum()
df = df.drop_duplicates(keep='first').reset_index(drop=True)
print("The raw_dataset contains {} duplicates".format(N_dupli))

# Number of samples in the de-duplicated dataset
N = df.shape[0]

# Visual check of the pairwise feature correlations before pruning.
from pysurvival.utils.display import correlation_matrix
correlation_matrix(df[features], figure_size=(10, 10), text_fontsize=8)

# Drop the selected features from the feature list.
# NOTE: np.setdiff1d returns the remaining names sorted and de-duplicated.
to_remove = ['totalJogos', 'idaEstadio']
features = np.setdiff1d(features, to_remove).tolist()

# Building training and testing sets (60% / 40% split on row positions;
# the index was reset above, so positions and labels coincide).
from sklearn.model_selection import train_test_split
index_train, index_test = train_test_split(range(N), test_size=0.4)
data_train = df.loc[index_train].reset_index(drop=True)
data_test = df.loc[index_test].reset_index(drop=True)

# Creating the X, T and E inputs.
# The training inputs must come from data_train: taking them from the full
# `df` would put the held-out test rows into the training set.
X_train, X_test = data_train[features], data_test[features]
T_train, T_test = data_train[time_column], data_test[time_column]