Beispiel #1
0
def prepare_data(clinical_data,
                 display_cmat=True,
                 VIF=10,
                 skip_variable_reduction=False):
    """Extract survival targets and features, then prune the feature set.

    A VIF threshold of 10 removes every feature whose R^2 against the
    remaining features exceeds 0.9.

    Returns the reduced feature table, the time column, and the event column.
    """
    time, event, fdata = extract_time_event_and_features(clinical_data)

    # Optionally collapse multicollinear features via variance inflation.
    reduced = fdata if skip_variable_reduction else calculate_vif_(fdata, VIF)
    reduced = remove_nan_correlations(reduced, time)

    if display_cmat:
        from pysurvival.utils.display import correlation_matrix
        correlation_matrix(reduced, figure_size=(30, 15), text_fontsize=10)

    return reduced, time, event
Beispiel #2
0
# Fit and report a Cox proportional-hazards model for each candidate
# feature set.  After the loop, `cph` and `data` remain bound to the
# last (haemoglobin) fit, exactly as in the sequential version.
for cols in (albumin_change, haemoglobin_change):
    cph = CoxPHFitter()
    data = train[cols]
    cph.fit(data, 'PFS', event_col='disease_progress')
    cph.print_summary()

# In[ ]:

# In[34]:

# Plot pairwise correlations of the modelling features
# (pysurvival display helper; `df` and `features` are presumably
# defined in earlier notebook cells — not visible here).
correlation_matrix(df[features], figure_size=(30, 15), text_fontsize=10)

# In[ ]:

# In[35]:


def compute_scores(model, table, timepoints, variables):
    # NOTE(review): this block is truncated in the paste — the
    # concordance_index(...) call on the last line is cut off mid-argument
    # list, and the loop/function closing lines (and return) are missing.
    """Compute a time-truncated concordance index at each horizon.

    For each horizon i in `timepoints`, rows whose follow-up exceeds i are
    administratively censored at i before scoring: the event flag is zeroed
    and the follow-up time is clipped to i.
    """
    c_indexes = []
    for i in timepoints:
        # Work on temporary columns so the original columns stay intact.
        table.loc[:, 'disease_progress_temp'] = table['disease_progress']
        table.loc[:, 'PFS_temp'] = table['PFS']
        # Censor everything that happens after horizon i.
        table.loc[table.PFS > i, 'disease_progress_temp'] = 0
        table.loc[table.PFS > i, 'PFS_temp'] = i
        c_indexes.append(
            concordance_index(model, table[variables], table['PFS_temp'],
Beispiel #3
0

# Drop exact duplicate rows (keeping the first occurrence) and report
# how many were removed.
N_dupli = int(df.duplicated(keep='first').sum())
df = df.drop_duplicates(keep='first').reset_index(drop=True)
print("The raw_dataset contains {} duplicates".format(N_dupli))

# Sample count after deduplication.
N = df.shape[0]


# Inspect the column names (notebook-style bare expression; the value
# is only displayed in an interactive session).
df.columns


# Visualise pairwise feature correlations (pysurvival display helper).
from pysurvival.utils.display import correlation_matrix
correlation_matrix(df[features], figure_size=(10,10), text_fontsize=8)


# Drop hand-picked columns.  np.setdiff1d returns the sorted unique
# set difference, so `features` also ends up sorted and de-duplicated.
to_remove = ['totalJogos', 'idaEstadio']
features = np.setdiff1d(features, to_remove).tolist()


# Building training and testing sets
from sklearn.model_selection import train_test_split

# Random 60/40 split of the row indices.
index_train, index_test = train_test_split(range(N), test_size=0.4)
data_train = df.loc[index_train].reset_index(drop=True)
data_test = df.loc[index_test].reset_index(drop=True)

# Creating the X, T and E inputs.
# BUG FIX: the train-side inputs previously came from the full `df`
# instead of `data_train`, so X_train/T_train contained every row
# (including the test rows) and `data_train` was never used.
X_train, X_test = data_train[features], data_test[features]
T_train, T_test = data_train[time_column], data_test[time_column]