# graphSurvival.py
from bravado.client import SwaggerClient
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
from collections import Counter
from lifelines.plotting import plot_lifetimes
from lifelines import KaplanMeierFitter
from sklearn.naive_bayes import BernoulliNB
# Shared cBioPortal API client used by every function below. It is built from
# the Swagger description served at the URL, with bravado's request and
# response validation both switched off.
cbioportal = SwaggerClient.from_url(
    'https://www.cbioportal.org/api/api-docs',
    config=dict(validate_requests=False, validate_responses=False),
)
def statisticalSignificance(survival_status, has_mutation):
    """Cross-tabulate survival against mutation status and run Fisher's exact test.

    Prints the number of mutation flags, the four cell counts of the 2x2
    contingency table, and the resulting p-value.

    Args:
        survival_status: One entry per patient. An entry equal to 0 (or
            False) is counted as alive; anything else is counted as deceased.
        has_mutation: One entry per patient, in the same order. An entry
            equal to 1 (or True) is counted as mutated; anything else is
            counted as not mutated.

    Returns:
        The ``(oddsratio, pvalue)`` pair produced by
        ``scipy.stats.fisher_exact`` for the table
        ``[[deceasedYes, aliveYes], [deceasedNo, aliveNo]]``.

    Raises:
        ValueError: If the two inputs do not have the same length.
    """
    print(len(has_mutation))

    status = np.asarray(survival_status)
    mutation = np.asarray(has_mutation)
    if len(status) != len(mutation):
        # A mismatch means the flags cannot be paired patient-by-patient, so
        # any table built from them would be meaningless.
        raise ValueError(
            "survival_status and has_mutation must have the same length "
            "({} != {})".format(len(status), len(mutation))
        )

    mutated = mutation == 1
    alive = status == 0

    aliveNo = int(np.count_nonzero(~mutated & alive))  # alive, no mutation
    aliveYes = int(np.count_nonzero(mutated & alive))  # alive, has mutation
    deceasedNo = int(np.count_nonzero(~mutated & ~alive))  # died, no mutation
    deceasedYes = int(np.count_nonzero(mutated & ~alive))  # died, has mutation

    print("aliveNo {} ".format(aliveNo))
    print("aliveYes {} ".format(aliveYes))
    print("deceasedNo {} ".format(deceasedNo))
    print("deceasedYes {} ".format(deceasedYes))

    oddsratio, pvalue = stats.fisher_exact([[deceasedYes, aliveYes], [deceasedNo, aliveNo]])
    print("pvalue {} ".format(pvalue))
    return oddsratio, pvalue
def graph(months, survival_status, has_mutation, name):
    """Plot Kaplan-Meier survival curves for mutated vs. non-mutated patients.

    Both curves are drawn on the same axes with confidence intervals shown,
    the plot is titled ``'<name> survival plot'``, and the figure is displayed
    with ``plt.show()``.

    Args:
        months: Per-patient durations, stored in the ``OS_MONTHS`` column.
        survival_status: Per-patient event flags, stored in the ``OS_STATUS``
            column (0 if living, 1 if dead).
        has_mutation: Per-patient mutation flags (0/False if the patient does
            not have the mutation, 1/True if they do), in the same order as
            ``months``.
        name: Label used in the plot title.
    """
    survival_data = pd.DataFrame({
        'OS_MONTHS': months,
        'OS_STATUS': survival_status,  # 0 if living, 1 if dead
    })

    # Coerce to a boolean mask so that 0/1 integers or a plain list select
    # rows by position. Without this, integer flags would be looked up as
    # index labels and `~` would produce -1/-2 instead of the complement.
    mutated = np.asarray(has_mutation, dtype=bool)

    # One fitter object is reused: fit and draw each group in turn.
    kmf = KaplanMeierFitter()

    kmf.fit(survival_data.OS_MONTHS[mutated], survival_data.OS_STATUS[mutated], label="have mutation")
    axes = kmf.plot(ci_show=True)

    kmf.fit(survival_data.OS_MONTHS[~mutated], survival_data.OS_STATUS[~mutated], label="no mutation")
    kmf.plot(ax=axes, ci_show=True)

    plt.title('{} survival plot'.format(name))
    # Display the finished figure.
    plt.show()
def getSurvivalData(patientIds, mutatedIds):
    """Fetch overall-survival data for each patient from cBioPortal.

    For every ID in ``patientIds`` the first ``OS_MONTHS`` record and the
    first ``OS_STATUS`` record of study ``brca_tcga_pan_can_atlas_2018`` are
    requested (all months first, then all statuses).

    Args:
        patientIds: Patient IDs to query.
        mutatedIds: Patient IDs that carry the mutation.

    Returns:
        A tuple ``(months, survival_status, overall_mutations)``:
        ``months`` is a list of floats, ``survival_status`` is a boolean
        array that is True where the status equals ``'1:DECEASED'``, and
        ``overall_mutations`` is a boolean array marking which entries of
        ``patientIds`` appear in ``mutatedIds``.
    """
    def first_record(attribute, patient):
        # One API round trip; only the first returned record is used.
        return cbioportal.Clinical_Data.getAllClinicalDataOfPatientInStudyUsingGET(
            attributeId=attribute,
            patientId=patient,
            studyId='brca_tcga_pan_can_atlas_2018',
        ).result()[0]

    mutation_flags = np.isin(patientIds, mutatedIds)

    month_records = [first_record('OS_MONTHS', pid) for pid in patientIds]
    month_values = [float(record.value) for record in month_records]

    status_values = [first_record('OS_STATUS', pid)['value'] for pid in patientIds]
    deceased_flags = np.array(status_values) == '1:DECEASED'

    return month_values, deceased_flags, mutation_flags
def anomolies(patientIds, mutatedIds, excluded=('TCGA-BH-A0B2', 'TCGA-OL-A66H')):
    """Drop known-anomalous patients and de-duplicate the mutated IDs.

    Args:
        patientIds: Array-like of all patient IDs in the cohort.
        mutatedIds: Array-like of patient IDs taken from mutation records;
            may contain repeats.
        excluded: Patient IDs to remove from both inputs. Defaults to the two
            patients noted below as lacking OS_MONTHS.

    Returns:
        A tuple ``(patientIds, mutatedIds)`` of NumPy arrays with the excluded
        IDs removed. ``mutatedIds`` is additionally reduced to its sorted
        unique values.
    """
    # TCGA-OL-A66H does not have the attribute OS_MONTHS.
    # TCGA-BH-A0B2 does not have OS_MONTHS; it only has AGE,
    # AJCC_PATHOLOGIC_TUMOR_STAGE, AJCC_STAGING_EDITION, CANCER_TYPE_ACRONYM,
    # CENTER, DAYS_LAST_FOLLOWUP, DAYS_TO_BIRTH,
    # DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS, ETHNICITY, FORM_COMPLETION_DATE,
    # HISTORY_NEOADJUVANT_TRTYN, ICD_10, ICD_O_3_HISTOLOGY, ICD_O_3_SITE,
    # INFORMED_CONSENT_VERIFIED, IN_PANCANPATHWAYS_FREEZE, OTHER_PATIENT_ID,
    # PATH_M_STAGE, PATH_N_STAGE, PATH_T_STAGE, PERSON_NEOPLASM_CANCER_STATUS,
    # PRIMARY_LYMPH_NODE_PRESENTATION_ASSESSMENT, PRIOR_DX, RACE,
    # SAMPLE_COUNT, SEX.
    excluded_ids = list(excluded)

    # Coerce so plain lists work with the boolean-mask indexing below.
    patientIds = np.asarray(patientIds)
    mutatedIds = np.asarray(mutatedIds)

    patientIds = patientIds[~np.isin(patientIds, excluded_ids)]
    mutatedIds = mutatedIds[~np.isin(mutatedIds, excluded_ids)]

    # The same patient ID often has multiple mutations of the gene, so
    # np.unique keeps each patient from appearing more than once.
    mutatedIds = np.unique(mutatedIds)
    return (patientIds, mutatedIds)
def genes(name):
    """Look up a gene via the cBioPortal API and return its Entrez Gene ID.

    The ID is also printed alongside the queried name.

    Args:
        name: Gene identifier passed as ``geneId`` to the API, e.g. one of
            the symbols listed below.

    Returns:
        The ``entrezGeneId`` of the returned gene record.
    """
    # Known IDs for reference:
    # TP53 = 7157, EP300 = 2033, PIK3CA = 5290, CDH1 = 999, GATA3 = 2625,
    # MAP3K1 = 4214
    gene_record = cbioportal.Genes.getGeneUsingGET(geneId=name).result()
    entrez_id = gene_record.entrezGeneId
    print("The Entrez Gene ID for gene {} is {} ".format(name, entrez_id))
    return entrez_id
def main():
    """Run the survival analysis for one gene in the TCGA breast cancer study.

    Resolves the gene's Entrez ID, downloads the study's patients and the
    gene's mutation records, removes anomalous patients, fetches survival
    data, then plots the survival curves and prints the Fisher's exact test
    summary.
    """
    gene_symbol = 'GATA3'
    entrez_id = genes(gene_symbol)

    # Extended documentation: https://www.cbioportal.org/api/swagger-ui.html
    study_id = 'brca_tcga_pan_can_atlas_2018'

    # Patients in the cohort of interest (TCGA pan cancer project).
    cohort = cbioportal.Patients.getAllPatientsInStudyUsingGET(studyId=study_id).result()
    cohort_ids = np.array([entry.patientId for entry in cohort])
    print("The brca_tcga_pan_can_atlas_2018 study spans {} patients".format(len(cohort)))

    # Mutation records of the chosen gene across the cohort's samples.
    mutation_records = cbioportal.Mutations.getMutationsInMolecularProfileBySampleListIdUsingGET(
        entrezGeneId=entrez_id,
        molecularProfileId='brca_tcga_pan_can_atlas_2018_mutations',
        sampleListId='brca_tcga_pan_can_atlas_2018_all',
    ).result()
    mutated_patient_ids = np.array([record.patientId for record in mutation_records])
    # This is the total number of mutation records, which need not equal the
    # number of mutated patients: one person can carry several mutations of
    # the same gene.
    print("The number of mutations of the {} gene is {} ".format(gene_symbol, len(mutation_records)))

    usable_ids, usable_mutated = anomolies(cohort_ids, mutated_patient_ids)
    print("Patients used to graph {} ".format(len(usable_ids)))
    print("total mutated {} ".format(len(usable_mutated)))

    months, survival_status, overall_mutations = getSurvivalData(usable_ids, usable_mutated)
    graph(months, survival_status, overall_mutations, gene_symbol)
    statisticalSignificance(survival_status, overall_mutations)


if __name__ == '__main__':
    main()