def main():
    """Export proteomics and phosphoproteomics tables for one CPTAC cohort.

    Command-line options:
        --cancerType  Cohort to export. Matched case-insensitively against
                      brca, ccrcc, coad, ovca, luad and endometrial.
        --getData     Download every supported dataset before loading.

    The phosphoproteomics table is written to ``phos_file.tsv`` and the
    proteomics table to ``prot_file.tsv`` (both tab-separated) in the
    current working directory.

    Running with only ``--getData`` downloads the datasets and returns.
    Running with neither option is reported as a usage error (exit status
    2). An unrecognised cancer type ends the program without writing files.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--cancerType', dest='type',
                        help='Cancer type to be collected')
    parser.add_argument('--getData', dest='get', action='store_true',
                        default=False, help='Set flag to get all data')
    opts = parser.parse_args()

    if opts.get:
        for ds in ['brca', 'ccrcc', 'colon', 'ovarian', 'endometrial', 'luad']:
            cptac.download(dataset=ds)

    # --cancerType is optional for argparse, so it may be None here; calling
    # .lower() on it would fail with an AttributeError.
    if opts.type is None:
        if opts.get:
            # Download-only invocation: nothing to export.
            return
        parser.error('--cancerType is required unless --getData is given')

    # Command-line label -> name of the cptac dataset class. Classes are
    # looked up by name so only the requested one is touched.
    loader_names = {
        'brca': 'Brca',
        'ccrcc': 'Ccrcc',
        'coad': 'Colon',
        'ovca': 'Ovarian',
        'luad': 'Luad',
        'endometrial': 'Endometrial',
    }
    loader_name = loader_names.get(opts.type.lower())
    if loader_name is None:
        exit()
    dat = getattr(cptac, loader_name)()

    df = dat.get_phosphoproteomics()
    pdf = dat.get_proteomics()
    # Disabled: would join multi-part column labels into single strings.
    # df.columns = [' '.join(col).strip() for col in df.columns.values]
    df.to_csv(path_or_buf="phos_file.tsv", sep='\t')
    pdf.to_csv(path_or_buf='prot_file.tsv', sep='\t')
def compute_regression(input_cancer_type):
    """Run the per-gene regression for one CPTAC cohort and save it as CSV.

    The cohort is loaded through cptac, converted with
    ``dc.get_prot_trans_df``, grouped by the ``Gene`` column, and
    ``regression`` is applied to each group. Rows containing missing values
    are dropped, then the FDR-adjusted values from ``ssm.fdrcorrection`` are
    added for the interaction, condition and intercept p-value columns.
    The result is written to ``<input_cancer_type>_regressions.csv`` in the
    current working directory, without the index.

    Args:
        input_cancer_type: One of "CCRCC", "Endometrial", "LUAD", "HNSCC",
            "LSCC" or "PDAC" (case-sensitive).

    Raises:
        ValueError: If ``input_cancer_type`` is not one of the supported
            labels. Raised before any dataset is loaded.
    """
    # Label -> name of the cptac dataset class, resolved lazily so that an
    # invalid label is rejected before cptac is touched.
    loader_names = {
        "CCRCC": "Ccrcc",
        "Endometrial": "Endometrial",
        "LUAD": "Luad",
        "HNSCC": "Hnscc",
        "LSCC": "Lscc",
        "PDAC": "Pdac",
    }
    try:
        loader_name = loader_names[input_cancer_type]
    except (KeyError, TypeError):
        raise ValueError(
            "Unsupported cancer type {!r}; expected one of: {}".format(
                input_cancer_type, ", ".join(loader_names))) from None
    cancer = getattr(cptac, loader_name)()

    df = dc.get_prot_trans_df(cancer)
    results = df.groupby('Gene').apply(regression)

    # Expand the per-gene results into columns, keeping the gene labels.
    # NOTE(review): assumes each per-gene result is dict-like - confirm
    # against `regression`.
    reg_df = pd.DataFrame(list(results))
    reg_df.index = results.index
    reg_df.reset_index(inplace=True)
    reg_df = reg_df.dropna()

    # fdrcorrection returns (rejected, corrected p-values); keep the latter.
    for term in ('interaction', 'condition', 'intercept'):
        reg_df[term + '_FDR'] = ssm.fdrcorrection(reg_df[term + '_pval'])[1]

    reg_df['Cancer'] = input_cancer_type
    file_name = input_cancer_type + '_regressions.csv'
    reg_df.to_csv(file_name, index=False)
def load_cancers(include_pdac=False):
    """Instantiate the cptac cohorts and return them with their labels.

    Args:
        include_pdac: When true, the PDAC cohort is loaded and appended
            after the five default cohorts.

    Returns:
        A ``(cancers, cancer_names)`` pair of parallel lists: the dataset
        objects and their display labels, in the same order.
    """
    # (display label, cptac class name), in load order.
    cohort_table = [
        ('CCRCC', 'Ccrcc'),
        ('Endometrial', 'Endometrial'),
        ('LUAD', 'Luad'),
        ('HNSCC', 'Hnscc'),
        ('LSCC', 'Lscc'),
    ]

    cancers = []
    cancer_names = []
    for label, class_name in cohort_table:
        cancers.append(getattr(cptac, class_name)())
        cancer_names.append(label)

    if include_pdac:
        cancers.append(getattr(cptac, 'Pdac')())
        cancer_names.append('PDAC')

    return cancers, cancer_names
def getDataForCancer(ctype):
    """Return the cptac dataset object for a cancer type label.

    ``ctype`` is matched case-insensitively against brca, ccrcc, coad,
    ovca, luad and endometrial. Any other label ends the program through
    ``exit()``.
    """
    # Lower-cased label -> name of the cptac dataset class.
    class_by_label = {
        'brca': 'Brca',
        'ccrcc': 'Ccrcc',
        'coad': 'Colon',
        'ovca': 'Ovarian',
        'luad': 'Luad',
        'endometrial': 'Endometrial',
    }
    label = ctype.lower()
    if label not in class_by_label:
        exit()
    # Only the requested class is looked up on cptac.
    dataset_class = getattr(cptac, class_by_label[label])
    return dataset_class()
def __init__(self):
    """Download the latest endometrial, ovarian, ccrcc and colon data and load them.

    The loaded dataset objects are stored on ``self.en``, ``self.ovarian``,
    ``self.ccrcc`` and ``self.colon``, and collected in that order in
    ``self.datasets``.
    """
    # brca, gbm, hnscc and luad are currently disabled: they are neither
    # downloaded nor loaded here.
    for dataset_name in ("endometrial", "ovarian", "ccrcc", "colon"):
        cptac.download(dataset=dataset_name, version='latest')

    self.en = cptac.Endometrial()
    self.ovarian = cptac.Ovarian()
    self.ccrcc = cptac.Ccrcc()
    self.colon = cptac.Colon()

    self.datasets = [self.en, self.ovarian, self.ccrcc, self.colon]
def cptacData():
    '''
    Download the CPTAC datasets that need no login and load each of them.

    Returns a dictionary mapping the dataset name ('brca', 'ccrcc', 'colon',
    'ovarian', 'endometrial', 'luad') to its loaded cptac dataset object.
    '''
    print("Loading cptac datasets")

    # Every dataset must be downloaded before it can be loaded. Only the
    # cancers available without login information are listed; hnscc, gbm
    # and lscc are left out.
    download_order = ('brca', 'ccrcc', 'colon', 'ovarian', 'luad',
                      'endometrial')
    print("Downloading cptac data")
    for dataset_name in download_order:
        cptac.download(dataset=dataset_name)

    # (dictionary key, cptac class name), in the order they are loaded.
    load_order = (
        ('brca', 'Brca'),
        ('ccrcc', 'Ccrcc'),
        ('colon', 'Colon'),
        ('ovarian', 'Ovarian'),
        ('endometrial', 'Endometrial'),
        ('luad', 'Luad'),
    )
    fdict = {}
    for key, class_name in load_order:
        fdict[key] = getattr(cptac, class_name)()
    return fdict
def test_get_frequently_mutated_en_default_cutoff():
    """Check get_frequently_mutated on the endometrial data, default cutoff.

    The returned frame is checked against the expected dimensions, headers
    and a set of (row, column) -> value spot checks. Every group of spot
    checks is run, and the test is reported as passed only if all of them
    pass, so an early failure cannot be hidden by a later success.
    """
    en = cptac.Endometrial()

    print('Running get_frequently_mutated...')
    df = ut.get_frequently_mutated(en)

    dimensions = (232, 4)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut']
    # Expected values are written as counts over this denominator.
    total_tumors = 95

    # Gene names (column 0).
    test_coord_names = ((53, 0), (32, 0), (227, 0))
    test_vals_names = ('CTCF', 'CCDC168', 'ZNF536')

    # Missense and truncation do not add up to the unique-samples value
    # (missense and truncation in the same sample).
    test_coord_CTCF = ((53, 1), (53, 2), (53, 3))
    test_vals_CTCF = (27 / total_tumors, 9 / total_tumors, 23 / total_tumors)

    # Missense and truncation values are equal.
    test_coord_CCDC168 = ((32, 1), (32, 2), (32, 3))
    test_vals_CCDC168 = (16 / total_tumors, 11 / total_tumors,
                         11 / total_tumors)

    # No truncation-type mutations.
    test_coord_ZNF536 = ((227, 1), (227, 2), (227, 3))
    test_vals_ZNF536 = (12 / total_tumors, 12 / total_tumors,
                        0 / total_tumors)

    # Close to the cutoff.
    test_coord_DICER1 = ((61, 1), (61, 2), (61, 3))
    test_vals_DICER1 = (10 / total_tumors, 10 / total_tumors,
                        1 / total_tumors)

    # Common test.
    test_coord_TP53 = ((205, 1), (205, 2), (205, 3))
    test_vals_TP53 = (21 / total_tumors, 15 / total_tumors, 7 / total_tumors)

    test_coord_vals = [(test_coord_names, test_vals_names),
                       (test_coord_CTCF, test_vals_CTCF),
                       (test_coord_CCDC168, test_vals_CCDC168),
                       (test_coord_ZNF536, test_vals_ZNF536),
                       (test_coord_DICER1, test_vals_DICER1),
                       (test_coord_TP53, test_vals_TP53)]

    # Run every check (no short-circuit) so each one can report its own
    # diagnostics, then combine the outcomes.
    outcomes = [check_getter(df, dimensions, headers, coord, vals)
                for coord, vals in test_coord_vals]
    PASS = all(outcomes)
    print_test_result(PASS)
def test_get_frequently_mutated_en_cutoff_20_cutoff():
    """Check get_frequently_mutated on the endometrial data with cutoff=0.2.

    The returned frame is checked against the expected dimensions, headers
    and a set of (row, column) -> value spot checks. Every group of spot
    checks is run, and the test is reported as passed only if all of them
    pass, so an early failure cannot be hidden by a later success.
    """
    en = cptac.Endometrial()

    print('Running get_frequently_mutated...')
    df = ut.get_frequently_mutated(en, cutoff=0.2)

    dimensions = (10, 4)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut']
    # Expected values are written as counts over this denominator.
    total_tumors = 95

    # Gene names (column 0).
    test_coord_names = ((0, 0), (2, 0), (8, 0))
    test_vals_names = ('ARID1A', 'CTNNB1', 'TP53')

    # Missense and truncation do not add up to the unique-samples value
    # (missense and truncation in the same sample, counted in each category).
    test_coord_ARID1A = ((0, 1), (0, 2), (0, 3))
    test_vals_ARID1A = (43 / total_tumors, 13 / total_tumors,
                        38 / total_tumors)

    # No truncation-type mutations.
    test_coord_CTNNB1 = ((2, 1), (2, 2), (2, 3))
    test_vals_CTNNB1 = (29 / total_tumors, 29 / total_tumors,
                        0 / total_tumors)

    # Close to the cutoff.
    test_coord_ZFHX3 = ((9, 1), (9, 2), (9, 3))
    test_vals_ZFHX3 = (21 / total_tumors, 8 / total_tumors, 16 / total_tumors)

    # Missense and truncation almost equal.
    test_coord_KMT2B = ((3, 1), (3, 2), (3, 3))
    test_vals_KMT2B = (23 / total_tumors, 11 / total_tumors,
                       12 / total_tumors)

    # Common test.
    test_coord_TP53 = ((8, 1), (8, 2), (8, 3))
    test_vals_TP53 = (21 / total_tumors, 15 / total_tumors, 7 / total_tumors)

    test_coord_vals = [(test_coord_names, test_vals_names),
                       (test_coord_ARID1A, test_vals_ARID1A),
                       (test_coord_CTNNB1, test_vals_CTNNB1),
                       (test_coord_ZFHX3, test_vals_ZFHX3),
                       (test_coord_KMT2B, test_vals_KMT2B),
                       (test_coord_TP53, test_vals_TP53)]

    # Run every check (no short-circuit) so each one can report its own
    # diagnostics, then combine the outcomes.
    outcomes = [check_getter(df, dimensions, headers, coord, vals)
                for coord, vals in test_coord_vals]
    PASS = all(outcomes)
    print_test_result(PASS)
os.path.realpath('Make_Cancer_Delta_Corr_and_P_Value_Dataframe')) parentdir = os.path.dirname(currentdir) parentdir = os.path.dirname(parentdir) sys.path.append(parentdir) import Delta_Correlation as dc input_cancer_type = sys.argv[1] mutated_gene = sys.argv[2] input_permutation_number = int(sys.argv[3]) cutoff = 15 if input_cancer_type == "CCRCC": cancer = cptac.Ccrcc() elif input_cancer_type == "Endometrial": cancer = cptac.Endometrial() cutoff = 10 elif input_cancer_type == "LUAD": cancer = cptac.Luad() elif input_cancer_type == "HNSCC": cancer = cptac.Hnscc() elif input_cancer_type == "LSCC": cancer = cptac.Lscc() elif input_cancer_type == "PDAC": cancer = cptac.Pdac() mutation_df = cancer.get_somatic_mutation() mutation_df = mutation_df[mutation_df.Gene == mutated_gene] mutation_df = mutation_df[mutation_df.Mutation != 'Silent'] mutation_df = mutation_df[mutation_df.Mutation != 'RNA'] mutation_df = mutation_df[mutation_df.Mutation != 'synonymous SNV']
headers = [ 'A1BG_p.E298K', 'A1BG_p.S181N', 'A1CF_p.F487L', 'A1CF_p.S236Y', 'A2ML1_p.A8V', 'A2ML1_p.G1306D', 'A2ML1_p.L1347F', 'A2ML1_p.L82I', 'A2ML1_p.P712S', 'A2ML1_p.R443Q', 'ZYG11A_p.Q442H', 'ZYG11B_p.H315R', 'ZYG11B_p.R495M', 'ZYG11B_p.R728C', 'ZYX_p.C447Y', 'ZZEF1_p.A2723V', 'ZZEF1_p.D845Y', 'ZZEF1_p.K1251E', 'ZZEF1_p.K2387Sfs*40', 'ZZZ3_p.Y891C' ] test_coord = ((94, 51558), (0, 0), (45, 25436)) test_vals = (0, 0, 0) PASS = check_getter(df, dimensions, headers, test_coord, test_vals) print_test_result(PASS) en = cptac.Endometrial(version="latest") print("\nRunning tests:\n") print("Testing getters...") test_get_clinical() test_get_derived_molecular() test_get_experimental_design() test_get_acetylproteomics() test_get_proteomics() test_get_transcriptomics() test_get_circular_RNA() test_get_miRNA() test_get_CNV() test_get_phosphoproteomics() test_get_phosphoproteomics_gene()
# -*- coding: utf-8 -*-
"""bioinfoProteomicsHW4.ipynb

# Loading CPTAC Data in google colab using a Python package developed in the Payne lab at BYU.

Downloads the endometrial dataset, extracts the proteomics and clinical
tables, and saves both as CSV files locally and in the mounted Google Drive.
"""

import shutil
import subprocess
import sys

# Install cptac before importing it. This replaces the notebook-only
# "!pip install -q cptac" shell magic, which is not valid Python.
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "cptac"])

import cptac
import pandas as pd

# Download the endometrial data and create a new endometrial dataset object.
cptac.download(dataset="Endometrial")
en = cptac.Endometrial()

# Obtain proteomic data (enProt) and patient information (enInfo).
enProt = en.get_proteomics()
enInfo = en.get_clinical()

# This was written for Google Colab, so the drive has to be mounted before
# the data can be exported to it.
from google.colab import drive
drive.mount('drive')

# Save the proteome and clinical information tables, then copy them into the
# drive so they can be worked on locally or in another language.
# shutil.copy replaces the notebook-only "!cp" shell magic.
enProt.to_csv("enProt.csv")
shutil.copy("enProt.csv", "drive/My Drive")
enInfo.to_csv("enInfo.csv")
shutil.copy("enInfo.csv", "drive/My Drive")
from statannot import add_stat_annotation
from scipy.stats import pearsonr


def downloadCptac():
    """Call cptac.list_datasets(), then download four datasets.

    The datasets downloaded are endometrial, colon, ovarian and RenalCcrcc.
    The luad and brca downloads are commented out.
    """
    # The available datasets are queried with cptac.list_datasets(); its
    # return value is not used here.
    cptac.list_datasets()
    cptac.download(dataset = "endometrial")
    cptac.download(dataset = 'colon')
    cptac.download(dataset = 'ovarian')
    cptac.download(dataset = 'RenalCcrcc')
    #cptac.download(dataset ='luad')
    #cptac.download(dataset ='brca')


# Runs at import time: the downloads happen before the datasets are loaded.
downloadCptac()

# Module-level dataset objects used by the functions below.
endometrialData = cptac.Endometrial()
colorectalData = cptac.Colon()
ovarianData = cptac.Ovarian()
renalData = cptac.RenalCcrcc()
# NOTE(review): luad and brca are loaded here although their downloads in
# downloadCptac() are commented out - confirm the data is already installed.
lungData = cptac.Luad()
breastData = cptac.Brca()


def listDataForEachCancer():
    """Print a heading for each dataset followed by its list_data() call.

    Covers the endometrial, colorectal, ovarian and renal datasets, in that
    order. lungData and breastData are not listed in this part of the file.
    """
    print("endometrial")
    endometrialData.list_data()
    print("\n\ncolorectal")
    colorectalData.list_data()
    print("\n\novarian")
    ovarianData.list_data()
    print("\n\nrenal")
    renalData.list_data()