Esempio n. 1
0
def main():
    """Export CPTAC phosphoproteomics and proteomics tables for one cancer type.

    Command-line options:
        --cancerType  Cancer type to export, matched case-insensitively against
                      brca, ccrcc, coad, ovca, luad and endometrial.
        --getData     When set, download the CPTAC datasets before loading.

    Writes ``phos_file.tsv`` and ``prot_file.tsv`` (tab-separated) into the
    current working directory. A missing or unsupported ``--cancerType`` is
    reported through ``parser.error`` (usage message, exit status 2) instead
    of an ``AttributeError`` or a silent exit.
    """
    # Accepted --cancerType value -> name of the cptac dataset class. Only the
    # class name is stored so the arguments can be validated before any cptac
    # attribute is looked up.
    dataset_classes = {
        'brca': 'Brca',
        'ccrcc': 'Ccrcc',
        'coad': 'Colon',
        'ovca': 'Ovarian',
        'luad': 'Luad',
        'endometrial': 'Endometrial',
    }
    supported = ', '.join(dataset_classes)

    parser = argparse.ArgumentParser()
    parser.add_argument('--cancerType', dest='type',
                        help='Cancer type to be collected')
    parser.add_argument('--getData', dest='get', action='store_true',
                        default=False, help='Set flag to get all data')
    opts = parser.parse_args()

    # The download step stays ahead of the type check, as before, so that
    # --getData still fetches the data even when the type turns out invalid.
    if opts.get:
        for ds in ['brca', 'ccrcc', 'colon', 'ovarian', 'endometrial', 'luad']:
            cptac.download(dataset=ds)

    if opts.type is None:
        parser.error('--cancerType is required (one of: {})'.format(supported))

    class_name = dataset_classes.get(opts.type.lower())
    if class_name is None:
        parser.error('unsupported --cancerType {!r} (expected one of: {})'
                     .format(opts.type, supported))
    dat = getattr(cptac, class_name)()

    # Both tables are fetched before either file is written.
    # NOTE(review): an earlier revision considered flattening the
    # phosphoproteomics column labels with ' '.join(col) before export;
    # the columns are written unchanged here.
    df = dat.get_phosphoproteomics()
    pdf = dat.get_proteomics()

    df.to_csv(path_or_buf="phos_file.tsv", sep='\t')
    pdf.to_csv(path_or_buf='prot_file.tsv', sep='\t')
def compute_regression(input_cancer_type):
    """Run the per-gene regression for one cancer type and save it as CSV.

    The protein/transcript table returned by ``dc.get_prot_trans_df`` is
    grouped by its 'Gene' column and ``regression`` is applied to each group.
    Rows containing missing values are dropped, an ``*_FDR`` column is added
    for each of the interaction, condition and intercept p-value columns, and
    the table is written to ``<input_cancer_type>_regressions.csv``.

    Args:
        input_cancer_type: One of "CCRCC", "Endometrial", "LUAD", "HNSCC",
            "LSCC" or "PDAC" (case-sensitive).

    Raises:
        ValueError: If ``input_cancer_type`` is not one of the names above.
            Previously this surfaced as an UnboundLocalError on ``cancer``.
    """
    # Cancer label -> name of the cptac dataset class. Storing the class name
    # lets the label be validated before cptac is touched.
    dataset_classes = {
        "CCRCC": "Ccrcc",
        "Endometrial": "Endometrial",
        "LUAD": "Luad",
        "HNSCC": "Hnscc",
        "LSCC": "Lscc",
        "PDAC": "Pdac",
    }
    class_name = dataset_classes.get(input_cancer_type)
    if class_name is None:
        raise ValueError(
            "Unsupported cancer type {!r}; expected one of: {}".format(
                input_cancer_type, ", ".join(dataset_classes)))
    cancer = getattr(cptac, class_name)()

    df = dc.get_prot_trans_df(cancer)
    results = df.groupby('Gene').apply(regression)

    # One row per gene; the gene labels become a regular column again.
    reg_df = pd.DataFrame(list(results))
    reg_df.index = results.index
    reg_df.reset_index(inplace=True)
    reg_df = reg_df.dropna()

    # Element [1] of ssm.fdrcorrection's result is stored as the FDR column
    # (presumably the corrected p-values from statsmodels - confirm).
    for term in ('interaction', 'condition', 'intercept'):
        reg_df[term + '_FDR'] = ssm.fdrcorrection(reg_df[term + '_pval'])[1]
    reg_df['Cancer'] = input_cancer_type

    file_name = input_cancer_type + '_regressions.csv'
    reg_df.to_csv(file_name, index=False)
Esempio n. 3
0
def load_cancers(include_pdac=False):
    """Instantiate the CPTAC datasets and return them with their labels.

    Args:
        include_pdac: When true, the PDAC dataset is loaded as well and
            appended after the five default cancers.

    Returns:
        A ``(cancers, cancer_names)`` pair of parallel lists: the cptac
        dataset objects and their labels, in the same order.
    """
    # (label, cptac class name), in load order.
    catalogue = [
        ('CCRCC', 'Ccrcc'),
        ('Endometrial', 'Endometrial'),
        ('LUAD', 'Luad'),
        ('HNSCC', 'Hnscc'),
        ('LSCC', 'Lscc'),
    ]
    if include_pdac:
        catalogue.append(('PDAC', 'Pdac'))

    cancers = []
    cancer_names = []
    for label, class_name in catalogue:
        cancers.append(getattr(cptac, class_name)())
        cancer_names.append(label)
    return cancers, cancer_names
Esempio n. 4
0
def getDataForCancer(ctype):
    """Return a freshly constructed cptac dataset for a cancer-type label.

    Args:
        ctype: Cancer label, matched case-insensitively against 'brca',
            'ccrcc', 'coad', 'ovca', 'luad' and 'endometrial'.

    Returns:
        The cptac dataset object for the label (e.g. ``cptac.Colon()`` for
        'coad').

    Raises:
        SystemExit: If ``ctype`` is not a supported label. The exception type
            is unchanged, but it now carries an explanatory message (and so a
            non-zero exit status) instead of exiting silently with status 0.
    """
    # Label -> name of the cptac dataset class. Storing the class name lets
    # the label be validated before cptac is touched.
    dataset_classes = {
        'brca': 'Brca',
        'ccrcc': 'Ccrcc',
        'coad': 'Colon',
        'ovca': 'Ovarian',
        'luad': 'Luad',
        'endometrial': 'Endometrial',
    }
    # Non-string input (e.g. None) is treated as an unknown label rather than
    # failing with AttributeError on .lower().
    key = ctype.lower() if isinstance(ctype, str) else None
    class_name = dataset_classes.get(key)
    if class_name is None:
        raise SystemExit(
            "Unsupported cancer type {!r}; expected one of: {}".format(
                ctype, ', '.join(dataset_classes)))
    return getattr(cptac, class_name)()
Esempio n. 5
0
 def __init__(self):
     """Download and load the endometrial, ovarian, ccrcc and colon datasets.

     Each dataset is downloaded with version 'latest' and then instantiated.
     The loaded objects are stored on the instance and also collected, in
     the same order, in ``self.datasets``.
     """
     # brca, gbm, hnscc and luad are not handled by this initializer.
     for dataset_name in ("endometrial", "ovarian", "ccrcc", "colon"):
         cptac.download(dataset=dataset_name, version='latest')

     self.en = cptac.Endometrial()
     self.ovarian = cptac.Ovarian()
     self.ccrcc = cptac.Ccrcc()
     self.colon = cptac.Colon()

     self.datasets = [self.en, self.ovarian, self.ccrcc, self.colon]
Esempio n. 6
0
def cptacData():
    """Download the CPTAC datasets and load each one into a dictionary.

    Returns:
        dict mapping the dataset name ('brca', 'ccrcc', 'colon', 'ovarian',
        'endometrial', 'luad') to its loaded cptac dataset object.
    """
    print("Loading cptac datasets")

    # Every dataset must be downloaded before it can be loaded. Per the
    # original author's note these are the cancers available without login
    # information; hnscc, gbm and lscc are not included.
    download_order = ('brca', 'ccrcc', 'colon', 'ovarian', 'luad',
                      'endometrial')
    print("Downloading cptac data")
    for dataset_name in download_order:
        cptac.download(dataset=dataset_name)

    # (dictionary key, cptac class name), in construction order.
    load_order = (
        ('brca', 'Brca'),
        ('ccrcc', 'Ccrcc'),
        ('colon', 'Colon'),
        ('ovarian', 'Ovarian'),
        ('endometrial', 'Endometrial'),
        ('luad', 'Luad'),
    )
    fdict = {}
    for key, class_name in load_order:
        fdict[key] = getattr(cptac, class_name)()
    return fdict
Esempio n. 7
0
def test_get_frequently_mutated_en_default_cutoff():
    """Check ut.get_frequently_mutated on Endometrial with the default cutoff.

    Each check group pairs coordinates with the values expected there
    (presumably (row, column) positions in the returned frame) and is passed
    to ``check_getter`` together with the expected dimensions and headers.
    The combined outcome is reported through ``print_test_result``: the test
    passes only if every group passes. Previously each group overwrote the
    result, so only the last group (TP53) decided the outcome.
    """
    en = cptac.Endometrial()
    print('Running get_frequently_mutated...')
    df = ut.get_frequently_mutated(en)

    dimensions = (232, 4)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut']
    # test gene names
    test_coord_names = ((53, 0), (32, 0), (227, 0))
    test_vals_names = ('CTCF', 'CCDC168', 'ZNF536')

    # Expected values are mutated-sample counts divided by this total.
    total_tumors = 95
    # missense and truncation don't add up to unique_samples_mutated
    # (missense and truncation in the same sample)
    test_coord_CTCF = ((53, 1), (53, 2), (53, 3))
    test_vals_CTCF = (27 / total_tumors, 9 / total_tumors, 23 / total_tumors)
    # missense and truncation values are equal
    test_coord_CCDC168 = ((32, 1), (32, 2), (32, 3))
    test_vals_CCDC168 = (16 / total_tumors, 11 / total_tumors,
                         11 / total_tumors)
    # no truncation-type mutations
    test_coord_ZNF536 = ((227, 1), (227, 2), (227, 3))
    test_vals_ZNF536 = (12 / total_tumors, 12 / total_tumors, 0 / total_tumors)
    # close to the cutoff
    test_coord_DICER1 = ((61, 1), (61, 2), (61, 3))
    test_vals_DICER1 = (10 / total_tumors, 10 / total_tumors, 1 / total_tumors)
    # common test
    test_coord_TP53 = ((205, 1), (205, 2), (205, 3))
    test_vals_TP53 = (21 / total_tumors, 15 / total_tumors, 7 / total_tumors)

    test_coord_vals = [(test_coord_names, test_vals_names),
                       (test_coord_CTCF, test_vals_CTCF),
                       (test_coord_CCDC168, test_vals_CCDC168),
                       (test_coord_ZNF536, test_vals_ZNF536),
                       (test_coord_DICER1, test_vals_DICER1),
                       (test_coord_TP53, test_vals_TP53)]

    # Run every group (so each one still reports its own details) and
    # remember whether any of them failed.
    PASS = True
    for coord, vals in test_coord_vals:
        if not check_getter(df, dimensions, headers, coord, vals):
            PASS = False

    print_test_result(PASS)
Esempio n. 8
0
def test_get_frequently_mutated_en_cutoff_20_cutoff():
    """Check ut.get_frequently_mutated on Endometrial with ``cutoff=0.2``.

    Each check group pairs coordinates with the values expected there
    (presumably (row, column) positions in the returned frame) and is passed
    to ``check_getter`` together with the expected dimensions and headers.
    The combined outcome is reported through ``print_test_result``: the test
    passes only if every group passes. Previously each group overwrote the
    result, so only the last group (TP53) decided the outcome.
    """
    en = cptac.Endometrial()
    print('Running get_frequently_mutated...')
    df = ut.get_frequently_mutated(en, cutoff=0.2)

    dimensions = (10, 4)
    headers = ['Gene', 'Unique_Samples_Mut', 'Missense_Mut', 'Truncation_Mut']
    # test gene names
    test_coord_names = ((0, 0), (2, 0), (8, 0))
    test_vals_names = ('ARID1A', 'CTNNB1', 'TP53')

    # Expected values are mutated-sample counts divided by this total.
    total_tumors = 95
    # missense and truncation don't add up to unique_samples_mutated
    # (missense and truncation in the same sample, counted in each category)
    test_coord_ARID1A = ((0, 1), (0, 2), (0, 3))
    test_vals_ARID1A = (43 / total_tumors, 13 / total_tumors,
                        38 / total_tumors)
    # no truncation-type mutations
    test_coord_CTNNB1 = ((2, 1), (2, 2), (2, 3))
    test_vals_CTNNB1 = (29 / total_tumors, 29 / total_tumors, 0 / total_tumors)
    # close to the cutoff
    test_coord_ZFHX3 = ((9, 1), (9, 2), (9, 3))
    test_vals_ZFHX3 = (21 / total_tumors, 8 / total_tumors, 16 / total_tumors)
    # missense and truncation almost equal
    test_coord_KMT2B = ((3, 1), (3, 2), (3, 3))
    test_vals_KMT2B = (23 / total_tumors, 11 / total_tumors, 12 / total_tumors)
    # common test
    test_coord_TP53 = ((8, 1), (8, 2), (8, 3))
    test_vals_TP53 = (21 / total_tumors, 15 / total_tumors, 7 / total_tumors)

    test_coord_vals = [(test_coord_names, test_vals_names),
                       (test_coord_ARID1A, test_vals_ARID1A),
                       (test_coord_CTNNB1, test_vals_CTNNB1),
                       (test_coord_ZFHX3, test_vals_ZFHX3),
                       (test_coord_KMT2B, test_vals_KMT2B),
                       (test_coord_TP53, test_vals_TP53)]

    # Run every group (so each one still reports its own details) and
    # remember whether any of them failed.
    PASS = True
    for coord, vals in test_coord_vals:
        if not check_getter(df, dimensions, headers, coord, vals):
            PASS = False

    print_test_result(PASS)
    os.path.realpath('Make_Cancer_Delta_Corr_and_P_Value_Dataframe'))
parentdir = os.path.dirname(currentdir)
parentdir = os.path.dirname(parentdir)
sys.path.append(parentdir)
import Delta_Correlation as dc

# Command-line arguments: <cancer_type> <mutated_gene> <permutation_number>.
# Fail with a usage message rather than a bare IndexError when one is missing.
if len(sys.argv) < 4:
    sys.exit("usage: {} <cancer_type> <mutated_gene> <permutation_number>"
             .format(sys.argv[0]))

input_cancer_type = sys.argv[1]
mutated_gene = sys.argv[2]
input_permutation_number = int(sys.argv[3])

# Default cutoff; the Endometrial branch below lowers it to 10.
# NOTE(review): the cutoff is consumed by code outside this excerpt - its
# exact meaning cannot be confirmed from here.
cutoff = 15

if input_cancer_type == "CCRCC":
    cancer = cptac.Ccrcc()
elif input_cancer_type == "Endometrial":
    cancer = cptac.Endometrial()
    cutoff = 10
elif input_cancer_type == "LUAD":
    cancer = cptac.Luad()
elif input_cancer_type == "HNSCC":
    cancer = cptac.Hnscc()
elif input_cancer_type == "LSCC":
    cancer = cptac.Lscc()
elif input_cancer_type == "PDAC":
    cancer = cptac.Pdac()
else:
    # Without this branch `cancer` stays undefined and the next statement
    # fails with a NameError that hides the real problem.
    sys.exit("Unsupported cancer type {!r}; expected one of: CCRCC, "
             "Endometrial, LUAD, HNSCC, LSCC, PDAC".format(input_cancer_type))

# Keep only the requested gene's somatic mutations, excluding the three
# mutation labels filtered out below.
mutation_df = cancer.get_somatic_mutation()
mutation_df = mutation_df[mutation_df.Gene == mutated_gene]
mutation_df = mutation_df[
    ~mutation_df.Mutation.isin(['Silent', 'RNA', 'synonymous SNV'])]
Esempio n. 10
0
    headers = [
        'A1BG_p.E298K', 'A1BG_p.S181N', 'A1CF_p.F487L', 'A1CF_p.S236Y',
        'A2ML1_p.A8V', 'A2ML1_p.G1306D', 'A2ML1_p.L1347F', 'A2ML1_p.L82I',
        'A2ML1_p.P712S', 'A2ML1_p.R443Q', 'ZYG11A_p.Q442H', 'ZYG11B_p.H315R',
        'ZYG11B_p.R495M', 'ZYG11B_p.R728C', 'ZYX_p.C447Y', 'ZZEF1_p.A2723V',
        'ZZEF1_p.D845Y', 'ZZEF1_p.K1251E', 'ZZEF1_p.K2387Sfs*40',
        'ZZZ3_p.Y891C'
    ]
    test_coord = ((94, 51558), (0, 0), (45, 25436))
    test_vals = (0, 0, 0)

    PASS = check_getter(df, dimensions, headers, test_coord, test_vals)
    print_test_result(PASS)


# Load the Endometrial dataset once at module level, requesting version
# "latest". NOTE(review): presumably the test_* functions called below read
# this global `en` - confirm against their definitions.
en = cptac.Endometrial(version="latest")

print("\nRunning tests:\n")

# Run the getter tests one after another, in this fixed order.
print("Testing getters...")
test_get_clinical()
test_get_derived_molecular()
test_get_experimental_design()
test_get_acetylproteomics()
test_get_proteomics()
test_get_transcriptomics()
test_get_circular_RNA()
test_get_miRNA()
test_get_CNV()
test_get_phosphoproteomics()
test_get_phosphoproteomics_gene()
# -*- coding: utf-8 -*-
"""bioinfoProteomicsHW4.ipynb
# Loading CPTAC data in Google Colab using a Python package developed in the Payne lab at BYU.
"""
# Install cptac, then import it together with pandas.
# NOTE: lines starting with "!" are notebook shell escapes, not Python; this
# file only runs as-is inside a notebook environment such as Google Colab.
!pip install -q cptac  
import cptac 
import pandas as pd

# Download the endometrial data and create an Endometrial dataset object.
cptac.download(dataset="Endometrial")
en = cptac.Endometrial()

# Fetch the proteomics table (enProt) and the clinical/patient table (enInfo).
enProt = en.get_proteomics()
enInfo = en.get_clinical()

# This was run in Google Colab, so Google Drive has to be mounted before the
# data can be exported to it.
from google.colab import drive
drive.mount('drive') 

# Save the proteomics and clinical tables as CSV and copy them into Drive so
# they can be worked on locally or in another language.
enProt.to_csv("enProt.csv")
!cp enProt.csv "drive/My Drive"
enInfo.to_csv("enInfo.csv")
!cp enInfo.csv "drive/My Drive"
Esempio n. 12
0
from statannot import add_stat_annotation
from scipy.stats import pearsonr


def downloadCptac():
    """Call ``cptac.list_datasets()`` and download the datasets used here.

    Downloads, in order: endometrial, colon, ovarian and RenalCcrcc.
    The 'luad' and 'brca' datasets are not downloaded by this function.
    """
    # Presumably shows which datasets are available - confirm against the
    # installed cptac version.
    cptac.list_datasets()

    for dataset_name in ("endometrial", 'colon', 'ovarian', 'RenalCcrcc'):
        cptac.download(dataset=dataset_name)
# Run the downloads at import time, before any dataset object is constructed.
downloadCptac()

# One cptac dataset object per cancer type, kept as module-level globals.
endometrialData = cptac.Endometrial()
colorectalData = cptac.Colon()
ovarianData = cptac.Ovarian()
renalData = cptac.RenalCcrcc()
# NOTE(review): downloadCptac() does not download 'luad' or 'brca';
# presumably these two constructors need that data installed beforehand -
# confirm.
lungData = cptac.Luad()
breastData = cptac.Brca()

def listDataForEachCancer():
    print("endometrial")
    endometrialData.list_data()
    print("\n\ncolorectal")
    colorectalData.list_data()
    print("\n\novarian")
    ovarianData.list_data()
    print("\n\nrenal")
    renalData.list_data()