Beispiel #1
0
def main():
    """Export CPTAC phosphoproteomics and proteomics tables for one cancer type.

    Command-line options:
        --cancerType  Dataset to load; one of brca, ccrcc, coad, ovca, luad
                      or endometrial (matched case-insensitively).
        --getData     Download every supported dataset before loading.

    The phosphoproteomics table is written to ``phos_file.tsv`` and the
    proteomics table to ``prot_file.tsv`` (both tab-separated) in the
    current working directory.

    Exits through ``parser.error`` (usage message, status 2) when
    ``--cancerType`` is missing or not one of the supported values.  The
    optional download step runs before that validation.
    """
    # Command-line key -> name of the cptac loader class.  The class is
    # resolved with getattr only after validation, so a bad argument is
    # reported without touching any dataset.
    loaders = {
        'brca': 'Brca',
        'ccrcc': 'Ccrcc',
        'coad': 'Colon',
        'ovca': 'Ovarian',
        'luad': 'Luad',
        'endometrial': 'Endometrial',
    }

    parser = argparse.ArgumentParser()
    parser.add_argument('--cancerType', dest='type',
                        help='Cancer type to be collected')
    parser.add_argument('--getData', dest='get', action='store_true',
                        default=False, help='Set flag to get all data')
    opts = parser.parse_args()

    if opts.get:
        for ds in ['brca', 'ccrcc', 'colon', 'ovarian', 'endometrial', 'luad']:
            cptac.download(dataset=ds)

    # Without this guard a missing --cancerType leaves opts.type as None and
    # the .lower() call below raises AttributeError.
    if opts.type is None:
        parser.error('--cancerType is required')

    loader_name = loaders.get(opts.type.lower())
    if loader_name is None:
        # Report the problem instead of exiting silently.
        parser.error('unknown cancer type {!r}; choose one of: {}'.format(
            opts.type, ', '.join(sorted(loaders))))

    dat = getattr(cptac, loader_name)()

    df = dat.get_phosphoproteomics()
    pdf = dat.get_proteomics()

    df.to_csv(path_or_buf="phos_file.tsv", sep='\t')
    pdf.to_csv(path_or_buf='prot_file.tsv', sep='\t')
Beispiel #2
0
def getDataForCancer(ctype):
    """Return the cptac dataset object that corresponds to ``ctype``.

    ``ctype`` is lower-cased and matched against brca, ccrcc, coad, ovca,
    luad and endometrial.  Any other value ends the program via ``exit()``.
    """
    # Accepted key -> name of the cptac class to instantiate.
    loader_names = {
        'brca': 'Brca',
        'ccrcc': 'Ccrcc',
        'coad': 'Colon',
        'ovca': 'Ovarian',
        'luad': 'Luad',
        'endometrial': 'Endometrial',
    }
    key = ctype.lower()
    if key not in loader_names:
        exit()
    return getattr(cptac, loader_names[key])()
Beispiel #3
0
def cptacData():
    """Download the listed CPTAC datasets, then load each one.

    Returns a dict keyed by dataset name ('brca', 'ccrcc', 'colon',
    'ovarian', 'endometrial', 'luad') whose values are the loaded cptac
    dataset objects.
    """
    print("Loading cptac datasets")
    # Make sure every dataset is downloaded first.  These are the cancers
    # listed as available without login information; hnscc, gbm and lscc
    # are left out.
    to_download = ('brca', 'ccrcc', 'colon', 'ovarian', 'luad', 'endometrial')
    print("Downloading cptac data")
    for name in to_download:
        cptac.download(dataset=name)

    # Then load them into a dictionary: (key, cptac class name) pairs,
    # instantiated in this order.
    loaders = (
        ('brca', 'Brca'),
        ('ccrcc', 'Ccrcc'),
        ('colon', 'Colon'),
        ('ovarian', 'Ovarian'),
        ('endometrial', 'Endometrial'),
        ('luad', 'Luad'),
    )
    return {key: getattr(cptac, cls_name)() for key, cls_name in loaders}
#!/usr/bin/env python
# coding: utf-8

### Creation of BoxPlots for ESR1 Protein Expression ###

import cptac  # import cptac to download cptac protein and clinical data
import pandas as pd  # import pandas for
import matplotlib.pyplot as plt  # import matplotlib for boxplot
import seaborn as sns  # import seaborn for boxplot
from scipy import stats
from statannot import add_stat_annotation

# Prepare the data for the ESR1 box plots: fetch the CPTAC breast cancer
# dataset, pull the ESR1 protein column, and build a frame of ESR1 expression
# plus age for the patients whose ER IHC score is "3+".
cptac.download(dataset="Brca")  # fetch the breast cancer dataset
br = cptac.Brca()  # loaded dataset object used for every lookup below

protein_data = br.get_proteomics()  # proteomics table
protein_data = protein_data.droplevel(1, axis=1)  # drop column level 1 so columns are no longer a multi index
clinical_data = br.get_clinical()  # clinical table

esr1 = protein_data["ESR1"]  # ESR1 protein expression column
clinical_data["ER.IHC.Score"] = clinical_data["ER.IHC.Score"].fillna(
    "Not reported")  # replace missing scores with an explicit label

er_mask = clinical_data[
    "ER.IHC.Score"] == "3+"  # boolean mask; a score of "3+" is treated as ER-positive
patients = esr1[er_mask]  # ESR1 expression restricted to the masked patients
ages = clinical_data["Age.in.Month"][er_mask] / 12  # months -> years for the same patients
er_positive_patients = pd.DataFrame(patients,
                                    columns=["ESR1"])  # new frame with a single "ESR1" column
er_positive_patients["Age"] = ages  # add the ages as a second column
category = []  # empty list for categories (filled by code outside this excerpt)
Beispiel #5
0
import cptac
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# Alyssa's code
# Fetch the CPTAC breast cancer dataset and pull the proteomics,
# transcriptomics and clinical tables from it.
cptac.download(dataset="Brca")
br = cptac.Brca()

protein_data = br.get_proteomics()

# Remove the "multi" part of the dataframe (the dataframes are MultiIndex pandas dataframes)
# by dropping column level 1.
protein_data = protein_data.droplevel(1, axis=1)

rna_data = br.get_transcriptomics()
clinical_data = br.get_clinical()

# Convert the age from months to years and keep it as a new column.
clinical_data["Age_in_years"] = clinical_data["Age.in.Month"]/12


# clinical_data.to_csv("~/Desktop/qbioresearch/qbio_data_analysis_alyssa/data/clinical_data.csv")
# protein_data.to_csv("~/Desktop/qbioresearch/qbio_data_analysis_alyssa/data/protein_data.csv")
# rna_data.to_csv("~/Desktop/qbioresearch/qbio_data_analysis_alyssa/data/rna_data.csv")

# NOTE(review): the disabled call below passes index_col=1 while its trailing
# comment talks about index_col=0 — confirm which one is intended before re-enabling.
# clinical_data_readin = pd.read_csv("~/Desktop/qbioresearch/qbio_data_analysis_alyssa/data/clinical_data.csv", index_col=1) #The index_col=0 creates the index (rownames) as the first column. 


# Check that the patients are in the same order in rna_data and protein_data.
# We are looking at the gene expression and protein information of EACH patient, so the data needs to be in pairs.
Beispiel #6
0
    # To view available datasets, enter 'cptac.list_datasets()'.
    cptac.list_datasets()
    cptac.download(dataset = "endometrial")
    cptac.download(dataset = 'colon')
    cptac.download(dataset = 'ovarian')
    cptac.download(dataset = 'RenalCcrcc')
    #cptac.download(dataset ='luad')
    #cptac.download(dataset ='brca')
# Run the download step, then load each dataset into a module-level variable.
downloadCptac()

endometrialData = cptac.Endometrial()
colorectalData = cptac.Colon()
ovarianData = cptac.Ovarian()
renalData = cptac.RenalCcrcc()
# NOTE(review): the 'luad' and 'brca' download calls above are commented out,
# yet both datasets are loaded here — confirm they are already available locally.
lungData = cptac.Luad()
breastData = cptac.Brca()

def listDataForEachCancer():
    """Print a heading and call ``list_data()`` for each of four datasets.

    The module-level endometrial, colorectal, ovarian and renal dataset
    objects are handled in that order.
    """
    # Heading text paired with the dataset it introduces; every heading after
    # the first starts with two newlines to separate the sections.
    sections = (
        ("endometrial", endometrialData),
        ("\n\ncolorectal", colorectalData),
        ("\n\novarian", ovarianData),
        ("\n\nrenal", renalData),
    )
    for heading, dataset in sections:
        print(heading)
        dataset.list_data()

# Print the list_data() output for the endometrial, colorectal, ovarian and renal datasets.
listDataForEachCancer()

#################################################################
# Correlation: Proteomics vs Transcriptom in Endometrial Cancer #