def __init__(self, no_internet, version):
    """Load all of the washuhnscc dataframes as values in the self._data dict variable, with names as keys, and format them properly.

    Parameters:
    version (str): The version number to load, or the string "latest" to load the latest version. Forwarded to the parent class and to MssmClinical.
    no_internet (bool): Whether to skip the index update step because it requires an internet connection. Forwarded to the parent class and to MssmClinical.
    """

    # Ignore logging messages.
    # NOTE(review): this raises the level of the *root* logger, so it affects
    # logging for the whole process, not only this package.
    logger = logging.getLogger()
    logger.setLevel(logging.CRITICAL)

    # This keeps a record of all versions that the code is equipped to handle.
    # That way, if there's a new data release but the user hasn't updated the
    # package, it won't try to parse a data version it isn't equipped to handle.
    valid_versions = ["1.0"]

    data_files = {
        "1.0": [
            "HNSCC_discovery.dnp.annotated.exonic.maf.gz",
            "HNSCC_NAT_RNA-Seq_Expr_WashU_FPKM.tsv.gz",
            "HNSCC_tumor_RNA-Seq_Expr_WashU_FPKM.tsv.gz",
            "HNSCC_mature_miRNA_combined.tsv",
            "HNSCC_precursor_miRNA_combined.tsv",
            "HNSCC_total_miRNA_combined.tsv",
            "HNSCC_xCell.txt",
            "CIBERSORT.Output_Abs_HNSCC.txt",
            "HNSCC.gene_level.from_seg.filtered.tsv",
            "gencode.v22.annotation.gtf.gz",
            "CPTAC_pancan_RNA_tumor_purity_ESTIMATE_WashU.tsv.gz",
            "README_miRNA",
            "README_CIBERSORT",
            "README_xCell",
            "README_somatic_mutation_WXS",
            "README_gene_expression",
            "README.boxnote",
            "README_ESTIMATE_WashU",
        ]
    }

    # Call the parent class __init__ function
    super().__init__(cancer_type="washuhnscc",
                     version=version,
                     valid_versions=valid_versions,
                     data_files=data_files,
                     no_internet=no_internet)

    # Get the clinical df; its index is used below to slice the
    # cancer-specific Patient_IDs out of the pan-cancer tumor_purity file.
    mssmclin = MssmClinical(no_internet=no_internet,
                            version=version,
                            filter_type='pancanhnscc')
    clinical_df = mssmclin.get_clinical()

    # Plain-text readme files: file name -> key in self._readme_files
    text_readmes = {
        "README_miRNA": "readme_miRNA",
        "README_CIBERSORT": "readme_cibersort",
        "README_xCell": "readme_xcell",
        "README_somatic_mutation_WXS": "readme_somatic_mutation",
        "README_gene_expression": "readme_transcriptomics",
        "README_ESTIMATE_WashU": "readme_tumor_purity",
    }

    # Load the data into dataframes in the self._data dict
    loading_msg = f"Loading {self.get_cancer_type()} v{self.version()}"
    for file_path in self._data_files_paths:
        # Print a loading message. We add a dot every time, so the user knows it's not frozen.
        loading_msg = loading_msg + "."
        print(loading_msg, end='\r')

        # The last path element is the file name; it identifies which parser to use.
        file_name = file_path.split(os.sep)[-1]

        if file_name == "HNSCC_discovery.dnp.annotated.exonic.maf.gz":
            df = pd.read_csv(file_path, sep='\t')  # parsed once (the file is large)
            df['Patient_ID'] = df['Tumor_Sample_Barcode']
            # Rename the columns we want to keep to the package-wide names
            df = df.rename(
                columns={
                    "Hugo_Symbol": "Gene",
                    "Gene": "Gene_Database_ID",
                    "Variant_Classification": "Mutation",
                    "HGVSp_Short": "Location"
                })
            df = df.set_index("Patient_ID")
            # Put Gene, Mutation, Location first; keep every other column after them
            lead_cols = ["Gene", "Mutation", "Location"]
            df = df[lead_cols + [col for col in df.columns if col not in lead_cols]]
            df.index = df.index.str.replace(r"_T", "", regex=True)
            self._data["somatic_mutation"] = df

        elif file_name == "HNSCC_NAT_RNA-Seq_Expr_WashU_FPKM.tsv.gz":
            df_norm = pd.read_csv(file_path, sep='\t')
            # change names to universal package names
            df_norm = df_norm.rename(columns={
                "gene_name": "Name",
                "gene_id": "Database_ID"
            })
            df_norm = df_norm.set_index(["Name", "Database_ID"])
            df_norm = df_norm.sort_index()
            df_norm = df_norm.T
            df_norm.index.name = "Patient_ID"
            # relabel normal samples: "-A" becomes ".N"
            df_norm.index = df_norm.index.str.replace(r"-A", ".N", regex=True)
            self._helper_tables["transcriptomics_norm"] = df_norm

        elif file_name == "HNSCC_tumor_RNA-Seq_Expr_WashU_FPKM.tsv.gz":
            df = pd.read_csv(file_path, sep='\t')
            # change names to universal package names
            df = df.rename(columns={
                "gene_name": "Name",
                "gene_id": "Database_ID"
            })
            df = df.set_index(["Name", "Database_ID"])
            df = df.sort_index()
            df = df.T
            df.index.name = "Patient_ID"
            # remove label for tumor samples
            df.index = df.index.str.replace(r"-T", "", regex=True)
            self._helper_tables["transcriptomics_tumor"] = df

        elif 'miRNA_combined' in file_name:
            # get type of miRNA data (precursor, mature, or total)
            miRNA_type = file_name.split('_')[1]
            if miRNA_type == 'mature':
                index_cols = ['Name', 'ID', 'Alias', 'Derives_from']
            else:
                index_cols = ['Name', 'ID', 'Alias']
            df = pd.read_csv(file_path, delimiter='\t', index_col=index_cols)
            df = df.transpose()
            # average duplicates for C3L-02617 and C3N-02727
            df = average_replicates(df, common=r'\.\d$')
            df.index = df.index.str.replace(r'\.T$', '', regex=True)
            df.index = df.index.str.replace(r'\.A$', '.N', regex=True)
            df.index.name = 'Patient_ID'

            # Sort each group by Patient_ID, then stack tumor rows above normal rows
            is_normal = df.index.str.contains(r'\.N$', regex=True)
            normal = df.loc[is_normal].sort_values(by=["Patient_ID"])
            tumor = df.loc[~is_normal].sort_values(by=["Patient_ID"])
            # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
            all_df = pd.concat([tumor, normal])
            self._data[miRNA_type + '_miRNA'] = all_df

        elif file_name == "HNSCC_xCell.txt":
            df = pd.read_csv(file_path, sep='\t', index_col=0)
            df = df.transpose()
            df.columns.name = 'Name'
            df.index.name = 'Patient_ID'
            df.index = df.index.str.replace(r'-T$', '', regex=True)  # remove label for tumor samples
            df.index = df.index.str.replace(r'-A$', '.N', regex=True)  # change label for normal samples
            self._data["xcell"] = df

        elif file_name == "CIBERSORT.Output_Abs_HNSCC.txt":
            df = pd.read_csv(file_path, sep='\t', index_col=0)
            df.index.name = 'Patient_ID'
            df.columns.name = 'Name'
            df.index = df.index.str.replace(r'-T$', '', regex=True)
            df.index = df.index.str.replace(r'-A$', '.N', regex=True)
            self._data["cibersort"] = df

        elif file_name == "HNSCC.gene_level.from_seg.filtered.tsv":
            df = pd.read_csv(file_path, sep="\t")
            df = df.rename(columns={"Gene": "Name"})
            df = df.set_index("Name")
            self._data["CNV"] = df  # finished in the formatting step below

        elif file_name == "gencode.v22.annotation.gtf.gz":
            # Gene name -> database ID lookup, joined onto the CNV table later
            df = read_gtf(file_path)
            df = df[["gene_name", "gene_id"]]
            df = df.drop_duplicates()
            df = df.rename(columns={
                "gene_name": "Name",
                "gene_id": "Database_ID"
            })
            df = df.set_index("Name")
            self._helper_tables["CNV_gene_ids"] = df

        elif file_name == "CPTAC_pancan_RNA_tumor_purity_ESTIMATE_WashU.tsv.gz":
            df = pd.read_csv(file_path, sep="\t", na_values='NA')
            df.Sample_ID = df.Sample_ID.str.replace(r'-T', '', regex=True)  # only tumor samples in file
            df = df.set_index('Sample_ID')
            df.index.name = 'Patient_ID'
            # Use list of patient_ids from the clinical table to slice out this cancer
            patient_ids = clinical_df.index.to_list()
            df = df.loc[df.index.isin(patient_ids)]
            self._data["tumor_purity"] = df

        elif file_name in text_readmes:
            with open(file_path, 'r') as reader:
                self._readme_files[text_readmes[file_name]] = reader.read()

        elif file_name == "README.boxnote":
            self._readme_files["readme_cnv"] = get_boxnote_text(file_path)

    print(' ' * len(loading_msg), end='\r')  # Erase the loading message
    formatting_msg = f"Formatting {self.get_cancer_type()} dataframes..."
    print(formatting_msg, end='\r')

    # Combine tumor and normal tables into the transcriptomics dataframe.
    # Normal entries were already marked with '.N' on the end of the ID while loading.
    rna_tumor = self._helper_tables["transcriptomics_tumor"]
    rna_normal = self._helper_tables["transcriptomics_norm"]
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    rna_combined = pd.concat([rna_tumor, rna_normal])
    self._data["transcriptomics"] = rna_combined

    # CNV: merge in the gene IDs, build the (Name, Database_ID) multi-index,
    # then transpose so rows are samples.
    cnv = self._data["CNV"]
    gene_ids = self._helper_tables["CNV_gene_ids"]
    df = cnv.join(gene_ids, how="left")
    df = df.reset_index()
    df = df.set_index(["Name", "Database_ID"])
    df = df.T
    df.index.name = 'Patient_ID'
    self._data["CNV"] = df

    # Sort IDs (tumor first then normal)
    self._data = sort_all_rows_pancan(self._data)

    print(" " * len(formatting_msg), end='\r')  # Erase the formatting message
def __init__(self, no_internet, version):
    """Load all of the umichucec dataframes as values in the self._data dict variable, with names as keys, and format them properly.

    Parameters:
    version (str): The version number to load, or the string "latest" to load the latest version. Forwarded to the parent class.
    no_internet (bool): Whether to skip the index update step because it requires an internet connection. Forwarded to the parent class.
    """

    # This keeps a record of all versions that the code is equipped to handle.
    # That way, if there's a new data release but the user hasn't updated the
    # package, it won't try to parse a data version it isn't equipped to handle.
    valid_versions = ["1.0"]

    data_files = {
        "1.0": [
            "Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv",
            "Report_abundance_groupby=multi-site_protNorm=MD_gu=2.tsv",
            "aliquot_to_patient_ID.tsv",
            "README_v3.boxnote",  # proteomics
            "README.boxnote",  # phosphoproteomics
        ]
    }

    # Call the parent class __init__ function
    super().__init__(cancer_type="umichucec",
                     version=version,
                     valid_versions=valid_versions,
                     data_files=data_files,
                     no_internet=no_internet)

    # Load the data into dataframes in the self._data dict
    loading_msg = f"Loading {self.get_cancer_type()} v{self.version()}"
    for file_path in self._data_files_paths:
        # Print a loading message. We add a dot every time, so the user knows it's not frozen.
        loading_msg = loading_msg + "."
        print(loading_msg, end='\r')

        # The last path element is the file name; it identifies which parser to use.
        file_name = file_path.split(os.sep)[-1]

        if file_name == "Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv":
            df = pd.read_csv(file_path, sep="\t")
            df['Database_ID'] = df.Index.apply(lambda x: x.split('|')[0])  # get protein identifier
            df['Name'] = df.Index.apply(lambda x: x.split('|')[6])  # get protein name
            df = df.set_index(['Name', 'Database_ID'])  # set multiindex
            df = df.drop(columns=['Index', 'MaxPepProb', 'NumberPSM', 'Gene'])  # drop unnecessary columns
            df = df.transpose()
            ref_intensities = df.loc["ReferenceIntensity"]  # reference intensities used to calculate ratios
            df = df.subtract(ref_intensities, axis="columns")  # subtract reference intensities from all the values
            # Drop the ReferenceIntensity row by label rather than by position,
            # so a change in column order can never drop a real sample instead.
            df = df.drop(index="ReferenceIntensity")
            df.index.name = 'Patient_ID'
            self._data["proteomics"] = df

        elif file_name == "Report_abundance_groupby=multi-site_protNorm=MD_gu=2.tsv":
            df = pd.read_csv(file_path, sep="\t")
            # Parse a few columns out of the "Index" column that we'll need for our multiindex
            df[['Database_ID', 'Transcript_ID', "Gene_ID", "Havana_gene",
                "Havana_transcript", "Transcript", "Name", "Site"]] = df.Index.str.split("\\|", expand=True)
            df[['num1', 'start', "end", "detected_phos", "localized_phos", "Site"]] = df.Site.str.split("_", expand=True)

            # Some rows have at least one localized phosphorylation site, but also have other
            # phosphorylations that aren't localized. We'll drop those rows, if their localized sites
            # are duplicated in another row, to avoid creating duplicates, because we only preserve
            # information about the localized sites in a given row. However, if the localized sites
            # aren't duplicated in another row, we'll keep the row.
            # detected_phos is the number of phosphorylations detected and localized_phos is the
            # number localized, so if the two differ, the row has at least one unlocalized site.
            has_unlocalized = ~df["detected_phos"].eq(df["localized_phos"])
            is_duplicated = df.duplicated(["Name", "Site", "Peptide", "Database_ID"], keep=False)
            unlocalized_to_drop = df.index[has_unlocalized & is_duplicated]
            df = df.drop(index=unlocalized_to_drop)

            df = df[df['Site'].notna()]  # only keep rows with a phospho site
            df = df.set_index(['Name', 'Site', 'Peptide', 'Database_ID'])  # create the multiindex
            # drop columns not needed in df
            df = df.drop(columns=["Gene", "Index", "num1", "start", "end", "detected_phos",
                                  "localized_phos", "Havana_gene", "Havana_transcript",
                                  "MaxPepProb", "Gene_ID", "Transcript_ID", "Transcript"])
            df = df.T
            ref_intensities = df.loc["ReferenceIntensity"]  # reference intensities used to calculate ratios
            df = df.subtract(ref_intensities, axis="columns")  # subtract ref intensities from all the values
            df = df.drop(index="ReferenceIntensity")  # dropped by label, see proteomics above
            self._data["phosphoproteomics"] = df

        # aliquot_to_patient_ID.tsv contains only unique aliquots (no duplicates),
        # so there is no need to slice out cancer specific aliquots.
        # This file can be found on Box under CPTAC/cptac/pancan/helper_files
        elif file_name == "aliquot_to_patient_ID.tsv":
            df = pd.read_csv(file_path, sep="\t", index_col='aliquot_ID', usecols=['aliquot_ID', 'patient_ID'])
            map_dict = df.to_dict()['patient_ID']  # aliquot_ID as keys, patient_ID as values
            self._helper_tables["map_ids"] = map_dict

        elif file_name == "README_v3.boxnote":
            self._readme_files["readme_proteomics"] = get_boxnote_text(file_path)

        elif file_name == "README.boxnote":
            self._readme_files["readme_phosphoproteomics"] = get_boxnote_text(file_path)

    print(' ' * len(loading_msg), end='\r')  # Erase the loading message
    formatting_msg = f"Formatting {self.get_cancer_type()} dataframes..."
    print(formatting_msg, end='\r')

    # There was 1 duplicate ID (C3N-01825) in the proteomic and phosphoproteomic data.
    # I used the Payne lab mapping file "aliquot_to_patient_ID.tsv" to determine the tissue type
    # for these duplicates, and they were both tumor samples. Next, I ran a pearson correlation
    # to check how well the values from each duplicate correlated to its tumor flagship sample.
    # The first occurrence in the file had a higher correlation with the flagship sample
    # than the second occurrence. I also created scatterplots comparing each duplicate to its
    # flagship sample. We dropped the second occurrence of the duplicate because it didn't
    # correlate very well to its flagship sample.
    # A file containing the correlations can be downloaded at:
    # https://byu.box.com/shared/static/jzsq69bd079oq0zbicw4w616hyicd5ev.xlsx

    # Quality control and ref intensity rows to drop: RefInt_pool01 ... RefInt_pool17
    drop_cols = [f'RefInt_pool{i:02d}' for i in range(1, 18)]

    # Get dictionary with aliquots as keys and patient IDs as values
    mapping_dict = self._helper_tables["map_ids"]

    # Proteomics
    prot = self._data["proteomics"]
    prot = prot.drop(drop_cols, axis='index')  # drop quality control and ref intensity rows
    prot = prot.reset_index()
    prot['Patient_ID'] = prot['Patient_ID'].replace(mapping_dict)  # replace aliquots with patient IDs
    prot['Patient_ID'] = prot['Patient_ID'].apply(lambda x: x + '.N' if 'NX' in x else x)  # 'NX' are enriched normals
    prot = prot.set_index('Patient_ID')
    prot = rename_duplicate_labels(prot, 'index')  # add ".1" to the second occurrence of the ID with a duplicate
    prot = prot.drop('C3N-01825.1', axis='index')  # drop the duplicate that didn't correlate well with flagship
    self._data["proteomics"] = prot

    # Phosphoproteomics
    phos = self._data["phosphoproteomics"]
    phos = phos.drop(drop_cols, axis='index')  # drop quality control and ref intensity rows
    phos = phos.rename(index=mapping_dict)  # replace aliquots with patient IDs
    phos.index.name = 'Patient_ID'
    phos = phos.reset_index()
    phos['Patient_ID'] = phos['Patient_ID'].apply(lambda x: x + '.N' if 'NX' in x else x)  # 'NX' are enriched normals
    phos = phos.set_index('Patient_ID')
    phos = rename_duplicate_labels(phos, 'index')  # add ".1" to the second occurrence of the ID with a duplicate
    phos = phos.drop('C3N-01825.1', axis='index')  # drop the duplicate that didn't correlate well with flagship
    self._data["phosphoproteomics"] = phos

    # Sort rows (tumor first then normal) and columns by first level (protein/gene name)
    self._data = sort_all_rows_pancan(self._data)

    print(" " * len(formatting_msg), end='\r')  # Erase the formatting message
def __init__(self, no_internet, version):
    """Load all of the washuov dataframes as values in the self._data dict variable, with names as keys, and format them properly.

    Parameters:
    version (str): The version number to load, or the string "latest" to load the latest version. Forwarded to the parent class and to MssmClinical.
    no_internet (bool): Whether to skip the index update step because it requires an internet connection. Forwarded to the parent class and to MssmClinical.
    """

    # Ignore logging messages.
    # NOTE(review): this raises the level of the *root* logger, so it affects
    # logging for the whole process, not only this package.
    logger = logging.getLogger()
    logger.setLevel(logging.CRITICAL)

    # This keeps a record of all versions that the code is equipped to handle.
    # That way, if there's a new data release but the user hasn't updated the
    # package, it won't try to parse a data version it isn't equipped to handle.
    valid_versions = ["1.0"]

    data_files = {
        "1.0": [
            "OV_prospective.dnp.annotated.exonic.addrecovercases.maf.gz",
            "OV_tumor_RNA-Seq_Expr_WashU_FPKM.tsv.gz",
            # miRNA tables: waiting for data
            #"OV_precursor_miRNA_combined.tsv",
            #"OV_mature_miRNA_combined.tsv",
            #"OV_total_miRNA_combined.tsv",
            "CIBERSORT.Output_Abs_OV.txt",
            "OV_xCell.txt",
            "gencode.v22.annotation.gtf.gz",
            "OV.gene_level.from_seg.filtered.tsv",
            "CPTAC_pancan_RNA_tumor_purity_ESTIMATE_WashU.tsv.gz",
            "README_miRNA",
            "README_CIBERSORT",
            "README_xCell",
            "README_somatic_mutation_WXS",
            "README_gene_expression",
            "README.boxnote",
            "README_ESTIMATE_WashU",
        ]
    }

    # Call the parent class __init__ function
    super().__init__(cancer_type="washuov",
                     version=version,
                     valid_versions=valid_versions,
                     data_files=data_files,
                     no_internet=no_internet)

    # Get the clinical df; its index is used below to slice the
    # cancer-specific Patient_IDs out of the pan-cancer tumor_purity file.
    mssmclin = MssmClinical(no_internet=no_internet,
                            version=version,
                            filter_type='pancanov')
    clinical_df = mssmclin.get_clinical()

    # Plain-text readme files: file name -> key in self._readme_files
    text_readmes = {
        "README_miRNA": "readme_miRNA",
        "README_CIBERSORT": "readme_cibersort",
        "README_xCell": "readme_xcell",
        "README_somatic_mutation_WXS": "readme_somatic_mutation",
        "README_gene_expression": "readme_transcriptomics",
        "README_ESTIMATE_WashU": "readme_tumor_purity",
    }

    # Load the data into dataframes in the self._data dict
    loading_msg = f"Loading {self.get_cancer_type()} v{self.version()}"
    for file_path in self._data_files_paths:
        # Print a loading message. We add a dot every time, so the user knows it's not frozen.
        loading_msg = loading_msg + "."
        print(loading_msg, end='\r')

        # The last path element is the file name; it identifies which parser to use.
        file_name = file_path.split(os.sep)[-1]

        if file_name == "OV_prospective.dnp.annotated.exonic.addrecovercases.maf.gz":
            df = pd.read_csv(file_path, sep='\t')  # parsed once (the file is large)
            df['Patient_ID'] = df['Tumor_Sample_Barcode']
            # Rename the columns we want to keep to the package-wide names
            df = df.rename(
                columns={
                    "Hugo_Symbol": "Gene",
                    "Gene": "Gene_Database_ID",
                    "Variant_Classification": "Mutation",
                    "HGVSp_Short": "Location"
                })
            df = df.set_index("Patient_ID")
            # Put Gene, Mutation, Location first; keep every other column after them
            lead_cols = ["Gene", "Mutation", "Location"]
            df = df[lead_cols + [col for col in df.columns if col not in lead_cols]]
            df.index = df.index.str.replace(r"_T", "", regex=True)
            self._data["somatic_mutation"] = df

        elif file_name == "OV_tumor_RNA-Seq_Expr_WashU_FPKM.tsv.gz":
            df = pd.read_csv(file_path, sep="\t")
            # change names to universal package names
            df = df.rename(columns={
                "gene_name": "Name",
                "gene_id": "Database_ID"
            })
            df = df.set_index(["Name", "Database_ID"])
            df = df.sort_index()
            df = df.T
            df.index.name = "Patient_ID"
            # remove label for tumor samples. All samples are tumors
            df.index = df.index.str.replace(r"-T", "", regex=True)
            self._data["transcriptomics"] = df

        elif file_name == "OV_xCell.txt":
            df = pd.read_csv(file_path, sep='\t', index_col=0)
            df = df.transpose()
            df.columns.name = 'Name'
            df.index.name = 'Patient_ID'
            df.index = df.index.str.replace(r'-T$', '', regex=True)  # remove label for tumor samples
            df.index = df.index.str.replace(r'-A$', '.N', regex=True)  # change label for normal samples
            self._data["xcell"] = df

        elif file_name == "CIBERSORT.Output_Abs_OV.txt":
            df = pd.read_csv(file_path, sep='\t', index_col=0)
            df.index.name = 'Patient_ID'
            df.columns.name = 'Name'
            df.index = df.index.str.replace(r'-T$', '', regex=True)
            df.index = df.index.str.replace(r'-A$', '.N', regex=True)
            self._data["cibersort"] = df

        elif file_name == "OV.gene_level.from_seg.filtered.tsv":
            df = pd.read_csv(file_path, sep="\t")
            df = df.rename(columns={"Gene": "Name"})
            df = df.set_index("Name")
            self._data["CNV"] = df  # finished in the formatting step below

        elif file_name == "gencode.v22.annotation.gtf.gz":
            # Gene name -> database ID lookup, joined onto the CNV table later
            df = read_gtf(file_path)
            df = df[["gene_name", "gene_id"]]
            df = df.drop_duplicates()
            df = df.rename(columns={
                "gene_name": "Name",
                "gene_id": "Database_ID"
            })
            df = df.set_index("Name")
            self._helper_tables["CNV_gene_ids"] = df

        elif file_name == "CPTAC_pancan_RNA_tumor_purity_ESTIMATE_WashU.tsv.gz":
            df = pd.read_csv(file_path, sep="\t", na_values='NA')
            df.Sample_ID = df.Sample_ID.str.replace(r'-T', '', regex=True)  # only tumor samples in file
            df = df.set_index('Sample_ID')
            df.index.name = 'Patient_ID'
            # Use list of patient_ids from the clinical table to slice out this cancer
            patient_ids = clinical_df.index.to_list()
            df = df.loc[df.index.isin(patient_ids)]
            self._data["tumor_purity"] = df

        elif file_name in text_readmes:
            with open(file_path, 'r') as reader:
                self._readme_files[text_readmes[file_name]] = reader.read()

        elif file_name == "README.boxnote":
            self._readme_files["readme_cnv"] = get_boxnote_text(file_path)

    print(' ' * len(loading_msg), end='\r')  # Erase the loading message
    formatting_msg = f"Formatting {self.get_cancer_type()} dataframes..."
    print(formatting_msg, end='\r')

    # CNV: merge in the gene IDs, build the (Name, Database_ID) multi-index,
    # then transpose so rows are samples.
    cnv = self._data["CNV"]
    gene_ids = self._helper_tables["CNV_gene_ids"]
    df = cnv.join(gene_ids, how="left")
    df = df.reset_index()
    df = df.set_index(["Name", "Database_ID"])
    df = df.T
    df.index.name = 'Patient_ID'
    self._data["CNV"] = df

    # Sort IDs (tumor first then normal)
    self._data = sort_all_rows_pancan(self._data)

    print(" " * len(formatting_msg), end='\r')  # Erase the formatting message
def __init__(self, no_internet, version):
    """Load all of the umichcoad dataframes as values in the self._data dict variable, with names as keys, and format them properly.

    Parameters:
    version (str): The version number to load, or the string "latest" to load the latest version. Forwarded to the parent class.
    no_internet (bool): Whether to skip the index update step because it requires an internet connection. Forwarded to the parent class.
    """

    # This keeps a record of all versions that the code is equipped to handle.
    # That way, if there's a new data release but the user hasn't updated the
    # package, it won't try to parse a data version it isn't equipped to handle.
    valid_versions = ["1.0", "1.1"]

    data_files = {
        "1.0": [
            "Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv",
            "Report_abundance_groupby=multi-site_protNorm=MD_gu=2.tsv",
            "CRC_Prospective sample info.xlsx",
            "README_v3.boxnote",  # proteomics
            "README.boxnote",  # phosphoproteomics
        ],
        "1.1": [
            "Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv",
            "Report_abundance_groupby=multi-site_protNorm=MD_gu=2.tsv",
            "CRC_Prospective sample info.xlsx",
            "README_v3.boxnote",  # proteomics
            "README.boxnote",  # phosphoproteomics
        ],
    }

    # Call the parent class __init__ function
    super().__init__(cancer_type="umichcoad",
                     version=version,
                     valid_versions=valid_versions,
                     data_files=data_files,
                     no_internet=no_internet)

    # Load the data into dataframes in the self._data dict
    loading_msg = f"Loading {self.get_cancer_type()} v{self.version()}"
    for file_path in self._data_files_paths:
        # Print a loading message. We add a dot every time, so the user knows it's not frozen.
        loading_msg = loading_msg + "."
        print(loading_msg, end='\r')

        # The last path element is the file name; it identifies which parser to use.
        file_name = file_path.split(os.sep)[-1]

        # Proteomics
        if file_name == "Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv":
            df = pd.read_csv(file_path, sep="\t")
            df['Database_ID'] = df.Index.apply(lambda x: x.split('|')[0])  # get protein identifier
            df['Name'] = df.Index.apply(lambda x: x.split('|')[6])  # get protein name
            df = df.set_index(['Name', 'Database_ID'])  # set multiindex
            df = df.drop(columns=['Index', 'MaxPepProb', 'NumberPSM', 'Gene'])  # drop unnecessary columns
            df = df.transpose()
            ref_intensities = df.loc["ReferenceIntensity"]  # reference intensities used to calculate ratios
            df = df.subtract(ref_intensities, axis="columns")  # subtract reference intensities from all the values
            # Drop the ReferenceIntensity row by label rather than by position,
            # so a change in column order can never drop a real sample instead.
            df = df.drop(index="ReferenceIntensity")
            df.index.name = 'Patient_ID'
            self._data["proteomics"] = df

        # Phosphoproteomics
        elif file_name == "Report_abundance_groupby=multi-site_protNorm=MD_gu=2.tsv":
            df = pd.read_csv(file_path, sep="\t")
            # Parse a few columns out of the "Index" column that we'll need for our multiindex
            df[['Database_ID', 'Transcript_ID', "Gene_ID", "Havana_gene",
                "Havana_transcript", "Transcript", "Name", "Site"]] = df.Index.str.split("\\|", expand=True)
            df[['num1', 'start', "end", "detected_phos", "localized_phos", "Site"]] = df.Site.str.split("_", expand=True)

            # Some rows have at least one localized phosphorylation site, but also have other
            # phosphorylations that aren't localized. We'll drop those rows, if their localized sites
            # are duplicated in another row, to avoid creating duplicates, because we only preserve
            # information about the localized sites in a given row. However, if the localized sites
            # aren't duplicated in another row, we'll keep the row.
            # detected_phos is the number of phosphorylations detected and localized_phos is the
            # number localized, so if the two differ, the row has at least one unlocalized site.
            has_unlocalized = ~df["detected_phos"].eq(df["localized_phos"])
            is_duplicated = df.duplicated(["Name", "Site", "Peptide", "Database_ID"], keep=False)
            unlocalized_to_drop = df.index[has_unlocalized & is_duplicated]
            df = df.drop(index=unlocalized_to_drop)

            df = df[df['Site'].notna()]  # only keep rows with a phospho site
            df = df.set_index(['Name', 'Site', 'Peptide', 'Database_ID'])  # create a multiindex in this order
            # drop columns not needed in df
            df = df.drop(columns=['Gene', "Index", "num1", "start", "end", "detected_phos",
                                  "localized_phos", "Havana_gene", "Havana_transcript",
                                  "MaxPepProb", "Gene_ID", "Transcript_ID", "Transcript"])
            df = df.transpose()
            ref_intensities = df.loc["ReferenceIntensity"]  # reference intensities used to calculate ratios
            df = df.subtract(ref_intensities, axis="columns")  # subtract ref intensities from all the values
            df = df.drop(index="ReferenceIntensity")  # dropped by label, see proteomics above
            df.index.name = 'Patient_ID'
            self._data["phosphoproteomics"] = df

        # Mapping file to convert aliquots to patient_IDs for Colon
        # This file can be found on Box under CPTAC/cptac/pancan/helper_files
        elif file_name == "CRC_Prospective sample info.xlsx":
            df = pd.read_excel(file_path, index_col='Label', usecols=['Label', 'Sample Code'])
            map_dict = df.to_dict()['Sample Code']  # aliquots as keys, patient IDs as values
            self._helper_tables["map_ids"] = map_dict

        elif file_name == "README_v3.boxnote":
            self._readme_files["readme_proteomics"] = get_boxnote_text(file_path)

        elif file_name == "README.boxnote":
            self._readme_files["readme_phosphoproteomics"] = get_boxnote_text(file_path)

        # NOTE: parsers for the S039_BCprospective observed/imputed tables used to
        # live here as a disabled block; those files are not in data_files, so
        # nothing is loaded for them.

    print(' ' * len(loading_msg), end='\r')  # Erase the loading message
    formatting_msg = f"Formatting {self.get_cancer_type()} dataframes..."
    print(formatting_msg, end='\r')

    # Quality control and ref intensity rows to drop:
    # colonRef22-2, RefInt_ColonRef01 ... RefInt_ColonRef21, RefInt_ColonRef22-1
    drop_cols = (['colonRef22-2']
                 + [f'RefInt_ColonRef{i:02d}' for i in range(1, 22)]
                 + ['RefInt_ColonRef22-1'])

    # Proteomics
    prot = self._data['proteomics']
    prot = prot.drop(drop_cols, axis='index')  # drop quality control and ref intensity rows

    # Phosphoproteomics
    phos = self._data["phosphoproteomics"]
    phos = phos.drop(drop_cols, axis='index')  # drop quality control and ref intensity rows

    # Only version 1.1 maps aliquot labels to patient IDs.
    if self._version == "1.1":
        # Get dictionary to map aliquot to patient IDs
        mapping_dict = self._helper_tables["map_ids"]

        # Proteomics
        prot = prot.reset_index()
        prot['Patient_ID'] = prot['Patient_ID'].replace(mapping_dict)  # replace aliquots with Patient_IDs
        # Strip the one-character prefix; a leading 'N' marks a normal sample, which gets '.N' appended
        prot.Patient_ID = prot.Patient_ID.apply(lambda x: x[1:] + '.N' if x[0] == 'N' else x[1:])
        prot = prot.set_index('Patient_ID')

        # Phosphoproteomics
        phos = phos.reset_index()
        phos['Patient_ID'] = phos['Patient_ID'].replace(mapping_dict)  # replace aliquots with Patient_IDs
        phos.Patient_ID = phos.Patient_ID.apply(lambda x: x[1:] + '.N' if x[0] == 'N' else x[1:])
        phos = phos.set_index('Patient_ID')

    self._data['proteomics'] = prot
    self._data["phosphoproteomics"] = phos

    # Sort rows (tumor first then normal) and columns by first level (protein/gene name)
    self._data = sort_all_rows_pancan(self._data)

    print(" " * len(formatting_msg), end='\r')  # Erase the formatting message
def __init__(self, no_internet, version):
    """Load all of the umichpdac dataframes as values in the self._data dict variable, with names as keys, and format them properly.

    Parameters:
    version (str): The version number to load, or the string "latest" to just load the latest building.
    no_internet (bool): Whether to skip the index update step because it requires an internet connection.

    Both arguments are forwarded unchanged to the parent class __init__ function.
    """

    # Record of every version this loader is equipped to parse, so that a newer data
    # release is not parsed by code that was never written for it.
    valid_versions = ["1.0"]

    data_files = {
        "1.0": [
            "Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv",
            "Report_abundance_groupby=multi-site_protNorm=MD_gu=2.tsv",
            "aliquot_to_patient_ID.tsv",
            "README_v3.boxnote",  # proteomics
            "README.boxnote",  # phosphoproteomics
        ]
    }

    # Hand everything to the parent class __init__ function
    super().__init__(
        cancer_type="umichpdac",
        version=version,
        valid_versions=valid_versions,
        data_files=data_files,
        no_internet=no_internet,
    )

    # Row labels (after transposing) for the reference intensity and quality control
    # channels; these are dropped from both tables.
    control_rows = [
        'ReferenceIntensity', 'QC1', 'QC2', 'QC3', 'QC4', 'QC5', 'QC6',
        'KoreanReference1', 'KoreanReference2', 'KoreanReference3',
        'Pool-24-2', 'WU-PDA1', 'WU-Pool-25',
    ]
    # Extra reference intensity rows that are only dropped from the phosphoproteomics table
    pooled_reference_rows = [
        'RefInt_pool-01', 'RefInt_pool-02', 'RefInt_pool-03', 'RefInt_pool-04', 'RefInt_pool-05',
        'RefInt_pool-06', 'RefInt_pool-07', 'RefInt_pool-08', 'RefInt_pool-09', 'RefInt_pool-10',
        'RefInt_pool-11', 'RefInt_pool-12', 'RefInt_pool-13', 'RefInt_pool-14', 'RefInt_pool-15',
        'RefInt_pool-16', 'RefInt_pool-17', 'RefInt_pool-18', 'RefInt_pool-19', 'RefInt_pool-20',
        'RefInt_pool-21', 'RefInt_pool-22', 'RefInt_pool-23', 'RefInt_pool-24', 'RefInt_pool-25',
    ]

    def read_protein_table(path):
        # Parse the protein-level abundance report into a samples x proteins table of ratios.
        table = pd.read_csv(path, sep="\t")
        table['Database_ID'] = table["Index"].apply(lambda entry: entry.split('|')[0])  # protein identifier
        table['Name'] = table["Index"].apply(lambda entry: entry.split('|')[6])  # protein name
        table = table.set_index(['Name', 'Database_ID'])  # multiindex
        table = table.drop(columns=['Index', 'MaxPepProb', 'NumberPSM', 'Gene'])  # unnecessary columns
        table = table.transpose()
        # Subtract the reference intensities from all the values
        table = table.subtract(table.loc["ReferenceIntensity"], axis="columns")
        table.index.name = 'Patient_ID'
        # Drop quality control and ref intensity
        return table.drop(control_rows, axis='index')

    def read_phospho_table(path):
        # Parse the multi-site abundance report into a samples x sites table of ratios.
        table = pd.read_csv(path, sep="\t")

        # Parse a few columns out of the "Index" column that we'll need for our multiindex
        index_fields = ['Database_ID', 'Transcript_ID', "Gene_ID", "Havana_gene",
                        "Havana_transcript", "Transcript", "Name", "Site"]
        table[index_fields] = table["Index"].str.split("\\|", expand=True)
        site_fields = ['num1', 'start', "end", "detected_phos", "localized_phos", "Site"]
        table[site_fields] = table["Site"].str.split("_", expand=True)

        # detected_phos is the number of phosphorylations detected and localized_phos the
        # number localized, so if the two differ the row has at least one unlocalized site.
        # Such rows are dropped only when their localized sites are duplicated in another
        # row (only the localized sites are preserved, so keeping both would create
        # duplicates); if the localized sites are not duplicated elsewhere, the row stays.
        has_unlocalized = ~table["detected_phos"].eq(table["localized_phos"])
        key_is_shared = table.duplicated(["Name", "Site", "Peptide", "Database_ID"], keep=False)
        table = table.drop(index=table.index[has_unlocalized & key_is_shared])

        table = table.loc[table['Site'].notna()]  # only keep rows with a phospho site
        table = table.set_index(['Name', 'Site', 'Peptide', 'Database_ID'])  # multiindex, in this order
        # Drop columns not needed in the table
        table = table.drop(columns=['Gene', "Index", "num1", "start", "end", "detected_phos",
                                    "localized_phos", "Havana_gene", "Havana_transcript",
                                    "MaxPepProb", "Gene_ID", "Transcript_ID", "Transcript"])
        table = table.T
        # Subtract ref intensities from all the values, to get ratios
        table = table.subtract(table.loc["ReferenceIntensity"], axis="columns")
        # Drop quality control and ref intensity
        return table.drop(control_rows + pooled_reference_rows, axis='index')

    # Load the data into dataframes in the self._data dict
    loading_msg = f"Loading {self.get_cancer_type()} v{self.version()}"
    for file_path in self._data_files_paths:
        # Add a dot to the loading message every time, so the user knows it's not frozen.
        loading_msg += "."
        print(loading_msg, end='\r')

        # The last path element is the file name; it identifies which parser to use.
        file_name = file_path.split(os.sep)[-1]

        if file_name == 'Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv':
            self._data["proteomics"] = read_protein_table(file_path)

        elif file_name == "Report_abundance_groupby=multi-site_protNorm=MD_gu=2.tsv":
            self._data["phosphoproteomics"] = read_phospho_table(file_path)

        # aliquot_to_patient_ID.tsv contains only unique aliquots (no duplicates),
        # so there is no need to slice out cancer specific aliquots
        # This file can be found on Box under CPTAC/cptac/pancan/helper_files
        elif file_name == "aliquot_to_patient_ID.tsv":
            aliquot_table = pd.read_csv(file_path, sep="\t", index_col='aliquot_ID',
                                        usecols=['aliquot_ID', 'patient_ID'])
            # Dictionary with aliquots as keys and patient IDs as values
            self._helper_tables["map_ids"] = aliquot_table.to_dict()['patient_ID']

        elif file_name == "README_v3.boxnote":
            self._readme_files["readme_proteomics"] = get_boxnote_text(file_path)

        elif file_name == "README.boxnote":
            self._readme_files["readme_phosphoproteomics"] = get_boxnote_text(file_path)

    print(' ' * len(loading_msg), end='\r')  # Erase the loading message
    formatting_msg = f"Formatting {self.get_cancer_type()} dataframes..."
    print(formatting_msg, end='\r')

    # These 8 aliquots were not in the mapping file. Yize said they are all normal samples.
    manually_mapped = {
        'CPT0347760002': 'C3L-07032.N',
        'CPT0347790002': 'C3L-07033.N',
        'CPT0347820002': 'C3L-07034.N',
        'CPT0347850002': 'C3L-07035.N',
        'CPT0347880002': 'C3L-07036.N',
        'CPT0355180003': 'C3L-03513.N',
        'CPT0355190003': 'C3L-03515.N',
        'CPT0355200003': 'C3L-03514.N',
    }

    # Dictionary that maps aliquots to patient IDs
    mapping_dict = self._helper_tables["map_ids"]

    # Proteomics first, then phosphoproteomics
    for table_name in ("proteomics", "phosphoproteomics"):
        renamed = self._data[table_name].rename(index=mapping_dict)  # aliquots -> patient IDs (normals have .N)
        renamed = renamed.rename(index=manually_mapped)  # the 8 aliquots missing from the mapping file
        self._data[table_name] = renamed

    # Sort rows (tumor first then normal) and columns by first level (protein/gene name)
    self._data = sort_all_rows_pancan(self._data)

    print(" " * len(formatting_msg), end='\r')  # Erase the formatting message
def __init__(self, no_internet, version):
    """Load all of the umichhnscc dataframes as values in the self._data dict variable, with names as keys, and format them properly.

    The proteomics and phosphoproteomics reports are parsed into sample-by-feature
    tables of ratios relative to the "ReferenceIntensity" channel, their readme files
    are stored in self._readme_files, and the sample labels are then cleaned up
    (control rows dropped, duplicates averaged, tissue suffixes normalized).

    Parameters:
    version (str): The version number to load, or the string "latest" to just load the latest building.
    no_internet (bool): Whether to skip the index update step because it requires an internet connection.

    Both arguments are forwarded unchanged to the parent class __init__ function.
    """

    # Set some needed variables, and pass them to the parent Dataset class __init__ function

    # This keeps a record of all versions that the code is equipped to handle. That way, if there's a new data release but they didn't update their package, it won't try to parse the new data version it isn't equipped to handle.
    valid_versions = ["1.0"]

    data_files = {
        "1.0": [
            "Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv",
            "Report_abundance_groupby=multi-site_protNorm=MD_gu=2.tsv",
            "README_v3.boxnote",  # proteomics
            "README.boxnote",  # phosphoproteomics
        ]
    }

    # Call the parent class __init__ function
    super().__init__(cancer_type="umichhnscc", version=version, valid_versions=valid_versions, data_files=data_files, no_internet=no_internet)

    # Load the data into dataframes in the self._data dict
    loading_msg = f"Loading {self.get_cancer_type()} v{self.version()}"
    for file_path in self._data_files_paths:  # Loops through files variable

        # Print a loading message. We add a dot every time, so the user knows it's not frozen.
        loading_msg = loading_msg + "."
        print(loading_msg, end='\r')

        path_elements = file_path.split(os.sep)  # Get a list of the levels of the path
        file_name = path_elements[-1]  # The last element will be the name of the file. We'll use this to identify files for parsing in the if/elif statements below

        if file_name == "Report_abundance_groupby=protein_protNorm=MD_gu=2.tsv":
            df = pd.read_csv(file_path, sep="\t")
            df['Database_ID'] = df.Index.apply(lambda x: x.split('|')[0])  # get protein identifier
            df['Name'] = df.Index.apply(lambda x: x.split('|')[6])  # get protein name
            df = df.set_index(['Name', 'Database_ID'])  # set multiindex
            df = df.drop(columns=['Index', 'MaxPepProb', 'NumberPSM', 'Gene'])  # drop unnecessary columns
            df = df.transpose()
            ref_intensities = df.loc["ReferenceIntensity"]  # get reference intensities to use to calculate ratios
            df = df.subtract(ref_intensities, axis="columns")  # subtract reference intensities from all the values
            # Drop the ReferenceIntensity row by label rather than by position, so a
            # change in column order in the report can never remove a real sample.
            df = df.drop(index="ReferenceIntensity")
            df.index.name = 'Patient_ID'
            self._data["proteomics"] = df

        elif file_name == "Report_abundance_groupby=multi-site_protNorm=MD_gu=2.tsv":
            df = pd.read_csv(file_path, sep="\t")
            # Parse a few columns out of the "Index" column that we'll need for our multiindex
            df[['Database_ID', 'Transcript_ID', "Gene_ID", "Havana_gene", "Havana_transcript", "Transcript", "Name", "Site"]] = df.Index.str.split("\\|", expand=True)
            df[['num1', 'start', "end", "detected_phos", "localized_phos", "Site"]] = df.Site.str.split("_", expand=True)

            # Some rows have at least one localized phosphorylation site, but also have other
            # phosphorylations that aren't localized. We'll drop those rows, if their localized
            # sites are duplicated in another row, to avoid creating duplicates, because we only
            # preserve information about the localized sites in a given row. However, if the localized
            # sites aren't duplicated in another row, we'll keep the row.
            # detected_phos of the split "Index" column is number of phosphorylations detected, and
            # localized_phos is number of phosphorylations localized, so if the two values aren't equal, the
            # row has at least one unlocalized site
            unlocalized_to_drop = df.index[~df["detected_phos"].eq(df["localized_phos"]) &
                                           df.duplicated(["Name", "Site", "Peptide", "Database_ID"], keep=False)]
            df = df.drop(index=unlocalized_to_drop)

            df = df[df['Site'].notna()]  # only keep rows with a phospho site
            df = df.set_index(['Name', 'Site', 'Peptide', 'Database_ID'])  # create a multiindex, in this order.
            # drop columns not needed in df
            df = df.drop(columns=['Gene', "Index", "num1", "start", "end", "detected_phos", "localized_phos", "Havana_gene",
                                  "Havana_transcript", "MaxPepProb", "Gene_ID", "Transcript_ID", "Transcript"])
            df = df.T  # transpose df
            ref_intensities = df.loc["ReferenceIntensity"]  # Get reference intensities to use to calculate ratios
            df = df.subtract(ref_intensities, axis="columns")  # Subtract ref intensities from all the values, to get ratios
            # Drop the ReferenceIntensity row by label rather than by position (see proteomics above)
            df = df.drop(index="ReferenceIntensity")
            self._data["phosphoproteomics"] = df

        elif file_name == "README_v3.boxnote":
            self._readme_files["readme_proteomics"] = get_boxnote_text(file_path)

        elif file_name == "README.boxnote":
            self._readme_files["readme_phosphoproteomics"] = get_boxnote_text(file_path)

    print(' ' * len(loading_msg), end='\r')  # Erase the loading message
    formatting_msg = f"Formatting {self.get_cancer_type()} dataframes..."
    print(formatting_msg, end='\r')

    # There were 4 labels with "-duplicate" appended in proteomics and phosphoproteomics files.
    # I ran a pearson correlation to check how well the values from each duplicate correlated to
    # the other duplicates for the same case ID. Three of the duplicates correlated well with their
    # respective case IDs. C3L-02617-N-duplicate2 did not correlate well with the other C3L-02617 duplicates,
    # so we dropped it and averaged the other two. I also created a scatterplot to compare each duplicate to
    # the first occurrence of its case ID. The linear scatterplots indicated similarity between the aliquots.
    # We averaged the duplicates that correlated well together and were the same tissue type.
    # A file containing the correlations can be downloaded at:
    # https://byu.box.com/shared/static/jzsq69bd079oq0zbicw4w616hyicd5ev.xlsx

    # Quality control and reference intensity rows
    drop_cols = ['128C', 'QC2', 'QC3', 'QC4', '129N', 'LungTumor1', 'Pooled-sample14',
                 'LungTumor2', 'QC6', 'LungTumor3', 'Pooled-sample17', 'QC7',
                 'Pooled-sample19', 'QC9', 'RefInt_pool01', 'RefInt_pool02',
                 'RefInt_pool03', 'RefInt_pool04', 'RefInt_pool05', 'RefInt_pool06',
                 'RefInt_pool07', 'RefInt_pool08', 'RefInt_pool09', 'RefInt_pool10',
                 'RefInt_pool11', 'RefInt_pool12', 'RefInt_pool13', 'RefInt_pool14',
                 'RefInt_pool15', 'RefInt_pool16', 'RefInt_pool17', 'RefInt_pool18',
                 'RefInt_pool19', 'RefInt_pool20']

    # These IDs had a high correlation with their respective duplicates, so we average them
    # duplicates: 'C3L-02617-T-duplicate', 'C3L-00994-N-duplicate', 'C3L-02617-N-duplicate'
    ids_to_average = ['C3L-02617-T', 'C3L-02617-N', 'C3L-00994-N']

    # Proteomics and phosphoproteomics get exactly the same clean-up, in that order
    for data_name in ("proteomics", "phosphoproteomics"):
        df = self._data[data_name]
        df = df.drop(drop_cols, axis='index')  # drop quality control and ref intensity rows
        df = df.drop(['C3L-02617-N-duplicate2'], axis='index')  # drop duplicate that did not correlate well
        df = average_replicates(df, ids_to_average, normal_identifier='-N')
        df.index = df.index.str.replace('-T$', '', regex=True)
        df.index = df.index.str.replace('-N$', '.N', regex=True)
        df.index = df.index.str.replace('-C$', '.C', regex=True)  # 6 cored normal samples in Hnscc
        self._data[data_name] = df

    # Sort rows (tumor first then normal) and columns by first level (protein/gene name)
    self._data = sort_all_rows_pancan(self._data)

    print(" " * len(formatting_msg), end='\r')  # Erase the formatting message