def test_remote_select_1(self):
    """Run the same region SELECT in three ways and check the results agree.

    The selection is executed as a textual GMQL query on the remote service,
    as a Python query in the current mode, and as the same Python query after
    switching to remote mode. The results are compared pairwise with
    ``self.gdataframe_equality``.
    """
    # NOTE(review): the query literal is reproduced exactly as it appears in
    # the reviewed source, where the whole method sits on one physical line.
    # If the original literal spans several lines, keep those line breaks.
    querytext = """ #SELECT 1 RES = SELECT(region: (chr == chr2 OR chr == chr3) AND NOT(strand == + OR strand == -) AND start >= 130 AND stop <= 250) Example_Dataset_2; MATERIALIZE RES INTO select_1; """
    query_message = "Query: {}".format(querytext)
    logging.info(query_message)

    # 1) Textual query on the remote service, downloaded into the output path.
    logging.info("Executing REMOTE TEXTUAL query")
    query_outcome = self.rm.query(querytext, self.remote_output_path)
    dataset_name = query_outcome.iloc[0].dataset
    downloaded_path = os.path.join(self.remote_output_path, dataset_name)
    # The remote copy is removed before the downloaded one is loaded.
    delete_message = "Deleting remote dataset {}".format(dataset_name)
    logging.info(delete_message)
    self.rm.delete_dataset(dataset_name)
    textual_result = gl.load_from_path(downloaded_path).materialize()

    # 2) Same selection through the Python API, in the current mode.
    logging.info("Executing LOCAL PYTHON query")
    example = gl.get_example_dataset("Example_Dataset_2")
    on_chr2_or_chr3 = example.chr.isin(['chr2', 'chr3'])
    without_strand = ~example.strand.isin(['+', '-'])
    starts_late_enough = example.start >= 130
    stops_early_enough = example.stop <= 250
    selection = example.select(
        region_predicate=(on_chr2_or_chr3
                          & without_strand
                          & starts_late_enough
                          & stops_early_enough))
    local_result = selection.materialize()
    self.gdataframe_equality(textual_result, local_result)

    # 3) Same Python query again, after switching PyGMQL to remote mode.
    logging.info("Executing REMOTE PYTHON query")
    gl.set_mode("remote")
    remote_result = selection.materialize()
    self.gdataframe_equality(local_result, remote_result)
def _tfs_select_narrow_samples(narrow_dataset, cell_lines):
    """Select the released ENCODE ChIP-seq samples of ``cell_lines`` without negative audits.

    The predicate is the same for any number of cell lines: the biosample
    conditions are OR-ed together instead of being written once per list length.
    """

    def meta_equals(attribute, value):
        return narrow_dataset[attribute] == value

    in_cell_lines = meta_equals('biosample_term_name', cell_lines[0])
    for cell in cell_lines[1:]:
        in_cell_lines = in_cell_lines | meta_equals('biosample_term_name', cell)

    negative_audits = (
        ('audit_error', 'extremely low read depth'),
        ('audit_error', 'extremely low read length'),
        ('audit_warning', 'insufficient read depth'),
        ('audit_not_compliant', 'insufficient read depth'),
        ('audit_not_compliant', 'insufficient replicate concordance'),
        ('audit_not_compliant', 'missing input control'),
        ('audit_not_compliant', 'severe bottlenecking'),
        ('audit_not_compliant', 'unreplicated experiment'),
    )
    has_negative_audit = meta_equals(*negative_audits[0])
    for attribute, value in negative_audits[1:]:
        has_negative_audit = has_negative_audit | meta_equals(attribute, value)

    return narrow_dataset.meta_select(
        meta_equals('assay', 'ChIP-seq')
        & meta_equals('output_type', 'conservative idr thresholded peaks')
        & in_cell_lines
        & meta_equals('assembly', 'GRCh38')
        & meta_equals('project', 'ENCODE')
        & meta_equals('file_status', 'released')
        & ~has_negative_audit)


def _tfs_query_gene_tf_regions(cell_lines, gencode_version):
    """Run the remote GMQL query and return the regions dataframe (one row per gene, with its TFs)."""
    gl.set_remote_address('http://gmql.eu/gmql-rest/')
    gl.login()
    gl.set_mode('remote')

    # Load the ENCODE datasets to be used in the query
    narrow_dataset = gl.load_from_remote(
        remote_name='GRCh38_ENCODE_NARROW_NOV_2017', owner='public')
    ann_dataset = gl.load_from_remote(remote_name='GRCh38_ANNOTATION_GENCODE',
                                      owner='public')

    # Extract NARROW data of interest
    narrow = _tfs_select_narrow_samples(narrow_dataset, cell_lines)

    # Create the dataset of promoters: [-2000, +1000] around the start of
    # protein-coding transcripts tagged 'basic' or 'CCDS'
    annotations_version = str(gencode_version)
    coding_transcripts_0 = ann_dataset.meta_select(
        (ann_dataset['release_version'] == annotations_version)
        & (ann_dataset['annotation_type'] == 'transcript'))
    coding_transcripts = coding_transcripts_0.reg_select(
        (coding_transcripts_0.transcript_type == 'protein_coding')
        & ((coding_transcripts_0.tag == 'basic')
           | (coding_transcripts_0.tag == 'CCDS')))
    prom_reg = coding_transcripts.reg_project(
        ['gene_id', 'gene_name', 'entrez_gene_id'],
        new_field_dict={
            'start': coding_transcripts.start - 2000,
            'stop': coding_transcripts.start + 1000
        })
    prom = prom_reg.group(regs=['gene_name'],
                          regs_aggregates={
                              'ensembl_id': gl.BAGD('gene_id'),
                              'entrez_id': gl.BAGD('entrez_gene_id')
                          })

    # Merge all the possible replicas of the same TF, combining them in a single sample
    full_encode = narrow.normal_cover(1, 'ANY', ['experiment_target'])

    # Extract the transcription factors that overlap with at least one promoter region
    res_0 = prom.map(full_encode, refName='prom', expName='TF')
    res_1 = res_0.reg_select(res_0.count_prom_TF > 0)

    # Encode, for each region, the corresponding TF that binds to it as a region attribute
    set_tf = res_1.reg_project(
        new_field_dict={'TF': res_1['TF.experiment_target', 'string']})

    # Merge all the samples into a single sample and remove the regions
    # with unknown names for their belonging gene
    merged = set_tf.merge()
    known_genes = merged.reg_select(merged.entrez_id != '')

    # Group the regions by gene name, collecting in 'TFs' the TFs binding to that gene's promoters
    res = known_genes.group(regs=['gene_name'],
                            regs_aggregates={
                                'ensembl_gene_id': gl.BAGD('ensembl_id'),
                                'entrez_gene_id': gl.BAGD('entrez_id'),
                                'TFs': gl.BAGD('TF')
                            })

    # Materialize the results and keep the regions dataframe only
    gene_tf_df = res.materialize('./(MaterializeResults)').regs
    gene_tf_df = gene_tf_df.reset_index(drop=True)

    # Convert all columns names into uppercase letters and rename them
    gene_tf_df.columns = [name.upper() for name in gene_tf_df.columns]
    gene_tf_df = gene_tf_df.rename(columns={
        'CHR': 'CHROM',
        'START': 'LEFT',
        'STOP': 'RIGHT',
        'GENE_NAME': 'GENE_SYMBOL',
        'TFS': 'TFs'
    })

    # The 'TFs' column holds a comma-separated string: turn it into a list
    gene_tf_df['TFs'] = gene_tf_df['TFs'].str.split(',')
    return gene_tf_df


def _tfs_build_gene_tf_dict(gene_tf_df, genes_df):
    """Build ``{GENE_SYMBOL: [TF1, TF2, ..., [ENTREZ_GENE_ID], [GENE_SETs]]}`` for the genes of interest."""
    from collections import defaultdict

    # Distinct Gene Symbols, in order of first appearance
    # (a gene belonging to several sets is listed once per set)
    symbols = genes_df['GENE_SYMBOL'].drop_duplicates().tolist()

    dict_gene_tf = defaultdict(list)
    already_added = {}
    for symbol in symbols:
        dict_gene_tf[symbol] = []
        already_added[symbol] = set()

    # A gene can have more than one promoter bound by the same TF:
    # only distinct TFs are kept for each gene
    for symbol, tfs in zip(gene_tf_df['GENE_SYMBOL'], gene_tf_df['TFs']):
        if symbol not in already_added:
            continue
        for tf in tfs:
            if tf not in already_added[symbol]:
                already_added[symbol].add(tf)
                dict_gene_tf[symbol].append(tf)

    for symbol in symbols:
        # Case-insensitive alphabetical order
        dict_gene_tf[symbol] = sorted(dict_gene_tf[symbol], key=str.lower)

        rows = genes_df.loc[genes_df['GENE_SYMBOL'] == symbol]
        # A gene in N sets has N rows with the same ENTREZ_GENE_ID: keep it
        # exactly once (the previous slice [:N - 1] kept N - 1 copies for N > 2)
        entrez_id = list(rows['ENTREZ_GENE_ID'])[:1]
        gene_sets = list(rows['GENE_SET'])
        dict_gene_tf[symbol].append(entrez_id)
        dict_gene_tf[symbol].append(gene_sets)

    return dict_gene_tf


def _tfs_export_results(dict_gene_tf):
    """Write the TF counts and the gene-TF dictionary under ``./1_Transcription_Factors/``."""
    # Number of TFs for each gene (the last two elements of each value are
    # the Entrez Gene ID list and the gene sets list), from highest to smallest
    counts = [(gene, len(values) - 2, values[-2][0] if values[-2] else '')
              for gene, values in dict_gene_tf.items()]
    tfs_gene_df = pd.DataFrame(
        counts, columns=['GENE_SYMBOL', '#TFs', 'ENTREZ_GENE_ID'])
    tfs_gene_df = tfs_gene_df.sort_values(by='#TFs', ascending=False)
    with ExcelWriter(
            './1_Transcription_Factors/Number of TFs for each gene of interest.xlsx'
    ) as writer:
        tfs_gene_df.to_excel(writer, sheet_name='Sheet1', index=False)

    # Save the dictionary into a pickle file
    with open('./1_Transcription_Factors/dict_GeneTF.p', 'wb') as pickle_file:
        pickle.dump(dict_gene_tf, pickle_file)

    # Save the dictionary as a .xlsx file: one row per element, one blank
    # row before each gene
    workbook = xlsxwriter.Workbook(
        './1_Transcription_Factors/dict_GeneTF.xlsx')
    worksheet = workbook.add_worksheet()
    worksheet.write(0, 0, 'GENE_SYMBOL')
    worksheet.write(0, 1, 'Transcription Factors')
    row = 1
    for gene, values in dict_gene_tf.items():
        row += 1
        worksheet.write(row, 0, gene)
        last = len(values) - 1
        for position, item in enumerate(values):
            # NOTE(review): ''.join concatenates several gene sets without a
            # separator; kept as it was, confirm whether that is intended.
            worksheet.write(row, 1, ''.join(item))
            if position == last - 1:  # the second to last element is the Entrez Gene ID
                worksheet.write(row, 2, 'Entrez Gene ID')
            elif position == last:  # the last element is the list of gene sets
                worksheet.write(row, 2, 'Gene Set')
            row += 1
    workbook.close()

    # Save the dictionary as a .txt file
    with open('./1_Transcription_Factors/dict_GeneTF.txt', 'w') as fp:
        for item in dict_gene_tf.items():
            fp.write('%s : %s\n\n' % item)


def extract_tfs(cell_lines, gencode_version):
    """
    The EXTRACT_TFS operation extracts, from ChIP_Seq ENCODE experiments and for assembly GRCh38, the Transcription Factors that bind to promoter regions of genes belonging to specific cell lines, filtering by 'conservative idr thresholded peaks' in order to extract higher quality region data, and removing negative audits in order to keep only high quality data samples. The genes of interest are read from './Genes_of_Interest.xlsx' (sheet 'Sheet1', columns GENE_SYMBOL, ENTREZ_GENE_ID and GENE_SET). Intermediate results files are exported locally in './1_Transcription_Factors/' during the execution of the function, while the final set of transcription factors is returned as a Python dictionary (also saved as dict_GeneTF.p), where each target gene (set as key) is associated to the alphabetically sorted list of TFs binding to its promoters, followed by a one-element list with its Entrez Gene ID and by the list of gene sets it belongs to.

    :param cell_lines: a list of strings containing the names of the cell lines to analyze (it's possible to consider data from 1 up to 3 cell lines at the same time)
    :param gencode_version: number representing the GENCODE genomic annotations to use (currently, for assembly GRCh38, versions 22, 24 and 27 can be used)
    :return: a Python dictionary {GENE_SYMBOL: [TF1, TF2, ..., [ENTREZ_GENE_ID], [GENE_SETs]]}
    :raises ValueError: if the number of cell lines is not between 1 and 3 or the GENCODE version is not supported

    Example::

        import genereg as gr
        tfs_dict = gr.TranscriptionFactors.extract_tfs(cell_lines=['K562','MCF7'], gencode_version=22)

    """
    # Check input parameters
    if not 0 < len(cell_lines) < 4:
        raise ValueError(
            'You have to specify from 1 up to 3 cell lines to investigate')
    if gencode_version not in [22, 24, 27]:
        raise ValueError('GRCh38 GENCODE versions available are 22, 24 and 27')

    # Execute the query for the extraction of TFs on the remote server, using the PyGMQL Python library
    gene_tf_df = _tfs_query_gene_tf_regions(cell_lines, gencode_version)

    # Load the list of genes of interest
    genes_df = pd.read_excel('./Genes_of_Interest.xlsx',
                             sheet_name='Sheet1',
                             header=0,
                             converters={
                                 'GENE_SYMBOL': str,
                                 'ENTREZ_GENE_ID': str,
                                 'GENE_SET': str
                             })

    dict_GeneTF = _tfs_build_gene_tf_dict(gene_tf_df, genes_df)
    _tfs_export_results(dict_GeneTF)
    return dict_GeneTF
def _methyl_query_tcga_samples(tumor, seq_platform):
    """Run the remote GMQL query on TCGA data and return the materialized methylation GDataframe."""
    gl.set_remote_address('http://gmql.eu/gmql-rest/')
    gl.login()
    gl.set_mode('remote')

    # Load the TCGA datasets to be used in the query
    methylation_dataset = gl.load_from_remote(
        remote_name='GRCh38_TCGA_methylation', owner='public')
    expression_dataset = gl.load_from_remote(
        remote_name='GRCh38_TCGA_gene_expression', owner='public')

    # Extract all the samples for the current tumor and platform
    all_methyl = methylation_dataset.meta_select(
        (methylation_dataset['manually_curated__cases__disease_type'] == tumor)
        & (methylation_dataset['manually_curated__platform'] == seq_platform)
        & ((methylation_dataset['biospecimen__bio__sample_type'] == 'Primary Tumor')
           | (methylation_dataset['biospecimen__bio__sample_type'] == 'Recurrent Tumor'))
        & (methylation_dataset[
            'clinical__shared__history_of_neoadjuvant_treatment'] == 'No'))
    all_expr = expression_dataset.meta_select(
        (expression_dataset['manually_curated__cases__disease_type'] == tumor)
        & ((expression_dataset['biospecimen__bio__sample_type'] == 'Primary Tumor')
           | (expression_dataset['biospecimen__bio__sample_type'] == 'Recurrent Tumor'))
        & (expression_dataset[
            'clinical__shared__history_of_neoadjuvant_treatment'] == 'No'))

    # Methylation: keep only the samples whose barcode also appears in the expression samples
    methyl_0 = all_methyl.reg_project(field_list=['beta_value'])
    methyl = methyl_0.meta_select(
        semiJoinDataset=all_expr,
        semiJoinMeta=['biospecimen__bio__bcr_sample_barcode'])

    return methyl.materialize('./(MaterializeResults)')


def _methyl_query_tss_areas(gencode_version, methyl_upstream, methyl_downstream):
    """Run the remote GMQL query on GENCODE and return the dataframe of the areas around each TSS."""
    # NOTE(review): the remote session is opened a second time here, exactly
    # as in the previous implementation; confirm whether it is still needed.
    gl.set_remote_address('http://gmql.eu/gmql-rest/')
    gl.login()
    gl.set_mode('remote')

    ann_dataset = gl.load_from_remote(remote_name='GRCh38_ANNOTATION_GENCODE',
                                      owner='public')
    annotations_version = str(gencode_version)
    coding_transcripts_0 = ann_dataset.meta_select(
        (ann_dataset['release_version'] == annotations_version)
        & (ann_dataset['annotation_type'] == 'transcript'))
    coding_transcripts = coding_transcripts_0.reg_select(
        (coding_transcripts_0.transcript_type == 'protein_coding')
        & ((coding_transcripts_0.tag == 'basic')
           | (coding_transcripts_0.tag == 'CCDS')))
    methyl_areas_reg = coding_transcripts.reg_project(
        ['gene_id', 'gene_name', 'entrez_gene_id'],
        new_field_dict={
            'start': coding_transcripts.start - methyl_upstream,
            'stop': coding_transcripts.start + methyl_downstream
        })
    gencode_grch38_methyl_areas = methyl_areas_reg.group(
        regs=['gene_name'],
        regs_aggregates={
            'ensembl_gene_id': gl.BAG('gene_id'),
            'gene_symbol': gl.BAG('gene_name'),
            'entrez_gene_id': gl.BAG('entrez_gene_id')
        })

    tss_areas = gencode_grch38_methyl_areas.materialize(
        './(MaterializeResults)').regs
    return tss_areas.rename(columns={
        'chr': 'chrom',
        'start': 'methyl_left',
        'stop': 'methyl_right'
    })


def _methyl_collect_beta_values(methyl_regs, tss_areas, genes, barcodes,
                                sample_to_aliquot):
    """Return ``{gene: {aliquot: [beta_value, ...]}}`` for the methylation sites inside each gene's areas."""
    beta_values = {
        gene: {barcode: [] for barcode in barcodes} for gene in genes
    }

    # Methylation sites lying inside each area, grouped by gene
    sites_by_gene = {gene: [] for gene in genes}
    area_columns = ['chrom', 'methyl_left', 'methyl_right', 'gene_symbol']
    for chrom, area_left, area_right, gene in tss_areas[area_columns].itertuples(
            index=False, name=None):
        # Both ends of the site must fall in [area_left, area_right).
        # Coordinates are assumed to be integers, so this range check selects
        # the same rows as testing membership in range(area_left, area_right).
        last_base = area_right - 1
        inside = ((methyl_regs['chrom'] == chrom)
                  & methyl_regs['left'].between(area_left, last_base)
                  & methyl_regs['right'].between(area_left, last_base))
        sites_by_gene[gene].append(methyl_regs.loc[inside])

    for gene, frames in sites_by_gene.items():
        if not frames:
            continue
        # Overlapping areas of the same gene select the same site more than once
        sites = pd.concat(frames).drop_duplicates(keep='first')
        for beta, sample in zip(sites['beta_value'], sites['sample_id']):
            aliquot = sample_to_aliquot[sample]
            beta_values[gene][aliquot].append(round(float(beta), 8))

    return beta_values


def _methyl_split_test_sets(aliquots):
    """Shuffle the aliquots and split them into the five test sets 'Test_1' ... 'Test_5'."""
    # A dataframe indexed by the aliquots is shuffled, as before, so that the
    # module-level `shuffle` receives the same kind of object.
    model_gene_df = pd.DataFrame(index=aliquots, columns=['C1', 'C2'])
    model_gene_df = shuffle(model_gene_df)

    dict_test_split = defaultdict(list)
    parts = np.array_split(model_gene_df.index.values, 5)
    for number, part in enumerate(parts, start=1):
        dict_test_split['Test_%d' % number] = list(part)
    return dict_test_split


def extract_methylation(tumor, platform, gencode_version, methyl_upstream,
                        methyl_downstream):
    """
    The EXTRACT_METHYLATION operation extracts methylation values from TCGA for all the genes of interest. For each gene of interest and each TCGA aliquot, the mean value of all the beta_values associated to methylation sites that are localized within areas -methyl_upstream/+methyl_downstream bases from its TSSs is retrieved (NaN beta_values are ignored; a gene with no valid value in an aliquot gets NaN). The genes of interest are read from './Genes_of_Interest.xlsx'. Intermediate results files are exported locally during the execution of the function ('./3_TCGA_Data/Methylation/METHYL (Metadata).xlsx', './3_TCGA_Data/Common_Aliquots.txt' and the five random test sets of aliquots in './5_Data_Analysis/dict_test_split.p'), while the final dataframe is returned as a Pandas dataframe and exported locally in the Excel file './3_TCGA_Data/Methylation/Methylation_Values.xlsx'.

    :param tumor: full name of the tumor of interest, encoded as a string (e.g. 'Ovarian Serous Cystadenocarcinoma', 'Breast Invasive Carcinoma', ...)
    :param platform: number identifying the sequencing platform (either 27 for the 27k probes sequencing platform or 450 for the 450k probes sequencing platform)
    :param gencode_version: number representing the GENCODE genomic annotations to use (currently, for assembly GRCh38, versions 22, 24 and 27 can be used)
    :param methyl_upstream: number of bases upstream the gene TSS to consider for the extraction of methylation sites of interest
    :param methyl_downstream: number of bases downstream the gene TSS to consider for the extraction of methylation sites of interest
    :return: a Pandas dataframe with a first row 'ENTREZ_GENE_ID' followed by one row per TCGA aliquot, and one column per gene of interest plus 'Sample_ID', 'Tumor' and 'Patient_ID'
    :raises ValueError: if the tumor, the platform or the GENCODE version is not supported

    Example::

        import genereg as gr
        methyl_df = gr.Methylation.extract_methylation(tumor='Ovarian Serous Cystadenocarcinoma', platform=27, gencode_version=22, methyl_upstream=4000, methyl_downstream=1000)

    """
    # Check input parameters
    tcga_tumors = [
        "Acute Myeloid Leukemia", "Adrenocortical Carcinoma",
        "Bladder Urothelial Carcinoma", "Brain Lower Grade Glioma",
        "Breast Invasive Carcinoma",
        "Cervical Squamous Cell Carcinoma and Endocervical Adenocarcinoma",
        "Cholangiocarcinoma", "Colon Adenocarcinoma", "Esophageal Carcinoma",
        "Glioblastoma Multiforme", "Head and Neck Squamous Cell Carcinoma",
        "Kidney Chromophobe", "Kidney Renal Clear Cell Carcinoma",
        "Kidney Renal Papillary Cell Carcinoma",
        "Liver Hepatocellular Carcinoma", "Lung Adenocarcinoma",
        "Lung Squamous Cell Carcinoma",
        "Lymphoid Neoplasm Diffuse Large B-cell Lymphoma", "Mesothelioma",
        "Ovarian Serous Cystadenocarcinoma", "Pancreatic Adenocarcinoma",
        "Pheochromocytoma and Paraganglioma", "Prostate Adenocarcinoma",
        "Rectum Adenocarcinoma", "Sarcoma", "Skin Cutaneous Melanoma",
        "Stomach Adenocarcinoma", "Testicular Germ Cell Tumors", "Thymoma",
        "Thyroid Carcinoma", "Uterine Carcinosarcoma",
        "Uterine Corpus Endometrial Carcinoma", "Uveal Melanoma"
    ]
    if tumor not in tcga_tumors:
        raise ValueError(
            'PATHOLOGY NOT SUPPORTED! You can analyze one of these 33 types of TCGA tumors: '
            + (', '.join(tcga_tumors)))
    if platform not in [27, 450]:
        raise ValueError(
            'PLATFORM NOT RECOGNIZED! Sequencing platforms available: 27 and 450'
        )
    if gencode_version not in [22, 24, 27]:
        raise ValueError('GRCh38 GENCODE versions available are 22, 24 and 27')

    barcode_field = 'biospecimen__bio__bcr_sample_barcode'
    meta_columns = ['Sample_ID', 'Tumor', 'Patient_ID']

    # Identify the sequencing platform to be used
    seq_platform = {
        27: 'Illumina Human Methylation 27',
        450: 'Illumina Human Methylation 450'
    }[platform]

    # Execute the query for the extraction of methylation values on the remote server.
    # The result is a GDataframe: one pandas dataframe for the region data and one for the metadata.
    methyl_Gdf = _methyl_query_tcga_samples(tumor, seq_platform)
    methyl_df_regs = methyl_Gdf.regs
    methyl_df_meta = methyl_Gdf.meta

    # Change index into progressive integer numbers and store the name of the sample in another column
    methyl_df_regs['sample_id'] = methyl_df_regs.index
    methyl_df_regs.index = range(len(methyl_df_regs))

    # Convert all the metadata values into strings, since they're encoded as lists
    for column in methyl_df_meta.columns:
        methyl_df_meta[column] = methyl_df_meta[column].map(''.join)

    # Export the metadata dataframe setting the TCGA aliquots as indexes
    Metadata_df = methyl_df_meta.copy()
    Metadata_df['id_sample'] = Metadata_df.index
    Metadata_df.set_index(barcode_field, inplace=True)
    with ExcelWriter(
            './3_TCGA_Data/Methylation/METHYL (Metadata).xlsx') as writer:
        Metadata_df.to_excel(writer, sheet_name='Sheet1')

    # Distinct sample barcodes (TCGA aliquots), in order of first appearance
    methyl_sample_barcodes = methyl_df_meta[barcode_field].drop_duplicates(
    ).tolist()
    sample_to_aliquot = methyl_df_meta[barcode_field].to_dict()

    # Load the list of genes of interest
    EntrezConversion_df = pd.read_excel('./Genes_of_Interest.xlsx',
                                        sheet_name='Sheet1',
                                        header=0,
                                        converters={
                                            'GENE_SYMBOL': str,
                                            'ENTREZ_GENE_ID': str,
                                            'GENE_SET': str
                                        })
    genesSYM_of_interest = EntrezConversion_df['GENE_SYMBOL'].drop_duplicates(
    ).tolist()

    # Extract the areas around the TSSs and keep only those of genes of interest
    Gencode_df_TSS = _methyl_query_tss_areas(gencode_version, methyl_upstream,
                                             methyl_downstream)
    Gencode_df_TSS_interest = Gencode_df_TSS.loc[
        Gencode_df_TSS['gene_symbol'].isin(genesSYM_of_interest)]

    # Extract from the TCGA data only useful columns for the following procedure
    methyl_df_regs.rename(columns={
        'chr': 'chrom',
        'start': 'left',
        'stop': 'right'
    },
                          inplace=True)
    methyl_df_regs_useful = methyl_df_regs[[
        'chrom', 'left', 'right', 'strand', 'beta_value', 'sample_id'
    ]].copy()

    # List of beta_values for each gene of interest in each TCGA aliquot
    dict_methyl_list = _methyl_collect_beta_values(methyl_df_regs_useful,
                                                   Gencode_df_TSS_interest,
                                                   genesSYM_of_interest,
                                                   methyl_sample_barcodes,
                                                   sample_to_aliquot)

    # Export the list of common aliquots in a .txt file
    with open('./3_TCGA_Data/Common_Aliquots.txt', 'w') as fp:
        for barcode in methyl_sample_barcodes:
            fp.write("%s\n" % barcode)

    # Shuffle and randomly split the TCGA aliquots into five sets, used as
    # test sets by the cross-validated feature selection of the data analysis phase
    dict_test_split = _methyl_split_test_sets(list(methyl_sample_barcodes))
    with open('./5_Data_Analysis/dict_test_split.p', 'wb') as pickle_file:
        pickle.dump(dict_test_split, pickle_file)

    # Final dataframe: a first row with the Entrez Gene IDs, then one row per
    # TCGA aliquot; one column per gene of interest plus sample name, tumor tag and patient ID
    methyl_df = pd.DataFrame(index=['ENTREZ_GENE_ID'] + methyl_sample_barcodes,
                             columns=genesSYM_of_interest + meta_columns,
                             dtype=object)

    for barcode, row in Metadata_df.iterrows():
        methyl_df.at[barcode, 'Sample_ID'] = row['id_sample']
        methyl_df.at[barcode, 'Tumor'] = row['clinical__admin__disease_code']
        methyl_df.at[barcode, 'Patient_ID'] = row['clinical__shared__patient_id']

    for entrez_id, gene_name in zip(EntrezConversion_df['ENTREZ_GENE_ID'],
                                    EntrezConversion_df['GENE_SYMBOL']):
        methyl_df.at['ENTREZ_GENE_ID', gene_name] = entrez_id
    for column in meta_columns:
        methyl_df.at['ENTREZ_GENE_ID', column] = ""

    # Compute the MEAN of the beta_values: a single methylation value for each gene in each aliquot.
    # Cells with no valid value keep NaN, including the case of a list
    # containing only NaN, which was previously left in the cell as a list.
    for gene_name, dict_value in dict_methyl_list.items():
        for tcga_aliq, beta_list in dict_value.items():
            valid_values = [v for v in beta_list if not math.isnan(v)]
            if valid_values:
                single_beta_value = float(sum(valid_values) / len(valid_values))
                methyl_df.at[tcga_aliq, gene_name] = round(single_beta_value, 8)

    # Export the dataframe with the single methylation values for each gene of interest and in each TCGA aliquot
    with ExcelWriter(
            './3_TCGA_Data/Methylation/Methylation_Values.xlsx') as writer:
        methyl_df.to_excel(writer, sheet_name='Sheet1')

    return methyl_df
# Metadata attribute holding the TCGA aliquot (sample barcode) of each sample.
_BARCODE_FIELD = 'biospecimen__bio__bcr_sample_barcode'

# Extra columns appended after the gene columns in the exported dataframes.
_ANNOTATION_COLUMNS = ['Sample_ID', 'Tumor', 'Patient_ID']


def _distinct(values):
    """Return the (hashable) items of *values* in first-seen order, without duplicates."""
    seen = set()
    unique = []
    for value in values:
        if value not in seen:
            seen.add(value)
            unique.append(value)
    return unique


def _empty_expression_frame(row_labels, gene_symbols):
    """Return an empty dataframe with one column per gene followed by the annotation columns."""
    genes_part = pd.DataFrame(index=row_labels, columns=gene_symbols)
    notes_part = pd.DataFrame(index=row_labels, columns=_ANNOTATION_COLUMNS)
    return genes_part.join(notes_part)


def _collect_expression_values(regs_df, gene_symbols, barcodes,
                               sample_to_aliquot, skipped_ensembl_ids=()):
    """
    Group the fpkm values of *regs_df* by Gene Symbol and by TCGA aliquot.

    :param regs_df: region dataframe with 'gene_symbol', 'ensembl_gene_id', 'fpkm' and 'sample_id' columns
    :param gene_symbols: Gene Symbols used as keys of the outer dictionary
    :param barcodes: TCGA aliquots used as keys of each inner dictionary
    :param sample_to_aliquot: dictionary mapping each sample id to its TCGA aliquot
    :param skipped_ensembl_ids: Ensembl Gene IDs whose regions must be ignored
    :return: a dictionary {gene symbol: {aliquot: [fpkm values rounded to 6 decimals]}}
    """
    values = {
        symbol: {barcode: [] for barcode in barcodes}
        for symbol in gene_symbols
    }
    for _, region in regs_df.iterrows():
        symbol = region['gene_symbol']
        # get the aliquot corresponding to the sample the region belongs to
        aliquot = sample_to_aliquot[region['sample_id']]
        if skipped_ensembl_ids and region['ensembl_gene_id'] in skipped_ensembl_ids:
            continue
        values[symbol][aliquot].append(round(float(region['fpkm']), 6))
    return values


def _build_expression_dataframe(values, gene_symbols, barcodes, meta_df,
                                entrez_ids):
    """
    Convert the nested dictionary of expression values into the dataframe to export.

    Rows are the TCGA aliquots and columns are the genes plus 'Sample_ID', 'Tumor'
    and 'Patient_ID'; an 'ENTREZ_GENE_ID' row is added on top of the aliquot rows.

    :param values: dictionary {gene symbol: {aliquot: [fpkm values]}}
    :param gene_symbols: Gene Symbols to use as columns
    :param barcodes: TCGA aliquots to use as row indexes
    :param meta_df: metadata dataframe, indexed by sample id
    :param entrez_ids: iterable of (gene symbol, Entrez Gene ID) pairs
    :return: a Pandas dataframe
    """
    expr_df = _empty_expression_frame(barcodes, gene_symbols)

    # Only the first value of each list is exported: it is already rounded to 6 decimals
    for gene_sym, values_by_aliquot in values.items():
        for tcga_aliq, exp_list in values_by_aliquot.items():
            if exp_list:
                expr_df.at[tcga_aliq, gene_sym] = round(exp_list[0], 6)

    # Add the name of each sample, the tumor code and the patient's ID in correspondence of each TCGA aliquot
    for sample_id, row in meta_df.iterrows():
        aliquot = row[_BARCODE_FIELD]
        expr_df.at[aliquot, 'Sample_ID'] = sample_id
        expr_df.at[aliquot, 'Tumor'] = row['clinical__admin__disease_code']
        expr_df.at[aliquot, 'Patient_ID'] = row['clinical__shared__patient_id']

    # Add a row at the beginning of the dataframe to insert also the Entrez Gene ID of each gene
    header_df = _empty_expression_frame(['ENTREZ_GENE_ID'], gene_symbols)
    expr_df = pd.concat([header_df, expr_df])
    for gene_sym, entrez_id in entrez_ids:
        expr_df.at['ENTREZ_GENE_ID', gene_sym] = entrez_id

    # Set empty strings instead of NaN values for the annotation columns of the 'ENTREZ_GENE_ID' row
    for column in _ANNOTATION_COLUMNS:
        expr_df.at['ENTREZ_GENE_ID', column] = ""
    return expr_df


def extract_expression(tumor, platform, gencode_version):
    """
    The EXTRACT_EXPRESSION operation extracts expression values from TCGA for all the genes of interest and their candidate regulatory genes. Intermediate results files are exported locally during the execution of the function, while the final dataframes are returned as Pandas dataframes and exported locally in the Excel files 'Gene_Expression-InterestGenes.xlsx' and 'Gene_Expression-RegulatoryGenes.xlsx' (in './3_TCGA_Data/Gene_Expression/').

    :param tumor: full name of the tumor of interest, encoded as a string (e.g. 'Ovarian Serous Cystadenocarcinoma', 'Breast Invasive Carcinoma', ...)
    :param platform: number identifying the sequencing platform (either 27 for the 27k probes sequencing platform or 450 for the 450k probes sequencing platform)
    :param gencode_version: number representing the GENCODE genomic annotations to use (currently, for assembly GRCh38, versions 22, 24 and 27 can be used)
    :return: two Pandas dataframes (genes of interest, candidate regulatory genes)
    :raises ValueError: if the tumor, the platform or the GENCODE version is not supported

    Example::

        import genereg as gr
        expr_interest_df, expr_regul_df = gr.GeneExpression.extract_expression(tumor='Ovarian Serous Cystadenocarcinoma', platform=27, gencode_version=22)

    """

    # Check input parameters
    tcga_tumors = [
        "Acute Myeloid Leukemia", "Adrenocortical Carcinoma",
        "Bladder Urothelial Carcinoma", "Brain Lower Grade Glioma",
        "Breast Invasive Carcinoma",
        "Cervical Squamous Cell Carcinoma and Endocervical Adenocarcinoma",
        "Cholangiocarcinoma", "Colon Adenocarcinoma", "Esophageal Carcinoma",
        "Glioblastoma Multiforme", "Head and Neck Squamous Cell Carcinoma",
        "Kidney Chromophobe", "Kidney Renal Clear Cell Carcinoma",
        "Kidney Renal Papillary Cell Carcinoma",
        "Liver Hepatocellular Carcinoma", "Lung Adenocarcinoma",
        "Lung Squamous Cell Carcinoma",
        "Lymphoid Neoplasm Diffuse Large B-cell Lymphoma", "Mesothelioma",
        "Ovarian Serous Cystadenocarcinoma", "Pancreatic Adenocarcinoma",
        "Pheochromocytoma and Paraganglioma", "Prostate Adenocarcinoma",
        "Rectum Adenocarcinoma", "Sarcoma", "Skin Cutaneous Melanoma",
        "Stomach Adenocarcinoma", "Testicular Germ Cell Tumors", "Thymoma",
        "Thyroid Carcinoma", "Uterine Carcinosarcoma",
        "Uterine Corpus Endometrial Carcinoma", "Uveal Melanoma"
    ]
    if tumor not in tcga_tumors:
        raise ValueError(
            'PATHOLOGY NOT SUPPORTED! You can analyze one of these 33 types of TCGA tumors: '
            + (', '.join(tcga_tumors)))
    if platform not in [27, 450]:
        raise ValueError(
            'PLATFORM NOT RECOGNIZED! Sequencing platforms available: 27 and 450'
        )
    if gencode_version not in [22, 24, 27]:
        raise ValueError('GRCh38 GENCODE versions available are 22, 24 and 27')

    # Load the list of genes of interest.
    # The sheet is passed positionally because the keyword was renamed across
    # pandas releases ('sheetname' -> 'sheet_name'), while its position did not change.
    EntrezConversion_df = pd.read_excel('./Genes_of_Interest.xlsx',
                                        'Sheet1',
                                        header=0,
                                        converters={
                                            'GENE_SYMBOL': str,
                                            'ENTREZ_GENE_ID': str,
                                            'GENE_SET': str
                                        })

    # Create a list containing the distinct Gene Symbols of the genes of interest
    genesSYM_of_interest = _distinct(EntrezConversion_df['GENE_SYMBOL'])

    # Import the dictionary of genes of interest with their candidate regulatory genes
    # NOTE: pickle can execute arbitrary code while loading, so this file must come from a trusted source
    with open('./2_Regulatory_Genes/dict_RegulGenes.p', 'rb') as pickle_file:
        dict_RegulGenes = pickle.load(pickle_file)

    # Import the gene-TFs mapping dataframe
    Mapping_df = pd.read_excel('./0_Genes_Mapping/Genes_Mapping.xlsx',
                               'Sheet1',
                               header=0,
                               converters={
                                   'ENTREZ_GENE_ID': str,
                                   'HGNC_ID': str
                               })

    # Create a list containing the distinct Gene Symbols of the regulatory genes of genes of interest
    regulatory_genesSYM = _distinct(
        gene for genes in dict_RegulGenes.values() for gene in genes)

    # Execute the query for the extraction of gene expression values on the remote server, using the PyGMQL Python library
    gl.set_remote_address('http://gmql.eu/gmql-rest/')
    gl.login()
    gl.set_mode('remote')

    # Load the TCGA datasets to be used in the query
    methylation_dataset = gl.load_from_remote(
        remote_name='GRCh38_TCGA_methylation', owner='public')
    expression_dataset = gl.load_from_remote(
        remote_name='GRCh38_TCGA_gene_expression', owner='public')

    # Identify the sequencing platform to be used ('platform' has already been validated)
    seq_platform = {
        27: 'Illumina Human Methylation 27',
        450: 'Illumina Human Methylation 450'
    }[platform]

    # Extract all the samples for the current tumor and platform
    all_methyl = methylation_dataset.meta_select(
        (methylation_dataset['manually_curated__cases__disease_type'] == tumor)
        & (methylation_dataset['manually_curated__platform'] == seq_platform)
        & ((methylation_dataset['biospecimen__bio__sample_type'] ==
            'Primary Tumor')
           | (methylation_dataset['biospecimen__bio__sample_type'] ==
              'Recurrent Tumor'))
        & (methylation_dataset[
            'clinical__shared__history_of_neoadjuvant_treatment'] == 'No'))
    all_expr = expression_dataset.meta_select(
        (expression_dataset['manually_curated__cases__disease_type'] == tumor)
        & ((expression_dataset['biospecimen__bio__sample_type'] ==
            'Primary Tumor')
           | (expression_dataset['biospecimen__bio__sample_type'] ==
              'Recurrent Tumor'))
        & (expression_dataset[
            'clinical__shared__history_of_neoadjuvant_treatment'] == 'No'))

    # Gene Expression: keep only the expression samples whose barcode also appears in the methylation samples
    expr_0 = all_expr.reg_project(field_list=[
        'ensembl_gene_id', 'entrez_gene_id', 'gene_symbol', 'fpkm'
    ])
    expr = expr_0.meta_select(semiJoinDataset=all_methyl,
                              semiJoinMeta=[_BARCODE_FIELD])

    # Materialize the results into a GDataframe
    expr_Gdf = expr.materialize('./(MaterializeResults)')

    # The result dataset is loaded as a GDataframe, an object containing two pandas dataframes, one for the region data and one for the metadata.
    # Get the two pandas dataframes:
    expr_df_regs = expr_Gdf.regs
    expr_df_meta = expr_Gdf.meta

    # Rename 'chr', 'start', and 'stop' columns header
    expr_df_regs.rename(columns={
        'chr': 'chrom',
        'start': 'left',
        'stop': 'right'
    },
                        inplace=True)
    # Change index into progressive integer numbers and store the name of the sample in another column
    expr_df_regs['sample_id'] = expr_df_regs.index
    expr_df_regs.index = range(len(expr_df_regs))

    # Convert unknown values (NaN) to empty strings
    expr_df_regs = expr_df_regs.fillna('')

    # Convert all the metadata values into strings, since they're encoded as lists in Python
    for column in list(expr_df_meta.columns):
        expr_df_meta[column] = expr_df_meta[column].apply(''.join)

    # Since we have to extract the expression values for each distinct sample barcode (aliquot), we create a list containing these distinct identifiers
    all_aliquots = list(expr_df_meta[_BARCODE_FIELD])
    expr_sample_barcodes_all = _distinct(all_aliquots)

    # Check which are repeated aliquots, if present
    multiple_aliquots = [
        item for item, count in collections.Counter(all_aliquots).items()
        if count > 1
    ]

    if multiple_aliquots:
        # Among the repeated aliquots, remove the ones shipped in 2011.
        # The labels are collected first and dropped afterwards, so the dataframe is never modified while it is being scanned.
        is_repeated = expr_df_meta[_BARCODE_FIELD].isin(multiple_aliquots)
        shipped_2011 = expr_df_meta['biospecimen__bio__year_of_shipment'] == '2011'
        samples_to_remove = expr_df_meta.index[(
            is_repeated & shipped_2011).values].tolist()
        expr_df_meta.drop(samples_to_remove, inplace=True)

        # Import the list of aliquots in the methylation dataset (blank lines are ignored)
        with open('./3_TCGA_Data/Common_Aliquots.txt', 'r') as text_file:
            aliquots = {
                line for line in text_file.read().split('\n') if line
            }

        # Extract the new list of distinct TCGA Aliquots to extract and remove the samples whose aliquot is not among the common ones
        is_common = expr_df_meta[_BARCODE_FIELD].isin(aliquots)
        expr_sample_barcodes = _distinct(
            expr_df_meta.loc[is_common, _BARCODE_FIELD])
        not_common = expr_df_meta.index[(~is_common).values].tolist()
        expr_df_meta.drop(not_common, inplace=True)
        samples_to_remove.extend(not_common)

        # Remove regions that corresponded to eliminated samples
        expr_df_regs = expr_df_regs.loc[~(
            expr_df_regs['sample_id'].isin(samples_to_remove))].copy()
    else:
        expr_sample_barcodes = expr_sample_barcodes_all

    # Export the metadata dataframe setting the TCGA aliquots as indexes.
    Metadata_df = expr_df_meta.copy()
    Metadata_df['id_sample'] = Metadata_df.index
    Metadata_df.set_index(_BARCODE_FIELD, inplace=True)
    Metadata_df.to_excel('./3_TCGA_Data/Gene_Expression/EXPR_(Metadata).xlsx',
                         sheet_name='Sheet1')

    # Extract from the expression dataset all the regions that belong to genes of interest
    expr_df_regs_interest = expr_df_regs.loc[expr_df_regs['gene_symbol'].isin(
        genesSYM_of_interest)].copy()
    # Extract from the expression dataset all the regions that belong to regulatory genes of genes of interest
    expr_df_regs_regulatory = expr_df_regs.loc[
        expr_df_regs['gene_symbol'].isin(regulatory_genesSYM)].copy()

    # Aliquot of each remaining sample, computed once instead of once per region
    sample_to_aliquot = expr_df_meta[_BARCODE_FIELD].to_dict()

    # Gene expression values for each gene of interest:
    # the nested dictionary has the Gene Symbols as keys and, for each of them, another dictionary with the TCGA aliquots as keys and the lists of fpkm values as values
    dict_expr_interest = _collect_expression_values(expr_df_regs_interest,
                                                    genesSYM_of_interest,
                                                    expr_sample_barcodes,
                                                    sample_to_aliquot)
    interest_entrez_ids = zip(EntrezConversion_df['GENE_SYMBOL'],
                              EntrezConversion_df['ENTREZ_GENE_ID'])
    expr_interest_df = _build_expression_dataframe(dict_expr_interest,
                                                   genesSYM_of_interest,
                                                   expr_sample_barcodes,
                                                   expr_df_meta,
                                                   interest_entrez_ids)

    # Export the dataframe with the gene expression values for our genes of interest for each TCGA aliquot
    expr_interest_df.to_excel(
        './3_TCGA_Data/Gene_Expression/Gene_Expression-InterestGenes.xlsx',
        sheet_name='Sheet1')

    # Gene expression values for each candidate regulatory gene of the genes of interest:
    # with GENCODE version 22, the regions of three specific Ensembl Gene IDs are not considered
    if gencode_version == 22:
        skipped_ensembl_ids = ('ENSG00000277726.3', 'ENSG00000275895.3',
                               'ENSGR0000214717.8')
    else:
        skipped_ensembl_ids = ()
    dict_expr_regulatory = _collect_expression_values(
        expr_df_regs_regulatory, regulatory_genesSYM, expr_sample_barcodes,
        sample_to_aliquot, skipped_ensembl_ids)

    # Get for each Gene Symbol of the regulatory genes the corresponding Entrez Gene ID
    # ('PTRF' is searched in the mapping table under the symbol 'CAVIN1')
    regulatory_entrez_ids = []
    for gene_sym in regulatory_genesSYM:
        mapped_sym = 'CAVIN1' if gene_sym == 'PTRF' else gene_sym
        entrez_id = Mapping_df.loc[Mapping_df['GENE_SYMBOL'] == mapped_sym,
                                   'ENTREZ_GENE_ID'].iloc[0]
        regulatory_entrez_ids.append((gene_sym, entrez_id))

    expr_regulatory_df = _build_expression_dataframe(dict_expr_regulatory,
                                                     regulatory_genesSYM,
                                                     expr_sample_barcodes,
                                                     expr_df_meta,
                                                     regulatory_entrez_ids)

    # Export the dataframe with the gene expression values for the regulatory genes of our genes of interest for each TCGA aliquot
    expr_regulatory_df.to_excel(
        './3_TCGA_Data/Gene_Expression/Gene_Expression-RegulatoryGenes.xlsx',
        sheet_name='Sheet1')

    return expr_interest_df, expr_regulatory_df