Ejemplo n.º 1
0
def extract_methylation(tumor, platform, gencode_version, methyl_upstream,
                        methyl_downstream):
    """
    The EXTRACT_METHYLATION operation extracts methylation values from TCGA for all the genes of interest.

    For each gene of interest and each TCGA aliquot, the mean of all the
    non-NaN beta_values of the methylation sites whose coordinates fall inside
    the window [TSS - methyl_upstream, TSS + methyl_downstream) of one of the
    gene's protein-coding transcripts is computed (rounded to 8 decimals).
    Cells for which no valid beta_value exists are left as NaN.

    Side effects (all paths are relative to the current working directory):

    * reads the genes of interest from './Genes_of_Interest.xlsx' (sheet 'Sheet1');
    * materializes the remote query results in './(MaterializeResults)';
    * writes './3_TCGA_Data/Methylation/METHYL (Metadata).xlsx';
    * writes './3_TCGA_Data/Common_Aliquots.txt' (one aliquot barcode per line);
    * writes './5_Data_Analysis/dict_test_split.p', a pickled dictionary with
      keys 'Test_1' ... 'Test_5' holding a random 5-way split of the aliquots;
    * writes './3_TCGA_Data/Methylation/Methylation_Values.xlsx'.

    :param tumor: full name of the tumor of interest, encoded as a string (e.g. 'Ovarian Serous Cystadenocarcinoma', 'Breast Invasive Carcinoma', ...)
    :param platform: number identifying the sequencing platform (either 27 for the 27k probes sequencing platform or 450 for the 450k probes sequencing platform)
    :param gencode_version: number representing the GENCODE genomic annotations to use (currently, for assembly GRCh38, versions 22, 24 and 27 can be used)
    :param methyl_upstream: number of bases upstream the gene TSS to consider for the extraction of methylation sites of interest
    :param methyl_downstream: number of bases downstream the gene TSS to consider for the extraction of methylation sites of interest
    :return: a Pandas dataframe whose first row ('ENTREZ_GENE_ID') holds the Entrez ID of each gene, whose other rows are the TCGA aliquots, and whose columns are the genes of interest followed by 'Sample_ID', 'Tumor' and 'Patient_ID'
    :raises ValueError: if the tumor, the platform or the GENCODE version is not supported

    Example::

        import genereg as gr
        methyl_df = gr.Methylation.extract_methylation(tumor='Ovarian Serous Cystadenocarcinoma', platform=27, gencode_version=22, methyl_upstream=4000, methyl_downstream=1000)
    """

    # Check input parameters
    tcga_tumors = [
        "Acute Myeloid Leukemia", "Adrenocortical Carcinoma",
        "Bladder Urothelial Carcinoma", "Brain Lower Grade Glioma",
        "Breast Invasive Carcinoma",
        "Cervical Squamous Cell Carcinoma and Endocervical Adenocarcinoma",
        "Cholangiocarcinoma", "Colon Adenocarcinoma", "Esophageal Carcinoma",
        "Glioblastoma Multiforme", "Head and Neck Squamous Cell Carcinoma",
        "Kidney Chromophobe", "Kidney Renal Clear Cell Carcinoma",
        "Kidney Renal Papillary Cell Carcinoma",
        "Liver Hepatocellular Carcinoma", "Lung Adenocarcinoma",
        "Lung Squamous Cell Carcinoma",
        "Lymphoid Neoplasm Diffuse Large B-cell Lymphoma", "Mesothelioma",
        "Ovarian Serous Cystadenocarcinoma", "Pancreatic Adenocarcinoma",
        "Pheochromocytoma and Paraganglioma", "Prostate Adenocarcinoma",
        "Rectum Adenocarcinoma", "Sarcoma", "Skin Cutaneous Melanoma",
        "Stomach Adenocarcinoma", "Testicular Germ Cell Tumors", "Thymoma",
        "Thyroid Carcinoma", "Uterine Carcinosarcoma",
        "Uterine Corpus Endometrial Carcinoma", "Uveal Melanoma"
    ]
    if tumor not in tcga_tumors:
        raise ValueError(
            'PATHOLOGY NOT SUPPORTED! You can analyze one of these 33 types of TCGA tumors: '
            + (', '.join(tcga_tumors)))

    if platform not in [27, 450]:
        raise ValueError(
            'PLATFORM NOT RECOGNIZED! Sequencing platforms available: 27 and 450'
        )

    if gencode_version not in [22, 24, 27]:
        raise ValueError('GRCh38 GENCODE versions available are 22, 24 and 27')

    barcode_field = 'biospecimen__bio__bcr_sample_barcode'
    info_columns = ['Sample_ID', 'Tumor', 'Patient_ID']

    # Execute the query for the extraction of methylation values on the remote server, using the PyGMQL Python library
    gl.set_remote_address('http://gmql.eu/gmql-rest/')
    gl.login()
    gl.set_mode('remote')

    # Load the TCGA datasets to be used in the query
    methylation_dataset = gl.load_from_remote(
        remote_name='GRCh38_TCGA_methylation', owner='public')
    expression_dataset = gl.load_from_remote(
        remote_name='GRCh38_TCGA_gene_expression', owner='public')

    # Identify the sequencing platform to be used (platform was validated above)
    seq_platform = {
        27: 'Illumina Human Methylation 27',
        450: 'Illumina Human Methylation 450',
    }[platform]

    # Extract all the samples for the current tumor and platform
    all_methyl = methylation_dataset.meta_select(
        (methylation_dataset['manually_curated__cases__disease_type'] == tumor)
        & (methylation_dataset['manually_curated__platform'] == seq_platform)
        & ((methylation_dataset['biospecimen__bio__sample_type'] ==
            'Primary Tumor')
           | (methylation_dataset['biospecimen__bio__sample_type'] ==
              'Recurrent Tumor'))
        & (methylation_dataset[
            'clinical__shared__history_of_neoadjuvant_treatment'] == 'No'))
    all_expr = expression_dataset.meta_select(
        (expression_dataset['manually_curated__cases__disease_type'] == tumor)
        & ((expression_dataset['biospecimen__bio__sample_type'] ==
            'Primary Tumor')
           | (expression_dataset['biospecimen__bio__sample_type'] ==
              'Recurrent Tumor'))
        & (expression_dataset[
            'clinical__shared__history_of_neoadjuvant_treatment'] == 'No'))

    # Keep only the beta_value attribute and only the methylation samples
    # whose barcode also appears among the selected gene expression samples
    methyl_0 = all_methyl.reg_project(field_list=['beta_value'])
    methyl = methyl_0.meta_select(semiJoinDataset=all_expr,
                                  semiJoinMeta=[barcode_field])

    # Materialize the results into a GDataframe, an object containing two
    # pandas dataframes: one for the region data and one for the metadata
    methyl_Gdf = methyl.materialize('./(MaterializeResults)')
    methyl_df_regs = methyl_Gdf.regs
    methyl_df_meta = methyl_Gdf.meta

    # Change index into progressive integer numbers and store the name of the sample in another column
    methyl_df_regs['sample_id'] = methyl_df_regs.index
    methyl_df_regs.index = range(len(methyl_df_regs))

    # Convert all the metadata values into strings, since they're encoded as lists in Python
    for meta_column in list(methyl_df_meta.columns):
        methyl_df_meta[meta_column] = methyl_df_meta[meta_column].apply(
            ''.join)

    # Export the metadata dataframe setting the TCGA aliquots as indexes
    Metadata_df = methyl_df_meta.copy()
    Metadata_df['id_sample'] = Metadata_df.index
    Metadata_df.set_index(barcode_field, inplace=True)
    _export_dataframe_to_excel(
        Metadata_df, './3_TCGA_Data/Methylation/METHYL (Metadata).xlsx')

    # Distinct sample barcodes (TCGA aliquots), in order of first appearance
    methyl_sample_barcodes = methyl_df_meta[barcode_field].drop_duplicates(
    ).tolist()
    # Lookup table: sample identifier -> barcode of the corresponding aliquot
    sample_to_aliquot = methyl_df_meta[barcode_field].to_dict()

    # Load the list of genes of interest. The sheet name is passed positionally
    # because its keyword was renamed from 'sheetname' to 'sheet_name' across
    # pandas versions.
    EntrezConversion_df = pd.read_excel('./Genes_of_Interest.xlsx',
                                        'Sheet1',
                                        header=0,
                                        converters={
                                            'GENE_SYMBOL': str,
                                            'ENTREZ_GENE_ID': str,
                                            'GENE_SET': str
                                        })

    # Distinct Gene Symbols of the genes of interest, in order of first appearance
    genesSYM_of_interest = EntrezConversion_df['GENE_SYMBOL'].drop_duplicates(
    ).tolist()

    # Nested dictionary {gene symbol: {aliquot barcode: [beta_values]}}:
    # one list, containing all the beta_values, for each gene in each TCGA aliquot
    dict_methyl_list = {
        gene: {barcode: [] for barcode in methyl_sample_barcodes}
        for gene in genesSYM_of_interest
    }

    # Extract the methyl_areas dataset.
    # NOTE(review): the remote session is configured again here exactly as in
    # the original flow; whether it is strictly required after the first
    # materialization is not visible from this function.
    gl.set_remote_address('http://gmql.eu/gmql-rest/')
    gl.login()
    gl.set_mode('remote')

    ann_dataset = gl.load_from_remote(remote_name='GRCh38_ANNOTATION_GENCODE',
                                      owner='public')
    annotations_version = str(gencode_version)
    coding_transcripts_0 = ann_dataset.meta_select(
        (ann_dataset['release_version'] == annotations_version)
        & (ann_dataset['annotation_type'] == 'transcript'))
    coding_transcripts = coding_transcripts_0.reg_select(
        (coding_transcripts_0.transcript_type == 'protein_coding')
        & ((coding_transcripts_0.tag == 'basic')
           | (coding_transcripts_0.tag == 'CCDS')))
    # Replace each transcript region with the window around its start coordinate
    methyl_areas_reg = coding_transcripts.reg_project(
        ['gene_id', 'gene_name', 'entrez_gene_id'],
        new_field_dict={
            'start': coding_transcripts.start - methyl_upstream,
            'stop': coding_transcripts.start + methyl_downstream
        })
    gencode_grch38_methyl_areas = methyl_areas_reg.group(
        regs=['gene_name'],
        regs_aggregates={
            'ensembl_gene_id': gl.BAG('gene_id'),
            'gene_symbol': gl.BAG('gene_name'),
            'entrez_gene_id': gl.BAG('entrez_gene_id')
        })

    # Materialize the results into a GDataframe and get the regions dataframe
    Gencode_df_TSS_Gdf = gencode_grch38_methyl_areas.materialize(
        './(MaterializeResults)')
    Gencode_df_TSS = Gencode_df_TSS_Gdf.regs.rename(columns={
        'chr': 'chrom',
        'start': 'methyl_left',
        'stop': 'methyl_right'
    })

    # Remove the transcripts that don't belong to genes of interest
    Gencode_df_TSS_interest = Gencode_df_TSS.loc[
        Gencode_df_TSS['gene_symbol'].isin(genesSYM_of_interest)]

    # Extract from the TCGA data only useful columns for the following procedure
    methyl_df_regs_useful = methyl_df_regs.rename(columns={
        'chr': 'chrom',
        'start': 'left',
        'stop': 'right'
    })[['chrom', 'left', 'right', 'strand', 'beta_value', 'sample_id']]

    # Hoist the columns used by the per-transcript filter out of the loop
    site_chrom = methyl_df_regs_useful['chrom']
    site_left = methyl_df_regs_useful['left']
    site_right = methyl_df_regs_useful['right']

    # For each gene of interest, collect the methylation sites lying inside the
    # window of each of its transcripts
    frames_by_gene = {gene: [] for gene in genesSYM_of_interest}
    for _, transcript in Gencode_df_TSS_interest.iterrows():
        area_left = transcript['methyl_left']
        area_right = transcript['methyl_right']
        # Both site coordinates must fall in [area_left, area_right); this is
        # the same test as membership in range(area_left, area_right) for
        # integer coordinates, without building the list of positions
        inside_area = ((site_chrom == transcript['chrom'])
                       & (site_left >= area_left) & (site_left < area_right)
                       & (site_right >= area_left) & (site_right < area_right))
        frames_by_gene[transcript['gene_symbol']].append(
            methyl_df_regs_useful.loc[inside_area])

    # Extract the methylation values for each gene of interest and for each TCGA aliquot
    for gene, frames in frames_by_gene.items():
        if not frames:
            # no transcript window was found for this gene
            continue
        # The same site can be selected through several transcripts of the
        # gene: keep each (site, sample) row only once
        gene_sites = pd.concat(frames).drop_duplicates(keep='first')
        beta_lists = dict_methyl_list[gene]
        for beta, sample in zip(gene_sites['beta_value'],
                                gene_sites['sample_id']):
            # add the value according to the correct Gene Symbol and TCGA aliquot
            beta_lists[sample_to_aliquot[sample]].append(
                round(float(beta), 8))

    # Export the list of common aliquots in a .txt file
    with open('./3_TCGA_Data/Common_Aliquots.txt', 'w') as fp:
        for barcode in methyl_sample_barcodes:
            fp.write("%s\n" % barcode)

    # Shuffle and randomly split the TCGA aliquots into five sets, which are
    # exported for later use as five different test sets.
    # A dataframe having the TCGA aliquots as indexes of its rows is shuffled,
    # and the shuffled index is then partitioned into five consecutive chunks.
    model_gene_df = pd.DataFrame(index=list(methyl_sample_barcodes),
                                 columns=['C1', 'C2'])
    model_gene_df = shuffle(model_gene_df)
    aliquot_chunks = np.array_split(model_gene_df.index.values, 5)

    # Save the aliquots selected for each of the five test sets in a dictionary
    dict_test_split = defaultdict(list)
    for fold_number, chunk in enumerate(aliquot_chunks, start=1):
        dict_test_split['Test_%d' % fold_number] = list(chunk)

    # Export the dictionary (the file is closed even if pickling fails)
    with open('./5_Data_Analysis/dict_test_split.p', 'wb') as fp:
        pickle.dump(dict_test_split, fp)

    # Build the final dataframe: the first row holds the Entrez Gene IDs, the
    # other rows are the TCGA aliquots; the columns are the genes of interest
    # followed by the sample name, the tumor tag and the patient ID.
    # The column labels are passed as a flat list so that each gene is a plain
    # column label on every pandas version.
    methyl_df = pd.DataFrame(index=['ENTREZ_GENE_ID'] + methyl_sample_barcodes,
                             columns=genesSYM_of_interest + info_columns)

    # Add the name of each sample, the tumor tag and the patient ID in correspondence of each TCGA aliquot
    for barcode, sample, tumor_tag, patient_id in zip(
            Metadata_df.index, Metadata_df['id_sample'],
            Metadata_df['clinical__admin__disease_code'],
            Metadata_df['clinical__shared__patient_id']):
        methyl_df.at[barcode, 'Sample_ID'] = sample
        methyl_df.at[barcode, 'Tumor'] = tumor_tag
        methyl_df.at[barcode, 'Patient_ID'] = patient_id

    # Add for each Gene Symbol the corresponding Entrez ID in the first row of the dataframe
    for gene_name, entrez_id in zip(EntrezConversion_df['GENE_SYMBOL'],
                                    EntrezConversion_df['ENTREZ_GENE_ID']):
        methyl_df.at['ENTREZ_GENE_ID', gene_name] = entrez_id

    # Set empty strings instead of NaN values in the 'ENTREZ_GENE_ID' row
    for info_column in info_columns:
        methyl_df.at['ENTREZ_GENE_ID', info_column] = ""

    # Compute the MEAN of the beta_values: a single methylation value for each
    # gene in each aliquot (NaN when the gene has no valid beta_value there)
    for gene_name, beta_lists in dict_methyl_list.items():
        for tcga_aliq, beta_list in beta_lists.items():
            methyl_df.at[tcga_aliq, gene_name] = _mean_beta_value(beta_list)

    # Export the dataframe with the single methylation values for each gene of interest and in each TCGA aliquot
    _export_dataframe_to_excel(
        methyl_df, './3_TCGA_Data/Methylation/Methylation_Values.xlsx')

    return methyl_df


def _mean_beta_value(beta_values):
    """Return the mean of the non-NaN *beta_values* rounded to 8 decimals, or NaN if there are none."""
    total = 0
    count = 0
    for value in beta_values:
        if not math.isnan(value):
            total += value
            count += 1
    if count == 0:
        return np.nan
    return round(float(total / count), 8)


def _export_dataframe_to_excel(dataframe, path):
    """Write *dataframe* to the sheet 'Sheet1' of the Excel file at *path*, closing the file afterwards."""
    with ExcelWriter(path) as writer:
        dataframe.to_excel(writer, sheet_name='Sheet1')
Ejemplo n.º 2
0
def extract_tfs(cell_lines, gencode_version):
    """
	The EXTRACT_TFS operation extracts, from ChIP_Seq ENCODE expriments and for assembly GRCh38, the Transcription Factors that bind to promoter regions of genes belonging to specific cell lines, filtering by 'conservative idr thresholded peaks' in order to extract higher quality region data, and removing negative audits in order to keep only high quality data samples. Intermediate results files are exported locally during the execution of the function, while the final set of trasncription factors is returned as a Python dictionary (dict_GeneTF.p), where each target gene (set as key) is associated to the list of TFs binding to its promoters (set as value). 

	:param cell_lines: a list of strings containing the names of the cell lines to analyze (it's possible to consider data from 1 up to 3 cell lines at the same time)
	:param gencode_version: number representing the GENCODE genomic annotations to use (currently, for assembly GRCh38, versions 22, 24 and 27 can be used)
	:return: a Python dictionary
	
	Example::
	
		import genereg as gr
		tfs_dict = gr.TranscriptionFactors.extract_tfs(cell_lines=['K562','MCF7'], gencode_version=22)
	"""

    # Check input parameters
    if not ((len(cell_lines) > 0) and (len(cell_lines) < 4)):
        raise ValueError(
            'You have to specify from 1 up to 3 cell lines to investigate')

    if gencode_version not in [22, 24, 27]:
        raise ValueError('GRCh38 GENCODE versions available are 22, 24 and 27')

    # Execute the query for the extraction of TFs on the remote server, using the PyGMQL Python library
    gl.set_remote_address('http://gmql.eu/gmql-rest/')
    gl.login()
    gl.set_mode('remote')

    # Load the ENCODE datasets to be used in the query
    narrow_dataset = gl.load_from_remote(
        remote_name='GRCh38_ENCODE_NARROW_NOV_2017', owner='public')
    ann_dataset = gl.load_from_remote(remote_name='GRCh38_ANNOTATION_GENCODE',
                                      owner='public')

    # Extract NARROW data of interest
    if len(cell_lines) == 1:
        cell = cell_lines[0]
        narrow = narrow_dataset.meta_select(
            (narrow_dataset['assay'] == 'ChIP-seq')
            & (narrow_dataset['output_type'] ==
               'conservative idr thresholded peaks')
            & (narrow_dataset['biosample_term_name'] == cell)
            & (narrow_dataset['assembly'] == 'GRCh38')
            & (narrow_dataset['project'] == 'ENCODE')
            & (narrow_dataset['file_status'] == 'released') &
            (~((narrow_dataset['audit_error'] == 'extremely low read depth') |
               (narrow_dataset['audit_error'] == 'extremely low read length') |
               (narrow_dataset['audit_warning'] == 'insufficient read depth') |
               (narrow_dataset['audit_not_compliant'] ==
                'insufficient read depth') |
               (narrow_dataset['audit_not_compliant'] ==
                'insufficient replicate concordance') |
               (narrow_dataset['audit_not_compliant'] ==
                'missing input control') |
               (narrow_dataset['audit_not_compliant'] == 'severe bottlenecking'
                ) | (narrow_dataset['audit_not_compliant'] ==
                     'unreplicated experiment'))))
    elif len(cell_lines) == 2:
        cell_1 = cell_lines[0]
        cell_2 = cell_lines[1]
        narrow = narrow_dataset.meta_select(
            (narrow_dataset['assay'] == 'ChIP-seq')
            & (narrow_dataset['output_type'] ==
               'conservative idr thresholded peaks')
            & ((narrow_dataset['biosample_term_name'] == cell_1)
               | (narrow_dataset['biosample_term_name'] == cell_2))
            & (narrow_dataset['assembly'] == 'GRCh38')
            & (narrow_dataset['project'] == 'ENCODE')
            & (narrow_dataset['file_status'] == 'released') &
            (~((narrow_dataset['audit_error'] == 'extremely low read depth') |
               (narrow_dataset['audit_error'] == 'extremely low read length') |
               (narrow_dataset['audit_warning'] == 'insufficient read depth') |
               (narrow_dataset['audit_not_compliant'] ==
                'insufficient read depth') |
               (narrow_dataset['audit_not_compliant'] ==
                'insufficient replicate concordance') |
               (narrow_dataset['audit_not_compliant'] ==
                'missing input control') |
               (narrow_dataset['audit_not_compliant'] == 'severe bottlenecking'
                ) | (narrow_dataset['audit_not_compliant'] ==
                     'unreplicated experiment'))))
    elif len(cell_lines) == 3:
        cell_1 = cell_lines[0]
        cell_2 = cell_lines[1]
        cell_3 = cell_lines[2]
        narrow = narrow_dataset.meta_select(
            (narrow_dataset['assay'] == 'ChIP-seq')
            & (narrow_dataset['output_type'] ==
               'conservative idr thresholded peaks')
            & ((narrow_dataset['biosample_term_name'] == cell_1)
               | (narrow_dataset['biosample_term_name'] == cell_2)
               | (narrow_dataset['biosample_term_name'] == cell_3))
            & (narrow_dataset['assembly'] == 'GRCh38')
            & (narrow_dataset['project'] == 'ENCODE')
            & (narrow_dataset['file_status'] == 'released') &
            (~((narrow_dataset['audit_error'] == 'extremely low read depth') |
               (narrow_dataset['audit_error'] == 'extremely low read length') |
               (narrow_dataset['audit_warning'] == 'insufficient read depth') |
               (narrow_dataset['audit_not_compliant'] ==
                'insufficient read depth') |
               (narrow_dataset['audit_not_compliant'] ==
                'insufficient replicate concordance') |
               (narrow_dataset['audit_not_compliant'] ==
                'missing input control') |
               (narrow_dataset['audit_not_compliant'] == 'severe bottlenecking'
                ) | (narrow_dataset['audit_not_compliant'] ==
                     'unreplicated experiment'))))

    # Create the dataset of promoters
    annotations_version = str(gencode_version)
    coding_transcripts_0 = ann_dataset.meta_select(
        (ann_dataset['release_version'] == annotations_version)
        & (ann_dataset['annotation_type'] == 'transcript'))
    coding_transcripts = coding_transcripts_0.reg_select(
        (coding_transcripts_0.transcript_type == 'protein_coding')
        & ((coding_transcripts_0.tag == 'basic')
           | (coding_transcripts_0.tag == 'CCDS')))
    prom_reg = coding_transcripts.reg_project(
        ['gene_id', 'gene_name', 'entrez_gene_id'],
        new_field_dict={
            'start': coding_transcripts.start - 2000,
            'stop': coding_transcripts.start + 1000
        })
    prom = prom_reg.group(regs=['gene_name'],
                          regs_aggregates={
                              'ensembl_id': gl.BAGD('gene_id'),
                              'entrez_id': gl.BAGD('entrez_gene_id')
                          })

    # Merge all the possible replicas of the same TF, combining them in a single sample
    full_encode = narrow.normal_cover(1, 'ANY', ['experiment_target'])

    # Extract the transcription factors that overlap with at least one promoter region
    res_0 = prom.map(full_encode, refName='prom', expName='TF')
    res_1 = res_0.reg_select(res_0.count_prom_TF > 0)

    # Encode, for each region, the corresponding TF that binds to it as a region attribute
    set_tf = res_1.reg_project(
        new_field_dict={'TF': res_1['TF.experiment_target', 'string']})

    # Merge all the samples into a dataset with a single sample containing all the regions with their binding TFs
    # and remove regions with unknown names for their belonging gene
    merged = set_tf.merge()
    known_genes = merged.reg_select(merged.entrez_id != '')

    # Group the regions by name, setting in the region attribute 'TF' the list of transcription factors that bind to that gene's promoters
    res = known_genes.group(regs=['gene_name'],
                            regs_aggregates={
                                'ensembl_gene_id': gl.BAGD('ensembl_id'),
                                'entrez_gene_id': gl.BAGD('entrez_id'),
                                'TFs': gl.BAGD('TF')
                            })

    # Materialize the results into a GDataframe
    res_Gdf = res.materialize('./(MaterializeResults)')

    # Extract the regions dataframe, where each row corresponds to a region and each column to an attribute
    GeneTF_df = res_Gdf.regs
    # Check the length of the dataframe, that is the number of rows of the dataframe
    length_df = len(GeneTF_df)
    # Set progressive integer numbers as new indexes of the dataframe
    GeneTF_df.index = range(length_df)

    # Convert all columns names into uppercase letters and rename them
    GeneTF_df.columns = map(str.upper, GeneTF_df.columns)
    GeneTF_df.rename(columns={
        'CHR': 'CHROM',
        'START': 'LEFT',
        'STOP': 'RIGHT',
        'GENE_NAME': 'GENE_SYMBOL',
        'TFS': 'TFs'
    },
                     inplace=True)

    for index, row in GeneTF_df.iterrows():
        tfs_str = row['TFs']
        tfs_list = tfs_str.split(',')
        GeneTF_df.set_value(index, 'TFs', tfs_list)

    # Load the list of genes of interest
    EntrezConversion_df = pd.read_excel('./Genes_of_Interest.xlsx',
                                        sheetname='Sheet1',
                                        header=0,
                                        converters={
                                            'GENE_SYMBOL': str,
                                            'ENTREZ_GENE_ID': str,
                                            'GENE_SET': str
                                        })

    # Create a list with the Gene Symbols of the genes of interest
    Symbols = []
    for index, row in EntrezConversion_df.iterrows():
        i = row['GENE_SYMBOL']
        Symbols.append(i)
    N_genes = len(Symbols)

    # Create an empty dictionary with lists as values for each key
    from collections import defaultdict
    dict_GeneTF = defaultdict(list)

    # Set the keys and initialize their values as empty lists
    for v in Symbols:
        dict_GeneTF[v] = []
    dict_length = len(dict_GeneTF)

    # Select from the GeneTF_df only the rows with Gene Symbols of target genes of interest
    for index, row in GeneTF_df.iterrows(
    ):  # iterate along the whole dataframe
        # get the current row GENE_SYMBOL
        i = row['GENE_SYMBOL']
        for value in Symbols:  # check if the current gene is contained in the list of genes of interest
            if i == value:  # if there's correspondence
                TrFa_list = row.TFs  # extract the list of TFs
                for t in TrFa_list:
                    # since each gene can have more than one promoter bound by the same TF,
                    # only distinct values for each transcription factor should be inserted in the dictionary
                    if t not in dict_GeneTF[i]:
                        # add the transcription factor to the list of values corresponding to that gene
                        dict_GeneTF[i].append(t)

    # Order alphabetically the list of TFs for each gene of interest
    for k in dict_GeneTF.keys():
        old = dict_GeneTF[k]  # get the list of TFs
        sorted_TFs = sorted(old, key=lambda s: s.lower(
        ))  # sort the list alphabetically (case-insensitive sorting)
        dict_GeneTF[k] = sorted_TFs

    # For each gene, add to the dictionary its ENTREZ_GENE_ID and the GENE_SET it belongs to.
    # These are added as two lists at the end of the value list of each key (i.e. gene): clearly the list containing
    # the ENTREZ_GENE_ID will always have one string element for each key, while the list containing the gene sets
    # can have one or more elements, depending on the number of sets the corresponding gene belongs to.

    # Get distinct Gene Symbols of genes of interest (considering only once the genes that belongs to multiple sets)
    Symbols_distinct = []
    for value in Symbols:
        if value not in Symbols_distinct:
            Symbols_distinct.append(value)

    for value in Symbols_distinct:
        row = EntrezConversion_df.loc[EntrezConversion_df['GENE_SYMBOL'] ==
                                      value]
        # get the ENTREZ_GENE_ID (in case of gene belonging to multiple sets this list will contain
        # its ENTREZ_GENE_ID as many times as the number of sets it belongs to)
        row_entrez_id = list(row.ENTREZ_GENE_ID)
        N_eid = len(row_entrez_id)
        if N_eid > 1:
            entrez_id = list(list(
                row_entrez_id[:N_eid -
                              1]))  # consider the ENTREZ_GENE_ID only once
        else:
            entrez_id = list(row_entrez_id)
        sets = list(list(row.GENE_SET))
        # add the ENTREZ_GENE_ID and the GENE_SET as elements in the list of values corresponding to the proper gene
        dict_GeneTF[value].append(entrez_id)
        dict_GeneTF[value].append(sets)

    # So, the general form of this dictionary containing the information we need is the following:
    # dict_GeneTF = {key: value, ...} = {GENE_SYMBOL: [TF1, TF2, TF3, ..., [ENTREZ_GENE_ID], [GENE_SETs]]}

    # Store the number of TFs for each gene of interest in a new dictionary
    from collections import defaultdict
    dict_TFs_gene = defaultdict(int)

    # Initialize the dictionary setting the keys and their initial values
    for k in dict_GeneTF.keys():
        dict_TFs_gene[k] = 0

    # Set the number of TFs that bind to a gene's promotes as value of the corresping key in the dictionary
    for k in dict_GeneTF.keys():
        transcription_factors = dict_GeneTF[k][:-2]
        number_TFs = len(transcription_factors)
        dict_TFs_gene[k] = number_TFs

    # Generate an histogram showing the previous distribution, that is the number of TFs that bind
    # to each gene's promoters (sorting this number from the highest to the smallest)

    # Convert the dictionary into a pandas dataframe
    TFs_gene_unsorted_df = pd.DataFrame(list(dict_TFs_gene.items()),
                                        columns=['GENE_SYMBOL', '#TFs'])

    # Sort the dataframe according to the number of TFs for each gene
    TFs_gene_df = TFs_gene_unsorted_df.sort_values(by='#TFs', ascending=0)

    # Add to the dataframe a column for storing also the Entrez Gene ID of each gene besides the already present Gene Symbols
    TFs_gene_df['ENTREZ_GENE_ID'] = ''

    # Add the correct Entrez Gene ID for each gene
    for index, row in TFs_gene_df.iterrows():
        sym = row['GENE_SYMBOL']
        eid = EntrezConversion_df.loc[EntrezConversion_df['GENE_SYMBOL'] ==
                                      sym, 'ENTREZ_GENE_ID'].iloc[0]
        TFs_gene_df.set_value(index, 'ENTREZ_GENE_ID', eid)

    # Export the dataframe into an Excel file
    writer = ExcelWriter(
        './1_Transcription_Factors/Number of TFs for each gene of interest.xlsx'
    )
    TFs_gene_df.to_excel(writer, 'Sheet1', index=False)
    writer.save()

    # Export the dictionary of genes of interest and their TFs, ENTREZ GENE ID and GENE_SETs
    # Save the dictionary into a pickle file
    pickle.dump(dict_GeneTF,
                open('./1_Transcription_Factors/dict_GeneTF.p', 'wb'))

    # Save the dictionary as a .xlsx file
    workbook = xlsxwriter.Workbook(
        './1_Transcription_Factors/dict_GeneTF.xlsx')
    worksheet = workbook.add_worksheet()
    # Set the headers of the columns
    worksheet.write(0, 0, 'GENE_SYMBOL')
    worksheet.write(0, 1, 'Transcription Factors')

    row = 1
    col = 0
    # Print the dictionary
    for key in dict_GeneTF.keys():
        row += 1
        worksheet.write(row, col, key)
        for item in dict_GeneTF[key]:
            worksheet.write(row, col + 1, ''.join(item))
            if item == dict_GeneTF[key][
                    -2]:  # the second to last element is the Entrez Gene ID
                worksheet.write(row, col + 2, 'Entrez Gene ID')
            if item == dict_GeneTF[key][
                    -1]:  # the last element is the list of gene sets
                worksheet.write(row, col + 2, 'Gene Set')
            row += 1
    workbook.close()

    # Save the dictionary as a .txt file
    with open('./1_Transcription_Factors/dict_GeneTF.txt', 'w') as fp:
        for p in dict_GeneTF.items():
            fp.write('%s : %s\n\n' % p)

    return dict_GeneTF
Ejemplo n.º 3
0
def _expr_distinct(values):
    """Return the distinct items of ``values`` as a list, in first-seen order."""
    seen = set()
    distinct = []
    for value in values:
        if value not in seen:
            seen.add(value)
            distinct.append(value)
    return distinct


def _expr_build_table(regs_df,
                      meta_df,
                      gene_symbols,
                      sample_barcodes,
                      excluded_ensembl_ids=()):
    """Build the aliquot-by-gene FPKM table used by ``extract_expression``.

    :param regs_df: regions dataframe with the columns 'gene_symbol', 'fpkm', 'sample_id' (and 'ensembl_gene_id' when ``excluded_ensembl_ids`` is not empty)
    :param meta_df: metadata dataframe indexed by sample id, with string-valued columns
    :param gene_symbols: distinct Gene Symbols, one per column of the table
    :param sample_barcodes: distinct TCGA aliquot barcodes, one per row of the table
    :param excluded_ensembl_ids: Ensembl Gene IDs whose regions must be ignored
    :return: a Pandas dataframe with a leading, still empty, 'ENTREZ_GENE_ID' row, one row per aliquot, one column per gene plus the 'Sample_ID', 'Tumor' and 'Patient_ID' columns
    """
    barcode_col = 'biospecimen__bio__bcr_sample_barcode'
    info_cols = ['Sample_ID', 'Tumor', 'Patient_ID']

    # {gene symbol: {aliquot barcode: [fpkm values]}}
    fpkm_values = {
        symbol: {barcode: [] for barcode in sample_barcodes}
        for symbol in gene_symbols
    }
    for _, region in regs_df.iterrows():
        # get the aliquot corresponding to the sample of the current region
        aliquot = meta_df.at[region['sample_id'], barcode_col]
        if excluded_ensembl_ids and region[
                'ensembl_gene_id'] in excluded_ensembl_ids:
            continue
        # store the value as a float with maximum 6 decimal numbers
        fpkm_values[region['gene_symbol']][aliquot].append(
            round(float(region['fpkm']), 6))

    # Rows are the TCGA aliquots, columns are the genes followed by the sample information.
    # The gene list is passed as it is (not wrapped into another list), so that the
    # columns are a flat index whatever the pandas version in use.
    table = pd.DataFrame(index=sample_barcodes, columns=gene_symbols).join(
        pd.DataFrame(index=sample_barcodes, columns=info_cols))

    # Only the first expression value found for each gene in each aliquot is reported
    for symbol, per_aliquot in fpkm_values.items():
        for aliquot, values in per_aliquot.items():
            if values:
                table.at[aliquot, symbol] = values[0]

    # Add the name of each sample, the tumor code and the patient's ID for each aliquot
    for sample_id, sample_meta in meta_df.iterrows():
        aliquot = sample_meta[barcode_col]
        table.at[aliquot, 'Sample_ID'] = sample_id
        table.at[aliquot, 'Tumor'] = sample_meta['clinical__admin__disease_code']
        table.at[aliquot, 'Patient_ID'] = sample_meta[
            'clinical__shared__patient_id']

    # Add a row at the beginning of the dataframe, to be filled with the Entrez Gene IDs
    header_row = pd.DataFrame(index=['ENTREZ_GENE_ID'], columns=table.columns)
    return pd.concat([header_row, table])


def extract_expression(tumor, platform, gencode_version):
    """
    The EXTRACT_EXPRESSION operation extracts expression values from TCGA for all the genes of interest and their candidate regulatory genes. Intermediate results files are exported locally during the execution of the function, while the final dataframes are returned as Pandas dataframes and exported locally in the Excel files 'Gene_Expression-InterestGenes.xlsx' and 'Gene_Expression-RegulatoryGenes.xlsx'.

    The function reads './Genes_of_Interest.xlsx', './2_Regulatory_Genes/dict_RegulGenes.p', './0_Genes_Mapping/Genes_Mapping.xlsx' and, only when repeated aliquots are found, './3_TCGA_Data/Common_Aliquots.txt'. The query is executed on the remote GMQL server, so a network connection is required.

    :param tumor: full name of the tumor of interest, encoded as a string (e.g. 'Ovarian Serous Cystadenocarcinoma', 'Breast Invasive Carcinoma', ...)
    :param platform: number identifying the sequencing platform (either 27 for the 27k probes sequencing platform or 450 for the 450k probes sequencing platform)
    :param gencode_version: number representing the GENCODE genomic annotations to use (currently, for assembly GRCh38, versions 22, 24 and 27 can be used)
    :return: two Pandas dataframes (genes of interest, candidate regulatory genes), both with the TCGA aliquots as rows, preceded by an 'ENTREZ_GENE_ID' row
    :raises ValueError: if the tumor, the platform or the GENCODE version is not supported

    Example::

        import genereg as gr
        expr_interest_df, expr_regul_df = gr.GeneExpression.extract_expression(tumor='Ovarian Serous Cystadenocarcinoma', platform=27, gencode_version=22)
    """
    from collections import Counter

    # Check input parameters
    tcga_tumors = [
        "Acute Myeloid Leukemia", "Adrenocortical Carcinoma",
        "Bladder Urothelial Carcinoma", "Brain Lower Grade Glioma",
        "Breast Invasive Carcinoma",
        "Cervical Squamous Cell Carcinoma and Endocervical Adenocarcinoma",
        "Cholangiocarcinoma", "Colon Adenocarcinoma", "Esophageal Carcinoma",
        "Glioblastoma Multiforme", "Head and Neck Squamous Cell Carcinoma",
        "Kidney Chromophobe", "Kidney Renal Clear Cell Carcinoma",
        "Kidney Renal Papillary Cell Carcinoma",
        "Liver Hepatocellular Carcinoma", "Lung Adenocarcinoma",
        "Lung Squamous Cell Carcinoma",
        "Lymphoid Neoplasm Diffuse Large B-cell Lymphoma", "Mesothelioma",
        "Ovarian Serous Cystadenocarcinoma", "Pancreatic Adenocarcinoma",
        "Pheochromocytoma and Paraganglioma", "Prostate Adenocarcinoma",
        "Rectum Adenocarcinoma", "Sarcoma", "Skin Cutaneous Melanoma",
        "Stomach Adenocarcinoma", "Testicular Germ Cell Tumors", "Thymoma",
        "Thyroid Carcinoma", "Uterine Carcinosarcoma",
        "Uterine Corpus Endometrial Carcinoma", "Uveal Melanoma"
    ]
    if tumor not in tcga_tumors:
        raise ValueError(
            'PATHOLOGY NOT SUPPORTED! You can analyze one of these 33 types of TCGA tumors: '
            + (', '.join(tcga_tumors)))

    if platform not in [27, 450]:
        raise ValueError(
            'PLATFORM NOT RECOGNIZED! Sequencing platforms available: 27 and 450'
        )

    if gencode_version not in [22, 24, 27]:
        raise ValueError('GRCh38 GENCODE versions available are 22, 24 and 27')

    barcode_col = 'biospecimen__bio__bcr_sample_barcode'
    info_cols = ['Sample_ID', 'Tumor', 'Patient_ID']

    # Load the list of genes of interest.
    # The sheet name is passed positionally because the name of this keyword
    # changed across pandas releases ('sheetname' -> 'sheet_name').
    EntrezConversion_df = pd.read_excel('./Genes_of_Interest.xlsx',
                                        'Sheet1',
                                        header=0,
                                        converters={
                                            'GENE_SYMBOL': str,
                                            'ENTREZ_GENE_ID': str,
                                            'GENE_SET': str
                                        })

    # Create a list containing the distinct Gene Symbols of the genes of interest
    genesSYM_of_interest = _expr_distinct(EntrezConversion_df['GENE_SYMBOL'])

    # Import the dictionary of genes of interest with their candidate regulatory genes.
    # NOTE: pickle must only be used on trusted files; this one is produced locally by the previous steps.
    with open('./2_Regulatory_Genes/dict_RegulGenes.p', 'rb') as pickle_file:
        dict_RegulGenes = pickle.load(pickle_file)

    # Import the gene-TFs mapping dataframe
    Mapping_df = pd.read_excel('./0_Genes_Mapping/Genes_Mapping.xlsx',
                               'Sheet1',
                               header=0,
                               converters={
                                   'ENTREZ_GENE_ID': str,
                                   'HGNC_ID': str
                               })

    # Create a list containing the distinct Gene Symbols of the regulatory genes of genes of interest
    regulatory_genesSYM = _expr_distinct(
        gene for genes in dict_RegulGenes.values() for gene in genes)

    # Execute the query for the extraction of gene expression values on the remote server, using the PyGMQL Python library
    gl.set_remote_address('http://gmql.eu/gmql-rest/')
    gl.login()
    gl.set_mode('remote')

    # Load the TCGA datasets to be used in the query
    methylation_dataset = gl.load_from_remote(
        remote_name='GRCh38_TCGA_methylation', owner='public')
    expression_dataset = gl.load_from_remote(
        remote_name='GRCh38_TCGA_gene_expression', owner='public')

    # Identify the sequencing platform to be used (the platform has already been validated)
    seq_platform = {
        27: 'Illumina Human Methylation 27',
        450: 'Illumina Human Methylation 450'
    }[platform]

    # Extract all the samples for the current tumor and platform
    all_methyl = methylation_dataset.meta_select(
        (methylation_dataset['manually_curated__cases__disease_type'] == tumor)
        & (methylation_dataset['manually_curated__platform'] == seq_platform)
        & ((methylation_dataset['biospecimen__bio__sample_type'] ==
            'Primary Tumor')
           | (methylation_dataset['biospecimen__bio__sample_type'] ==
              'Recurrent Tumor'))
        & (methylation_dataset[
            'clinical__shared__history_of_neoadjuvant_treatment'] == 'No'))
    all_expr = expression_dataset.meta_select(
        (expression_dataset['manually_curated__cases__disease_type'] == tumor)
        & ((expression_dataset['biospecimen__bio__sample_type'] ==
            'Primary Tumor')
           | (expression_dataset['biospecimen__bio__sample_type'] ==
              'Recurrent Tumor'))
        & (expression_dataset[
            'clinical__shared__history_of_neoadjuvant_treatment'] == 'No'))

    # Gene Expression: keep only the expression samples whose barcode is also in the methylation samples
    expr_0 = all_expr.reg_project(field_list=[
        'ensembl_gene_id', 'entrez_gene_id', 'gene_symbol', 'fpkm'
    ])
    expr = expr_0.meta_select(semiJoinDataset=all_methyl,
                              semiJoinMeta=[barcode_col])

    # Materialize the results into a GDataframe
    expr_Gdf = expr.materialize('./(MaterializeResults)')

    # The result dataset is loaded as a GDataframe, an object containing two pandas dataframes, one for the region data and one for the metadata.
    # Get the two pandas dataframes:
    expr_df_regs = expr_Gdf.regs
    expr_df_meta = expr_Gdf.meta

    # Rename 'chr', 'start', and 'stop' columns header
    expr_df_regs.rename(columns={
        'chr': 'chrom',
        'start': 'left',
        'stop': 'right'
    },
                        inplace=True)
    # Change index into progressive integer numbers and store the name of the sample in another column
    expr_df_regs['sample_id'] = expr_df_regs.index
    expr_df_regs.index = range(len(expr_df_regs))

    # Convert unknown values (NaN) to empty strings
    expr_df_regs = expr_df_regs.fillna('')

    # Convert all the metadata values into strings, since they're encoded as lists in Python
    for column in list(expr_df_meta.columns):
        expr_df_meta[column] = expr_df_meta[column].map(''.join)

    # Since we have to extract the expression values for each distinct sample barcode (aliquot), we collect these identifiers
    meta_barcodes = list(expr_df_meta[barcode_col])

    # Check which are repeated aliquots, if present
    multiple_aliquots = {
        barcode
        for barcode, count in Counter(meta_barcodes).items() if count > 1
    }

    if multiple_aliquots:
        # Among the repeated aliquots, discard the ones shipped in 2011 (keeping the most recent ones).
        # The samples to discard are collected first and dropped afterwards, so that
        # the dataframe is never modified while it is being iterated.
        samples_to_remove = [
            sample_id for sample_id, sample_meta in expr_df_meta.iterrows()
            if sample_meta[barcode_col] in multiple_aliquots
            and sample_meta['biospecimen__bio__year_of_shipment'] == '2011'
        ]
        expr_df_meta.drop(samples_to_remove, inplace=True)

        # Import the list of aliquots in the methylation dataset (blank lines are ignored)
        with open('./3_TCGA_Data/Common_Aliquots.txt', 'r') as text_file:
            aliquots = {
                line
                for line in text_file.read().split('\n') if line
            }

        # Extract the new list of distinct TCGA Aliquots to extract
        kept_barcodes = []
        not_common = []
        for sample_id, sample_meta in expr_df_meta.iterrows():
            if sample_meta[barcode_col] in aliquots:
                kept_barcodes.append(sample_meta[barcode_col])
            else:
                not_common.append(sample_id)
        expr_sample_barcodes = _expr_distinct(kept_barcodes)
        expr_df_meta.drop(not_common, inplace=True)
        samples_to_remove.extend(not_common)

        # Remove regions that corresponded to eliminated samples
        expr_df_regs = expr_df_regs.loc[~(
            expr_df_regs['sample_id'].isin(samples_to_remove))].copy()
    else:
        # When no aliquot is repeated, all the samples are kept and 'Common_Aliquots.txt' is not read
        expr_sample_barcodes = _expr_distinct(meta_barcodes)

    # Export the metadata dataframe setting the TCGA aliquots as indexes.
    Metadata_df = expr_df_meta.copy()
    Metadata_df['id_sample'] = Metadata_df.index
    Metadata_df.set_index(barcode_col, inplace=True)
    with ExcelWriter(
            './3_TCGA_Data/Gene_Expression/EXPR_(Metadata).xlsx') as writer:
        Metadata_df.to_excel(writer, sheet_name='Sheet1')

    # Extract from the expression dataset all the regions that belong to genes of interest
    expr_df_regs_interest = expr_df_regs.loc[expr_df_regs['gene_symbol'].isin(
        genesSYM_of_interest)].copy()
    # Extract from the expression dataset all the regions that belong to regulatory genes of genes of interest
    expr_df_regs_regulatory = expr_df_regs.loc[
        expr_df_regs['gene_symbol'].isin(regulatory_genesSYM)].copy()

    # Gene expression values for each gene of interest:
    expr_interest_df = _expr_build_table(expr_df_regs_interest, expr_df_meta,
                                         genesSYM_of_interest,
                                         expr_sample_barcodes)

    # Add for each Gene Symbol of our genes of interest the corresponding Entrez Gene ID in the first row of the dataframe
    for _, gene_row in EntrezConversion_df.iterrows():
        expr_interest_df.at['ENTREZ_GENE_ID', gene_row[
            'GENE_SYMBOL']] = gene_row['ENTREZ_GENE_ID']

    # Set empty strings for NaN values in the 'ENTREZ_GENE_ID' row
    for column in info_cols:
        expr_interest_df.at['ENTREZ_GENE_ID', column] = ""

    # Export the dataframe with the gene expression values for our genes of interest for each TCGA aliquot
    with ExcelWriter(
            './3_TCGA_Data/Gene_Expression/Gene_Expression-InterestGenes.xlsx'
    ) as writer:
        expr_interest_df.to_excel(writer, sheet_name='Sheet1')

    # Gene expression values for each candidate regulatory gene of the genes of interest:
    # with GENCODE version 22, the regions of three specific Ensembl Gene IDs are ignored
    if gencode_version == 22:
        excluded_ensembl_ids = ('ENSG00000277726.3', 'ENSG00000275895.3',
                                'ENSGR0000214717.8')
    else:
        excluded_ensembl_ids = ()
    expr_regulatory_df = _expr_build_table(expr_df_regs_regulatory,
                                           expr_df_meta, regulatory_genesSYM,
                                           expr_sample_barcodes,
                                           excluded_ensembl_ids)

    # Add for each Gene Symbol of the regulatory genes the corresponding Entrez Gene ID in the first row of the dataframe
    for symbol in regulatory_genesSYM:
        # 'PTRF' is looked up in the mapping table under the symbol 'CAVIN1'
        mapped_symbol = 'CAVIN1' if symbol == 'PTRF' else symbol
        entrez_id = Mapping_df.loc[Mapping_df['GENE_SYMBOL'] == mapped_symbol,
                                   'ENTREZ_GENE_ID'].iloc[0]
        expr_regulatory_df.at['ENTREZ_GENE_ID', symbol] = entrez_id

    # Set empty strings for NaN values in the 'ENTREZ_GENE_ID' row
    for column in info_cols:
        expr_regulatory_df.at['ENTREZ_GENE_ID', column] = ""

    # Export the dataframe with the gene expression values for the regulatory genes of our genes of interest for each TCGA aliquot
    with ExcelWriter(
            './3_TCGA_Data/Gene_Expression/Gene_Expression-RegulatoryGenes.xlsx'
    ) as writer:
        expr_regulatory_df.to_excel(writer, sheet_name='Sheet1')

    return expr_interest_df, expr_regulatory_df