Beispiel #1
0
def main(transcript_expression_path, gtf_path, genome_path, HLAclass_path,
         HLAtypes_path, HLAtypes_pan_path, netMHC_path, netMHC_pan_path,
         threshold, mosea_path, mxfinder_path, output_path, tumor_specific,
         remove_temp_files, name_user, cluster):
    """Run the "IR_ISOTOPE_part2" stage of the pipeline and exit the process.

    The steps are executed in order: write an exon-only copy of the GTF,
    collect the ``*coverage_sorted`` files, compute and assemble the intron
    coverage tables, keep the significant introns, derive peptide sequences,
    filter them with ``lib/IR/filter_results.R``, select the fasta candidates
    and finally launch the first part of the NetMHC and NetMHCpan runs.

    Args:
        transcript_expression_path: Forwarded to ``get_peptide_sequence``.
        gtf_path: Tab-separated annotation with nine columns and ``#``
            comment lines. An exon-only copy is written to
            ``<gtf_path>.exon``; the original path is forwarded to
            ``get_peptide_sequence``.
        genome_path: Forwarded to ``get_peptide_sequence``.
        HLAclass_path: Forwarded to both NetMHC runners.
        HLAtypes_path: Forwarded to ``run_netMHC_classI_slurm_part1``.
        HLAtypes_pan_path: Forwarded to ``run_netMHCpan_classI_slurm_part1``.
        netMHC_path: Forwarded to ``run_netMHC_classI_slurm_part1``.
        netMHC_pan_path: Forwarded to ``run_netMHCpan_classI_slurm_part1``.
        threshold: Converted with ``str`` and passed as the third argument of
            ``filter_results.R``.
        mosea_path: Forwarded to ``get_peptide_sequence``.
        mxfinder_path: Forwarded to ``get_peptide_sequence``.
        output_path: Directory that holds the inputs produced by earlier
            steps and receives every output of this stage.
        tumor_specific: When truthy the coverage step reads
            ``IR_expressed_genes_filtered2.tab``, otherwise
            ``IR_expressed_genes.tab``.
        remove_temp_files: Forwarded to ``get_peptide_sequence``.
        name_user: Forwarded to ``get_coverageBed_adapter``.
        cluster: Forwarded to ``get_coverageBed_adapter`` and to both NetMHC
            runners.

    Raises:
        SystemExit: Always. Status 0 after a complete run, status 1 when any
            step raised an exception (the error is logged first).
    """

    def run_shell(command):
        # os.system never raises: a failing step would otherwise go
        # unnoticed. The run stays best-effort (e.g. ``mv`` with nothing left
        # to move), but every non-zero status is reported.
        # NOTE: paths are interpolated unquoted, so they must not contain
        # whitespace or shell metacharacters.
        status = os.system(command)
        if status != 0:
            logger.warning("Command exited with status %s: %s", status,
                           command)
        return status

    try:

        logger.info("Starting execution IR_ISOTOPE_part2")

        # 0.1. Create a gtf with only the exon information
        # NOTE(review): the exon-only file is written here but not referenced
        # again in this function -- confirm which later step consumes it.
        gtf_path_exon = '{}.{}'.format(gtf_path, "exon")
        gtf = pd.read_table(gtf_path, delimiter="\t", header=None, comment="#")
        gtf.columns = [
            'chr', 'type1', 'type2', 'start', 'end', 'dot', 'strand', 'dot2',
            'rest_information'
        ]
        # Keep only exons on chromosomes 1 to 22, X and Y. The chromosome
        # column may be parsed as str or as a mix of int and str (it also
        # holds "X"/"Y"), so the comparison is done on the string form;
        # comparing against integers would drop rows stored as "1".."22".
        wanted_chromosomes = [str(number) for number in range(1, 23)]
        wanted_chromosomes += ["X", "Y"]
        is_exon = gtf['type2'].isin(["exon"])
        is_wanted_chromosome = gtf['chr'].astype(str).isin(wanted_chromosomes)
        # Explicit copy so the assignment below does not target a view.
        gtf = gtf.loc[is_exon & is_wanted_chromosome].copy()
        # Add the "chr" prefix
        gtf['chr'] = 'chr' + gtf['chr'].astype(str)
        # Save the gtf in external file
        gtf.to_csv(gtf_path_exon,
                   index=False,
                   header=False,
                   sep='\t',
                   quoting=csv.QUOTE_NONE)

        # 6. Create the folder, if it doesn't exist
        logger.info("Part6...")
        if not os.path.exists(output_path + "/coverageBed"):
            os.makedirs(output_path + "/coverageBed")
        # Move all the coverage_sorted files to the created directory
        command1 = "mv " + output_path + "/*coverage_sorted " + output_path + "/coverageBed/"
        run_shell(command1)

        # 7.1. Get the coverage for each candidate
        logger.info("Part7.1...")
        if tumor_specific:
            output_path_filtered2 = output_path + "/IR_expressed_genes_filtered2.tab"
        else:
            output_path_filtered2 = output_path + "/IR_expressed_genes.tab"

        get_coverageBed_adapter(output_path_filtered2,
                                output_path + "/random_introns.bed",
                                output_path + "/coverageBed", output_path,
                                name_user, cluster)

        # 7.2. Assemble all pieces into one single file, keeping only the
        # header line of the first piece
        logger.info("Part7.2...")
        command2 = "awk 'FNR==1 && NR!=1{next;}{print}' " + output_path + "/get_coverageBed_*.tab > " + output_path + "/IR_coverage.tab"
        run_shell(command2)

        # 7.3. Get the introns with a significant p_value: header line plus
        # the rows with column 7 <= 0.05 and column 6 > 0
        logger.info("Part7.3...")
        command3="head -n1 "+output_path+"/IR_coverage.tab > "+output_path+"/IR_significant_introns.tab; " \
                   "awk '{ if ($7 <= 0.05 && $6 > 0) print }' "+output_path+"/IR_coverage.tab >> "+output_path+"/IR_significant_introns.tab"
        run_shell(command3)

        # 8. Get the peptide sequence associated
        logger.info("Part8...")
        get_peptide_sequence(output_path + "/IR_significant_introns.tab",
                             transcript_expression_path, gtf_path,
                             output_path + "/IR_peptide_sequence.fa",
                             output_path + "/IR_fasta_sequence.fa",
                             output_path + "/IR_ORF.tab",
                             output_path + "/IR_ORF_sequences.tab", mosea_path,
                             genome_path, mxfinder_path, remove_temp_files)

        # 9. Filter the significant results with the R script that lives
        # next to this file
        logger.info("Part9...")
        dir_path = os.path.dirname(os.path.realpath(__file__))
        command4="Rscript "+dir_path+"/lib/IR/filter_results.R "+output_path + "/IR_ORF.tab"+" "+ \
                 output_path + "/IR_ORF_filtered.tab " + str(threshold) + " "+ output_path + "/IR_ORF_filtered_peptide_change.tab"
        run_shell(command4)

        # 10. Select the fasta candidates for being run to the epitope analysis
        logger.info("Part10...")
        # Create the folder, if it doesn't exist
        if not os.path.exists(output_path + "/IR_fasta_files"):
            os.makedirs(output_path + "/IR_fasta_files")
        select_fasta_candidates(
            output_path + "/IR_ORF_filtered_peptide_change.tab",
            output_path + "/IR_peptide_sequence.fa",
            output_path + "/IR_peptide_sequence_filtered.fa",
            output_path + "/IR_fasta_files")

        # 11. Run netMHC-4.0_part1
        logger.info("Part11...")
        if not os.path.exists(output_path + "/IR_NetMHC-4.0_files"):
            os.makedirs(output_path + "/IR_NetMHC-4.0_files")
        run_netMHC_classI_slurm_part1(
            output_path + "/IR_ORF_filtered_peptide_change.tab", HLAclass_path,
            HLAtypes_path, output_path + "/IR_fasta_files",
            output_path + "/IR_NetMHC-4.0_files",
            output_path + "/IR_NetMHC-4.0_neoantigens_type_gained.tab",
            output_path + "/IR_NetMHC-4.0_neoantigens_type_gained_all.tab",
            output_path + "/IR_NetMHC-4.0_neoantigens_type_lost.tab",
            output_path + "/IR_NetMHC-4.0_neoantigens_type_lost_all.tab",
            output_path + "/IR_NetMHC-4.0_junctions_ORF_neoantigens.tab",
            netMHC_path, cluster)

        # 12. Run netMHCpan-4.0_part1
        logger.info("Part12...")
        if not os.path.exists(output_path + "/IR_NetMHCpan-4.0_files"):
            os.makedirs(output_path + "/IR_NetMHCpan-4.0_files")
        run_netMHCpan_classI_slurm_part1(
            output_path + "/IR_ORF_filtered_peptide_change.tab", HLAclass_path,
            HLAtypes_pan_path, output_path + "/IR_fasta_files",
            output_path + "/IR_NetMHCpan-4.0_files",
            output_path + "/IR_NetMHCpan-4.0_neoantigens_type_gained.tab",
            output_path + "/IR_NetMHCpan-4.0_neoantigens_type_gained_all.tab",
            output_path + "/IR_NetMHCpan-4.0_neoantigens_type_lost.tab",
            output_path + "/IR_NetMHCpan-4.0_neoantigens_type_lost_all.tab",
            output_path + "/IR_NetMHCpan-4.0_junctions_ORF_neoantigens.tab",
            netMHC_pan_path, cluster)

        # SystemExit is not an Exception subclass, so it is not intercepted
        # by the handler below.
        sys.exit(0)

    except Exception as error:
        logger.error('ERROR: ' + repr(error))
        logger.error("Aborting execution")
        sys.exit(1)
Beispiel #2
0
def main():
    """Run the "IR_epydoor_part2" stage with hard-coded paths and exit.

    All inputs, tool locations and the output directory are fixed below.
    The steps are executed in order: collect the ``*coverage_sorted`` files,
    compute and assemble the intron coverage tables, keep the significant
    introns, derive peptide sequences, filter them with
    ``lib/IR/filter_results.R``, select the fasta candidates and launch the
    first part of the NetMHC and NetMHCpan runs.

    Raises:
        SystemExit: Always. Status 0 after a complete run, status 1 when any
            step raised an exception (the error is logged first).
    """

    def run_shell(command):
        # os.system never raises: a failing step would otherwise go
        # unnoticed. The run stays best-effort (e.g. ``mv`` with nothing left
        # to move), but every non-zero status is reported.
        status = os.system(command)
        if status != 0:
            logger.warning("Command exited with status %s: %s", status,
                           command)
        return status

    try:

        logger.info("Starting execution IR_epydoor_part2")

        transcript_expression_path = "/projects_rg/SCLC_cohorts/George/tables/iso_tpm_George_Peifer_Rudin_Yokota.tab"
        gtf_path = "/projects_rg/SCLC_cohorts/annotation/Homo_sapiens.GRCh37.75.formatted.only_protein_coding.gtf"
        codons_gtf_path = "/projects_rg/SCLC_cohorts/annotation/Homo_sapiens.GRCh37.75.codons.gtf"
        mosea = "/genomics/users/juanluis/Software/MoSEA-master/mosea.py"
        fasta_genome = "/genomics/users/juanluis/Software/MoSEA-master/test_files/genome/hg19.fa"
        orfs_scripts = "/genomics/users/juanluis/comprna/MxFinder/extract_orfs.py"
        interpro = "/soft/EB_repo/bio/sequence/programs/noarch/interproscan/5.33-72.0/interproscan.sh"
        IUPred = "/projects_rg/SCLC_cohorts/soft/IUPred2A"
        HLAclass_path = "/projects_rg/SCLC_cohorts/tables/PHLAT_summary_ClassI_all_samples.out"
        HLAtypes_path = "/projects_rg/SCLC_cohorts/tables/NetMHC-4.0_HLA_types_accepted.tab"
        HLAtypes_pan_path = "/projects_rg/SCLC_cohorts/tables/NetMHCpan-4.0_HLA_types_accepted.tab"
        netMHC_path = "/projects_rg/SCLC_cohorts/soft/netMHC-4.0/netMHC"
        netMHC_pan_path = "/projects_rg/SCLC_cohorts/soft/netMHCpan-4.0/netMHCpan"
        remove_temp_files = True
        tumor_specific = True
        name_user = "******"
        output_path = "/users/genomics/juanluis/SCLC_cohorts/SCLC/epydoor/IR"
        # ONLY FOR MARVIN
        #python2 = "Python/2.7.14-foss-2017b"
        # ONLY FOR HYDRA
        python2 = "Python/2.7.11"

        # 6. Create the folder, if it doesn't exist
        logger.info("Part6...")
        if not os.path.exists(output_path + "/coverageBed"):
            os.makedirs(output_path + "/coverageBed")
        # Move all the coverage_sorted files to the created directory
        command1 = "mv " + output_path + "/*coverage_sorted " + output_path + "/coverageBed/"
        run_shell(command1)

        # 7.1. Get the coverage for each candidate
        logger.info("Part7.1...")
        if tumor_specific:
            output_path_filtered2 = output_path + "/IR_expressed_genes_filtered2.tab"
        else:
            output_path_filtered2 = output_path + "/IR_expressed_genes.tab"

        get_coverageBed_adapter(output_path_filtered2,
                                output_path + "/random_introns.bed",
                                output_path + "/coverageBed", output_path,
                                name_user)

        # 7.2. Assemble all pieces into one single file, keeping only the
        # header line of the first piece
        logger.info("Part7.2...")
        command2 = "awk 'FNR==1 && NR!=1{next;}{print}' " + output_path + "/get_coverageBed_*.tab > " + output_path + "/IR_coverage.tab"
        run_shell(command2)

        # 7.3. Get the introns with a significant p_value: header line plus
        # the rows with column 7 <= 0.05
        logger.info("Part7.3...")
        command3="head -n1 "+output_path+"/IR_coverage.tab > "+output_path+"/IR_significant_introns.tab; " \
                   "awk '{ if ($7 <= 0.05) print }' "+output_path+"/IR_coverage.tab >> "+output_path+"/IR_significant_introns.tab"
        run_shell(command3)

        # 8. Get the peptide sequence associated
        logger.info("Part8...")
        get_peptide_sequence(
            output_path + "/IR_significant_introns.tab",
            transcript_expression_path, gtf_path, codons_gtf_path,
            output_path + "/IR_peptide_sequence.fa",
            output_path + "/IR_fasta_sequence.fa", output_path + "/IR_ORF.tab",
            output_path + "/IR_ORF_sequences.tab",
            output_path + "/IR_Interpro.tab", output_path + "/IR_IUPred.tab",
            mosea, fasta_genome, orfs_scripts, interpro, IUPred,
            remove_temp_files, python2)

        # 9. Filter the significant results with the R script that lives
        # next to this file
        logger.info("Part9...")
        dir_path = os.path.dirname(os.path.realpath(__file__))
        command4="module load R; Rscript "+dir_path+"/lib/IR/filter_results.R "+output_path + "/IR_ORF.tab"+" "+ \
                 output_path + "/IR_ORF_filtered.tab" +" "+output_path + "/IR_ORF_filtered_peptide_change.tab"
        run_shell(command4)

        # 10. Select the fasta candidates for being run to the epitope analysis
        logger.info("Part10...")
        # Create the folder, if it doesn't exist
        if not os.path.exists(output_path + "/IR_fasta_files"):
            os.makedirs(output_path + "/IR_fasta_files")
        select_fasta_candidates(
            output_path + "/IR_ORF_filtered_peptide_change.tab",
            output_path + "/IR_peptide_sequence.fa",
            output_path + "/IR_peptide_sequence_filtered.fa",
            output_path + "/IR_fasta_files")

        # 11. Run netMHC-4.0_part1
        logger.info("Part11...")
        if not os.path.exists(output_path + "/IR_NetMHC-4.0_files"):
            os.makedirs(output_path + "/IR_NetMHC-4.0_files")
        run_netMHC_classI_slurm_part1(
            output_path + "/IR_ORF_filtered_peptide_change.tab", HLAclass_path,
            HLAtypes_path, output_path + "/IR_fasta_files",
            output_path + "/IR_NetMHC-4.0_files",
            output_path + "/IR_NetMHC-4.0_neoantigens_type_3.tab",
            output_path + "/IR_NetMHC-4.0_neoantigens_type_3_all.tab",
            output_path + "/IR_NetMHC-4.0_neoantigens_type_2.tab",
            output_path + "/IR_NetMHC-4.0_neoantigens_type_2_all.tab",
            output_path + "/IR_NetMHC-4.0_junctions_ORF_neoantigens.tab",
            netMHC_path)

        # 12. Run netMHCpan-4.0_part1
        logger.info("Part12...")
        if not os.path.exists(output_path + "/IR_NetMHCpan-4.0_files"):
            os.makedirs(output_path + "/IR_NetMHCpan-4.0_files")
        run_netMHCpan_classI_slurm_part1(
            output_path + "/IR_ORF_filtered_peptide_change.tab", HLAclass_path,
            HLAtypes_pan_path, output_path + "/IR_fasta_files",
            output_path + "/IR_NetMHCpan-4.0_files",
            output_path + "/IR_NetMHCpan-4.0_neoantigens_type_3.tab",
            output_path + "/IR_NetMHCpan-4.0_neoantigens_type_3_all.tab",
            output_path + "/IR_NetMHCpan-4.0_neoantigens_type_2.tab",
            output_path + "/IR_NetMHCpan-4.0_neoantigens_type_2_all.tab",
            output_path + "/IR_NetMHCpan-4.0_junctions_ORF_neoantigens.tab",
            netMHC_pan_path)
        logger.info(
            "Wait until all jobs have finished. Then, go on with part3")

        # SystemExit is not an Exception subclass, so it is not intercepted
        # by the handler below.
        sys.exit(0)

    except Exception as error:
        logger.error('ERROR: ' + repr(error))
        logger.error("Aborting execution")
        sys.exit(1)