# Example #1
# 0
def main(readcounts_path, gtf_path, genome_path, transcript_expression_path,
         HLAclass_path, HLAtypes_path, HLAtypes_pan_path, netMHC_path,
         netMHC_pan_path, mosea_path, orfs_scripts, output_path, repeats_path,
         threshold, max_length, tumor_specific, control_path, Intropolis_path,
         mutations_path, CHESS_SE_path, flag_Rudin, remove_temp_files,
         name_user):
    """Run steps 6-16 of the exonization pipeline (ISOTOPE, part 2).

    The steps are executed strictly in order; every file path handed to a
    helper or a shell command is built under ``output_path``:

    6.  move the ``*coverage_sorted`` files into ``coverageBed/``
    7.  compute the coverage per exonization and merge the partial tables
    8.  annotate exonizations with nearby mutations (window argument: 200)
    9.  split mutated / non-mutated cases (R script)
    10. optionally filter the non-mutated cases against controls
    11. concatenate mutated and non-mutated cases
    12. obtain the peptide sequences
    13. filter the results (R script)
    14. select the fasta candidates
    15./16. launch the NetMHC and NetMHCpan class I jobs (part 1)

    This function never returns: it terminates the interpreter with exit
    status 0 on success and 1 when any step raises an ``Exception``.

    Args:
        readcounts_path: Not used by this part; kept for interface
            compatibility.
        gtf_path: Passed to ``extract_exonized_junctions`` and
            ``get_peptide_sequence``.
        genome_path: Passed to ``extract_exonized_junctions`` and
            ``get_peptide_sequence``.
        transcript_expression_path: Passed to ``get_peptide_sequence``.
        HLAclass_path: Passed to both NetMHC launchers.
        HLAtypes_path: Passed to ``run_netMHC_classI_slurm_part1``.
        HLAtypes_pan_path: Passed to ``run_netMHCpan_classI_slurm_part1``.
        netMHC_path: Passed to ``run_netMHC_classI_slurm_part1``.
        netMHC_pan_path: Passed to ``run_netMHCpan_classI_slurm_part1``.
        mosea_path: Passed to ``extract_exonized_junctions`` and
            ``get_peptide_sequence``.
        orfs_scripts: Passed to ``get_peptide_sequence``.
        output_path: Directory prefix for all inputs and outputs of this
            part. It is concatenated into shell commands without quoting.
        repeats_path: Passed to ``overlap_with_repeats``.
        threshold: Passed to ``get_significant_exonizations``.
        max_length: Passed to ``extract_exonized_junctions``.
        tumor_specific: When truthy, the non-mutated cases are filtered
            against Intropolis, the optional controls and CHESS before
            being joined with the mutated cases.
        control_path: Additional control input, or the literal string
            ``"Missing"`` when there is none.
        Intropolis_path: Passed to ``get_reads_exonizations``.
        mutations_path: Passed to ``check_mutations_nearby``.
        CHESS_SE_path: Passed to ``filter_exonizations_CHESS``.
        flag_Rudin: Not used by this part; kept for interface
            compatibility.
        remove_temp_files: Passed to ``get_peptide_sequence``.
        name_user: Passed to ``get_coverageBed_adapter``.
    """

    def run_shell(command):
        # os.system never raises on a failing command, so the status has to
        # be inspected explicitly. A failure is only reported, not raised:
        # e.g. the "mv" glob may legitimately match nothing on a re-run.
        status = os.system(command)
        if status != 0:
            logger.warning("Shell command returned non-zero status %s: %s",
                           status, command)
        return status

    try:

        logger.info("Starting execution exonizations_ISOTOPE_part2")

        # 6. Create the folder, if it doesn't exist
        logger.info("Part6...")
        logger.info("Moving all coverageBed files...")
        coverage_dir = output_path + "/coverageBed"
        if not os.path.exists(coverage_dir):
            os.makedirs(coverage_dir)
        # Move all the coverage_sorted files to the created directory
        run_shell("mv " + output_path + "/*coverage_sorted " + coverage_dir + "/")

        # 7.1. Get the coverage for each exonization
        logger.info("Part7...")
        # Directory of this script; the R helpers are looked up relative to it
        dir_path = os.path.dirname(os.path.realpath(__file__))
        get_coverageBed_adapter(output_path + "/exonizations_by_sample.tab",
                                output_path + "/random_exonizations.bed",
                                coverage_dir, output_path, name_user)

        # 7.2. Assemble all pieces into one single file. The awk program
        # drops the first line of every file except the first one, so only
        # one header line survives.
        coverage_table = output_path + "/exonizations_by_sample_coverage.tab"
        run_shell("awk 'FNR==1 && NR!=1{next;}{print}' " + output_path +
                  "/get_coverageBed_*.tab > " + coverage_table)

        # 8. Check if in the exonizations there are mutations nearby
        logger.info("Part8...")
        coverage_mut_table = output_path + "/exonizations_by_sample_coverage_mut.tab"
        check_mutations_nearby(coverage_table, mutations_path, 200,
                               coverage_mut_table)

        # 9. Separate between mutated and non-mutated cases
        logger.info("Part9...")
        mutated_path = output_path + "/mutated_exonizations.tab"
        non_mutated_path = output_path + "/non_mutated_exonizations.tab"
        run_shell("Rscript " + dir_path +
                  "/lib/Exonization/separate_mutated_cases.R " +
                  coverage_mut_table + " " + mutated_path + " " +
                  non_mutated_path)

        # 10. Get the tumor specific events
        logger.info("Part10...")
        if tumor_specific:
            logger.info("Get the tumor specific events...")

            # Get the significant exonizations from Intropolis (control)
            logger.info("Intropolis...")
            intropolis_reads = output_path + "/new_exonized_junctions_Intropolis_reads.tab"
            get_reads_exonizations(output_path + "/new_exonized_junctions.tab",
                                   Intropolis_path, intropolis_reads, True)
            intropolis_repeats = output_path + "/new_exonized_junctions_Intropolis_reads_repeatitions.tab"
            overlap_with_repeats(intropolis_reads, repeats_path,
                                 intropolis_repeats)
            intropolis_events = output_path + "/exonizations_by_sample_Intropolis.tab"
            get_significant_exonizations(intropolis_repeats, threshold,
                                         intropolis_events)

            if control_path != "Missing":
                # Get the significant exonizations from normal samples
                logger.info("Additional controls...")
                control_junctions = output_path + "/exonized_junctions_control.tab"
                control_reads = output_path + "/exonized_junctions_control_reads.tab"
                control_events = output_path + "/exonizations_by_sample_control.tab"
                extract_exonized_junctions(control_path, gtf_path,
                                           genome_path, max_length,
                                           control_junctions, mosea_path)
                get_reads_exonizations(control_junctions, control_path,
                                       control_reads, False)
                get_significant_exonizations(control_reads, threshold,
                                             control_events)
            else:
                # Without controls, filter_exonizations receives the
                # "Missing" sentinel in place of the control table.
                control_events = "Missing"

            # Filter exonizations
            logger.info("Filtering events...")
            filtered_path = output_path + "/non_mutated_exonizations_filtered.tab"
            filter_exonizations(non_mutated_path, control_events,
                                intropolis_events, filtered_path,
                                control_path)
            non_mutated_source = output_path + "/non_mutated_exonizations_filtered2.tab"
            filter_exonizations_CHESS(filtered_path, CHESS_SE_path,
                                      non_mutated_source)
        else:
            non_mutated_source = non_mutated_path

        # 11. Join the mutated and non_mutated cases; "tail -n+2" skips the
        # first line of the second table so it is not repeated.
        all_events = output_path + "/all_exonizations.tab"
        run_shell("cat " + mutated_path + " > " + all_events +
                  ";tail -n+2 " + non_mutated_source + " >> " + all_events)

        # 12. Get the peptide sequence associated
        logger.info("Part11...")
        peptide_fasta = output_path + "/exonizations_peptide_sequence.fa"
        dna_fasta = output_path + "/exonizations_fasta_sequence.fa"
        orf_table = output_path + "/all_exonizations_ORF.tab"
        orf_sequences = output_path + "/all_exonizations_ORF_sequences.tab"
        interpro_table = output_path + "/all_exonizations_Interpro.tab"
        iupred_table = output_path + "/all_exonizations_IUPred.tab"
        get_peptide_sequence(all_events, transcript_expression_path,
                             gtf_path, peptide_fasta, dna_fasta, orf_table,
                             orf_sequences, interpro_table, iupred_table,
                             mosea_path, genome_path, orfs_scripts,
                             remove_temp_files)

        # 13. Filter the significant results
        logger.info("Part12...")
        filtered_table = output_path + "/all_exonizations_filtered.tab"
        peptide_change_table = output_path + "/all_exonizations_filtered_peptide_change.tab"
        run_shell("Rscript " + dir_path + "/lib/Exonization/filter_results.R " +
                  orf_table + " " + filtered_table + " " +
                  peptide_change_table)

        # 14. Select the fasta candidates for being run to the epitope analysis
        logger.info("Part13...")
        filtered_peptide_fasta = output_path + "/exonizations_peptide_sequence_filtered.fa"
        # Create the folder, if it doesn't exist
        fasta_dir = output_path + "/exonization_fasta_files"
        if not os.path.exists(fasta_dir):
            os.makedirs(fasta_dir)
        select_fasta_candidates(peptide_change_table, peptide_fasta,
                                filtered_peptide_fasta, fasta_dir)

        # 15. Run netMHC-4.0_part1
        logger.info("Part14...")
        netmhc_prefix = output_path + "/exonizations_NetMHC-4.0"
        if not os.path.exists(netmhc_prefix + "_files"):
            os.makedirs(netmhc_prefix + "_files")
        run_netMHC_classI_slurm_part1(
            peptide_change_table, HLAclass_path, HLAtypes_path, fasta_dir,
            netmhc_prefix + "_files",
            netmhc_prefix + "_neoantigens_type_gained.tab",
            netmhc_prefix + "_neoantigens_type_gained_all.tab",
            netmhc_prefix + "_neoantigens_type_lost.tab",
            netmhc_prefix + "_neoantigens_type_lost_all.tab",
            netmhc_prefix + "_junctions_ORF_neoantigens.tab",
            netMHC_path)

        # 16. Run netMHCpan-4.0_part1
        logger.info("Part15...")
        netmhcpan_prefix = output_path + "/exonizations_NetMHCpan-4.0"
        if not os.path.exists(netmhcpan_prefix + "_files"):
            os.makedirs(netmhcpan_prefix + "_files")
        run_netMHCpan_classI_slurm_part1(
            peptide_change_table, HLAclass_path, HLAtypes_pan_path, fasta_dir,
            netmhcpan_prefix + "_files",
            netmhcpan_prefix + "_neoantigens_type_gained.tab",
            netmhcpan_prefix + "_neoantigens_type_gained_all.tab",
            netmhcpan_prefix + "_neoantigens_type_lost.tab",
            netmhcpan_prefix + "_neoantigens_type_lost_all.tab",
            netmhcpan_prefix + "_junctions_ORF_neoantigens.tab",
            netMHC_pan_path)
        logger.info(
            "Wait until all jobs have finished. Then, go on with part3")

        logger.info("Done. Exiting program.")

        # sys.exit instead of the site-provided exit(): the latter is not
        # guaranteed to exist, and a NameError here would be caught below
        # and turn a successful run into exit status 1.
        sys.exit(0)

    except Exception as error:
        logger.error('ERROR: ' + repr(error))
        logger.error("Aborting execution")
        sys.exit(1)
def main():
    """Run steps 6-16 of the exonization pipeline (ePydoor, part 2).

    All inputs are hard-coded cluster paths defined at the top of the
    function body. The steps are executed strictly in order:

    6.  move the ``*coverage_sorted`` files into ``coverageBed/``
    7.  compute the coverage per exonization and merge the partial tables
    8.  annotate exonizations with nearby mutations (window argument: 200)
    9.  split mutated / non-mutated cases (R script)
    10. filter the non-mutated cases against the Rudin and Intropolis
        tables and CHESS (only when ``tumor_specific`` is true)
    11. concatenate mutated and non-mutated cases
    12. obtain the peptide sequences
    13. filter the results (R script)
    14. select the fasta candidates
    15./16. launch the NetMHC and NetMHCpan class I jobs (part 1)

    This function never returns: it terminates the interpreter with exit
    status 0 on success and 1 when any step raises an ``Exception``.
    """

    def run_shell(command):
        # os.system never raises on a failing command, so the status has to
        # be inspected explicitly. A failure is only reported, not raised:
        # e.g. the "mv" glob may legitimately match nothing on a re-run.
        status = os.system(command)
        if status != 0:
            logger.warning("Shell command returned non-zero status %s: %s",
                           status, command)
        return status

    try:

        logger.info("Starting execution exonizations_ePydoor_part2")

        # Hard-coded run configuration
        tumor_specific = True
        readcounts_path = "/projects_rg/SCLC_cohorts/Smart/STAR/readCounts.tab"
        transcript_expression_path = "/projects_rg/SCLC_cohorts/Smart/Salmon/iso_tpm.txt"
        gtf_path = "/projects_rg/SCLC_cohorts/annotation/Homo_sapiens.GRCh37.75.formatted.only_protein_coding.gtf"
        codons_gtf_path = "/projects_rg/SCLC_cohorts/annotation/Homo_sapiens.GRCh37.75.codons.gtf"
        mutations_path = "/projects_rg/babita/TCGA/mutation/mut_pipeline/juanlu_sclc/src_files/SCLC_mutations_sorted.bed.mut.out"
        repeats_path = "/projects_rg/SCLC_cohorts/cis_analysis/tables/hg19_repeats.bed"
        CHESS_SE_path = "/projects_rg/SCLC_cohorts/annotation/chess2.0_assembly_hg19_CrossMap.events_SE_strict.ioe"
        mosea = "/genomics/users/juanluis/Software/MoSEA-master/mosea.py"
        fasta_genome = "/genomics/users/juanluis/Software/MoSEA-master/test_files/genome/hg19.fa"
        orfs_scripts = "/genomics/users/juanluis/comprna/MxFinder/extract_orfs.py"
        interpro = "/soft/EB_repo/bio/sequence/programs/noarch/interproscan/5.33-72.0/interproscan.sh"
        IUPred = "/projects_rg/SCLC_cohorts/soft/IUPred2A"
        HLAclass_path = "/projects_rg/SCLC_cohorts/Smart/PHLAT/PHLAT_summary_ClassI.out"
        HLAtypes_path = "/projects_rg/SCLC_cohorts/tables/NetMHC-4.0_HLA_types_accepted.tab"
        HLAtypes_pan_path = "/projects_rg/SCLC_cohorts/tables/NetMHCpan-4.0_HLA_types_accepted.tab"
        netMHC_path = "/projects_rg/SCLC_cohorts/soft/netMHC-4.0/netMHC"
        netMHC_pan_path = "/projects_rg/SCLC_cohorts/soft/netMHCpan-4.0/netMHCpan"
        remove_temp_files = True
        flag_Rudin = False
        threshold2 = 10
        name_user = "******"
        output_path = "/users/genomics/juanluis/SCLC_cohorts/Smart/epydoor/exonizations"
        # ONLY FOR MARVIN
        #python2 = "Python/2.7.14-foss-2017b"
        # ONLY FOR HYDRA
        python2 = "Python/2.7.11"

        # 6. Create the folder, if it doesn't exist
        logger.info("Part6...")
        coverage_dir = output_path + "/coverageBed"
        if not os.path.exists(coverage_dir):
            os.makedirs(coverage_dir)
        # Move all the coverage_sorted files to the created directory
        run_shell("mv " + output_path + "/*coverage_sorted " + coverage_dir + "/")

        # 7.1. Get the coverage for each exonization
        logger.info("Part7...")
        # Directory of this script; the R helpers are looked up relative to it
        dir_path = os.path.dirname(os.path.realpath(__file__))
        get_coverageBed_adapter(output_path + "/exonizations_by_sample.tab",
                                output_path + "/random_exonizations.bed",
                                coverage_dir, output_path, name_user)

        # 7.2. Assemble all pieces into one single file. The awk program
        # drops the first line of every file except the first one, so only
        # one header line survives.
        coverage_table = output_path + "/exonizations_by_sample_coverage.tab"
        run_shell("awk 'FNR==1 && NR!=1{next;}{print}' " + output_path +
                  "/get_coverageBed_*.tab > " + coverage_table)

        # 8. Check if in the exonizations there are mutations nearby
        logger.info("Part8...")
        coverage_mut_table = output_path + "/exonizations_by_sample_coverage_mut.tab"
        check_mutations_nearby(coverage_table, mutations_path, 200,
                               coverage_mut_table)

        # 9. Separate between mutated and non-mutated cases
        logger.info("Part9...")
        mutated_path = output_path + "/mutated_exonizations.tab"
        non_mutated_path = output_path + "/non_mutated_exonizations.tab"
        run_shell("module load R; Rscript " + dir_path +
                  "/lib/Exonization/separate_mutated_cases.R " +
                  coverage_mut_table + " " + mutated_path + " " +
                  non_mutated_path)

        # 10. Get the tumor specific events
        if tumor_specific:
            junctions_path = output_path + "/new_exonized_junctions.tab"

            # Get also the significant exonizations from Rudin and Intropolis
            rudin_reads = output_path + "/new_exonized_junctions_Rudin_normal_reads.tab"
            readCounts_Rudin_path = "/projects_rg/SCLC_cohorts/Rudin/STAR/v1/normal_readCounts.tab"
            get_reads_exonizations(junctions_path, readCounts_Rudin_path,
                                   rudin_reads)
            rudin_repeats = output_path + "/new_exonized_junctions_Rudin_normal_reads_repeatitions.tab"
            overlap_with_repeats(rudin_reads, repeats_path, rudin_repeats)
            rudin_events = output_path + "/exonizations_by_sample_Rudin_normal.tab"
            get_significant_exonizations(rudin_repeats, threshold2,
                                         rudin_events)

            # NOTE(review): the "Intropolis" tables are computed from
            # readcounts_path (the Smart read counts), not from a dedicated
            # Intropolis file - confirm this is intended.
            intropolis_reads = output_path + "/new_exonized_junctions_Intropolis_reads.tab"
            get_reads_exonizations(junctions_path, readcounts_path,
                                   intropolis_reads)
            intropolis_repeats = output_path + "/new_exonized_junctions_Intropolis_reads_repeatitions.tab"
            overlap_with_repeats(intropolis_reads, repeats_path,
                                 intropolis_repeats)
            intropolis_events = output_path + "/exonizations_by_sample_Intropolis.tab"
            get_significant_exonizations(intropolis_repeats, threshold2,
                                         intropolis_events)

            filtered_path = output_path + "/non_mutated_exonizations_filtered.tab"
            filter_exonizations(non_mutated_path, rudin_events,
                                intropolis_events, filtered_path, flag_Rudin)
            non_mutated_source = output_path + "/non_mutated_exonizations_filtered2.tab"
            filter_exonizations_CHESS(filtered_path, CHESS_SE_path,
                                      non_mutated_source)
        else:
            non_mutated_source = non_mutated_path

        # 11. Join the mutated and non_mutated cases; "tail -n+2" skips the
        # first line of the second table so it is not repeated.
        logger.info("Part10...")
        all_events = output_path + "/all_exonizations.tab"
        run_shell("cat " + mutated_path + " > " + all_events +
                  ";tail -n+2 " + non_mutated_source + " >> " + all_events)

        # 12. Get the peptide sequence associated
        logger.info("Part11...")
        peptide_fasta = output_path + "/exonizations_peptide_sequence.fa"
        dna_fasta = output_path + "/exonizations_fasta_sequence.fa"
        orf_table = output_path + "/all_exonizations_ORF.tab"
        orf_sequences = output_path + "/all_exonizations_ORF_sequences.tab"
        interpro_table = output_path + "/all_exonizations_Interpro.tab"
        iupred_table = output_path + "/all_exonizations_IUPred.tab"
        get_peptide_sequence(all_events, transcript_expression_path,
                             gtf_path, codons_gtf_path, peptide_fasta,
                             dna_fasta, orf_table, orf_sequences,
                             interpro_table, iupred_table, mosea,
                             fasta_genome, orfs_scripts, interpro, IUPred,
                             remove_temp_files, python2)

        # 13. Filter the significant results
        logger.info("Part12...")
        filtered_table = output_path + "/all_exonizations_filtered.tab"
        peptide_change_table = output_path + "/all_exonizations_filtered_peptide_change.tab"
        run_shell("module load R; Rscript " + dir_path +
                  "/lib/Exonization/filter_results.R " + orf_table + " " +
                  filtered_table + " " + peptide_change_table)

        # 14. Select the fasta candidates for being run to the epitope analysis
        logger.info("Part13...")
        filtered_peptide_fasta = output_path + "/exonizations_peptide_sequence_filtered.fa"
        # Create the folder, if it doesn't exist
        fasta_dir = output_path + "/exonization_fasta_files"
        if not os.path.exists(fasta_dir):
            os.makedirs(fasta_dir)
        select_fasta_candidates(peptide_change_table, peptide_fasta,
                                filtered_peptide_fasta, fasta_dir)

        # 15. Run netMHC-4.0_part1
        logger.info("Part14...")
        netmhc_prefix = output_path + "/exonizations_NetMHC-4.0"
        if not os.path.exists(netmhc_prefix + "_files"):
            os.makedirs(netmhc_prefix + "_files")
        run_netMHC_classI_slurm_part1(
            peptide_change_table, HLAclass_path, HLAtypes_path, fasta_dir,
            netmhc_prefix + "_files",
            netmhc_prefix + "_neoantigens_type_3.tab",
            netmhc_prefix + "_neoantigens_type_3_all.tab",
            netmhc_prefix + "_neoantigens_type_2.tab",
            netmhc_prefix + "_neoantigens_type_2_all.tab",
            netmhc_prefix + "_junctions_ORF_neoantigens.tab",
            netMHC_path)

        # 16. Run netMHCpan-4.0_part1
        logger.info("Part15...")
        netmhcpan_prefix = output_path + "/exonizations_NetMHCpan-4.0"
        if not os.path.exists(netmhcpan_prefix + "_files"):
            os.makedirs(netmhcpan_prefix + "_files")
        run_netMHCpan_classI_slurm_part1(
            peptide_change_table, HLAclass_path, HLAtypes_pan_path, fasta_dir,
            netmhcpan_prefix + "_files",
            netmhcpan_prefix + "_neoantigens_type_3.tab",
            netmhcpan_prefix + "_neoantigens_type_3_all.tab",
            netmhcpan_prefix + "_neoantigens_type_2.tab",
            netmhcpan_prefix + "_neoantigens_type_2_all.tab",
            netmhcpan_prefix + "_junctions_ORF_neoantigens.tab",
            netMHC_pan_path)
        logger.info(
            "Wait until all jobs have finished. Then, go on with part3")

        logger.info("Done. Exiting program.")

        # sys.exit instead of the site-provided exit(): the latter is not
        # guaranteed to exist, and a NameError here would be caught below
        # and turn a successful run into exit status 1.
        sys.exit(0)

    except Exception as error:
        logger.error('ERROR: ' + repr(error))
        logger.error("Aborting execution")
        sys.exit(1)