예제 #1
0
def get_non_mutation_indices(simulation_output_folder, sample_file,
                             coding_exon_bed, out_prefix, genome_fasta,
                             nt_indices_files):
    '''
    Write the nucleotide indices of the coding exon regions in which
    sample_file reports no SNP/other variant.

    All intermediate files are written to simulation_output_folder and are
    named after the base name of coding_exon_bed:
        format_<bed>            renamed copy of coding_exon_bed
        subtract_<bed>          coding_exon_bed with the sample_file intervals subtracted
        subtract_format_<bed>   renamed copy of the subtracted bed
        exon_regions_no_mutations.fasta
                                sequences of the subtracted intervals
    The final indices are written to nt_indices_files by bo.extract_nt_indices.

    out_prefix is accepted for interface compatibility but is not used here.
    '''

    # every intermediate bed file reuses the input bed's base name
    bed_basename = coding_exon_bed.split('/')[-1]

    renamed_bed = "{0}/format_{1}".format(simulation_output_folder,
                                          bed_basename)
    mutation_free_bed = "{0}/subtract_{1}".format(simulation_output_folder,
                                                  bed_basename)
    mutation_free_bed_renamed = "{0}/subtract_format_{1}".format(
        simulation_output_folder, bed_basename)
    mutation_free_fasta = "{0}/exon_regions_no_mutations.fasta".format(
        simulation_output_folder)

    # rename the bed entries so they correspond to the mutation vcf
    bo.change_bed_names(coding_exon_bed, renamed_bed,
                        full_names=True, header=False)

    # subtract the variant positions, leaving only the mutation-free regions
    # NOTE(review): this reads the original coding_exon_bed, not the renamed
    # copy written just above -- confirm that is intended.
    bmo.intersect_bed(coding_exon_bed, sample_file,
                      write_both=False,
                      output_file=mutation_free_bed,
                      no_dups=False,
                      subtract=True,
                      intersect=True)

    bo.change_bed_names(mutation_free_bed, mutation_free_bed_renamed,
                        full_names=False, header=False)

    # pull out the sequence of every mutation-free interval
    # NOTE(review): the fasta is built from mutation_free_bed; the renamed
    # copy (subtract_format_*) is written but never read here -- confirm.
    bo.fasta_from_intervals(mutation_free_bed, mutation_free_fasta,
                            genome_fasta,
                            force_strand=True, names=True)

    # record, for each region, the indices at which each nucleotide resides
    bo.extract_nt_indices(mutation_free_fasta, nt_indices_files)
예제 #2
0
def check_coding(exons_file, CDSs_file, outfile, remove_overlapping = False):
    '''
    Given a bed file of exon coordinates and a bed file of CDS coordinates,
    write to outfile only those exons from the former that
    1) are fully coding
    2) are internal (neither the first nor the last exon of their transcript).
    Exon names (column 4) are expected to look like "<transcript>.<rank>".
    NB! Assumes that all the coordinates are from non-overlapping transcripts.
    If this is not the case, set remove_overlapping to True: exons_file is then
    sorted and stripped of overlapping intervals IN PLACE before filtering.
    Two scratch files are created under temp_data/ and removed at the end.
    '''
    if remove_overlapping:
        bmo.sort_bed(exons_file, exons_file)
        remove_overlaps(exons_file, exons_file)

    # Step 1: keep only exons that are entirely covered by a CDS interval.
    # Both records are written so that we can later check the overlap is not
    # to a transcript that is absent from the exons file.
    fully_coding_file = "temp_data/temp{0}.txt".format(random.random())
    bmo.intersect_bed(exons_file, CDSs_file, overlap = 1, overlap_rec = True, output_file = fully_coding_file, force_strand = True, write_both = True, no_dups = False, no_name_check = False)

    # Step 2: drop terminal exons. In theory none should survive step 1, but
    # with unannotated UTRs a partially coding terminal exon can look fully coding.
    internal_file = "temp_data/temp{0}.txt".format(random.random())
    with open(internal_file, "w") as o_file:
        # highest exon rank seen for each transcript
        exon_records = [rec for rec in gen.read_many_fields(exons_file, "\t") if len(rec) > 3]
        split_names = [rec[3].split(".") for rec in exon_records]
        ranks_per_transcript = gen.list_to_dict(split_names, 0, 1, as_list = True)
        last_rank = {transcript: max([int(rank) for rank in ranks_per_transcript[transcript]]) for transcript in ranks_per_transcript}

        for record in gen.read_many_fields(fully_coding_file, "\t"):
            # column 10 is the name of the overlapping CDS record
            cds_transcript = record[9].split(".")[0]
            if cds_transcript not in last_rank:
                continue
            exon_name = record[3].split(".")
            # first exon of the transcript
            if exon_name[-1] == "1":
                continue
            final_rank = last_rank[exon_name[0]]
            # last exon of the transcript
            if int(exon_name[-1]) == final_rank:
                continue
            # only the six columns of the exon record are kept
            o_file.write("\t".join([str(field) for field in record[:6]]))
            o_file.write("\n")

    bmo.sort_bed(internal_file, internal_file)
    merge_command = ["mergeBed", "-i", internal_file, "-c", "4,5,6", "-o", "distinct,distinct,distinct"]
    gen.run_process(merge_command, file_for_output = outfile)
    gen.remove_file(fully_coding_file)
    gen.remove_file(internal_file)
예제 #3
0
def main():
    """
    Command-line driver for the disease SNP / PTC analyses.

    Arguments are parsed with gen.parse_arguments. The disease SNP file and
    its ".tbi" index must exist. Every analysis step below is guarded by its
    own command-line flag; file names are derived from output_directory
    (outputs of this script) and results_prefix (files expected from earlier
    steps of the pipeline).
    """

    description = "Look at disease snps."
    arguments = ["disease_snps_file", "output_directory", "results_prefix", "simulations", "ese_file", "intersect_snps", "get_relative_positions", "get_snp_status", "get_info", "simulate_ptc_location", "get_possible_ptc_locations", "required_simulations", "get_overlaps", "intersect_ptcs", "compare_ptcs" ,"get_introns", "compare_distances", "clinvar_ptc_locations", "location_simulation", "exclude_cpg", "ese_hit_simulation", "only_disease", "only_kgenomes", "only_ese", "get_unique_ptcs", "get_unique_rel_pos", "excess_test", "disease_locations_chisquare"]
    # positions 5..27 of the list above are flags; position 3 is an int
    flag_positions = list(range(5, 28))
    args = gen.parse_arguments(description, arguments, flags = flag_positions, ints=[3])

    disease_snps_file = args.disease_snps_file
    output_directory = args.output_directory
    results_prefix = args.results_prefix
    simulations = args.simulations
    ese_file = args.ese_file
    intersect_snps = args.intersect_snps
    get_relative_positions = args.get_relative_positions
    get_snp_status = args.get_snp_status
    # the following flags are parsed but not acted upon in this function
    get_info = args.get_info
    simulate_ptc_location = args.simulate_ptc_location
    get_possible_ptc_locations = args.get_possible_ptc_locations
    required_simulations = args.required_simulations
    get_overlaps = args.get_overlaps
    intersect_ptcs = args.intersect_ptcs
    compare_ptcs = args.compare_ptcs
    get_introns = args.get_introns
    compare_distances = args.compare_distances
    clinvar_ptc_locations = args.clinvar_ptc_locations
    location_simulation = args.location_simulation
    exclude_cpg = args.exclude_cpg
    ese_hit_simulation = args.ese_hit_simulation
    only_disease = args.only_disease
    only_kgenomes = args.only_kgenomes
    only_ese = args.only_ese
    get_unique_ptcs = args.get_unique_ptcs
    get_unique_rel_pos = args.get_unique_rel_pos
    excess_test = args.excess_test
    disease_locations_chisquare = args.disease_locations_chisquare

    # a simulation count, when given, has to be an integer
    if simulations:
        if not isinstance(simulations, int):
            print("\nERROR: Please provide the correct number for simulations.\n")
            raise Exception

    # make sure the output directory exists
    gen.create_output_directories(output_directory)

    # the vcf must be accompanied by its index (e.g. ./source_data/clinvar_20180429.vcf.gz + .tbi)
    tabix_index_file = "{0}.tbi".format(disease_snps_file)
    if not (os.path.isfile(disease_snps_file) and os.path.isfile(tabix_index_file)):
        print("\nERROR: Please provide the required disease SNPs file(s).\n")
        raise Exception

    # --- disease snps that fall in coding exons ---
    coding_exons_bed = "{0}_coding_exons.bed".format(results_prefix)
    snp_intersect_vcf = "{0}/disease_snp_intersect.vcf".format(output_directory)
    snp_intersect_bed = "{0}/disease_snp_intersect.bed".format(output_directory)
    if intersect_snps:
        print("Intersecting snps with exons")
        so.intersect_snps_parallel(coding_exons_bed, disease_snps_file, snp_intersect_vcf)
        so.intersect_vcf_to_bed(coding_exons_bed, snp_intersect_vcf, snp_intersect_bed, change_names = True)

    # --- snp positions relative to the exon and to the cds ---
    cds_bed = "{0}_CDS.bed".format(results_prefix)
    snp_rel_exon_pos_file = "{0}/disease_snp_relative_exon_positions.bed".format(output_directory)
    snp_rel_cds_pos_file = "{0}/disease_snp_relative_cds_positions.bed".format(output_directory)
    if get_relative_positions:
        print("Getting snp relative positions...")
        so.get_snp_relative_exon_position(snp_intersect_bed, snp_rel_exon_pos_file)
        # the cds step takes the parsed entries rather than a file path
        rel_exon_entries = gen.read_many_fields(snp_rel_exon_pos_file, "\t")
        so.get_snp_relative_cds_position(rel_exon_entries, snp_rel_cds_pos_file, cds_bed)

    # --- classify each snp (ptc vs. other) ---
    cds_fasta = "{0}_CDS.fasta".format(results_prefix)
    disease_ptcs_file = "{0}/disease_ptcs.txt".format(output_directory)
    disease_other_file = "{0}/disease_other_snps.txt".format(output_directory)
    if get_snp_status:
        print("Getting snp status...")
        so.get_snp_change_status(snp_rel_cds_pos_file, cds_fasta, disease_ptcs_file, disease_other_file)

    # --- ptcs shared between clinvar and 1000 genomes ---
    kgenomes_ptc_file = "{0}_ptc_file.txt".format(results_prefix)
    shared_ptcs_file = "{0}/ptc_intersect.bed".format(output_directory)
    if intersect_ptcs:
        scratch_disease_ptcs = "temp_data/{0}".format(random.random())
        dso.refactor_ptc_file(disease_ptcs_file, scratch_disease_ptcs)
        scratch_kgenomes_ptcs = "temp_data/{0}".format(random.random())
        dso.refactor_ptc_file(kgenomes_ptc_file, scratch_kgenomes_ptcs, header=True)
        bao.intersect_bed(scratch_kgenomes_ptcs, scratch_disease_ptcs, write_both = True, no_dups=False, output_file = shared_ptcs_file)
        gen.remove_file(scratch_disease_ptcs)
        gen.remove_file(scratch_kgenomes_ptcs)

    # --- ptcs found in only one of the two datasets ---
    disease_only_ptcs = "{0}/disease_ptcs_no_intersect.bed".format(output_directory)
    kgenomes_only_ptcs = "{0}/kgenomes_ptcs_no_intersect.bed".format(output_directory)
    if get_unique_ptcs:
        dso.get_unique_ptcs(disease_ptcs_file, kgenomes_ptc_file, shared_ptcs_file, disease_only_ptcs, kgenomes_only_ptcs)

    # --- relative exon positions of the dataset-specific ptcs ---
    disease_only_rel_pos = "{0}/disease_ptcs_no_intersect_rel_pos.bed".format(output_directory)
    kgenomes_all_rel_pos = "{0}_PTC_relative_exon_positions.bed".format(results_prefix)
    kgenomes_only_rel_pos = "{0}/kgenomes_ptcs_no_intersect_rel_pos.bed".format(output_directory)
    if get_unique_rel_pos:
        dso.get_unique_rel_pos(disease_only_ptcs, snp_rel_exon_pos_file, kgenomes_only_ptcs, kgenomes_all_rel_pos, disease_only_rel_pos, kgenomes_only_rel_pos)

    # label for output names: the ese file's base name up to its first "."
    ese_label = ese_file.split('/')[-1].split('.')[0]
    exons_fasta = "{0}_coding_exons.fasta".format(results_prefix)

    # --- location simulation (random reference-allele-matched simulants) ---
    clinvar_sim_file = "{0}/clinvar_ptc_location_simulation.csv".format(output_directory)
    clinvar_sim_ese_file = "{0}/clinvar_ptc_location_simulation_{1}_ese_overlaps.csv".format(output_directory, ese_label)
    kgenomes_sim_file = "{0}/1000_genomes_simulations.csv".format(output_directory)
    kgenomes_sim_ese_file = "{0}/1000_genomes_simulations_ese_overlaps.csv".format(output_directory)

    if location_simulation:
        if not only_kgenomes:
            print('Running ptc location simulation on disease PTCs...')
            dso.ptc_location_simulation(disease_only_rel_pos, exons_fasta, simulations, clinvar_sim_file, clinvar_sim_ese_file, ese_file, only_ese, exclude_cpg)
        if not only_disease:
            print('Running ptc location simulation on 1000 genomes PTCs...')
            dso.ptc_location_simulation(kgenomes_only_rel_pos, exons_fasta, simulations, kgenomes_sim_file, kgenomes_sim_ese_file, ese_file, only_ese, exclude_cpg)

    # --- ese hit simulation restricted to a fixed window ---
    region_start = 3
    region_end = 69
    clinvar_hits_file = "{0}/clinvar_ese_hit_simulation_{1}_{2}_{3}.csv".format(output_directory, region_start, region_end, ese_label)
    kgenomes_hits_file = "{0}/1000_genomes_ese_hit_simulation_{1}_{2}_{3}.csv".format(output_directory, region_start, region_end, ese_label)

    if ese_hit_simulation:
        if not only_kgenomes:
            print("Simulating ESE hits on the {0}-{1} region for disease PTCs...".format(region_start, region_end))
            dso.ese_hit_simulation(disease_only_rel_pos, exons_fasta, simulations, clinvar_hits_file, ese_file, region_start, region_end, exclude_cpg)
        if not only_disease:
            print("Simulating ESE hits on the {0}-{1} region for 1000 genomes PTCs...".format(region_start, region_end))
            dso.ese_hit_simulation(kgenomes_only_rel_pos, exons_fasta, simulations, kgenomes_hits_file, ese_file, region_start, region_end, exclude_cpg)

    # --- excess test on the disease-specific ptcs ---
    excesses_file = "{0}/clinvar_ptc_{1}_{2}_excesses.csv".format(output_directory, region_start, region_end)
    if excess_test:
        dso.excess_test(disease_only_rel_pos, exons_fasta, excesses_file)

    # --- chi-square test on disease ptc locations ---
    chisquare_file = "{0}/clinvar_locations_chisquare.csv".format(output_directory)
    if disease_locations_chisquare:
        dso.disease_ptc_location_test(disease_only_rel_pos, exons_fasta, chisquare_file)
예제 #4
0
def main():
    """
    Build a per-transcript expression summary from FANTOM CAGE peaks.

    Command-line arguments (parsed with gen.parse_arguments):
        clean_fasta          fasta whose entry names are the transcripts of interest
        promoters_file_name  output bed of promoter regions (written here)
        cage_file_name       bed of CAGE peaks; column 4 holds "|"-separated TPM values
        out_prefix           prefix for the intermediate and final output files
        TPM_threshold        int; a value counts as "expressed" if it is above this

    Outputs:
        <out_prefix>_transcripts_clean.bed
        <promoters_file_name>
        <out_prefix>_FANTOM_overlap_peaks.bed
        <out_prefix>_FANTOM_expression_per_transcript.txt

    Raises:
        RuntimeError: if a transcript of interest has a strand other than "+"/"-".
    """

    description = "Take an output file from prepare_FANTOM.py and make a file with the expression data for each gene."
    args = gen.parse_arguments(description, [
        "clean_fasta", "promoters_file_name", "cage_file_name", "out_prefix",
        "TPM_threshold"
    ],
                               ints=[4])
    [
        clean_fasta, promoters_file_name, cage_file_name, out_prefix,
        TPM_threshold
    ] = [
        args.clean_fasta, args.promoters_file_name, args.cage_file_name,
        args.out_prefix, args.TPM_threshold
    ]

    #extract transcript coordinates
    transcripts_file = "{0}_transcripts_clean.bed".format(out_prefix)
    bo.extract_features("../source_data/Homo_sapiens.GRCh37.87.gtf",
                        transcripts_file, ["transcript"])

    #get the names of the transcripts you're interested
    names = gen.read_fasta(clean_fasta)[0]
    #set for O(1) membership tests in the per-line loop below
    #(assumes the names are hashable strings)
    names_of_interest = set(names)

    #write the coordinates of the promoter regions of those transcripts to file
    with open(promoters_file_name,
              "w") as out_file, open(transcripts_file, "r") as in_file:
        for line in in_file:
            parsed = (line.rstrip("\n")).split("\t")
            #parse out the transcript name
            name = parsed[3].split(".")[0]
            #skip transcripts that aren't among your transcripts of interest
            if name not in names_of_interest:
                continue
            strand = parsed[5]
            #determine the coordinates of a 1001 bp region centered on the TSS (the supposed promoter region)
            if strand == "+":
                tss = int(parsed[1])
                current_line = ["chr" + parsed[0], tss - 500, tss + 500 + 1,
                                name, ".", strand]
            elif strand == "-":
                tss = int(parsed[2])
                current_line = ["chr" + parsed[0], tss - 500 - 1, tss + 500,
                                name, ".", strand]
            else:
                #the original built this exception without raising it, which
                #let the previous transcript's line be written out again
                raise RuntimeError("Invalid strand information!")
            out_file.write("\t".join([str(i) for i in current_line]))
            out_file.write("\n")

    #check which CAGE peaks overlap which promoters
    overlapping_peaks_file = "{0}_FANTOM_overlap_peaks.bed".format(out_prefix)
    bmo.intersect_bed(cage_file_name,
                      promoters_file_name,
                      output_file=overlapping_peaks_file,
                      force_strand=True,
                      write_both=True,
                      no_dups=False)

    #for each transcript, get all overlapping peaks
    #(store only the expression information)
    peaks_dict = {name: [] for name in names}
    with open(overlapping_peaks_file, "r") as peaks:
        for peak in peaks:
            #strip the newline so it can never end up inside a field
            peak = peak.rstrip("\n").split("\t")
            #column 4: expression values of the peak; column 10: promoter (transcript) name
            name = peak[9]
            peaks_dict[name].append(peak[3])

    #for each transcript,
    #store the mean TPM within each tissue (averaged over the different peaks
    #associated to that transcript)
    mean_dict = {}
    np.set_printoptions(suppress=True)
    for name in peaks_dict:
        if peaks_dict[name]:
            current_mat = np.array([[float(j) for j in i.split("|")]
                                    for i in peaks_dict[name]])
            mean_dict[name] = np.mean(current_mat, axis=0)

    #calculate expression parameters
    final_dict = {}
    for gene in mean_dict:
        means = mean_dict[gene]
        expressed_values = [i for i in means if i > TPM_threshold]
        fraction = len(expressed_values) / len(means)
        maximum = np.max(means)
        median_expr = np.median(means)
        #np.median of an empty list is nan but emits a warning; be explicit
        if expressed_values:
            median_if_expressed = np.median(expressed_values)
        else:
            median_if_expressed = np.nan
        final_dict[gene] = [
            fraction, maximum, median_expr, median_if_expressed
        ]

    output_file_name = "{0}_FANTOM_expression_per_transcript.txt".format(
        out_prefix)
    with open(output_file_name, "w") as out_file:
        #columns: transcript, fraction of values above threshold, max, median,
        #median over the values above threshold
        out_file.write("gene\tbreadth\tmax\tmedian\tmedian_expr\n")
        for gene in sorted(final_dict):
            out_file.write("\t".join([gene] + [str(j) for j in final_dict[gene]]))
            out_file.write("\n")
예제 #5
0
def process_bam_per_individual(bam_files, global_exon_junctions_file,
                               PTC_exon_junctions_file, out_folder, PTC_file,
                               syn_nonsyn_file, out_prefix,
                               exon_junctions_bam_output_folder, kw_dict):
    '''
    Do all of the processing on each bam in bam_files, from filtering out low
    quality data to mapping reads to exon-exon junctions.
    For each bam, a file with the junction read counts is written to
    out_folder (nothing is returned). A bam whose output file already exists
    is skipped.

    bam_files: paths to the bam files to process.
    global_exon_junctions_file: all exon-exon junctions.
    PTC_exon_junctions_file: junctions flanking the PTC-containing exons.
    out_folder: folder for the per-sample output files.
    PTC_file, syn_nonsyn_file: SNP files, only used when phasing reads.
    out_prefix: prefix of the folder holding intermediate files
        (when not running a PTC SNP simulation).
    exon_junctions_bam_output_folder: folder for the per-bam global junction
        intersects and the cached total read counts.
    kw_dict: optional settings (a dict, to make parallelizing easier):
        "ptc_snp_simulation" (default False), "simulation_instance_folder"
        (default None), "simulation_number" (default None),
        "overwrite_intersect" (default False), "phase" (default False).
    '''

    #parse keyword_dict
    #it's done like this to make it easier to parallelize this process
    ptc_snp_simulation = kw_dict.get("ptc_snp_simulation", False)
    simulation_instance_folder = kw_dict.get("simulation_instance_folder", None)
    simulation_number = kw_dict.get("simulation_number", None)
    overwrite_intersect = kw_dict.get("overwrite_intersect", False)
    phase = kw_dict.get("phase", False)

    bam_file_number = len(bam_files)
    for pos, bam_file in enumerate(bam_files):

        #Process:
        # 1. get the number of reads in bam
        # 2. Filter out reads that don't overlap exon-exon junctions
        # 3. Filter out reads that don't overlap exon-exon junctions flanking PTC-containing exons
        # 4. Filter bams by quality
        # This gives us a set of "good" quality reads.
        # 5. scale down total read number proportionally to how many reads were lost in the quality filtering
        # 6. Count reads either skipping or including each exon

        print("{0}/{1}: {2}".format(pos, bam_file_number, bam_file))
        #sample name: base name of the bam up to the first "."
        sample_name = (bam_file.split("/")[-1]).split(".")[0]
        if ptc_snp_simulation:
            output_file = "{0}/{1}_simulation_{2}.txt".format(
                out_folder, sample_name, simulation_number)
        else:
            output_file = "{0}/{1}.txt".format(out_folder, sample_name)

        #folder that will contain all of the intermediate steps in the processing of the bam file
        if ptc_snp_simulation:
            proc_folder = "{0}/bam_proc_files".format(
                simulation_instance_folder)
        else:
            proc_folder = "{0}__analysis_bam_proc_files".format(out_prefix)

        gen.create_output_directories(proc_folder)

        #each filtering step appends its own suffix to the previous file name
        bam_file_parts = os.path.split(bam_file)
        mapq_filtered_bam = "{0}/{1}_filtered_mapq.bam".format(
            proc_folder, bam_file_parts[1])
        mapq_flag_filtered_bam = "{0}_flag.bam".format(mapq_filtered_bam[:-4])
        mapq_flag_xt_filtered_bam = "{0}_xt.bam".format(
            mapq_flag_filtered_bam[:-4])
        mapq_flag_xt_nm_filtered_bam = "{0}_nm.bam".format(
            mapq_flag_xt_filtered_bam[:-4])

        if not os.path.isfile(output_file):

            #1: We get a count of the total reads in the sample which can be used for normalisation
            #I'm initializing it to None for safety. That way, if the process fails,
            #it won't just silently go with whatever the value was at the end of the previous loop.
            #also, writing it down cause this bit takes forever, don't want to do it again every time.
            #The cache file is keyed on the sample name: the original used the
            #literal text "sample_name" in the file name, so every bam shared
            #one cache file and reused the read count of the first sample.
            read_count_file_name = "{0}/read_count_{1}.txt".format(
                exon_junctions_bam_output_folder, sample_name)
            read_count = None
            if os.path.isfile(read_count_file_name):
                with open(read_count_file_name) as count_file:
                    read_count = int(count_file.read())
            else:
                read_count = int(
                    gen.run_process(["samtools", "view", "-c", bam_file]))
                with open(read_count_file_name, "w") as count_file:
                    count_file.write(str(read_count))

            #2: intersect the bam with all exon-exon junctions
            #only has to be done once for each bam
            global_intersect_bam = "{0}/{1}_exon_junctions.bam".format(
                exon_junctions_bam_output_folder, bam_file_parts[1][:-4])
            if not os.path.isfile(global_intersect_bam) or overwrite_intersect:
                #intersect the bam and the global exon junctions file
                bmo.intersect_bed(bam_file,
                                  global_exon_junctions_file,
                                  output_file=global_intersect_bam,
                                  intersect_bam=True)

            #3: filter to relevant exon-exon junctions
            ##Intersect junctions and .bam, and write down the overlapping .bam alignments, without counting.
            #this uses intersect bed, with the intersect bam parameter
            intersect_bam = "{0}/{1}_exon_junction_bam_intersect.bam".format(
                proc_folder, bam_file_parts[1][:-4])

            #intersect the junction-filtered bam and the ptc exon junctions file
            bmo.intersect_bed(global_intersect_bam,
                              PTC_exon_junctions_file,
                              output_file=intersect_bam,
                              intersect_bam=True)

            #count how many reads there are in the sample after filtering to relevant exon-exon junctions but before quality filtering
            read_count_junctions_no_filter = int(
                gen.run_process(["samtools", "view", "-c", intersect_bam]))

            #4. filter .bam alignments by quality.
            #takes both upper and lower bam thresholds
            # need to do this twice and merge, so we use both intervals used by Geuvadis
            #set the mapq filter parameters here
            mapq_intervals = [[251, 255], [175, 181]]
            mapq_filter_filelist = []

            for lower_threshold, upper_threshold in mapq_intervals:
                mapq_filter_file = "{0}/{1}_mapq_filter_{2}_{3}.bam".format(
                    proc_folder, bam_file_parts[1][:-4], lower_threshold,
                    upper_threshold)
                mapq_filter_filelist.append(mapq_filter_file)
                ##run the mapq filter
                bmo.bam_quality_filter(
                    intersect_bam,
                    mapq_filter_file,
                    quality_greater_than_equal_to=lower_threshold,
                    quality_less_than_equal_to=upper_threshold)

            ##merge files in filelist
            bmo.merge_bams(mapq_filter_filelist, mapq_filtered_bam)

            ##filter by flags: get all mapped reads
            #Leaves: mapped unpaired and paired reads
            bmo.bam_flag_filter(mapq_filtered_bam,
                                mapq_flag_filtered_bam,
                                get_mapped_reads=True)

            ##filter bam by xt tag XT=U
            bmo.bam_xt_filter(mapq_flag_filtered_bam,
                              mapq_flag_xt_filtered_bam,
                              xt_filter="U")

            ##filter bam by nm tag NM<=6
            bmo.bam_nm_filter(mapq_flag_xt_filtered_bam,
                              mapq_flag_xt_nm_filtered_bam,
                              nm_less_equal_to=6)

            #5. scale down the initial count of reads in the sample by the proportion lost during quality filtering
            read_count_junctions_filter = int(
                gen.run_process(
                    ["samtools", "view", "-c", mapq_flag_xt_nm_filtered_bam]))
            #NOTE(review): if no reads overlap the PTC junctions the
            #denominator is 0 and np.divide yields nan/inf with a warning
            prop_kept = np.divide(read_count_junctions_filter,
                                  read_count_junctions_no_filter)
            read_count = prop_kept * read_count

            #convert to sam format and phase reads
            intersect_sam = "{0}_phased.sam".format(
                mapq_flag_xt_nm_filtered_bam[:-4])
            if phase:
                temp_snp_file = "temp_data/snps{0}.txt".format(random.random())
                so.merge_and_header(PTC_file, syn_nonsyn_file, temp_snp_file)
                bmo.phase_bams(temp_snp_file, mapq_flag_xt_nm_filtered_bam,
                               sample_name, intersect_sam)
                gen.remove_file(temp_snp_file)
            else:
                gen.run_process(
                    ["samtools", "view", mapq_flag_xt_nm_filtered_bam],
                    file_for_output=intersect_sam)

            #6. count the number of reads supporting either the skipping or the inclusion of each exon
            junctions = bmo.read_exon_junctions(PTC_exon_junctions_file)
            bmo.count_junction_reads(intersect_sam, junctions, output_file,
                                     read_count)