Example #1
def main():

    description = "Record the distribution of peaks for different exons."
    args = hk.parse_arguments(description, ["peaks_file", "gtf", "exon_starts_file", "output_file", "reads_file", "from_end", "intronic", "limit", "nts_before_start", "noncoding", "reads_mode"], flags = [5, 6, 9, 10], ints = [7, 8])
    peaks_file, gtf, exon_starts_file, output_file, reads_file, from_end, intronic, limit, nts_before_start, noncoding, reads_mode = args.peaks_file, args.gtf, args.exon_starts_file, args.output_file, args.reads_file, args.from_end, args.intronic, args.limit, args.nts_before_start, args.noncoding, args.reads_mode

    if noncoding:
        exons = rw.read_gtf(gtf, "exon", gene=False)
    else:
        exons = rw.read_gtf(gtf, "CDS", gene=False)

    # the 3' ss that will be analyzed
    valid_junctions = rw.read_many_fields(exon_starts_file, "\t")
    # pull out the name column with the junction IDs
    valid_junctions = [i[3] for i in valid_junctions]

    lengths_dict = co.get_lengths(exons, valid_junctions, intronic=intronic)
    if nts_before_start:
        lengths_dict = {i: lengths_dict[i] + nts_before_start for i in lengths_dict}

    coverage_file_name = "{0}_{1}_coverage.bed".format(exon_starts_file[:-4], reads_file.split("/")[-1][:-4])
    co.get_coverage(exon_starts_file, reads_file, coverage_file_name)

    peak_distances_all, peak_centres = co.peak_pos_in_exon(exon_starts_file, peaks_file, from_end = from_end, reads_mode = reads_mode)

    write_dist_mat(peak_distances_all, limit, output_file, lengths_dict, "{0}_intron_names.txt".format(output_file[:-4]), None)

    write_dist_mat(peak_centres, limit, "{0}_centres.txt".format(output_file[:-4]),
                   lengths_dict, "{0}_centres_intron_names.txt".format(output_file[:-4]),
                   None)
Example #2
def main():
    description = "Given a BED file of reads, filter out reads whose " \
                  "3' end maps to the last nucleotide of an intron or " \
                  "the last nucleotide of an exon."
    args = hk.parse_arguments(description, ["reads_file", "gtf", "outfile"])
    reads_file, gtf, outfile = args.reads_file, args.gtf, args.outfile

    print("Getting intron lariat positions...")

    # read in exon coordinates
    exons = rw.read_gtf(gtf, element="exon", gene=False)
    # make a BED file with the last positions of introns
    intron_lariat_bed = "{0}_intron_lariat_pos_all_exons.bed".format(reads_file[:-4])
    co.write_intron_lariat_pos_from_exons(exons, intron_lariat_bed, add_chr = True)

    # intersect the reads with intron lariat positions
    intron_lariat_intersect_file_name = "{0}_intersect_with_intron_lariat_pos_all_exons.bed".format(reads_file[:-4])
    co.intersect_bed(reads_file, intron_lariat_bed, force_strand=True, write_both=True, no_dups=False, output_file=intron_lariat_intersect_file_name)
    hk.remove_file(intron_lariat_bed)
    intron_lariat_reads_file = "{0}_intron_lariat_reads_all_exons.bed".format(reads_file[:-4])
    # check that the reads end exactly at intron lariat positions
    check_3prime_match(intron_lariat_intersect_file_name, intron_lariat_reads_file)
    hk.remove_file(intron_lariat_intersect_file_name)

    # write BED with the last positions of exons
    splice_intermediate_bed = "{0}_splice_intermediate_pos_all_exons.bed".format(reads_file[:-4])
    co.write_si_pos_from_exons(exons, splice_intermediate_bed, add_chr = True)

    print("Getting splice intermediate positions.")

    # intersect the reads with splice intermediate positions
    splice_intermediate_intersect_file_name = "{0}_intersect_with_SI_pos_all_exons.bed".format(reads_file[:-4])
    co.intersect_bed(reads_file, splice_intermediate_bed, force_strand=True, write_both=True, no_dups=False, output_file=splice_intermediate_intersect_file_name)
    hk.remove_file(splice_intermediate_bed)
    SI_reads_file = "{0}_SI_reads_all_exons.bed".format(reads_file[:-4])
    # check that the reads end exactly at the end of the exon
    check_3prime_match(splice_intermediate_intersect_file_name, SI_reads_file)
    hk.remove_file(splice_intermediate_intersect_file_name)

    print("Concatenating the two files.")

    # concatenate the IL and SI read files so that both can be excluded in one go
    combined_file = "{0}_SI_and_IL_reads_all_exons.bed".format(reads_file[:-4])
    hk.run_process(["cat", SI_reads_file, intron_lariat_reads_file], file_for_output=combined_file)

    hk.remove_file(SI_reads_file)
    hk.remove_file(intron_lariat_reads_file)

    # do an exclusive intersect, requiring 1.0 overlap for both A and B, to remove the
    # putative intron lariat reads from the main reads file
    co.intersect_bed(reads_file, combined_file, overlap=1, overlap_rec=1, force_strand=True, no_dups=False, exclude=True, output_file=outfile)

    hk.remove_file(combined_file)
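check_3prime_match is what enforces the "ends exactly at" conditions described in the comments above. A hedged sketch of such a filter, assuming BED6 read fields followed by the BED6 fields of the single-nucleotide feature (the write_both=True layout):

def check_3prime_match(intersect_file, out_file):
    # keep only reads whose strand-aware 3' end coincides with the
    # single-nucleotide feature they overlap (assumed behaviour)
    with open(intersect_file) as infile, open(out_file, "w") as outfile:
        for line in infile:
            fields = line.rstrip("\n").split("\t")
            read_start, read_end = int(fields[1]), int(fields[2])
            strand = fields[5]
            feat_start, feat_end = int(fields[7]), int(fields[8])
            # on the + strand the 3' end is the interval end; on -, the start
            three_prime = read_end if strand == "+" else read_start
            feat_pos = feat_end if strand == "+" else feat_start
            if three_prime == feat_pos:
                outfile.write(line)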
Example #3
def test_get_flanking_intron_sizes(self):
    exons = rw.read_gtf("tests/get_flanking_intron_sizes_input.gtf",
                        "exon",
                        gene=False)
    expected = {}
    expected["ENSMUST1"] = {
        "upstream": [None, 4, 4],
        "downstream": [4, 4, None]
    }
    expected["ENSMUST8"] = None
    expected["ENSMUST4"] = {
        "upstream": [None, 1, 4, 2],
        "downstream": [1, 4, 2, None]
    }
    observed = get_flanking_intron_sizes(exons)
    self.assertEqual(observed, expected)
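For reference, an implementation consistent with these expectations might look as follows (an assumption about the function under test, not the authors' code; it presumes exons are stored in ascending genomic order and ignores strand for brevity):

def get_flanking_intron_sizes(exons):
    # intron i is the gap between exon i and exon i + 1; with 1-based,
    # fully closed GTF coordinates the gap is next_start - current_end - 1
    out = {}
    for trans, exon_list in exons.items():
        if len(exon_list) < 2:
            # single-exon transcripts have no flanking introns
            out[trans] = None
            continue
        introns = [exon_list[i + 1][3] - exon_list[i][4] - 1
                   for i in range(len(exon_list) - 1)]
        out[trans] = {"upstream": [None] + introns,
                      "downstream": introns + [None]}
    return out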
Example #4
def test_get_upstream_intron_size(self):
    exons = rw.read_gtf("tests/get_upstream_intron_size_input.gtf",
                        "exon",
                        gene=False)
    exon_ranks = {
        "ENSMUST1.0": 0,
        "ENSMUST1.1": 1,
        "ENSMUST4.0": 0,
        "ENSMUST4.2": 2
    }
    expected = {
        "ENSMUST1.0": None,
        "ENSMUST1.1": 4,
        "ENSMUST4.0": None,
        "ENSMUST4.2": 4
    }
    observed = get_upstream_intron_size(exons, exon_ranks)
    self.assertEqual(expected, observed)
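Under the same assumptions, the rank-keyed variant tested here could be sketched like this (the "transcript.rank" key format is taken from the test data; this is not the authors' implementation):

def get_upstream_intron_size(exons, exon_ranks):
    # for each "transcript.rank" key, report the gap between the exon at
    # that rank and the one before it; first exons (rank 0) get None
    out = {}
    for key, rank in exon_ranks.items():
        if rank == 0:
            out[key] = None
            continue
        trans = key.rsplit(".", 1)[0]
        exon_list = exons[trans]
        out[key] = exon_list[rank][3] - exon_list[rank - 1][4] - 1
    return out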
Example #5
def get_transcripts(gtf, out_file, add_chr = False):
    """
    Given a GTF file that has exon coordinates (among others),
    make an output BED file with transcript coordinates.
    :param gtf: input GTF file
    :param out_file: output BED file name
    :param add_chr: if True, prefix "chr" to chromosome names
    :return: None
    """
    exons = rw.read_gtf(gtf, "exon", gene = False)
    with open(out_file, "w") as file:
        out_writer = csv.writer(file, delimiter = "\t")
        for trans in sorted(list(exons.keys())):
            starts = [i[3] for i in exons[trans]]
            ends = [i[4] for i in exons[trans]]
            # just any exon to get those fields that will be the same for all of them
            template = exons[trans][0]
            if add_chr:
                chrom = "chr{0}".format(template[0])
            else:
                chrom = template[0]
            # convert to BED
            to_write = [chrom, min(starts) - 1, max(ends), trans, ".", template[6]]
            out_writer.writerow(to_write)
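The min(starts) - 1 above is the GTF-to-BED coordinate conversion: GTF intervals are 1-based and fully closed, BED intervals are 0-based and half-open, so only the start coordinate shifts. A quick check of the arithmetic:

# a GTF exon at 101..200 becomes the BED interval (100, 200)
gtf_start, gtf_end = 101, 200
bed_start, bed_end = gtf_start - 1, gtf_end
assert bed_end - bed_start == gtf_end - gtf_start + 1  # both span 100 nt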
Example #6
def main():

    description = "Call peaks in a BED file of NET-seq reads."
    help_info = [
        "BED file (at least a BED6) with NET-seq reads. Should be single-nucleotide resolution (each BED region is the 3' end of a read).",
        "Ensembl GTF file for the relevant species. Ensure that chromosome names are formatted the same way in both the GTF and the BED file with reads!",
        "BED file with the coordinates of the transcripts to analyze. Only the name field is read, so the other fields can hold placeholders. The name field must contain transcript IDs from the GTF file.",
        "Name of the output file (BED file with peak coordinates).",
        "Alpha value for calling a position as having a significantly higher local read density than expected by chance. Default: 0.01.",
        "Merge distance: adjacent peaks will be merged if they are closer than this many nucleotides. Default: 21.",
        "Minimum reads per peak. Default: 10.",
        "The number of times the read position randomization should be performed for each transcript. Higher values make the significance calculation (marginally) more robust; however, they also make the programme very slow. Default: 5.",
        "Minimum length of a peak in nucleotides. Default: 5.",
        "Size of the sliding window to use when calculating the local read density. It may be sensible to set this to the same value as the merge distance. Should be an odd integer. Default: 21.",
        "The analysis will be performed this many times, with the output files numbered. Useful for running many negative control simulations at once. Default: 1.",
        "Read positions will be shuffled within each transcript before analysis. This should disrupt any signal and should give a flat peak density profile.",
        "Instead of a sliding window, adjacent non-overlapping windows will be used when calculating the local read density.",
        "When calling peaks in a given exon/intron, do not include that exon/intron in the read position randomization.",
        "When --exclude_focal is set, count an exon and its upstream intron as a single unit (except for the first exon).",
        "Don't filter out likely PCR duplicates (peaks where more than 90%% of the reads come from a single nucleotide position)."
    ]
    defaults = {4: 0.01, 5: 21, 6: 10, 7: 5, 8: 5, 9: 21, 10: 1}
    args = hk.parse_arguments(description, [
        "reads_file", "gtf", "trans_active_file", "output_file",
        "significance_threshold", "merge", "min_reads_per_peak", "iterations",
        "min_peak_length", "window_size", "runs", "neg_control", "no_slide",
        "exclude_focal", "with_ups_intron", "no_PCR_filter"
    ],
                              floats=[4],
                              ints=[5, 6, 7, 8, 9, 10],
                              flags=[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
                              detailed_help=help_info,
                              defaults=defaults)
    reads_file, gtf, trans_active_file, output_file, significance_threshold, merge, min_reads_per_peak, iterations, min_peak_length, window_size, runs, neg_control, no_slide, exclude_focal, with_ups_intron, no_PCR_filter = args.reads_file, args.gtf, args.trans_active_file, args.output_file, args.significance_threshold, args.merge, args.min_reads_per_peak, args.iterations, args.min_peak_length, args.window_size, args.runs, args.neg_control, args.no_slide, args.exclude_focal, args.with_ups_intron, args.no_PCR_filter

    print("Merge distance: {0}".format(merge))
    print("Minimum number of reads per peak: {0}".format(min_reads_per_peak))
    print("Minimum peak length: {0}".format(min_peak_length))
    print("Window size: {0}".format(window_size))
    print("Significance level: {0}".format(significance_threshold))
    print("Randomization iterations to perform: {0}".format(iterations))
    print("Runs: {0}".format(runs))

    neg_str = ""
    if neg_control:
        neg_str = "_neg_control"

    slide_str = ""
    if no_slide:
        slide_str = "_no_slide"

    intron_str = ""
    if with_ups_intron:
        intron_str = "w_ups_intr"

    # 0. make a BED file with the coordinates of transcripts

    transcripts_file = "{0}_transcripts.bed".format(gtf[:-4])
    co.get_transcripts(gtf, transcripts_file, add_chr=True)
    exons = rw.read_gtf(gtf, "exon")

    # 1. intersect the two files, loop over the result and make a
    # dictionary of reads per position for each transcript that has reads

    reads_per_pos = get_reads_per_pos(reads_file, transcripts_file)
    # only leave transcriptionally active genes (one isoform per gene)
    trans_active_genes = rw.read_many_fields(trans_active_file, "\t")[1:]
    # pull out the column with transcript IDs
    trans_active_genes = [i[3] for i in trans_active_genes]
    reads_per_pos = {
        i: reads_per_pos[i]
        for i in reads_per_pos if i.split(".")[-1] in trans_active_genes
    }

    for sim in range(runs):

        print("**********{0}**********".format(sim))

        # 2. for each transcript, randomly reshuffle the reads and calculate the
        # nth percentile depending on what the significance threshold is
        # keep positions that are higher than that threshold and write to BED file

        raw_peak_bed = "{0}_{1}_raw_peaks{2}_{3}_{4}{5}{6}{7}_{8}_sim.bed".format(
            reads_file[:-4],
            gtf.split("/")[-1][:-4], iterations, min_reads_per_peak,
            window_size, neg_str, intron_str, slide_str, sim)
        read_count_file = "{0}_{1}_read_counts{2}_{3}{4}{5}_{6}_sim.txt".format(
            reads_file[:-4],
            gtf.split("/")[-1][:-4], iterations, window_size, neg_str,
            intron_str, sim)
        new_reads_file = write_raw_peaks(reads_per_pos,
                                         raw_peak_bed,
                                         read_count_file,
                                         exons,
                                         iterations=iterations,
                                         min_read_count=min_reads_per_peak,
                                         window_size=window_size,
                                         neg_control=neg_control,
                                         no_slide=no_slide,
                                         exclude_focal=exclude_focal,
                                         with_ups_intron=with_ups_intron)
        if neg_control:
            reads_file = new_reads_file

        # 3. merge peaks

        merged_peak_bed = "{0}_{1}_merged_peaks{2}_{3}_{4}{5}{6}{7}_{8}_sim.bed".format(
            reads_file[:-4],
            gtf.split("/")[-1][:-4], iterations, window_size, merge, neg_str,
            slide_str, intron_str, sim)
        co.merge_bed(raw_peak_bed, merged_peak_bed, merge)
        print("Before filtering, there are {0} peaks.".format(
            hk.line_count(merged_peak_bed)))

        # 4. filter out peaks that don't have enough reads or are too short.
        # Write final results to file and also write a stats file with the size,
        # read count and overlapping transcript of the peaks

        stats_file = "{0}_stats_{1}_sim.txt".format(output_file[:-4], sim)
        filter_peaks(merged_peak_bed,
                     reads_file,
                     read_count_file,
                     "{0}_{1}_sim.bed".format(output_file[:-4], sim),
                     min_reads_per_peak,
                     min_peak_length,
                     stats_file,
                     no_PCR_filter=no_PCR_filter)
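Step 2 rests on a per-transcript randomization test. A simplified sketch of the idea, assuming reads_per_pos maps each position of a single transcript to its read count (the real write_raw_peaks additionally handles exon/intron structure, focal exclusion and the sliding/non-sliding window switch):

import random

def null_density_threshold(reads_per_pos, transcript_length, window_size,
                           alpha=0.01, iterations=5):
    # shuffle read positions uniformly within the transcript, recompute the
    # sliding-window density each time, and take the (1 - alpha) quantile
    # of the null densities as the peak-calling threshold
    read_count = sum(reads_per_pos.values())
    null_densities = []
    for _ in range(iterations):
        counts = [0] * transcript_length
        for _ in range(read_count):
            counts[random.randrange(transcript_length)] += 1
        half = window_size // 2
        for centre in range(transcript_length):
            window = counts[max(0, centre - half): centre + half + 1]
            null_densities.append(sum(window) / len(window))
    null_densities.sort()
    return null_densities[max(int((1 - alpha) * len(null_densities)) - 1, 0)]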
Example #7
def main():
    description = "Prepare a BED file with the TES coordinates of transcriptionally " \
                  "active genes and make a metagene of reads within this region."

    args = hk.parse_arguments(description, ["trans_act_file", "gtf", "start_coord", "end_coord", "outname", "reads_file"], ints = [2, 3])
    trans_act_file, gtf, start_coord, end_coord, outname, reads_file = args.trans_act_file, args.gtf, args.start_coord, args.end_coord, args.outname, args.reads_file

    trans_act_genes = []
    with open(trans_act_file) as f:
        reader = csv.reader(f, delimiter = "\t")
        for line in reader:
            trans_act_genes.append(line[3])

    exons = rw.read_gtf(gtf, "exon")
    CDSs = rw.read_gtf(gtf, "CDS")

    exons = {i: exons[i] for i in exons if i in trans_act_genes}
    # protein-coding only
    exons = {i: exons[i] for i in exons if i in CDSs}

    ds_500 = "{0}_ds_500.bed".format(outname[:-4])
    with open(outname, "w") as out, open(ds_500, "w") as out_ds:
        writer = csv.writer(out, delimiter="\t")
        writer_ds = csv.writer(out_ds, delimiter="\t")
        for trans in exons:
            strand = exons[trans][0][6]
            chrom = "chr{0}".format(exons[trans][0][0])
            if strand == "+":
                TES = exons[trans][-1][4]
                new_start = TES - start_coord
                new_end = TES + end_coord
                new_start_ds = TES
                new_end_ds = TES + 500
            else:
                TES = exons[trans][-1][3]
                new_start = TES - end_coord - 1
                new_end = TES + start_coord - 1
                new_start_ds = TES - 500 - 1
                new_end_ds = TES - 1
            writer.writerow([chrom, new_start, new_end, trans, "0", strand])
            # str.lstrip("chr") would strip any of the characters 'c', 'h', 'r',
            # so slice the prefix off instead
            chrom = chrom[3:] if chrom.startswith("chr") else chrom
            writer_ds.writerow([chrom, new_start_ds, new_end_ds, trans, "0", strand])

    transcripts_file = "{0}_transcripts.bed".format(gtf[:-4])
    # make the transcripts file before intersecting with it
    co.get_transcripts(gtf, transcripts_file, with_detail=True)
    mapping = co.transcript_mapping(transcripts_file)

    intersect = "{0}_ds500_intersect.bed".format(outname[:-4])
    co.intersect_bed(ds_500, transcripts_file, write_both = True, force_strand=False, no_dups = False, output_file=intersect)

    to_exclude = []
    with open(intersect) as int_file:
        reader = csv.reader(int_file, delimiter = "\t")
        for line in reader:
            strand = line[5]
            curr_gene = mapping[line[3]]
            other_gene = mapping[line[9]]
            if curr_gene != other_gene:
                to_exclude.append(line[3])

    filtered_out_name = "{0}_filt.txt".format(outname[:-4])
    with open(filtered_out_name, "w") as filt_f:
        for name in to_exclude:
            filt_f.write("{0}\n".format(name))

    final_out_name = "{0}_distrib.bed".format(outname[:-4])

    distances = co.peak_pos_in_exon(outname, reads_file, from_end = True, reads_mode = True)[0]
    write_dist_mat(distances, start_coord + end_coord, final_out_name, None, "{0}_names.txt".format(final_out_name[:-4]), None)
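The distance matrix written at the end is the input for the metagene: one row per transcript, one column per position in the start_coord + end_coord window around the TES. A minimal sketch of the averaging step, assuming a tab-delimited matrix with NA for missing cells:

import numpy as np

def metagene(dist_mat_file):
    # column-wise mean across transcripts, ignoring NA cells
    mat = np.genfromtxt(dist_mat_file, delimiter="\t",
                        missing_values="NA", filling_values=np.nan)
    return np.nanmean(mat, axis=0)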
Example #8
def main():
    description = "Record splicing distance."
    args = hk.parse_arguments(description, ["input_file", "gtf", "output_folder", "trans_active_file", "window_size", "intron_window_size", "outsuffix", "leave_terminal"], ints = [4, 5], flags = [7])
    input_file, gtf, output_folder, trans_active_file, window_size, intron_window_size, outsuffix, leave_terminal = args.input_file, args.gtf, args.output_folder, args.trans_active_file, args.window_size, args.intron_window_size, args.outsuffix, args.leave_terminal

    if outsuffix == "None":
        outsuffix = ""

    bare_input_path = input_file.split("/")[-1]
    bed = "{0}.bed".format(input_file[:-4])
    # hk.convert2bed(input_file, bed)

    # get descriptive stats of the reads
    length_file = "{0}/{1}_read_lengths.txt".format(output_folder, bare_input_path[:-4])
    write_read_lengths(bed, length_file)

    # read in CDS coordinates
    exons = rw.read_gtf(gtf, "CDS", gene=False)
    # only leave transcriptionally active genes (one isoform per gene)
    trans_active_genes = rw.read_many_fields(trans_active_file, "\t")[1:]
    # pull out the column with transcript IDs
    trans_active_genes = [i[3] for i in trans_active_genes]
    exons = {i: exons[i] for i in exons if i in trans_active_genes}
    terminal_suff = "_with_terminal"
    if not leave_terminal:
        # remove last exons
        exons = {i: exons[i][:-1] for i in exons}
        terminal_suff = ""
    # prepare exon-exon junctions
    exon_junctions_file = "{0}_exon_junctions{1}{2}.bed".format(gtf[:-4], outsuffix, terminal_suff)
    all_junctions = co.extract_3ss(exons, exon_junctions_file)

    out_bed = "{0}/{1}_first_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed, exons, window_size, add_chr=True)
    out_bed_end = "{0}/{1}_last_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed_end, exons, window_size, add_chr=True, from_end=True)
    intron_bed = "{0}/{1}_first_{2}_intronic_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], intron_window_size, outsuffix, terminal_suff)
    write_intron_starts(all_junctions, intron_bed, exons, intron_window_size, add_chr=True)
    out_bed = "{0}/{1}_first_centred_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed, exons, window_size, add_chr=True, centre=True)
    out_bed_end = "{0}/{1}_last_centred_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed_end, exons, window_size, add_chr=True, from_end=True, centre=True)
    out_bed_si = "{0}/{1}_si_pos{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff)
    write_si_pos(all_junctions, out_bed_si, exons, add_chr=True)
    out_bed_si_current = "{0}/{1}_si_pos_current{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff)
    write_si_pos(all_junctions, out_bed_si_current, exons, add_chr=True, curr_exon=True)
    # check which junctions are associated with a splicing intermediate read
    snr_bed = "{0}_snr.bed".format(bed[:-4])
    co.snr_bed(bed, snr_bed)
    si_counts_bed = "{0}/{1}_si_counts{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff)
    co.intersect_bed(out_bed_si, snr_bed, force_strand=True, hit_count=True, no_dups=False, output_file=si_counts_bed)
    si_counts_current_bed = "{0}/{1}_si_counts_current{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff)
    co.intersect_bed(out_bed_si_current, snr_bed, force_strand=True, hit_count=True, no_dups=False, output_file=si_counts_current_bed)

    # filter out reads that don't overlap exon-exon junctions
    exon_junction_bed = "{0}_exon_junctions{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff)
    co.intersect_bed(bed, exon_junctions_file, write_both=True,
                     output_file=exon_junction_bed,
                     force_strand=True, no_dups=False)

    spliced_bed = "{0}_spliced{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff)
    unspliced_bed = "{0}_unspliced{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff)
    sr_distances = {}
    ur_distances = {}
    found_count = 0
    file_size = hk.line_count(exon_junction_bed)

    # will store all the intron names for which there are
    # either spliced or unspliced reads
    valid_junctions = []
    with open(exon_junction_bed) as file, open(spliced_bed, "w") as sfile, open(unspliced_bed, "w") as ufile:
        for pos, line in enumerate(file):

            if pos % 100000 == 0:
                print("{0}/{1}".format(pos, file_size))
                print("Found {0} spliced reads.".format(found_count))
                print("\n")

            line = line.split("\t")

            # reads that end at the last nucleotide of an exon
            intermediate_read = NGS.check_intermediate_read(line, exons)
            intron_name = line[20]

            if not intermediate_read:

                # check that it ends within the exon just downstream of
                # the 3' ss that is being analyzed

                in_dwns_exon = NGS.check_position_in_exon(line, exons)

                if in_dwns_exon:

                    # 'spliced', 'unspliced' or 'None' (=can't analyze)
                    read_type = NGS.analyze_cigar(line, overhang = 5)

                    if read_type:
                        if intron_name not in valid_junctions:
                            valid_junctions.append(intron_name)
                        splice_dist = NGS.get_splice_dist(line)
                        if read_type == "S":
                            sfile.write("\t".join([str(i) for i in line]))
                            found_count = found_count + 1
                            sr_distances = update_dist_dict(intron_name, sr_distances, splice_dist)
                        else:
                            ufile.write("\t".join([str(i) for i in line]))
                            ur_distances = update_dist_dict(intron_name, ur_distances, splice_dist)

    print("Proportion of spliced reads: {0}.".format(found_count/(pos + 1)))

    # for each valid junction, calculate the length of the downstream exonic
    # sequence, so that intronic sequence is not counted in the distance
    # matrix
    lengths_dict = co.get_lengths(exons, valid_junctions)

    write_dist_mat(sr_distances, window_size,
                   "{0}/{1}_spliced_read_distances_{2}{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff),
                   lengths_dict,
                   "{0}/{1}_spliced_read_{2}_intron_names{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff),
                   "{0}/{1}_spliced_read_first_spliced{2}{3}.txt".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff))

    write_dist_mat(ur_distances, window_size,
                   "{0}/{1}_unspliced_read_distances_{2}{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff),
                   lengths_dict,
                   "{0}/{1}_unspliced_read_{2}_intron_names{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff),
                   "{0}/{1}_unspliced_read_first_unspliced{2}{3}.txt".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff))
Example #9
def main():
    description = "Aggregate various statistics on the splicing events you're studying."
    args = hk.parse_arguments(description, [
        "gtf", "polII_bed", "exon_start_coords", "truncated_exons_file",
        "genome_file", "output_file"
    ])
    gtf, polII_bed, exon_start_coords, truncated_exons_file, genome_file, output_file = args.gtf, args.polII_bed, args.exon_start_coords, args.truncated_exons_file, args.genome_file, args.output_file

    CDSs = rw.read_gtf(gtf, "CDS", gene=False)
    exons = rw.read_gtf(gtf, "exon", gene=False)
    exon_starts = rw.read_many_fields(exon_start_coords,
                                      skip_header=False,
                                      delimiter="\t")
    exon_starts = {i[3]: i for i in exon_starts}
    out_array = np.array(sorted(exon_starts.keys()), dtype="str")
    out_array.shape = (len(exon_starts.keys()), 1)
    out_array = np.vstack((["junction"], out_array))

    #1. exon size
    curr_dict = co.get_lengths(CDSs, exon_starts.keys())
    out_array = add_to_array(out_array, curr_dict, "exon_size")
    print("Exon size done.")

    #2. exon number
    curr_dict = co.get_exon_number(exons, exon_starts.keys())
    out_array = add_to_array(out_array, curr_dict, "exon_number")
    print("Exon number done.")

    #3. exon rank (from start and end)
    exon_rank_start, exon_rank_end = co.get_exon_rank(exons, exon_starts)
    out_array = add_to_array(out_array, exon_rank_start,
                             "exon_rank_from_start")
    out_array = add_to_array(out_array, exon_rank_end, "exon_rank_from_end")
    print("Exon rank done.")

    #4. upstream intron size
    curr_dict = co.get_upstream_intron_size(exons, exon_rank_start)
    out_array = add_to_array(out_array, curr_dict, "upstream_intron_size")
    curr_dict = co.get_upstream_intron_size(exons,
                                            exon_rank_start,
                                            downstream=True)
    out_array = add_to_array(out_array, curr_dict, "downstream_intron_size")
    print("Intron size done.")

    if truncated_exons_file != "None":

        #5. Pol II density per transcript
        dens_per_trans_file = "{0}_dens_per_trans.txt".format(polII_bed[:-4])
        dens_per_trans_junctions = get_dens_per_trans(truncated_exons_file,
                                                      polII_bed,
                                                      dens_per_trans_file,
                                                      out_array[1:, 0])
        out_array = add_to_array(out_array, dens_per_trans_junctions,
                                 "polII_dens_per_trans")
        print("Pol II density done.")

    #6. exon GC4 and GC content
    genome = Fasta(genome_file)
    curr_dict = get_exon_GC4(CDSs, exons, exon_rank_start, genome)
    out_array = add_to_array(out_array, curr_dict, "exon_GC4")
    curr_dict = get_exon_GC(exons, exon_rank_start, genome)
    out_array = add_to_array(out_array, curr_dict, "exon_GC")
    print("Exon GC done.")

    #7. upstream intron GC content
    curr_dict = get_upstream_intron_GC(exons, exon_rank_start, genome)
    out_array = add_to_array(out_array, curr_dict, "upstream_intron_GC")
    print("Intron GC done.")

    #8. splice site strength
    curr_dict = nc.get_ss_strength(exons,
                                   genome_file,
                                   upstream=True,
                                   five=True,
                                   exonic=3,
                                   intronic=6)
    out_array = add_to_array(out_array, curr_dict, "upstream_5ss_strength")
    curr_dict = nc.get_ss_strength(exons,
                                   genome_file,
                                   upstream=True,
                                   five=False,
                                   exonic=3,
                                   intronic=20)
    out_array = add_to_array(out_array, curr_dict, "upstream_3ss_strength")
    curr_dict = nc.get_ss_strength(exons,
                                   genome_file,
                                   upstream=False,
                                   five=True,
                                   exonic=3,
                                   intronic=6)
    out_array = add_to_array(out_array, curr_dict, "downstream_5ss_strength")
    print("Splice site strength done.")

    with open(output_file, "w") as file:
        for line in range(0, out_array.shape[0]):
            line = out_array[line, :]
            line = "\t".join([str(i) for i in line])
            file.write(line)
            file.write("\n")
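add_to_array does the bookkeeping throughout this example; its assumed behaviour, inferred from these call sites rather than the authors' implementation, is to append one named column to the matrix:

import numpy as np

def add_to_array(out_array, curr_dict, header):
    # append a column headed `header`: one value per junction ID in column 0
    # (below the "junction" header row), with "NA" for missing junctions
    column = [[header]] + [[str(curr_dict.get(key, "NA"))]
                           for key in out_array[1:, 0]]
    return np.hstack((out_array, np.array(column, dtype="str")))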