Esempio n. 1
0
def count_reads_in_features(sam_filenames, gff_filename, samtype, order,
                            max_buffer_size, stranded, overlap_mode,
                            multimapped_mode, secondary_alignment_mode,
                            supplementary_alignment_mode, feature_type,
                            id_attribute, additional_attributes, quiet,
                            minaqual, samouts):
    """Count alignments per annotated feature and print a count table.

    The GFF file is loaded into a genomic array of feature IDs; every input
    file is then scanned and each read (or read pair) is assigned to the set
    of features its matched CIGAR segments overlap.  The result is printed to
    standard output as tab-separated rows: one row per feature (ID, the
    additional attributes, one count column per input file) followed by the
    rows ``__no_feature``, ``__ambiguous``, ``__too_low_aQual``,
    ``__not_aligned`` and ``__alignment_not_unique``.

    Args:
        sam_filenames: Input file names.  A list holding only ``'-'`` reads
            from standard input.
        gff_filename: Annotation file handed to ``HTSeq.GFF_Reader``.
        samtype: ``"sam"`` or ``"bam"``.
        order: ``"name"`` or ``"pos"``; only consulted for paired-end data.
        max_buffer_size: Forwarded to
            ``HTSeq.pair_SAM_alignments_with_buffer`` when ``order == "pos"``.
        stranded: ``"no"`` builds an unstranded feature array; ``"reverse"``
            inverts the strand of the read before the lookup; any other value
            uses the read strand as is.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        multimapped_mode: ``'none'`` (count only unambiguous hits and skip
            reads whose NH tag is > 1) or ``'all'`` (count every hit feature).
        secondary_alignment_mode: ``'ignore'`` skips secondary alignments.
        supplementary_alignment_mode: ``'ignore'`` skips supplementary
            alignments.
        feature_type: GFF feature type to load (other types are ignored).
        id_attribute: GFF attribute used as feature ID.
        additional_attributes: Extra GFF attributes printed next to each ID
            (empty string when a feature lacks the attribute).
        quiet: Suppress progress messages on standard error.
        minaqual: Alignments with a lower alignment quality are counted as
            ``__too_low_aQual``.
        samouts: ``""`` for no annotated output, otherwise one output file
            name per entry of ``sam_filenames``.

    Raises:
        ValueError: mismatching number of inputs/outputs, a feature without
            ``id_attribute``, a strandless feature in stranded mode, an
            unknown ``samtype`` or an illegal ``order``.
        SystemExit: illegal ``overlap_mode`` or ``multimapped_mode``.
    """
    def write_to_samout(r, assignment, samoutfile):
        # Append the XF:Z assignment tag to the original SAM line(s).
        # ``pe_mode`` is read from the enclosing scope (set per input file).
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() + "\tXF:Z:" +
                                 assignment + "\n")

    if samouts != "":
        if len(samouts) != len(sam_filenames):
            raise ValueError(
                'Select the same number of SAM input and output files')
        # Try to open samout files early in case any of them has issues
        for samout in samouts:
            with open(samout, 'w'):
                pass

    # Try to open samfiles to fail early in case any of them is not there
    if (len(sam_filenames) != 1) or (sam_filenames[0] != '-'):
        for sam_filename in sam_filenames:
            with open(sam_filename):
                pass

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch)
    com = ('M', '=', 'X')

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    gff = HTSeq.GFF_Reader(gff_filename)
    counts = {}
    attributes = {}
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError(
                        "Feature %s does not contain a '%s' attribute" %
                        (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'."
                        % (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0
                attributes[feature_id] = [
                    f.attr[attr] if attr in f.attr else ''
                    for attr in additional_attributes
                ]
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        # Report where the GFF parsing stopped, then propagate unchanged
        sys.stderr.write("Error occured when processing GFF file (%s):\n" %
                         gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)

    if len(counts) == 0:
        sys.stderr.write("Warning: No features of type '%s' found.\n" %
                         feature_type)

    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    counts_all = []
    empty_all = []
    ambiguous_all = []
    notaligned_all = []
    lowqual_all = []
    nonunique_all = []
    for isam, (sam_filename) in enumerate(sam_filenames):
        if samouts != '':
            samoutfile = open(samouts[isam], 'w')
        else:
            samoutfile = None

        try:
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                # stdin cannot be rewound: peek at the first read, then chain
                # it back in front of the remaining records
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except:
            sys.stderr.write(
                "Error occured when reading beginning of SAM/BAM file.\n")
            # Do not leave the annotated-output handle open on failure
            if samoutfile is not None:
                samoutfile.close()
            raise

        try:
            if pe_mode:
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                        read_seq, max_buffer_size=max_buffer_size)
                else:
                    raise ValueError("Illegal order specified.")
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d SAM alignment record%s processed.\n" %
                                     (i, "s" if not pe_mode else " pairs"))

                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned", samoutfile)
                        continue
                    if ((secondary_alignment_mode == 'ignore')
                            and r.not_primary_alignment):
                        continue
                    if ((supplementary_alignment_mode == 'ignore')
                            and r.supplementary):
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique",
                                            samoutfile)
                            if multimapped_mode == 'none':
                                continue
                    except KeyError:
                        # No NH tag: treat the alignment as unique
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual", samoutfile)
                        continue
                    # Both strand branches accept every match operation in
                    # ``com``, like the paired-end code below
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r.cigar
                                  if co.type in com and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                                  if (co.type in com and co.size > 0))
                else:
                    if r[0] is not None and r[0].aligned:
                        if stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r[0].cigar
                                      if co.type in com and co.size > 0)
                        else:
                            iv_seq = (invert_strand(co.ref_iv)
                                      for co in r[0].cigar
                                      if co.type in com and co.size > 0)
                    else:
                        iv_seq = tuple()
                    # The second mate is handled with the opposite strand
                    # orientation of the first mate
                    if r[1] is not None and r[1].aligned:
                        if stranded != "reverse":
                            iv_seq = itertools.chain(
                                iv_seq, (invert_strand(co.ref_iv)
                                         for co in r[1].cigar
                                         if co.type in com and co.size > 0))
                        else:
                            iv_seq = itertools.chain(
                                iv_seq, (co.ref_iv for co in r[1].cigar
                                         if co.type in com and co.size > 0))
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            write_to_samout(r, "__not_aligned", samoutfile)
                            notaligned += 1
                            continue
                    # From here on at least one mate exists: when the first
                    # one is missing the second one is present and aligned,
                    # so use it for the flag checks instead of crashing
                    mate = r[0] if r[0] is not None else r[1]
                    if ((secondary_alignment_mode == 'ignore')
                            and mate.not_primary_alignment):
                        continue
                    if ((supplementary_alignment_mode == 'ignore')
                            and mate.supplementary):
                        continue
                    try:
                        if ((r[0] is not None
                             and r[0].optional_field("NH") > 1)
                                or (r[1] is not None
                                    and r[1].optional_field("NH") > 1)):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique",
                                            samoutfile)
                            if multimapped_mode == 'none':
                                continue
                    except KeyError:
                        # No NH tag: treat the pair as unique
                        pass
                    if ((r[0] and r[0].aQual < minaqual)
                            or (r[1] and r[1].aQual < minaqual)):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual", samoutfile)
                        continue

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode in ("intersection-strict",
                                          "intersection-nonempty"):
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                # "nonempty" skips steps without features,
                                # "strict" intersects with them as well
                                if ((len(fs2) > 0) or
                                    (overlap_mode == "intersection-strict")):
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")

                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "__no_feature", samoutfile)
                        empty += 1
                    elif len(fs) > 1:
                        write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]",
                                        samoutfile)
                        ambiguous += 1
                    else:
                        write_to_samout(r, list(fs)[0], samoutfile)

                    if fs is not None and len(fs) > 0:
                        if multimapped_mode == 'none':
                            if len(fs) == 1:
                                counts[list(fs)[0]] += 1
                        elif multimapped_mode == 'all':
                            for fsi in list(fs):
                                counts[fsi] += 1
                        else:
                            sys.exit("Illegal multimap mode.")

                except UnknownChrom:
                    # Chromosome absent from the annotation: no feature
                    write_to_samout(r, "__no_feature", samoutfile)
                    empty += 1

        except:
            sys.stderr.write(
                "Error occured when processing SAM input (%s):\n" %
                read_seq_file.get_line_number_string())
            # Do not leave the annotated-output handle open on failure
            if samoutfile is not None:
                samoutfile.close()
            raise

        if not quiet:
            sys.stderr.write(
                "%d SAM %s processed.\n" %
                (i, "alignments " if not pe_mode else "alignment pairs"))

        if samoutfile is not None:
            samoutfile.close()

        # Store this file's counts and reset the shared table for the next one
        counts_all.append(counts.copy())
        for fn in counts:
            counts[fn] = 0
        empty_all.append(empty)
        ambiguous_all.append(ambiguous)
        lowqual_all.append(lowqual)
        notaligned_all.append(notaligned)
        nonunique_all.append(nonunique)

    # Special rows carry empty cells in the additional-attribute columns
    pad = ['' for attr in additional_attributes]
    for fn in sorted(counts.keys()):
        print('\t'.join([fn] + attributes[fn] +
                        [str(c[fn]) for c in counts_all]))
    print('\t'.join(["__no_feature"] + pad + [str(c) for c in empty_all]))
    print('\t'.join(["__ambiguous"] + pad + [str(c) for c in ambiguous_all]))
    print('\t'.join(["__too_low_aQual"] + pad + [str(c) for c in lowqual_all]))
    print('\t'.join(["__not_aligned"] + pad + [str(c)
                                               for c in notaligned_all]))
    print('\t'.join(["__alignment_not_unique"] + pad +
                    [str(c) for c in nonunique_all]))
Esempio n. 2
0
lHitIndex = [-1, -1]
lSuffix = [".ja.txt", ".ta.txt", ".jaraw.txt"]
nri = 0

#==============================================================================
# main script
#==============================================================================


# Load the GTF into a dict keyed by transcript id so transcripts can be
# looked up quickly later on.
# NOTE: exon ordering is not verified here yet.

if args.b_verbose:
	sys.stderr.write("> parsing GTF file...\n")

gff = hts.GFF_Reader(args.reference)
nri = 0
for feature in gff:
	if feature.type == "exon":
		nri += 1
		# progress report every 2048 exons
		if args.b_verbose and nri % 2048 == 0:
			sys.stderr.write("\r> features parsed: %d  " % nri)

		szTid = feature.attr['transcript_id']
		if szTid not in dGtf:
			# first exon of this transcript: create its record
			dGtf[szTid] = {
				'features': [],
				'junctions': [],
				'strand': feature.iv.strand,
				'gene_id': "",
			}
Esempio n. 3
0
def count_reads_in_features(sam_filename, gff_filename, samtype, order,
                            stranded, overlap_mode, feature_type, id_attribute,
                            quiet, minaqual, samout, include_non_annotated,
                            htseq_no_ambiguous, outputDiscarded):
    """
    Tag every alignment with the feature it overlaps (XF tag) and write it
    to a SAM/BAM output file; return the number of records written there.

    This is taken from the function count_reads_in_features() from the 
    script htseq-count in the HTSeq package version 0.61.p2 
    The reason to do so is to fix two really small bugs related to the SAM output.
    The code of the function is small and simple so for now we
    will use the patched function here. A patch request has been sent
    to the HTSeq team.
    The description of the parameters are the same as htseq-count.
    Two parameters were added to filter out what to write in the sam output

    Parameters
        sam_filename: input SAM/BAM file.
        gff_filename: annotation file read with HTSeq.GFF_Reader.
        samtype: "sam" or "bam" (selects reader and output format).
        order: accepted for compatibility; not used by this function.
        stranded: "no", "reverse" or another value (read strand used as is).
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty".
        feature_type: GFF feature type to load.
        id_attribute: GFF attribute used as feature ID.
        quiet: accepted for compatibility; not used by this function.
        minaqual: alignments below this quality are tagged __too_low_aQual.
        samout: path of the annotated output file.
        include_non_annotated: when False, __no_feature records are treated
            as discarded.
        htseq_no_ambiguous: when True, __ambiguous records are treated as
            discarded.
        outputDiscarded: optional path receiving the discarded records.

    Raises
        ValueError: feature without id_attribute, strandless feature in
            stranded mode, or unknown samtype.
        RuntimeError: no feature of the requested type, input that cannot
            be opened, or illegal overlap mode.

    State is kept in attributes of this function object (filter_htseq,
    filter_htseq_no_ambiguous, samoutfile, samdiscarded, annotated).

    The HTSEQ License
    HTSeq is free software: you can redistribute it and/or modify it under the terms of 
    the GNU General Public License as published by the Free Software Foundation, 
    either version 3 of the License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful, 
    but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 
    or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

    The full text of the GNU General Public License, version 3, 
    can be found here: http://www.gnu.org/licenses/gpl-3.0-standalone.html
    """
    # Set up the filters
    count_reads_in_features.filter_htseq = [
        "__too_low_aQual", "__not_aligned", "__alignment_not_unique"
    ]
    if not include_non_annotated:
        count_reads_in_features.filter_htseq.append("__no_feature")
    count_reads_in_features.filter_htseq_no_ambiguous = htseq_no_ambiguous

    # Open SAM/BAM output file(s) using the input file as template
    flag_write = "wb" if samtype == "bam" else "wh"
    flag_read = "rb" if samtype == "bam" else "r"
    saminfile = pysam.AlignmentFile(sam_filename, flag_read)
    count_reads_in_features.samoutfile = pysam.AlignmentFile(
        samout, flag_write, template=saminfile)
    if outputDiscarded is not None:
        count_reads_in_features.samdiscarded = pysam.AlignmentFile(
            outputDiscarded, flag_write, template=saminfile)
    saminfile.close()

    # Counter of annotated records
    count_reads_in_features.annotated = 0

    # Function to write to SAM output
    def write_to_samout(read, assignment):
        # Creates the PySAM record
        # to_pysam_AlignedSegment is the new method in HTSeq>=0.7.0 that
        # uses the latest Pysam API and reports the correct sequences
        sam_record = read.to_pysam_AlignedSegment(
            count_reads_in_features.samoutfile)
        sam_record.set_tag("XF", assignment, "Z")
        discard_ambiguous = (
            count_reads_in_features.filter_htseq_no_ambiguous
            and "__ambiguous" in assignment)
        if assignment not in count_reads_in_features.filter_htseq \
                and not discard_ambiguous:
            count_reads_in_features.samoutfile.write(sam_record)
            count_reads_in_features.annotated += 1
        elif outputDiscarded is not None:
            count_reads_in_features.samdiscarded.write(sam_record)

    # Everything below runs with the output files open; the finally clause
    # closes them on success and on any error alike
    try:
        # Annotation objects
        features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
        counts = {}
        gff = HTSeq.GFF_Reader(gff_filename)

        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError(
                        "Feature %s does not contain a '%s' attribute"
                        % (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'."
                        % (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0

        if len(counts) == 0:
            raise RuntimeError(
                "No features of type '%s' found.\n" % feature_type)

        if samtype == "sam":
            SAM_or_BAM_Reader = HTSeq.SAM_Reader
        elif samtype == "bam":
            SAM_or_BAM_Reader = HTSeq.BAM_Reader
        else:
            raise ValueError(
                "Unknown input format %s specified." % samtype)

        try:
            read_seq = SAM_or_BAM_Reader(sam_filename)
        except Exception:
            raise RuntimeError(
                "Error occurred when reading beginning of SAM/BAM file.")

        for r in read_seq:
            if r.aQual < minaqual:
                write_to_samout(r, "__too_low_aQual")
                continue
            # Only alignment-match ("M") CIGAR segments take part in the
            # overlap computation
            if stranded != "reverse":
                iv_seq = (co.ref_iv for co in r.cigar
                          if co.type == "M" and co.size > 0)
            else:
                iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                          if co.type == "M" and co.size > 0)
            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode in ("intersection-strict",
                                      "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            # "nonempty" skips steps without features,
                            # "strict" intersects with them as well
                            if len(fs2) > 0 \
                                    or overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    raise RuntimeError("Illegal overlap mode.")

                if fs is None or len(fs) == 0:
                    write_to_samout(r, "__no_feature")
                elif len(fs) > 1:
                    write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
                else:
                    write_to_samout(r, list(fs)[0])

            except UnknownChrom:
                # Reads on a chromosome absent from the annotation are not
                # written to either output
                pass

        return count_reads_in_features.annotated
    finally:
        count_reads_in_features.samoutfile.close()
        if outputDiscarded is not None:
            count_reads_in_features.samdiscarded.close()
def main(
        intron_annotation_path="S_cerevisiae.R64-2-1_introns_verifiedcoding_nomito_no5utr.gff",
        swap_alignment_strand=True,
        closed_intervals=True,
        alignment_path="test.bam",
        min_overhang=4,
        sample_id="test_sample",
        group_id="test_group",
        output_path="results.tsv"):
    """Classify alignments overlapping annotated introns and tabulate them.

    Every aligned record that overlaps exactly one intron is put into one of
    these per-intron counters:

    * ``spliced``: CIGAR is M-N-M, the N segment matches the intron exactly
      and both flanking matches are at least ``min_overhang`` long.
    * ``intronic``: a single M segment lying entirely inside the intron.
    * ``junction_five`` / ``junction_three``: a single M segment extending
      at least ``min_overhang`` on both sides of the intron start or end;
      which counter is used depends on the alignment strand.
    * ``ambiguous``: everything else, including alignments overlapping
      several introns.

    Alignments overlapping no intron, and unaligned records, are ignored.

    Args:
        intron_annotation_path: GFF file whose features carry a ``Name``
            attribute; repeated names get a ``_<n>`` suffix to stay unique.
        swap_alignment_strand: Flip the strand of every alignment before
            the lookup.
        closed_intervals: Passed to ``GFF_Reader`` as ``end_included``.
        alignment_path: BAM file to classify.
        min_overhang: Minimum number of bases required on each side of a
            junction.
        sample_id: Value written to the ``sample_id`` column.
        group_id: Value written to the ``group_id`` column.
        output_path: Destination of the tab-separated result table.
    """
    intron_annotation = htseq.GFF_Reader(intron_annotation_path,
                                         end_included=closed_intervals)
    alignments = htseq.BAM_Reader(alignment_path)

    # Make a counter for the number of times an intron name is
    # seen, so we can look up when to append a count to the
    # intron name in order to keep intron names unique
    intron_id_counts = collections.Counter()
    for feature in intron_annotation:
        intron_id_counts.update([feature.attr["Name"]])

    seen_intron_ids = collections.Counter()
    intron_intervals = dict()
    intron_map = htseq.GenomicArrayOfSets("auto", stranded=True)

    for feature in intron_annotation:
        intron_id = feature.attr["Name"]

        # Append a count to non-unique intron names
        if intron_id_counts[intron_id] > 1:
            seen_intron_ids.update([intron_id])
            intron_id = f"{intron_id}_{seen_intron_ids[intron_id]}"
            feature.attr["Name"] = intron_id

        intron_intervals[intron_id] = feature.iv

        # Update map of genomic position to overlapping annotations with intron name
        intron_map[feature.iv] += intron_id

    read_counts = {
        alignment_type: collections.Counter()
        for alignment_type in ("ambiguous", "spliced", "intronic",
                               "junction_five", "junction_three")
    }

    # NOTE: writing each category to its own BAM file (htseq.BAM_Writer) was
    # disabled in the original code; only the counts are produced.

    for alignment in alignments:
        # unaligned records have no genomic interval and cannot be classified
        if not alignment.aligned:
            continue

        # when sequencing from 3' end, the strand is swapped
        if swap_alignment_strand:
            alignment.iv.strand = {"+": "-", "-": "+"}.get(alignment.iv.strand)

        # for each alignment, find overlapping introns
        overlapped_introns = set()
        for _, value in intron_map[alignment.iv].steps():
            overlapped_introns |= value

        # ignore alignments not spanning any introns
        if len(overlapped_introns) == 0:
            continue

        # mark alignments spanning multiple introns as ambiguous
        if len(overlapped_introns) > 1:
            read_counts["ambiguous"].update(overlapped_introns)
            continue

        cigar = alignment.cigar
        cigar_length = len(cigar)

        # mark alignments with complex CIGAR strings as ambiguous
        if cigar_length not in [1, 3]:
            read_counts["ambiguous"].update(overlapped_introns)
            continue

        overlapped_intron = list(overlapped_introns)[0]
        intron_iv = intron_intervals[overlapped_intron]

        # handle potentially spliced alignments
        if cigar_length == 3:
            if [x.type for x in cigar] != ["M", "N", "M"]:
                read_counts["ambiguous"].update(overlapped_introns)
                continue
            # the skipped region must coincide with the intron and both
            # flanking matches must reach the minimum overhang
            if cigar[1].ref_iv.start != intron_iv.start or \
                    cigar[1].ref_iv.end != intron_iv.end or \
                    cigar[0].ref_iv.end != intron_iv.start or \
                    cigar[2].ref_iv.start != intron_iv.end or \
                    cigar[0].size < min_overhang or \
                    cigar[2].size < min_overhang:
                read_counts["ambiguous"].update(overlapped_introns)
                continue
            read_counts["spliced"].update(overlapped_introns)
            continue

        # handle potential junction or intronic reads
        if cigar[0].type != "M":
            read_counts["ambiguous"].update(overlapped_introns)
            continue

        match_iv = cigar[0].ref_iv

        if match_iv.start >= intron_iv.start and \
                match_iv.end <= intron_iv.end:
            read_counts["intronic"].update(overlapped_introns)
            continue

        # alignment crosses the intron start coordinate
        if match_iv.start <= (intron_iv.start - min_overhang) and \
                match_iv.end >= (intron_iv.start + min_overhang):
            ({
                "+": read_counts["junction_five"],
                "-": read_counts["junction_three"]
            }.get(alignment.iv.strand)).update(overlapped_introns)
            continue
        # alignment crosses the intron end coordinate
        if match_iv.start <= (intron_iv.end - min_overhang) and \
                match_iv.end >= (intron_iv.end + min_overhang):
            ({
                "+": read_counts["junction_three"],
                "-": read_counts["junction_five"]
            }.get(alignment.iv.strand)).update(overlapped_introns)
            continue
        read_counts["ambiguous"].update(overlapped_introns)

    with open(output_path, "w") as output_file:
        output_file.write("\t".join([
            "chrom", "start", "end", "name", "score", "strand", "sample_id",
            "group_id", "spliced", "junction_5", "junction_3", "intronic",
            "ambiguous"
        ]) + "\n")
        for key, interval in intron_intervals.items():
            output_string = "\t".join([
                interval.chrom,
                str(interval.start),
                str(interval.end), key, "0", interval.strand, sample_id,
                group_id,
                str(read_counts["spliced"][key]),
                str(read_counts["junction_five"][key]),
                str(read_counts["junction_three"][key]),
                str(read_counts["intronic"][key]),
                str(read_counts["ambiguous"][key])
            ]) + "\n"
            output_file.write(output_string)
Esempio n. 5
0
#!/usr/bin/python

import sys, time, re
import HTSeq as hts

# Unstranded position -> exon-name lookup
g_exons = hts.GenomicArrayOfSets("auto", stranded=False)

# remember when the run started
n_tStart = time.time()

gr = hts.GFF_Reader(sys.argv[1])
for feature in gr:
    if feature.type == "exon":
        # name layout: transcript_id;gene_id[;gene_name];chrom
        sz_name = ";".join(
            [feature.attr['transcript_id'], feature.attr['gene_id']]
            + ([feature.attr['gene_name']]
               if "gene_name" in feature.attr else [])
            + [feature.iv.chrom])

        g_exons[feature.iv] += sz_name

        # accumulate the total length of each feature so that RPKM can be
        # calculated later on
        if sz_name not in dLengths:
            dLengths[sz_name] = 0
            dHits[sz_name] = 0

        dLengths[sz_name] += feature.iv.end - feature.iv.start
Esempio n. 6
0
            "Could not import pysam, which is needed to process BAM file (though\n"
        )
        sys.stderr.write(
            "not to process text SAM files). Please install the 'pysam' library from\n"
        )
        sys.stderr.write("https://code.google.com/p/pysam/\n")
        sys.exit(1)

# "-" selects standard input as the alignment source
if sam_file == "-":
    sam_file = sys.stdin

# Step 1: load the GFF file (as generated by aggregate_genes.py) and store
# every exonic part in a GenomicArrayOfSets

features = HTSeq.GenomicArrayOfSets("auto", stranded=stranded)
for f in HTSeq.GFF_Reader(gff_file):
    if f.type != "exonic_part":
        continue
    # features are named "<gene_id>:<exonic_part_number>"
    f.name = ":".join((f.attr['gene_id'], f.attr['exonic_part_number']))
    features[f.iv] += f

# initialise counters
num_reads = 0
counts = {
    '_empty': 0,
    '_ambiguous': 0,
    '_lowaqual': 0,
    '_notaligned': 0,
    '_ambiguous_readpair_position': 0,
}

# put a zero for each feature ID
for iv, s in features.steps():
def _strip_chr_prefix(chrom):
    """Return *chrom* without a leading "chr" prefix, if it has one."""
    # str.strip("chr") treats its argument as a SET of characters and trims
    # any of 'c', 'h', 'r' from both ends of the name; slice off the literal
    # prefix instead so only a real "chr" prefix is removed.
    if chrom.startswith("chr"):
        return chrom[3:]
    return chrom


def _markov_row(label, n_no_ir, n_ir):
    """Format one tab-separated row (label, P(no_IR), P(IR)) of the model."""
    total = n_no_ir + n_ir
    if total == 0:
        # No observations for this state: emit zeros rather than divide by 0.
        return label + "\t0.0\t0.0\n"
    return (label + "\t" + str(round(n_no_ir / float(total), 4)) + "\t" +
            str(round(n_ir / float(total), 4)) + "\n")


def intron_retention(outfile, gff_file, g_alnm, t_alnm):
    """Model intron retention (IR) events and write the result to disk.

    The function
      1. reads the introns of every transcript from ``gff_file``,
      2. reads the genome alignments (``g_alnm``) and the transcriptome
         alignments (``t_alnm``) with ``HTSeq.SAM_Reader``,
      3. for every read present in both, decides for each intron of the
         read's transcript whether it is retained, and counts the state of
         the first intron as well as the transitions between consecutive
         introns,
      4. writes the resulting probabilities to ``outfile + "_IR_markov_model"``
         and the retained intron coordinates to ``outfile + "_IR_info"``.

    Parameters:
        outfile:  prefix of the two output file names.
        gff_file: annotation passed to ``HTSeq.GFF_Reader``.
        g_alnm:   genome alignment file passed to ``HTSeq.SAM_Reader``.
        t_alnm:   transcriptome alignment file passed to ``HTSeq.SAM_Reader``.

    Returns:
        None. Progress messages are written to standard output.
    """
    timestamp_fmt = "%Y-%m-%d %H:%M:%S"

    # Read intron information from GFF file
    sys.stdout.write(strftime(timestamp_fmt) + ": Reading intron coordinates from GFF file\n")
    gff_features = HTSeq.GFF_Reader(gff_file, end_included=True)
    features = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    dict_intron_info = {}  # transcript id -> [(start, end, length), ...]
    for feature in gff_features:
        if "transcript_id" in feature.attr:
            feature_id = feature.attr['transcript_id']
        elif "Parent" in feature.attr:  # no "if feature.type == intron" to also consider trxs without intron
            info = feature.name.split(":")
            if len(info) == 1:
                feature_id = info[0]
            elif info[0] == "transcript":
                feature_id = info[1]
            else:
                continue
        else:
            continue

        # Drop everything after the first "." (e.g. a version suffix) so the
        # ids match the ones derived from the transcriptome alignment below.
        feature_id = feature_id.split(".")[0]
        dict_intron_info.setdefault(feature_id, [])

        # remove "chr" from chromosome names to be constant
        feature.iv.chrom = _strip_chr_prefix(feature.iv.chrom)

        if feature.type == "intron":
            features[feature.iv] += feature_id
            dict_intron_info[feature_id].append((feature.iv.start, feature.iv.end, feature.iv.length))

    # read primary genome alignment for each read
    sys.stdout.write(strftime(timestamp_fmt) + ": Read primary genome alignment for each read\n")
    dict_g_alnm = {}  # read name -> parse_cigar() result of its alignment
    for alnm in HTSeq.SAM_Reader(g_alnm):
        qname = alnm.read.name
        if alnm.aligned:
            dict_g_alnm[qname] = parse_cigar(alnm.cigar)

    # read primary transcriptome alignment for each read
    sys.stdout.write(strftime(timestamp_fmt) + ": Read primary transcriptome alignment for each read\n")
    dict_t_alnm = {}  # read name -> transcript id (text before the first ".")
    for alnm in HTSeq.SAM_Reader(t_alnm):
        qname = alnm.read.name
        if alnm.aligned:
            dict_t_alnm[qname] = alnm.iv.chrom.split(".")[0]

    # Count the length of Intron retention events
    sys.stdout.write(strftime(timestamp_fmt) + ": Calculating probabilities for each intron retention event\n")
    dict_first_intron_state = {False: 0, True: 0}
    dict_states = {(False, False): 0, (False, True): 0, (True, False): 0, (True, True): 0}
    dict_ir_info = {}  # transcript id -> [(intron start, intron end), ...]
    for qname, iv_seq in dict_g_alnm.items():
        if qname not in dict_t_alnm:
            continue
        primary_trx = dict_t_alnm[qname]
        dict_ir_info.setdefault(primary_trx, [])

        list_IR_positions = []
        pos = []
        ir_info = False
        try:
            length_IR = 0
            for item in iv_seq:
                # NOTE(review): the four fields are passed positionally to
                # GenomicInterval (presumably chrom, start, end, strand).
                iv = HTSeq.GenomicInterval(item[0], item[1], item[2], item[3])
                iv.chrom = _strip_chr_prefix(iv.chrom)
                for iv2, fs2 in features[iv].steps():
                    if primary_trx in fs2:
                        # Still inside an intron of this transcript:
                        # accumulate the overlap.
                        length_IR += iv2.length
                        pos.append(iv2.start)
                        pos.append(iv2.end)
                    elif length_IR != 0:
                        # The overlap just ended; it counts as a retention
                        # when its length equals the full length of one of
                        # the transcript's introns.
                        for intron in dict_intron_info[primary_trx]:
                            if length_IR == intron[2]:
                                list_IR_positions.append(min(pos))
                                list_IR_positions.append(max(pos))
                                ir_info = True
                        length_IR = 0
                        pos = []
        # TODO ??
        except UnknownChrom:
            # The read maps to a chromosome with no annotated intron:
            # treat the read as having no retention event.
            ir_info = False

        if not ir_info:
            introns = dict_intron_info.get(primary_trx)
            if introns:  # if there is an intron
                dict_first_intron_state[False] += 1
                # every further intron is a no_IR -> no_IR transition
                dict_states[(False, False)] += len(introns) - 1
            continue

        # Now, go over all introns and check with the IR events. The first
        # intron feeds the start-state counts, every later one the
        # transition counts.
        previous_state = False
        for idx, intron in enumerate(dict_intron_info[primary_trx]):
            intron_spos = intron[0]
            intron_epos = intron[1]
            current_state = any(intron_spos <= IR_pos <= intron_epos
                                for IR_pos in list_IR_positions)
            if current_state:
                dict_ir_info[primary_trx].append((intron_spos, intron_epos))
            if idx == 0:
                dict_first_intron_state[current_state] += 1
            else:
                dict_states[(previous_state, current_state)] += 1
            previous_state = current_state

    # The alignment tables are no longer needed; release them before writing.
    del dict_g_alnm
    del dict_t_alnm

    with open(outfile + "_IR_markov_model", 'w') as fout:
        fout.write("succedent\tno_IR\tIR\n")
        fout.write(_markov_row("start",
                               dict_first_intron_state[False],
                               dict_first_intron_state[True]))
        fout.write(_markov_row("no_IR",
                               dict_states[(False, False)],
                               dict_states[(False, True)]))
        fout.write(_markov_row("IR",
                               dict_states[(True, False)],
                               dict_states[(True, True)]))

    # output intron coordinates and information to the user:
    with open(outfile + "_IR_info", 'w') as out_ir_info:
        out_ir_info.write("trx_name\tintron_spos\tintron_epos\n")
        for trx, retained in dict_ir_info.items():
            if not retained:
                continue
            # De-duplicate and sort so each retained intron is listed once.
            lst_sorted = sorted(set(retained))
            fstr_spos = ",".join([str(item[0]) for item in lst_sorted])
            fstr_epos = ",".join([str(item[1]) for item in lst_sorted])
            out_ir_info.write(trx + "\t" + fstr_spos + "\t" + fstr_epos + "\n")
Esempio n. 8
0
import itertools
import os

# HTSeq is used throughout this script but was not imported in it.
import HTSeq

os.chdir(
    '/share/ScratchGeneral/jamtor/projects/hgsoc_repeats/RNA-seq/results/star/GC/exp5'
)
# Return value is unused here (leftover from interactive use).
os.getcwd()

# read in bam:
bam_reader = HTSeq.BAM_Reader(
    "bowtell_FT3_subset/Aligned.sortedByCoord.out.bam")
# check first 5 lines of bam:
# print(x) with a single argument behaves the same on Python 2 and 3,
# whereas the `print x` statement is a SyntaxError on Python 3.
for a in itertools.islice(bam_reader, 5):
    print(a)

# read in gencode annotation:
homeDir = '/share/ScratchGeneral/jamtor/'
gc = homeDir + '/genomes/hg38_ercc/gencode_v24_hg38_annotation.gtf'
gtf_file = HTSeq.GFF_Reader(gc, end_included=True)
# check first 10 lines of annotation:
for feature in itertools.islice(gtf_file, 10):
    print(feature)

# initiate a GenomicArrayOfSets object and fill with exons only from annotation:
exons = HTSeq.GenomicArrayOfSets("auto", stranded=True)

for feature in gtf_file:
    if feature.type == "exon":
        exons[feature.iv] += feature.name

# NOTE(review): this iterates the annotation reader a third time and prints
# its first 10 records again, not the contents of `exons` - confirm intent.
for e in itertools.islice(gtf_file, 10):
    print(e)
Esempio n. 9
0
import sys
import HTSeq
import numpy
import matplotlib as mpl
mpl.use('pdf')
from matplotlib import pyplot

#sortedbamfile = HTSeq.BAM_Reader( "../input/DHS.Chr1.unique.bam" )
#sortedbamfile = HTSeq.BAM_Reader( "../input/DHS.unique.bam" )
#gtffile = HTSeq.GFF_Reader( "../input/MSU7.gene.exon_number.gtf" )
# Inputs are taken from the command line: argv[1] is opened with
# HTSeq.BAM_Reader and argv[2] with HTSeq.GFF_Reader.
sortedbamfile = HTSeq.BAM_Reader(sys.argv[1])
gtffile = HTSeq.GFF_Reader(sys.argv[2])

# NOTE(review): presumably the half-width of the profile window and the
# assumed fragment length, both in bp - confirm against the rest of the script.
halfwinwidth = 2000
fragmentsize = 150
#total = 60745783.00/1000000 ## nucleosome
#total = 7480914/1000000 ## nucleosome chr1
#total = 23299296/1000000 #DHS unique
#gsize = 372000000

#coverage = HTSeq.GenomicArray( "auto", stranded=False, typecode="i" )
#for almnt in bamfile:
#   if almnt.aligned:
#      #almnt.iv.length = fragmentsize
#      print almnt.iv
#      if not almnt.iv.start < 500:
#          coverage[ almnt.iv ] += 1

#tsspos = set()
#for feature in gtffile:
#   if feature.type == "exon" and feature.attr["exon_number"] == "-1":