def count_reads_in_features(sam_filenames, gff_filename, samtype, order,
                            max_buffer_size, stranded, overlap_mode,
                            multimapped_mode, secondary_alignment_mode,
                            supplementary_alignment_mode, feature_type,
                            id_attribute, additional_attributes, quiet,
                            minaqual, samouts):
    """Count reads overlapping annotated features and print a count table.

    The GFF file is loaded into a ``HTSeq.GenomicArrayOfSets``; every
    alignment (or alignment pair) of every input file is then assigned to
    the feature set it overlaps.  The result is printed to stdout as
    tab-separated rows: one row per feature id (followed by the requested
    additional attributes and one count column per input file) and then the
    special rows ``__no_feature``, ``__ambiguous``, ``__too_low_aQual``,
    ``__not_aligned`` and ``__alignment_not_unique``.

    Args:
        sam_filenames: Sequence of input paths.  A single ``"-"`` entry
            means "read from stdin".
        gff_filename: Path of the GFF/GTF annotation.
        samtype: ``"sam"`` or ``"bam"``.
        order: ``"name"`` or ``"pos"``; only consulted for paired-end data.
        max_buffer_size: Forwarded to
            ``HTSeq.pair_SAM_alignments_with_buffer`` when ``order == "pos"``.
        stranded: ``"no"`` disables strand awareness, ``"reverse"`` inverts
            the strand of the read; any other value is treated as stranded.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        multimapped_mode: ``"none"`` or ``"all"``.
        secondary_alignment_mode: ``'ignore'`` skips secondary alignments.
        supplementary_alignment_mode: ``'ignore'`` skips supplementary
            alignments.
        feature_type: GFF feature type (third column) to count on.
        id_attribute: GFF attribute used as feature id.
        additional_attributes: Extra GFF attributes echoed in the output.
        quiet: Suppress progress messages on stderr.
        minaqual: Alignments with a lower ``aQual`` are counted as
            ``__too_low_aQual``.
        samouts: ``""`` for no annotated SAM output, otherwise one output
            path per input file.

    Raises:
        ValueError: On mismatching input/output file counts, features
            without the id attribute, unstranded features in stranded mode,
            an unknown ``samtype`` or an illegal ``order``.
        SystemExit: On an illegal ``overlap_mode`` or ``multimapped_mode``.
    """

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch)
    com = ('M', '=', 'X')

    def write_to_samout(r, assignment, samoutfile):
        # Append the XF:Z assignment tag to the original SAM line(s).
        # ``pe_mode`` is read from the enclosing scope at call time.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    def matched_intervals(read, invert):
        # Reference intervals of all matching CIGAR operations of ``read``,
        # optionally with the strand inverted.  Using ``com`` everywhere
        # fixes the single-end, non-reverse branch that used to look at
        # 'M' only and therefore ignored '=' and 'X' operations.
        if invert:
            return (invert_strand(co.ref_iv) for co in read.cigar
                    if co.type in com and co.size > 0)
        return (co.ref_iv for co in read.cigar
                if co.type in com and co.size > 0)

    if samouts != "":
        if len(samouts) != len(sam_filenames):
            raise ValueError(
                'Select the same number of SAM input and output files')
        # Try to open samout files early in case any of them has issues
        for samout in samouts:
            with open(samout, 'w'):
                pass

    # Try to open samfiles to fail early in case any of them is not there
    if (len(sam_filenames) != 1) or (sam_filenames[0] != '-'):
        for sam_filename in sam_filenames:
            with open(sam_filename):
                pass

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    gff = HTSeq.GFF_Reader(gff_filename)
    counts = {}
    attributes = {}
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError(
                        "Feature %s does not contain a '%s' attribute" %
                        (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'."
                        % (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0
                attributes[feature_id] = [
                    f.attr[attr] if attr in f.attr else ''
                    for attr in additional_attributes
                ]
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        # Report where in the GFF file the problem occurred, then re-raise.
        sys.stderr.write("Error occured when processing GFF file (%s):\n" %
                         gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)

    if len(counts) == 0:
        sys.stderr.write("Warning: No features of type '%s' found.\n" %
                         feature_type)

    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    counts_all = []
    empty_all = []
    ambiguous_all = []
    notaligned_all = []
    lowqual_all = []
    nonunique_all = []
    for isam, (sam_filename) in enumerate(sam_filenames):
        if samouts != '':
            samoutfile = open(samouts[isam], 'w')
        else:
            samoutfile = None

        # The finally clause guarantees the annotated SAM output is closed
        # (and flushed) even when reading or counting raises.
        try:
            try:
                if sam_filename != "-":
                    read_seq_file = SAM_or_BAM_Reader(sam_filename)
                    read_seq = read_seq_file
                    first_read = next(iter(read_seq))
                else:
                    # stdin cannot be rewound, so the peeked first read is
                    # chained back in front of the remaining records.
                    read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                    read_seq_iter = iter(read_seq_file)
                    first_read = next(read_seq_iter)
                    read_seq = itertools.chain([first_read], read_seq_iter)
                pe_mode = first_read.paired_end
            except:
                sys.stderr.write(
                    "Error occured when reading beginning of SAM/BAM file.\n")
                raise

            try:
                if pe_mode:
                    if order == "name":
                        read_seq = HTSeq.pair_SAM_alignments(read_seq)
                    elif order == "pos":
                        read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                            read_seq, max_buffer_size=max_buffer_size)
                    else:
                        raise ValueError("Illegal order specified.")
                empty = 0
                ambiguous = 0
                notaligned = 0
                lowqual = 0
                nonunique = 0
                i = 0
                for r in read_seq:
                    if i > 0 and i % 100000 == 0 and not quiet:
                        sys.stderr.write(
                            "%d SAM alignment record%s processed.\n" %
                            (i, "s" if not pe_mode else " pairs"))

                    i += 1
                    if not pe_mode:
                        if not r.aligned:
                            notaligned += 1
                            write_to_samout(r, "__not_aligned", samoutfile)
                            continue
                        if ((secondary_alignment_mode == 'ignore') and
                                r.not_primary_alignment):
                            continue
                        if ((supplementary_alignment_mode == 'ignore') and
                                r.supplementary):
                            continue
                        try:
                            if r.optional_field("NH") > 1:
                                nonunique += 1
                                write_to_samout(r, "__alignment_not_unique",
                                                samoutfile)
                                if multimapped_mode == 'none':
                                    continue
                        except KeyError:
                            # No NH tag: treat the alignment as unique.
                            pass
                        if r.aQual < minaqual:
                            lowqual += 1
                            write_to_samout(r, "__too_low_aQual", samoutfile)
                            continue
                        iv_seq = matched_intervals(r, stranded == "reverse")
                    else:
                        # Mate 1 follows the library strand, mate 2 is on
                        # the opposite strand.
                        if r[0] is not None and r[0].aligned:
                            iv_seq = matched_intervals(
                                r[0], stranded == "reverse")
                        else:
                            iv_seq = tuple()
                        if r[1] is not None and r[1].aligned:
                            iv_seq = itertools.chain(
                                iv_seq,
                                matched_intervals(r[1],
                                                  stranded != "reverse"))
                        else:
                            if (r[0] is None) or not (r[0].aligned):
                                write_to_samout(r, "__not_aligned",
                                                samoutfile)
                                notaligned += 1
                                continue
                        # Either mate may be missing (None) here, so both
                        # are guarded before their flags are inspected.
                        if secondary_alignment_mode == 'ignore':
                            if (r[0] is not None) and \
                                    r[0].not_primary_alignment:
                                continue
                            elif (r[1] is not None) and \
                                    r[1].not_primary_alignment:
                                continue
                        if supplementary_alignment_mode == 'ignore':
                            if (r[0] is not None) and r[0].supplementary:
                                continue
                            elif (r[1] is not None) and r[1].supplementary:
                                continue
                        try:
                            if ((r[0] is not None and
                                 r[0].optional_field("NH") > 1) or
                                    (r[1] is not None and
                                     r[1].optional_field("NH") > 1)):
                                nonunique += 1
                                write_to_samout(r, "__alignment_not_unique",
                                                samoutfile)
                                if multimapped_mode == 'none':
                                    continue
                        except KeyError:
                            pass
                        if ((r[0] and r[0].aQual < minaqual) or
                                (r[1] and r[1].aQual < minaqual)):
                            lowqual += 1
                            write_to_samout(r, "__too_low_aQual", samoutfile)
                            continue

                    try:
                        if overlap_mode == "union":
                            fs = set()
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    fs = fs.union(fs2)
                        elif overlap_mode in ("intersection-strict",
                                              "intersection-nonempty"):
                            fs = None
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    if ((len(fs2) > 0) or
                                            (overlap_mode ==
                                             "intersection-strict")):
                                        if fs is None:
                                            fs = fs2.copy()
                                        else:
                                            fs = fs.intersection(fs2)
                        else:
                            sys.exit("Illegal overlap mode.")

                        if fs is None or len(fs) == 0:
                            write_to_samout(r, "__no_feature", samoutfile)
                            empty += 1
                        elif len(fs) > 1:
                            write_to_samout(
                                r, "__ambiguous[" + '+'.join(fs) + "]",
                                samoutfile)
                            ambiguous += 1
                        else:
                            write_to_samout(r, list(fs)[0], samoutfile)

                        if fs is not None and len(fs) > 0:
                            if multimapped_mode == 'none':
                                if len(fs) == 1:
                                    counts[list(fs)[0]] += 1
                            elif multimapped_mode == 'all':
                                for fsi in list(fs):
                                    counts[fsi] += 1
                            else:
                                sys.exit("Illegal multimap mode.")

                    except UnknownChrom:
                        # Chromosome absent from the annotation.
                        write_to_samout(r, "__no_feature", samoutfile)
                        empty += 1

            except:
                sys.stderr.write(
                    "Error occured when processing SAM input (%s):\n" %
                    read_seq_file.get_line_number_string())
                raise

            if not quiet:
                sys.stderr.write(
                    "%d SAM %s processed.\n" %
                    (i, "alignments " if not pe_mode else "alignment pairs"))
        finally:
            if samoutfile is not None:
                samoutfile.close()

        # Snapshot this file's counts, then reset for the next input file.
        counts_all.append(counts.copy())
        for fn in counts:
            counts[fn] = 0
        empty_all.append(empty)
        ambiguous_all.append(ambiguous)
        lowqual_all.append(lowqual)
        notaligned_all.append(notaligned)
        nonunique_all.append(nonunique)

    # Empty cells so the special rows line up with the attribute columns.
    pad = ['' for attr in additional_attributes]
    for fn in sorted(counts.keys()):
        print('\t'.join([fn] + attributes[fn] +
                        [str(c[fn]) for c in counts_all]))
    print('\t'.join(["__no_feature"] + pad + [str(c) for c in empty_all]))
    print('\t'.join(["__ambiguous"] + pad + [str(c) for c in ambiguous_all]))
    print('\t'.join(["__too_low_aQual"] + pad +
                    [str(c) for c in lowqual_all]))
    print('\t'.join(["__not_aligned"] + pad +
                    [str(c) for c in notaligned_all]))
    print('\t'.join(["__alignment_not_unique"] + pad +
                    [str(c) for c in nonunique_all]))
lHitIndex = [-1,-1]
lSuffix = [".ja.txt",".ta.txt",".jaraw.txt"]
nri = 0

#==============================================================================
# main script
#==============================================================================

# parse the GTF into a dict to make looking up transcripts easy later on
# NOTE: i should verify the exons are sorted properly at this point
if args.b_verbose:
    sys.stderr.write("> parsing GTF file...\n")

gff = hts.GFF_Reader(args.reference)
# nri counts the exon features seen so far (used for the progress message).
nri = 0

for feature in gff:
    if feature.type == "exon":
        nri += 1
        if args.b_verbose:
            # Progress is only refreshed every 2048 exons; "\r" rewrites the
            # same terminal line.
            if nri % 2048 == 0:
                sys.stderr.write("\r> features parsed: %d " % nri)

        szTid = feature.attr['transcript_id']

        # First exon of a transcript: create its record.
        # NOTE(review): dGtf is not defined in this excerpt - presumably a
        # module-level dict keyed by transcript id; confirm against the
        # full script.
        if szTid not in dGtf:
            dGtf[szTid] = {}
            dGtf[szTid]['features'] = []
            dGtf[szTid]['junctions'] = []
            dGtf[szTid]['strand'] = feature.iv.strand
            dGtf[szTid]['gene_id'] = ""
def count_reads_in_features(sam_filename, gff_filename, samtype, order,
                            stranded, overlap_mode, feature_type,
                            id_attribute, quiet, minaqual, samout,
                            include_non_annotated, htseq_no_ambiguous,
                            outputDiscarded):
    """
    This is taken from the function count_reads_in_features() from the
    script htseq-count in the HTSeq package version 0.61.p2
    The reason to do so is to fix two really small bugs related to the SAM output.
    The code of the function is small and simple so for now we
    will use the patched function here. A patch request has been sent
    to the HTSeq team.
    The description of the parameters are the same as htseq-count.
    Two parameters were added to filter out what to write in the sam output

    Every input record is tagged with an ``XF`` assignment and written
    either to ``samout`` (kept records) or, when ``outputDiscarded`` is
    given, to that file (filtered records).  Records on chromosomes that
    are absent from the annotation are written to neither file.

    :param sam_filename: path of the input SAM/BAM file
    :param gff_filename: path of the GFF/GTF annotation
    :param samtype: "sam" or "bam" (also selects the output format)
    :param order: accepted for htseq-count compatibility; not used here
    :param stranded: "no", "reverse" or any other value for stranded
    :param overlap_mode: "union", "intersection-strict" or
                         "intersection-nonempty"
    :param feature_type: GFF feature type to annotate with
    :param id_attribute: GFF attribute used as feature id
    :param quiet: accepted for htseq-count compatibility; not used here
    :param minaqual: records with a lower aQual are tagged __too_low_aQual
    :param samout: path of the annotated output file
    :param include_non_annotated: when False, __no_feature records are
                                  filtered out of ``samout``
    :param htseq_no_ambiguous: when True, __ambiguous records are filtered
                               out of ``samout``
    :param outputDiscarded: path for the filtered records, or None
    :return: the number of records written to ``samout``
    :raises ValueError: feature without the id attribute, unstranded
                        feature in stranded mode, or unknown ``samtype``
    :raises RuntimeError: no feature of the requested type, the input
                          cannot be opened, or illegal ``overlap_mode``

    The HTSEQ License
    HTSeq is free software: you can redistribute it and/or modify it under
    the terms of the GNU General Public License as published by the Free
    Software Foundation, either version 3 of the License, or (at your option)
    any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    The full text of the GNU General Public License, version 3,
    can be found here: http://www.gnu.org/licenses/gpl-3.0-standalone.html
    """
    # State is kept on function attributes (as in the original code) so the
    # nested writer can update the counter without ``nonlocal``.

    # Set up the filters
    count_reads_in_features.filter_htseq = \
        ["__too_low_aQual", "__not_aligned", "__alignment_not_unique"]
    if not include_non_annotated:
        count_reads_in_features.filter_htseq.append("__no_feature")
    count_reads_in_features.filter_htseq_no_ambiguous = htseq_no_ambiguous

    # Open SAM/BAM output file
    flag_write = "wb" if samtype == "bam" else "wh"
    flag_read = "rb" if samtype == "bam" else "r"
    # The input is only opened to serve as header template for the outputs.
    saminfile = pysam.AlignmentFile(sam_filename, flag_read)
    count_reads_in_features.samoutfile = pysam.AlignmentFile(
        samout, flag_write, template=saminfile)
    if outputDiscarded is not None:
        count_reads_in_features.samdiscarded = pysam.AlignmentFile(
            outputDiscarded, flag_write, template=saminfile)
    saminfile.close()

    # Counter of annotated records
    count_reads_in_features.annotated = 0

    # Function to write to SAM output
    def write_to_samout(read, assignment):
        # Creates the PySAM record
        # to_pysam_AlignedSegment is the new method in HTSeq>=0.7.0 that
        # uses the latest Pysam API and reports the correct sequences
        sam_record = read.to_pysam_AlignedSegment(
            count_reads_in_features.samoutfile)
        sam_record.set_tag("XF", assignment, "Z")
        filtered = assignment in count_reads_in_features.filter_htseq or (
            count_reads_in_features.filter_htseq_no_ambiguous
            and "__ambiguous" in assignment)
        if not filtered:
            count_reads_in_features.samoutfile.write(sam_record)
            count_reads_in_features.annotated += 1
        elif outputDiscarded is not None:
            count_reads_in_features.samdiscarded.write(sam_record)

    # From here on the output files are open: close them on every exit
    # path, including errors raised while parsing the annotation.
    try:
        # Annotation objects
        features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
        counts = {}
        gff = HTSeq.GFF_Reader(gff_filename)
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError(
                        "Feature %s does not contain a '%s' attribute"
                        % (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'."
                        % (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0

        if len(counts) == 0:
            raise RuntimeError(
                "No features of type '%s' found.\n" % feature_type)

        if samtype == "sam":
            SAM_or_BAM_Reader = HTSeq.SAM_Reader
        elif samtype == "bam":
            SAM_or_BAM_Reader = HTSeq.BAM_Reader
        else:
            raise ValueError("Unknown input format %s specified." % samtype)

        try:
            read_seq = SAM_or_BAM_Reader(sam_filename)
        except Exception:
            raise RuntimeError(
                "Error occurred when reading beginning of SAM/BAM file.")

        for r in read_seq:
            if r.aQual < minaqual:
                write_to_samout(r, "__too_low_aQual")
                continue

            if stranded != "reverse":
                iv_seq = (co.ref_iv for co in r.cigar
                          if co.type == "M" and co.size > 0)
            else:
                iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                          if co.type == "M" and co.size > 0)
            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode in ("intersection-strict",
                                      "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if len(fs2) > 0 or \
                                    overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    raise RuntimeError("Illegal overlap mode.")

                if fs is None or len(fs) == 0:
                    write_to_samout(r, "__no_feature")
                elif len(fs) > 1:
                    write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
                else:
                    write_to_samout(r, list(fs)[0])
            except UnknownChrom:
                # Deliberately dropped: the record is on a chromosome the
                # annotation does not know and is written to neither file.
                pass
    finally:
        count_reads_in_features.samoutfile.close()
        if outputDiscarded is not None:
            count_reads_in_features.samdiscarded.close()

    return count_reads_in_features.annotated
def _classify_intron_alignment(cigar, strand, intron_iv, min_overhang):
    """Return the read-count category of an alignment overlapping one intron.

    ``cigar`` is the alignment's list of CIGAR operations, ``strand`` its
    (possibly swapped) strand, ``intron_iv`` the interval of the single
    overlapped intron.  The result is one of ``"ambiguous"``,
    ``"spliced"``, ``"intronic"``, ``"junction_five"`` or
    ``"junction_three"``.
    """
    # Only plain matches (1 op) and single-gap alignments (3 ops) are
    # interpreted; anything more complex is ambiguous.
    if len(cigar) not in (1, 3):
        return "ambiguous"

    if len(cigar) == 3:
        if [op.type for op in cigar] != ["M", "N", "M"]:
            return "ambiguous"
        left, gap, right = cigar
        # Spliced: the gap must coincide exactly with the intron and both
        # flanking matches must be long enough.
        is_exact_splice = (
            gap.ref_iv.start == intron_iv.start
            and gap.ref_iv.end == intron_iv.end
            and left.ref_iv.end == intron_iv.start
            and right.ref_iv.start == intron_iv.end
            and left.size >= min_overhang
            and right.size >= min_overhang)
        return "spliced" if is_exact_splice else "ambiguous"

    match = cigar[0]
    if match.type != "M":
        return "ambiguous"

    start = match.ref_iv.start
    end = match.ref_iv.end
    if start >= intron_iv.start and end <= intron_iv.end:
        return "intronic"
    # Read spans the intron's left boundary: 5' junction on "+", 3' on "-".
    if start <= (intron_iv.start - min_overhang) and \
            end >= (intron_iv.start + min_overhang):
        return {"+": "junction_five", "-": "junction_three"}[strand]
    # Read spans the intron's right boundary: 3' junction on "+", 5' on "-".
    if start <= (intron_iv.end - min_overhang) and \
            end >= (intron_iv.end + min_overhang):
        return {"+": "junction_three", "-": "junction_five"}[strand]
    return "ambiguous"


def main(
        intron_annotation_path="S_cerevisiae.R64-2-1_introns_verifiedcoding_nomito_no5utr.gff",
        swap_alignment_strand=True,
        closed_intervals=True,
        alignment_path="test.bam",
        min_overhang=4,
        sample_id="test_sample",
        group_id="test_group",
        output_path="results.tsv"):
    """Count spliced, junction, intronic and ambiguous reads per intron.

    Args:
        intron_annotation_path: GFF file whose features are introns with a
            ``Name`` attribute.
        swap_alignment_strand: Flip the strand of every alignment before
            lookup (for libraries sequenced from the 3' end).
        closed_intervals: Passed to ``GFF_Reader`` as ``end_included``.
        alignment_path: BAM file with the alignments.
        min_overhang: Minimum number of bases an alignment must extend on
            each side of a splice site or intron boundary.
        sample_id: Value written to the ``sample_id`` column.
        group_id: Value written to the ``group_id`` column.
        output_path: Path of the tab-separated result table.
    """
    intron_annotation = htseq.GFF_Reader(intron_annotation_path,
                                         end_included=closed_intervals)
    alignments = htseq.BAM_Reader(alignment_path)

    # Make a counter for the number of times an intron name is
    # seen, so we can look up when to append a count to the
    # intron name in order to keep intron names unique
    intron_id_counts = collections.Counter()
    for feature in intron_annotation:
        intron_id_counts.update([feature.attr["Name"]])

    seen_intron_ids = collections.Counter()
    intron_intervals = dict()
    intron_map = htseq.GenomicArrayOfSets("auto", stranded=True)
    for feature in intron_annotation:
        intron_id = feature.attr["Name"]
        # Append a count to non-unique intron names
        if intron_id_counts[intron_id] > 1:
            seen_intron_ids.update([intron_id])
            intron_id = f"{intron_id}_{seen_intron_ids[intron_id]}"
            feature.attr["Name"] = intron_id
        intron_intervals[intron_id] = feature.iv
        # Update map of genomic position to overlapping annotations with intron name
        intron_map[feature.iv] += intron_id

    read_counts = {alignment_type: collections.Counter()
                   for alignment_type in ("ambiguous", "spliced", "intronic",
                                          "junction_five", "junction_three")}

    for alignment in alignments:
        # Unmapped records have no genomic interval (``iv`` is None), so
        # they can neither be strand-swapped nor looked up.
        if not alignment.aligned:
            continue

        # when sequencing from 3' end, the strand is swapped
        if swap_alignment_strand:
            alignment.iv.strand = {"+": "-", "-": "+"}.get(alignment.iv.strand)

        # for each alignment, find overlapping introns
        overlapped_introns = set()
        for interval, value in intron_map[alignment.iv].steps():
            overlapped_introns |= value

        # ignore alignments not spanning any introns
        if len(overlapped_introns) == 0:
            continue

        if len(overlapped_introns) > 1:
            # mark alignments spanning multiple introns as ambiguous
            category = "ambiguous"
        else:
            overlapped_intron = list(overlapped_introns)[0]
            category = _classify_intron_alignment(
                alignment.cigar, alignment.iv.strand,
                intron_intervals[overlapped_intron], min_overhang)

        # A per-category BAM writer (htseq.BAM_Writer.from_BAM_Reader) could
        # be hooked in here to dump the alignments of each category.
        read_counts[category].update(overlapped_introns)

    with open(output_path, "w") as output_file:
        output_file.write("\t".join([
            "chrom", "start", "end", "name", "score", "strand", "sample_id",
            "group_id", "spliced", "junction_5", "junction_3", "intronic",
            "ambiguous"
        ]) + "\n")
        for key, interval in intron_intervals.items():
            output_string = "\t".join([
                interval.chrom,
                str(interval.start),
                str(interval.end),
                key,
                "0",
                interval.strand,
                sample_id,
                group_id,
                str(read_counts["spliced"][key]),
                str(read_counts["junction_five"][key]),
                str(read_counts["junction_three"][key]),
                str(read_counts["intronic"][key]),
                str(read_counts["ambiguous"][key])
            ]) + "\n"
            output_file.write(output_string)
#!/usr/bin/python import sys, time, re import HTSeq as hts g_exons = hts.GenomicArrayOfSets("auto", stranded=False) # start time n_tStart = time.time() gr = hts.GFF_Reader(sys.argv[1]) for feature in gr: if feature.type == "exon": sz_name = feature.attr['transcript_id'] + ";" + feature.attr['gene_id'] if "gene_name" in feature.attr: sz_name += ";" + feature.attr['gene_name'] sz_name += ";" + feature.iv.chrom g_exons[feature.iv] += sz_name # record total lengths of featurea in order to calculate RPKM later on if sz_name not in dLengths: dLengths[sz_name] = 0 dHits[sz_name] = 0 dLengths[sz_name] += feature.iv.end - feature.iv.start
"Could not import pysam, which is needed to process BAM file (though\n" ) sys.stderr.write( "not to process text SAM files). Please install the 'pysam' library from\n" ) sys.stderr.write("https://code.google.com/p/pysam/\n") sys.exit(1) if sam_file == "-": sam_file = sys.stdin # Step 1: Read in the GFF file as generated by aggregate_genes.py # and put everything into a GenomicArrayOfSets features = HTSeq.GenomicArrayOfSets("auto", stranded=stranded) for f in HTSeq.GFF_Reader(gff_file): if f.type == "exonic_part": f.name = f.attr['gene_id'] + ":" + f.attr['exonic_part_number'] features[f.iv] += f # initialise counters num_reads = 0 counts = {} counts['_empty'] = 0 counts['_ambiguous'] = 0 counts['_lowaqual'] = 0 counts['_notaligned'] = 0 counts['_ambiguous_readpair_position'] = 0 # put a zero for each feature ID for iv, s in features.steps():
def intron_retention(outfile, gff_file, g_alnm, t_alnm):
    """Model intron-retention (IR) events as a two-state Markov chain.

    For every read that has both a genome and a transcriptome alignment,
    the introns of its transcript are classified as retained or not, and
    the transitions between consecutive introns are tallied.

    Args:
        outfile: Output prefix.  ``<outfile>_IR_markov_model`` receives the
            start/transition probabilities and ``<outfile>_IR_info`` the
            coordinates of the retained introns per transcript.
        gff_file: Annotation with ``intron`` features.
        g_alnm: SAM file with the genome alignments.
        t_alnm: SAM file with the transcriptome alignments.
    """

    def strip_chr_prefix(chrom):
        # Remove a leading "chr" only.  str.strip("chr") would instead
        # remove any of the characters c/h/r from BOTH ends of the name.
        return chrom[3:] if chrom.startswith("chr") else chrom

    # Read intron information from GFF file
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") +
                     ": Reading intron coordinates from GFF file\n")
    gff_features = HTSeq.GFF_Reader(gff_file, end_included=True)
    features = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    # transcript id -> list of (start, end, length) of its introns
    dict_intron_info = {}
    for feature in gff_features:
        if "transcript_id" in feature.attr:
            feature_id = feature.attr['transcript_id']
        elif "Parent" in feature.attr:
            # no "if feature.type == intron" to also consider trxs without intron
            info = feature.name.split(":")
            if len(info) == 1:
                feature_id = info[0]
            elif info[0] == "transcript":
                feature_id = info[1]
            else:
                continue
        else:
            continue

        # Drop the version suffix of the transcript id.
        feature_id = feature_id.split(".")[0]
        if feature_id not in dict_intron_info:
            dict_intron_info[feature_id] = []

        # remove "chr" from chromosome names to be constant
        feature.iv.chrom = strip_chr_prefix(feature.iv.chrom)

        if feature.type == "intron":
            features[feature.iv] += feature_id
            dict_intron_info[feature_id].append(
                (feature.iv.start, feature.iv.end, feature.iv.length))

    # read primary genome alignment for each read
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") +
                     ": Read primary genome alignment for each read\n")
    dict_g_alnm = {}
    for alnm in HTSeq.SAM_Reader(g_alnm):
        if alnm.aligned:
            dict_g_alnm[alnm.read.name] = parse_cigar(alnm.cigar)

    # read primary transcriptome alignment for each read
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") +
                     ": Read primary transcriptome alignment for each read\n")
    dict_t_alnm = {}
    for alnm in HTSeq.SAM_Reader(t_alnm):
        if alnm.aligned:
            dict_t_alnm[alnm.read.name] = alnm.iv.chrom.split(".")[0]

    # Count the length of Intron retention events
    sys.stdout.write(
        strftime("%Y-%m-%d %H:%M:%S") +
        ": Calculating probabilities for each intron retention event\n")

    # State of the first intron of each transcript hit (False = not retained)
    dict_first_intron_state = {False: 0, True: 0}
    # (previous intron retained, current intron retained) -> count
    dict_states = {(False, False): 0, (False, True): 0,
                   (True, False): 0, (True, True): 0}
    dict_ir_info = {}
    for qname in dict_g_alnm:
        iv_seq = dict_g_alnm[qname]
        if qname not in dict_t_alnm:
            continue

        primary_trx = dict_t_alnm[qname]
        if primary_trx not in dict_ir_info:
            dict_ir_info[primary_trx] = []
        list_IR_positions = []
        pos = []
        ir_info = False
        try:
            length_IR = 0
            for item in iv_seq:
                iv = HTSeq.GenomicInterval(item[0], item[1], item[2], item[3])
                iv.chrom = strip_chr_prefix(iv.chrom)
                for iv2, fs2 in features[iv].steps():
                    if primary_trx in fs2:
                        # Still inside an intron of this transcript:
                        # accumulate the covered length.
                        length_IR += iv2.length
                        pos.append(iv2.start)
                        pos.append(iv2.end)
                    elif length_IR != 0:
                        # Left the intron: it counts as retained only when
                        # the covered length equals a full intron length.
                        for intron in dict_intron_info[primary_trx]:
                            if length_IR == intron[2]:
                                list_IR_positions.append(min(pos))
                                list_IR_positions.append(max(pos))
                                ir_info = True
                        length_IR = 0
                        pos = []
        except UnknownChrom:
            # TODO ??
            ir_info = False

        if not ir_info:
            # No retention: every intron of the transcript is "not retained".
            if primary_trx in dict_intron_info:
                n_introns = len(dict_intron_info[primary_trx])
                if n_introns >= 1:  # if there is an intron
                    dict_first_intron_state[False] += 1
                    dict_states[(False, False)] += n_introns - 1
        else:
            # Now, go over all introns and check with the IR events
            introns = dict_intron_info[primary_trx]

            # First we need to determine the state of first intron:
            first_intron_spos = introns[0][0]
            first_intron_epos = introns[0][1]
            previous_state = any(
                first_intron_spos <= IR_pos <= first_intron_epos
                for IR_pos in list_IR_positions)
            if previous_state:
                dict_ir_info[primary_trx].append(
                    (first_intron_spos, first_intron_epos))
            dict_first_intron_state[previous_state] += 1

            # Then we will go over other introns:
            for intron in introns[1:]:
                intron_spos = intron[0]
                intron_epos = intron[1]
                current_state = any(
                    intron_spos <= IR_pos <= intron_epos
                    for IR_pos in list_IR_positions)
                if current_state:
                    dict_ir_info[primary_trx].append(
                        (intron_spos, intron_epos))
                dict_states[(previous_state, current_state)] += 1
                previous_state = current_state

    # Free the (potentially large) per-read dictionaries before writing.
    del dict_g_alnm
    del dict_t_alnm

    sum_first_introns = dict_first_intron_state[True] + dict_first_intron_state[False]
    sum_for_noIR = dict_states[(False, False)] + dict_states[(False, True)]
    sum_for_IR = dict_states[(True, False)] + dict_states[(True, True)]

    with open(outfile + "_IR_markov_model", 'w') as fout:
        fout.write("succedent\tno_IR\tIR\n")

        if sum_first_introns != 0:
            fout.write("start\t" + str(round(dict_first_intron_state[False] / float(sum_first_introns), 4)) + "\t" +
                       str(round(dict_first_intron_state[True] / float(sum_first_introns), 4)) + "\n")
        else:
            fout.write("start\t0.0\t0.0\n")

        if sum_for_noIR != 0:
            fout.write("no_IR\t" + str(round(dict_states[(False, False)] / float(sum_for_noIR), 4)) + "\t" +
                       str(round(dict_states[(False, True)] / float(sum_for_noIR), 4)) + "\n")
        else:
            fout.write("no_IR\t0.0\t0.0\n")

        if sum_for_IR != 0:
            fout.write("IR\t" + str(round(dict_states[(True, False)] / float(sum_for_IR), 4)) + "\t" +
                       str(round(dict_states[(True, True)] / float(sum_for_IR), 4)) + "\n")
        else:
            fout.write("IR\t0.0\t0.0\n")

    # output intron coordinates and information to the user:
    with open(outfile + "_IR_info", 'w') as out_ir_info:
        out_ir_info.write("trx_name\tintron_spos\tintron_epos\n")
        for trx in dict_ir_info:
            if len(dict_ir_info[trx]) != 0:
                # De-duplicate and order the retained introns.
                lst_sorted = sorted(set(dict_ir_info[trx]))
                fstr_spos = ",".join([str(item[0]) for item in lst_sorted])
                fstr_epos = ",".join([str(item[1]) for item in lst_sorted])
                out_ir_info.write(trx + "\t" + fstr_spos + "\t" + fstr_epos + "\n")
# Exploratory script: peek at a BAM file and a GENCODE annotation, then load
# the annotation's exons into a stranded GenomicArrayOfSets.
# NOTE(review): HTSeq is used below but not imported in this excerpt -
# presumably imported earlier in the file; confirm.
import os
import itertools

os.chdir(
    '/share/ScratchGeneral/jamtor/projects/hgsoc_repeats/RNA-seq/results/star/GC/exp5'
)
os.getcwd()

# read in bam:
bam_reader = HTSeq.BAM_Reader(
    "bowtell_FT3_subset/Aligned.sortedByCoord.out.bam")

# check first 5 lines of bam:
# print(x) with a single argument behaves the same on Python 2 and 3,
# unlike the former "print x" statement, which is a Python 3 syntax error.
for a in itertools.islice(bam_reader, 5):
    print(a)

# read in gencode annotation:
homeDir = '/share/ScratchGeneral/jamtor/'
gc = homeDir + '/genomes/hg38_ercc/gencode_v24_hg38_annotation.gtf'
gtf_file = HTSeq.GFF_Reader(gc, end_included=True)

# check first 10 lines of annotation:
for feature in itertools.islice(gtf_file, 10):
    print(feature)

# initiate a GenomicArrayOfSets object and fill with exons only from annotation:
exons = HTSeq.GenomicArrayOfSets("auto", stranded=True)
for feature in gtf_file:
    if feature.type == "exon":
        exons[feature.iv] += feature.name

# Prints the first 10 annotation features again (of any type, not only
# exons, since it iterates gtf_file rather than the exons array).
for e in itertools.islice(gtf_file, 10):
    print(e)
import sys
import HTSeq
import numpy
import matplotlib as mpl
# Select the PDF backend before pyplot is imported.
mpl.use('pdf')
from matplotlib import pyplot

#sortedbamfile = HTSeq.BAM_Reader( "../input/DHS.Chr1.unique.bam" )
#sortedbamfile = HTSeq.BAM_Reader( "../input/DHS.unique.bam" )
#gtffile = HTSeq.GFF_Reader( "../input/MSU7.gene.exon_number.gtf" )

# Inputs come from the command line: argv[1] is the BAM file, argv[2] the
# GFF/GTF annotation.
sortedbamfile = HTSeq.BAM_Reader(sys.argv[1])
gtffile = HTSeq.GFF_Reader(sys.argv[2])

# NOTE(review): neither constant is used within this excerpt - presumably
# the half-width of the profile window and the assumed fragment length, both
# in bp; confirm against the rest of the script.
halfwinwidth = 2000
fragmentsize = 150

#total = 60745783.00/1000000 ## nucleosome
#total = 7480914/1000000 ## nucleosome chr1
#total = 23299296/1000000 #DHS unique
#gsize = 372000000
#coverage = HTSeq.GenomicArray( "auto", stranded=False, typecode="i" )
#for almnt in bamfile:
#    if almnt.aligned:
#        #almnt.iv.length = fragmentsize
#        print almnt.iv
#        if not almnt.iv.start < 500:
#            coverage[ almnt.iv ] += 1
#tsspos = set()
#for feature in gtffile:
#    if feature.type == "exon" and feature.attr["exon_number"] == "-1":