def HTseq_count(bam_file, gtf_file, out_dir, identifier, parallel = True ):
    """Count uniquely aligned read pairs per annotated feature and print the tally.

    Every record of ``gtf_file`` is loaded into a stranded
    ``HTSeq.GenomicArrayOfSets`` under the value of ``feature.attr[identifier]``.
    Alignments are then read as mate pairs; each pair that has exactly one
    alignment is assigned to the set of identifiers overlapped by either mate.

    Parameters:
        bam_file: alignment input handed to ``HTSeq.SAM_Reader``.
        gtf_file: annotation input handed to ``HTSeq.GFF_Reader``.
        out_dir: accepted for interface compatibility; not used by this function.
        identifier: name of the GTF attribute used as the counting key; it is
            looked up on every record.
        parallel: accepted for interface compatibility; not used by this function.

    Returns:
        collections.Counter mapping each identifier to its pair count, plus the
        special keys ``"_unmapped"``, ``"_no_feature"`` and ``"_ambiguous"``.
        The same counts are printed to stdout, one ``key count`` line each.
    """
    annotation = HTSeq.GFF_Reader(gtf_file)
    features = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    print("extracting features from gtf file")
    for feature in annotation:
        # No feature-type filter is applied: every GTF record contributes.
        features[feature.iv] += feature.attr[identifier]

    counts = collections.Counter()
    # NOTE(review): the parameter is called bam_file but a SAM reader is used;
    # confirm whether callers pass SAM text or whether BAM_Reader was intended.
    almnt_file = HTSeq.SAM_Reader(bam_file)
    for bundle in HTSeq.pair_SAM_alignments(almnt_file, bundle=True):
        if len(bundle) != 1:
            continue  # skip multiple alignments
        first_almnt, second_almnt = bundle[0]
        # A pair is only usable when BOTH mates are present and aligned; the
        # intervals of an unaligned mate cannot be looked up.
        if (first_almnt is None or second_almnt is None
                or not (first_almnt.aligned and second_almnt.aligned)):
            counts["_unmapped"] += 1
            continue

        gene_ids = set()
        for almnt in (first_almnt, second_almnt):
            for _iv, val in features[almnt.iv].steps():
                gene_ids |= val

        if len(gene_ids) == 1:
            counts[next(iter(gene_ids))] += 1
        elif not gene_ids:
            counts["_no_feature"] += 1
        else:
            counts["_ambiguous"] += 1

    for gene_id in counts:
        print("%s %s" % (gene_id, counts[gene_id]))
    return counts
def count_reads_paired(read_seq, counter, order, stranded, quiet, minaqual, write_to_samout ):
    """Pair up SAM alignments and hand every usable pair to ``counter``.

    Parameters:
        read_seq: iterable of alignment records, as accepted by the HTSeq
            pairing functions.
        counter: tally object. This function increments ``counter.notaligned``,
            ``counter.nonunique`` and ``counter.lowqual`` and calls
            ``counter.count(iv_seq, pair)`` for every pair that survives the
            filters.
        order: ``"name"`` pairs with ``HTSeq.pair_SAM_alignments``; ``"pos"``
            pairs with ``HTSeq.pair_SAM_alignments_with_buffer``.
        stranded: when equal to ``"reverse"`` the first mate's intervals are
            strand-inverted and the second mate's are kept; for any other
            value it is the other way round.
        quiet: when true, no progress messages are written to stderr.
        minaqual: pairs with a mate whose ``aQual`` is below this are skipped.
        write_to_samout: callable ``(pair, tag)`` invoked for each skipped pair.

    Raises:
        ValueError: if ``order`` is neither ``"name"`` nor ``"pos"``.
    """
    if order == "name":
        read_seq = HTSeq.pair_SAM_alignments(read_seq)
    elif order == "pos":
        read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
    else:
        raise ValueError("Illegal order specified.")

    reverse = stranded == "reverse"
    i = 0
    for r in read_seq:
        if i > 0 and i % 100000 == 0 and not quiet:
            # This function only ever sees pairs, so the message is fixed
            # rather than depending on an out-of-scope pe_mode flag.
            sys.stderr.write("%d SAM alignment record pairs processed.\n" % i)
        i += 1

        if r[0] is not None and r[0].aligned:
            if not reverse:
                iv_seq = (co.ref_iv for co in r[0].cigar
                          if co.type == "M" and co.size > 0)
            else:
                iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                          if co.type == "M" and co.size > 0)
        else:
            iv_seq = tuple()

        if r[1] is not None and r[1].aligned:
            # The second mate comes from the opposite strand, hence the
            # mirrored inversion rule.
            if not reverse:
                second = (invert_strand(co.ref_iv) for co in r[1].cigar
                          if co.type == "M" and co.size > 0)
            else:
                second = (co.ref_iv for co in r[1].cigar
                          if co.type == "M" and co.size > 0)
            iv_seq = itertools.chain(iv_seq, second)
        elif (r[0] is None) or not r[0].aligned:
            write_to_samout(r, "__not_aligned")
            counter.notaligned += 1
            continue

        try:
            if (r[0] is not None and r[0].optional_field("NH") > 1) or \
               (r[1] is not None and r[1].optional_field("NH") > 1):
                counter.nonunique += 1
                write_to_samout(r, "__alignment_not_unique")
                continue
        except KeyError:
            # No NH tag on the record: treat the alignment as unique.
            pass

        if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
            # The bare local `lowqual += 1` could never work (unbound local).
            # NOTE(review): assumes the counter exposes `lowqual` alongside
            # `notaligned` / `nonunique` -- confirm against the counter class.
            counter.lowqual += 1
            write_to_samout(r, "__too_low_aQual")
            continue

        counter.count(iv_seq, r)

    if not quiet:
        sys.stderr.write("%d SAM alignment pairs processed.\n" % i)
def count_reads_paired(read_seq, counter, order, quiet, minaqual):
    """Pair SAM alignments and report each pair to ``counter`` in both orientations.

    ``order`` selects the pairing strategy: ``"name"`` uses
    ``HTSeq.pair_SAM_alignments`` and ``"pos"`` uses
    ``HTSeq.pair_SAM_alignments_with_buffer``; anything else raises
    ``ValueError``.

    For every pair the matched ("M", non-empty) CIGAR intervals are collected
    twice: a forward sequence (first mate as-is, second mate strand-inverted)
    and a reverse sequence (first mate strand-inverted, second mate as-is).
    Pairs are routed to ``counter.not_aligned``, ``counter.non_unique`` or
    ``counter.too_low_quality`` when filtered out, otherwise to
    ``counter.forward_count`` and ``counter.reverse_count``.  Progress goes to
    stderr unless ``quiet`` is true.
    """
    if order not in ("name", "pos"):
        raise ValueError("Illegal order specified.")
    if order == "name":
        pairs = HTSeq.pair_SAM_alignments(read_seq)
    else:
        pairs = HTSeq.pair_SAM_alignments_with_buffer(read_seq)

    n_pairs = 0
    for pair in pairs:
        if n_pairs > 0 and n_pairs % 100000 == 0 and not quiet:
            sys.stderr.write(
                "%d SAM alignment record pairs processed.\n" % (n_pairs))
        n_pairs += 1

        mate1, mate2 = pair[0], pair[1]
        mate1_ok = mate1 is not None and mate1.aligned
        mate2_ok = mate2 is not None and mate2.aligned

        # Neither mate aligned: nothing to count.
        if not mate2_ok and not mate1_ok:
            counter.not_aligned(pair)
            continue

        forward_iv_seq = tuple()
        reverse_iv_seq = tuple()
        if mate1_ok:
            forward_iv_seq = (op.ref_iv for op in mate1.cigar
                              if op.type == "M" and op.size > 0)
            reverse_iv_seq = (invert_strand(op.ref_iv) for op in mate1.cigar
                              if op.type == "M" and op.size > 0)
        if mate2_ok:
            forward_iv_seq = itertools.chain(
                forward_iv_seq,
                (invert_strand(op.ref_iv) for op in mate2.cigar
                 if op.type == "M" and op.size > 0))
            reverse_iv_seq = itertools.chain(
                reverse_iv_seq,
                (op.ref_iv for op in mate2.cigar
                 if op.type == "M" and op.size > 0))

        try:
            multi_first = mate1 is not None and mate1.optional_field("NH") > 1
            if multi_first or (mate2 is not None
                               and mate2.optional_field("NH") > 1):
                counter.non_unique(pair)
                continue
        except KeyError:
            # Missing NH tag: fall through and treat as unique.
            pass

        if any(mate and mate.aQual < minaqual for mate in (mate1, mate2)):
            counter.too_low_quality(pair)
            continue

        counter.forward_count(forward_iv_seq, pair)
        counter.reverse_count(reverse_iv_seq, pair)

    if not quiet:
        sys.stderr.write("%d SAM alignment pairs processed.\n" % (n_pairs))
def ungapped_pe_counter(sam_reader, feature_array):
    """Count read-pair bundles per feature and return the tally.

    Alignments are read with ``hts.pair_SAM_alignments(..., bundle=True)`` so
    that all alignments of a multiply-mapped pair arrive together.  Each
    non-empty bundle is classified by ``assess_bundle(bundle, feature_array)``
    and contributes exactly one count:

    * If the classification holds a single key (a feature, or one of the
      special statuses such as ``_no_feature`` / ``_unmapped`` /
      ``_ambiguous``), that key is incremented.  In other words, every pair in
      the bundle must map to the same single feature for the bundle to count
      as evidence for it.
    * If it holds more than one key, the bundle counts as ``"_ambiguous"``.
    * If it holds no key at all, a diagnostic is printed and the bundle is
      skipped.

    Parameters:
        sam_reader: alignment source passed to ``hts.pair_SAM_alignments``.
        feature_array: annotation structure forwarded unchanged to
            ``assess_bundle``.

    Returns:
        collections.Counter of bundle counts per key.
    """
    counts = collections.Counter()
    pair_iterator = hts.pair_SAM_alignments(sam_reader, bundle=True)
    t0 = datetime.datetime.now()

    for ic, bundle in enumerate(pair_iterator):
        # Report progress (to prove that it is still alive).
        if ic % 1000000 == 0:
            t1 = datetime.datetime.now()
            print("\r%d read bundles counted in %s\r" % (ic, t1 - t0))
            sys.stdout.flush()

        if bundle == []:
            # The pairing iterator has been observed to emit an empty first bundle.
            continue

        bcounts = assess_bundle(bundle, feature_array)

        if len(bcounts) > 1:
            # Multiply-mapped bundle that hits more than one key.
            counts["_ambiguous"] += 1
        elif len(bcounts) == 0:
            # Should not happen: assess_bundle gave the bundle no status.
            print("#" * 40)
            print("Error! bundle was not assigned any status")
            print("Contents of bundle:")
            print(bundle)
        else:
            # next(iter(...)) works on both Python 2 and 3, unlike keys()[0].
            counts[next(iter(bcounts))] += 1

    return counts
def bam_parser_2(bam_file, min_len, max_clip, min_id, mode):
    """Parse alignments into a DataFrame with one row per accepted alignment.

    Each alignment is passed to ``parser_aln_list`` together with a running
    query number and its position within the pair; alignments for which that
    helper returns ``None`` are dropped.

    Parameters:
        bam_file: iterable of alignment records.  In ``'paired'`` mode it is
            wrapped with ``HTSeq.pair_SAM_alignments``; in ``'single'`` mode it
            is iterated directly.
        min_len, max_clip, min_id: thresholds forwarded unchanged to
            ``parser_aln_list``.
        mode: ``'paired'`` or ``'single'``.  Any other value yields an empty
            DataFrame (no alignments are read).

    Returns:
        pandas.DataFrame with columns
        ``ALN, QUERY, REF, SEQ, LEN, ID, SCORE, CLIP_PCT``.
    """
    df_columns = ['ALN', 'QUERY', 'REF', 'SEQ', 'LEN', 'ID', 'SCORE', 'CLIP_PCT']
    rows = []
    query_counter = 0

    if mode == 'paired':
        for query_1, query_2 in HTSeq.pair_SAM_alignments(bam_file):
            query_counter += 1
            # Both mates share the same query number; pair_pos tells them apart.
            for pair_pos, query in enumerate((query_1, query_2), start=1):
                parsed = parser_aln_list(query, aln_number=query_counter,
                                         pair_pos=pair_pos, min_len=min_len,
                                         max_clip=max_clip, min_id=min_id)
                if parsed is not None:
                    rows.append(parsed)
    elif mode == 'single':
        for query in bam_file:
            query_counter += 1
            parsed = parser_aln_list(query, aln_number=query_counter,
                                     pair_pos=1, min_len=min_len,
                                     max_clip=max_clip, min_id=min_id)
            if parsed is not None:
                rows.append(parsed)

    return pd.DataFrame(rows, columns=df_columns)
# count_reads_in_features -- reviewer notes (derived from reading the body below).
#
# Flow:
#   1. Opens both readName -> geneNames output files for writing, and the
#      optional SAM output file when `samout` is not the empty string.
#   2. Reads the GFF: for each record of type `feature_type`, the feature object
#      is added to `exons` (unstranded array) and its `id_attribute` value is
#      added to `features` (stranded unless `stranded == "no"`).  The per-gene
#      count dictionaries are initialised through initialise_counts_per_feature.
#      The process exits via sys.exit when a feature lacks the id attribute, or
#      lacks strand information while running in stranded mode.
#   3. Reads the SAM input (file name, or stdin when `sam_filename` is "-") and
#      loops over the records.  Records for which the NH optional field is > 1
#      go through the non-unique bookkeeping and their read name / gene names
#      are written to `filename_read_names_gene_names`.  Other records are
#      resolved against `features` according to `overlap_mode`; records that
#      overlap more than one gene go through the "ambiguous unique" handling
#      and may be written to `filename_read_names_gene_names_amb_unique`.
#   4. Prints one tab-separated line per gene to stdout (gene name, unique
#      count, non-unique count, ambiguous-unique count), followed by the
#      summary counters no_feature, ambiguous, too_low_aQual, not_aligned,
#      alignment_not_unique and nonunique_nonamb_to_be_rescued.
#
# Things to confirm before touching this function:
#   * Python 2 only: print statements, iterator .next(), dict.keys()[0].
#   * The following names are read but never bound inside this function, so
#     they are presumably module-level -- verify: debug,
#     tag_report_instances_same_multiread_on_same_gene, UnknownChrom,
#     invert_strand, is_read_in_gene_interval, initialise_counts_per_feature,
#     add_unique_counts_per_feature, add_unique_counts_per_feature_ambiguous,
#     add_non_unique_counts_per_feature, _populate_read_name_gene_name,
#     _add_non_unique_counts_per_feature_based_on_read_interval_and_readname.
#   * The handler around the read loop catches AttributeError only; any other
#     exception propagates without the "Error occured in ..." line.
#   * The two readName -> geneNames file handles are closed inside that try
#     block, so they are not explicitly closed when the loop raises.
def count_reads_in_features( sam_filename, gff_filename, stranded, overlap_mode, feature_type, id_attribute, quiet, minaqual, samout, \ filename_read_names_gene_names,filename_read_names_gene_names_amb_unique): """ Main function to count reads in features i.e. genes. Input: + sam_filename: Input alignment with all the ambiguously mapped reads + gff_filename: GTF containing all genes for a given species + stranded: specify whether data are stranded - see -s option + overlap_mode: mode to handle reads overlapping more than one feature (e.g. union) - See -m option: choices = ( "union", "intersection-strict", "intersection-nonempty") + feature_type: see -t option + id_attribute: see -i option + quiet: see -q option + minaqual: see -a option + samout: SAM output file storing disambiguated reads (see -o option). + filename_read_names_gene_names: filename for the output file containing the mappings readName to geneNames for multimapped reads + filename_read_names_gene_names_amb_unique: filename for the output file containing the mappings readName to geneNames for ambiguously mapped reads Output: + Writes readName to geneName outputs. + Writes SAM output file for ddisambiguated uniquely mapped reads. + Writes to stdout the genes and their read counts with read count for distinct read type: non-ambiguous unique, multimapped and ambiguous unique. This output redirected and stored to an output file in main peakRescue pipeline. This output is used in the later stage of the peakRescue pipeline to rescue the reads present in the readName to genNames mappings. 
""" # Output filhandles for readName to geneNames mappings fh_read_names_gene_names = open(filename_read_names_gene_names, 'w') fh_read_names_gene_names_amb_unique = open(filename_read_names_gene_names_amb_unique, 'w') def write_to_samout( r, assignment ): if samoutfile is None: return if not pe_mode: r = (r,) for read in r: if read is not None: samoutfile.write( read.original_sam_line.rstrip() + "\tXF:Z:" + assignment + "\n" ) if quiet: warnings.filterwarnings( action="ignore", module="HTSeq" ) if samout != "": samoutfile = open( samout, "w" ) else: samoutfile = None features = HTSeq.GenomicArrayOfSets( "auto", stranded != "no" ) ## Hash table to store unique reads per exon (if modified GTF) counts = {} ## Hash table to store original non unique reads per gene (without dict_nonunique = {} ## Hash table to store all unique reads as per original GTF dict_gene_unique_counts = {} ## hast table to store ambigouous read count for unique reads... dict_gene_unique_counts_ambiguous = {} ## Hash table to store all non-unique reads including shared reads ## (either split reads or read pair matching on two distinct exons, same gene) dict_gene_nonunique_counts = {} ## Hash to store the non-unique read-names as key and genes names as values (fragments) dict_read_name_genes_names = {} ## Hash to store the non-unique read-names as key and genes names as values (fragments) including instances of a given multimapped read on same gene dict_read_name_genes_names_final = {} dict_read_name_genes_names_ambiguous = {} ## @todo: tag_gff - parameter to be removed - only deal with gene level information ## tag_gff: type to specify whether it contains gene or exons information tag_gff = "gene_gff" # Try to open samfile and fail early in case it is not there if sam_filename != "-": open( sam_filename ).close() gff = HTSeq.GFF_Reader( gff_filename ) exons = HTSeq.GenomicArrayOfSets( "auto", stranded=False ) i = 0 try: for f in gff: if f.type == feature_type: exons[ f.iv ] += f # added to get 
exon interval data try: feature_id = f.attr[ id_attribute ] except KeyError: sys.exit( "Feature %s does not contain a '%s' attribute" % ( f.name, id_attribute ) ) if stranded != "no" and f.iv.strand == ".": sys.exit( "Feature %s at %s does not have strand information but you are " "running htseq-count in stranded mode. Use '--stranded=no'." % ( f.name, f.iv ) ) features[ f.iv ] += feature_id counts[ f.attr[ id_attribute ] ] = 0 # -- Initialisation feature_name = f.attr[ id_attribute ] # -- Added tag_gff for GFF type if tag_gff == "gene_gff": # Original GTF (genes) dict_nonunique = initialise_counts_per_feature(dict_nonunique, feature_name) dict_gene_unique_counts = initialise_counts_per_feature(dict_gene_unique_counts, feature_name) dict_gene_nonunique_counts = initialise_counts_per_feature(dict_gene_nonunique_counts, feature_name) dict_gene_unique_counts_ambiguous = initialise_counts_per_feature(dict_gene_unique_counts_ambiguous, feature_name) i += 1 if i % 100000 == 0 and not quiet: sys.stderr.write( "%d GFF lines processed.\n" % i ) except: sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() ) raise if not quiet: sys.stderr.write( "%d GFF lines processed.\n" % i ) if len( counts ) == 0 and not quiet: sys.stderr.write( "Warning: No features of type '%s' found.\n" % feature_type ) try: if sam_filename != "-": read_seq = HTSeq.SAM_Reader( sam_filename ) first_read = iter(read_seq).next() else: read_seq = iter( HTSeq.SAM_Reader( sys.stdin ) ) first_read = read_seq.next() read_seq = itertools.chain( [ first_read ], read_seq ) pe_mode = first_read.paired_end #pe_mode = 1 ## Added by us except: sys.stderr.write( "Error occured when reading first line of sam file.\n" ) raise ################################################################################################### try: if pe_mode: read_seq_pe_file = read_seq read_seq = HTSeq.pair_SAM_alignments( read_seq ) empty = 0 ambiguous = 0 ambiguous_tag=0 notaligned = 0 lowqual = 0 nonunique = 0 
nonunique_nonamb_to_be_rescued = 0 temp_read_name="NA" previous_read_name="NA" temp_interval_r0="NA" temp_interval_r1="NA" counter_fragment = 0 flag_result = 0 i = 0 pe_mode_for_SE = 0 ## -- Added pe_mode on for SE files so that multireads reads will be accounted for if not pe_mode: # real SE pe_mode_for_SE = 1 # read_seq_pe_file = read_seq pe_mode=1 ## -- End index_fragment = 0 for r in read_seq: prev_index_fragment = index_fragment tag_nonunique_NH = 0 tag_overlapping_genes = 0 flag_aln_not_unique = 0 # flag_ambiguous = 0 # #-- LOOP OVER ALL READS IN INPUT BAM FILE if pe_mode_for_SE: r = (r, None) counter_fragment += 1 i += 1 if not pe_mode: # -- SINGLE_END mode if not r.aligned: notaligned += 1 #write_to_samout( r, "not_aligned" ) continue try: if r.optional_field( "NH" ) > 1: # --- Rescue multimappers in singel-end mode #write_to_samout( r, "alignment_not_unique" ) #nonunique += 1 continue except KeyError: pass if r.aQual < minaqual: lowqual += 1 #write_to_samout( r, "too_low_aQual" ) continue if stranded != "reverse": iv_seq = ( co.ref_iv for co in r.cigar if co.type == "M" ) else: iv_seq = ( invert_strand( co.ref_iv ) for co in r.cigar if co.type == "M" ) else: # -- PAIRED-END if r[0] is not None and r[0].aligned: if stranded != "reverse": iv_seq = ( co.ref_iv for co in r[0].cigar if co.type == "M" ) else: iv_seq = ( invert_strand( co.ref_iv ) for co in r[0].cigar if co.type == "M" ) else: iv_seq = tuple() if r[1] is not None and r[1].aligned: if stranded != "reverse": iv_seq = itertools.chain( iv_seq, ( invert_strand( co.ref_iv ) for co in r[1].cigar if co.type == "M" ) ) else: iv_seq = itertools.chain( iv_seq, ( co.ref_iv for co in r[1].cigar if co.type == "M" ) ) else: if ( r[0] is None ) or not ( r[0].aligned ): #write_to_samout( r, "not_aligned" ) notaligned += 1 continue try: if (( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \ ( r[1] is not None and r[1].optional_field( "NH" ) > 1 )): tag_nonunique_NH = 1 if ( r[0] is not None and r[1] is 
None ): result, fs_genes, fs_exons,dict_read_name_genes_names,ambiguous_tag = is_read_in_gene_interval(r[0], features,dict_read_name_genes_names,ambiguous_tag, exons) if result: flag_result = 1 (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[0], \ temp_interval_r0, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names) else: if len(fs_genes) != 0: (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[0], \ temp_interval_r0, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names) if ( r[0] is None and r[1] is not None ): result, fs_genes, fs_exons,dict_read_name_genes_names,ambiguous_tag = is_read_in_gene_interval(r[1], features,dict_read_name_genes_names,ambiguous_tag,exons) if result: flag_result = 1 (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[1], \ temp_interval_r1, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names) else: if len(fs_genes) != 0: (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[1], \ temp_interval_r1, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names) if ( r[0] is not None and r[1] is not None ): result1, fs_genes1, fs_exons1,dict_read_name_genes_names,ambiguous_tag = is_read_in_gene_interval(r[0], features,dict_read_name_genes_names,ambiguous_tag,exons) result2, fs_genes2, fs_exons2,dict_read_name_genes_names,ambiguous_tag = is_read_in_gene_interval(r[1], features,dict_read_name_genes_names,ambiguous_tag,exons) if len(fs_genes1.intersection(fs_genes2)) > 0: fs_genes = fs_genes1.intersection(fs_genes2) elif len(fs_genes1.intersection(fs_genes2))==0: fs_genes = fs_genes1.union(fs_genes2) if result1 and not result2: 
flag_result = 1 (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[0], \ temp_interval_r0, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names) elif result2 and not result1: flag_result = 1 (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[1], \ temp_interval_r1, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names) else: if len(fs_genes1) != 0 or len(fs_genes2) != 0: flag_result = 1 if ( ( ((temp_interval_r0 != str(r[0].iv)) or (temp_interval_r1 != str(r[1].iv))) or (temp_read_name != r[0].read.name) ) ): (dict_nonunique)= add_non_unique_counts_per_feature(fs_genes, dict_nonunique) dict_read_name_genes_names = _populate_read_name_gene_name(dict_read_name_genes_names, fs_genes, r[0].read.name, tag_report_instances_same_multiread_on_same_gene) flag_aln_not_unique = 1 #write_to_samout( r, "alignment_not_unique" ) nonunique += 1 if flag_result: if r[0] is not None and r[1] is None: non_uniq_read_name = r[0].read.name elif r[0] is None and r[1] is not None: non_uniq_read_name = r[1].read.name elif r[0] is not None and r[1] is not None: non_uniq_read_name= r[0].read.name non_uniq_read_name2 = dict_read_name_genes_names.keys()[0] if flag_aln_not_unique: nonunique_nonamb_to_be_rescued += 1 # -- Re-initialise hash # previous_read_name: read which falls into at least one gene interval # tmp_read_name: the previous read in the bam file # BAM is sorted by read name hence each multimapper will be arranged one after another if previous_read_name == "NA": previous_read_name = non_uniq_read_name if non_uniq_read_name != previous_read_name: if previous_read_name in dict_read_name_genes_names.keys(): fs_genes_names = dict_read_name_genes_names[previous_read_name] fh_read_names_gene_names.write("%s\t%s\n" % (previous_read_name, "\t".join(list(fs_genes_names)) )) 
previous_read_name = non_uniq_read_name tmp_dict = {} if non_uniq_read_name in dict_read_name_genes_names.keys(): #print "non_uniq_read_name IN dict_read_name_genes_names.keys()" tmp_dict[non_uniq_read_name] = dict_read_name_genes_names[non_uniq_read_name] dict_read_name_genes_names.clear() # only one read stored dict_read_name_genes_names = tmp_dict flag_result = 0 flag_aln_not_unique = 0 # (temp_read_name, temp_interval_r0, temp_interval_r1) = initalize_read_name_and_interval(r[0], r[1]) continue # except KeyError: except KeyError: pass if ( r[0] and r[0].aQual < minaqual ) or ( r[1] and r[1].aQual < minaqual ): lowqual += 1 #write_to_samout( r, "too_low_aQual" ) continue try: # -- if overlap_mode == "union": fs = set() for iv in iv_seq: # interval from bam file for each fragment if iv.chrom not in features.chrom_vectors: raise UnknownChrom for iv2, fs2 in features[ iv ].steps(): #if debug: #print "****Unique_feature %s and feature_interval %s" %(fs2,iv2) fs = fs.union( fs2 ) elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty": fs = None for iv in iv_seq: if iv.chrom not in features.chrom_vectors: raise UnknownChrom for iv2, fs2 in features[ iv ].steps(): if len(fs2) > 0 or overlap_mode == "intersection-strict": if fs is None: fs = fs2.copy() else: fs = fs.intersection( fs2 ) else: sys.exit( "Illegal overlap mode." ) fs_genes = fs if fs_genes is None or len( fs_genes ) == 0: #write_to_samout( r, "no_feature" ) empty += 1 # ambiguous read count and/or one of the read pair mapping on different gene (potential gene fusion events)... 
# elif len( fs ) > 1: elif len( fs_genes ) > 1: ############################################################### ## AMBIGUOUS UNIQUE ############################################################### is_disambiguated = 0 if not tag_nonunique_NH: if ( r[0] is not None and r[1] is None ): result, fs_genes, fs_exons,dict_read_name_genes_names_ambiguous, ambiguous_tag = is_read_in_gene_interval(r[0], features, dict_read_name_genes_names_ambiguous, ambiguous_tag,exons) if result: (dict_gene_unique_counts) = add_unique_counts_per_feature(dict_gene_unique_counts, fs_genes) is_disambiguated = 1 if ambiguous_tag: (dict_gene_unique_counts_ambiguous) = add_unique_counts_per_feature_ambiguous(fs_genes, dict_gene_unique_counts_ambiguous) flag_ambiguous = 1 # write in the file ambiguous read name gene name data... fh_read_names_gene_names_amb_unique.write("%s\t%s\n" % (r[0].read.name, "\t".join(list(fs_genes)) )) if ( r[0] is None and r[1] is not None ): result, fs_genes, fs_exons, dict_read_name_genes_names_ambiguous, ambiguous_tag = is_read_in_gene_interval(r[1], features, dict_read_name_genes_names_ambiguous, ambiguous_tag,exons) if result: (dict_gene_unique_counts) = add_unique_counts_per_feature(dict_gene_unique_counts, fs_genes) is_disambiguated = 1 if ambiguous_tag: (dict_gene_unique_counts_ambiguous) = add_unique_counts_per_feature_ambiguous(fs_genes, dict_gene_unique_counts_ambiguous) flag_ambiguous = 1 fh_read_names_gene_names_amb_unique.write("%s\t%s\n" % (r[1].read.name, "\t".join(list(fs_genes)) )) if ( r[0] is not None and r[1] is not None ): result1, fs_genes1, fs_exons, dict_read_name_genes_names_ambiguous, ambiguous_tag1 = is_read_in_gene_interval(r[0], features, dict_read_name_genes_names_ambiguous, ambiguous_tag,exons) result2, fs_genes2, fs_exons, dict_read_name_genes_names_ambiguous, ambiguous_tag2 = is_read_in_gene_interval(r[1], features, dict_read_name_genes_names_ambiguous, ambiguous_tag,exons) if debug: print "IN UNIQUE DISAMBIGUATION 
-->r[0].read.name=%s\t%s\t%s\t%s\t%s\n" % (r[0].read.name,result1, result2, fs_genes1, fs_genes2) if len(fs_genes1.intersection(fs_genes2))==1: fs_genes = fs_genes1.intersection(fs_genes2) (dict_gene_unique_counts) = add_unique_counts_per_feature(dict_gene_unique_counts, fs_genes) is_disambiguated = 1 elif len(fs_genes1.intersection(fs_genes2)) > 1: fs_genes = fs_genes1.intersection(fs_genes2) (dict_gene_unique_counts_ambiguous) = add_unique_counts_per_feature_ambiguous(fs_genes, dict_gene_unique_counts_ambiguous) flag_ambiguous = 1 fh_read_names_gene_names_amb_unique.write("%s\t%s\n" % (r[0].read.name, "\t".join(list(fs_genes)) )) elif len(fs_genes1.intersection(fs_genes2))==0: fs_genes = fs_genes1.union(fs_genes2) if (fs_genes1 == set([]) or fs_genes2 == set([])) and len(fs_genes) == 1: ## Disambiguate the uniquely mapped to the single gene it maps on (dict_gene_unique_counts) = add_unique_counts_per_feature(dict_gene_unique_counts, fs_genes) is_disambiguated = 1 elif (fs_genes1 != set([]) or fs_genes2 != set([])): ## Add fragment to the RN-GN for ambiguous uniquely mapped based on ## union of both fs_genes (fs_genes1 & fs_genes2) > 1 (dict_gene_unique_counts_ambiguous) = add_unique_counts_per_feature_ambiguous(fs_genes, dict_gene_unique_counts_ambiguous) flag_ambiguous = 1 fh_read_names_gene_names_amb_unique.write("%s\t%s\n" % (r[0].read.name, "\t".join(list(fs_genes)) )) if flag_ambiguous: ambiguous += 1 #write_to_samout( r, "ambiguous[" + '+'.join( fs ) + "]" ) if is_disambiguated: write_to_samout( r, list(fs_genes)[0] ) else: if debug: #print "DEBUG::CR:: len(fs) <-> 1:: fs = %s" %fs pass write_to_samout( r, list(fs)[0] ) rr2 = r[0] if r[0] is not None else r[1] if not tag_nonunique_NH: (dict_gene_unique_counts) = add_unique_counts_per_feature(dict_gene_unique_counts, fs_genes) except UnknownChrom: if not pe_mode: rr = r else: rr = r[0] if r[0] is not None else r[1] if not quiet: sys.stderr.write( ( "Warning: Skipping read '%s', because chromosome " + "'%s', 
to which it has been aligned, did not appear in the GFF file.\n" ) % ( rr.read.name, iv.chrom ) ) if i % 100000 == 0 and not quiet: sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) ) flag_ambiguous = 0 ## re-initialise.... index_fragment += 1 ######################### # This is to store the last read/fragment since it will no pass in previous condition: # => if non_uniq_read_name != previous_read_name: # -- At same level as the for loop (outside of the for loop) - column: 7 #fh_read_names_gene_names.close() if dict_read_name_genes_names.keys() != []: #print "dict_read_name_genes_names passing" non_uniq_read_name = dict_read_name_genes_names.keys()[0] fs_genes_names = dict_read_name_genes_names[non_uniq_read_name] fh_read_names_gene_names.write("%s\t%s\n" % (non_uniq_read_name, "\t".join(list(fs_genes_names)) )) # -- fh_read_names_gene_names.close() fh_read_names_gene_names_amb_unique.close() ################################################################################################### #except UnboundLocalError: except AttributeError: #except: if not pe_mode: sys.stderr.write( "Error occured in %s.\n" % read_seq.get_line_number_string() ) else: sys.stderr.write( "Error occured in %s.\n" % read_seq_pe_file.get_line_number_string() ) raise if not quiet: sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) ) if samoutfile is not None: samoutfile.close() if tag_gff == "gene_gff": tuples_genenames_exontag = [(fn, fn) for fn in dict_gene_unique_counts.keys()] tuples_genenames_exontag.sort() previous_gene_name = "NA" for gene_name, fn in tuples_genenames_exontag: gene_name = gene_name.strip() fn = fn.strip() if tag_gff == "gene_gff": # if gene_name in dict_gene_unique_counts.keys(): print "%s\t%i\t%i\t%s" % ( fn, dict_gene_unique_counts[gene_name], dict_nonunique[gene_name],dict_gene_unique_counts_ambiguous[gene_name] ) else: # -- No non-unique reads for that gene_name print 
"%s\t%i\t%i\t%i" % ( fn, dict_gene_unique_counts[gene_name], 0,dict_gene_unique_counts_ambiguous[gene_name] ) # -- Re-initialise gene name previous_gene_name = gene_name print "no_feature\t%d" % empty print "ambiguous\t%d" % ambiguous print "too_low_aQual\t%d" % lowqual print "not_aligned\t%d" % notaligned print "alignment_not_unique\t%d" % nonunique print "nonunique_nonamb_to_be_rescued:\t%d" % nonunique_nonamb_to_be_rescued
def _read_starts_with_match(read):
    """Return True when the strand-aware first CIGAR operation of *read* is 'M'."""
    strand = read.iv.strand
    if strand == '+':
        return read.cigar[0].type == 'M'
    if strand == '-':
        return read.cigar[-1].type == 'M'
    return False


def _select_start_read(reads, mapq_threshold):
    """Pick the single usable alignment among *reads*, or None.

    An alignment is a candidate when it is present, aligned and begins with a
    match.  Exactly one candidate must exist, and it must reach
    *mapq_threshold* and not carry SAM flag 0x400 (1024, duplicate).
    """
    candidates = [read for read in reads
                  if read is not None and read.aligned
                  and _read_starts_with_match(read)]
    if len(candidates) != 1:
        return None
    read = candidates[0]
    if read.aQual >= mapq_threshold and not read.flag & 1024:
        return read
    return None


def tabulate_start_positions(BamFileName, cells, name, targetsite, mapq_threshold, gap_threshold, start_threshold, outfile_base):
    """Tabulate the start positions of properly facing read pairs from a BAM file.

    Alignments are read in bundles (all alignments of one pair together).  For
    each bundle one alignment per mate is selected (see ``_select_start_read``).
    A pair is recorded when both mates were selected, lie on the same
    chromosome from the fixed list 1-22/X/Y, are on opposite strands and their
    start positions are at most ``gap_threshold`` apart.  Recorded pairs are
    written as tab-separated rows to ``<outfile_base>_coordinates.txt`` and
    their start positions are added to the returned genomic arrays.

    Fixes relative to the previous implementation:
      * the MAPQ / duplicate filters now apply to minus-strand reads as well
        (an ``and``/``or`` precedence slip let them bypass both filters);
      * the second mate's duplicate flag is read from the second mate, not
        from the first;
      * a missing mate (``None``) no longer raises AttributeError;
      * the header's read-2 position column is named ``Read2_start_position``.

    Parameters:
        BamFileName: path of the BAM file, opened with ``HTSeq.BAM_Reader``.
        cells, name, targetsite: labels copied verbatim into every output row.
        mapq_threshold: minimum ``aQual`` a selected alignment must have.
        gap_threshold: maximum distance between the two mates' start positions.
        start_threshold: accepted for interface compatibility; not used here.
        outfile_base: prefix of the coordinates file that is written.

    Returns:
        Tuple ``(ga, ga_windows, ga_stranded, ga_coverage, read_count)``.
        ``ga`` and ``ga_stranded`` hold start-position counts, ``ga_windows``
        holds 1 at every recorded start position, ``ga_coverage`` is returned
        untouched by this function, and ``read_count`` is the number of
        bundles processed.
    """
    output_filename = '{0}_coordinates.txt'.format(outfile_base)
    sorted_bam_file = HTSeq.BAM_Reader(BamFileName)
    filename_base = os.path.basename(BamFileName)

    ga = HTSeq.GenomicArray("auto", stranded=False)
    ga_windows = HTSeq.GenomicArray("auto", stranded=False)
    ga_stranded = HTSeq.GenomicArray("auto", stranded=True)
    ga_coverage = HTSeq.GenomicArray("auto", stranded=False)
    read_count = 0

    ref_chr = frozenset([str(number) for number in range(1, 23)] + ['X', 'Y'])

    with open(output_filename, 'w') as o:
        header = [
            '#Name', 'Targetsite_Sequence', 'Cells', 'BAM', 'Read1_chr',
            'Read1_start_position', 'Read1_strand', 'Read2_chr',
            'Read2_start_position', 'Read2_strand',
        ]
        print(*header, sep='\t', file=o)

        for bundle in HTSeq.pair_SAM_alignments(sorted_bam_file, bundle=True):
            first_read = second_read = None
            if bundle:
                # zip(*bundle) separates the first mates from the second
                # mates; a single-alignment bundle yields two 1-tuples.
                first_mates, second_mates = zip(*bundle)
                first_read = _select_start_read(first_mates, mapq_threshold)
                second_read = _select_start_read(second_mates, mapq_threshold)

            if first_read is not None and second_read is not None:
                first_read_chr = first_read.iv.chrom
                first_read_position = first_read.iv.start_d
                first_read_strand = first_read.iv.strand
                second_read_chr = second_read.iv.chrom
                second_read_position = second_read.iv.start_d
                second_read_strand = second_read.iv.strand

                # Both strands are '+' or '-' here (guaranteed by selection),
                # so "different" means one mate on each strand.
                if (first_read_chr == second_read_chr
                        and first_read_chr in ref_chr
                        and first_read_strand != second_read_strand
                        and abs(first_read_position - second_read_position) <= gap_threshold):
                    for chrom, position, strand in (
                            (first_read_chr, first_read_position, first_read_strand),
                            (second_read_chr, second_read_position, second_read_strand)):
                        start = HTSeq.GenomicPosition(chrom, position, strand)
                        ga[start] += 1
                        ga_windows[start] = 1
                        ga_stranded[start] += 1

                    # Output read positions for plotting.
                    print(name, targetsite, cells, filename_base,
                          first_read_chr, first_read_position, first_read_strand,
                          second_read_chr, second_read_position, second_read_strand,
                          sep='\t', file=o)

            read_count += 1
            if not read_count % 100000:
                # Progress indicator, in millions of bundles.
                print(read_count / float(1000000), end=" ", file=sys.stderr)

    return ga, ga_windows, ga_stranded, ga_coverage, read_count
def count_reads_in_features(sam_filename, gff_filename, samtype, order,
                            overlap_mode, feature_type, id_attribute, quiet,
                            minaqual, mapping_file, scale_method):
    """Count reads per GFF feature and print (optionally aggregated) abundances.

    Features of type ``feature_type`` are loaded from ``gff_filename`` into an
    unstranded ``HTSeq.GenomicArrayOfSets``; every alignment that overlaps
    exactly one feature increments that feature's count.  When
    ``mapping_file`` is given, feature counts are summed into the categories
    listed there.  The abundance table is printed to stdout, the per-reason
    tallies of discarded reads are written to stderr.

    Args:
        sam_filename: Alignment file path, or "-" to read from stdin.
        gff_filename: GFF file handed to ``HTSeq.GFF_Reader``.
        samtype: "sam" or "bam".
        order: "name" or "position"; only consulted for paired-end input.
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty".
        feature_type: GFF feature type to count; other lines are ignored.
        id_attribute: GFF attribute used as feature id; features lacking it
            are skipped.
        quiet: Suppress progress messages on stderr.
        minaqual: Alignments with ``aQual`` below this value are discarded.
        mapping_file: Optional tab-separated file with four columns
            (feature, category, length, organism); falsy to disable.
        scale_method: "none" keeps raw counts; any other value routes each
            count through ``scale_abundance``.

    Raises:
        ValueError: Unknown ``samtype``, illegal ``order``, or a mapping row
            that does not have exactly four columns.
    """
    features = HTSeq.GenomicArrayOfSets("auto", False)
    counts = {}

    # Open and immediately close the inputs so that a missing file is
    # reported before the (slow) GFF parse starts.
    if sam_filename != "-":
        open(sam_filename).close()
    if mapping_file:
        open(mapping_file).close()

    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    # Features without the id attribute are silently skipped
                    # (and, as before, not included in the line tally).
                    continue
                features[f.iv] += feature_id
                counts[feature_id] = 0
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("{!s} GFF lines processed.\n".format(i))
    except:
        # Deliberately broad: annotate any failure with the GFF position,
        # then let it propagate unchanged.
        sys.stderr.write("Error occured when processing GFF file ({}):\n"
                         .format(gff.get_line_number_string()))
        raise

    if not quiet:
        sys.stderr.write("{!s} GFF lines processed.\n".format(i))

    num_features = len(counts)
    if num_features == 0:
        sys.stderr.write("Warning: No features of type '{}' found.\n"
                         .format(feature_type))

    if samtype == "sam":
        align_reader = HTSeq.SAM_Reader
    elif samtype == "bam":
        align_reader = HTSeq.BAM_Reader
    else:
        raise ValueError("Unknown input format {} specified.".format(samtype))

    try:
        if sam_filename != "-":
            read_seq_file = align_reader(sam_filename)
            read_seq = read_seq_file
            first_read = next(iter(read_seq))
        else:
            # stdin cannot be rewound, so the peeked record is chained back
            # in front of the remaining stream.
            read_seq_file = align_reader(sys.stdin)
            read_seq_iter = iter(read_seq_file)
            first_read = next(read_seq_iter)
            read_seq = itertools.chain([first_read], read_seq_iter)
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write("Error occured when reading SAM/BAM file.\n")
        raise

    try:
        if pe_mode:
            if order == "name":
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            elif order == "position":
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
            else:
                raise ValueError("Illegal order specified.")

        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            if i > 0 and i % 100000 == 0 and not quiet:
                sys.stderr.write(
                    "{!s} SAM alignment record{} processed.\n"
                    .format(i, "s" if not pe_mode else " pairs"))
            i += 1

            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        continue
                except KeyError:
                    # No NH tag: treat the alignment as unique.
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    continue
                iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                          if co.type == "M" and co.size > 0)
            else:
                if r[0] is not None and r[0].aligned:
                    iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                              if co.type == "M" and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    iv_seq = itertools.chain(
                        iv_seq,
                        (co.ref_iv for co in r[1].cigar
                         if co.type == "M" and co.size > 0))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        notaligned += 1
                        continue
                try:
                    if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                            (r[1] is not None and r[1].optional_field("NH") > 1):
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if (r[0] and r[0].aQual < minaqual) or \
                        (r[1] and r[1].aQual < minaqual):
                    lowqual += 1
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode == "intersection-strict" or \
                        overlap_mode == "intersection-nonempty":
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")

                if fs is None or len(fs) == 0:
                    empty += 1
                elif len(fs) > 1:
                    ambiguous += 1
                else:
                    counts[list(fs)[0]] += 1
            except UnknownChrom:
                # Chromosome absent from the GFF: counted as "no feature".
                empty += 1
    except:
        sys.stderr.write("Error occured when processing SAM input ({}):\n"
                         .format(read_seq_file.get_line_number_string()))
        raise

    if not quiet:
        sys.stderr.write(
            "{!s} SAM {} processed.\n"
            .format(i, "alignments " if not pe_mode else "alignment pairs"))

    # Map to higher order features if applicable.
    if mapping_file:
        abundances = {}
        # Feature ids that received at least one category.  Tracking them
        # separately matters because ``abundances`` is keyed by *category*:
        # testing feature ids against it would re-count every mapped feature
        # whose category name differs from its id as UNMAPPED.
        mapped_features = set()
        with open(mapping_file) as mapping_h:
            for row in csv.reader(mapping_h, delimiter='\t'):
                try:
                    feature, feature_category, feature_length, _organism = row
                except ValueError:
                    sys.stderr.write("Can't determine the format of '{}'"
                                     .format(mapping_file))
                    raise

                if feature not in counts:
                    continue
                mapped_features.add(feature)

                # An empty category column means the feature is its own
                # category.
                if not feature_category:
                    feature_category = feature

                if scale_method == 'none':
                    abund = counts[feature]
                else:
                    abund = scale_abundance(counts[feature],
                                            int(feature_length))

                # A feature may belong to several comma-separated categories;
                # each of them receives the full abundance.
                for category in feature_category.split(','):
                    abundances[category] = abundances.get(category, 0) + abund

        if num_features > 0 and len(abundances) == 0:
            sys.stderr.write("Warning: No higher order features found. Please "
                             "make sure the mapping file is formatted correctly.\n")

        for feature in counts:
            if feature not in mapped_features:
                abundances['UNMAPPED'] = (abundances.get('UNMAPPED', 0)
                                          + counts[feature])
    else:
        abundances = dict(counts)

    # "UNMAPPED" can be interpreted as a single unknown gene of length 1
    # kilobase recruiting all reads that failed to map to known sequences.
    abundances['UNMAPPED'] = (abundances.get('UNMAPPED', 0) + empty
                              + ambiguous + lowqual + notaligned + nonunique)

    for fn in sorted(abundances):
        print("{}\t{!s}".format(fn, abundances[fn]))

    sys.stderr.write("__no_feature\t{!s}\n".format(empty))
    sys.stderr.write("__ambiguous\t{!s}\n".format(ambiguous))
    sys.stderr.write("__too_low_aQual\t{!s}\n".format(lowqual))
    sys.stderr.write("__not_aligned\t{!s}\n".format(notaligned))
    sys.stderr.write("__alignment_not_unique\t{!s}\n".format(nonunique))
def count_reads_in_features( sam_filename, gff_filename, stranded, overlap_mode, feature_type, id_attribute, quiet, minaqual, samout, allow_ambiguous, allow_nonunique ):
    """Map reads to features and print their offsets inside each feature.

    For every read (or read pair) assigned to at least one feature, one line
    ``feature<TAB>start<TAB>end<TAB>read_name`` is printed per overlapped
    step, where start/end are 0-based, inclusive offsets measured along the
    feature's exons in transcription order.

    Args:
        sam_filename: SAM file path, or "-" to read from stdin.
        gff_filename: GFF file handed to ``HTSeq.GFF_Reader``.
        stranded: "yes", "no" or "reverse".
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty".
        feature_type: GFF feature type to load (other lines are ignored).
        id_attribute: GFF attribute used as feature id.
        quiet: Suppress HTSeq warnings and progress messages.
        minaqual: Alignments with ``aQual`` below this value are discarded.
        samout: Path of an annotated SAM copy to write, or "" for none.
        allow_ambiguous: "no" discards reads overlapping several features;
            any other value reports them for every feature.
        allow_nonunique: "no" discards alignments whose NH tag exceeds 1.
    """

    def write_to_samout( r, assignment ):
        """Append the read(s) to the samout file with an XF:Z tag."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write( read.original_sam_line.rstrip() +
                                  "\tXF:Z:" + assignment + "\n" )

    if quiet:
        warnings.filterwarnings( action="ignore", module="HTSeq" )

    samoutfile = open( samout, "w" ) if samout != "" else None

    # Everything below runs under try/finally so the samout file is closed
    # even when parsing or counting fails.
    try:
        features = HTSeq.GenomicArrayOfSets( "auto", stranded != "no" )
        features_dict = defaultdict(list)
        counts = {}

        # Try to open samfile to fail early in case it is not there
        if sam_filename != "-":
            open( sam_filename ).close()

        gff = HTSeq.GFF_Reader( gff_filename )
        i = 0
        try:
            for f in gff:
                if f.type == feature_type:
                    try:
                        feature_id = f.attr[ id_attribute ]
                    except KeyError:
                        sys.exit( "Feature %s does not contain a '%s' attribute" %
                                  ( f.name, id_attribute ) )
                    if stranded != "no" and f.iv.strand == ".":
                        sys.exit( "Feature %s at %s does not have strand information but you are "
                                  "running htseq-count in stranded mode. Use '--stranded=no'." %
                                  ( f.name, f.iv ) )
                    features[ f.iv ] += feature_id
                    counts[ feature_id ] = 0
                    features_dict[ feature_id ].append(f)
                i += 1
                if i % 100000 == 0 and not quiet:
                    sys.stderr.write( "%d GFF lines processed.\n" % i )
        except:
            # Deliberately broad: report where the GFF parse stopped, re-raise.
            sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() )
            raise

        if not quiet:
            sys.stderr.write( "%d GFF lines processed.\n" % i )
            sys.stderr.write( "Sorting exons from GFF file.\n" )

        # Order each feature's exons in transcription direction: descending
        # start for "-" strand features, ascending otherwise.
        for exons in features_dict.values():
            exons.sort( key=lambda feat: feat.iv.start,
                        reverse=(exons[0].iv.strand == "-") )

        if len( counts ) == 0 and not quiet:
            sys.stderr.write( "Warning: No features of type '%s' found.\n" % feature_type )

        try:
            # Keep the reader object itself: it is the only thing that can
            # report line numbers later (an itertools.chain cannot).
            if sam_filename != "-":
                read_seq_file = HTSeq.SAM_Reader( sam_filename )
                read_seq = read_seq_file
                first_read = next( iter( read_seq ) )
            else:
                read_seq_file = HTSeq.SAM_Reader( sys.stdin )
                read_seq_iter = iter( read_seq_file )
                first_read = next( read_seq_iter )
                read_seq = itertools.chain( [ first_read ], read_seq_iter )
            pe_mode = first_read.paired_end
        except:
            sys.stderr.write( "Error occured when reading first line of sam file.\n" )
            raise

        try:
            if pe_mode:
                read_seq = HTSeq.pair_SAM_alignments( read_seq )
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout( r, "not_aligned" )
                        continue
                    try:
                        if (allow_nonunique == "no") and (r.optional_field( "NH" ) > 1):
                            write_to_samout( r, "alignment_not_unique" )
                            nonunique += 1
                            continue
                    except KeyError:
                        # No NH tag: treat the alignment as unique.
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout( r, "too_low_aQual" )
                        continue
                    if stranded != "reverse":
                        iv_seq = ( co.ref_iv for co in r.cigar if co.type == "M" )
                    else:
                        iv_seq = ( invert_strand( co.ref_iv ) for co in r.cigar if co.type == "M" )
                else:
                    if r[0] is not None and r[0].aligned:
                        if stranded != "reverse":
                            iv_seq = ( co.ref_iv for co in r[0].cigar if co.type == "M" )
                        else:
                            iv_seq = ( invert_strand( co.ref_iv ) for co in r[0].cigar if co.type == "M" )
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        # The mate is on the opposite strand, hence the
                        # inverted logic compared to the first read.
                        if stranded != "reverse":
                            iv_seq = itertools.chain( iv_seq,
                                ( invert_strand( co.ref_iv ) for co in r[1].cigar if co.type == "M" ) )
                        else:
                            iv_seq = itertools.chain( iv_seq,
                                ( co.ref_iv for co in r[1].cigar if co.type == "M" ) )
                    else:
                        if ( r[0] is None ) or not ( r[0].aligned ):
                            write_to_samout( r, "not_aligned" )
                            notaligned += 1
                            continue
                    try:
                        if (allow_nonunique == "no") and \
                                (( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or
                                 ( r[1] is not None and r[1].optional_field( "NH" ) > 1 )):
                            nonunique += 1
                            write_to_samout( r, "alignment_not_unique" )
                            continue
                    except KeyError:
                        pass
                    if ( r[0] and r[0].aQual < minaqual ) or ( r[1] and r[1].aQual < minaqual ):
                        lowqual += 1
                        write_to_samout( r, "too_low_aQual" )
                        continue

                try:
                    # Materialise the intervals: they are walked twice, once
                    # to decide the assignment and once to compute offsets.
                    iv_seq = list(iv_seq)
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[ iv ].steps():
                                fs = fs.union( fs2 )
                    elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[ iv ].steps():
                                if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection( fs2 )
                    else:
                        sys.exit( "Illegal overlap mode." )

                    if fs is None or len( fs ) == 0:
                        write_to_samout( r, "no_feature" )
                        empty += 1
                    elif (len( fs ) > 1) and (allow_ambiguous == "no"):
                        write_to_samout( r, "ambiguous[" + '+'.join( fs ) + "]" )
                        ambiguous += 1
                    else:
                        if pe_mode:
                            rname = r[0].read.name if r[0] is not None else r[1].read.name
                        else:
                            rname = r.read.name
                        for iv in iv_seq:
                            for iv2, fs2 in features[ iv ].steps():
                                for fsi in fs2:
                                    # Sum the lengths of the exons preceding
                                    # the one that contains this step, then
                                    # add the position inside that exon.
                                    offset = 0
                                    for exon in features_dict[ fsi ]:
                                        if (iv2.start >= exon.iv.start) and (iv2.end <= exon.iv.end):
                                            if exon.iv.strand == "+":
                                                offset += (iv2.start - exon.iv.start)
                                            else:
                                                offset += (exon.iv.end - iv2.end)
                                            break
                                        else:
                                            offset += (exon.iv.end - exon.iv.start)
                                    # output is 0-based, inclusive on both ends
                                    print( "%s\t%d\t%d\t%s" %
                                           (fsi, offset, offset + (iv2.end-iv2.start-1), rname) )
                except UnknownChrom:
                    # Chromosome absent from the GFF: counted as "no feature".
                    empty += 1

                if i % 100000 == 0 and not quiet:
                    sys.stderr.write( "%d sam %s processed.\n" %
                                      ( i, "lines " if not pe_mode else "line pairs" ) )
        except:
            sys.stderr.write( "Error occured in %s.\n" % read_seq_file.get_line_number_string() )
            raise

        if not quiet:
            sys.stderr.write( "%d sam %s processed.\n" %
                              ( i, "lines " if not pe_mode else "line pairs" ) )
    finally:
        if samoutfile is not None:
            samoutfile.close()
def count_reads_in_features( sam_filename, gff_filename, stranded, overlap_mode, feature_type, id_attribute, quiet, minaqual, samout ):
    """Count reads per feature and print counts, feature length and RPKM.

    Features are loaded through ``parse_gff``; each read (or read pair) that
    overlaps exactly one feature increments its count.  For every feature a
    line ``id<TAB>count<TAB>length<TAB>rpkm`` is printed, followed by the
    tallies of reads that were not counted.

    Args:
        sam_filename: SAM file path, or "-" to read from stdin.
        gff_filename: GFF file passed to ``parse_gff``.
        stranded: "yes", "no" or "reverse".
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty".
        feature_type: GFF feature type to count.
        id_attribute: GFF attribute used as feature id.
        quiet: Suppress HTSeq warnings and progress messages.
        minaqual: Alignments with ``aQual`` below this value are discarded.
        samout: Path of an annotated SAM copy to write, or "" for none.
    """

    def write_to_samout( r, assignment ):
        """Append the read(s) to the samout file with an XF:Z tag."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write( read.original_sam_line.rstrip() +
                                  "\tXF:Z:" + assignment + "\n" )

    if quiet:
        warnings.filterwarnings( action="ignore", module="HTSeq" )

    samoutfile = open( samout, "w" ) if samout != "" else None

    # try/finally guarantees the samout file is closed on every exit path.
    try:
        features = HTSeq.GenomicArrayOfSets( "auto", stranded != "no" )
        counts = {}

        # Try to open samfile to fail early in case it is not there
        if sam_filename != "-":
            open( sam_filename ).close()

        counts, colgenes = parse_gff(gff_filename,features,feature_type,id_attribute,stranded,quiet,counts)

        if len( counts ) == 0 and not quiet:
            sys.stderr.write( "Warning: No features of type '%s' found.\n" % feature_type )

        try:
            # Keep the reader object itself: it is the only thing that can
            # report line numbers later (an itertools.chain cannot).
            if sam_filename != "-":
                read_seq_file = HTSeq.SAM_Reader( sam_filename )
                read_seq = read_seq_file
                first_read = next( iter( read_seq ) )
            else:
                read_seq_file = HTSeq.SAM_Reader( sys.stdin )
                read_seq_iter = iter( read_seq_file )
                first_read = next( read_seq_iter )
                read_seq = itertools.chain( [ first_read ], read_seq_iter )
            pe_mode = first_read.paired_end
        except:
            sys.stderr.write( "Error occured when reading first line of sam file.\n" )
            raise

        try:
            if pe_mode:
                read_seq = HTSeq.pair_SAM_alignments( read_seq )
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout( r, "not_aligned" )
                        continue
                    try:
                        if r.optional_field( "NH" ) > 1:
                            write_to_samout( r, "alignment_not_unique" )
                            nonunique += 1
                            continue
                    except KeyError:
                        # No NH tag: treat the alignment as unique.
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout( r, "too_low_aQual" )
                        continue
                    if stranded != "reverse":
                        iv_seq = ( co.ref_iv for co in r.cigar
                                   if co.type == "M" and co.size > 0 )
                    else:
                        iv_seq = ( invert_strand( co.ref_iv ) for co in r.cigar
                                   if co.type == "M" and co.size > 0 )
                else:
                    if r[0] is not None and r[0].aligned:
                        if stranded != "reverse":
                            iv_seq = ( co.ref_iv for co in r[0].cigar
                                       if co.type == "M" and co.size > 0 )
                        else:
                            iv_seq = ( invert_strand( co.ref_iv ) for co in r[0].cigar
                                       if co.type == "M" and co.size > 0 )
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        # The mate is on the opposite strand, hence the
                        # inverted logic compared to the first read.
                        if stranded != "reverse":
                            iv_seq = itertools.chain( iv_seq,
                                ( invert_strand( co.ref_iv ) for co in r[1].cigar
                                  if co.type == "M" and co.size > 0 ) )
                        else:
                            iv_seq = itertools.chain( iv_seq,
                                ( co.ref_iv for co in r[1].cigar
                                  if co.type == "M" and co.size > 0 ) )
                    else:
                        if ( r[0] is None ) or not ( r[0].aligned ):
                            write_to_samout( r, "not_aligned" )
                            notaligned += 1
                            continue
                    try:
                        if ( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \
                                ( r[1] is not None and r[1].optional_field( "NH" ) > 1 ):
                            nonunique += 1
                            write_to_samout( r, "alignment_not_unique" )
                            continue
                    except KeyError:
                        pass
                    if ( r[0] and r[0].aQual < minaqual ) or ( r[1] and r[1].aQual < minaqual ):
                        lowqual += 1
                        write_to_samout( r, "too_low_aQual" )
                        continue

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            # Collect everything found within interval iv.
                            for iv2, fs2 in features[ iv ].steps():
                                fs = fs.union( fs2 )
                    elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[ iv ].steps():
                                if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection( fs2 )
                    else:
                        sys.exit( "Illegal overlap mode." )

                    if fs is None or len( fs ) == 0:
                        write_to_samout( r, "no_feature" )
                        empty += 1
                    elif len( fs ) > 1:
                        write_to_samout( r, "ambiguous[" + '+'.join( fs ) + "]" )
                        ambiguous += 1
                    else:
                        feature_id = list(fs)[0]
                        write_to_samout( r, feature_id )
                        counts[ feature_id ] += 1
                except UnknownChrom:
                    # Chromosome absent from the GFF: counted as "no feature".
                    empty += 1

                if i % 100000 == 0 and not quiet:
                    sys.stderr.write( "%d sam %s processed.\n" %
                                      ( i, "lines " if not pe_mode else "line pairs" ) )
        except:
            sys.stderr.write( "Error occured in %s.\n" % read_seq_file.get_line_number_string() )
            raise

        if not quiet:
            sys.stderr.write( "%d sam %s processed.\n" %
                              ( i, "lines " if not pe_mode else "line pairs" ) )
    finally:
        if samoutfile is not None:
            samoutfile.close()

    # Loop-invariant: the counts are final at this point.
    total_assigned = sum(counts.values())
    for fn in sorted( counts.keys() ):
        # NOTE(review): this "<records> <assigned>" line is emitted before
        # every feature row and looks like leftover debug output; it is kept
        # so the stdout format stays unchanged -- confirm whether it can go.
        print( "%s %s" % ( i, total_assigned ) )
        rpkm, feature_len = get_rpkm(counts[fn],i,colgenes[fn])
        print( "%s\t%d\t%d\t%d" % ( fn, counts[fn], feature_len, rpkm ) )
    print( "no_feature\t%d" % empty )
    print( "ambiguous\t%d" % ambiguous )
    print( "too_low_aQual\t%d" % lowqual )
    print( "not_aligned\t%d" % notaligned )
    print( "alignment_not_unique\t%d" % nonunique )
set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] ) if len( set_of_gene_names ) == 0: counts[ '_empty' ] += 1 elif len( set_of_gene_names ) > 1: counts[ '_ambiguous' ] +=1 else: for f in rs: counts[ f.name ] += 1 num_reads += 1 if num_reads % 100000 == 0: sys.stderr.write( "%d reads processed.\n" % num_reads ) else: # paired-end alignments = dict() if order == "name": for af, ar in HTSeq.pair_SAM_alignments( reader( sam_file ) ): if af == None or ar == None: continue if not ar.aligned: continue if not af.aligned: continue elif ar.optional_field("NH") > max_NH or af.optional_field("NH") > max_NH: continue elif af.iv.chrom != ar.iv.chrom: counts['_ambiguous_readpair_position'] += 1 continue else: rs = map_read_pair( af, ar ) counts = update_count_vector( counts, rs ) num_reads += 1
def count_reads(features, counts, pe_mode, read_seq, order, stranded,
                overlap_mode, quiet, minaqual, write_to_samout):
    """Assign alignments to features, update ``counts`` and print the table.

    Args:
        features: Genomic array of feature-id sets; must provide
            ``chrom_vectors`` and ``features[iv].steps()``.
        counts: Dict mapping feature id to count; updated in place.
        pe_mode: True when ``read_seq`` holds paired-end alignments.
        read_seq: Iterable of alignments (paired up here when ``pe_mode``).
        order: "name" or "pos"; only consulted when ``pe_mode`` is true.
        stranded: "reverse" flips the strand of the first read (and keeps the
            mate's); any other value does the opposite.
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty".
        quiet: Suppress progress messages on stderr.
        minaqual: Alignments with ``aQual`` below this value are discarded.
        write_to_samout: Callable ``(read_or_pair, assignment)`` invoked once
            per processed read with the feature id or a "__..." reason.

    Raises:
        ValueError: ``order`` is neither "name" nor "pos" in paired-end mode.
    """
    if pe_mode:
        if order == "name":
            read_seq = HTSeq.pair_SAM_alignments(read_seq)
        elif order == "pos":
            read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
        else:
            raise ValueError("Illegal order specified.")

    empty = 0
    ambiguous = 0
    notaligned = 0
    lowqual = 0
    nonunique = 0
    i = 0
    for r in read_seq:
        if i > 0 and i % 100000 == 0 and not quiet:
            sys.stderr.write("%d SAM alignment record%s processed.\n" %
                             (i, "s" if not pe_mode else " pairs"))
        i += 1

        if not pe_mode:
            if not r.aligned:
                notaligned += 1
                write_to_samout(r, "__not_aligned")
                continue
            try:
                if r.optional_field("NH") > 1:
                    nonunique += 1
                    write_to_samout(r, "__alignment_not_unique")
                    continue
            except KeyError:
                # No NH tag: treat the alignment as unique.
                pass
            if r.aQual < minaqual:
                lowqual += 1
                write_to_samout(r, "__too_low_aQual")
                continue
            if stranded != "reverse":
                iv_seq = (co.ref_iv for co in r.cigar
                          if co.type == "M" and co.size > 0)
            else:
                iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                          if co.type == "M" and co.size > 0)
        else:
            if r[0] is not None and r[0].aligned:
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r[0].cigar
                              if co.type == "M" and co.size > 0)
                else:
                    iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                              if co.type == "M" and co.size > 0)
            else:
                iv_seq = tuple()
            if r[1] is not None and r[1].aligned:
                # The mate is on the opposite strand, hence the inverted
                # logic compared to the first read.
                if stranded != "reverse":
                    iv_seq = itertools.chain(
                        iv_seq,
                        (invert_strand(co.ref_iv) for co in r[1].cigar
                         if co.type == "M" and co.size > 0))
                else:
                    iv_seq = itertools.chain(
                        iv_seq,
                        (co.ref_iv for co in r[1].cigar
                         if co.type == "M" and co.size > 0))
            else:
                if (r[0] is None) or not (r[0].aligned):
                    write_to_samout(r, "__not_aligned")
                    notaligned += 1
                    continue
            try:
                if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                        (r[1] is not None and r[1].optional_field("NH") > 1):
                    nonunique += 1
                    write_to_samout(r, "__alignment_not_unique")
                    continue
            except KeyError:
                pass
            if (r[0] and r[0].aQual < minaqual) or \
                    (r[1] and r[1].aQual < minaqual):
                lowqual += 1
                write_to_samout(r, "__too_low_aQual")
                continue

        try:
            if overlap_mode == "union":
                fs = set()
                for iv in iv_seq:
                    if iv.chrom not in features.chrom_vectors:
                        raise UnknownChrom
                    for iv2, fs2 in features[iv].steps():
                        fs = fs.union(fs2)
            elif overlap_mode == "intersection-strict" or \
                    overlap_mode == "intersection-nonempty":
                fs = None
                for iv in iv_seq:
                    if iv.chrom not in features.chrom_vectors:
                        raise UnknownChrom
                    for iv2, fs2 in features[iv].steps():
                        # "strict" also intersects with empty steps, which
                        # empties the result; "nonempty" skips them.
                        if len(fs2) > 0 or overlap_mode == "intersection-strict":
                            if fs is None:
                                fs = fs2.copy()
                            else:
                                fs = fs.intersection(fs2)
            else:
                sys.exit("Illegal overlap mode.")

            if fs is None or len(fs) == 0:
                write_to_samout(r, "__no_feature")
                empty += 1
            elif len(fs) > 1:
                write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
                ambiguous += 1
            else:
                feature_id = list(fs)[0]
                write_to_samout(r, feature_id)
                counts[feature_id] += 1
        except UnknownChrom:
            # Chromosome absent from the annotation: counted as "no feature".
            write_to_samout(r, "__no_feature")
            empty += 1

    if not quiet:
        sys.stderr.write("%d SAM %s processed.\n" %
                         (i, "alignments " if not pe_mode else "alignment pairs"))

    for fn in sorted(counts.keys()):
        print("%s\t%d" % (fn, counts[fn]))
    print("__no_feature\t%d" % empty)
    print("__ambiguous\t%d" % ambiguous)
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
def count_reads(sam_filename, features, counts, samtype, order, stranded,
                overlap_mode, quiet, minaqual, samout):
    """Read a SAM/BAM file, count reads per feature and print the table.

    Args:
        sam_filename: Alignment file path, or "-" to read from stdin.
        features: Genomic array of feature-id sets; must provide
            ``chrom_vectors`` and ``features[iv].steps()``.
        counts: Dict mapping feature id to count; updated in place.
        samtype: "sam" or "bam".
        order: "name" or "pos"; only consulted for paired-end input.
        stranded: "reverse" flips the strand of the first read (and keeps the
            mate's); any other value does the opposite.
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty".
        quiet: Suppress progress messages on stderr.
        minaqual: Alignments with ``aQual`` below this value are discarded.
        samout: Path of an annotated SAM copy to write, or "" for none.

    Raises:
        ValueError: Unknown ``samtype`` or illegal ``order``.
    """

    def write_to_samout(r, assignment):
        """Append the read(s) to the samout file with an XF:Z tag."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    samoutfile = open(samout, "w") if samout != "" else None

    # try/finally guarantees the samout file is closed on every exit path,
    # including the ValueError/processing errors raised below.
    try:
        if samtype == "sam":
            SAM_or_BAM_Reader = HTSeq.SAM_Reader
        elif samtype == "bam":
            SAM_or_BAM_Reader = HTSeq.BAM_Reader
        else:
            raise ValueError("Unknown input format %s specified." % samtype)

        try:
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                # stdin cannot be rewound, so the peeked record is chained
                # back in front of the remaining stream.
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except:
            sys.stderr.write(
                "Error occured when reading beginning of SAM/BAM file.\n")
            raise

        try:
            if pe_mode:
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
                else:
                    raise ValueError("Illegal order specified.")

            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d SAM alignment record%s processed.\n" %
                                     (i, "s" if not pe_mode else " pairs"))
                i += 1

                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        # No NH tag: treat the alignment as unique.
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r.cigar
                                  if co.type == "M" and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                                  if co.type == "M" and co.size > 0)
                else:
                    if r[0] is not None and r[0].aligned:
                        if stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r[0].cigar
                                      if co.type == "M" and co.size > 0)
                        else:
                            iv_seq = (invert_strand(co.ref_iv)
                                      for co in r[0].cigar
                                      if co.type == "M" and co.size > 0)
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        # The mate is on the opposite strand, hence the
                        # inverted logic compared to the first read.
                        if stranded != "reverse":
                            iv_seq = itertools.chain(
                                iv_seq,
                                (invert_strand(co.ref_iv) for co in r[1].cigar
                                 if co.type == "M" and co.size > 0))
                        else:
                            iv_seq = itertools.chain(
                                iv_seq,
                                (co.ref_iv for co in r[1].cigar
                                 if co.type == "M" and co.size > 0))
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            write_to_samout(r, "__not_aligned")
                            notaligned += 1
                            continue
                    try:
                        if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                                (r[1] is not None and r[1].optional_field("NH") > 1):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if (r[0] and r[0].aQual < minaqual) or \
                            (r[1] and r[1].aQual < minaqual):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode == "intersection-strict" or \
                            overlap_mode == "intersection-nonempty":
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")

                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "__no_feature")
                        empty += 1
                    elif len(fs) > 1:
                        write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
                        ambiguous += 1
                    else:
                        feature_id = list(fs)[0]
                        write_to_samout(r, feature_id)
                        counts[feature_id] += 1
                except UnknownChrom:
                    # Chromosome absent from the annotation: "no feature".
                    write_to_samout(r, "__no_feature")
                    empty += 1
        except:
            # Deliberately broad: annotate any failure with the input
            # position, then let it propagate unchanged.
            sys.stderr.write("Error occured when processing SAM input (%s):\n" %
                             read_seq_file.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d SAM %s processed.\n" %
                             (i, "alignments " if not pe_mode else "alignment pairs"))
    finally:
        if samoutfile is not None:
            samoutfile.close()

    for fn in sorted(counts.keys()):
        print("%s\t%d" % (fn, counts[fn]))
    print("__no_feature\t%d" % empty)
    print("__ambiguous\t%d" % ambiguous)
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
def count_reads(sam_filename, features, counts, samtype, order, forward,
                reverse, overlap_mode, quiet, minaqual, samout, directory):
    """Count reads per feature for the forward and/or reverse strand protocol.

    Each read is evaluated once per requested orientation; the results are
    written to ``*_forward_count`` / ``*_reverse_count`` text files created
    through ``brenninc_utils.create_new_file`` and summarised on stdout.

    Args:
        sam_filename: Alignment file path, or "-" to read from stdin.
        features: Genomic array of feature-id sets; must provide
            ``chrom_vectors`` and ``features[iv].steps()``.
        counts: Dict mapping feature id to an initial count; it is copied per
            orientation and not modified itself.
        samtype: "sam", "bam", or None to call ``detect_sam_type``.
        order: "name" or "pos"; only consulted for paired-end input.
        forward: Count with the read's own strand (mate inverted).
        reverse: Count with the read's strand inverted (mate kept).
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty".
        quiet: Suppress progress messages on stderr.
        minaqual: Alignments with ``aQual`` below this value are discarded.
        samout: Path of an annotated SAM copy to write, or "" for none.
        directory: Output directory handed to ``create_new_file``.

    Raises:
        ValueError: Unknown ``samtype`` or illegal ``order``.
    """

    def write_to_samout(r, assignment):
        """Append the read(s) to the samout file with an XF:Z tag."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    def overlapping_features(iv_seq):
        """Resolve the feature set hit by ``iv_seq`` per ``overlap_mode``."""
        if overlap_mode == "union":
            hits = set()
            for iv in iv_seq:
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for iv2, fs2 in features[iv].steps():
                    hits = hits.union(fs2)
            return hits
        # Intersection modes; None means "no step contributed yet".
        hits = None
        for iv in iv_seq:
            if iv.chrom not in features.chrom_vectors:
                raise UnknownChrom
            for iv2, fs2 in features[iv].steps():
                if len(fs2) > 0 or overlap_mode == "intersection-strict":
                    if hits is None:
                        hits = fs2.copy()
                    else:
                        hits = hits.intersection(fs2)
        return hits

    def write_report(label, feature_counts, empty, ambiguous):
        """Write one orientation's count file and print its summary."""
        tag = label.lower()
        output = brenninc_utils.create_new_file(sam_filename,
                                                "_%s_count" % tag,
                                                outputdir=directory,
                                                extension="txt",
                                                gzipped=False)
        used_features_count = 0
        used_features_sum = 0
        print(label + " written to " + str(output))
        with open(output, "w") as output_file:
            for fn in sorted(feature_counts.keys()):
                output_file.write("%s\t%d\n" % (fn, feature_counts[fn]))
                # NOTE(review): every feature is tallied here, including
                # those with a zero count -- confirm that is intended for
                # the "features with alignment" figure.
                used_features_count += 1
                used_features_sum += feature_counts[fn]
            output_file.write("__no_feature\t%d\n" % empty)
            output_file.write("__ambiguous\t%d\n" % ambiguous)
            output_file.write("__too_low_aQual\t%d\n" % lowqual)
            output_file.write("__not_aligned\t%d\n" % notaligned)
            output_file.write("__alignment_not_unique\t%d\n" % nonunique)
        print("%s features with alignment\t%d" % (label, used_features_count))
        print("%s alignments asigned to feature\t%d" % (label, used_features_sum))
        print("__%s_no_feature\t%d" % (tag, empty))
        print("__%s_ambiguous\t%d" % (tag, ambiguous))

    samoutfile = open(samout, "w") if samout != "" else None

    # try/finally guarantees the samout file is closed on every exit path.
    try:
        if samtype is None:
            samtype = detect_sam_type(sam_filename)

        if samtype == "sam":
            SAM_or_BAM_Reader = HTSeq.SAM_Reader
        elif samtype == "bam":
            SAM_or_BAM_Reader = HTSeq.BAM_Reader
        else:
            raise ValueError("Unknown input format %s specified." % samtype)

        try:
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                # stdin cannot be rewound, so the peeked record is chained
                # back in front of the remaining stream.
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except:
            sys.stderr.write("Error occured when reading beginning "
                             "of SAM/BAM file.\n")
            raise

        try:
            if pe_mode:
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
                else:
                    raise ValueError("Illegal order specified.")

            # The tallies are initialised unconditionally: the UnknownChrom
            # handler below increments both, whichever orientation is active.
            empty_forward = 0
            ambiguous_forward = 0
            empty_reverse = 0
            ambiguous_reverse = 0
            counts_forward = copy.copy(counts) if forward else None
            counts_reverse = copy.copy(counts) if reverse else None
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d SAM alignment record%s processed.\n" %
                                     (i, "s" if not pe_mode else " pairs"))
                i += 1

                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        # No NH tag: treat the alignment as unique.
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue
                    if forward:
                        iv_seq_for = (co.ref_iv for co in r.cigar
                                      if co.type == "M" and co.size > 0)
                    if reverse:
                        iv_seq_rev = (invert_strand(co.ref_iv)
                                      for co in r.cigar
                                      if co.type == "M" and co.size > 0)
                else:
                    if r[0] is not None and r[0].aligned:
                        if forward:
                            iv_seq_for = (co.ref_iv for co in r[0].cigar
                                          if co.type == "M" and co.size > 0)
                        if reverse:
                            iv_seq_rev = (invert_strand(co.ref_iv)
                                          for co in r[0].cigar
                                          if co.type == "M" and co.size > 0)
                    else:
                        iv_seq_rev = tuple()
                        iv_seq_for = tuple()
                    if r[1] is not None and r[1].aligned:
                        # The mate is on the opposite strand, hence the
                        # inverted logic compared to the first read.
                        if forward:
                            iv_seq_for = itertools.chain(
                                iv_seq_for,
                                (invert_strand(co.ref_iv) for co in r[1].cigar
                                 if co.type == "M" and co.size > 0))
                        if reverse:
                            iv_seq_rev = itertools.chain(
                                iv_seq_rev,
                                (co.ref_iv for co in r[1].cigar
                                 if co.type == "M" and co.size > 0))
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            write_to_samout(r, "__not_aligned")
                            notaligned += 1
                            continue
                    try:
                        if ((r[0] is not None and
                             r[0].optional_field("NH") > 1) or
                                (r[1] is not None and
                                 r[1].optional_field("NH") > 1)):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if ((r[0] and r[0].aQual < minaqual) or
                            (r[1] and r[1].aQual < minaqual)):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue

                try:
                    if overlap_mode not in ("union", "intersection-strict",
                                            "intersection-nonempty"):
                        sys.exit("Illegal overlap mode.")

                    # Resolve both orientations before tallying either, so an
                    # unknown chromosome never leaves a half-counted read.
                    fs_for = overlapping_features(iv_seq_for) if forward else None
                    fs_rev = overlapping_features(iv_seq_rev) if reverse else None

                    if forward:
                        if fs_for is None or len(fs_for) == 0:
                            write_to_samout(r, "__no_feature")
                            empty_forward += 1
                        elif len(fs_for) > 1:
                            write_to_samout(
                                r, "__ambiguous[" + '+'.join(fs_for) + "]")
                            ambiguous_forward += 1
                        else:
                            feature_id = list(fs_for)[0]
                            write_to_samout(r, feature_id)
                            counts_forward[feature_id] += 1

                    if reverse:
                        if fs_rev is None or len(fs_rev) == 0:
                            write_to_samout(r, "__no_feature")
                            empty_reverse += 1
                        elif len(fs_rev) > 1:
                            write_to_samout(
                                r, "__ambiguous[" + '+'.join(fs_rev) + "]")
                            ambiguous_reverse += 1
                        else:
                            feature_id = list(fs_rev)[0]
                            write_to_samout(r, feature_id)
                            counts_reverse[feature_id] += 1
                except UnknownChrom:
                    # Chromosome absent from the annotation: "no feature"
                    # for both orientations.
                    write_to_samout(r, "__no_feature")
                    empty_forward += 1
                    empty_reverse += 1
        except:
            # Deliberately broad: annotate any failure with the input
            # position, then let it propagate unchanged.
            sys.stderr.write("Error occured when processing SAM input (%s):\n" %
                             read_seq_file.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d SAM %s processed.\n" %
                             (i, "alignments " if not pe_mode else "alignment pairs"))
    finally:
        if samoutfile is not None:
            samoutfile.close()

    if forward:
        write_report("Forward", counts_forward, empty_forward,
                     ambiguous_forward)
    if reverse:
        write_report("Reverse", counts_reverse, empty_reverse,
                     ambiguous_reverse)

    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
def count_reads(features, counts, pe_mode, read_seq, order, stranded,
                overlap_mode, quiet, minaqual, write_to_samout):
    """Assign every alignment (or alignment pair) to a feature and print counts.

    Each read is first filtered (not aligned, ``NH`` > 1, ``aQual`` below
    ``minaqual``).  The reference intervals of its non-empty ``M`` CIGAR
    operations are then looked up in ``features`` and reduced to one feature
    set according to ``overlap_mode``.  A read that resolves to exactly one
    feature increments ``counts`` in place.  When all reads are processed a
    tab-separated table (features in sorted order followed by the ``__*``
    summary rows) is printed to stdout.

    Args:
        features: interval-indexed collection exposing ``chrom_vectors`` and
            ``features[iv].steps()`` yielding ``(interval, set)`` pairs
            (presumably an ``HTSeq.GenomicArrayOfSets`` -- confirm at caller).
        counts: mutable mapping of feature id to count; updated in place.
        pe_mode: when true, ``read_seq`` is paired through HTSeq first and
            every item is then a ``(mate1, mate2)`` pair (a mate may be None).
        read_seq: iterable of alignments.
        order: "name" or "pos"; selects the HTSeq pairing function and is
            only consulted when ``pe_mode`` is true.
        stranded: with "reverse" the strand of single reads / first mates is
            inverted; with any other value the second mate is inverted.
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty".
        quiet: suppress progress messages on stderr.
        minaqual: reads whose ``aQual`` is below this value are skipped.
        write_to_samout: callable ``(read_or_pair, assignment)`` invoked once
            for every processed read.

    Raises:
        ValueError: ``pe_mode`` is true and ``order`` is not "name"/"pos".
        SystemExit: ``overlap_mode`` is not recognised (raised when the first
            read reaches the assignment step).
    """

    def matched_intervals(almnt, flip):
        """Yield the reference interval of each non-empty 'M' CIGAR op."""
        for co in almnt.cigar:
            if co.type == "M" and co.size > 0:
                yield (invert_strand(co.ref_iv) if flip else co.ref_iv)

    def overlapping_features(iv_seq):
        """Reduce the feature sets found under ``iv_seq`` per overlap_mode."""
        if overlap_mode == "union":
            fs = set()
            for iv in iv_seq:
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for _, fs2 in features[iv].steps():
                    fs = fs.union(fs2)
            return fs
        if overlap_mode in ("intersection-strict", "intersection-nonempty"):
            fs = None
            for iv in iv_seq:
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for _, fs2 in features[iv].steps():
                    # "nonempty" ignores steps that carry no feature at all.
                    if len(fs2) > 0 or overlap_mode == "intersection-strict":
                        if fs is None:
                            fs = fs2.copy()
                        else:
                            fs = fs.intersection(fs2)
            return fs
        sys.exit("Illegal overlap mode.")

    if pe_mode:
        if order == "name":
            read_seq = HTSeq.pair_SAM_alignments(read_seq)
        elif order == "pos":
            read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
        else:
            raise ValueError("Illegal order specified.")

    empty = 0
    ambiguous = 0
    notaligned = 0
    lowqual = 0
    nonunique = 0
    flip_first = stranded == "reverse"

    i = 0
    for r in read_seq:
        if i > 0 and i % 100000 == 0 and not quiet:
            sys.stderr.write("%d SAM alignment record%s processed.\n" %
                             (i, "s" if not pe_mode else " pairs"))
        i += 1

        if not pe_mode:
            if not r.aligned:
                notaligned += 1
                write_to_samout(r, "__not_aligned")
                continue
            try:
                if r.optional_field("NH") > 1:
                    nonunique += 1
                    write_to_samout(r, "__alignment_not_unique")
                    continue
            except KeyError:
                # No NH tag: the alignment is treated as unique.
                pass
            if r.aQual < minaqual:
                lowqual += 1
                write_to_samout(r, "__too_low_aQual")
                continue
            iv_seq = matched_intervals(r, flip_first)
        else:
            first_ok = r[0] is not None and r[0].aligned
            second_ok = r[1] is not None and r[1].aligned
            if not first_ok and not second_ok:
                write_to_samout(r, "__not_aligned")
                notaligned += 1
                continue
            parts = []
            if first_ok:
                parts.append(matched_intervals(r[0], flip_first))
            if second_ok:
                # The second mate always gets the opposite strand treatment.
                parts.append(matched_intervals(r[1], not flip_first))
            iv_seq = itertools.chain(*parts)
            try:
                if ((r[0] is not None and r[0].optional_field("NH") > 1) or
                        (r[1] is not None and
                         r[1].optional_field("NH") > 1)):
                    nonunique += 1
                    write_to_samout(r, "__alignment_not_unique")
                    continue
            except KeyError:
                pass
            if ((r[0] and r[0].aQual < minaqual) or
                    (r[1] and r[1].aQual < minaqual)):
                lowqual += 1
                write_to_samout(r, "__too_low_aQual")
                continue

        try:
            fs = overlapping_features(iv_seq)
        except UnknownChrom:
            # Chromosome absent from the annotation: nothing can overlap.
            write_to_samout(r, "__no_feature")
            empty += 1
            continue

        if fs is None or len(fs) == 0:
            write_to_samout(r, "__no_feature")
            empty += 1
        elif len(fs) > 1:
            write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
            ambiguous += 1
        else:
            feature_id = list(fs)[0]
            write_to_samout(r, feature_id)
            counts[feature_id] += 1

    if not quiet:
        sys.stderr.write("%d SAM %s processed.\n" %
                         (i, "alignments " if not pe_mode
                          else "alignment pairs"))

    # Single-argument print calls produce identical output on Python 2 and 3.
    for fn in sorted(counts.keys()):
        print("%s\t%d" % (fn, counts[fn]))
    print("__no_feature\t%d" % empty)
    print("__ambiguous\t%d" % ambiguous)
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
import HTSeq
import collections

# Build a strand-aware lookup from genomic interval to the gene_id(s) of the
# CDS features annotated there.
gtf_file = HTSeq.GFF_Reader("p_stutzeri_28a24_and_pMPPla107.gtf")
cds = HTSeq.GenomicArrayOfSets("auto", stranded=True)
for feature in gtf_file:
    if feature.type == "CDS":
        cds[feature.iv] += feature.attr["gene_id"]

# Tally read pairs per gene; the "_"-prefixed keys collect the pairs that
# could not be assigned to exactly one gene.
almnt_file = HTSeq.BAM_Reader('pair.386_48hr_a_AATGTTGC_starAligned.out.bam')
counts = collections.Counter()
for bundle in HTSeq.pair_SAM_alignments(almnt_file, bundle=True):
    if len(bundle) != 1:
        continue  # Skip multiple alignments
    first_almnt, second_almnt = bundle[0]  # extract pair
    # Both mates are looked up below, so both must be present and aligned.
    if (first_almnt is None or second_almnt is None
            or not (first_almnt.aligned and second_almnt.aligned)):
        counts["_unmapped"] += 1
        continue
    gene_ids = set()
    for iv, val in cds[first_almnt.iv].steps():
        gene_ids |= val
    for iv, val in cds[second_almnt.iv].steps():
        gene_ids |= val
    if len(gene_ids) == 1:
        gene_id = list(gene_ids)[0]
        counts[gene_id] += 1
    elif len(gene_ids) == 0:
        counts["_no_feature"] += 1
    else:
        counts["_ambiguous"] += 1
def count_reads(sam_filename, features, counts, samtype, order, forward,
                reverse, overlap_mode, quiet, minaqual, samout, directory):
    """Count alignments per feature for the forward and/or reverse strand.

    The SAM/BAM input is read once.  Every read (or read pair) that passes
    the not-aligned / ``NH`` > 1 / ``aQual`` filters is assigned to a feature
    set separately for each enabled strand interpretation.  One count table
    per enabled strand is written to a file obtained from
    ``brenninc_utils.create_new_file`` and a short summary is printed to
    stdout.

    Args:
        sam_filename: path of the SAM/BAM file, or "-" to read from stdin.
        features: interval-indexed collection exposing ``chrom_vectors`` and
            ``features[iv].steps()`` yielding ``(interval, set)`` pairs.
        counts: mapping of feature id to count; it is shallow-copied for each
            enabled strand and not modified itself.
        samtype: "sam", "bam", or None to call ``detect_sam_type``.
        order: "name" or "pos"; pairing strategy, used for paired-end input.
        forward: count with the strand as aligned (second mate inverted).
        reverse: count with the strand inverted (second mate as aligned).
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty".
        quiet: suppress progress messages on stderr.
        minaqual: reads whose ``aQual`` is below this value are skipped.
        samout: path of an annotated SAM output file; "" disables it.
        directory: passed to ``create_new_file`` as ``outputdir``.

    Raises:
        ValueError: unknown ``samtype`` or, for paired-end input, unknown
            ``order``.
        SystemExit: unknown ``overlap_mode``.
    """

    def write_to_samout(r, assignment):
        """Write the read(s) in ``r`` to the SAM output with an XF tag."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    def matched_intervals(almnt, flip):
        """Yield the reference interval of each non-empty 'M' CIGAR op."""
        for co in almnt.cigar:
            if co.type == "M" and co.size > 0:
                yield (invert_strand(co.ref_iv) if flip else co.ref_iv)

    def overlapping_features(iv_seq):
        """Reduce the feature sets found under ``iv_seq`` per overlap_mode."""
        if overlap_mode == "union":
            fs = set()
            for iv in iv_seq:
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for _, fs2 in features[iv].steps():
                    fs = fs.union(fs2)
            return fs
        if overlap_mode in ("intersection-strict", "intersection-nonempty"):
            fs = None
            for iv in iv_seq:
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for _, fs2 in features[iv].steps():
                    if len(fs2) > 0 or overlap_mode == "intersection-strict":
                        if fs is None:
                            fs = fs2.copy()
                        else:
                            fs = fs.intersection(fs2)
            return fs
        sys.exit("Illegal overlap mode.")

    def new_tally():
        """Fresh per-strand counters; ``counts`` itself is left untouched."""
        return {"empty": 0, "ambiguous": 0, "counts": copy.copy(counts)}

    def record(r, fs, tally):
        """Book one read under the strand described by ``tally``."""
        if fs is None or len(fs) == 0:
            write_to_samout(r, "__no_feature")
            tally["empty"] += 1
        elif len(fs) > 1:
            write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
            tally["ambiguous"] += 1
        else:
            feature_id = list(fs)[0]
            write_to_samout(r, feature_id)
            tally["counts"][feature_id] += 1

    def report(label, tally):
        """Write the count file of one strand and print its summary."""
        lower = label.lower()
        output = brenninc_utils.create_new_file(sam_filename,
                                                "_%s_count" % lower,
                                                outputdir=directory,
                                                extension="txt",
                                                gzipped=False)
        used_features_count = 0
        used_features_sum = 0
        print("%s written to %s" % (label, output))
        strand_counts = tally["counts"]
        with open(output, "w") as output_file:
            for fn in sorted(strand_counts.keys()):
                output_file.write("%s\t%d\n" % (fn, strand_counts[fn]))
                # NOTE(review): every feature is tallied here, including
                # those with a zero count -- confirm that is intended for
                # the "features with alignment" figure.
                used_features_count += 1
                used_features_sum += strand_counts[fn]
            output_file.write("__no_feature\t%d\n" % tally["empty"])
            output_file.write("__ambiguous\t%d\n" % tally["ambiguous"])
            output_file.write("__too_low_aQual\t%d\n" % lowqual)
            output_file.write("__not_aligned\t%d\n" % notaligned)
            output_file.write("__alignment_not_unique\t%d\n" % nonunique)
        print("%s features with alignment\t%d" % (label, used_features_count))
        print("%s alignments asigned to feature\t%d"
              % (label, used_features_sum))
        print("__%s_no_feature\t%d" % (lower, tally["empty"]))
        print("__%s_ambiguous\t%d" % (lower, tally["ambiguous"]))

    samoutfile = open(samout, "w") if samout != "" else None
    try:
        if samtype is None:
            samtype = detect_sam_type(sam_filename)
        if samtype == "sam":
            SAM_or_BAM_Reader = HTSeq.SAM_Reader
        elif samtype == "bam":
            SAM_or_BAM_Reader = HTSeq.BAM_Reader
        else:
            raise ValueError("Unknown input format %s specified." % samtype)

        try:
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                # stdin cannot be re-read, so the peeked record is chained
                # back in front of the remaining ones.
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except:  # log context for any failure, then re-raise unchanged
            sys.stderr.write("Error occured when reading beginning "
                             "of SAM/BAM file.\n")
            raise

        try:
            if pe_mode:
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
                else:
                    raise ValueError("Illegal order specified.")

            tally_for = new_tally() if forward else None
            tally_rev = new_tally() if reverse else None
            notaligned = 0
            lowqual = 0
            nonunique = 0

            i = 0
            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write(
                        "%d SAM alignment record%s processed.\n" %
                        (i, "s" if not pe_mode else " pairs"))
                i += 1

                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        # No NH tag: the alignment is treated as unique.
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue
                    iv_seq_for = matched_intervals(r, False)
                    iv_seq_rev = matched_intervals(r, True)
                else:
                    first_ok = r[0] is not None and r[0].aligned
                    second_ok = r[1] is not None and r[1].aligned
                    if not first_ok and not second_ok:
                        write_to_samout(r, "__not_aligned")
                        notaligned += 1
                        continue
                    for_parts = []
                    rev_parts = []
                    if first_ok:
                        for_parts.append(matched_intervals(r[0], False))
                        rev_parts.append(matched_intervals(r[0], True))
                    if second_ok:
                        # The second mate gets the opposite strand treatment
                        # of the first one in both interpretations.
                        for_parts.append(matched_intervals(r[1], True))
                        rev_parts.append(matched_intervals(r[1], False))
                    iv_seq_for = itertools.chain(*for_parts)
                    iv_seq_rev = itertools.chain(*rev_parts)
                    try:
                        if ((r[0] is not None and
                             r[0].optional_field("NH") > 1) or
                                (r[1] is not None and
                                 r[1].optional_field("NH") > 1)):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if ((r[0] and r[0].aQual < minaqual) or
                            (r[1] and r[1].aQual < minaqual)):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue

                try:
                    fs_for = (overlapping_features(iv_seq_for)
                              if forward else None)
                    fs_rev = (overlapping_features(iv_seq_rev)
                              if reverse else None)
                except UnknownChrom:
                    # Chromosome absent from the annotation; only the
                    # enabled strands have a counter to bump.
                    write_to_samout(r, "__no_feature")
                    if forward:
                        tally_for["empty"] += 1
                    if reverse:
                        tally_rev["empty"] += 1
                    continue

                if forward:
                    record(r, fs_for, tally_for)
                if reverse:
                    record(r, fs_rev, tally_rev)
        except:  # log the input position for any failure, then re-raise
            sys.stderr.write(
                "Error occured when processing SAM input (%s):\n" %
                read_seq_file.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d SAM %s processed.\n" %
                             (i, "alignments " if not pe_mode
                              else "alignment pairs"))
    finally:
        # Close the annotated SAM output on success and on every error path.
        if samoutfile is not None:
            samoutfile.close()

    if forward:
        report("Forward", tally_for)
    if reverse:
        report("Reverse", tally_rev)
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
def main():
    """Count paired-end SAM alignments per GFF feature (command-line entry).

    Steps, as implemented below:

    1. Parse the command line and open the SAM input (a path, or ``-`` for
       stdin); mates are paired by position (``-p p``, default) or by name
       (``-p n``).
    2. Load the annotation through ``gff_reader``.
    3. For every pair: record pairs with a missing/unaligned mate, apply the
       per-mate read-length / soft-clip / score / percent-identity filters,
       then the ``NH`` and ``aQual`` filters, and collect the 'M' CIGAR
       intervals of the mates that passed.
    4. Union the feature sets found under those intervals and count the
       pair as no_feature, ambiguous, or a hit on exactly one feature.
    5. Write the optional SAM / FASTA / flat-file outputs and the counts
       table, then close every output file.

    Review notes:
        * ``paired_end`` is True for both values ``--paired_end_mode``
          accepts, so the former single-end branches could never run (they
          also indexed a single alignment like a pair); they were removed.
        * NOTE(review): a pair with an unaligned mate is counted under
          ``not_aligned`` and then also falls through to ``no_feature``, as
          before -- confirm whether that double booking is wanted.
        * NOTE(review): pairs rejected by the NH / aQual filters skip the
          ``read_counter`` increment, as before.
    """
    exe_parser = argparse.ArgumentParser()
    exe_parser.add_argument('infile', type=str,
                            help='<input file> [(full path), -b/-s required]')
    exe_parser.add_argument("-u", "--not_aligned",
                            help="output reads that were not aligned, including those that were aligned multiple times(flat file).",
                            type=str)
    exe_parser.add_argument("-s", "--samout",
                            help="output not aligned reads to [file path].",
                            type=str)
    exe_parser.add_argument("-b", "--ambiguous_out",
                            help="output a fasta file of ambiguous hits [file path].",
                            type=str)
    exe_parser.add_argument("-v", "--verbose",
                            help="verbose. (default = TRUE).",
                            action="store_true")
    exe_parser.add_argument("gff", help="<gff file> [(full path)]", type=str)
    exe_parser.add_argument("-f", "--fasta",
                            help="output fasta file of hits (full path).",
                            type=str)
    exe_parser.add_argument("-m", "--min_read_length",
                            help="minimal read length to consider. (default = 60b).",
                            type=int)
    exe_parser.add_argument("-i", "--min_id",
                            help="minimal percent id of hit to consider. (default = 80).",
                            type=int)
    exe_parser.add_argument("-z", "--min_score",
                            help="minimal aligner score to consider. (default = 0).",
                            type=int)
    exe_parser.add_argument("-c", "--max_clip",
                            help="proportion of bases clipped from read for alignment. (default = 0.3).",
                            type=float)
    exe_parser.add_argument("--stranded",
                            help="whether the data is stranded (y, n, reverse). (default = n).",
                            type=str, choices=["y", "n", "reverse"],
                            default="n")
    exe_parser.add_argument("--idattr",
                            help="GFF attribute to be used as feature ID. (default = GeneID).",
                            type=str)
    exe_parser.add_argument("--type",
                            help="feature type (3rd column in GFF file) to be used. (default = CDS).",
                            type=str)
    # Parsed as int: the value is compared with the numeric aQual of each
    # alignment, and a string would never compare meaningfully.
    exe_parser.add_argument("-a", "--minaqual",
                            help="min. alignment quality (default = 0).",
                            type=int)
    exe_parser.add_argument("-p", "--paired_end_mode",
                            help="input is paired end sorted by name (n) or position (p) . (default = p).",
                            type=str, choices=["p", "n"], default="p")
    exe_parser.add_argument("-o", "--out",
                            help="name of counts output file.", type=str)
    args = exe_parser.parse_args()

    if args.paired_end_mode == 'p':
        paired_end = True
        pe_order = 'p'
    elif args.paired_end_mode == 'n':
        paired_end = True
        pe_order = 'n'

    if not args.infile:
        print("no input file given. exiting...")
        sys.exit(1)
    try:
        if args.infile == '-':  # get sam on a stream
            read_seq = HTSeq.SAM_Reader(sys.stdin)
        else:
            seqfile = HTSeq.SAM_Reader(args.infile)
            # Peek at the first record so an unreadable file fails here,
            # then chain it back in front of the remaining records.
            read_seq_iter = iter(seqfile)
            first_read = next(read_seq_iter)
            read_seq = itertools.chain([first_read], read_seq_iter)
        # Pair the single alignments exactly once.
        if pe_order == 'p':
            reader = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
        elif pe_order == 'n':
            reader = HTSeq.pair_SAM_alignments(read_seq)
    except:  # report, then re-raise the original error unchanged
        print("failed processing SAM/BAM file")
        raise

    if args.gff:
        gff_file = args.gff
    else:
        print("no gff file given. exiting...")
        sys.exit(1)

    verbose = bool(args.verbose)
    min_read_len = args.min_read_length if args.min_read_length else 60
    max_clip_ = float(args.max_clip) if args.max_clip else float(0.3)
    min_id = float(args.min_id) if args.min_id else float(80)
    min_score = int(args.min_score) if args.min_score else 0
    if args.stranded == 'n':
        stranded = 'no'
    elif args.stranded == 'y':
        stranded = 'yes'
    elif args.stranded == 'reverse':
        stranded = 'reverse'
    minaqual = args.minaqual if args.minaqual else 0
    id_attribute = args.idattr if args.idattr else "GeneID"
    feature_type = args.type if args.type else 'CDS'

    # parse GFF file
    features, counts = gff_reader(gff_file, feature_type, id_attribute,
                                  verbose, stranded)

    samoutfile = open(args.samout, "w") if args.samout else None
    ambiguousfile = open(args.ambiguous_out, "w") if args.ambiguous_out else None
    fastafile = open(args.fasta, "w") if args.fasta else None
    not_aligned_file = open(args.not_aligned, "w") if args.not_aligned else None
    outfile = open(args.out, "w") if args.out else None

    def log_rejection(alignment, reason, names, flat_reason=None):
        """Report ``reason`` in the SAM output and the not-aligned file.

        ``flat_reason`` overrides the text written to the not-aligned file
        for the cases where the two outputs use different wording.
        """
        if args.samout:
            write_to_samout(samoutfile, paired_end, alignment, reason)
        if args.not_aligned:
            if flat_reason is None:
                flat_reason = reason
            for name in names:
                not_aligned_file.write(name + '\t' + flat_reason + '\n')

    def evaluate_mate(alignment, mate):
        """Apply the length / clip / score / identity filters to one mate.

        Returns ``(passed, clipped, score, percent_id)``.  A rejection is
        logged, except for a read that is too short, which is dropped
        silently.
        """
        name = mate.read.name
        read_length = len(mate.read.seq)
        # just the cigar string without the fancy HTseq additions
        cigar_string = parse_cigar(mate.original_sam_line.split('\t')[5])
        # NOTE(review): the previous code bound both the 3rd and the 5th
        # returned value to the insertions name, so the 5th one is the value
        # actually used -- confirm against parse_cigar_alignment.
        (soft_clipped, _cigar_m, _third, deletions,
         insertions) = parse_cigar_alignment(cigar_string)
        score, md_matches, _md_deletions, _md_mismatches = parse_opt_fields(
            mate.optional_fields)
        percent_id = 100.0 * (float(md_matches) / float(
            read_length - soft_clipped + insertions + deletions))
        clipped = float(soft_clipped) / float(read_length)

        passed = False
        if int(read_length) >= int(min_read_len):
            if clipped > float(max_clip_):
                log_rejection(alignment,
                              'too_many_bases_clipped_from_read=' +
                              str(soft_clipped), [name])
            elif int(score) < int(min_score):
                log_rejection(alignment,
                              'alignment_score_too_low=' + str(score), [name])
            elif float(percent_id) < float(min_id):
                log_rejection(alignment,
                              "percent_id_too_low=" + str(percent_id), [name])
            else:
                passed = True
        return passed, clipped, score, percent_id

    def matched_intervals(mate):
        """Reference intervals of the mate's non-empty 'M' CIGAR operations.

        Both mates follow the same rule: the strand is inverted only when
        ``stranded`` is "reverse".
        """
        if stranded == "reverse":
            return (invert_strand(co.ref_iv) for co in mate.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in mate.cigar
                if co.type == "M" and co.size > 0)

    def fasta_record(name, label, clipped, score, percent_id, seq):
        """Format one FASTA entry whose header carries the filter values."""
        return ('>' + name + '_' + label + '_clipped_' + str(clipped) +
                '_score_' + str(score) + '_percent_id_' + str(percent_id) +
                '\n' + seq + '\n')

    # declare counter variables
    empty = 0
    ambiguous = 0
    notaligned = 0
    lowqual = 0
    nonunique = 0

    read_counter = 0
    for alignment in reader:
        if read_counter % 100000 == 0 and verbose:
            print("%d alignment pairs processed" % read_counter)

        # Per-pair state is reset on every iteration so that nothing leaks
        # in from the previous pair.
        iv_seq = tuple()
        iv_seq_good_1 = False
        iv_seq_good_2 = False

        if (alignment[0] is None) or not alignment[0].aligned:
            notaligned += 1
            if alignment[0] is None:
                read_1_name = 'None'
            else:
                read_1_name = alignment[0].read.name
            log_rejection(alignment, "not_aligned", [read_1_name])
        elif (alignment[1] is None) or not alignment[1].aligned:
            notaligned += 1
            if alignment[1] is None:
                read_2_name = 'None'
            else:
                read_2_name = alignment[1].read.name
            log_rejection(alignment, "not_aligned", [read_2_name])
        else:
            mate_1, mate_2 = alignment[0], alignment[1]
            read_1_name = mate_1.read.name
            read_1_seq = mate_1.read.seq
            read_2_name = mate_2.read.name
            read_2_seq = mate_2.read.seq

            (iv_seq_good_1, clipped_1, score_1,
             percent_1_id) = evaluate_mate(alignment, mate_1)
            if iv_seq_good_1:
                iv_seq = itertools.chain(iv_seq, matched_intervals(mate_1))

            (iv_seq_good_2, clipped_2, score_2,
             percent_2_id) = evaluate_mate(alignment, mate_2)
            if iv_seq_good_2:
                iv_seq = itertools.chain(iv_seq, matched_intervals(mate_2))
                # NOTE(review): as before, the NH / aQual filters only run
                # when the second mate passed its own filters.
                try:
                    multi_mapped = (mate_1.optional_field("NH") > 1 or
                                    mate_2.optional_field("NH") > 1)
                except KeyError:
                    # No NH tag: the pair is treated as uniquely mapped.
                    multi_mapped = False
                if multi_mapped:
                    # By default these reads are discarded.
                    nonunique += 1
                    log_rejection(alignment, "alignment_not_unique",
                                  [read_1_name, read_2_name],
                                  flat_reason='not_aligned')
                    continue
                if mate_1.aQual < minaqual or mate_2.aQual < minaqual:
                    lowqual += 1
                    log_rejection(alignment, "too_low_aQual",
                                  [read_1_name, read_2_name],
                                  flat_reason='not_aligned')
                    continue

        read_counter += 1

        # "union" overlap: a hit is counted even if the read is mapped across
        # an intron or contains an insertion.
        try:
            feature_set = set()
            for iv in iv_seq:
                # The sequence name of the alignment must be known to the
                # annotation loaded from the GFF file.
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for iv2, fs2 in features[iv].steps():
                    feature_set = feature_set.union(fs2)
        except UnknownChrom:
            if args.samout:
                write_to_samout(samoutfile, paired_end, alignment,
                                "__no_feature")
            empty += 1
            continue

        if len(feature_set) == 0:
            log_rejection(alignment, "no_feature", ['None'])
            empty += 1
        elif len(feature_set) > 1:
            label = "ambiguous[" + '+'.join(feature_set) + "]"
            if args.samout:
                write_to_samout(samoutfile, paired_end, alignment, label)
            if ambiguousfile:
                if iv_seq_good_1:
                    ambiguousfile.write(fasta_record(
                        read_1_name, label, clipped_1, score_1,
                        percent_1_id, read_1_seq))
                if iv_seq_good_2:
                    ambiguousfile.write(fasta_record(
                        read_2_name, label, clipped_2, score_2,
                        percent_2_id, read_2_seq))
            ambiguous += 1
        else:
            feature_id = list(feature_set)[0]
            if args.samout:
                write_to_samout(samoutfile, paired_end, alignment, feature_id)
            if args.fasta:
                if iv_seq_good_1:
                    fastafile.write(fasta_record(
                        read_1_name, ''.join(feature_id), clipped_1, score_1,
                        percent_1_id, read_1_seq))
                if iv_seq_good_2:
                    fastafile.write(fasta_record(
                        read_2_name, ''.join(feature_id), clipped_2, score_2,
                        percent_2_id, read_2_seq))
            counts[feature_id] += 1

    print("total %d alignments processed" % read_counter)

    if samoutfile is not None:
        samoutfile.close()
    if ambiguousfile is not None:
        ambiguousfile.close()
    if fastafile is not None:
        fastafile.close()
    if not_aligned_file is not None:
        not_aligned_file.close()
    if outfile is not None:
        for feature in sorted(counts.keys()):
            outfile.write("%s\t%d\n" % (feature, counts[feature]))
        outfile.write("no_feature\t%d\n" % empty)
        outfile.write("ambiguous\t%d\n" % ambiguous)
        outfile.write("too_low_aQual\t%d\n" % lowqual)
        outfile.write("not_aligned\t%d\n" % notaligned)
        outfile.write("alignment_not_unique\t%d\n" % nonunique)
        outfile.close()
def reader(x):
    """Open the BAM file ``x`` and hand its records to the HTSeq pairer.

    Returns whatever ``HTSeq.pair_SAM_alignments`` produces when called with
    ``bundle=False``.
    """
    bam_records = HTSeq.BAM_Reader(x)
    paired = HTSeq.pair_SAM_alignments(bam_records, bundle=False)
    return paired
def count_reads_in_features(sam_filename, gff_filename, stranded,
                            overlap_mode, feature_type, id_attribute,
                            quiet, minaqual, samout):
    """Count the reads of a SAM file that fall onto each GFF feature.

    Features of type *feature_type* are loaded from *gff_filename* and keyed
    by their *id_attribute*.  Each read (or read pair, when the first record
    is flagged paired-end) of *sam_filename* is assigned to the features
    overlapped by its "M" CIGAR blocks according to *overlap_mode*.  The
    per-feature table followed by five summary rows is printed to stdout.

    Args:
        sam_filename: path of the SAM file, or "-" to read standard input.
        gff_filename: path of the GFF annotation.
        stranded: "no" builds an unstranded feature array; "reverse" flips
            the strand of the read intervals before the lookup; any other
            value is treated as stranded.
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty"; anything else exits the program.
        feature_type: GFF feature type to count on.
        id_attribute: GFF attribute used as the feature identifier.
        quiet: suppress progress messages and HTSeq warnings.
        minaqual: reads with an alignment quality below this are skipped.
        samout: path of an annotated SAM output file, or "" for none.

    Raises:
        SystemExit: on a feature lacking *id_attribute*, a strandless feature
            in stranded mode, or an illegal *overlap_mode*.
        Any exception raised while reading the inputs is re-raised after a
        location message has been written to stderr.
    """

    def write_to_samout(r, assignment):
        # Append the assignment as an XF tag to every mate of the record.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    if quiet:
        warnings.filterwarnings(action="ignore", module="HTSeq")

    if samout != "":
        samoutfile = open(samout, "w")
    else:
        samoutfile = None

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    counts = {}

    # Try to open samfile to fail early in case it is not there
    if sam_filename != "-":
        open(sam_filename).close()

    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    sys.exit("Feature %s does not contain a '%s' attribute" %
                             (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    sys.exit("Feature %s at %s does not have strand information but you are "
                             "running htseq-count in stranded mode. Use '--stranded=no'." %
                             (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        # Deliberately bare: also reports the GFF position on sys.exit().
        sys.stderr.write("Error occured in %s.\n" % gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)

    if len(counts) == 0 and not quiet:
        sys.stderr.write("Warning: No features of type '%s' found.\n" % feature_type)

    try:
        # Keep a handle on the reader itself: once the stream is wrapped in
        # itertools.chain (stdin) or paired up, only the reader can still
        # report the current line number for error messages.
        if sam_filename != "-":
            read_seq_file = HTSeq.SAM_Reader(sam_filename)
            read_seq = read_seq_file
            first_read = next(iter(read_seq))
        else:
            read_seq_file = HTSeq.SAM_Reader(sys.stdin)
            read_seq_iter = iter(read_seq_file)
            first_read = next(read_seq_iter)
            # stdin cannot be rewound, so push the peeked record back.
            read_seq = itertools.chain([first_read], read_seq_iter)
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write("Error occured when reading first line of sam file.\n")
        raise

    try:
        if pe_mode:
            read_seq = HTSeq.pair_SAM_alignments(read_seq)
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    write_to_samout(r, "not_aligned")
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        write_to_samout(r, "alignment_not_unique")
                        nonunique += 1
                        continue
                except KeyError:
                    # No NH tag: the uniqueness test is simply skipped.
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    write_to_samout(r, "too_low_aQual")
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar if co.type == "M")
                else:
                    iv_seq = (invert_strand(co.ref_iv) for co in r.cigar if co.type == "M")
            else:
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar if co.type == "M")
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar if co.type == "M")
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    # The second mate is looked up on the opposite strand.
                    if stranded != "reverse":
                        iv_seq = itertools.chain(
                            iv_seq,
                            (invert_strand(co.ref_iv) for co in r[1].cigar if co.type == "M"))
                    else:
                        iv_seq = itertools.chain(
                            iv_seq,
                            (co.ref_iv for co in r[1].cigar if co.type == "M"))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        write_to_samout(r, "not_aligned")
                        notaligned += 1
                        continue
                try:
                    if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                            (r[1] is not None and r[1].optional_field("NH") > 1):
                        nonunique += 1
                        write_to_samout(r, "alignment_not_unique")
                        continue
                except KeyError:
                    pass
                if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                    lowqual += 1
                    write_to_samout(r, "too_low_aQual")
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")
                if fs is None or len(fs) == 0:
                    write_to_samout(r, "no_feature")
                    empty += 1
                elif len(fs) > 1:
                    write_to_samout(r, "ambiguous[" + '+'.join(fs) + "]")
                    ambiguous += 1
                else:
                    write_to_samout(r, list(fs)[0])
                    counts[list(fs)[0]] += 1
            except UnknownChrom:
                if not pe_mode:
                    rr = r
                else:
                    rr = r[0] if r[0] is not None else r[1]
                if not quiet:
                    sys.stderr.write(("Warning: Skipping read '%s', because chromosome " +
                                      "'%s', to which it has been aligned, did not appear in the GFF file.\n") %
                                     (rr.read.name, iv.chrom))

            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d sam %s processed.\n" %
                                 (i, "lines " if not pe_mode else "line pairs"))

    except:
        # read_seq may be a chain or a pairing generator here; only the
        # underlying reader knows the line number.
        sys.stderr.write("Error occured in %s.\n" % read_seq_file.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d sam %s processed.\n" %
                         (i, "lines " if not pe_mode else "line pairs"))

    if samoutfile is not None:
        samoutfile.close()

    for fn in sorted(counts.keys()):
        print("%s\t%d" % (fn, counts[fn]))
    print("no_feature\t%d" % empty)
    print("ambiguous\t%d" % ambiguous)
    print("too_low_aQual\t%d" % lowqual)
    print("not_aligned\t%d" % notaligned)
    print("alignment_not_unique\t%d" % nonunique)
def count_reads_in_features(sam_filenames, colnames, gff_filename, opts):
    """Hacked version of htseq count.py that fills one count column per input.

    Features of type ``opts.feature_type`` are read once from *gff_filename*;
    every file in *sam_filenames* is then counted into its own column of the
    per-feature count lists.

    Args:
        sam_filenames: paths of the SAM/BAM inputs, one per output column.
        colnames: column labels, parallel to *sam_filenames*.
        gff_filename: path of the GFF annotation.
        opts: options object; the attributes read here are ``quiet``,
            ``stranded``, ``mapqMin``, ``feature_type``, ``id_attribute``,
            ``filter_extras`` and ``mode``.

    Returns:
        ``(counts, empty, ambiguous, lowqual, notaligned, nonunique,
        filtered, nreads)`` where ``counts`` maps a feature id to a list of
        per-sample counts and the remaining values are totals over all files.

    Note:
        ``sam_exts`` and ``sam_bais`` are read from the enclosing module and
        are presumably parallel to *sam_filenames* -- verify against caller.
    """

    def has_filter_tag(read):
        # True when the read carries any of the optional tags listed in
        # opts.filter_extras with a truthy value; absent tags are ignored.
        for extra in opts.filter_extras:
            try:
                if read.optional_field(extra):
                    return True
            except KeyError:
                pass
        return False

    if opts.quiet:
        warnings.filterwarnings(action="ignore", module="HTSeq")
    features = HTSeq.GenomicArrayOfSets("auto", opts.stranded != "no")
    mapqMin = int(opts.mapqMin)
    counts = {}
    nreads = 0
    empty = 0
    ambiguous = 0
    notaligned = 0
    lowqual = 0
    nonunique = 0
    filtered = 0  # new filter_extras - need a better way to do this - independent filter tool?
    gff = HTSeq.GFF_Reader(gff_filename)
    gff_rows = 0  # stays 0 for an empty GFF instead of leaving the loop index unbound
    try:
        for i, f in enumerate(gff):
            gff_rows = i + 1
            if f.type == opts.feature_type:
                try:
                    feature_id = f.attr[opts.id_attribute]
                except KeyError:
                    # Fall back to gene_id before giving up on the row.
                    try:
                        feature_id = f.attr['gene_id']
                    except KeyError:
                        sys.exit(
                            "Feature at row %d %s does not contain a '%s' attribute OR a gene_id attribute - faulty GFF?"
                            % ((i + 1), f.name, opts.id_attribute))
                if opts.stranded != "no" and f.iv.strand == ".":
                    sys.exit(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'."
                        % (f.name, f.iv))
                features[f.iv] += feature_id
                # we use sami as an index here to bump counts later
                counts[feature_id] = [0 for x in colnames]
    except:
        sys.stderr.write("Error occured in %s.\n" % gff.get_line_number_string())
        raise

    if not opts.quiet:
        sys.stdout.write("%d GFF lines processed.\n" % gff_rows)

    if len(counts) == 0 and not opts.quiet:
        sys.stdout.write("Warning: No features of type '%s' found.\n" % opts.feature_type)

    for sami, sam_filename in enumerate(sam_filenames):
        colname = colnames[sami]
        isbam = sam_exts[sami] == 'bam'
        hasbai = sam_bais[sami] > ''
        if hasbai:
            # Hard-link BAM and index under a shared stem in the working
            # directory.  NOTE(review): these links are never removed here.
            tempname = os.path.splitext(os.path.basename(sam_filename))[0]
            tempbam = '%s_TEMP.bam' % tempname
            tempbai = '%s_TEMP.bai' % tempname
            os.link(sam_filename, tempbam)
            os.link(sam_bais[sami], tempbai)
        try:
            if isbam:
                if hasbai:
                    read_seq = HTSeq.BAM_Reader(tempbam)
                else:
                    read_seq = HTSeq.BAM_Reader(sam_filename)
            else:
                read_seq = HTSeq.SAM_Reader(sam_filename)
            first_read = next(iter(read_seq))
            pe_mode = first_read.paired_end
        except:
            if isbam:
                sys.stderr.write(
                    ("Error occured when reading first line of bam file %s colname=%s \n"
                     % (sam_filename, colname)) + "\n")
            else:
                sys.stderr.write(
                    ("Error occured when reading first line of sam file %s colname=%s \n"
                     % (sam_filename, colname)) + "\n")
            raise

        seq_count = 0  # records (or pairs) seen in this file
        try:
            if pe_mode:
                read_seq_pe_file = read_seq
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            for seqi, r in enumerate(read_seq):
                seq_count = seqi + 1
                nreads += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        continue
                    # A filtered read is counted once and skipped entirely.
                    if len(opts.filter_extras) > 0 and has_filter_tag(r):
                        filtered += 1
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            continue
                    except KeyError:
                        pass
                    if r.aQual < mapqMin:
                        lowqual += 1
                        continue
                    if opts.stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r.cigar
                                  if co.type == "M" and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                                  if co.type == "M" and co.size > 0)
                else:
                    if r[0] is not None and r[0].aligned:
                        if opts.stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r[0].cigar
                                      if co.type == "M" and co.size > 0)
                        else:
                            iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                                      if co.type == "M" and co.size > 0)
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        if opts.stranded != "reverse":
                            iv_seq = itertools.chain(
                                iv_seq,
                                (invert_strand(co.ref_iv) for co in r[1].cigar
                                 if co.type == "M" and co.size > 0))
                        else:
                            iv_seq = itertools.chain(
                                iv_seq,
                                (co.ref_iv for co in r[1].cigar
                                 if co.type == "M" and co.size > 0))
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            notaligned += 1
                            continue
                    try:
                        if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                                (r[1] is not None and r[1].optional_field("NH") > 1):
                            nonunique += 1
                            continue
                    except KeyError:
                        pass
                    if (r[0] and r[0].aQual < mapqMin) or (r[1] and r[1].aQual < mapqMin):
                        lowqual += 1
                        continue

                try:
                    if opts.mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif opts.mode == "intersection-strict" or opts.mode == "intersection-nonempty":
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if len(fs2) > 0 or opts.mode == "intersection-strict":
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode %s" % opts.mode)
                    if fs is None or len(fs) == 0:
                        empty += 1
                    elif len(fs) > 1:
                        ambiguous += 1
                    else:
                        ck = list(fs)[0]
                        # end up with counts for each sample as a list
                        counts[ck][sami] += 1
                except UnknownChrom:
                    if not pe_mode:
                        rr = r
                    else:
                        rr = r[0] if r[0] is not None else r[1]
                    empty += 1
                    if not opts.quiet:
                        sys.stdout.write((
                            "Warning: Skipping read '%s', because chromosome " +
                            "'%s', to which it has been aligned, did not appear in the GFF file.\n"
                        ) % (rr.read.name, iv.chrom))
        except:
            if not pe_mode:
                sys.stderr.write("Error occured in %s.\n" % read_seq.get_line_number_string())
            else:
                sys.stderr.write("Error occured in %s.\n" % read_seq_pe_file.get_line_number_string())
            raise

        if not opts.quiet:
            sys.stdout.write(
                "%d sam %s processed for %s.\n" %
                (seq_count, "lines " if not pe_mode else "line pairs", colname))
    return counts, empty, ambiguous, lowqual, notaligned, nonunique, filtered, nreads
def mapping_reads2shared_exons_introns(refGene_txt, bam_filename, minaqual,
                                       stranded, order, max_buffer_size):
    """Count reads and compute coverage for every annotated exon and intron.

    The annotation *refGene_txt* is a tab-separated table with the columns
    gene label, feature, rank, position (``chrom:start-end:strand``) and
    length.  For each gene the reads of *bam_filename* overlapping the gene
    region (padded by 500 bases on both sides) are fetched and assigned to
    the gene's features.  One row per feature is written to stdout; the
    summary counters are written to stderr.

    Args:
        refGene_txt: path of the annotation table.
        bam_filename: path of the BAM file (fetched by region).
        minaqual: reads with an alignment quality below this are skipped.
        stranded: 'yes' or 'reverse' for a stranded lookup ('reverse' flips
            the read strand); any other value is unstranded.
        order: 'name' or 'pos'; selects how paired-end mates are paired.
        max_buffer_size: passed to ``HTSeq.pair_SAM_alignments_with_buffer``
            when *order* is 'pos'.

    Raises:
        ValueError: if *order* is neither 'name' nor 'pos' for paired-end
            data, or if a read interval has no valid strand when inverted.
    """
    # initialise counters
    counts = {}
    counts['_empty'] = 0
    counts['_ambiguous'] = 0
    counts['_lowaqual'] = 0
    counts['_notaligned'] = 0
    counts['_ambiguous_readpair_position'] = 0

    # Read BAM file
    bam_reader = HTSeq.BAM_Reader(bam_filename)

    # CIGAR match characters (alignment match, sequence match, sequence mismatch)
    cigar_char = ('M', '=', 'X')

    # (Refer to HTSeq-count) strand-associated switches
    stranded_boolean = stranded == 'yes' or stranded == 'reverse'
    reverse_boolean = stranded == 'reverse'

    def invert_strand(iv):
        iv2 = iv.copy()
        if iv2.strand == "+":
            iv2.strand = "-"
        elif iv2.strand == "-":
            iv2.strand = "+"
        else:
            raise ValueError("Illegal strand")
        return iv2

    def is_multimapped(read):
        # A read without an NH tag is treated as uniquely mapped rather
        # than aborting the whole run with a KeyError.
        try:
            return read.optional_field('NH') > 1
        except KeyError:
            return False

    def matched_intervals(read, flip):
        # Lazily yield the reference intervals of the matching CIGAR blocks,
        # on the opposite strand when *flip* is true.
        if flip:
            return (invert_strand(cigop.ref_iv) for cigop in read.cigar
                    if cigop.type in cigar_char and cigop.size > 0)
        return (cigop.ref_iv for cigop in read.cigar
                if cigop.type in cigar_char and cigop.size > 0)

    sys.stdout.write(
        "Gene\tfeature\trank\tposition\tlength\tread_counts\tread_counts_norm\tcoverage(%)\n"
    )

    annot = collections.OrderedDict()
    with open(refGene_txt) as annot_file:
        for line in annot_file:
            gene_label, feature, rank, position, length = line.strip().split('\t')
            chrom, iv_str, strand = position.strip().split(':')
            start, end = map(int, iv_str.strip().split('-'))
            annot.setdefault(gene_label, []).append(
                (feature, int(rank), chrom, start, end, strand, int(length)))

    # Paired-end mode is a property of the BAM file, so it is detected once
    # (on the first gene) instead of re-opening the file for every gene.
    pe_mode = None

    for gene_name in annot:
        records = annot[gene_name]
        gene_count = {}
        gas = HTSeq.GenomicArrayOfSets("auto", stranded=stranded_boolean)
        ga = HTSeq.GenomicArray("auto", stranded=stranded_boolean, typecode="i")

        # Annotation
        for feature, rank, chrom, start, end, strand, length in records:
            iv = HTSeq.GenomicInterval(chrom, start, end, strand)
            gas[iv] += (feature, rank)
            gene_count[(feature, rank)] = 0

        # Iterating bam_reader directly is problematic (the author attributes
        # this to a pysam bug); workaround: go through fetch().
        boundary_left = min(rec[3] for rec in records)
        boundary_right = max(rec[4] for rec in records)
        # Clamp the padded start so that it can never become negative, which
        # would yield a malformed "chrom:-N-M" region string.
        region_fetch = '%s:%d-%d' % (records[0][2],
                                     max(1, int(boundary_left) - 500),
                                     int(boundary_right) + 500)
        read_seq = bam_reader.fetch(region=region_fetch)

        # distinguish SE and PE mode
        if pe_mode is None:
            pe_mode = next(iter(bam_reader.fetch())).paired_end
        if pe_mode:
            if order == 'name':
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            elif order == 'pos':
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                    read_seq, max_buffer_size=max_buffer_size)
            else:
                raise ValueError("Illegal order name.")

        # Mapping
        for a in read_seq:
            if not pe_mode:
                if not a.aligned:
                    counts['_notaligned'] += 1
                    continue
                if is_multimapped(a):
                    continue
                if a.aQual < minaqual:
                    counts['_lowaqual'] += 1
                    continue
                # Same CIGAR operations as every other branch ('M', '=', 'X').
                iv_seq = matched_intervals(a, reverse_boolean)
            # pe mode
            else:
                if ((a[0] and a[0].aQual < minaqual) or
                        (a[1] and a[1].aQual < minaqual)):
                    counts['_lowaqual'] += 1
                    continue
                if ((a[0] is not None and is_multimapped(a[0])) or
                        (a[1] is not None and is_multimapped(a[1]))):
                    continue
                if a[0] is not None and a[0].aligned:
                    iv_seq = matched_intervals(a[0], reverse_boolean)
                else:
                    iv_seq = tuple()
                if a[1] is not None and a[1].aligned:
                    # The second mate is looked up on the opposite strand.
                    iv_seq = itertools.chain(
                        iv_seq, matched_intervals(a[1], not reverse_boolean))

            feature_aligned = set()
            for iv in iv_seq:
                for iv2, val2 in gas[iv].steps():
                    feature_aligned |= val2
                ga[iv] += 1  # for calculating coverage (only > 0 matters)
            if len(feature_aligned) == 0:
                counts['_empty'] += 1
                continue
            # when mapping to intron, discard exons
            for f in [item for item in feature_aligned if item[0] == 'intron']:
                gene_count[f] += 1
            # when no mapping to intron, count all exons
            if 'intron' not in [x for x, y in feature_aligned]:
                for f in feature_aligned:
                    gene_count[f] += 1

        # Output
        for feature, rank, chrom, start, end, strand, length in records:
            feature_count = gene_count[(feature, rank)]
            # Explicit float arithmetic: plain "/" would truncate to an
            # integer under Python 2.
            if length:
                feature_count_norm = float(feature_count) / length * 1000
            else:
                feature_count_norm = 0.0
            # Coverage calculation: percentage of positions with depth > 0
            iv = HTSeq.GenomicInterval(chrom, start, end, strand)
            cvg_region = list(ga[iv])
            covered = sum(1 for depth in cvg_region if depth > 0)
            if cvg_region:
                cvg = float(covered) / len(cvg_region) * 100
            else:
                cvg = 0.0
            pos = "%s:%d-%d:%s" % (chrom, start, end, strand)
            sys.stdout.write('\t'.join(
                map(str, [
                    gene_name, feature, rank, pos, length, feature_count,
                    feature_count_norm, cvg
                ])) + '\n')

    for fn in counts.keys():
        sys.stderr.write('%s\t%d\n' % (fn, counts[fn]))
# Deal with any GFF file reading errors except ValueError as e: e.args += ( gff.get_line_number_string(), ) raise try: # Get the first read to see if we're dealing with paired-end data read_seq = HTSeq.SAM_Reader(options.sam) first_read = iter(read_seq).next() pe_mode = first_read.paired_end # Re-initialize read_seq depending on if it's paired-end data or not read_seq = HTSeq.SAM_Reader(options.sam) if pe_mode: read_seq = HTSeq.pair_SAM_alignments(read_seq) # Read counter, for feedback to user i = 0 total = 0 # Here we go, through each read... for r in read_seq: spliced = False if not pe_mode: if not r.aligned: continue total += 1 iv_seq = [] # Check to see if it's spliced for co in r.cigar:
def count_reads_onto_prebuilt_features(sam_filename, features, feature_ids,
                                       stranded, overlap_mode, quiet, minaqual,
                                       samout, umis=False):
    """Count the reads of a SAM file against an already built feature array.

    Args:
        sam_filename: path of the SAM file, or "-" to read standard input.
        features: feature array queried as ``features[iv].steps()`` and
            ``features.chrom_vectors``.
        feature_ids: iterable of all feature identifiers; each starts at 0.
        stranded: "reverse" flips the strand of the read intervals before
            the lookup.
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty"; anything else exits the program.
        quiet: suppress progress messages and HTSeq warnings.
        minaqual: reads with an alignment quality below this are skipped.
        samout: path of an annotated SAM output file, or "" for none.
        umis: when true, a ``:UMI:<seq>:`` token is extracted from the read
            name and the number of distinct UMIs per feature is reported
            instead of the number of reads.

    Returns:
        ``(feats, counts)``: two parallel lists holding the sorted feature
        ids followed by "no_feature", "ambiguous", "too_low_aQual",
        "not_aligned" and "alignment_not_unique", and their counts.

    Raises:
        EmptySamError: if the input contains no record at all.
    """

    def write_to_samout(r, assignment):
        # Append the assignment as an XF tag to every mate of the record.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    if quiet:
        warnings.filterwarnings(action="ignore", module="HTSeq")

    if samout != "":
        samoutfile = open(samout, "w")
    else:
        samoutfile = None

    if umis:
        umi_re = re.compile(r":UMI:(\w+):")
        umi_counts = {}

        def count_umis(fs, read_name):
            umi_seq = umi_re.search(read_name).group(1)
            umi_counts[fs][umi_seq] += 1

        for feature_id in feature_ids:
            umi_counts[feature_id] = Counter()
    else:
        def count_umis(x, y):
            return None

    # Try to open samfile to fail early in case it is not there
    if sam_filename != "-":
        open(sam_filename).close()

    counts = {}
    for feature_id in feature_ids:
        counts[feature_id] = 0

    try:
        if sam_filename != "-":
            read_seq_file = HTSeq.SAM_Reader(sam_filename)
            read_seq = read_seq_file
            first_read = next(iter(read_seq))
        else:
            read_seq_file = HTSeq.SAM_Reader(sys.stdin)
            read_seq_iter = iter(read_seq_file)
            first_read = next(read_seq_iter)
            # stdin cannot be rewound, so push the peeked record back.
            read_seq = itertools.chain([first_read], read_seq_iter)
        pe_mode = first_read.paired_end
    except StopIteration:
        raise EmptySamError(sam_filename)

    try:
        if pe_mode:
            read_seq = HTSeq.pair_SAM_alignments(read_seq)
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    write_to_samout(r, "not_aligned")
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        write_to_samout(r, "alignment_not_unique")
                        nonunique += 1
                        continue
                except KeyError:
                    # No NH tag: the uniqueness test is simply skipped.
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    write_to_samout(r, "too_low_aQual")
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar
                              if co.type == "M" and co.size > 0)
                else:
                    iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                              if co.type == "M" and co.size > 0)
            else:
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar
                                  if co.type == "M" and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                                  if co.type == "M" and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    # The second mate is looked up on the opposite strand.
                    if stranded != "reverse":
                        iv_seq = itertools.chain(
                            iv_seq,
                            (invert_strand(co.ref_iv) for co in r[1].cigar
                             if co.type == "M" and co.size > 0))
                    else:
                        iv_seq = itertools.chain(
                            iv_seq,
                            (co.ref_iv for co in r[1].cigar
                             if co.type == "M" and co.size > 0))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        write_to_samout(r, "not_aligned")
                        notaligned += 1
                        continue
                try:
                    if (r[0] is not None and r[0].optional_field("NH") > 1) or (
                            r[1] is not None and r[1].optional_field("NH") > 1):
                        nonunique += 1
                        write_to_samout(r, "alignment_not_unique")
                        continue
                except KeyError:
                    pass
                if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                    lowqual += 1
                    write_to_samout(r, "too_low_aQual")
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")
                if fs is None or len(fs) == 0:
                    write_to_samout(r, "no_feature")
                    empty += 1
                elif len(fs) > 1:
                    write_to_samout(r, "ambiguous[" + "+".join(fs) + "]")
                    ambiguous += 1
                else:
                    feature_id = list(fs)[0]
                    write_to_samout(r, feature_id)
                    counts[feature_id] += 1
                    # In paired-end mode r is a (mate1, mate2) tuple, so the
                    # read name has to come from whichever mate is present.
                    if pe_mode:
                        named = r[0] if r[0] is not None else r[1]
                    else:
                        named = r
                    count_umis(feature_id, named.read.name)
            except UnknownChrom:
                # Reads on chromosomes absent from the feature array count
                # as "no_feature"; the per-read warning is deliberately off.
                empty += 1

            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d sam %s processed.\n" %
                                 (i, "lines " if not pe_mode else "line pairs"))

    except:
        sys.stderr.write("Error occured when processing SAM input (%s):\n" %
                         read_seq_file.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d sam %s processed.\n" %
                         (i, "lines " if not pe_mode else "line pairs"))

    if samoutfile is not None:
        samoutfile.close()

    # sorted feature list. features+counts
    feats = sorted(counts)
    if umis:
        counts = [len(umi_counts[fn]) for fn in feats]
    else:
        counts = [counts[fn] for fn in feats]

    # cat statistics summary to feature+count list
    feats = feats + ["no_feature", "ambiguous", "too_low_aQual",
                     "not_aligned", "alignment_not_unique"]
    counts = counts + [empty, ambiguous, lowqual, notaligned, nonunique]

    return (feats, counts)
def count_reads_in_features(sam_filename, gff_filename, samtype, order,
                            stranded, overlap_mode, feature_type,
                            id_attribute, quiet, minaqual, samout):
    """Tally SAM/BAM reads per GFF feature and print the table to stdout.

    Features of type *feature_type* are taken from *gff_filename* and named
    by their *id_attribute*.  Each alignment (or mate pair, when the first
    record is paired-end) is assigned to the features overlapped by its "M"
    CIGAR blocks following *overlap_mode*.  Five ``__``-prefixed summary rows
    follow the per-feature rows.

    *samtype* is "sam" or "bam"; *order* ("name" or "pos") selects the mate
    pairing routine; *stranded* "no" builds an unstranded array and
    "reverse" flips the read strand; *samout* is an output path or "".

    Raises ValueError for a feature without *id_attribute*, a strandless
    feature in stranded mode, an unknown *samtype* or an illegal *order*.
    """

    def write_to_samout(r, assignment):
        if samoutfile is None:
            return
        mates = r if pe_mode else (r,)
        for mate in mates:
            if mate is None:
                continue
            samoutfile.write(mate.original_sam_line.rstrip() +
                             "\tXF:Z:" + assignment + "\n")

    def matched_blocks(aln, flip):
        # Lazy stream of the reference intervals of the non-empty "M"
        # blocks of *aln*, strand-inverted when *flip* is true.
        if flip:
            return (invert_strand(op.ref_iv) for op in aln.cigar
                    if op.type == "M" and op.size > 0)
        return (op.ref_iv for op in aln.cigar
                if op.type == "M" and op.size > 0)

    samoutfile = open(samout, "w") if samout != "" else None

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    counts = {}

    # Fail early if the alignment file cannot be opened at all.
    if sam_filename != "-":
        open(sam_filename).close()

    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for i, f in enumerate(gff, 1):
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError("Feature %s does not contain a '%s' attribute" %
                                     (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError("Feature %s at %s does not have strand information but you are "
                                     "running htseq-count in stranded mode. Use '--stranded=no'." %
                                     (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0
            if not quiet and i % 100000 == 0:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        sys.stderr.write("Error occured when processing GFF file (%s):\n" %
                         gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)

    if not counts:
        sys.stderr.write("Warning: No features of type '%s' found.\n" % feature_type)

    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    try:
        if sam_filename == "-":
            read_seq_file = SAM_or_BAM_Reader(sys.stdin)
            read_seq_iter = iter(read_seq_file)
            first_read = next(read_seq_iter)
            # Standard input cannot be re-read: put the peeked record back.
            read_seq = itertools.chain([first_read], read_seq_iter)
        else:
            read_seq_file = SAM_or_BAM_Reader(sam_filename)
            read_seq = read_seq_file
            first_read = next(iter(read_seq))
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write("Error occured when reading beginning of SAM/BAM file.\n")
        raise

    flip_first = stranded == "reverse"   # single reads and first mates
    flip_second = stranded != "reverse"  # second mates use the other strand
    empty = ambiguous = notaligned = lowqual = nonunique = 0
    i = 0
    try:
        if pe_mode:
            if order == "name":
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            elif order == "pos":
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
            else:
                raise ValueError("Illegal order specified.")

        for r in read_seq:
            if i > 0 and i % 100000 == 0 and not quiet:
                sys.stderr.write("%d SAM alignment record%s processed.\n" %
                                 (i, "s" if not pe_mode else " pairs"))
            i += 1

            if pe_mode:
                mate1 = r[0]
                mate2 = r[1]
                if mate1 is not None and mate1.aligned:
                    iv_seq = matched_blocks(mate1, flip_first)
                else:
                    iv_seq = tuple()
                if mate2 is not None and mate2.aligned:
                    iv_seq = itertools.chain(iv_seq, matched_blocks(mate2, flip_second))
                elif (mate1 is None) or not mate1.aligned:
                    write_to_samout(r, "__not_aligned")
                    notaligned += 1
                    continue
                try:
                    if (mate1 is not None and mate1.optional_field("NH") > 1) or \
                            (mate2 is not None and mate2.optional_field("NH") > 1):
                        nonunique += 1
                        write_to_samout(r, "__alignment_not_unique")
                        continue
                except KeyError:
                    pass
                if (mate1 and mate1.aQual < minaqual) or (mate2 and mate2.aQual < minaqual):
                    lowqual += 1
                    write_to_samout(r, "__too_low_aQual")
                    continue
            else:
                if not r.aligned:
                    notaligned += 1
                    write_to_samout(r, "__not_aligned")
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        write_to_samout(r, "__alignment_not_unique")
                        continue
                except KeyError:
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    write_to_samout(r, "__too_low_aQual")
                    continue
                iv_seq = matched_blocks(r, flip_first)

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for _, step_set in features[iv].steps():
                            fs = fs.union(step_set)
                elif overlap_mode in ("intersection-strict", "intersection-nonempty"):
                    strict = overlap_mode == "intersection-strict"
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for _, step_set in features[iv].steps():
                            if not (strict or len(step_set) > 0):
                                continue
                            if fs is None:
                                fs = step_set.copy()
                            else:
                                fs = fs.intersection(step_set)
                else:
                    sys.exit("Illegal overlap mode.")

                if not fs:
                    write_to_samout(r, "__no_feature")
                    empty += 1
                elif len(fs) > 1:
                    write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
                    ambiguous += 1
                else:
                    hit = list(fs)[0]
                    write_to_samout(r, hit)
                    counts[hit] += 1
            except UnknownChrom:
                write_to_samout(r, "__no_feature")
                empty += 1

    except:
        sys.stderr.write("Error occured when processing SAM input (%s):\n" %
                         read_seq_file.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d SAM %s processed.\n" %
                         (i, "alignments " if not pe_mode else "alignment pairs"))

    if samoutfile is not None:
        samoutfile.close()

    for fn in sorted(counts):
        print("%s\t%d" % (fn, counts[fn]))
    print("__no_feature\t%d" % empty)
    print("__ambiguous\t%d" % ambiguous)
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
def count_reads_in_features(sam_filename, gff_filename, samtype, order,
                            stranded, overlap_mode, feature_type,
                            id_attribute, quiet, minaqual, samout):
    """Count the reads of a SAM/BAM file that fall onto each GFF feature.

    Written so that it runs unchanged on Python 2 and Python 3.

    Args:
        sam_filename: path of the alignment file, or "-" for standard input.
        gff_filename: path of the GFF annotation.
        samtype: "sam" or "bam"; selects the HTSeq reader class.
        order: "name" or "pos"; selects how paired-end mates are paired.
        stranded: "no" builds an unstranded feature array; "reverse" flips
            the strand of the read intervals before the lookup.
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty"; anything else exits the program.
        feature_type: GFF feature type to count on.
        id_attribute: GFF attribute used as the feature identifier.
        quiet: suppress progress messages.
        minaqual: reads with an alignment quality below this are skipped.
        samout: path of an annotated SAM output file, or "" for none.

    Raises:
        ValueError: for a feature without *id_attribute*, a strandless
            feature in stranded mode, an unknown *samtype* or an illegal
            *order*.

    The per-feature counts and five ``__``-prefixed summary rows are printed
    to standard output.
    """

    def write_to_samout(r, assignment):
        # Append the assignment as an XF tag to every mate of the record.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    if samout != "":
        samoutfile = open(samout, "w")
    else:
        samoutfile = None

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    counts = {}

    # Try to open samfile to fail early in case it is not there
    if sam_filename != "-":
        open(sam_filename).close()

    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError("Feature %s does not contain a '%s' attribute" %
                                     (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError("Feature %s at %s does not have strand information but you are "
                                     "running htseq-count in stranded mode. Use '--stranded=no'." %
                                     (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        sys.stderr.write("Error occured when processing GFF file (%s):\n" %
                         gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)

    if len(counts) == 0:
        sys.stderr.write("Warning: No features of type '%s' found.\n" % feature_type)

    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    try:
        if sam_filename != "-":
            read_seq_file = SAM_or_BAM_Reader(sam_filename)
            read_seq = read_seq_file
            first_read = next(iter(read_seq))
        else:
            read_seq_file = SAM_or_BAM_Reader(sys.stdin)
            read_seq_iter = iter(read_seq_file)
            first_read = next(read_seq_iter)
            # stdin cannot be rewound, so push the peeked record back.
            read_seq = itertools.chain([first_read], read_seq_iter)
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write("Error occured when reading beginning of SAM/BAM file.\n")
        raise

    try:
        if pe_mode:
            if order == "name":
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            elif order == "pos":
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
            else:
                raise ValueError("Illegal order specified.")
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            if i > 0 and i % 100000 == 0 and not quiet:
                sys.stderr.write("%d SAM alignment record%s processed.\n" %
                                 (i, "s" if not pe_mode else " pairs"))

            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    write_to_samout(r, "__not_aligned")
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        write_to_samout(r, "__alignment_not_unique")
                        continue
                except KeyError:
                    # No NH tag: the uniqueness test is simply skipped.
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    write_to_samout(r, "__too_low_aQual")
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar
                              if co.type == "M" and co.size > 0)
                else:
                    iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                              if co.type == "M" and co.size > 0)
            else:
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar
                                  if co.type == "M" and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                                  if co.type == "M" and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    # The second mate is looked up on the opposite strand.
                    if stranded != "reverse":
                        iv_seq = itertools.chain(
                            iv_seq,
                            (invert_strand(co.ref_iv) for co in r[1].cigar
                             if co.type == "M" and co.size > 0))
                    else:
                        iv_seq = itertools.chain(
                            iv_seq,
                            (co.ref_iv for co in r[1].cigar
                             if co.type == "M" and co.size > 0))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        write_to_samout(r, "__not_aligned")
                        notaligned += 1
                        continue
                try:
                    if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                            (r[1] is not None and r[1].optional_field("NH") > 1):
                        nonunique += 1
                        write_to_samout(r, "__alignment_not_unique")
                        continue
                except KeyError:
                    pass
                if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                    lowqual += 1
                    write_to_samout(r, "__too_low_aQual")
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")
                if fs is None or len(fs) == 0:
                    write_to_samout(r, "__no_feature")
                    empty += 1
                elif len(fs) > 1:
                    write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
                    ambiguous += 1
                else:
                    write_to_samout(r, list(fs)[0])
                    counts[list(fs)[0]] += 1
            except UnknownChrom:
                # Reads on chromosomes absent from the GFF count as no_feature.
                write_to_samout(r, "__no_feature")
                empty += 1

    except:
        sys.stderr.write("Error occured when processing SAM input (%s):\n" %
                         read_seq_file.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d SAM %s processed.\n" %
                         (i, "alignments " if not pe_mode else "alignment pairs"))

    if samoutfile is not None:
        samoutfile.close()

    # Single-argument print(...) behaves identically on Python 2 and 3.
    for fn in sorted(counts.keys()):
        print("%s\t%d" % (fn, counts[fn]))
    print("__no_feature\t%d" % empty)
    print("__ambiguous\t%d" % ambiguous)
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
def count_reads_with_barcodes(
        sam_filename,
        features,
        feature_attr,
        order,
        max_buffer_size,
        stranded,
        overlap_mode,
        multimapped_mode,
        secondary_alignment_mode,
        supplementary_alignment_mode,
        feature_type,
        id_attribute,
        additional_attributes,
        quiet,
        minaqual,
        samout_format,
        samout_filename,
        cb_tag,
        ub_tag,
        ):
    """Count alignments per feature, split by cell barcode and collapsed by UMI.

    Alignments are read with ``HTSeq.BAM_Reader`` from ``sam_filename``
    (``"-"`` means standard input).  Every read (or read pair) is assigned to
    a feature id or to one of the special categories ``__not_aligned``,
    ``__alignment_not_unique``, ``__too_low_aQual``, ``__no_feature`` and
    ``__ambiguous``, and tallied under its (cell barcode, UMI) combination.
    Afterwards each UMI contributes a single count to its most frequent
    category; a UMI whose two most frequent categories tie is dropped.

    Args:
        sam_filename: Path of the input file, or ``"-"`` for stdin.
        features: Genomic array queried as ``features[iv].steps()``; must
            expose ``chrom_vectors``.
        feature_attr, feature_type, id_attribute, additional_attributes:
            Accepted for signature compatibility; not used in this function.
        order: ``"name"`` or ``"pos"``; selects the mate-pairing routine for
            paired-end data.
        max_buffer_size: Passed to
            ``HTSeq.pair_SAM_alignments_with_buffer`` when ``order == "pos"``.
        stranded: ``"reverse"`` flips the strand of the first mate / single
            read; any other value flips the second mate instead.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        multimapped_mode: ``"none"`` or ``"all"``.
        secondary_alignment_mode, supplementary_alignment_mode: ``"ignore"``
            skips the corresponding alignments.
        quiet: Suppress progress messages on stderr.
        minaqual: Alignments with ``aQual`` below this value are not assigned.
        samout_format: ``"SAM"``/``"sam"`` writes text lines, anything else
            writes pysam segments.
        samout_filename: Annotated output path, or ``None`` for no output.
        cb_tag: Optional-field tag holding the cell barcode.
        ub_tag: Optional-field tag holding the UMI.

    Returns:
        dict with ``'cell_barcodes'`` (sorted list, a missing barcode ``None``
        first) and ``'counts'`` (barcode -> ``Counter`` of categories).

    Raises:
        ValueError: If paired-end data is found and ``order`` is not
            ``"name"`` or ``"pos"``.
        SystemExit: On an illegal ``overlap_mode`` or ``multimapped_mode``.
    """

    def write_to_samout(r, assignment, samoutfile, template=None):
        """Append an XF tag with the assignment and write the read(s) out."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                read.optional_fields.append(('XF', assignment))
                if samout_format in ('SAM', 'sam'):
                    samoutfile.write(read.get_sam_line() + "\n")
                else:
                    samoutfile.write(read.to_pysam_AlignedSegment(template))

    def identify_barcodes(r):
        '''Identify barcode from the read or pair (both must have the same)'''
        if not pe_mode:
            r = (r, )
        # cell, UMI
        barcodes = [None, None]
        for read in r:
            if read is None:
                continue
            for tag, val in read.optional_fields:
                if tag == cb_tag:
                    barcodes[0] = val
                elif tag == ub_tag:
                    barcodes[1] = val
                else:
                    continue
                # Stop only once BOTH barcodes are known.  Counting tag
                # occurrences instead would stop too early when both mates
                # carry the cell barcode and the UMI comes later.
                if barcodes[0] is not None and barcodes[1] is not None:
                    return barcodes
        return barcodes

    try:
        if sam_filename == "-":
            read_seq_file = HTSeq.BAM_Reader(sys.stdin)
        else:
            read_seq_file = HTSeq.BAM_Reader(sam_filename)

        # Get template for output BAM
        if samout_filename is None:
            template = None
            samoutfile = None
        elif samout_format in ('bam', 'BAM'):
            template = read_seq_file.get_template()
            samoutfile = pysam.AlignmentFile(
                    samout_filename, 'wb',
                    template=template,
                    )
        else:
            template = None
            samoutfile = open(samout_filename, 'w')

        read_seq_iter = iter(read_seq_file)
        # Catch empty BAM files.  Still a broad catch (best effort), but it
        # no longer swallows KeyboardInterrupt / SystemExit.
        try:
            first_read = next(read_seq_iter)
            pe_mode = first_read.paired_end
        except Exception:
            first_read = None
            pe_mode = False
        if first_read is not None:
            read_seq = itertools.chain([first_read], read_seq_iter)
        else:
            read_seq = []
    except:
        sys.stderr.write(
            "Error occured when reading beginning of SAM/BAM file.\n")
        raise

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch
    com = ('M', '=', 'X')

    try:
        if pe_mode:
            if ((supplementary_alignment_mode == 'ignore') and
                    (secondary_alignment_mode == 'ignore')):
                primary_only = True
            else:
                primary_only = False
            if order == "name":
                read_seq = HTSeq.pair_SAM_alignments(
                        read_seq,
                        primary_only=primary_only)
            elif order == "pos":
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                        read_seq,
                        max_buffer_size=max_buffer_size,
                        primary_only=primary_only)
            else:
                raise ValueError("Illegal order specified.")

        # The nesting is cell barcode, UMI, feature
        counts = defaultdict(lambda: defaultdict(Counter))
        i = 0
        for r in read_seq:
            if i > 0 and i % 100000 == 0 and not quiet:
                sys.stderr.write(
                    "%d alignment record%s processed.\n" %
                    (i, "s" if not pe_mode else " pairs"))
                sys.stderr.flush()

            i += 1

            cb, ub = identify_barcodes(r)

            if not pe_mode:
                if not r.aligned:
                    counts[cb][ub]['__not_aligned'] += 1
                    write_to_samout(
                            r, "__not_aligned", samoutfile,
                            template)
                    continue
                if ((secondary_alignment_mode == 'ignore') and
                        r.not_primary_alignment):
                    continue
                if ((supplementary_alignment_mode == 'ignore') and
                        r.supplementary):
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        counts[cb][ub]['__alignment_not_unique'] += 1
                        write_to_samout(
                                r,
                                "__alignment_not_unique",
                                samoutfile,
                                template)
                        if multimapped_mode == 'none':
                            continue
                except KeyError:
                    # No NH tag: uniqueness unknown, treat as unique.
                    pass
                if r.aQual < minaqual:
                    counts[cb][ub]['__too_low_aQual'] += 1
                    write_to_samout(
                            r, "__too_low_aQual", samoutfile,
                            template)
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar
                              if co.type in com and co.size > 0)
                else:
                    iv_seq = (invert_strand(co.ref_iv)
                              for co in r.cigar
                              if (co.type in com and co.size > 0))
            else:
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar
                                  if co.type in com and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv)
                                  for co in r[0].cigar
                                  if co.type in com and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    if stranded != "reverse":
                        iv_seq = itertools.chain(
                                iv_seq,
                                (invert_strand(co.ref_iv)
                                 for co in r[1].cigar
                                 if co.type in com and co.size > 0))
                    else:
                        iv_seq = itertools.chain(
                                iv_seq,
                                (co.ref_iv for co in r[1].cigar
                                 if co.type in com and co.size > 0))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        write_to_samout(
                                r, "__not_aligned", samoutfile,
                                template)
                        counts[cb][ub]['__not_aligned'] += 1
                        continue
                if secondary_alignment_mode == 'ignore':
                    if (r[0] is not None) and r[0].not_primary_alignment:
                        continue
                    elif (r[1] is not None) and r[1].not_primary_alignment:
                        continue
                if supplementary_alignment_mode == 'ignore':
                    if (r[0] is not None) and r[0].supplementary:
                        continue
                    elif (r[1] is not None) and r[1].supplementary:
                        continue
                try:
                    if ((r[0] is not None and
                         r[0].optional_field("NH") > 1) or
                        (r[1] is not None and
                         r[1].optional_field("NH") > 1)):
                        write_to_samout(
                                r, "__alignment_not_unique", samoutfile,
                                template)
                        counts[cb][ub]['__alignment_not_unique'] += 1
                        if multimapped_mode == 'none':
                            continue
                except KeyError:
                    pass
                if ((r[0] and r[0].aQual < minaqual) or
                        (r[1] and r[1].aQual < minaqual)):
                    write_to_samout(
                            r, "__too_low_aQual", samoutfile,
                            template)
                    counts[cb][ub]['__too_low_aQual'] += 1
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode in ("intersection-strict",
                                      "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if ((len(fs2) > 0) or
                                    (overlap_mode == "intersection-strict")):
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")

                if fs is None or len(fs) == 0:
                    write_to_samout(
                            r, "__no_feature", samoutfile,
                            template)
                    counts[cb][ub]['__no_feature'] += 1
                elif len(fs) > 1:
                    write_to_samout(
                            r, "__ambiguous[" + '+'.join(fs) + "]",
                            samoutfile,
                            template)
                    counts[cb][ub]['__ambiguous'] += 1
                else:
                    write_to_samout(
                            r, list(fs)[0], samoutfile,
                            template)

                if fs is not None and len(fs) > 0:
                    if multimapped_mode == 'none':
                        if len(fs) == 1:
                            counts[cb][ub][list(fs)[0]] += 1
                    elif multimapped_mode == 'all':
                        for fsi in list(fs):
                            counts[cb][ub][fsi] += 1
                    else:
                        sys.exit("Illegal multimap mode.")

            except UnknownChrom:
                write_to_samout(
                        r, "__no_feature", samoutfile,
                        template)
                counts[cb][ub]['__no_feature'] += 1

    except:
        # Deliberately bare: report the input position for any failure
        # (including SystemExit raised above), then re-raise unchanged.
        sys.stderr.write(
            "Error occured when processing input (%s):\n" %
            (read_seq_file.get_line_number_string()))
        raise
    finally:
        # Close the annotated output on success and on failure alike.
        if samoutfile is not None:
            samoutfile.close()

    if not quiet:
        sys.stderr.write(
            "%d %s processed.\n" %
            (i, "alignments " if not pe_mode else "alignment pairs"))
        sys.stderr.flush()

    # Get rid of UMI by majority rule.  The sort key keeps a missing barcode
    # (None) first without ever comparing None to a string, which would raise
    # TypeError on Python 3.
    cbs = sorted(counts, key=lambda barcode: (barcode is not None, barcode))
    counts_noumi = {}
    for cb in cbs:
        counts_cell = Counter()
        for ub, udic in counts.pop(cb).items():
            # In case of a tie, do not increment either feature
            top = udic.most_common(2)
            if (len(top) == 2) and (top[0][1] == top[1][1]):
                continue
            counts_cell[top[0][0]] += 1
        counts_noumi[cb] = counts_cell

    return {
        'cell_barcodes': cbs,
        'counts': counts_noumi,
    }
def count_reads_in_features( sam_filenames, colnames, gff_filename, opts ):
    """
    Hacked version of htseq count.py

    Counts, for several SAM/BAM files at once, how many reads fall into each
    feature of a GFF file.

    Args:
        sam_filenames: Input alignment files, one per sample.
        colnames: Sample names, parallel to ``sam_filenames``; each feature
            gets one count slot per entry.
        gff_filename: Annotation file read with ``HTSeq.GFF_Reader``.
        opts: Option object; the attributes read here are ``quiet``,
            ``stranded``, ``mapqMin``, ``feature_type``, ``id_attribute``,
            ``filter_extras`` and ``mode``.

    Returns:
        Tuple ``(counts, empty, ambiguous, lowqual, notaligned, nonunique,
        filtered, nreads)`` where ``counts`` maps feature id to a list of
        per-sample counts and the remaining items are totals over all files.

    Note:
        Relies on the module-level sequences ``sam_exts`` and ``sam_bais``
        (file type and index path per sample).  ``opts.filter_extras`` is
        applied to single-end reads only.
    """
    if opts.quiet:
        warnings.filterwarnings( action="ignore", module="HTSeq" )
    features = HTSeq.GenomicArrayOfSets( "auto", opts.stranded != "no" )
    mapqMin = int(opts.mapqMin)
    counts = {}
    nreads = 0
    empty = 0
    ambiguous = 0
    notaligned = 0
    lowqual = 0
    nonunique = 0
    filtered = 0 # new filter_extras - need a better way to do this - independent filter tool?
    gff = HTSeq.GFF_Reader( gff_filename )
    # 1-based line counter, initialised so that an empty GFF reports 0 lines
    # instead of raising NameError in the progress message below.
    i = 0
    try:
        for i, f in enumerate(gff, 1):
            if f.type == opts.feature_type:
                try:
                    feature_id = f.attr[ opts.id_attribute ]
                except KeyError:
                    # Fall back to gene_id before giving up on this row.
                    try:
                        feature_id = f.attr[ 'gene_id' ]
                    except KeyError:
                        sys.exit( "Feature at row %d %s does not contain a '%s' attribute OR a gene_id attribute - faulty GFF?" %
                                  ( i, f.name, opts.id_attribute ) )
                if opts.stranded != "no" and f.iv.strand == ".":
                    sys.exit( "Feature %s at %s does not have strand information but you are "
                              "running htseq-count in stranded mode. Use '--stranded=no'." %
                              ( f.name, f.iv ) )
                features[ f.iv ] += feature_id
                counts[ feature_id ] = [0 for x in colnames] # we use sami as an index here to bump counts later
    except:
        sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() )
        raise

    if not opts.quiet:
        sys.stdout.write( "%d GFF lines processed.\n" % i )

    if len( counts ) == 0 and not opts.quiet:
        sys.stdout.write( "Warning: No features of type '%s' found.\n" % opts.feature_type )

    for sami, sam_filename in enumerate(sam_filenames):
        colname = colnames[sami]
        isbam = sam_exts[sami] == 'bam'
        hasbai = sam_bais[sami] > ''
        if hasbai:
            # Hard-link the BAM and its index under matching names in the
            # current directory, then read the linked BAM.
            # NOTE(review): these links are not removed in this function --
            # confirm that the caller cleans them up.
            tempname = os.path.splitext(os.path.basename(sam_filename))[0]
            tempbam = '%s_TEMP.bam' % tempname
            tempbai = '%s_TEMP.bai' % tempname
            os.link(sam_filename, tempbam)
            os.link(sam_bais[sami], tempbai)
        try:
            if isbam:
                if hasbai:
                    read_seq = HTSeq.BAM_Reader( tempbam )
                else:
                    read_seq = HTSeq.BAM_Reader( sam_filename )
            else:
                read_seq = HTSeq.SAM_Reader( sam_filename )
            # Peek at the first record to learn whether data is paired-end.
            first_read = next( iter(read_seq) )
            pe_mode = first_read.paired_end
        except:
            # The extra "\n" reproduces the newline `print` used to append.
            if isbam:
                sys.stderr.write( "Error occured when reading first line of bam file %s colname=%s \n" % (sam_filename, colname ) + "\n" )
            else:
                sys.stderr.write( "Error occured when reading first line of sam file %s colname=%s \n" % (sam_filename, colname ) + "\n" )
            raise

        # 1-based record counter for the progress message of this file.
        seqi = 0
        try:
            if pe_mode:
                read_seq_pe_file = read_seq
                read_seq = HTSeq.pair_SAM_alignments( read_seq )
            for seqi, r in enumerate(read_seq, 1):
                nreads += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        continue
                    # Drop the read when any of the extra tags is set.  A tag
                    # missing from the read counts as "not set" and must not
                    # suppress the NH check that follows.
                    flagged = False
                    for extra in opts.filter_extras:
                        try:
                            if r.optional_field(extra):
                                flagged = True
                                break
                        except KeyError:
                            pass
                    if flagged:
                        filtered += 1
                        continue
                    try:
                        if r.optional_field( "NH" ) > 1:
                            nonunique += 1
                            continue
                    except KeyError:
                        pass
                    if r.aQual < mapqMin:
                        lowqual += 1
                        continue
                    if opts.stranded != "reverse":
                        iv_seq = ( co.ref_iv for co in r.cigar if co.type == "M" and co.size > 0 )
                    else:
                        iv_seq = ( invert_strand( co.ref_iv ) for co in r.cigar if co.type == "M" and co.size > 0 )
                else:
                    if r[0] is not None and r[0].aligned:
                        if opts.stranded != "reverse":
                            iv_seq = ( co.ref_iv for co in r[0].cigar if co.type == "M" and co.size > 0 )
                        else:
                            iv_seq = ( invert_strand( co.ref_iv ) for co in r[0].cigar if co.type == "M" and co.size > 0 )
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        if opts.stranded != "reverse":
                            iv_seq = itertools.chain( iv_seq,
                                ( invert_strand( co.ref_iv ) for co in r[1].cigar if co.type == "M" and co.size > 0 ) )
                        else:
                            iv_seq = itertools.chain( iv_seq,
                                ( co.ref_iv for co in r[1].cigar if co.type == "M" and co.size > 0 ) )
                    else:
                        if r[0] is None or not r[0].aligned:
                            notaligned += 1
                            continue
                    try:
                        if ( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \
                           ( r[1] is not None and r[1].optional_field( "NH" ) > 1 ):
                            nonunique += 1
                            continue
                    except KeyError:
                        pass
                    if ( r[0] and r[0].aQual < mapqMin ) or ( r[1] and r[1].aQual < mapqMin ):
                        lowqual += 1
                        continue

                try:
                    if opts.mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[ iv ].steps():
                                fs = fs.union( fs2 )
                    elif opts.mode == "intersection-strict" or opts.mode == "intersection-nonempty":
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[ iv ].steps():
                                if len(fs2) > 0 or opts.mode == "intersection-strict":
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection( fs2 )
                    else:
                        sys.exit( "Illegal overlap mode %s" % opts.mode )
                    if fs is None or len( fs ) == 0:
                        empty += 1
                    elif len( fs ) > 1:
                        ambiguous += 1
                    else:
                        ck = list(fs)[0]
                        counts[ck][sami] += 1 # end up with counts for each sample as a list
                except UnknownChrom:
                    if not pe_mode:
                        rr = r
                    else:
                        rr = r[0] if r[0] is not None else r[1]
                    empty += 1
                    if not opts.quiet:
                        sys.stdout.write( ( "Warning: Skipping read '%s', because chromosome " +
                            "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) %
                            ( rr.read.name, iv.chrom ) )
        except:
            if not pe_mode:
                sys.stderr.write( "Error occured in %s.\n" % read_seq.get_line_number_string() )
            else:
                sys.stderr.write( "Error occured in %s.\n" % read_seq_pe_file.get_line_number_string() )
            raise

        if not opts.quiet:
            sys.stdout.write( "%d sam %s processed for %s.\n" %
                              ( seqi, "lines " if not pe_mode else "line pairs", colname ) )

    return counts, empty, ambiguous, lowqual, notaligned, nonunique, filtered, nreads
# Deal with any GFF file reading errors except ValueError as e: e.args += (gff.get_line_number_string(), ) raise try: # Get the first read to see if we're dealing with paired-end data read_seq = HTSeq.SAM_Reader(options.sam) first_read = iter(read_seq).next() pe_mode = first_read.paired_end # Re-initialize read_seq depending on if it's paired-end data or not read_seq = HTSeq.SAM_Reader(options.sam) if pe_mode: read_seq = HTSeq.pair_SAM_alignments(read_seq) # Read counter, for feedback to user i = 0 total = 0 # Here we go, through each read... for r in read_seq: spliced = False if not pe_mode: if not r.aligned: continue total += 1 iv_seq = [] # Check to see if it's spliced for co in r.cigar:
def count_reads_single_file(
        isam,
        sam_filename,
        features,
        feature_attr,
        order,
        max_buffer_size,
        stranded,
        overlap_mode,
        multimapped_mode,
        secondary_alignment_mode,
        supplementary_alignment_mode,
        feature_type,
        id_attribute,
        additional_attributes,
        quiet,
        minaqual,
        samout_format,
        samout_filename,
        ):
    """Count how many alignments of one input file fall into each feature.

    Alignments are read with ``HTSeq.BAM_Reader`` from ``sam_filename``
    (``"-"`` means standard input).  Each read or read pair is either
    assigned to features or tallied as not aligned, not unique, low quality,
    no feature or ambiguous.

    Args:
        isam: Index of this input file; returned unchanged in the result.
        sam_filename: Path of the input file, or ``"-"`` for stdin.
        features: Genomic array queried as ``features[iv].steps()``; must
            expose ``chrom_vectors``.
        feature_attr: Iterable of feature ids; every id starts with count 0.
        order: ``"name"`` or ``"pos"``; selects the mate-pairing routine for
            paired-end data.
        max_buffer_size: Passed to
            ``HTSeq.pair_SAM_alignments_with_buffer`` when ``order == "pos"``.
        stranded: ``"reverse"`` flips the strand of the first mate / single
            read; any other value flips the second mate instead.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        multimapped_mode: ``"none"``, ``"all"``, ``"fraction"`` or
            ``"random"``.
        secondary_alignment_mode, supplementary_alignment_mode: ``"ignore"``
            skips the corresponding alignments.
        feature_type, id_attribute, additional_attributes: Accepted for
            signature compatibility; not used in this function.
        quiet: Suppress progress messages on stderr.
        minaqual: Alignments with ``aQual`` below this value are not assigned.
        samout_format: ``"SAM"``/``"sam"`` writes text lines, anything else
            writes pysam segments.
        samout_filename: Annotated output path, or ``None`` for no output.

    Returns:
        dict with keys ``isam``, ``counts``, ``empty``, ``ambiguous``,
        ``lowqual``, ``notaligned`` and ``nonunique``.

    Raises:
        ValueError: If paired-end data is found and ``order`` is not
            ``"name"`` or ``"pos"``.
        SystemExit: On an illegal ``overlap_mode`` or ``multimapped_mode``.
    """

    def write_to_samout(r, assignment, samoutfile, template=None):
        """Append an XF tag with the assignment and write the read(s) out."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                read.optional_fields.append(('XF', assignment))
                if samout_format in ('SAM', 'sam'):
                    samoutfile.write(read.get_sam_line() + "\n")
                else:
                    samoutfile.write(read.to_pysam_AlignedSegment(template))

    try:
        if sam_filename == "-":
            read_seq_file = HTSeq.BAM_Reader(sys.stdin)
        else:
            read_seq_file = HTSeq.BAM_Reader(sam_filename)

        # Get template for output BAM
        if samout_filename is None:
            template = None
            samoutfile = None
        elif samout_format in ('bam', 'BAM'):
            template = read_seq_file.get_template()
            samoutfile = pysam.AlignmentFile(
                    samout_filename, 'wb',
                    template=template,
                    )
        else:
            template = None
            samoutfile = open(samout_filename, 'w')

        read_seq_iter = iter(read_seq_file)
        # Catch empty BAM files.  Still a broad catch (best effort), but it
        # no longer swallows KeyboardInterrupt / SystemExit.
        try:
            first_read = next(read_seq_iter)
            pe_mode = first_read.paired_end
        except Exception:
            first_read = None
            pe_mode = False
        if first_read is not None:
            read_seq = itertools.chain([first_read], read_seq_iter)
        else:
            read_seq = []
    except:
        sys.stderr.write(
            "Error occured when reading beginning of SAM/BAM file.\n")
        raise

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch
    com = ('M', '=', 'X')
    counts = {key: 0 for key in feature_attr}

    try:
        if pe_mode:
            if ((supplementary_alignment_mode == 'ignore') and
                    (secondary_alignment_mode == 'ignore')):
                primary_only = True
            else:
                primary_only = False
            if order == "name":
                read_seq = HTSeq.pair_SAM_alignments(
                        read_seq,
                        primary_only=primary_only)
            elif order == "pos":
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                        read_seq,
                        max_buffer_size=max_buffer_size,
                        primary_only=primary_only)
            else:
                raise ValueError("Illegal order specified.")
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            if i > 0 and i % 100000 == 0 and not quiet:
                sys.stderr.write(
                    "%d alignment record%s processed.\n" %
                    (i, "s" if not pe_mode else " pairs"))
                sys.stderr.flush()

            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    write_to_samout(
                            r, "__not_aligned", samoutfile,
                            template)
                    continue
                if ((secondary_alignment_mode == 'ignore') and
                        r.not_primary_alignment):
                    continue
                if ((supplementary_alignment_mode == 'ignore') and
                        r.supplementary):
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        write_to_samout(
                                r,
                                "__alignment_not_unique",
                                samoutfile,
                                template)
                        if multimapped_mode == 'none':
                            continue
                except KeyError:
                    # No NH tag: uniqueness unknown, treat as unique.
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    write_to_samout(
                            r, "__too_low_aQual", samoutfile,
                            template)
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar
                              if co.type in com and co.size > 0)
                else:
                    iv_seq = (invert_strand(co.ref_iv)
                              for co in r.cigar
                              if (co.type in com and co.size > 0))
            else:
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar
                                  if co.type in com and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv)
                                  for co in r[0].cigar
                                  if co.type in com and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    if stranded != "reverse":
                        iv_seq = itertools.chain(
                                iv_seq,
                                (invert_strand(co.ref_iv)
                                 for co in r[1].cigar
                                 if co.type in com and co.size > 0))
                    else:
                        iv_seq = itertools.chain(
                                iv_seq,
                                (co.ref_iv for co in r[1].cigar
                                 if co.type in com and co.size > 0))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        write_to_samout(
                                r, "__not_aligned", samoutfile,
                                template)
                        notaligned += 1
                        continue
                if secondary_alignment_mode == 'ignore':
                    if (r[0] is not None) and r[0].not_primary_alignment:
                        continue
                    elif (r[1] is not None) and r[1].not_primary_alignment:
                        continue
                if supplementary_alignment_mode == 'ignore':
                    if (r[0] is not None) and r[0].supplementary:
                        continue
                    elif (r[1] is not None) and r[1].supplementary:
                        continue
                try:
                    if ((r[0] is not None and
                         r[0].optional_field("NH") > 1) or
                        (r[1] is not None and
                         r[1].optional_field("NH") > 1)):
                        nonunique += 1
                        write_to_samout(
                                r, "__alignment_not_unique", samoutfile,
                                template)
                        if multimapped_mode == 'none':
                            continue
                except KeyError:
                    pass
                if ((r[0] and r[0].aQual < minaqual) or
                        (r[1] and r[1].aQual < minaqual)):
                    lowqual += 1
                    write_to_samout(
                            r, "__too_low_aQual", samoutfile,
                            template)
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode in ("intersection-strict",
                                      "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if ((len(fs2) > 0) or
                                    (overlap_mode == "intersection-strict")):
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")

                if fs is None or len(fs) == 0:
                    write_to_samout(
                            r, "__no_feature", samoutfile,
                            template)
                    empty += 1
                elif len(fs) > 1:
                    write_to_samout(
                            r, "__ambiguous[" + '+'.join(fs) + "]",
                            samoutfile,
                            template)
                    ambiguous += 1
                else:
                    write_to_samout(
                            r, list(fs)[0], samoutfile,
                            template)

                if fs is not None and len(fs) > 0:
                    if multimapped_mode == 'none':
                        if len(fs) == 1:
                            counts[list(fs)[0]] += 1
                    elif multimapped_mode == 'all':
                        for fsi in list(fs):
                            counts[fsi] += 1
                    elif multimapped_mode == 'fraction':
                        for fsi in list(fs):
                            counts[fsi] += 1.0 / len(fs)
                    elif multimapped_mode == 'random':
                        # random.choice needs an indexable sequence and raises
                        # TypeError on a set.  Sorting also makes the pick
                        # reproducible under a seeded RNG, independent of set
                        # iteration order.
                        fsi = random.choice(sorted(fs))
                        counts[fsi] += 1
                    else:
                        sys.exit("Illegal multimap mode.")

            except UnknownChrom:
                write_to_samout(
                        r, "__no_feature", samoutfile,
                        template)
                empty += 1

    except:
        # Deliberately bare: report the input position for any failure
        # (including SystemExit raised above), then re-raise unchanged.
        sys.stderr.write(
            "Error occured when processing input (%s):\n" %
            (read_seq_file.get_line_number_string()))
        raise
    finally:
        # Close the annotated output on success and on failure alike.
        if samoutfile is not None:
            samoutfile.close()

    if not quiet:
        sys.stderr.write(
            "%d %s processed.\n" %
            (i, "alignments " if not pe_mode else "alignment pairs"))
        sys.stderr.flush()

    return {
        'isam': isam,
        'counts': counts,
        'empty': empty,
        'ambiguous': ambiguous,
        'lowqual': lowqual,
        'notaligned': notaligned,
        'nonunique': nonunique,
    }
def count_reads_in_features(sam_filenames, gff_filename, samtype, order,
                            max_buffer_size, stranded, overlap_mode,
                            multimapped_mode, secondary_alignment_mode,
                            supplementary_alignment_mode, feature_type,
                            id_attribute, additional_attributes, quiet,
                            minaqual, samouts,
                            output_dir='/home/kirill/bi/transcript/'):
    """Measure read coverage at ten relative positions along every gene.

    Features of type ``feature_type`` are read from ``gff_filename`` and
    grouped by ``id_attribute``.  For each gene the overlapping exons are
    merged and up to ten sample points are placed at 0%, 10%, ..., 90% of the
    merged exonic length.  Each input file is then scanned; a read pair that
    is assigned to exactly one gene increments the coverage of a sample point
    lying strictly inside the pair's span.  Per sample, a table is written to
    ``<output_dir>/<sample>_dict.txt`` and ``CalcCoverage.do_coverage`` is
    called; at the end a plot is saved to ``<output_dir>/covarage.png`` and
    shown.

    Args:
        sam_filenames: Input files (``"-"`` means stdin).
        gff_filename: Annotation file.
        samtype: ``"sam"`` or ``"bam"``.
        order: ``"name"`` or ``"pos"`` (paired-end mate pairing routine).
        max_buffer_size: Buffer size used when ``order == "pos"``.
        stranded: ``"no"``, ``"reverse"`` or any other value for forward.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        multimapped_mode: Only ``"none"`` is accepted once a read has been
            assigned to at least one feature.
        secondary_alignment_mode, supplementary_alignment_mode: ``"ignore"``
            skips the corresponding alignments.
        feature_type: GFF feature type to use.
        id_attribute: GFF attribute used as gene id.
        additional_attributes: Not used in this function.
        quiet: Suppress progress messages.
        minaqual: Alignments with ``aQual`` below this value are skipped.
        samouts: ``""`` or a list of output paths, one per input file.  The
            files are created/truncated, but nothing is written to them here.
        output_dir: Directory for the per-sample tables and the plot.
            Defaults to the location that used to be hard-coded.

    Raises:
        ValueError: On mismatching ``samouts`` length, a feature without
            ``id_attribute`` or strand, an unknown ``samtype``, or an illegal
            ``order``.
        SystemExit: On an illegal overlap or multimap mode.
    """
    import os  # function-scope import: only needed to build output paths

    def exists(obj, chain):
        """Return obj[k0][k1]... for the keys in chain, or None if one is missing.

        Consumes ``chain`` (keys are popped from the front).
        """
        _key = chain.pop(0)
        if _key in obj:
            return exists(obj[_key], chain) if chain else obj[_key]

    def check_overlapped_exons_and_calc_sum(gene):
        """Merge overlapping exons (sorted by start) and store their total length."""
        rightmost_value = gene["exons"][0][1]
        start = gene["exons"][0][0]
        new_exons = []
        total = rightmost_value - start
        for interval in gene["exons"]:
            if (interval[0] <= rightmost_value and
                    interval[1] >= rightmost_value):
                # Overlaps the current merged exon: extend it to the right.
                total += (interval[1] - rightmost_value)
                rightmost_value = interval[1]
            elif (interval[0] > rightmost_value):
                total += (interval[1] - interval[0])
                # add previous extended interval to result
                new_exons.append([start, rightmost_value])
                start = interval[0]
                rightmost_value = interval[1]
        new_exons.append([start, rightmost_value])
        gene["exons"] = new_exons
        gene["total_sum_of_exons"] = total

    def check_and_count_points_coverage(gene_id, first_read, second_read):
        """Register the coverage contributed by one read pair on gene_id."""
        # Determine which sample point the pair crosses; all coordinates are
        # made relative to the start of the gene first.
        if (first_read is None or second_read is None):
            return
        gene_begin = genes_exons[gene_id]["gene_begin"]
        fstart = first_read.iv.start - gene_begin
        fend = first_read.iv.end - gene_begin
        sstart = second_read.iv.start - gene_begin
        send = second_read.iv.end - gene_begin
        # `== False` kept on purpose: a None flag must not count as improper.
        if (first_read.proper_pair == False or
                second_read.proper_pair == False):
            return
        if (fend < sstart and fstart < fend and sstart < send):
            # Mates do not overlap, first mate on the left.
            check(gene_id, fstart, fend)
            check(gene_id, sstart, send)
        elif (send < fstart and fstart < fend and sstart < send):
            # Mates do not overlap, second mate on the left.
            check(gene_id, fstart, fend)
            check(gene_id, sstart, send)
        elif (fstart < fend and sstart < send and sstart >= fstart and
              send >= fend and sstart <= fend):
            # Overlapping mates, first one starts further left.
            check(gene_id, fstart, send)
        elif (fstart < fend and sstart < send and sstart <= fstart and
              send >= fstart and send <= fend):
            # Overlapping mates, second one starts further left.
            check(gene_id, sstart, fend)
        elif (fstart < sstart and send < fend):
            # Second mate lies inside the first.
            check(gene_id, fstart, fend)
        elif (sstart < fstart and fend < send):
            # First mate lies inside the second.
            check(gene_id, sstart, send)

    def check(gene_id, start, end):
        """Bisect the sample points of gene_id for one inside (start, end).

        Increments that point's coverage when found.  Any failure during the
        search is reported as "Out of boundaries" on stderr and otherwise
        ignored.
        """
        total = 100
        # Floor division keeps the percentages integral on Python 2 and 3.
        half = total // 2
        left_interval = right_interval = half
        try:
            i = 0
            while (left_interval >= 10):
                if (i > 10):
                    raise ValueError('Out of boundaries\n')
                if exists(genes_coverage_in_points, [gene_id, half]) is None:
                    # No point at this percentage: fall back to the nearest
                    # multiple of ten on the left.
                    half = int(math.floor(half // 10) * 10)
                    point = genes_coverage_in_points[gene_id][half]["point"]
                    right_interval += 5
                    left_interval -= 5
                else:
                    # The point exists.
                    point = genes_coverage_in_points[gene_id][half]["point"]
                    if (point < start):
                        # Point is left of the read: continue to the right.
                        half = half + (right_interval // 2)
                        left_interval = right_interval = right_interval // 2
                    elif (point > end):
                        # Point is right of the read: continue to the left.
                        half = half - (left_interval // 2)
                        left_interval = right_interval = left_interval // 2
                    elif (point > start and point < end):
                        # The read crosses the point.
                        genes_coverage_in_points[gene_id][half][
                            "coverage"] += 1
                        return
                i += 1
        except:
            # Best effort, kept from the original: report and carry on.
            sys.stderr.write("Out of boundaries\n")

    def check2(gene_id, start, end):
        """Linear-scan alternative to check(); not called in this function."""
        for i in range(0, 100, 10):
            point = genes_coverage_in_points[gene_id][i]["point"]
            if (start < point and point < end):
                genes_coverage_in_points[gene_id][i]["coverage"] += 1
                return

    def clear_all_cov_points():
        """Reset the coverage counter of every sample point to zero."""
        for gene_id, gene in genes_coverage_in_points.items():
            for k, val in gene.items():
                val["coverage"] = 0

    def plot_gene_coverage():
        """Debug helper; not called in this function.

        NOTE(review): refers to `cvg`, `test_first_exon_start` and
        `test_last_exon_end`, none of which is defined in this function.
        """
        sys.stderr.write("ENSG00000000003.10 genes on: " + str(test_n[0]) +
                         "\n")
        x = []
        y = []
        i = 0
        for k, val in enumerate(
                list(cvg[HTSeq.GenomicInterval("chrX", test_first_exon_start,
                                               test_last_exon_end)])):
            x.append(i)
            y.append(val)
            i += 1
        plt.plot(x, y)
        plt.show()

    if samouts != "":
        if len(samouts) != len(sam_filenames):
            raise ValueError(
                'Select the same number of SAM input and output files')
        # Try to open samout files early in case any of them has issues
        for samout in samouts:
            with open(samout, 'w'):
                pass

    # Try to open samfiles to fail early in case any of them is not there
    if (len(sam_filenames) != 1) or (sam_filenames[0] != '-'):
        for sam_filename in sam_filenames:
            with open(sam_filename):
                pass

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch
    com = ('M', '=', 'X')

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    gff = HTSeq.GFF_Reader(gff_filename)
    # gene id -> {percentage: {"point": offset from gene start, "coverage": n}}
    genes_coverage_in_points = defaultdict(dict)
    # gene id -> exon list, merged exon length, gene start, assigned reads
    genes_exons = defaultdict(dict)
    test_n = [0]
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError(
                        "Feature %s does not contain a '%s' attribute" %
                        (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'."
                        % (f.name, f.iv))
                features[f.iv] += feature_id
                # Exons are not guaranteed to arrive sorted by coordinate, so
                # just collect all intervals per gene here.
                gene_id = feature_id
                if gene_id not in genes_exons:
                    genes_exons[gene_id] = {
                        "total_sum_of_exons": 0,
                        "total_aligned_reads": 0,
                        "gene_begin": 0,
                        "exons": list([[f.iv.start, f.iv.end]])
                    }
                else:
                    genes_exons[gene_id]["exons"].append(
                        [f.iv.start, f.iv.end])
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        sys.stderr.write("Error occured when processing GFF file (%s):\n" %
                         gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)

    if len(genes_exons) == 0:
        sys.stderr.write("Warning: No features of type '%s' found.\n" %
                         feature_type)

    # For every gene: sort the exons by start, take the first exon's start as
    # the gene start, merge overlapping exons while summing their length, and
    # finally place the ten sample points (introns are skipped over).
    for gene_id, gene in genes_exons.items():
        gene["exons"].sort()  # by first member
        gene["gene_begin"] = gene["exons"][0][0]
        check_overlapped_exons_and_calc_sum(gene)
        total = gene["total_sum_of_exons"]  # length of all merged exons
        for ten_interval in range(0, 100, 10):
            # Offset of this percentage within the merged exonic sequence.
            point = (total * ten_interval) // 100
            prev_exon_end = 0
            for exon in gene["exons"]:
                # Add the gap (intron) in front of this exon.
                point += (exon[0] - prev_exon_end)
                if (point < exon[1]):
                    # The point falls inside this exon: record it and move on
                    # to the next 10% step.
                    genes_coverage_in_points[gene_id][ten_interval] = {
                        "point": point - gene["gene_begin"],
                        "coverage": 0
                    }
                    break
                else:
                    prev_exon_end = exon[1]

    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    sample = 0
    colors = ["red", "blue", "green", "yellow"]
    handlers = []
    sys.stderr.write(strftime("%Y-%m-%d %H:%M:%S", gmtime()) + "\n")
    for isam, sam_filename in enumerate(sam_filenames):
        total_of_reads_in_sample = 0
        if samouts != '':
            samoutfile = open(samouts[isam], 'w')
        else:
            samoutfile = None

        try:
            try:
                if sam_filename != "-":
                    read_seq_file = SAM_or_BAM_Reader(sam_filename)
                    read_seq = read_seq_file
                    first_read = next(iter(read_seq))
                else:
                    read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                    read_seq_iter = iter(read_seq_file)
                    first_read = next(read_seq_iter)
                    read_seq = itertools.chain([first_read], read_seq_iter)
                pe_mode = first_read.paired_end
            except:
                sys.stderr.write(
                    "Error occured when reading beginning of SAM/BAM file.\n")
                raise

            try:
                if pe_mode:
                    if order == "name":
                        read_seq = HTSeq.pair_SAM_alignments(read_seq)
                    elif order == "pos":
                        read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                            read_seq, max_buffer_size=max_buffer_size)
                    else:
                        raise ValueError("Illegal order specified.")
                i = 0
                for r in read_seq:
                    total_of_reads_in_sample += 1
                    if i > 0 and i % 100000 == 0 and not quiet:
                        sys.stderr.write(
                            "%d SAM alignment record%s processed.\n" %
                            (i, "s" if not pe_mode else " pairs"))
                        sys.stderr.write(
                            strftime("%Y-%m-%d %H:%M:%S", gmtime()) + "\n")

                    i += 1
                    if not pe_mode:
                        if not r.aligned:
                            continue
                        if ((secondary_alignment_mode == 'ignore') and
                                r.not_primary_alignment):
                            continue
                        if ((supplementary_alignment_mode == 'ignore') and
                                r.supplementary):
                            continue
                        try:
                            if r.optional_field("NH") > 1:
                                if multimapped_mode == 'none':
                                    continue
                        except KeyError:
                            pass
                        if r.aQual < minaqual:
                            continue
                        if stranded != "reverse":
                            # Uses `com` like the three sibling branches; this
                            # branch used to accept only "M" and ignored "="
                            # and "X" operations.
                            iv_seq = (co.ref_iv for co in r.cigar
                                      if co.type in com and co.size > 0)
                        else:
                            iv_seq = (invert_strand(co.ref_iv)
                                      for co in r.cigar
                                      if (co.type in com and co.size > 0))
                    else:
                        if r[0] is not None and r[0].aligned:
                            if stranded != "reverse":
                                iv_seq = (co.ref_iv for co in r[0].cigar
                                          if co.type in com and co.size > 0)
                            else:
                                iv_seq = (invert_strand(co.ref_iv)
                                          for co in r[0].cigar
                                          if co.type in com and co.size > 0)
                        else:
                            iv_seq = tuple()
                        if r[1] is not None and r[1].aligned:
                            if stranded != "reverse":
                                iv_seq = itertools.chain(
                                    iv_seq,
                                    (invert_strand(co.ref_iv)
                                     for co in r[1].cigar
                                     if co.type in com and co.size > 0))
                            else:
                                iv_seq = itertools.chain(
                                    iv_seq,
                                    (co.ref_iv for co in r[1].cigar
                                     if co.type in com and co.size > 0))
                        else:
                            if (r[0] is None) or not (r[0].aligned):
                                continue
                        if secondary_alignment_mode == 'ignore':
                            if (r[0] is not None) and \
                                    r[0].not_primary_alignment:
                                continue
                            elif (r[1] is not None) and \
                                    r[1].not_primary_alignment:
                                continue
                        if supplementary_alignment_mode == 'ignore':
                            if (r[0] is not None) and r[0].supplementary:
                                continue
                            elif (r[1] is not None) and r[1].supplementary:
                                continue
                        try:
                            if ((r[0] is not None and
                                 r[0].optional_field("NH") > 1) or
                                (r[1] is not None and
                                 r[1].optional_field("NH") > 1)):
                                if multimapped_mode == 'none':
                                    continue
                        except KeyError:
                            pass
                        if ((r[0] and r[0].aQual < minaqual) or
                                (r[1] and r[1].aQual < minaqual)):
                            continue

                    try:
                        if overlap_mode == "union":
                            fs = set()
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    fs = fs.union(fs2)
                        elif overlap_mode in ("intersection-strict",
                                              "intersection-nonempty"):
                            fs = None
                            for iv in iv_seq:
                                # Unlike "union", unknown chromosomes are
                                # skipped here rather than raised.
                                if iv.chrom not in features.chrom_vectors:
                                    continue
                                for iv2, fs2 in features[iv].steps():
                                    if ((len(fs2) > 0) or
                                            (overlap_mode ==
                                             "intersection-strict")):
                                        if fs is None:
                                            fs = fs2.copy()
                                        else:
                                            fs = fs.intersection(fs2)
                        else:
                            sys.exit("Illegal overlap mode.")

                        if fs is not None and len(fs) > 0:
                            if multimapped_mode == 'none':
                                if len(fs) == 1:
                                    # All aligned parts of the read (pair) map
                                    # to one gene, possibly across several
                                    # exons: this is one unit of the analysis.
                                    gene_name = list(fs)[0]
                                    genes_exons[gene_name][
                                        "total_aligned_reads"] += 1
                                    # NOTE(review): `r` is indexed as a pair
                                    # even for single-end input -- confirm
                                    # that only paired-end data is supported.
                                    check_and_count_points_coverage(
                                        gene_name, r[0], r[1])
                            else:
                                sys.exit("Illegal multimap mode.")

                    except UnknownChrom:
                        raise

            except:
                sys.stderr.write(
                    "Error occured when processing SAM input (%s):\n" %
                    read_seq_file.get_line_number_string())
                raise
        finally:
            # Close the (unused) annotated output on success and on failure.
            if samoutfile is not None:
                samoutfile.close()

        if not quiet:
            sys.stderr.write(
                "%d SAM %s processed.\n" %
                (i, "alignments " if not pe_mode else "alignment pairs"))

        # Save the raw numbers in a table so they can be post-processed later.
        # NOTE(review): "total_aligned_reads" is not reset between samples, so
        # later samples include earlier ones -- confirm this is intended.
        table_path = os.path.join(output_dir, str(sample) + '_dict.txt')
        with open(table_path, 'w') as outfile:
            outfile.write("total_of_reads_in_sample" + '\t' +
                          str(total_of_reads_in_sample) + '\n')
            for gene_id, gene in genes_coverage_in_points.items():
                outfile.write(
                    str(gene_id) + '\t' +
                    str(genes_exons[gene_id]["total_aligned_reads"]) + '\t' +
                    str(genes_exons[gene_id]["total_sum_of_exons"]) + '\n')
                outfile.write(str(gene_id) + '\t')
                for val in gene.values():
                    outfile.write(str(val["coverage"]) + '\t')
                outfile.write('\n')

        # Planned normalisation (from the original author's notes):
        # 1. per point, percentage of the gene's reads (the ten sum to 100);
        # 2. divide by the gene's merged exon length;
        # 3. divide by the total number of reads in the sample;
        # 4. deviance: min/max over all values, plotted point is their mean?
        CalcCoverage.do_coverage(genes_coverage_in_points, genes_exons,
                                 total_of_reads_in_sample, colors, sample,
                                 handlers)
        sample += 1
        # Reset the coverage points before the next sample.
        clear_all_cov_points()

    plt.legend(handlers, ['Sample ' + str(v) for v in range(0, sample, 1)])
    plt.title('Positions relative coverege')
    plt.xlabel('5` -> 3` positions, %')
    plt.ylabel('relative coverage')
    plt.grid(True)
    plt.savefig(os.path.join(output_dir, 'covarage.png'))
    plt.show()
    plt.close()
def _record_not_unique_pair(anu_dict, pair):
    """Update the per-read-name bookkeeping for one multi-mapped pair.

    ``anu_dict`` maps a read name (first SAM column) to a dict.  When both
    mates are present the entry remembers the reference name and start of
    each mate as first seen, plus the flags ``al_unique1`` / ``al_unique2``;
    a flag is cleared once the same read name shows up with a different
    position for that mate.  When only one mate is present the entry is
    replaced by a dict holding just that mate's flag, set to ``False``.
    """
    first, second = pair
    if first is not None and second is not None:
        fields1 = first.original_sam_line.split('\t')
        fields2 = second.original_sam_line.split('\t')
        read_id = fields1[0]
        entry = anu_dict.get(read_id)
        if entry is None:
            # The read is not indexed yet
            anu_dict[read_id] = {
                'chr1': fields1[2],
                'chr2': fields2[2],
                'start1': fields1[3],
                'start2': fields2[3],
                'al_unique1': True,
                'al_unique2': True,
            }
            return
        # Read already indexed.  An entry created by the single-mate branch
        # below lacks the other mate's keys, hence ``.get`` instead of
        # indexing; a flag that is True always has its chr/start recorded.
        if entry.get('al_unique1') and (entry['chr1'] != fields1[2] or
                                        entry['start1'] != fields1[3]):
            # At least two positions exist for mate r[0]
            entry['al_unique1'] = False
        if entry.get('al_unique2') and (entry['chr2'] != fields2[2] or
                                        entry['start2'] != fields2[3]):
            # At least two positions exist for mate r[1]
            entry['al_unique2'] = False
    elif first is not None:
        # Only r[0] is present in the pair
        anu_dict[first.original_sam_line.split('\t')[0]] = {
            'al_unique1': False}
    else:
        # Only r[1] is present in the pair
        anu_dict[second.original_sam_line.split('\t')[0]] = {
            'al_unique2': False}


def count_reads_in_features(sam_filename, gff_filename, stranded,
                            overlap_mode, feature_type, id_attribute, quiet,
                            minaqual, samout, custom_stat):
    """Count reads (or read pairs) per GFF feature and print the table.

    Args:
        sam_filename: path of the SAM file, or ``"-"`` to read stdin.
        gff_filename: path of the GFF/GTF annotation.
        stranded: ``"no"`` builds an unstranded feature array; ``"reverse"``
            flips which mate's intervals are strand-inverted.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``; anything else exits.
        feature_type: only GFF records of this type are used.
        id_attribute: GFF attribute used as the feature id.
        quiet: suppress progress messages, warnings and HTSeq warnings.
        minaqual: reads with ``aQual`` below this are counted as low quality.
        samout: path of an annotated SAM output file, or ``""`` for none.
        custom_stat: path of a statistics file opened in append mode, or
            ``""`` for none.

    Output: one ``feature<TAB>count`` line per feature on stdout, followed by
    the ``no_feature``, ``ambiguous``, ``too_low_aQual``, ``not_aligned`` and
    ``alignment_not_unique`` totals; optional SAM and statistics files.

    Exits via ``sys.exit`` on a feature lacking ``id_attribute``, on a
    strandless feature in stranded mode, and on an illegal overlap mode.
    Other exceptions are reported on stderr and re-raised.
    """

    def write_to_samout(r, assignment):
        # Append the assignment as an XF:Z tag to every read of the record.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    def is_full_pair(r):
        # In single-end mode ``r`` is one alignment, not a pair, so it must
        # not be indexed; such records are reported as "single".
        return bool(pe_mode) and r[0] is not None and r[1] is not None

    if quiet:
        warnings.filterwarnings(action="ignore", module="HTSeq")

    samoutfile = open(samout, "w") if samout != "" else None
    custom_stat_file = open(custom_stat, "a") if custom_stat != "" else None

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    counts = {}

    # Try to open samfile to fail early in case it is not there
    if sam_filename != "-":
        open(sam_filename).close()

    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    sys.exit("Feature %s does not contain a '%s' attribute" %
                             (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    sys.exit(
                        "Feature %s at %s does not have strand information "
                        "but you are running htseq-count in stranded mode. "
                        "Use '--stranded=no'." % (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:  # deliberately bare: also reports sys.exit(), then re-raises
        sys.stderr.write("Error occured in %s.\n" %
                         gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)

    if len(counts) == 0 and not quiet:
        sys.stderr.write("Warning: No features of type '%s' found.\n" %
                         feature_type)

    try:
        # Keep a handle on the reader itself: it is the object that knows the
        # current line number, whereas ``read_seq`` may become a chain.
        if sam_filename != "-":
            sam_reader = HTSeq.SAM_Reader(sam_filename)
            read_seq = sam_reader
            first_read = next(iter(read_seq))
        else:
            sam_reader = HTSeq.SAM_Reader(sys.stdin)
            stdin_records = iter(sam_reader)
            first_read = next(stdin_records)
            # stdin cannot be re-read, so push the first record back
            read_seq = itertools.chain([first_read], stdin_records)
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write("Error occured when reading first line of sam file.\n")
        raise

    try:
        if pe_mode:
            read_seq = HTSeq.pair_SAM_alignments(read_seq)
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        # MB: Creating detailed stats
        if custom_stat_file:
            skipped = 0
            assigned_reads = 0
            assigned_reads_s = 0
            assigned_reads_p = 0
            assigned_genes = 0
            empty_s = 0
            empty_p = 0
            ambiguous_s = 0
            ambiguous_p = 0
            anu_dict = {}
        # endMB
        i = 0
        for r in read_seq:
            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    write_to_samout(r, "not_aligned")
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        write_to_samout(r, "alignment_not_unique")
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    write_to_samout(r, "too_low_aQual")
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar if co.type == "M")
                else:
                    iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                              if co.type == "M")
            else:
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar
                                  if co.type == "M")
                    else:
                        iv_seq = (invert_strand(co.ref_iv)
                                  for co in r[0].cigar if co.type == "M")
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    if stranded != "reverse":
                        iv_seq = itertools.chain(
                            iv_seq,
                            (invert_strand(co.ref_iv) for co in r[1].cigar
                             if co.type == "M"))
                    else:
                        iv_seq = itertools.chain(
                            iv_seq,
                            (co.ref_iv for co in r[1].cigar
                             if co.type == "M"))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        write_to_samout(r, "not_aligned")
                        notaligned += 1
                        continue
                # Only the NH lookup is guarded: a missing NH tag means
                # "cannot tell", exactly as before.  The bookkeeping below
                # used to sit inside this try, so a KeyError raised there was
                # swallowed and the pair fell through to feature counting.
                try:
                    multimapped = (
                        (r[0] is not None and
                         r[0].optional_field("NH") > 1) or
                        (r[1] is not None and
                         r[1].optional_field("NH") > 1))
                except KeyError:
                    multimapped = False
                if multimapped:
                    nonunique += 1
                    write_to_samout(r, "alignment_not_unique")
                    # MB: Counting the 'alignment_not_unique' for one or
                    # both mates
                    if custom_stat_file:
                        _record_not_unique_pair(anu_dict, r)
                    # endMB
                    continue
                if (r[0] and r[0].aQual < minaqual) or \
                        (r[1] and r[1].aQual < minaqual):
                    lowqual += 1
                    write_to_samout(r, "too_low_aQual")
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode in ("intersection-strict",
                                      "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if len(fs2) > 0 or \
                                    overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")
                if fs is None or len(fs) == 0:
                    write_to_samout(r, "no_feature")
                    empty += 1
                    # MB
                    if custom_stat_file:
                        if is_full_pair(r):
                            empty_p += 1
                        else:
                            empty_s += 1
                    # endMB
                elif len(fs) > 1:
                    write_to_samout(r, "ambiguous[" + '+'.join(fs) + "]")
                    ambiguous += 1
                    # MB
                    if custom_stat_file:
                        if is_full_pair(r):
                            ambiguous_p += 1
                        else:
                            ambiguous_s += 1
                    # endMB
                else:
                    hit = next(iter(fs))  # the single overlapping feature
                    write_to_samout(r, hit)
                    counts[hit] += 1
                    # MB
                    if custom_stat_file:
                        if counts[hit] == 1:
                            assigned_genes += 1
                        assigned_reads += 1
                        if is_full_pair(r):
                            assigned_reads_p += 1
                        else:
                            assigned_reads_s += 1
                    # endMB
            except UnknownChrom:
                if not pe_mode:
                    rr = r
                else:
                    rr = r[0] if r[0] is not None else r[1]
                # MB
                if custom_stat_file:
                    skipped += 1
                # endMB
                if not quiet:
                    sys.stderr.write(
                        ("Warning: Skipping read '%s', because chromosome " +
                         "'%s', to which it has been aligned, did not appear "
                         "in the GFF file.\n") % (rr.read.name, iv.chrom))

            if i % 100000 == 0 and not quiet:
                sys.stderr.write(
                    "%d sam %s processed.\n" %
                    (i, "lines " if not pe_mode else "line pairs"))

    except:
        sys.stderr.write("Error occured in %s.\n" %
                         sam_reader.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d sam %s processed.\n" %
                         (i, "lines " if not pe_mode else "line pairs"))

    if samoutfile is not None:
        samoutfile.close()

    for fn in sorted(counts):
        print("%s\t%d" % (fn, counts[fn]))
    print("no_feature\t%d" % empty)
    print("ambiguous\t%d" % ambiguous)
    print("too_low_aQual\t%d" % lowqual)
    print("not_aligned\t%d" % notaligned)
    print("alignment_not_unique\t%d" % nonunique)

    # MB: Adding stats in the custom_stat file
    if custom_stat_file:

        def fmt(number):
            return "{:,}".format(number)

        if sam_filename != "-":
            with open(sam_filename) as sam_handle:
                input_lines = fmt(sum(1 for line in sam_handle
                                      if not line.startswith('@')))
        else:
            # stdin has been consumed and cannot be counted a second time
            input_lines = "n/a (read from stdin)"

        # Counting the 'alignment_not_unique' with one or both mates
        # multiply aligned
        simpl = 0
        multipl = 0
        for entry in anu_dict.values():
            if 'al_unique1' in entry and 'al_unique2' in entry and \
                    (entry['al_unique1'] or entry['al_unique2']):
                simpl += 1
            else:
                multipl += 1

        write = custom_stat_file.write
        write("Input SAM file line count\t" + input_lines + "\n\n")
        write("SAM lines (pairs or singles) processed\t" + fmt(i) + "\n\n")
        write("Skipped pairs (chr.not found)\t" + fmt(skipped) + "\n\n")
        write("Assigned_genes\t" + fmt(assigned_genes) + "\n\n")
        write("Assigned_reads\t" + fmt(assigned_reads) + "\n")
        write("\tSingle reads\t" + fmt(assigned_reads_s) + "\n")
        write("\tPaired reads\t" + fmt(assigned_reads_p) + "\n\n")
        write("No_features\t" + fmt(empty) + "\n")
        write("\tSingle reads\t" + fmt(empty_s) + "\n")
        write("\tPaired reads\t" + fmt(empty_p) + "\n\n")
        write("Ambiguous\t" + fmt(ambiguous) + "\n")
        write("\tSingle reads\t" + fmt(ambiguous_s) + "\n")
        write("\tPaired reads\t" + fmt(ambiguous_p) + "\n\n")
        write("Alignment_not_unique\t" + fmt(nonunique) + "\n")
        write("\tSAM lines (pairs or singles)\t" + fmt(len(anu_dict)) + "\n")
        write("\tOne_mate_uniquely_mapped\t" + fmt(simpl) + "\n")
        write("\tTwo_mates_multiply_mapped\t" + fmt(multipl) + "\n")
        custom_stat_file.close()
def count_PE_reads(sam_files, labels, regions, file_type="sam",
                   use_chrom_name=False, order="name"):
    """Count fragments (paired-end read pairs) per region for every file.

    Args:
        sam_files: paths of the SAM/BAM files to process.
        labels: one label per file; only its length is checked here.
        regions: indexed by an alignment interval, the result's ``steps()``
            must yield ``(interval, set_of_region_names)`` pairs
            (presumably an ``HTSeq.GenomicArrayOfSets`` -- confirm with the
            caller).  Unused when ``use_chrom_name`` is true.
        file_type: ``"sam"`` selects ``HTSeq.SAM_Reader``; any other value
            selects ``HTSeq.BAM_Reader``.
        use_chrom_name: count by reference name instead of by overlap; a
            pair is counted under its reference name when both mates share
            it, and as ``"_no_feature"`` otherwise.
        order: ``"name"`` pairs adjacent records; any other value uses the
            buffered pairing for position-sorted input.

    Returns:
        A list with one ``collections.Counter`` per input file.  Keys are
        region (or reference) names plus ``"_unmapped"`` and
        ``"_no_feature"``.  A pair whose mates share several regions is
        counted once for each shared region.

    Raises:
        AssertionError: if ``sam_files`` and ``labels`` differ in length.
    """
    assert len(sam_files) == len(labels), \
        "sam_files and labels must have the same length"

    if use_chrom_name:
        print("INFO: Running in mode for counting per chromosome name.")

    # one counter (default zero counts) per input file
    all_counts = [collections.Counter() for _ in sam_files]

    for sam_file, file_counts in zip(sam_files, all_counts):
        print("INFO: Start to count reads in %s ..." % (sam_file,))

        if file_type == "sam":
            almnt_file = HTSeq.SAM_Reader(sam_file)
        else:
            almnt_file = HTSeq.BAM_Reader(sam_file)

        # pair alignment records according to PE pairs
        if order == "name":
            print("INFO: Assuming SAM/BAM file ordered by read name.")
            alignment_iterator = HTSeq.pair_SAM_alignments(almnt_file)
        else:
            print("INFO: Assuming SAM/BAM file ordered by position")
            alignment_iterator = HTSeq.pair_SAM_alignments_with_buffer(
                almnt_file, max_buffer_size=100 * 3000000)

        for first_almnt, second_almnt in alignment_iterator:
            # Both mates must be present and aligned.  Identity test: a
            # missing mate is the None singleton, and ``== None`` would
            # dispatch to whatever __eq__ the alignment class defines.
            if (first_almnt is None or second_almnt is None or
                    not (first_almnt.aligned and second_almnt.aligned)):
                file_counts["_unmapped"] += 1
                continue

            if use_chrom_name:
                # potential speed up for transcript fragments as reference
                if first_almnt.iv.chrom == second_almnt.iv.chrom:
                    file_counts[first_almnt.iv.chrom] += 1
                else:
                    file_counts["_no_feature"] += 1
                continue

            # collect the names of all regions overlapping each mate
            gene_ids_first = set()
            gene_ids_second = set()
            for _, val in regions[first_almnt.iv].steps():
                gene_ids_first |= val
            for _, val in regions[second_almnt.iv].steps():
                gene_ids_second |= val

            # keep only the regions hit by both mates
            gene_ids = gene_ids_first & gene_ids_second
            if not gene_ids:
                file_counts["_no_feature"] += 1
            else:
                # every shared region gets the pair, not just a unique one
                for gene_id in gene_ids:
                    file_counts[gene_id] += 1

    return all_counts
def count_reads_paired(read_seq, forward_counter, reverse_counter, order,
                       quiet, minaqual, write_to_samout):
    """Pair up SAM records and feed each pair to the strand counters.

    ``read_seq`` is paired by name (``order == "name"``) or through the
    buffered pairing (``order == "pos"``); any other ``order`` raises
    ``ValueError``.  Either counter may be ``None``, in which case it is
    skipped.  For every pair:

    * neither mate aligned -> ``__not_aligned``, ``notaligned`` bumped;
    * an ``NH`` tag above 1 on a mate -> ``__alignment_not_unique``,
      ``nonunique`` bumped (a missing tag raises ``KeyError`` and skips the
      check);
    * a mate with ``aQual`` below ``minaqual`` -> ``__too_low_aQual``,
      ``lowqual`` bumped;
    * otherwise each counter's ``count`` receives the pair's "M" blocks of
      positive size: the forward counter sees mate 1 as is and mate 2
      strand-inverted, the reverse counter the opposite.

    Progress goes to stderr unless ``quiet`` is set.
    """
    if order not in ("name", "pos"):
        raise ValueError("Illegal order specified.")
    if order == "name":
        pairs = HTSeq.pair_SAM_alignments(read_seq)
    else:
        pairs = HTSeq.pair_SAM_alignments_with_buffer(read_seq)

    # Counters actually in use, forward first (order of the attribute bumps)
    active = [c for c in (forward_counter, reverse_counter) if c is not None]
    want_forward = forward_counter is not None
    want_reverse = reverse_counter is not None

    def match_blocks(almnt, flip):
        # Lazy sequence of the reference intervals of the "M" operations.
        ops = almnt.cigar
        if flip:
            return (invert_strand(op.ref_iv) for op in ops
                    if op.type == "M" and op.size > 0)
        return (op.ref_iv for op in ops if op.type == "M" and op.size > 0)

    seen = 0
    for pair in pairs:
        if not quiet and seen > 0 and seen % 100000 == 0:
            sys.stderr.write(
                "%d SAM alignment record pairs processed.\n" % (seen))
        seen += 1

        mate1 = pair[0]
        mate2 = pair[1]

        fwd_ivs = tuple()
        rev_ivs = tuple()
        if mate1 is not None and mate1.aligned:
            if want_forward:
                fwd_ivs = match_blocks(mate1, False)
            if want_reverse:
                rev_ivs = match_blocks(mate1, True)

        if mate2 is not None and mate2.aligned:
            if want_forward:
                fwd_ivs = itertools.chain(fwd_ivs, match_blocks(mate2, True))
            if want_reverse:
                rev_ivs = itertools.chain(rev_ivs, match_blocks(mate2, False))
        elif (mate1 is None) or not (mate1.aligned):
            write_to_samout(pair, "__not_aligned")
            for counter in active:
                counter.notaligned += 1
            continue

        try:
            multi_hit = (
                (mate1 is not None and mate1.optional_field("NH") > 1) or
                (mate2 is not None and mate2.optional_field("NH") > 1))
        except KeyError:
            multi_hit = False
        if multi_hit:
            for counter in active:
                counter.nonunique += 1
            write_to_samout(pair, "__alignment_not_unique")
            continue

        if (mate1 and mate1.aQual < minaqual) or \
                (mate2 and mate2.aQual < minaqual):
            for counter in active:
                counter.lowqual += 1
            write_to_samout(pair, "__too_low_aQual")
            continue

        if want_forward:
            forward_counter.count(fwd_ivs, pair)
        if want_reverse:
            reverse_counter.count(rev_ivs, pair)

    if not quiet:
        sys.stderr.write("%d SAM alignment pairs processed.\n" % (seen))
# count_reads_in_features -- variant that also emits readName -> geneNames
# mappings (the docstring below calls the surrounding tool "peakRescue").
#
# What the code below does, in order:
#   * opens both mapping files (`filename_read_names_gene_names`,
#     `filename_read_names_gene_names_amb_unique`) for writing, and the
#     `samout` file when `samout` is not "";
#   * reads the GFF: every record of `feature_type` is added to `features`
#     (by id) and to `exons` (the record itself), and the per-gene count
#     dicts are initialised through `initialise_counts_per_feature`;
#   * reads the SAM input; single-end input is switched to the paired code
#     path by wrapping every record as `(r, None)` (`pe_mode_for_SE`);
#   * records carrying an `NH` tag above 1 go through
#     `is_read_in_gene_interval` and the `dict_nonunique` /
#     `dict_read_name_genes_names` bookkeeping, and the accumulated mapping
#     is written to the first mapping file whenever the read name changes;
#   * other records are resolved with `overlap_mode`; a record hitting
#     several genes is either disambiguated or written to the second
#     mapping file;
#   * finally prints one line per gene (unique, non-unique and ambiguous
#     counts) followed by the summary counters.
#
# Names used but not defined in this function: `debug`,
# `tag_report_instances_same_multiread_on_same_gene`, `UnknownChrom`,
# `invert_strand`, `initialise_counts_per_feature`,
# `is_read_in_gene_interval`, `add_unique_counts_per_feature`,
# `add_unique_counts_per_feature_ambiguous`,
# `add_non_unique_counts_per_feature`, `_populate_read_name_gene_name`,
# `initalize_read_name_and_interval` and
# `_add_non_unique_counts_per_feature_based_on_read_interval_and_readname`.
#
# The code is Python 2 (print statements, `.next()`, `dict.keys()[0]`).
#
# NOTE(review): in this copy the statements and `#` comments of the function
# are joined onto a few very long physical lines, so the original
# indentation -- and with it the nesting of the multi-mapped and
# disambiguation branches -- cannot be read off the text.  It is also
# ambiguous whether fragments such as `# flag_ambiguous = 0 #`,
# `# read_seq_pe_file = read_seq` and `# (temp_read_name, ...) =
# initalize_read_name_and_interval(...)` are live statements following an
# empty trailing comment or commented-out code.  Restore the formatted
# source from version control before running or modifying this function.
# TODO confirm against the upstream file.
def count_reads_in_features( sam_filename, gff_filename, stranded, overlap_mode, feature_type, id_attribute, quiet, minaqual, samout, \ filename_read_names_gene_names,filename_read_names_gene_names_amb_unique): """ Main function to count reads in features i.e. genes. Input: + sam_filename: Input alignment with all the ambiguously mapped reads + gff_filename: GTF containing all genes for a given species + stranded: specify whether data are stranded - see -s option + overlap_mode: mode to handle reads overlapping more than one feature (e.g. union) - See -m option: choices = ( "union", "intersection-strict", "intersection-nonempty") + feature_type: see -t option + id_attribute: see -i option + quiet: see -q option + minaqual: see -a option + samout: SAM output file storing disambiguated reads (see -o option). + filename_read_names_gene_names: filename for the output file containing the mappings readName to geneNames for multimapped reads + filename_read_names_gene_names_amb_unique: filename for the output file containing the mappings readName to geneNames for ambiguously mapped reads Output: + Writes readName to geneName outputs. + Writes SAM output file for ddisambiguated uniquely mapped reads. + Writes to stdout the genes and their read counts with read count for distinct read type: non-ambiguous unique, multimapped and ambiguous unique. This output redirected and stored to an output file in main peakRescue pipeline. This output is used in the later stage of the peakRescue pipeline to rescue the reads present in the readName to genNames mappings. 
""" # Output filhandles for readName to geneNames mappings fh_read_names_gene_names = open(filename_read_names_gene_names, 'w') fh_read_names_gene_names_amb_unique = open( filename_read_names_gene_names_amb_unique, 'w') def write_to_samout(r, assignment): if samoutfile is None: return if not pe_mode: r = (r, ) for read in r: if read is not None: samoutfile.write(read.original_sam_line.rstrip() + "\tXF:Z:" + assignment + "\n") if quiet: warnings.filterwarnings(action="ignore", module="HTSeq") if samout != "": samoutfile = open(samout, "w") else: samoutfile = None features = HTSeq.GenomicArrayOfSets("auto", stranded != "no") ## Hash table to store unique reads per exon (if modified GTF) counts = {} ## Hash table to store original non unique reads per gene (without dict_nonunique = {} ## Hash table to store all unique reads as per original GTF dict_gene_unique_counts = {} ## hast table to store ambigouous read count for unique reads... dict_gene_unique_counts_ambiguous = {} ## Hash table to store all non-unique reads including shared reads ## (either split reads or read pair matching on two distinct exons, same gene) dict_gene_nonunique_counts = {} ## Hash to store the non-unique read-names as key and genes names as values (fragments) dict_read_name_genes_names = {} ## Hash to store the non-unique read-names as key and genes names as values (fragments) including instances of a given multimapped read on same gene dict_read_name_genes_names_final = {} dict_read_name_genes_names_ambiguous = {} ## @todo: tag_gff - parameter to be removed - only deal with gene level information ## tag_gff: type to specify whether it contains gene or exons information tag_gff = "gene_gff" # Try to open samfile and fail early in case it is not there if sam_filename != "-": open(sam_filename).close() gff = HTSeq.GFF_Reader(gff_filename) exons = HTSeq.GenomicArrayOfSets("auto", stranded=False) i = 0 try: for f in gff: if f.type == feature_type: exons[f.iv] += f # added to get exon interval 
data try: feature_id = f.attr[id_attribute] except KeyError: sys.exit("Feature %s does not contain a '%s' attribute" % (f.name, id_attribute)) if stranded != "no" and f.iv.strand == ".": sys.exit( "Feature %s at %s does not have strand information but you are " "running htseq-count in stranded mode. Use '--stranded=no'." % (f.name, f.iv)) features[f.iv] += feature_id counts[f.attr[id_attribute]] = 0 # -- Initialisation feature_name = f.attr[id_attribute] # -- Added tag_gff for GFF type if tag_gff == "gene_gff": # Original GTF (genes) dict_nonunique = initialise_counts_per_feature( dict_nonunique, feature_name) dict_gene_unique_counts = initialise_counts_per_feature( dict_gene_unique_counts, feature_name) dict_gene_nonunique_counts = initialise_counts_per_feature( dict_gene_nonunique_counts, feature_name) dict_gene_unique_counts_ambiguous = initialise_counts_per_feature( dict_gene_unique_counts_ambiguous, feature_name) i += 1 if i % 100000 == 0 and not quiet: sys.stderr.write("%d GFF lines processed.\n" % i) except: sys.stderr.write("Error occured in %s.\n" % gff.get_line_number_string()) raise if not quiet: sys.stderr.write("%d GFF lines processed.\n" % i) if len(counts) == 0 and not quiet: sys.stderr.write("Warning: No features of type '%s' found.\n" % feature_type) try: if sam_filename != "-": read_seq = HTSeq.SAM_Reader(sam_filename) first_read = iter(read_seq).next() else: read_seq = iter(HTSeq.SAM_Reader(sys.stdin)) first_read = read_seq.next() read_seq = itertools.chain([first_read], read_seq) pe_mode = first_read.paired_end #pe_mode = 1 ## Added by us except: sys.stderr.write( "Error occured when reading first line of sam file.\n") raise ################################################################################################### try: if pe_mode: read_seq_pe_file = read_seq read_seq = HTSeq.pair_SAM_alignments(read_seq) empty = 0 ambiguous = 0 ambiguous_tag = 0 notaligned = 0 lowqual = 0 nonunique = 0 nonunique_nonamb_to_be_rescued = 0 temp_read_name = 
"NA" previous_read_name = "NA" temp_interval_r0 = "NA" temp_interval_r1 = "NA" counter_fragment = 0 flag_result = 0 i = 0 pe_mode_for_SE = 0 ## -- Added pe_mode on for SE files so that multireads reads will be accounted for if not pe_mode: # real SE pe_mode_for_SE = 1 # read_seq_pe_file = read_seq pe_mode = 1 ## -- End index_fragment = 0 for r in read_seq: prev_index_fragment = index_fragment tag_nonunique_NH = 0 tag_overlapping_genes = 0 flag_aln_not_unique = 0 # flag_ambiguous = 0 # #-- LOOP OVER ALL READS IN INPUT BAM FILE if pe_mode_for_SE: r = (r, None) counter_fragment += 1 i += 1 if not pe_mode: # -- SINGLE_END mode if not r.aligned: notaligned += 1 #write_to_samout( r, "not_aligned" ) continue try: if r.optional_field("NH") > 1: # --- Rescue multimappers in singel-end mode #write_to_samout( r, "alignment_not_unique" ) #nonunique += 1 continue except KeyError: pass if r.aQual < minaqual: lowqual += 1 #write_to_samout( r, "too_low_aQual" ) continue if stranded != "reverse": iv_seq = (co.ref_iv for co in r.cigar if co.type == "M") else: iv_seq = (invert_strand(co.ref_iv) for co in r.cigar if co.type == "M") else: # -- PAIRED-END if r[0] is not None and r[0].aligned: if stranded != "reverse": iv_seq = (co.ref_iv for co in r[0].cigar if co.type == "M") else: iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar if co.type == "M") else: iv_seq = tuple() if r[1] is not None and r[1].aligned: if stranded != "reverse": iv_seq = itertools.chain( iv_seq, (invert_strand(co.ref_iv) for co in r[1].cigar if co.type == "M")) else: iv_seq = itertools.chain( iv_seq, (co.ref_iv for co in r[1].cigar if co.type == "M")) else: if (r[0] is None) or not (r[0].aligned): #write_to_samout( r, "not_aligned" ) notaligned += 1 continue try: if (( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \ ( r[1] is not None and r[1].optional_field( "NH" ) > 1 )): tag_nonunique_NH = 1 if (r[0] is not None and r[1] is None): result, fs_genes, fs_exons, dict_read_name_genes_names, 
ambiguous_tag = is_read_in_gene_interval( r[0], features, dict_read_name_genes_names, ambiguous_tag, exons) if result: flag_result = 1 (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[0], \ temp_interval_r0, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names) else: if len(fs_genes) != 0: (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[0], \ temp_interval_r0, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names) if (r[0] is None and r[1] is not None): result, fs_genes, fs_exons, dict_read_name_genes_names, ambiguous_tag = is_read_in_gene_interval( r[1], features, dict_read_name_genes_names, ambiguous_tag, exons) if result: flag_result = 1 (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[1], \ temp_interval_r1, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names) else: if len(fs_genes) != 0: (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[1], \ temp_interval_r1, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names) if (r[0] is not None and r[1] is not None): result1, fs_genes1, fs_exons1, dict_read_name_genes_names, ambiguous_tag = is_read_in_gene_interval( r[0], features, dict_read_name_genes_names, ambiguous_tag, exons) result2, fs_genes2, fs_exons2, dict_read_name_genes_names, ambiguous_tag = is_read_in_gene_interval( r[1], features, dict_read_name_genes_names, ambiguous_tag, exons) if len(fs_genes1.intersection(fs_genes2)) > 0: fs_genes = fs_genes1.intersection(fs_genes2) elif len(fs_genes1.intersection(fs_genes2)) == 0: fs_genes = fs_genes1.union(fs_genes2) if result1 and not result2: flag_result = 1 (dict_nonunique, 
flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[0], \ temp_interval_r0, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names) elif result2 and not result1: flag_result = 1 (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[1], \ temp_interval_r1, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names) else: if len(fs_genes1) != 0 or len(fs_genes2) != 0: flag_result = 1 if ((((temp_interval_r0 != str(r[0].iv)) or (temp_interval_r1 != str(r[1].iv))) or (temp_read_name != r[0].read.name))): (dict_nonunique ) = add_non_unique_counts_per_feature( fs_genes, dict_nonunique) dict_read_name_genes_names = _populate_read_name_gene_name( dict_read_name_genes_names, fs_genes, r[0].read.name, tag_report_instances_same_multiread_on_same_gene ) flag_aln_not_unique = 1 #write_to_samout( r, "alignment_not_unique" ) nonunique += 1 if flag_result: if r[0] is not None and r[1] is None: non_uniq_read_name = r[0].read.name elif r[0] is None and r[1] is not None: non_uniq_read_name = r[1].read.name elif r[0] is not None and r[1] is not None: non_uniq_read_name = r[0].read.name non_uniq_read_name2 = dict_read_name_genes_names.keys( )[0] if flag_aln_not_unique: nonunique_nonamb_to_be_rescued += 1 # -- Re-initialise hash # previous_read_name: read which falls into at least one gene interval # tmp_read_name: the previous read in the bam file # BAM is sorted by read name hence each multimapper will be arranged one after another if previous_read_name == "NA": previous_read_name = non_uniq_read_name if non_uniq_read_name != previous_read_name: if previous_read_name in dict_read_name_genes_names.keys( ): fs_genes_names = dict_read_name_genes_names[ previous_read_name] fh_read_names_gene_names.write( "%s\t%s\n" % (previous_read_name, "\t".join( list(fs_genes_names)))) previous_read_name = 
non_uniq_read_name tmp_dict = {} if non_uniq_read_name in dict_read_name_genes_names.keys( ): #print "non_uniq_read_name IN dict_read_name_genes_names.keys()" tmp_dict[ non_uniq_read_name] = dict_read_name_genes_names[ non_uniq_read_name] dict_read_name_genes_names.clear( ) # only one read stored dict_read_name_genes_names = tmp_dict flag_result = 0 flag_aln_not_unique = 0 # (temp_read_name, temp_interval_r0, temp_interval_r1) = initalize_read_name_and_interval( r[0], r[1]) continue # except KeyError: except KeyError: pass if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual): lowqual += 1 #write_to_samout( r, "too_low_aQual" ) continue try: # -- if overlap_mode == "union": fs = set() for iv in iv_seq: # interval from bam file for each fragment if iv.chrom not in features.chrom_vectors: raise UnknownChrom for iv2, fs2 in features[iv].steps(): #if debug: #print "****Unique_feature %s and feature_interval %s" %(fs2,iv2) fs = fs.union(fs2) elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty": fs = None for iv in iv_seq: if iv.chrom not in features.chrom_vectors: raise UnknownChrom for iv2, fs2 in features[iv].steps(): if len( fs2 ) > 0 or overlap_mode == "intersection-strict": if fs is None: fs = fs2.copy() else: fs = fs.intersection(fs2) else: sys.exit("Illegal overlap mode.") fs_genes = fs if fs_genes is None or len(fs_genes) == 0: #write_to_samout( r, "no_feature" ) empty += 1 # ambiguous read count and/or one of the read pair mapping on different gene (potential gene fusion events)... 
# elif len( fs ) > 1: elif len(fs_genes) > 1: ############################################################### ## AMBIGUOUS UNIQUE ############################################################### is_disambiguated = 0 if not tag_nonunique_NH: if (r[0] is not None and r[1] is None): result, fs_genes, fs_exons, dict_read_name_genes_names_ambiguous, ambiguous_tag = is_read_in_gene_interval( r[0], features, dict_read_name_genes_names_ambiguous, ambiguous_tag, exons) if result: (dict_gene_unique_counts ) = add_unique_counts_per_feature( dict_gene_unique_counts, fs_genes) is_disambiguated = 1 if ambiguous_tag: (dict_gene_unique_counts_ambiguous ) = add_unique_counts_per_feature_ambiguous( fs_genes, dict_gene_unique_counts_ambiguous) flag_ambiguous = 1 # write in the file ambiguous read name gene name data... fh_read_names_gene_names_amb_unique.write( "%s\t%s\n" % (r[0].read.name, "\t".join( list(fs_genes)))) if (r[0] is None and r[1] is not None): result, fs_genes, fs_exons, dict_read_name_genes_names_ambiguous, ambiguous_tag = is_read_in_gene_interval( r[1], features, dict_read_name_genes_names_ambiguous, ambiguous_tag, exons) if result: (dict_gene_unique_counts ) = add_unique_counts_per_feature( dict_gene_unique_counts, fs_genes) is_disambiguated = 1 if ambiguous_tag: (dict_gene_unique_counts_ambiguous ) = add_unique_counts_per_feature_ambiguous( fs_genes, dict_gene_unique_counts_ambiguous) flag_ambiguous = 1 fh_read_names_gene_names_amb_unique.write( "%s\t%s\n" % (r[1].read.name, "\t".join( list(fs_genes)))) if (r[0] is not None and r[1] is not None): result1, fs_genes1, fs_exons, dict_read_name_genes_names_ambiguous, ambiguous_tag1 = is_read_in_gene_interval( r[0], features, dict_read_name_genes_names_ambiguous, ambiguous_tag, exons) result2, fs_genes2, fs_exons, dict_read_name_genes_names_ambiguous, ambiguous_tag2 = is_read_in_gene_interval( r[1], features, dict_read_name_genes_names_ambiguous, ambiguous_tag, exons) if debug: print "IN UNIQUE DISAMBIGUATION 
-->r[0].read.name=%s\t%s\t%s\t%s\t%s\n" % ( r[0].read.name, result1, result2, fs_genes1, fs_genes2) if len(fs_genes1.intersection(fs_genes2)) == 1: fs_genes = fs_genes1.intersection(fs_genes2) (dict_gene_unique_counts ) = add_unique_counts_per_feature( dict_gene_unique_counts, fs_genes) is_disambiguated = 1 elif len(fs_genes1.intersection(fs_genes2)) > 1: fs_genes = fs_genes1.intersection(fs_genes2) (dict_gene_unique_counts_ambiguous ) = add_unique_counts_per_feature_ambiguous( fs_genes, dict_gene_unique_counts_ambiguous) flag_ambiguous = 1 fh_read_names_gene_names_amb_unique.write( "%s\t%s\n" % (r[0].read.name, "\t".join( list(fs_genes)))) elif len(fs_genes1.intersection(fs_genes2)) == 0: fs_genes = fs_genes1.union(fs_genes2) if (fs_genes1 == set([]) or fs_genes2 == set( [])) and len(fs_genes) == 1: ## Disambiguate the uniquely mapped to the single gene it maps on (dict_gene_unique_counts ) = add_unique_counts_per_feature( dict_gene_unique_counts, fs_genes) is_disambiguated = 1 elif (fs_genes1 != set([]) or fs_genes2 != set([])): ## Add fragment to the RN-GN for ambiguous uniquely mapped based on ## union of both fs_genes (fs_genes1 & fs_genes2) > 1 ( dict_gene_unique_counts_ambiguous ) = add_unique_counts_per_feature_ambiguous( fs_genes, dict_gene_unique_counts_ambiguous) flag_ambiguous = 1 fh_read_names_gene_names_amb_unique.write( "%s\t%s\n" % (r[0].read.name, "\t".join( list(fs_genes)))) if flag_ambiguous: ambiguous += 1 #write_to_samout( r, "ambiguous[" + '+'.join( fs ) + "]" ) if is_disambiguated: write_to_samout(r, list(fs_genes)[0]) else: if debug: #print "DEBUG::CR:: len(fs) <-> 1:: fs = %s" %fs pass write_to_samout(r, list(fs)[0]) rr2 = r[0] if r[0] is not None else r[1] if not tag_nonunique_NH: (dict_gene_unique_counts ) = add_unique_counts_per_feature( dict_gene_unique_counts, fs_genes) except UnknownChrom: if not pe_mode: rr = r else: rr = r[0] if r[0] is not None else r[1] if not quiet: sys.stderr.write(( "Warning: Skipping read '%s', because 
chromosome " + "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) % (rr.read.name, iv.chrom)) if i % 100000 == 0 and not quiet: sys.stderr.write( "%d sam %s processed.\n" % (i, "lines " if not pe_mode else "line pairs")) flag_ambiguous = 0 ## re-initialise.... index_fragment += 1 ######################### # This is to store the last read/fragment since it will no pass in previous condition: # => if non_uniq_read_name != previous_read_name: # -- At same level as the for loop (outside of the for loop) - column: 7 #fh_read_names_gene_names.close() if dict_read_name_genes_names.keys() != []: #print "dict_read_name_genes_names passing" non_uniq_read_name = dict_read_name_genes_names.keys()[0] fs_genes_names = dict_read_name_genes_names[non_uniq_read_name] fh_read_names_gene_names.write( "%s\t%s\n" % (non_uniq_read_name, "\t".join(list(fs_genes_names)))) # -- fh_read_names_gene_names.close() fh_read_names_gene_names_amb_unique.close() ################################################################################################### #except UnboundLocalError: except AttributeError: #except: if not pe_mode: sys.stderr.write("Error occured in %s.\n" % read_seq.get_line_number_string()) else: sys.stderr.write("Error occured in %s.\n" % read_seq_pe_file.get_line_number_string()) raise if not quiet: sys.stderr.write("%d sam %s processed.\n" % (i, "lines " if not pe_mode else "line pairs")) if samoutfile is not None: samoutfile.close() if tag_gff == "gene_gff": tuples_genenames_exontag = [(fn, fn) for fn in dict_gene_unique_counts.keys()] tuples_genenames_exontag.sort() previous_gene_name = "NA" for gene_name, fn in tuples_genenames_exontag: gene_name = gene_name.strip() fn = fn.strip() if tag_gff == "gene_gff": # if gene_name in dict_gene_unique_counts.keys(): print "%s\t%i\t%i\t%s" % ( fn, dict_gene_unique_counts[gene_name], dict_nonunique[gene_name], dict_gene_unique_counts_ambiguous[gene_name]) else: # -- No non-unique reads for that gene_name print 
"%s\t%i\t%i\t%i" % ( fn, dict_gene_unique_counts[gene_name], 0, dict_gene_unique_counts_ambiguous[gene_name]) # -- Re-initialise gene name previous_gene_name = gene_name print "no_feature\t%d" % empty print "ambiguous\t%d" % ambiguous print "too_low_aQual\t%d" % lowqual print "not_aligned\t%d" % notaligned print "alignment_not_unique\t%d" % nonunique print "nonunique_nonamb_to_be_rescued:\t%d" % nonunique_nonamb_to_be_rescued
def count_reads_in_features(sam_filenames, gff_filename, samtype, order,
                            max_buffer_size, stranded, overlap_mode,
                            multimapped_mode, secondary_alignment_mode,
                            supplementary_alignment_mode, feature_type,
                            id_attribute, additional_attributes, quiet,
                            minaqual, samouts):
    """Count alignments per annotated feature for one or more SAM/BAM files.

    A feature lookup is built from the GFF file, every alignment (or
    alignment pair) of every input file is classified, and a tab-separated
    table is printed to stdout: one row per feature ID (sorted), then the
    rows ``__no_feature``, ``__ambiguous``, ``__too_low_aQual``,
    ``__not_aligned`` and ``__alignment_not_unique``. Each row holds one
    count column per input file.

    Args:
        sam_filenames: Input paths; ``'-'`` makes the reader use
            ``sys.stdin``.
        gff_filename: Annotation file handed to ``HTSeq.GFF_Reader``.
        samtype: ``"sam"`` or ``"bam"``; selects the HTSeq reader class.
        order: ``"name"`` or ``"pos"``; selects the mate-pairing routine
            used for paired-end input.
        max_buffer_size: Forwarded to
            ``HTSeq.pair_SAM_alignments_with_buffer`` when ``order`` is
            ``"pos"``.
        stranded: ``"no"`` builds an unstranded feature array. With
            ``"reverse"`` the single read / first mate is strand-inverted;
            with any other value the second mate is strand-inverted.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        multimapped_mode: ``"none"`` skips alignments with ``NH > 1`` and
            counts only unambiguous assignments; ``"all"`` counts every
            overlapped feature.
        secondary_alignment_mode: ``"ignore"`` skips records flagged as
            not primary.
        supplementary_alignment_mode: ``"ignore"`` skips records flagged
            as supplementary.
        feature_type: GFF feature type to keep (others are skipped).
        id_attribute: GFF attribute used as the feature ID.
        additional_attributes: Extra GFF attribute names printed after the
            feature ID (``''`` when a feature lacks one).
        quiet: When true, progress messages on stderr are suppressed.
        minaqual: Alignments with ``aQual`` below this are tallied as
            ``__too_low_aQual``.
        samouts: List of annotated-output paths, one per input file, or
            ``[]`` for none.

    Raises:
        ValueError: Unknown ``samtype``; mismatched number of input and
            output files; a feature without ``id_attribute``; a strandless
            feature in stranded mode; an illegal ``order`` for paired-end
            input.
        SystemExit: Illegal ``overlap_mode``, or an illegal
            ``multimapped_mode`` once a read overlaps a feature.
    """

    def write_to_samout(r, assignment, samoutfile):
        # Tag the record(s) with XF:<assignment> and echo them to the
        # annotated output, if one was requested.
        if samoutfile is None:
            return
        # ``pe_mode`` comes from the enclosing scope; it is set for each
        # input file before any of its records is classified.
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                read.optional_fields.append(('XF', assignment))
                samoutfile.write(read.get_sam_line() + "\n")

    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
        samname = 'SAM'
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
        samname = 'BAM'
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    if samouts != []:
        if len(samouts) != len(sam_filenames):
            raise ValueError(
                'Select the same number of {:} input and output files'.format(
                    samname))
        # Try to open samout files early in case any of them has issues
        for samout in samouts:
            with open(samout, 'w'):
                pass

    # Try to open samfiles to fail early in case any of them is not there
    if (len(sam_filenames) != 1) or (sam_filenames[0] != '-'):
        for sam_filename in sam_filenames:
            with open(sam_filename):
                pass

    # CIGAR match characters (alignment match, sequence match and
    # sequence mismatch)
    com = ('M', '=', 'X')

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    gff = HTSeq.GFF_Reader(gff_filename)
    counts = {}
    attributes = {}
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError(
                        "Feature %s does not contain a '%s' attribute" %
                        (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'."
                        % (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0
                attributes[feature_id] = [
                    f.attr[attr] if attr in f.attr else ''
                    for attr in additional_attributes]
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
                sys.stderr.flush()
    except:
        # Report where parsing stopped, then let the error propagate.
        sys.stderr.write(
            "Error occured when processing GFF file (%s):\n" %
            gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)
        sys.stderr.flush()

    if len(counts) == 0:
        sys.stderr.write(
            "Warning: No features of type '%s' found.\n" % feature_type)

    counts_all = []
    empty_all = []
    ambiguous_all = []
    notaligned_all = []
    lowqual_all = []
    nonunique_all = []
    for isam, sam_filename in enumerate(sam_filenames):
        if samouts != []:
            samoutfile = open(samouts[isam], 'w')
        else:
            samoutfile = None

        # The finally clause guarantees the annotated output is closed
        # even when reading or classifying raises.
        try:
            try:
                if sam_filename == "-":
                    read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                else:
                    read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq_iter = iter(read_seq_file)
                # Catch empty BAM files. Only ordinary exceptions are
                # treated as "no reads"; interrupts still propagate.
                try:
                    first_read = next(read_seq_iter)
                    pe_mode = first_read.paired_end
                except Exception:
                    first_read = None
                    pe_mode = False
                if first_read is not None:
                    # Put the peeked record back in front of the stream.
                    read_seq = itertools.chain([first_read], read_seq_iter)
                else:
                    read_seq = []
            except:
                sys.stderr.write(
                    "Error occured when reading beginning of {:} file.\n".format(
                        samname))
                raise

            try:
                if pe_mode:
                    if ((supplementary_alignment_mode == 'ignore') and
                            (secondary_alignment_mode == 'ignore')):
                        primary_only = True
                    else:
                        primary_only = False
                    if order == "name":
                        read_seq = HTSeq.pair_SAM_alignments(
                            read_seq,
                            primary_only=primary_only)
                    elif order == "pos":
                        read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                            read_seq,
                            max_buffer_size=max_buffer_size,
                            primary_only=primary_only)
                    else:
                        raise ValueError("Illegal order specified.")
                empty = 0
                ambiguous = 0
                notaligned = 0
                lowqual = 0
                nonunique = 0
                i = 0
                for r in read_seq:
                    if i > 0 and i % 100000 == 0 and not quiet:
                        sys.stderr.write(
                            "%d %s alignment record%s processed.\n" %
                            (i, samname, "s" if not pe_mode else " pairs"))
                        sys.stderr.flush()

                    i += 1
                    if not pe_mode:
                        if not r.aligned:
                            notaligned += 1
                            write_to_samout(r, "__not_aligned", samoutfile)
                            continue
                        if ((secondary_alignment_mode == 'ignore') and
                                r.not_primary_alignment):
                            continue
                        if ((supplementary_alignment_mode == 'ignore') and
                                r.supplementary):
                            continue
                        try:
                            if r.optional_field("NH") > 1:
                                nonunique += 1
                                write_to_samout(
                                    r, "__alignment_not_unique", samoutfile)
                                if multimapped_mode == 'none':
                                    continue
                        except KeyError:
                            # No NH tag: nothing known about multiplicity.
                            pass
                        if r.aQual < minaqual:
                            lowqual += 1
                            write_to_samout(r, "__too_low_aQual", samoutfile)
                            continue
                        # Lazily yield the reference interval of every
                        # matching CIGAR operation.
                        if stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r.cigar
                                      if co.type in com and co.size > 0)
                        else:
                            iv_seq = (invert_strand(co.ref_iv)
                                      for co in r.cigar
                                      if (co.type in com and co.size > 0))
                    else:
                        if r[0] is not None and r[0].aligned:
                            if stranded != "reverse":
                                iv_seq = (co.ref_iv for co in r[0].cigar
                                          if co.type in com and co.size > 0)
                            else:
                                iv_seq = (invert_strand(co.ref_iv)
                                          for co in r[0].cigar
                                          if co.type in com and co.size > 0)
                        else:
                            iv_seq = tuple()
                        if r[1] is not None and r[1].aligned:
                            # The second mate gets the opposite strand
                            # treatment from the first.
                            if stranded != "reverse":
                                iv_seq = itertools.chain(
                                    iv_seq,
                                    (invert_strand(co.ref_iv)
                                     for co in r[1].cigar
                                     if co.type in com and co.size > 0))
                            else:
                                iv_seq = itertools.chain(
                                    iv_seq,
                                    (co.ref_iv for co in r[1].cigar
                                     if co.type in com and co.size > 0))
                        else:
                            # Pair is "not aligned" only when neither mate
                            # is aligned.
                            if (r[0] is None) or not (r[0].aligned):
                                write_to_samout(r, "__not_aligned", samoutfile)
                                notaligned += 1
                                continue
                        if secondary_alignment_mode == 'ignore':
                            if (r[0] is not None) and r[0].not_primary_alignment:
                                continue
                            elif (r[1] is not None) and r[1].not_primary_alignment:
                                continue
                        if supplementary_alignment_mode == 'ignore':
                            if (r[0] is not None) and r[0].supplementary:
                                continue
                            elif (r[1] is not None) and r[1].supplementary:
                                continue
                        try:
                            if ((r[0] is not None and
                                 r[0].optional_field("NH") > 1) or
                                    (r[1] is not None and
                                     r[1].optional_field("NH") > 1)):
                                nonunique += 1
                                write_to_samout(
                                    r, "__alignment_not_unique", samoutfile)
                                if multimapped_mode == 'none':
                                    continue
                        except KeyError:
                            pass
                        if ((r[0] and r[0].aQual < minaqual) or
                                (r[1] and r[1].aQual < minaqual)):
                            lowqual += 1
                            write_to_samout(r, "__too_low_aQual", samoutfile)
                            continue

                    try:
                        if overlap_mode == "union":
                            fs = set()
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    fs = fs.union(fs2)
                        elif overlap_mode in ("intersection-strict",
                                              "intersection-nonempty"):
                            fs = None
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    if ((len(fs2) > 0) or
                                            (overlap_mode == "intersection-strict")):
                                        if fs is None:
                                            fs = fs2.copy()
                                        else:
                                            fs = fs.intersection(fs2)
                        else:
                            sys.exit("Illegal overlap mode.")

                        if fs is None or len(fs) == 0:
                            write_to_samout(r, "__no_feature", samoutfile)
                            empty += 1
                        elif len(fs) > 1:
                            write_to_samout(
                                r, "__ambiguous[" + '+'.join(fs) + "]",
                                samoutfile)
                            ambiguous += 1
                        else:
                            write_to_samout(r, list(fs)[0], samoutfile)

                        if fs is not None and len(fs) > 0:
                            if multimapped_mode == 'none':
                                if len(fs) == 1:
                                    counts[list(fs)[0]] += 1
                            elif multimapped_mode == 'all':
                                for fsi in list(fs):
                                    counts[fsi] += 1
                            else:
                                sys.exit("Illegal multimap mode.")

                    except UnknownChrom:
                        # Reads on a chromosome absent from the annotation
                        # are tallied as having no feature.
                        write_to_samout(r, "__no_feature", samoutfile)
                        empty += 1

            except:
                sys.stderr.write(
                    "Error occured when processing %s input (%s):\n" %
                    (samname, read_seq_file.get_line_number_string()))
                raise

            if not quiet:
                sys.stderr.write(
                    "%d %s %s processed.\n" %
                    (i, samname,
                     "alignments " if not pe_mode else "alignment pairs"))
                sys.stderr.flush()
        finally:
            if samoutfile is not None:
                samoutfile.close()

        # Snapshot this file's counts, then reset the shared dict for the
        # next input file.
        counts_all.append(counts.copy())
        for fn in counts:
            counts[fn] = 0
        empty_all.append(empty)
        ambiguous_all.append(ambiguous)
        lowqual_all.append(lowqual)
        notaligned_all.append(notaligned)
        nonunique_all.append(nonunique)

    # Empty cells keep the special rows aligned with the attribute columns.
    pad = ['' for attr in additional_attributes]
    for fn in sorted(counts.keys()):
        print('\t'.join([fn] + attributes[fn] +
                        [str(c[fn]) for c in counts_all]))
    print('\t'.join(["__no_feature"] + pad + [str(c) for c in empty_all]))
    print('\t'.join(["__ambiguous"] + pad + [str(c) for c in ambiguous_all]))
    print('\t'.join(["__too_low_aQual"] + pad + [str(c) for c in lowqual_all]))
    print('\t'.join(["__not_aligned"] + pad + [str(c) for c in notaligned_all]))
    print('\t'.join(["__alignment_not_unique"] + pad +
                    [str(c) for c in nonunique_all]))
def count_reads_in_features(sam_filenames, gff_filename, samtype, order,
                            max_buffer_size, stranded, overlap_mode,
                            multimapped_mode, secondary_alignment_mode,
                            supplementary_alignment_mode, feature_type,
                            id_attribute, additional_attributes, quiet,
                            minaqual, samouts):
    """Count alignments per annotated feature for one or more SAM/BAM files.

    A feature lookup is built from the GFF file, every alignment (or
    alignment pair) of every input file is classified, and a tab-separated
    table is printed to stdout: one row per feature ID (sorted), then the
    rows ``__no_feature``, ``__ambiguous``, ``__too_low_aQual``,
    ``__not_aligned`` and ``__alignment_not_unique``. Each row holds one
    count column per input file.

    Args:
        sam_filenames: Input paths; ``'-'`` makes the reader use
            ``sys.stdin``.
        gff_filename: Annotation file handed to ``HTSeq.GFF_Reader``.
        samtype: ``"sam"`` or ``"bam"``; selects the HTSeq reader class.
        order: ``"name"`` or ``"pos"``; selects the mate-pairing routine
            used for paired-end input.
        max_buffer_size: Forwarded to
            ``HTSeq.pair_SAM_alignments_with_buffer`` when ``order`` is
            ``"pos"``.
        stranded: ``"no"`` builds an unstranded feature array. With
            ``"reverse"`` the single read / first mate is strand-inverted;
            with any other value the second mate is strand-inverted.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        multimapped_mode: ``"none"`` skips alignments with ``NH > 1`` and
            counts only unambiguous assignments; ``"all"`` counts every
            overlapped feature.
        secondary_alignment_mode: ``"ignore"`` skips records flagged as
            not primary.
        supplementary_alignment_mode: ``"ignore"`` skips records flagged
            as supplementary.
        feature_type: GFF feature type to keep (others are skipped).
        id_attribute: GFF attribute used as the feature ID.
        additional_attributes: Extra GFF attribute names printed after the
            feature ID (``''`` when a feature lacks one).
        quiet: When true, progress messages on stderr are suppressed.
        minaqual: Alignments with ``aQual`` below this are tallied as
            ``__too_low_aQual``.
        samouts: List of annotated-output paths, one per input file, or
            ``[]`` for none.

    Raises:
        ValueError: Unknown ``samtype``; mismatched number of input and
            output files; a feature without ``id_attribute``; a strandless
            feature in stranded mode; an illegal ``order`` for paired-end
            input.
        SystemExit: Illegal ``overlap_mode``, or an illegal
            ``multimapped_mode`` once a read overlaps a feature.
    """

    def write_to_samout(r, assignment, samoutfile):
        # Tag the record(s) with XF:<assignment> and echo them to the
        # annotated output, if one was requested.
        if samoutfile is None:
            return
        # ``pe_mode`` comes from the enclosing scope; it is set for each
        # input file before any of its records is classified.
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                read.optional_fields.append(('XF', assignment))
                samoutfile.write(read.get_sam_line() + "\n")

    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
        samname = 'SAM'
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
        samname = 'BAM'
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    if samouts != []:
        if len(samouts) != len(sam_filenames):
            raise ValueError(
                'Select the same number of {:} input and output files'.format(
                    samname))
        # Try to open samout files early in case any of them has issues
        for samout in samouts:
            with open(samout, 'w'):
                pass

    # Try to open samfiles to fail early in case any of them is not there
    if (len(sam_filenames) != 1) or (sam_filenames[0] != '-'):
        for sam_filename in sam_filenames:
            with open(sam_filename):
                pass

    # CIGAR match characters (alignment match, sequence match and
    # sequence mismatch)
    com = ('M', '=', 'X')

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    gff = HTSeq.GFF_Reader(gff_filename)
    counts = {}
    attributes = {}
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError(
                        "Feature %s does not contain a '%s' attribute" %
                        (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'."
                        % (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0
                attributes[feature_id] = [
                    f.attr[attr] if attr in f.attr else ''
                    for attr in additional_attributes]
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
                sys.stderr.flush()
    except:
        # Report where parsing stopped, then let the error propagate.
        sys.stderr.write(
            "Error occured when processing GFF file (%s):\n" %
            gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)
        sys.stderr.flush()

    if len(counts) == 0:
        sys.stderr.write(
            "Warning: No features of type '%s' found.\n" % feature_type)

    counts_all = []
    empty_all = []
    ambiguous_all = []
    notaligned_all = []
    lowqual_all = []
    nonunique_all = []
    for isam, sam_filename in enumerate(sam_filenames):
        if samouts != []:
            samoutfile = open(samouts[isam], 'w')
        else:
            samoutfile = None

        # The finally clause guarantees the annotated output is closed
        # even when reading or classifying raises.
        try:
            try:
                if sam_filename == "-":
                    read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                else:
                    read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq_iter = iter(read_seq_file)
                # Catch empty BAM files. Only ordinary exceptions are
                # treated as "no reads"; interrupts still propagate.
                try:
                    first_read = next(read_seq_iter)
                    pe_mode = first_read.paired_end
                except Exception:
                    first_read = None
                    pe_mode = False
                if first_read is not None:
                    # Put the peeked record back in front of the stream.
                    read_seq = itertools.chain([first_read], read_seq_iter)
                else:
                    read_seq = []
            except:
                sys.stderr.write(
                    "Error occured when reading beginning of {:} file.\n".format(
                        samname))
                raise

            try:
                if pe_mode:
                    if ((supplementary_alignment_mode == 'ignore') and
                            (secondary_alignment_mode == 'ignore')):
                        primary_only = True
                    else:
                        primary_only = False
                    if order == "name":
                        read_seq = HTSeq.pair_SAM_alignments(
                            read_seq,
                            primary_only=primary_only)
                    elif order == "pos":
                        read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                            read_seq,
                            max_buffer_size=max_buffer_size,
                            primary_only=primary_only)
                    else:
                        raise ValueError("Illegal order specified.")
                empty = 0
                ambiguous = 0
                notaligned = 0
                lowqual = 0
                nonunique = 0
                i = 0
                for r in read_seq:
                    if i > 0 and i % 100000 == 0 and not quiet:
                        sys.stderr.write(
                            "%d %s alignment record%s processed.\n" %
                            (i, samname, "s" if not pe_mode else " pairs"))
                        sys.stderr.flush()

                    i += 1
                    if not pe_mode:
                        if not r.aligned:
                            notaligned += 1
                            write_to_samout(r, "__not_aligned", samoutfile)
                            continue
                        if ((secondary_alignment_mode == 'ignore') and
                                r.not_primary_alignment):
                            continue
                        if ((supplementary_alignment_mode == 'ignore') and
                                r.supplementary):
                            continue
                        try:
                            if r.optional_field("NH") > 1:
                                nonunique += 1
                                write_to_samout(
                                    r, "__alignment_not_unique", samoutfile)
                                if multimapped_mode == 'none':
                                    continue
                        except KeyError:
                            # No NH tag: nothing known about multiplicity.
                            pass
                        if r.aQual < minaqual:
                            lowqual += 1
                            write_to_samout(r, "__too_low_aQual", samoutfile)
                            continue
                        # Lazily yield the reference interval of every
                        # matching CIGAR operation.
                        if stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r.cigar
                                      if co.type in com and co.size > 0)
                        else:
                            iv_seq = (invert_strand(co.ref_iv)
                                      for co in r.cigar
                                      if (co.type in com and co.size > 0))
                    else:
                        if r[0] is not None and r[0].aligned:
                            if stranded != "reverse":
                                iv_seq = (co.ref_iv for co in r[0].cigar
                                          if co.type in com and co.size > 0)
                            else:
                                iv_seq = (invert_strand(co.ref_iv)
                                          for co in r[0].cigar
                                          if co.type in com and co.size > 0)
                        else:
                            iv_seq = tuple()
                        if r[1] is not None and r[1].aligned:
                            # The second mate gets the opposite strand
                            # treatment from the first.
                            if stranded != "reverse":
                                iv_seq = itertools.chain(
                                    iv_seq,
                                    (invert_strand(co.ref_iv)
                                     for co in r[1].cigar
                                     if co.type in com and co.size > 0))
                            else:
                                iv_seq = itertools.chain(
                                    iv_seq,
                                    (co.ref_iv for co in r[1].cigar
                                     if co.type in com and co.size > 0))
                        else:
                            # Pair is "not aligned" only when neither mate
                            # is aligned.
                            if (r[0] is None) or not (r[0].aligned):
                                write_to_samout(r, "__not_aligned", samoutfile)
                                notaligned += 1
                                continue
                        if secondary_alignment_mode == 'ignore':
                            if (r[0] is not None) and r[0].not_primary_alignment:
                                continue
                            elif (r[1] is not None) and r[1].not_primary_alignment:
                                continue
                        if supplementary_alignment_mode == 'ignore':
                            if (r[0] is not None) and r[0].supplementary:
                                continue
                            elif (r[1] is not None) and r[1].supplementary:
                                continue
                        try:
                            if ((r[0] is not None and
                                 r[0].optional_field("NH") > 1) or
                                    (r[1] is not None and
                                     r[1].optional_field("NH") > 1)):
                                nonunique += 1
                                write_to_samout(
                                    r, "__alignment_not_unique", samoutfile)
                                if multimapped_mode == 'none':
                                    continue
                        except KeyError:
                            pass
                        if ((r[0] and r[0].aQual < minaqual) or
                                (r[1] and r[1].aQual < minaqual)):
                            lowqual += 1
                            write_to_samout(r, "__too_low_aQual", samoutfile)
                            continue

                    try:
                        if overlap_mode == "union":
                            fs = set()
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    fs = fs.union(fs2)
                        elif overlap_mode in ("intersection-strict",
                                              "intersection-nonempty"):
                            fs = None
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    if ((len(fs2) > 0) or
                                            (overlap_mode == "intersection-strict")):
                                        if fs is None:
                                            fs = fs2.copy()
                                        else:
                                            fs = fs.intersection(fs2)
                        else:
                            sys.exit("Illegal overlap mode.")

                        if fs is None or len(fs) == 0:
                            write_to_samout(r, "__no_feature", samoutfile)
                            empty += 1
                        elif len(fs) > 1:
                            write_to_samout(
                                r, "__ambiguous[" + '+'.join(fs) + "]",
                                samoutfile)
                            ambiguous += 1
                        else:
                            write_to_samout(r, list(fs)[0], samoutfile)

                        if fs is not None and len(fs) > 0:
                            if multimapped_mode == 'none':
                                if len(fs) == 1:
                                    counts[list(fs)[0]] += 1
                            elif multimapped_mode == 'all':
                                for fsi in list(fs):
                                    counts[fsi] += 1
                            else:
                                sys.exit("Illegal multimap mode.")

                    except UnknownChrom:
                        # Reads on a chromosome absent from the annotation
                        # are tallied as having no feature.
                        write_to_samout(r, "__no_feature", samoutfile)
                        empty += 1

            except:
                sys.stderr.write(
                    "Error occured when processing %s input (%s):\n" %
                    (samname, read_seq_file.get_line_number_string()))
                raise

            if not quiet:
                sys.stderr.write(
                    "%d %s %s processed.\n" %
                    (i, samname,
                     "alignments " if not pe_mode else "alignment pairs"))
                sys.stderr.flush()
        finally:
            if samoutfile is not None:
                samoutfile.close()

        # Snapshot this file's counts, then reset the shared dict for the
        # next input file.
        counts_all.append(counts.copy())
        for fn in counts:
            counts[fn] = 0
        empty_all.append(empty)
        ambiguous_all.append(ambiguous)
        lowqual_all.append(lowqual)
        notaligned_all.append(notaligned)
        nonunique_all.append(nonunique)

    # Empty cells keep the special rows aligned with the attribute columns.
    pad = ['' for attr in additional_attributes]
    for fn in sorted(counts.keys()):
        print('\t'.join([fn] + attributes[fn] +
                        [str(c[fn]) for c in counts_all]))
    print('\t'.join(["__no_feature"] + pad + [str(c) for c in empty_all]))
    print('\t'.join(["__ambiguous"] + pad + [str(c) for c in ambiguous_all]))
    print('\t'.join(["__too_low_aQual"] + pad + [str(c) for c in lowqual_all]))
    print('\t'.join(["__not_aligned"] + pad + [str(c) for c in notaligned_all]))
    print('\t'.join(["__alignment_not_unique"] + pad +
                    [str(c) for c in nonunique_all]))
def count_circrna(args):
    """Count reads or fragments that span the junction of each reference.

    Every reference sequence listed in the SAM/BAM header is treated as one
    junction located at half its length (``LN // 2``). A single-end read, or
    a paired-end fragment, is counted for its reference when it covers that
    position. The per-reference counts are written as a two-column
    tab-separated table.

    Attributes read from ``args``:
        input_file: path ending in ``.sam`` or ``.bam``.
        output_file: destination handed to ``open_file_or_stdout``.
        paired_end: truthy to count fragments from mate pairs.
        min_mapping_quality: alignments below this ``aQual`` are ignored.
        strandness: ``'forward'`` keeps '+' reads (read1 '+' / read2 '-'
            for pairs), ``'reverse'`` keeps '-' reads (read1 '-' /
            read2 '+'); any other value applies no strand filter.

    Raises:
        ValueError: if ``input_file`` has neither extension.
    """
    import HTSeq
    import numpy as np
    import pandas as pd
    from collections import OrderedDict, defaultdict
    from ioutils import open_file_or_stdout

    logger.info('read input BAM/SAM file: ' + args.input_file)
    if args.input_file.endswith('.sam'):
        sam = HTSeq.SAM_Reader(args.input_file)
    elif args.input_file.endswith('.bam'):
        sam = HTSeq.BAM_Reader(args.input_file)
    else:
        raise ValueError('unsupported file extension')

    # extract junction positions from SAM header
    logger.info('extract junction positions')
    junction_positions = OrderedDict()
    for sq in sam.get_header_dict()['SQ']:
        junction_positions[sq['SN']] = sq['LN'] // 2
    # initialize counts (one zero per reference, in header order)
    gene_ids = list(junction_positions.keys())
    counts = pd.Series(np.zeros(len(gene_ids), dtype='int'), index=gene_ids)
    # count reads
    min_mapping_quality = args.min_mapping_quality
    strandness = args.strandness
    if args.paired_end:
        logger.info('count paired-end fragments')
        stats = defaultdict(int)
        for bundle in HTSeq.pair_SAM_alignments(sam, bundle=True):
            stats['total_pairs'] += 1
            # ignore multi-mapped pairs
            if len(bundle) != 1:
                stats['multi_mapping'] += 1
                continue
            read1, read2 = bundle[0]
            # ignore singletons
            if (read1 is None) or (read2 is None):
                stats['singleton'] += 1
                continue
            # ignore unmapped reads
            if not (read1.aligned and read2.aligned):
                stats['unmapped'] += 1
                continue
            # ignore pairs with mapping quality below threshold
            if (read1.aQual < min_mapping_quality) or \
                    (read2.aQual < min_mapping_quality):
                stats['low_mapping_quality'] += 1
                continue
            if (strandness == 'forward') and \
                    (not ((read1.iv.strand == '+') and (read2.iv.strand == '-'))):
                stats['improper_strand'] += 1
                continue
            if (strandness == 'reverse') and \
                    (not ((read1.iv.strand == '-') and (read2.iv.strand == '+'))):
                stats['improper_strand'] += 1
                continue
            # ignore pairs on different chromosomes
            if read1.iv.chrom != read2.iv.chrom:
                stats['diff_chrom'] += 1
                continue
            pos = junction_positions[read1.iv.chrom]
            # Use the span of the whole fragment. Which mate is leftmost
            # depends on the library orientation, so read1.start/read2.end
            # alone is only correct for forward-oriented pairs.
            fragment_start = min(read1.iv.start, read2.iv.start)
            fragment_end = max(read1.iv.end, read2.iv.end)
            if fragment_start < pos <= fragment_end:
                counts[read1.iv.chrom] += 1
        for key, val in stats.items():
            logger.info('{}: {}'.format(key, val))
    else:
        logger.info('count single-end reads')
        for read in sam:
            # ignore unmapped read
            if not read.aligned:
                continue
            # ignore reads with mapping quality below threshold
            if read.aQual < min_mapping_quality:
                continue
            if (strandness == 'forward') and (read.iv.strand == '-'):
                continue
            # Reverse libraries keep '-' reads, mirroring the paired-end
            # rule where read1 must be on '-'.
            if (strandness == 'reverse') and (read.iv.strand == '+'):
                continue
            pos = junction_positions[read.iv.chrom]
            if read.iv.start < pos <= read.iv.end:
                counts[read.iv.chrom] += 1
    # output counts
    logger.info('count fragments: {}'.format(counts.sum()))
    logger.info('write counts to file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as fout:
        counts.to_csv(fout, sep='\t', header=None, index=True, na_rep='NA')
def count_reads_in_features(sam_filename, gff_filename, stranded, overlap_mode,
                            quiet, minaqual, samout):
    """Count alignments per feature for a single SAM file and print a table.

    Setup is delegated to sibling helpers: ``open_sam_output_file``,
    ``check_sam_file``, ``read_gff_file`` and ``read_pe_mode``. Each
    alignment (or mate pair) is then classified and the result is printed
    to stdout: one ``feature<TAB>count`` line per feature with a non-zero
    count (sorted), followed by ``no_feature``, ``ambiguous``,
    ``too_low_aQual``, ``not_aligned`` and ``alignment_not_unique``.

    Args:
        sam_filename: Input alignment file.
        gff_filename: Annotation file passed to ``read_gff_file``.
        stranded: ``"reverse"`` strand-inverts the single read / first
            mate; any other value strand-inverts the second mate. Also
            forwarded to ``read_gff_file``.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``; anything else exits via
            ``sys.exit``.
        quiet: When true, progress and skipped-read warnings on stderr are
            suppressed.
        minaqual: Alignments with ``aQual`` below this are tallied as
            ``too_low_aQual``.
        samout: Passed to ``open_sam_output_file``; the returned handle
            may be ``None``.
    """
    warnings.filterwarnings(action="ignore", module="HTSeq")
    samoutfile = open_sam_output_file(samout)
    # Everything that can fail runs inside try/finally so the annotated
    # output handle is closed on errors too.
    try:
        check_sam_file(sam_filename)
        counts, features = read_gff_file(gff_filename, quiet, stranded)
        pe_mode, read_seq = read_pe_mode(sam_filename)
        try:
            if pe_mode:
                # Keep the raw reader for error reporting; iterate pairs.
                read_seq_pe_file = read_seq
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(samoutfile, r, "not_aligned", pe_mode)
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            write_to_samout(
                                samoutfile, r, "alignment_not_unique", pe_mode)
                            nonunique += 1
                            continue
                    except KeyError:
                        # No NH tag: nothing known about multiplicity.
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(samoutfile, r, "too_low_aQual", pe_mode)
                        continue
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r.cigar
                                  if co.type == "M")
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                                  if co.type == "M")
                else:
                    if r[0] is not None and r[0].aligned:
                        if stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r[0].cigar
                                      if co.type == "M")
                        else:
                            iv_seq = (invert_strand(co.ref_iv)
                                      for co in r[0].cigar if co.type == "M")
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        # The second mate gets the opposite strand
                        # treatment from the first.
                        if stranded != "reverse":
                            iv_seq = itertools.chain(
                                iv_seq,
                                (invert_strand(co.ref_iv)
                                 for co in r[1].cigar if co.type == "M"))
                        else:
                            iv_seq = itertools.chain(
                                iv_seq,
                                (co.ref_iv for co in r[1].cigar
                                 if co.type == "M"))
                    else:
                        # Pair is "not aligned" only when neither mate is
                        # aligned.
                        if (r[0] is None) or not (r[0].aligned):
                            write_to_samout(
                                samoutfile, r, "not_aligned", pe_mode)
                            notaligned += 1
                            continue
                    try:
                        if (r[0] is not None and
                                r[0].optional_field("NH") > 1) or \
                                (r[1] is not None and
                                 r[1].optional_field("NH") > 1):
                            nonunique += 1
                            write_to_samout(
                                samoutfile, r, "alignment_not_unique", pe_mode)
                            continue
                    except KeyError:
                        pass
                    if (r[0] and r[0].aQual < minaqual) or \
                            (r[1] and r[1].aQual < minaqual):
                        lowqual += 1
                        write_to_samout(samoutfile, r, "too_low_aQual", pe_mode)
                        continue

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode == "intersection-strict" or \
                            overlap_mode == "intersection-nonempty":
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if len(fs2) > 0 or \
                                        overlap_mode == "intersection-strict":
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")
                    if fs is None or len(fs) == 0:
                        write_to_samout(samoutfile, r, "no_feature", pe_mode)
                        empty += 1
                    elif len(fs) > 1:
                        write_to_samout(
                            samoutfile, r,
                            "ambiguous[" + '+'.join(fs) + "]", pe_mode)
                        # NOTE(review): this goes to stdout and is
                        # interleaved with the final count table; kept
                        # because downstream consumers may rely on it.
                        print("ambiguous[" + '+'.join(fs) + "]")
                        ambiguous += 1
                    else:
                        write_to_samout(samoutfile, r, list(fs)[0], pe_mode)
                        counts[list(fs)[0]] += 1
                except UnknownChrom:
                    # The read is skipped without being tallied anywhere;
                    # only a warning is emitted.
                    if not pe_mode:
                        rr = r
                    else:
                        rr = r[0] if r[0] is not None else r[1]
                    if not quiet:
                        sys.stderr.write(
                            ("Warning: Skipping read '%s', because chromosome " +
                             "'%s', to which it has been aligned, did not appear in the GFF file.\n") %
                            (rr.read.name, iv.chrom))

                if i % 100000 == 0 and not quiet:
                    sys.stderr.write(
                        "%d sam %s processed.\n" %
                        (i, "lines " if not pe_mode else "line pairs"))

        except:
            # Report where reading stopped, then let the error propagate.
            if not pe_mode:
                sys.stderr.write(
                    "Error occured in %s.\n" %
                    read_seq.get_line_number_string())
            else:
                sys.stderr.write(
                    "Error occured in %s.\n" %
                    read_seq_pe_file.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write(
                "%d sam %s processed.\n" %
                (i, "lines " if not pe_mode else "line pairs"))
    finally:
        if samoutfile is not None:
            samoutfile.close()

    # Only features with a non-zero count are reported.
    for fn in sorted(counts.keys()):
        if counts[fn]:
            print("%s\t%d" % (fn, counts[fn]))
    print("no_feature\t%d" % empty)
    print("ambiguous\t%d" % ambiguous)
    print("too_low_aQual\t%d" % lowqual)
    print("not_aligned\t%d" % notaligned)
    print("alignment_not_unique\t%d" % nonunique)
def output_single_deletion_reads(input_sam_file, library_max_size):
    """Write a table of mate pairs whose clipping pattern suggests a deletion.

    Mates are paired with ``HTSeq.pair_SAM_alignments`` and each mate is
    summarised by the sibling helper ``cigar_analyse``. Only pairs with both
    mates aligned, a span (minus both insert sizes) below
    ``library_max_size`` and at least one clipped mate are considered. One
    tab-separated row is written per selected pair, tagged ``paired``,
    ``paired,multi``, ``paired_clip`` or ``paired_clip,multi``.

    Args:
        input_sam_file: SAM path; the output name is the part before the
            first ``":"`` with ``"single_pos.txt"`` appended.
        library_max_size: exclusive upper bound on the adjusted pair span.

    Returns:
        The path of the file that was written.
    """
    min_flank = 30
    output_file = input_sam_file.split(":")[0] + "single_pos.txt"
    pairs = HTSeq.pair_SAM_alignments(HTSeq.SAM_Reader(input_sam_file))

    with open(output_file, "w") as output_list:
        output_list.write(
            "read_ID\tread_start\tgap_start\tgap_end\tread_end\tdel_size\tother_info\n"
        )

        for pair in pairs:
            mate1, mate2 = pair[0], pair[1]
            # Both mates must be present and aligned.
            if (mate1 is None or not mate1.aligned
                    or mate2 is None or not mate2.aligned):
                continue

            (clip_a, start_a, gap_start_a, gap_end_a,
             end_a, ins_a, mapped_a) = cigar_analyse(mate1)
            (clip_b, start_b, gap_start_b, gap_end_b,
             end_b, ins_b, mapped_b) = cigar_analyse(mate2)

            frag_start = min(start_a, start_b)
            frag_end = max(end_a, end_b)

            if frag_end - frag_start - ins_a - ins_b >= library_max_size:
                continue
            if clip_a + clip_b <= 0:
                # Neither mate is clipped.
                continue

            if clip_a * clip_b == 0:
                # Exactly one mate is clipped: take its values as they are.
                if clip_a > 0:
                    ins, gap_start, gap_end, mapped = (
                        ins_a, gap_start_a, gap_end_a, mapped_a)
                else:
                    ins, gap_start, gap_end, mapped = (
                        ins_b, gap_start_b, gap_end_b, mapped_b)
                allow_multi = clip_a + clip_b > 1
                labels = ("paired", "paired,multi")
            else:
                # Both mates are clipped: merge their values.
                ins = ins_a + ins_b
                gap_start = min(gap_start_a, gap_start_b)
                gap_end = max(gap_end_a, gap_end_b)
                mapped = mapped_a + mapped_b
                allow_multi = clip_a > 1 or clip_b > 1
                labels = ("paired_clip", "paired_clip,multi")

            flanks_ok = ((gap_start - frag_start) >= min_flank
                         and (frag_end - gap_end) >= min_flank)
            if flanks_ok:
                label = labels[0]
            elif allow_multi and (mapped + frag_start + gap_end
                                  - frag_end - gap_start) >= min_flank:
                label = labels[1]
            else:
                continue

            read_id = mate1.get_sam_line().split("\t")[0]
            row = [read_id, frag_start, gap_start, gap_end,
                   frag_end, ins, label]
            output_list.write("\t".join(str(cell) for cell in row) + "\n")

    return output_file
def generateProfiles(sam_fn, fasta_fn='NC_003210.1.fa'):
    '''
    Creates coverage and mismatch profiles for a DMS-MaPseq sample.

    Requires the .sam file of the paired-end alignment of DMS-MaPseq reads
    to the genome, and the .fasta file of the genome to which the reads
    were aligned (a single-record file, read with SeqIO.read).

    Two stranded HTSeq GenomicArray objects are filled: fragment coverage
    and the number of mismatches per position. Both are written through the
    sibling helpers write_pickle, write_artemis and write_bed. Output names
    are derived from sam_fn by replacing '.sam' with '_cvg.pickle',
    '_mis.pickle', '_cvg.artemis', '_mis.artemis' and the four
    '_{cvg,mis}_{plus,minus}.bedgraph' suffixes.

    Pairs are skipped when a mate is missing or when either mate is not
    flagged as part of a proper pair.
    '''
    # read the genome sequence from fasta file
    genome = SeqIO.read(fasta_fn, "fasta")
    genome_length = len(genome.seq)

    # create genomic arrays to store coverage and mismatch data
    cvg = HTSeq.GenomicArray({genome.id: genome_length}, stranded=True,
                             typecode="i")
    mis = HTSeq.GenomicArray({genome.id: genome_length}, stranded=True,
                             storage='ndarray', typecode="i")

    # read the paired-end sam file
    sam_reader = HTSeq.SAM_Reader(sam_fn)

    i = 0
    for first, second in HTSeq.pair_SAM_alignments(sam_reader):
        i += 1
        if not i % 100000:
            print('%s -> %d' % (sam_fn, i))

        # A mate can be missing from the pair; such pairs cannot be
        # profiled.
        if first is None or second is None:
            continue
        # Both mates must be flagged as a proper pair.
        if not (first.proper_pair and second.proper_pair):
            continue

        # MD offsets are relative to each read's own alignment start, so
        # remember both starts before first.iv is extended in place below.
        first_start = first.iv.start
        second_start = second.iv.start

        # Add fragment coverage to the coverage profile
        # The first read determines the fragment strand
        second.iv.strand = first.iv.strand

        if first.iv.overlaps(second.iv):
            # Overlapping mates: count the merged fragment once.
            first.iv.extend_to_include(second.iv)
            cvg[first.iv] += 1
        else:
            # Otherwise the coverage is added for each read separately.
            cvg[first.iv] += 1
            cvg[second.iv] += 1

        # Add unique mismatches from every pair of reads to the mismatch
        # profile; the set collapses positions reported by both mates.
        mism_1 = parse_md(first.optional_field('MD'))
        mism_2 = parse_md(second.optional_field('MD'))

        coord_mism = set()
        for mism in mism_1:
            coord_mism.add(first_start + mism[0])
        for mism in mism_2:
            coord_mism.add(second_start + mism[0])

        for coord in coord_mism:
            pos = HTSeq.GenomicPosition(genome.id, coord,
                                        strand=first.iv.strand)
            mis[pos] += 1

    # Write coverage and mismatch profiles to file using pickle
    cvg_pickle_fn = sam_fn.replace('.sam', '_cvg.pickle')
    write_pickle(cvg, cvg_pickle_fn)
    mis_pickle_fn = sam_fn.replace('.sam', '_mis.pickle')
    write_pickle(mis, mis_pickle_fn)

    # Create Artemis profiles of coverage and mismatches
    cvg_artemis_fn = sam_fn.replace('.sam', '_cvg.artemis')
    write_artemis(cvg, cvg_artemis_fn)
    mis_artemis_fn = sam_fn.replace('.sam', '_mis.artemis')
    write_artemis(mis, mis_artemis_fn)

    # Create .bedgraph profiles at plus and minus strands
    cvg_bed_plus_fn = sam_fn.replace('.sam', '_cvg_plus.bedgraph')
    cvg_bed_minus_fn = sam_fn.replace('.sam', '_cvg_minus.bedgraph')
    write_bed(cvg, cvg_bed_plus_fn, cvg_bed_minus_fn)
    mis_bed_plus_fn = sam_fn.replace('.sam', '_mis_plus.bedgraph')
    mis_bed_minus_fn = sam_fn.replace('.sam', '_mis_minus.bedgraph')
    write_bed(mis, mis_bed_plus_fn, mis_bed_minus_fn)
def htseq_count(data):
    """Count reads per gene and write a counts file and a stats file.

    adapted from Simon Anders htseq-count.py script
    http://www-huber.embl.de/users/anders/HTSeq/doc/count.html

    File names come from ``_get_files(data)`` and the strandedness from
    ``data["config"]``. Counting is fixed to ``union`` overlap mode, ``exon``
    features grouped by ``gene_id`` and a minimum alignment quality of 0.
    If the counts file already exists, nothing is recomputed.

    Args:
        data: Sample dictionary; only ``data["config"]`` is read directly
            here, the rest is consumed by ``_get_files``.

    Returns:
        Path of the per-feature counts file. The stats file (on_feature,
        no_feature, ambiguous, too_low_aQual, not_aligned,
        alignment_not_unique) is written alongside.
    """
    sam_filename, gff_filename, out_file, stats_file = _get_files(data)
    stranded = _get_stranded_flag(data["config"])
    overlap_mode = "union"
    feature_type = "exon"
    id_attribute = "gene_id"
    minaqual = 0

    if file_exists(out_file):
        return out_file

    logger.info(
        "Counting reads mapping to exons in %s using %s as the "
        "annotation and strandedness as %s." %
        (os.path.basename(sam_filename), os.path.basename(gff_filename),
         _get_strandedness(data["config"])))

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    counts = {}

    # Try to open samfile to fail early in case it is not there
    open(sam_filename).close()

    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    sys.exit("Feature %s does not contain a '%s' attribute" %
                             (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    sys.exit("Feature %s at %s does not have strand "
                             "information but you are running htseq-count "
                             "in stranded mode. Use '--stranded=no'." %
                             (f.name, f.iv))
                features[f.iv] += feature_id
                counts[f.attr[id_attribute]] = 0
            i += 1
            if i % 100000 == 0:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        # Report where parsing stopped, then let the error propagate.
        sys.stderr.write("Error occured in %s.\n" %
                         gff.get_line_number_string())
        raise

    sys.stderr.write("%d GFF lines processed.\n" % i)

    if len(counts) == 0:
        sys.stderr.write("Warning: No features of type '%s' found.\n" %
                         feature_type)

    try:
        align_reader = htseq_reader(sam_filename)
        # Peek at the first record only to learn whether the data are
        # paired-end.
        first_read = next(iter(align_reader))
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write("Error occured when reading first line of sam "
                         "file.\n")
        raise

    try:
        if pe_mode:
            read_seq_pe_file = align_reader
            read_seq = HTSeq.pair_SAM_alignments(align_reader)
        else:
            # Single-end input is iterated record by record straight from
            # the reader (previously left unassigned, raising NameError).
            read_seq = align_reader
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        continue
                except KeyError:
                    # No NH tag: nothing known about multiplicity.
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar
                              if co.type == "M" and co.size > 0)
                else:
                    iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                              if co.type == "M" and co.size > 0)
            else:
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar
                                  if co.type == "M" and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv)
                                  for co in r[0].cigar
                                  if co.type == "M" and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    # The second mate gets the opposite strand treatment
                    # from the first.
                    if stranded != "reverse":
                        iv_seq = itertools.chain(
                            iv_seq,
                            (invert_strand(co.ref_iv) for co in r[1].cigar
                             if co.type == "M" and co.size > 0))
                    else:
                        iv_seq = itertools.chain(
                            iv_seq,
                            (co.ref_iv for co in r[1].cigar
                             if co.type == "M" and co.size > 0))
                else:
                    # Pair is "not aligned" only when neither mate is
                    # aligned.
                    if (r[0] is None) or not (r[0].aligned):
                        notaligned += 1
                        continue
                try:
                    if (r[0] is not None and
                            r[0].optional_field("NH") > 1) or \
                            (r[1] is not None and
                             r[1].optional_field("NH") > 1):
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if (r[0] and r[0].aQual < minaqual) or \
                        (r[1] and r[1].aQual < minaqual):
                    lowqual += 1
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif (overlap_mode == "intersection-strict" or
                      overlap_mode == "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if (len(fs2) > 0 or
                                    overlap_mode == "intersection-strict"):
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")
                if fs is None or len(fs) == 0:
                    empty += 1
                elif len(fs) > 1:
                    ambiguous += 1
                else:
                    counts[list(fs)[0]] += 1
            except UnknownChrom:
                # Reads on a chromosome absent from the annotation are
                # tallied as having no feature.
                empty += 1

            if i % 100000 == 0:
                sys.stderr.write(
                    "%d sam %s processed.\n" %
                    (i, "lines " if not pe_mode else "line pairs"))

    except:
        if not pe_mode:
            sys.stderr.write("Error occured in %s.\n" %
                             read_seq.get_line_number_string())
        else:
            sys.stderr.write("Error occured in %s.\n" %
                             read_seq_pe_file.get_line_number_string())
        raise

    sys.stderr.write("%d sam %s processed.\n" %
                     (i, "lines " if not pe_mode else "line pairs"))

    with file_transaction(out_file) as tmp_out_file:
        with open(tmp_out_file, "w") as out_handle:
            on_feature = 0
            for fn in sorted(counts.keys()):
                on_feature += counts[fn]
                out_handle.write("%s\t%d\n" % (fn, counts[fn]))

    with file_transaction(stats_file) as tmp_stats_file:
        with open(tmp_stats_file, "w") as out_handle:
            out_handle.write("on_feature\t%d\n" % on_feature)
            out_handle.write("no_feature\t%d\n" % empty)
            out_handle.write("ambiguous\t%d\n" % ambiguous)
            out_handle.write("too_low_aQual\t%d\n" % lowqual)
            out_handle.write("not_aligned\t%d\n" % notaligned)
            out_handle.write("alignment_not_unique\t%d\n" % nonunique)

    return out_file
def library_generation(exp, Info):
    """Build, fill and pickle the ``Library`` object for experiment *exp*.

    Steps performed:

    1. Read the SAM file registered for *exp* in ``Info.IIDefinition`` and
       count, per genomic position, the alignments whose ``aQual`` is at
       least ``Info.IIDefinition.fidelity_limit``.  The position recorded
       is the directional start (``iv.start_d``) of the (first) alignment.
    2. Walk the positions from the highest to the lowest count and merge
       every position with the counts found in the stranded window
       ``[start - 2, start + 2)`` around it.  A merged window is kept as
       an independent insertion when its total reaches
       ``Info.IIDefinition.ins_iv``.
    3. Store the summary numbers and the collapsed counts on the library
       and pickle it below ``Info.General.storing_loc``.

    Progress messages are emitted through ``Info.print_save``.

    :param exp: experiment name; must be present in
        ``Info.IIDefinition.lib_names``.
    :param Info: configuration object providing ``General``,
        ``IIDefinition`` and ``print_save``.
    :return: the populated ``Library`` instance.
    """
    # Generation of the Class Library specific for "exp"
    library = Library(
        exp,
        Info.IIDefinition.input_files[Info.IIDefinition.lib_names.index(exp)])
    print(library.name)

    string = '\n\n***\tInedependent Insertions (I.I.) definition\t***\n\n- Input file: %s\n- Pair ends: %s\n- Alignment cutoff: %s\n- Remove duplicates: %s\n- Insertion cutoff: %i' % (
        library.input, Info.General.pair_ends,
        Info.IIDefinition.fidelity_limit, Info.IIDefinition.reads_duplicate,
        Info.IIDefinition.ins_iv)
    Info.print_save(exp, string)

    startTime = getCurrTime()
    string = '\tSelection of Insertions (I.): %s' % startTime
    Info.print_save(exp, string)

    aligned_file = HTSeq.SAM_Reader(library.input)
    insertions_counts = Counter()
    count_aligned = 0
    count_GoodQualityAlignment = 0
    count_total = 0

    # Look at the first aligned read to decide whether chromosome names
    # already carry the 'chr' prefix; if not, it is prepended below.
    chromosome_style = ''
    for algnt in aligned_file:
        if algnt.aligned:
            if algnt.iv.chrom.startswith('chr'):
                chromosome_style = ''
            else:
                chromosome_style = 'chr'
            break

    if Info.General.pair_ends:  # Pair ends library
        for bundle in HTSeq.pair_SAM_alignments(aligned_file, bundle=True):
            if len(bundle) != 1:
                continue  # Skip multiple alignments
            first_almnt, second_almnt = bundle[0]  # extract pair
            # A missing mate is reported as None; such a pair counts as
            # "not aligned" instead of raising AttributeError.
            both_aligned = (first_almnt is not None
                            and second_almnt is not None
                            and first_almnt.aligned
                            and second_almnt.aligned)
            if both_aligned:
                if first_almnt.aQual >= Info.IIDefinition.fidelity_limit:
                    ins = HTSeq.GenomicPosition(
                        '%s%s' % (chromosome_style,
                                  str(first_almnt.iv.chrom)),
                        first_almnt.iv.start_d, first_almnt.iv.strand)
                    insertions_counts[ins] += 1
                    count_GoodQualityAlignment += 1
                count_aligned += 1
            count_total += 1
    else:  # Single ends library
        for algnt in aligned_file:
            if algnt.aligned:
                if algnt.aQual >= Info.IIDefinition.fidelity_limit:
                    ins = HTSeq.GenomicPosition(
                        '%s%s' % (chromosome_style, str(algnt.iv.chrom)),
                        algnt.iv.start_d, algnt.iv.strand)
                    insertions_counts[ins] += 1
                    count_GoodQualityAlignment += 1
                count_aligned += 1
            count_total += 1
    del aligned_file

    string = '\t-Total reads: %i\n\t-Aligned reads: %i\n\t-Aligned Reads trusted: %i\n\t-Insertions identified: %i' % (
        count_total, count_aligned, count_GoodQualityAlignment,
        len(insertions_counts.keys()))
    Info.print_save(exp, string)
    string = '\tRunTime: %s' % computeRunTime(startTime, getCurrTime())
    Info.print_save(exp, string)

    ### To collapse insertions in insertion array that are in the same interval (4bps)
    startTime = getCurrTime()
    string = 'Define Independent Insertions\n\tStarted: %s' % startTime
    Info.print_save(exp, string)

    insertions_series = pd.Series(insertions_counts,
                                  index=insertions_counts.keys())
    del insertions_counts
    # sort_values returns a new Series; the result has to be kept, otherwise
    # the positions are visited in arbitrary order and the best-supported
    # position of a window is not guaranteed to become its representative.
    insertions_order = insertions_series.sort_values(ascending=False)
    del insertions_series

    # Materialised as a list because it is traversed twice below.
    insertions_tuple = list(zip(insertions_order.index,
                                insertions_order.values))
    del insertions_order

    insertions_genomicarray = HTSeq.GenomicArray("auto", stranded=True)
    for position, n_reads in insertions_tuple:
        insertions_genomicarray[position] = n_reads

    count_indipendent_insertions = 0
    count_indipendent_insertions_aborted = 0
    insertions_collapsed = {}
    for position, _ in insertions_tuple:
        i = position
        # Positions already absorbed by an earlier window were zeroed.
        if insertions_genomicarray[i] > 0:
            counted = 0
            iv_i = HTSeq.GenomicInterval(i.chrom, i.start - 2, i.start + 2,
                                         i.strand)
            for i_2 in iv_i.xrange(step=1):
                try:
                    counted += insertions_genomicarray[i_2]
                    insertions_genomicarray[i_2] = 0
                except IndexError:
                    string = "\t!!!Skipped from analysis: %s" % i_2
                    Info.print_save(exp, string)
                    continue
            if counted >= Info.IIDefinition.ins_iv:
                if i in insertions_collapsed:
                    insertions_collapsed[i] += counted
                else:
                    insertions_collapsed[i] = counted
                count_indipendent_insertions += 1
            else:
                count_indipendent_insertions_aborted += 1

    string = '\t-Total insertions: %i\n\t-Independent Insertions (I.I.): %i' % (
        (count_indipendent_insertions + count_indipendent_insertions_aborted),
        count_indipendent_insertions)
    Info.print_save(exp, string)
    string = '\tRunTime: %s' % computeRunTime(startTime, getCurrTime())
    Info.print_save(exp, string)

    ### Storing data in library class that will be returned modified as result of the function
    library.informations['Total'] = count_total
    library.informations['Aligned'] = count_aligned
    # NOTE(review): 'Insertions' and 'II' receive the same value; confirm
    # whether 'Insertions' was meant to hold the pre-cutoff total.
    library.informations['Insertions'] = count_indipendent_insertions
    library.informations['II'] = count_indipendent_insertions
    if Info.IIDefinition.reads_duplicate:
        # NOTE(review): ``count_reads`` is not assigned anywhere in this
        # function; unless it exists at module level this line raises
        # NameError.  The intended value cannot be derived from this code.
        library.informations['Unique_reads'] = count_reads
    library.raw = pd.Series(insertions_collapsed,
                            index=insertions_collapsed.keys())

    ##### Store the class #####
    location = os.path.join(Info.General.storing_loc,
                            exp + '_' + Info.General.date, 'raw',
                            exp + '_IIRawdata.pkl')
    with open(location, 'wb') as saving:
        pickle.dump(library, saving)

    ##### END the program #####
    string = 'Informations stored in %s\n***\tEND of Inedependent Insertions (I.I.) definition\t***' % location
    Info.print_save(exp, string)
    return library
def next_pair(self):
    """Generate ``(first, second)`` mate tuples from ``self.read_iter``.

    The alignments in ``self.read_iter`` are paired up by
    ``ht.pair_SAM_alignments`` and handed out one pair at a time.
    """
    paired_alignments = ht.pair_SAM_alignments(self.read_iter)
    for mate_one, mate_two in paired_alignments:
        yield mate_one, mate_two
#!/usr/bin/python
"""Group the paired alignments of a SAM file by read name.

Exploratory script: the input locations are hard-coded below.
"""
import HTSeq as h
from collections import defaultdict

#reader_masked = h.SAM_Reader("/Users/dashazhernakova/Documents/UMCG/data/geuvadis/mappedData/ERR188022/aligned_masked.sam")
#reader = h.SAM_Reader("/Users/dashazhernakova/Documents/UMCG/data/geuvadis/mappedData/ERR188022/aligned.sam")
# NOTE(review): ``reader`` points into the "_masked" directory while
# ``reader_masked`` does not -- confirm the two paths are not swapped.
reader = h.SAM_Reader("/Users/dashazhernakova/Documents/UMCG/data/geuvadis/mappedData/ERR188022_masked/Aligned.out.filtered.new.1017680.sam")
reader_masked = h.SAM_Reader("/Users/dashazhernakova/Documents/UMCG/data/geuvadis/mappedData/ERR188022/Aligned.out.filtered.new1mb.sam")

it_p = iter(h.pair_SAM_alignments(reader))
it_p_m = iter(h.pair_SAM_alignments(reader_masked))

# Comparison counters (not updated in this part of the script).
same_aligned = 0
one_same_pos = 0
both_same_pos = 0
masked_more_pos = 0
simple_more_pos = 0
#cur_read = {}
#cur_m_read = {}
not_in_simple = 0
not_in_masked = 0

# read name -> list of (first mate, second mate) alignment pairs
n_m = defaultdict(list)
i = 0
for r1, r2 in h.pair_SAM_alignments(reader):
    # Either mate may be None when it is absent from the file; take the
    # read name from whichever mate is present.
    named_mate = r1 if r1 is not None else r2
    n_m[named_mate.read.name].append((r1, r2))
    i += 1
    if i % 10000 == 0:
        print("%d  lines" % i)
#for k,v in n_m.items():
def htseq_count(data):
    """Count reads per gene for one sample and write count and stats files.

    adapted from Simon Anders htseq-count.py script
    http://www-huber.embl.de/users/anders/HTSeq/doc/count.html

    Exon features are read from the GFF file and indexed by their
    ``gene_id``.  Every read (or read pair, when the first record of the
    alignment file is flagged as paired-end) is assigned with the
    ``union`` overlap mode.  Per-gene counts are written to the count file
    and the summary categories (``on_feature``, ``no_feature``,
    ``ambiguous``, ``too_low_aQual``, ``not_aligned``,
    ``alignment_not_unique``) to the stats file.  Nothing is recomputed if
    the count file already exists.

    :param data: sample dictionary; ``data["config"]`` supplies the
        strandedness and ``_get_files(data)`` the four file names.
    :return: path of the count file.
    """

    def matched_intervals(almnt, flip):
        """Lazily yield the reference intervals of non-empty 'M' CIGAR ops,
        strand-inverted when *flip* is true."""
        if flip:
            return (invert_strand(co.ref_iv) for co in almnt.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in almnt.cigar
                if co.type == "M" and co.size > 0)

    sam_filename, gff_filename, out_file, stats_file = _get_files(data)
    stranded = _get_stranded_flag(data["config"])
    overlap_mode = "union"
    feature_type = "exon"
    id_attribute = "gene_id"
    minaqual = 0

    if file_exists(out_file):
        return out_file

    logger.info("Counting reads mapping to exons in %s using %s as the "
                "annotation and strandedness as %s." %
                (os.path.basename(sam_filename),
                 os.path.basename(gff_filename),
                 _get_strandedness(data["config"])))

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    counts = {}

    # Try to open samfile to fail early in case it is not there
    open(sam_filename).close()

    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    sys.exit("Feature %s does not contain a '%s' attribute" %
                             (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    sys.exit("Feature %s at %s does not have strand "
                             "information but you are running htseq-count "
                             "in stranded mode. Use '--stranded=no'." %
                             (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0
            i += 1
            if i % 100000 == 0:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        # Deliberately broad: report the offending GFF line, then re-raise.
        sys.stderr.write("Error occured in %s.\n" %
                         gff.get_line_number_string())
        raise

    sys.stderr.write("%d GFF lines processed.\n" % i)

    if len(counts) == 0:
        sys.stderr.write("Warning: No features of type '%s' found.\n" %
                         feature_type)

    try:
        align_reader = htseq_reader(sam_filename)
        first_read = next(iter(align_reader))
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write("Error occured when reading first line of sam "
                         "file.\n")
        raise

    try:
        if pe_mode:
            read_seq = HTSeq.pair_SAM_alignments(align_reader)
        else:
            # Single-end input is iterated directly; without this binding
            # the loop below fails with NameError.
            read_seq = align_reader
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        continue
                except KeyError:
                    pass  # no NH tag: treat the alignment as unique
                if r.aQual < minaqual:
                    lowqual += 1
                    continue
                iv_seq = matched_intervals(r, stranded == "reverse")
            else:
                if r[0] is not None and r[0].aligned:
                    iv_seq = matched_intervals(r[0], stranded == "reverse")
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    # The second mate is interpreted on the opposite strand
                    # from the first one.
                    iv_seq = itertools.chain(
                        iv_seq,
                        matched_intervals(r[1], stranded != "reverse"))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        notaligned += 1
                        continue
                try:
                    if ((r[0] is not None and
                         r[0].optional_field("NH") > 1) or
                            (r[1] is not None and
                             r[1].optional_field("NH") > 1)):
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if ((r[0] and r[0].aQual < minaqual) or
                        (r[1] and r[1].aQual < minaqual)):
                    lowqual += 1
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif (overlap_mode == "intersection-strict" or
                      overlap_mode == "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if (len(fs2) > 0 or
                                    overlap_mode == "intersection-strict"):
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")
                if fs is None or len(fs) == 0:
                    empty += 1
                elif len(fs) > 1:
                    ambiguous += 1
                else:
                    counts[list(fs)[0]] += 1
            except UnknownChrom:
                # Chromosome absent from the annotation: count as no_feature.
                empty += 1

            if i % 100000 == 0:
                sys.stderr.write(
                    "%d sam %s processed.\n" %
                    (i, "lines " if not pe_mode else "line pairs"))

    except:
        # Deliberately broad: report the position in the alignment file,
        # then re-raise.  ``align_reader`` is the underlying reader in both
        # single-end and paired-end mode.
        sys.stderr.write("Error occured in %s.\n" %
                         align_reader.get_line_number_string())
        raise

    sys.stderr.write("%d sam %s processed.\n" %
                     (i, "lines " if not pe_mode else "line pairs"))

    with file_transaction(out_file) as tmp_out_file:
        with open(tmp_out_file, "w") as out_handle:
            on_feature = 0
            for fn in sorted(counts.keys()):
                on_feature += counts[fn]
                out_handle.write("%s\t%d\n" % (fn, counts[fn]))

    with file_transaction(stats_file) as tmp_stats_file:
        with open(tmp_stats_file, "w") as out_handle:
            out_handle.write("on_feature\t%d\n" % on_feature)
            out_handle.write("no_feature\t%d\n" % empty)
            out_handle.write("ambiguous\t%d\n" % ambiguous)
            out_handle.write("too_low_aQual\t%d\n" % lowqual)
            out_handle.write("not_aligned\t%d\n" % notaligned)
            out_handle.write("alignment_not_unique\t%d\n" % nonunique)

    return out_file
def count_reads_onto_prebuilt_features(sam_filename, features, feature_ids,
                                       stranded, overlap_mode, quiet,
                                       minaqual, samout, umis=False):
    """Assign SAM reads to an already built feature array and count them.

    :param sam_filename: path of the SAM file, or ``"-"`` for stdin.
    :param features: ``HTSeq.GenomicArrayOfSets`` holding feature ids.
    :param feature_ids: ids for which a count is reported (all start at 0).
    :param stranded: ``"reverse"`` inverts the strand interpretation of the
        reads; any other value leaves it as is.
    :param overlap_mode: ``"union"``, ``"intersection-strict"`` or
        ``"intersection-nonempty"``.
    :param quiet: suppress progress output and HTSeq warnings.
    :param minaqual: reads with ``aQual`` below this value are skipped.
    :param samout: path for an annotated SAM copy (``XF:Z:`` tag per
        read), or ``""`` to disable it.
    :param umis: when true, count distinct ``:UMI:<seq>:`` tags found in
        the read names per feature instead of reads.
    :return: ``(feats, counts)`` -- the sorted feature ids followed by the
        labels ``no_feature``, ``ambiguous``, ``too_low_aQual``,
        ``not_aligned``, ``alignment_not_unique``, and the matching numbers.
    :raises EmptySamError: if the input contains no alignment record.
    """

    def write_to_samout(r, assignment):
        """Append *r* (a read or a pair) to the samout file, tagged XF."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    def matched_intervals(almnt, flip):
        """Lazily yield the reference intervals of non-empty 'M' CIGAR ops,
        strand-inverted when *flip* is true."""
        if flip:
            return (invert_strand(co.ref_iv) for co in almnt.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in almnt.cigar
                if co.type == "M" and co.size > 0)

    if quiet:
        warnings.filterwarnings(action="ignore", module="HTSeq")

    if samout != "":
        samoutfile = open(samout, "w")
    else:
        samoutfile = None

    # Everything below runs under try/finally so that the samout handle is
    # closed on every exit path, including errors.
    try:
        if umis:
            umi_re = re.compile(r":UMI:(\w+):")
            umi_counts = {}

            def count_umis(fs, read_name):
                # NOTE(review): a read name without a UMI tag makes
                # ``search`` return None and this line raise AttributeError.
                umi_seq = umi_re.search(read_name).group(1)
                umi_counts[fs][umi_seq] += 1

            for feature_id in feature_ids:
                umi_counts[feature_id] = Counter()
        else:

            def count_umis(x, y):
                return None

        # Try to open samfile to fail early in case it is not there
        if sam_filename != "-":
            open(sam_filename).close()

        counts = {}
        for feature_id in feature_ids:
            counts[feature_id] = 0

        try:
            if sam_filename != "-":
                read_seq_file = HTSeq.SAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                # stdin cannot be re-read: put the peeked record back in
                # front of the remaining stream.
                read_seq_file = HTSeq.SAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except StopIteration:
            raise EmptySamError(sam_filename)

        try:
            if pe_mode:
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            write_to_samout(r, "alignment_not_unique")
                            nonunique += 1
                            continue
                    except KeyError:
                        pass  # no NH tag: treat the alignment as unique
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "too_low_aQual")
                        continue
                    iv_seq = matched_intervals(r, stranded == "reverse")
                    named_read = r
                else:
                    if r[0] is not None and r[0].aligned:
                        iv_seq = matched_intervals(r[0],
                                                   stranded == "reverse")
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        # The second mate is interpreted on the opposite
                        # strand from the first one.
                        iv_seq = itertools.chain(
                            iv_seq,
                            matched_intervals(r[1], stranded != "reverse"))
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            write_to_samout(r, "not_aligned")
                            notaligned += 1
                            continue
                    try:
                        if ((r[0] is not None and
                             r[0].optional_field("NH") > 1) or
                                (r[1] is not None and
                                 r[1].optional_field("NH") > 1)):
                            nonunique += 1
                            write_to_samout(r, "alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if ((r[0] and r[0].aQual < minaqual) or
                            (r[1] and r[1].aQual < minaqual)):
                        lowqual += 1
                        write_to_samout(r, "too_low_aQual")
                        continue
                    # A pair has no ``.read``; take the name from whichever
                    # mate is present.
                    named_read = r[0] if r[0] is not None else r[1]

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif (overlap_mode == "intersection-strict" or
                          overlap_mode == "intersection-nonempty"):
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if (len(fs2) > 0 or
                                        overlap_mode == "intersection-strict"):
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")
                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "no_feature")
                        empty += 1
                    elif len(fs) > 1:
                        write_to_samout(
                            r, "ambiguous[" + '+'.join(fs) + "]")
                        ambiguous += 1
                    else:
                        assigned = list(fs)[0]
                        write_to_samout(r, assigned)
                        counts[assigned] += 1
                        count_umis(assigned, named_read.read.name)
                except UnknownChrom:
                    # Chromosome absent from the annotation: counted as
                    # no_feature, no warning is printed.
                    empty += 1

                if i % 100000 == 0 and not quiet:
                    sys.stderr.write(
                        "%d sam %s processed.\n" %
                        (i, "lines " if not pe_mode else "line pairs"))

        except:
            # Deliberately broad: report the input position, then re-raise.
            sys.stderr.write(
                "Error occured when processing SAM input (%s):\n" %
                read_seq_file.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d sam %s processed.\n" %
                             (i, "lines " if not pe_mode else "line pairs"))
    finally:
        if samoutfile is not None:
            samoutfile.close()

    # sorted feature list. features+counts
    feats = sorted(counts.keys())
    if umis:
        counts = [len(umi_counts[fn]) for fn in feats]
    else:
        counts = [counts[fn] for fn in feats]

    # cat statistics summary to feature+count list
    feats = feats + [
        'no_feature', 'ambiguous', 'too_low_aQual', 'not_aligned',
        'alignment_not_unique'
    ]
    counts = counts + [empty, ambiguous, lowqual, notaligned, nonunique]

    return (feats, counts)
set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] ) if len( set_of_gene_names ) == 0: counts[ '_empty' ] += 1 elif len( set_of_gene_names ) > 1: counts[ '_ambiguous' ] +=1 else: for f in rs: counts[ f.name ] += 1 num_reads += 1 if num_reads % 100000 == 0: sys.stderr.write( "%d reads processed.\n" % num_reads ) else: # paired-end num_reads = 0 for af, ar in HTSeq.pair_SAM_alignments( HTSeq.SAM_Reader( sam_file ) ): rs = set() if af and ar and not af.aligned and not ar.aligned: counts[ '_notaligned' ] += 1 continue if af and ar and not af.aQual < minaqual and ar.aQual < minaqual: counts[ '_lowaqual' ] += 1 continue if af and af.aligned and af.aQual >= minaqual and af.iv.chrom in features.chrom_vectors.keys(): for cigop in af.cigar: if cigop.type != "M": continue if reverse: cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) for iv, s in features[cigop.ref_iv].steps(): rs = rs.union( s )
def next_pair(self):
    """Iterate over the mate pairs found in ``self.read_iter``.

    Each item is the ``(first, second)`` tuple produced by
    ``ht.pair_SAM_alignments``.
    """
    pair_stream = ht.pair_SAM_alignments(self.read_iter)
    for read_a, read_b in pair_stream:
        yield read_a, read_b
def count_reads_in_features(sam_filename, gff_filename, stranded,
                            overlap_mode, feature_type, id_attribute, quiet,
                            minaqual, samout):
    """Count SAM reads per GFF feature and print the table to stdout.

    Variant of htseq-count that additionally tallies, for paired-end
    input, multi-mapping pairs (``NH`` > 1) per feature in a local
    ``dict_nonunique`` table.  That table is filled but not printed.

    :param sam_filename: path of the SAM file, or ``"-"`` for stdin.
    :param gff_filename: path of the GFF/GTF annotation.
    :param stranded: ``"no"`` builds an unstranded feature array;
        ``"reverse"`` inverts the strand interpretation of the reads.
    :param overlap_mode: ``"union"``, ``"intersection-strict"`` or
        ``"intersection-nonempty"``.
    :param feature_type: GFF feature type to use (third column).
    :param id_attribute: GFF attribute used as the feature id.
    :param quiet: suppress progress output and warnings.
    :param minaqual: reads with ``aQual`` below this value are skipped.
    :param samout: path for an annotated SAM copy (``XF:Z:`` tag per
        read), or ``""`` to disable it.
    :return: None; one ``id<TAB>count`` line per feature plus five summary
        lines are printed.
    """

    def write_to_samout(r, assignment):
        """Append *r* (a read or a pair) to the samout file, tagged XF."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    if quiet:
        warnings.filterwarnings(action="ignore", module="HTSeq")

    if samout != "":
        samoutfile = open(samout, "w")
    else:
        samoutfile = None

    # Everything below runs under try/finally so that the samout handle is
    # closed on every exit path, including errors and sys.exit.
    try:
        features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
        counts = {}
        ## added by CR: per-feature tally of multi-mapping read pairs
        dict_nonunique = {}

        # Try to open samfile to fail early in case it is not there
        if sam_filename != "-":
            open(sam_filename).close()

        gff = HTSeq.GFF_Reader(gff_filename)
        i = 0
        try:
            for f in gff:
                if f.type == feature_type:
                    try:
                        feature_id = f.attr[id_attribute]
                    except KeyError:
                        sys.exit(
                            "Feature %s does not contain a '%s' attribute" %
                            (f.name, id_attribute))
                    if stranded != "no" and f.iv.strand == ".":
                        sys.exit(
                            "Feature %s at %s does not have strand information but you are "
                            "running htseq-count in stranded mode. Use '--stranded=no'."
                            % (f.name, f.iv))
                    features[f.iv] += feature_id
                    counts[feature_id] = 0
                    dict_nonunique[feature_id] = 0
                i += 1
                if i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d GFF lines processed.\n" % i)
        except:
            # Deliberately broad: report the GFF line, then re-raise.
            sys.stderr.write("Error occured in %s.\n" %
                             gff.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d GFF lines processed.\n" % i)

        if len(counts) == 0 and not quiet:
            sys.stderr.write("Warning: No features of type '%s' found.\n" %
                             feature_type)

        try:
            if sam_filename != "-":
                read_seq_file = HTSeq.SAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                # stdin cannot be re-read: put the peeked record back in
                # front of the remaining stream.  The reader object itself
                # is kept for error reporting.
                read_seq_file = HTSeq.SAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except:
            sys.stderr.write(
                "Error occured when reading first line of sam file.\n")
            raise

        try:
            if pe_mode:
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            # added by SB: name/intervals of the last multi-mapper tallied.
            # NOTE(review): the intervals are compared with ``is not``
            # (object identity) below -- confirm that is what was intended.
            temp_read_name = "NA"
            temp_interval_r0 = "NA"
            temp_interval_r1 = "NA"
            i = 0
            for r in read_seq:
                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            write_to_samout(r, "alignment_not_unique")
                            nonunique += 1
                            continue
                    except KeyError:
                        pass  # no NH tag: treat the alignment as unique
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "too_low_aQual")
                        continue
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r.cigar
                                  if co.type == "M")
                    else:
                        iv_seq = (invert_strand(co.ref_iv)
                                  for co in r.cigar if co.type == "M")
                else:
                    if r[0] is not None and r[0].aligned:
                        if stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r[0].cigar
                                      if co.type == "M")
                        else:
                            iv_seq = (invert_strand(co.ref_iv)
                                      for co in r[0].cigar
                                      if co.type == "M")
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        # The second mate is interpreted on the opposite
                        # strand from the first one.
                        if stranded != "reverse":
                            iv_seq = itertools.chain(
                                iv_seq,
                                (invert_strand(co.ref_iv)
                                 for co in r[1].cigar if co.type == "M"))
                        else:
                            iv_seq = itertools.chain(
                                iv_seq,
                                (co.ref_iv for co in r[1].cigar
                                 if co.type == "M"))
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            write_to_samout(r, "not_aligned")
                            notaligned += 1
                            continue
                    try:
                        if ((r[0] is not None and
                             r[0].optional_field("NH") > 1) or
                                (r[1] is not None and
                                 r[1].optional_field("NH") > 1)):
                            nonunique += 1
                            # Tally the multi-mapper on the feature(s)
                            # reported by is_read_in_gene_interval.
                            if (r[0] is not None and r[1] is None):
                                result, fs_new = is_read_in_gene_interval(
                                    r[0], features)
                                if result:
                                    if ((temp_read_name != r[0].read.name) and
                                            (temp_interval_r0 is not r[0].iv)):
                                        temp_read_name = r[0].read.name
                                        temp_interval_r0 = r[0].iv
                                        dict_nonunique[list(fs_new)[0]] += 1
                            if (r[0] is None and r[1] is not None):
                                result, fs_new = is_read_in_gene_interval(
                                    r[1], features)
                                if result:
                                    if ((temp_read_name != r[1].read.name) and
                                            (temp_interval_r1 is not r[1].iv)):
                                        temp_read_name = r[1].read.name
                                        temp_interval_r1 = r[1].iv
                                        dict_nonunique[list(fs_new)[0]] += 1
                            if (r[0] is not None and r[1] is not None):
                                result1, fs_new1 = is_read_in_gene_interval(
                                    r[0], features)
                                result2, fs_new2 = is_read_in_gene_interval(
                                    r[1], features)
                                if result1 and not result2:
                                    if ((temp_read_name != r[0].read.name) and
                                            (temp_interval_r0 is not r[0].iv)):
                                        temp_read_name = r[0].read.name
                                        temp_interval_r0 = r[0].iv
                                        dict_nonunique[list(fs_new1)[0]] += 1
                                elif result2 and not result1:
                                    if ((temp_read_name != r[1].read.name) and
                                            (temp_interval_r1 is not r[1].iv)):
                                        temp_read_name = r[1].read.name
                                        temp_interval_r1 = r[1].iv
                                        dict_nonunique[list(fs_new2)[0]] += 1
                                elif result1 and result2:
                                    if ((temp_read_name != r[0].read.name) and
                                            (temp_interval_r0 is not r[0].iv) and
                                            (temp_interval_r1 is not r[1].iv)):
                                        temp_read_name = r[0].read.name
                                        temp_interval_r0 = r[0].iv
                                        temp_interval_r1 = r[1].iv
                                        first_id = list(fs_new1)[0]
                                        second_id = list(fs_new2)[0]
                                        dict_nonunique[first_id] += 1
                                        if first_id != second_id:
                                            dict_nonunique[second_id] += 1
                            write_to_samout(r, "alignment_not_unique")
                            continue
                    except KeyError:
                        # Raised by a missing NH tag (and also by an
                        # unknown id in dict_nonunique): processing falls
                        # through to the regular assignment below.
                        pass
                    if ((r[0] and r[0].aQual < minaqual) or
                            (r[1] and r[1].aQual < minaqual)):
                        lowqual += 1
                        write_to_samout(r, "too_low_aQual")
                        continue

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif (overlap_mode == "intersection-strict" or
                          overlap_mode == "intersection-nonempty"):
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if (len(fs2) > 0 or
                                        overlap_mode == "intersection-strict"):
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")
                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "no_feature")
                        empty += 1
                    elif len(fs) > 1:
                        write_to_samout(
                            r, "ambiguous[" + '+'.join(fs) + "]")
                        ambiguous += 1
                    else:
                        write_to_samout(r, list(fs)[0])
                        counts[list(fs)[0]] += 1
                except UnknownChrom:
                    # The read is skipped without touching any counter.
                    if not pe_mode:
                        rr = r
                    else:
                        rr = r[0] if r[0] is not None else r[1]
                    if not quiet:
                        sys.stderr.write((
                            "Warning: Skipping read '%s', because chromosome "
                            +
                            "'%s', to which it has been aligned, did not appear in the GFF file.\n"
                        ) % (rr.read.name, iv.chrom))

                if i % 100000 == 0 and not quiet:
                    sys.stderr.write(
                        "%d sam %s processed.\n" %
                        (i, "lines " if not pe_mode else "line pairs"))

        except:
            # Deliberately broad: report the input position, then re-raise.
            # The reader object is used because the (possibly chained or
            # paired) iterator has no get_line_number_string().
            sys.stderr.write("Error occured in %s.\n" %
                             read_seq_file.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d sam %s processed.\n" %
                             (i, "lines " if not pe_mode else "line pairs"))
    finally:
        if samoutfile is not None:
            samoutfile.close()

    print("Gene\tUnique_reads")
    for fn in sorted(counts.keys()):
        print("%s\t%d" % (fn, counts[fn]))
    print("no_feature\t%d" % empty)
    print("ambiguous\t%d" % ambiguous)
    print("too_low_aQual\t%d" % lowqual)
    print("not_aligned\t%d" % notaligned)
    print("alignment_not_unique\t%d" % nonunique)
def run_through_sam( sam_filename ):
    """Classify the read pairs of a SAM file by their number of hits.

    Pairs are read in bundles (all reported alignments of one read pair).
    Bundles with one or two alignments are looked up in the module-level
    ``features`` array -- first mate on the inverted strand, second mate
    as is -- and tallied; bundles with more than two alignments are only
    counted as ``__Ambigios_read``.

    Relies on the module-level names ``features`` and ``quiet``.

    :param sam_filename: path of the SAM file.
    :return: list of ``(label, count)`` tuples sorted by label, including
        the ``__Total_reads`` entry.
    :raises ValueError: if the reader raises KeyError on creation or an
        alignment lacks the expected attributes.
    """

    def usable(r1, r2):
        """True when both mates are present and aligned (an unaligned
        mate has no CIGAR to walk)."""
        return (r1 is not None and r2 is not None
                and r1.aligned and r2.aligned)

    def pair_features(r1, r2, label):
        """Return the set of features overlapped by the two mates."""
        hits = set()
        try:
            iv_seq1 = ( co.ref_iv for co in r1.cigar
                        if co.type == "M" and co.size > 0 )
            iv_seq2 = ( co.ref_iv for co in r2.cigar
                        if co.type == "M" and co.size > 0 )
        except AttributeError:
            raise ValueError( "%s:Someting wrong with read %s" % (label, r1) )
        for iv in iv_seq1:
            for iv2, fs2 in features[ invert_strand(iv) ].steps():
                hits |= fs2
        for iv in iv_seq2:
            for iv2, fs2 in features[ iv ].steps():
                hits |= fs2
        return hits

    try:
        almnt_file = HTSeq.SAM_Reader( sam_filename )
    except KeyError:
        raise ValueError( "Can't find file %s" % (sam_filename) )

    count_reads = collections.Counter()
    i = 0
    for bundle in HTSeq.pair_SAM_alignments( almnt_file, bundle=True ):
        if len(bundle) != 0:
            i += 1
        if i > 0 and i % 200000 == 0 and not quiet:
            sys.stderr.write( "%d SAM alignment records processed.\n" % ( i ) )

        # Multimapping reads: pairs reported at more than 2 positions
        if len(bundle) > 2:
            count_reads[ '__Ambigios_read' ] += 1
            continue

        # Singles: pairs that map to 1 genomic position
        elif len(bundle) == 1:
            r1, r2 = bundle[0]
            if not usable(r1, r2):
                count_reads[ '__Single_hit:Not_aligned' ] += 1
                continue
            rs = pair_features(r1, r2, "Single")
            if len(rs) == 0:
                count_reads[ '__Single_hit:No_feature' ] += 1
            elif len(rs) == 1:
                count_reads[ '__Single_hit:Feature_found' ] += 1
            else:
                count_reads[ '__Single_hit:Ambigous_features' ] += 1

        # Doubles: pairs that map to 2 genomic positions
        elif len(bundle) == 2:
            rs = set()
            found = []
            for r1, r2 in bundle:
                if not usable(r1, r2):
                    found.append(False)
                    continue
                found.append(True)
                rs |= pair_features(r1, r2, "Double")
            # One incomplete alignment is enough to discard the bundle.
            if not all(found):
                count_reads[ '__Double_hit:Not_aligned' ] += 1
                continue
            if len(rs) == 0:
                count_reads[ '__Double_hit:No_feature' ] += 1
            elif len(rs) == 1:
                count_reads[ '_'.join(rs) ] += 1
                count_reads[ '__Double_hit:Single_feature_found' ] += 1
            elif len(rs) == 2:
                # Sorted so the same feature pair always yields one key,
                # independent of set iteration order.
                count_reads[ '_'.join(sorted(rs)) ] += 1
                count_reads[ '__Double_hit:Double_feature_found' ] += 1
            else:
                count_reads[ '__Double_hit:Ambigous_features' ] += 1
        else:
            continue

    count_reads[ '__Total_reads' ] = i
    # Return the tallies ordered by label.
    com_coll = sorted(count_reads.items(), key=lambda pair: pair[0],
                      reverse=False)
    return( com_coll )
def count_reads_in_features( sam_filename, gff_filename, stranded, overlap_mode, feature_type, id_attribute, quiet, minaqual, samout ):
    """Count reads per GFF feature and print the table to stdout.

    Parameters
    ----------
    sam_filename : path of the SAM file; "-" reads from ``sys.stdin``.
    gff_filename : path handed to ``HTSeq.GFF_Reader``.
    stranded : "no" builds an unstranded feature array; "reverse" inverts
        the strand of the read intervals before lookup; any other value
        uses the read strand as is.
    overlap_mode : "union", "intersection-strict" or
        "intersection-nonempty"; any other value exits via ``sys.exit``.
    feature_type : only GFF records whose ``type`` equals this are used.
    id_attribute : GFF attribute used as the feature id.
    quiet : when true, HTSeq warnings and progress messages are suppressed.
    minaqual : reads with ``aQual`` below this are counted as low quality.
    samout : path of an annotated SAM output file; "" disables it.

    Output
    ------
    Prints a "Gene<TAB>Unique_reads" header, one line per feature id
    (sorted), then the no_feature / ambiguous / too_low_aQual /
    not_aligned / alignment_not_unique totals. Nothing is returned.

    Notes
    -----
    ``dict_nonunique`` (added by CR/SB) is updated for multi-mapped read
    pairs but is neither printed nor returned by this function.
    NOTE(review): ``samoutfile`` is only closed on the success path.
    """

    def write_to_samout( r, assignment ):
        # Append the assignment as an XF:Z: tag to the original SAM line(s).
        # `samoutfile` and `pe_mode` are resolved from the enclosing function
        # at call time.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write( read.original_sam_line.rstrip() +
                    "\tXF:Z:" + assignment + "\n" )

    if quiet:
        warnings.filterwarnings( action="ignore", module="HTSeq" )

    if samout != "":
        samoutfile = open( samout, "w" )
    else:
        samoutfile = None

    features = HTSeq.GenomicArrayOfSets( "auto", stranded != "no" )
    counts = {}
    # Added by CR: per-feature tally for multi-mapped (NH > 1) read pairs.
    dict_nonunique = {}

    # Try to open samfile to fail early in case it is not there
    if sam_filename != "-":
        open( sam_filename ).close()

    gff = HTSeq.GFF_Reader( gff_filename )
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[ id_attribute ]
                except KeyError:
                    sys.exit( "Feature %s does not contain a '%s' attribute" %
                        ( f.name, id_attribute ) )
                if stranded != "no" and f.iv.strand == ".":
                    sys.exit( "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'." %
                        ( f.name, f.iv ) )
                features[ f.iv ] += feature_id
                counts[ f.attr[ id_attribute ] ] = 0
                # Added by CR: every feature id starts at zero here too.
                dict_nonunique[ f.attr[ id_attribute ] ] = 0
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write( "%d GFF lines processed.\n" % i )
    except:
        # Bare except on purpose: report the GFF position, then re-raise
        # whatever it was (this includes the SystemExit from sys.exit above).
        sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() )
        raise

    if not quiet:
        sys.stderr.write( "%d GFF lines processed.\n" % i )

    if len( counts ) == 0 and not quiet:
        sys.stderr.write( "Warning: No features of type '%s' found.\n" % feature_type )

    # Peek at the first record to find out whether the data is paired-end.
    try:
        if sam_filename != "-":
            read_seq = HTSeq.SAM_Reader( sam_filename )
            first_read = iter(read_seq).next()
        else:
            # stdin cannot be re-read, so the consumed record is chained back
            # in front of the remaining stream.
            read_seq = iter( HTSeq.SAM_Reader( sys.stdin ) )
            first_read = read_seq.next()
            read_seq = itertools.chain( [ first_read ], read_seq )
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write( "Error occured when reading first line of sam file.\n" )
        raise

    try:
        if pe_mode:
            # Keep a handle on the underlying reader for error reporting.
            read_seq_pe_file = read_seq
            read_seq = HTSeq.pair_SAM_alignments( read_seq )
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        # Added by SB: name/intervals of the last multi-mapped pair that was
        # tallied in dict_nonunique.
        temp_read_name="NA"
        temp_interval_r0="NA"
        temp_interval_r1="NA"
        # Added by CR; not used by any live code below.
        nonunique2 = 0
        # Added by SB: restart the counter for SAM records / pairs.
        i = 0
        for r in read_seq:
            i += 1
            if not pe_mode:
                # ---- single-end read ----
                if not r.aligned:
                    notaligned += 1
                    write_to_samout( r, "not_aligned" )
                    continue
                try:
                    if r.optional_field( "NH" ) > 1:
                        write_to_samout( r, "alignment_not_unique" )
                        nonunique += 1
                        continue
                except KeyError:
                    # presumably raised when the record has no NH tag; the
                    # read is then handled like a unique one
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    write_to_samout( r, "too_low_aQual" )
                    continue
                if stranded != "reverse":
                    iv_seq = ( co.ref_iv for co in r.cigar if co.type == "M" )
                else:
                    iv_seq = ( invert_strand( co.ref_iv ) for co in r.cigar if co.type == "M" )
            else:
                # ---- paired-end: r is a (first mate, second mate) pair ----
                # The strand of the second mate is handled opposite to the
                # first mate's.
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = ( co.ref_iv for co in r[0].cigar if co.type == "M" )
                    else:
                        iv_seq = ( invert_strand( co.ref_iv ) for co in r[0].cigar if co.type == "M" )
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    if stranded != "reverse":
                        iv_seq = itertools.chain( iv_seq,
                            ( invert_strand( co.ref_iv ) for co in r[1].cigar if co.type == "M" ) )
                    else:
                        iv_seq = itertools.chain( iv_seq,
                            ( co.ref_iv for co in r[1].cigar if co.type == "M" ) )
                else:
                    # Second mate unusable; if the first is unusable as well
                    # the pair is not aligned.
                    if ( r[0] is None ) or not ( r[0].aligned ):
                        write_to_samout( r, "not_aligned" )
                        notaligned += 1
                        continue
                try:
                    # The pair is multi-mapped when either present mate has
                    # NH > 1.
                    # NOTE(review): the `except KeyError` below covers this
                    # whole block, so a KeyError raised by the bookkeeping
                    # (not only by the NH lookup) makes the pair fall through
                    # to the quality check after `nonunique` was incremented.
                    if (( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \
                        ( r[1] is not None and r[1].optional_field( "NH" ) > 1 )):
                        nonunique += 1
                        # Added by SB: tally the multi-mapped pair per
                        # feature in dict_nonunique. is_read_in_gene_interval
                        # is defined outside this function; from its use here
                        # it returns (flag, collection of feature ids).
                        # NOTE(review): `is not` below is an identity test
                        # between the remembered interval and a freshly read
                        # one -- confirm that a value comparison was not
                        # intended.
                        if ( r[0] is not None and r[1] is None ):
                            result, fs_new = is_read_in_gene_interval(r[0], features)
                            if result:
                                if ((temp_read_name != r[0].read.name) and ( temp_interval_r0 is not r[0].iv) ):
                                    temp_read_name=r[0].read.name
                                    temp_interval_r0=r[0].iv
                                    dict_nonunique[ list(fs_new)[0]] += 1
                        if ( r[0] is None and r[1] is not None ):
                            result, fs_new = is_read_in_gene_interval(r[1], features)
                            if result:
                                if ((temp_read_name != r[1].read.name) and ( temp_interval_r1 is not r[1].iv) ):
                                    temp_read_name=r[1].read.name
                                    temp_interval_r1=r[1].iv
                                    dict_nonunique[ list(fs_new)[0]] += 1
                        if ( r[0] is not None and r[1] is not None ):
                            result1, fs_new1 = is_read_in_gene_interval(r[0], features)
                            result2, fs_new2 = is_read_in_gene_interval(r[1], features)
                            if result1 and not result2:
                                # only the first mate lies in a feature
                                if ((temp_read_name != r[0].read.name) and ( temp_interval_r0 is not r[0].iv) ):
                                    temp_interval_r0=r[0].iv
                                    temp_read_name=r[0].read.name
                                    temp_interval_r0=r[0].iv
                                    dict_nonunique[ list(fs_new1)[0]] += 1
                            elif result2 and not result1:
                                # only the second mate lies in a feature
                                if ((temp_read_name != r[1].read.name)and ( temp_interval_r1 is not r[1].iv)):
                                    temp_read_name=r[1].read.name
                                    temp_interval_r1=r[1].iv
                                    dict_nonunique[ list(fs_new2)[0]] += 1
                            elif result1 and result2:
                                if ((temp_read_name != r[0].read.name) and (temp_interval_r0 is not r[0].iv ) and \
                                    ( temp_interval_r1 is not r[1].iv) ):
                                    temp_read_name=r[0].read.name
                                    temp_interval_r0=r[0].iv
                                    temp_interval_r1=r[1].iv
                                    # Mates in different features: credit
                                    # both; same feature: credit it once.
                                    if list(fs_new1)[0] != list(fs_new2)[0]:
                                        dict_nonunique[ list(fs_new1)[0]] += 1
                                        dict_nonunique[ list(fs_new2)[0]] += 1
                                    else:
                                        dict_nonunique[ list(fs_new1)[0]] += 1
                        # End of SB changes.
                        write_to_samout( r, "alignment_not_unique" )
                        continue
                except KeyError:
                    pass
                if ( r[0] and r[0].aQual < minaqual ) or ( r[1] and r[1].aQual < minaqual ):
                    lowqual += 1
                    write_to_samout( r, "too_low_aQual" )
                    continue

            try:
                if overlap_mode == "union":
                    # union of every feature set met along the read intervals
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[ iv ].steps():
                            fs = fs.union( fs2 )
                elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                    # intersection of the feature sets; in "nonempty" mode
                    # steps without any feature are skipped
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[ iv ].steps():
                            if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection( fs2 )
                else:
                    sys.exit( "Illegal overlap mode." )
                if fs is None or len( fs ) == 0:
                    write_to_samout( r, "no_feature" )
                    empty += 1
                elif len( fs ) > 1:
                    write_to_samout( r, "ambiguous[" + '+'.join( fs ) + "]" )
                    ambiguous += 1
                else:
                    write_to_samout( r, list(fs)[0] )
                    counts[ list(fs)[0] ] += 1
            except UnknownChrom:
                # The read is skipped without incrementing any counter.
                if not pe_mode:
                    rr = r
                else:
                    rr = r[0] if r[0] is not None else r[1]
                if not quiet:
                    sys.stderr.write( ( "Warning: Skipping read '%s', because chromosome " +
                        "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) %
                        ( rr.read.name, iv.chrom ) )

            if i % 100000 == 0 and not quiet:
                sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) )

    except:
        # Report where in the SAM input the failure happened, then re-raise.
        if not pe_mode:
            sys.stderr.write( "Error occured in %s.\n" % read_seq.get_line_number_string() )
        else:
            sys.stderr.write( "Error occured in %s.\n" % read_seq_pe_file.get_line_number_string() )
        raise

    if not quiet:
        sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) )

    if samoutfile is not None:
        samoutfile.close()

    # Result table on stdout (Python 2 print statements).
    print "Gene\tUnique_reads"
    for fn in sorted( counts.keys() ):
        print "%s\t%d" % ( fn, counts[fn])
    print "no_feature\t%d" % empty
    print "ambiguous\t%d" % ambiguous
    print "too_low_aQual\t%d" % lowqual
    print "not_aligned\t%d" % notaligned
    print "alignment_not_unique\t%d" % nonunique
def count_reads_in_features(sam_filename, gff_filename, samtype, order, stranded,
                            overlap_mode, feature_type, id_attribute, quiet, minaqual,
                            samout, include_non_annotated=False, htseq_no_ambiguous=True):
    """
    Annotate the records of a SAM/BAM file with the feature they overlap
    and write the annotated records (tag XF) to a new SAM/BAM file.

    This is taken from the function count_reads_in_features() from the
    script htseq-count in the HTSeq package version 0.61.p2
    The reason to do so is to fix two really small bugs related to the SAM
    output. The code of the function is small and simple so for now we will
    use the patched function here. A patch request has been sent to the
    HTSeq team.
    The description of the parameters are the same as htseq-count.
    Two parameters were added to filter out what to write in the sam output.

    :param sam_filename: path of the input SAM/BAM file.
    :param gff_filename: path of the annotation file (HTSeq.GFF_Reader).
    :param samtype: "sam" or "bam"; anything else raises ValueError before
                    any file is opened.
    :param order: "name" or "pos"; only used for paired-end input.
    :param stranded: "no" builds an unstranded feature array, "reverse"
                     inverts the strand of the read intervals, any other
                     value uses the read strand as is.
    :param overlap_mode: "union", "intersection-strict" or
                         "intersection-nonempty".
    :param feature_type: only GFF records of this type are used.
    :param id_attribute: GFF attribute used as the feature id.
    :param quiet: accepted for compatibility with htseq-count; not used.
    :param minaqual: records with aQual below this are discarded.
    :param samout: path of the output SAM/BAM file.
    :param include_non_annotated: when False, records assigned
                                  "__no_feature" are not written.
    :param htseq_no_ambiguous: when True, records whose assignment contains
                               "__ambiguous" are not written.
    :return: the number of records written to the output file.
    :raises ValueError: unknown samtype, illegal order, or a feature
                        lacking the id attribute or strand information.
    :raises RuntimeError: no feature of the requested type, the first
                          record cannot be read, or illegal overlap mode.

    The filter settings, the output file handle and the counter are stored
    as attributes of this function (filter_htseq, filter_htseq_no_ambiguous,
    samoutfile, annotated), as in the original implementation.

    The HTSEQ License
    HTSeq is free software: you can redistribute it and/or modify it under
    the terms of the GNU General Public License as published by the Free
    Software Foundation, either version 3 of the License, or (at your
    option) any later version.
    This program is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
    Public License for more details.
    The full text of the GNU General Public License, version 3, can be
    found here: http://www.gnu.org/licenses/gpl-3.0-standalone.html
    """
    # Validate the format first: the open flags below are derived from it,
    # so an unknown value must not get as far as touching any file.
    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    # Set up the filters
    count_reads_in_features.filter_htseq = \
        ["__too_low_aQual", "__not_aligned", "__alignment_not_unique"]
    if not include_non_annotated:
        count_reads_in_features.filter_htseq.append("__no_feature")
    count_reads_in_features.filter_htseq_no_ambiguous = htseq_no_ambiguous

    # Open SAM output file; the input is only opened to serve as template
    # for the output header and is closed again even if that fails.
    flag_write = "wb" if samtype == "bam" else "wh"
    flag_read = "rb" if samtype == "bam" else "r"
    saminfile = pysam.AlignmentFile(sam_filename, flag_read)
    try:
        count_reads_in_features.samoutfile = pysam.AlignmentFile(
            samout, flag_write, template=saminfile)
    finally:
        saminfile.close()

    # Counter of annotated records
    count_reads_in_features.annotated = 0

    # Function to write to SAM output
    def write_to_samout(r, assignment):
        if assignment in count_reads_in_features.filter_htseq:
            return
        if count_reads_in_features.filter_htseq_no_ambiguous \
                and "__ambiguous" in assignment:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is None:
                continue
            sam_record = read.to_pysam_AlignedRead(
                count_reads_in_features.samoutfile)
            sam_record.set_tag("XF", assignment, "Z")
            count_reads_in_features.samoutfile.write(sam_record)
            count_reads_in_features.annotated += 1

    # Reference intervals of the non-empty "M" CIGAR operations of one
    # alignment, optionally with the strand inverted.
    def matched_intervals(alignment, flip_strand):
        if flip_strand:
            return (invert_strand(co.ref_iv) for co in alignment.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in alignment.cigar
                if co.type == "M" and co.size > 0)

    # From here on the output file is open: close it on every exit path.
    try:
        # Annotation objects
        features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
        counts = {}
        for f in HTSeq.GFF_Reader(gff_filename):
            if f.type != feature_type:
                continue
            try:
                feature_id = f.attr[id_attribute]
            except KeyError:
                raise ValueError("Feature %s does not contain a '%s' attribute"
                                 % (f.name, id_attribute))
            if stranded != "no" and f.iv.strand == ".":
                raise ValueError("Feature %s at %s does not have strand information but you are "
                                 "running htseq-count in stranded mode. Use '--stranded=no'."
                                 % (f.name, f.iv))
            features[f.iv] += feature_id
            counts[feature_id] = 0

        if len(counts) == 0:
            raise RuntimeError("No features of type '%s' found.\n" % feature_type)

        # Peek at the first record to find out whether the data is paired-end.
        try:
            read_seq = SAM_or_BAM_Reader(sam_filename)
            first_read = next(iter(read_seq))
            pe_mode = first_read.paired_end
        except Exception:
            raise RuntimeError("Error occurred when reading beginning of SAM/BAM file.")

        if pe_mode:
            if order == "name":
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            elif order == "pos":
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
            else:
                raise ValueError("Illegal order specified.")

        reverse = stranded == "reverse"
        for r in read_seq:
            if not pe_mode:
                if not r.aligned:
                    write_to_samout(r, "__not_aligned")
                    continue
                # The try covers only the tag lookup so that an error while
                # writing is never mistaken for a missing NH tag.
                try:
                    multi_mapped = r.optional_field("NH") > 1
                except KeyError:
                    multi_mapped = False
                if multi_mapped:
                    write_to_samout(r, "__alignment_not_unique")
                    continue
                if r.aQual < minaqual:
                    write_to_samout(r, "__too_low_aQual")
                    continue
                iv_seq = matched_intervals(r, reverse)
            else:
                # r is a (first mate, second mate) pair; the strand of the
                # second mate is handled opposite to the first mate's.
                if r[0] is not None and r[0].aligned:
                    iv_seq = matched_intervals(r[0], reverse)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    iv_seq = itertools.chain(
                        iv_seq, matched_intervals(r[1], not reverse))
                elif (r[0] is None) or not (r[0].aligned):
                    write_to_samout(r, "__not_aligned")
                    continue
                try:
                    multi_mapped = \
                        (r[0] is not None and r[0].optional_field("NH") > 1) \
                        or (r[1] is not None and r[1].optional_field("NH") > 1)
                except KeyError:
                    multi_mapped = False
                if multi_mapped:
                    write_to_samout(r, "__alignment_not_unique")
                    continue
                if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                    write_to_samout(r, "__too_low_aQual")
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode in ("intersection-strict", "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            # in "nonempty" mode steps without any feature
                            # are skipped
                            if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    raise RuntimeError("Illegal overlap mode.")

                if fs is None or len(fs) == 0:
                    write_to_samout(r, "__no_feature")
                elif len(fs) > 1:
                    # sorted(): keep the tag text independent of set order
                    write_to_samout(r, "__ambiguous[" + '+'.join(sorted(fs)) + "]")
                else:
                    write_to_samout(r, list(fs)[0])
            except UnknownChrom:
                write_to_samout(r, "__no_feature")
    finally:
        count_reads_in_features.samoutfile.close()

    return count_reads_in_features.annotated