def Trim_Phix(Info,exp,storing_loc,pair_end): to_clean = [] to_clean.append(os.path.join(storing_loc,exp + '_Aligned.sam')) if pair_end: to_clean.append(os.path.join(storing_loc,exp + '_pairend_Aligned.sam')) for library in to_clean: Phix_blast = HTSeq.SAM_Reader(library) if to_clean.index(library) == 0: storage_file = os.path.join(storing_loc,exp + '_PhixCleaned.fastq') elif to_clean.index(library) == 1: storage_file = os.path.join(storing_loc,exp + '_pairend_PhixCleaned.fastq') with open(storage_file,"w") as Library_Trimmed_Phix: count_total = 0 count_selected = 0 for read in Phix_blast: count_total +=1 if not read.aligned: selected_fastq = read.read selected_fastq.write_to_fastq_file(Library_Trimmed_Phix) count_selected +=1 if count_total%1000000 == 0: print "Analyzed ", count_total, "sequences." string = "\tSaved in: %s\n\t\tSelected(NO Phix) sequences %i of %i."% (storage_file,count_selected,count_total) Info.print_save(exp,string)
def bam_count(args):
    """Count read pairs per gene from a paired-end SAM file and print a table.

    Args:
        args: object whose ``fi`` attribute is the path of the SAM file.

    Relies on a module-level ``exons`` feature array (indexed by an alignment
    interval, each step yielding a set of gene ids). Pairs where either mate
    is missing or unaligned are counted as ``_unmapped``; pairs overlapping
    no gene as ``_no_feature``; pairs overlapping several genes as
    ``_ambiguous``. One ``<gene id>\\t<count>`` line is printed per key.
    """
    bam = HTSeq.SAM_Reader(args.fi)
    cnts = collections.Counter()
    for bundle in HTSeq.pair_SAM_alignments_with_buffer(bam):
        # NOTE(review): this length check is kept from the original. Confirm
        # that pair_SAM_alignments_with_buffer really yields one-element
        # bundles here; if it yields bare (mate1, mate2) tuples, every item
        # has length 2 and is skipped.
        if len(bundle) != 1:
            continue
        aln1, aln2 = bundle[0]
        # The original `not aln1.aligned and aln2.aligned` parsed as
        # `(not aln1.aligned) and aln2.aligned`, so pairs with an unaligned
        # second mate fell through and were looked up with `iv` set to None.
        if aln1 is None or aln2 is None or not (aln1.aligned and aln2.aligned):
            cnts["_unmapped"] += 1
            continue
        gids = set()
        for aln in (aln1, aln2):
            for iv, val in exons[aln.iv].steps():
                gids |= val
        if len(gids) == 1:
            cnts[next(iter(gids))] += 1
        elif not gids:
            cnts["_no_feature"] += 1
        else:
            cnts["_ambiguous"] += 1
    for gid in cnts:
        print("%s\t%d" % (gid, cnts[gid]))
def output_merged_deletion_reads(input_sam_file):
    """Write one tab-separated row per read that spans a deletion/gap.

    Each SAM record is summarised by ``cigar_analyse``. A row
    (read id, read_start, read_start_clip, read_end_clip, read_end,
    insert_size, label) is written when the record has at least one clip and
    either both flanks are >= 30 ("merged") or, with more than one clip, the
    flank-derived size expression is >= 30 ("merged,multi_clip").

    Returns the path of the file written (text before the first ":" of the
    input path, plus "merged_pos.txt").
    """
    output_file = input_sam_file.split(":")[0] + "merged_pos.txt"
    input_sam = HTSeq.SAM_Reader(input_sam_file)
    row_format = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
    with open(output_file, "w") as output_list:
        for sam_line in input_sam:
            (clipping, read_start, read_start_clip, read_end_clip, read_end,
             insert_size, mapped_size) = cigar_analyse(sam_line)
            if clipping <= 0:
                continue
            left_flank = read_start_clip - read_start
            right_flank = read_end - read_end_clip
            if left_flank >= 30 and right_flank >= 30:
                label = "merged"
            elif clipping > 1 and (mapped_size + read_start + read_end_clip - read_end - read_start_clip) >= 30:
                label = "merged,multi_clip"
            else:
                continue
            read_id = sam_line.get_sam_line().split("\t")[0]
            output_list.write(row_format % (read_id, str(read_start),
                                            str(read_start_clip),
                                            str(read_end_clip), str(read_end),
                                            str(insert_size), label))
    return output_file
def __iter__(self):
    """Yield each record's interval, widened by `window_length` on both sides.

    The reader is chosen from the enclosing-scope `filetype` (BED, GFF/GTF,
    SAM or BAM, case-insensitive); if none of those match and
    ``self.filetype`` is "OTHER", records come from ``func(self.filepath)``.
    Any other type yields nothing. Intervals are modified in place.
    """
    kind = filetype.upper()
    if kind == "BED":
        records = HTSeq.BED_Reader(filepath)
    elif kind == "GFF" or kind == "GTF":
        records = HTSeq.GFF_Reader(filepath)
    elif kind == "SAM":
        records = HTSeq.SAM_Reader(filepath)
    elif kind == "BAM":
        records = HTSeq.BAM_Reader(filepath)
    elif self.filetype.upper() == "OTHER":
        records = func(self.filepath)
    else:
        return
    for record in records:
        interval = record.iv
        interval.start -= window_length
        interval.end += window_length
        yield interval
def count_single_file(sam_file, handle): print "counting alignments in " + sam_file alignment_iterator = HTSeq.SAM_Reader(sam_file) count = 0 for alignment in alignment_iterator: count += 1 handle.write(str(count) + " " + sam_file + "\n")
def main(cl=None): ''' Implements the Usage exception handler that can be raised from anywhere in process. ''' if cl is None: cl = CommandLine() else : cl = CommandLine(['-r']) try: print cl.args # print the parsed argument string alignment_file = HTSeq.SAM_Reader(cl.args["sam_file"]) # Get coverage for the whole genome cvg = HTSeq.GenomicArray( "auto", stranded=False, typecode='i' ) for alngt in alignment_file: if alngt.aligned: cvg[ alngt.iv ] += 1 # Write a "Wiggle" file for genome browser viewing cvg.write_bedgraph_file(cl.args["output_prefix"]+".wig") # Now need to iterate over every gene/transcript and get the # per-transcript coverage # gtf_file = HTSeq.GFF_Reader("/home/pvcastro/reference_known_genes.gtf") except Usage, err: cl.do_usage_and_die(err.msg)
def count_features(samfile, features=None, gtffile=None, readcounts=None, merge=False):
    """Count reads in features from an alignment.

    If no `readcounts` are given, every alignment counts as one read (i.e. a
    non-collapsed file is assumed to have been mapped).

    Args:
        samfile: mapped sam file
        features: annotations indexed by alignment interval; replaced by the
            exons read from `gtffile` when that is given
        gtffile: feature file
        readcounts: read counts from the original (un-collapsed) file, either
            a mapping ``sequence -> count`` or a DataFrame with ``seq`` and
            ``reads`` columns
        merge: whether to merge the gtf fields with the results
    Returns:
        dataframe with columns ``name`` and ``reads``, sorted by descending
        count; includes the ``_unmapped``, ``_no_feature`` and ``_ambiguous``
        bookkeeping rows when they occur
    """
    import collections

    if gtffile is not None:
        features = get_exons(HTSeq.GFF_Reader(gtffile))
    sam = HTSeq.SAM_Reader(samfile)
    if isinstance(readcounts, pd.DataFrame):
        readcounts = {r.seq: r['reads'] for i, r in readcounts.iterrows()}

    counts = collections.Counter()
    for almnt in sam:
        seq = str(almnt.read)
        # Weight of this alignment: collapsed-read multiplicity if known.
        if readcounts is not None and seq in readcounts:
            c = readcounts[seq]
        else:
            c = 1
        if not almnt.aligned:
            counts["_unmapped"] += c
            continue
        gene_ids = set()
        for iv, val in features[almnt.iv].steps():
            gene_ids |= val
        if len(gene_ids) == 1:
            counts[next(iter(gene_ids))] += c
        elif not gene_ids:
            counts["_no_feature"] += c
        else:
            counts["_ambiguous"] += c

    # list(...) because a dict view is not accepted as row data by every
    # pandas version.
    result = pd.DataFrame(list(counts.items()), columns=['name', 'reads'])
    result = result.sort_values(by='reads', ascending=False)
    um = ['_no_feature', '_unmapped']
    # `~` is the boolean-mask negation; unary `-` on a boolean mask is
    # rejected by current pandas/numpy.
    mapped = float(result[~result.name.isin(um)].reads.sum())
    total = result.reads.sum()
    percent = mapped / total * 100 if total else 0.0
    print('%s/%s reads counted, %.2f percent' % (mapped, total, percent))
    if merge and gtffile is not None:
        result = merge_features(result, gtffile)
    return result
def analyze_integration_reads(input_sam_file_1, input_sam_file_2):
    """Return the "@"-prefixed ids of reads aligned in BOTH SAM files."""
    reader_1 = HTSeq.SAM_Reader(input_sam_file_1)
    reader_2 = HTSeq.SAM_Reader(input_sam_file_2)

    def aligned_ids(reader):
        # The read id is the first tab-separated field of the SAM line.
        return set("@" + rec.get_sam_line().split("\t")[0]
                   for rec in reader if rec.aligned)

    ids_1 = aligned_ids(reader_1)
    ids_2 = aligned_ids(reader_2)
    return ids_1 & ids_2
def report_indel_from_amp(sortsam, ODNS, region_start, region_end):
    """Classify the alignments of a SAM file into edit categories.

    Every alignment that is aligned and passes ``match_base_filter`` and
    ``check_AS_filter`` is counted in ``Total_read`` and then assigned to at
    most one category, in priority order: ODN insertion, indel, substitution.

    Parameters
    ----------
    sortsam : path passed to ``HTSeq.SAM_Reader``
    ODNS : passed through to ``soft_clip_filter``
    region_start, region_end : bounds used to accept small indels and passed
        to ``check_MD_filter``

    Returns
    -------
    tuple ``(Total_read, ODNS_inserted, INDEL_read, substitution_read)``
    """
    Total_read = 0
    ODNS_inserted = 0
    INDEL_read = 0
    substitution_read = 0
    for aln in HTSeq.SAM_Reader(sortsam):  ### one alignment at a time, alignment independent of read name ###
        ### only assign one type of mutation for one read according to priority ODNs> INDEL> substitution ###
        if aln.aligned and match_base_filter(aln) and check_AS_filter(aln):  ### pass AS and # matches filter ###
            Total_read += 1  ### count number of reads being edited ###
            if soft_clip_filter(ODNS, aln):  ### soft-clipped part contain ODNs == ODN insertion ###
                ODNS_inserted += 1
            else:
                cigar_property = [(c.type, c.size) for c in aln.cigar]
                # Any insertion longer than 20 is a candidate ODN insertion.
                if [p for p in cigar_property if p[0] == "I" and p[1] > 20 ] != []:
                    if check_ODN_presence_insertion(aln):  ### count ODNS insertion marked as I ###
                        ODNS_inserted += 1
                    else:
                        INDEL_read += 1  ### these reads were either modified or not modified, so record all the indel mutations ###
                # NOTE(review): `and` binds tighter than `or`, so this is
                # `(size < 20 and type == "D") or type == "I"` -- confirm that
                # `size < 20 and (D or I)` was not intended.
                elif [ p[0] for p in cigar_property if p[1] < 20 and p[0] == "D" or p[0] == "I" ]:  ### distinguish insertion from ODNS insertion ###
                    # NOTE(review): same precedence caveat; as written this is
                    # `type == "D" or (type == "I" and size < 20)`, which does
                    # not mirror the condition above -- confirm intent.
                    indel_region = [ [c.ref_iv.start, c.ref_iv.end] for c in aln.cigar if c.type == "D" or c.type == "I" and c.size < 20 ]
                    # Keep only indels lying entirely inside the region.
                    valid_indel = [ r for r in indel_region if r[0] >= region_start and r[1] <= region_end ]
                    if len(valid_indel) > 0:  ### count indel over substitutions ###
                        INDEL_read += 1
                    #### Cannot sort out substitution ###
                else:
                    if "M" in [p[0] for p in cigar_property]:  ### these are reads having everything as matches, so check MD to see if there are any mutations ###
                        SNP_genomic_coord, MD_bool = check_MD_filter( aln, region_start, region_end)
                        print SNP_genomic_coord  ### check if subtitution within pcr region???###
                        # NOTE(review): 43737486 is a hard-coded coordinate
                        # that is excluded from substitution calls; its
                        # meaning is not visible here -- confirm.
                        if MD_bool == True and [ s[0] for s in SNP_genomic_coord if s[1] != 43737486 ] != []:
                            substitution_read += 1
    return Total_read, ODNS_inserted, INDEL_read, substitution_read
def main():
    """Pick NTARG expressed genes at random and write them to AS_genes_list.txt.

    Uses the module-level ``args`` (``group1``/``group2`` SAM file lists),
    ``name_gtf`` (annotation path) and ``NTARG`` (number of genes to draw).
    After checking that the SAM and GFF chromosome names agree, per-gene
    counts are computed for both groups with ``meanVar``; genes present in
    both groups with no zero count in either are eligible, and NTARG of them
    are sampled without replacement.

    Exits with status 1 when the chromosome name of the first aligned SAM
    record does not occur in the GFF file.
    """
    group1_f = args.group1
    group2_f = args.group2

    ##### Sanity Check
    # Use the first *aligned* record: an unaligned record has no interval.
    is_chr_sam = None
    for almnt in HTSeq.SAM_Reader(group1_f[0]):
        if almnt.aligned:
            is_chr_sam = almnt.iv.chrom
            break
    # The original left this first GTF handle open; `with` closes it.
    with open(name_gtf, 'r') as file_gtf:
        set_chr_gff = set(feature.iv.chrom
                          for feature in HTSeq.GFF_Reader(file_gtf))
    if is_chr_sam not in set_chr_gff:
        sys.stderr.write(
            "Error: Chromosome id in SAM files and GFF file does not agree!!\n"
        )
        sys.exit(1)
    #####

    # The GFF reader is consumed by each pass, so it is re-opened per group.
    with open(name_gtf, 'r') as file_gtf:
        counts1 = meanVar(group1_f, HTSeq.GFF_Reader(file_gtf), 'group1')
    with open(name_gtf, 'r') as file_gtf:
        counts2 = meanVar(group2_f, HTSeq.GFF_Reader(file_gtf), 'group2')

    # Genes seen in both groups; one lookup per gene instead of the original
    # all-pairs scan over both dictionaries.
    merged_count = {
        gene: values + counts2[gene]
        for gene, values in counts1.items() if gene in counts2
    }
    non0_exp_list = [
        gene for gene, values in sorted(merged_count.items())
        if not (np.array(values) == 0).any()
    ]
    num_non0 = len(non0_exp_list)
    sys.stderr.write(
        "randomly choose %d out of %d genes as the final target genes that are to undergo AS\n"
        % (NTARG, num_non0))
    chosen = random.sample(non0_exp_list, NTARG)
    with open('AS_genes_list.txt', 'w') as out:
        for gene in chosen:
            out.write(gene + "\n")
def load_sam_or_bam(sam_filename):
    """Return an iterator over the alignments of a ``.sam`` or ``.bam`` file.

    The reader is picked from the (case-sensitive) file extension; any other
    extension prints a message to stderr and exits with status 1.
    """
    extension = os.path.splitext(sam_filename)[1]
    if extension == ".sam":
        return iter(HTSeq.SAM_Reader(sam_filename))
    if extension == ".bam":
        return iter(HTSeq.BAM_Reader(sam_filename))
    print >> sys.stderr, "Problem with SAM/BAM File:", sam_filename
    sys.exit(1)
def htseq_reader(align_file):
    """
    Return a read-by-read HTSeq reader for a SAM or BAM file.

    Logs an error and exits with status 1 when the file is neither.
    """
    if bam.is_sam(align_file):
        return HTSeq.SAM_Reader(align_file)
    if bam.is_bam(align_file):
        return HTSeq.BAM_Reader(align_file)
    logger.error("%s is not a SAM or BAM file" % (align_file))
    sys.exit(1)
def htcount(samfile):
    """Return a ``defaultdict`` mapping reference name -> aligned-read count.

    A progress line with the number of records seen so far is written to
    stderr every million records.
    """
    ref2cnt = defaultdict(int)
    for n_seen, hit in enumerate(HTSeq.SAM_Reader(samfile), 1):
        if hit.aligned:
            ref2cnt[hit.iv.chrom] += 1
        if n_seen % 1000000 == 0:
            sys.stderr.write("%d\n" % n_seen)
    return ref2cnt
def test_htseq(self):
    """htseq basic test for sam file reading"""
    import HTSeq

    samfile = os.path.join(base.datadir, 'test.sam')
    # One (sequence, read name, reference name) row per aligned record.
    rows = [(aln.read.seq.decode(), aln.read.name, aln.iv.chrom)
            for aln in HTSeq.SAM_Reader(samfile)
            if aln.aligned == True]
    df = pd.DataFrame(rows, columns=['seq', 'read', 'name'])
    return
def __create_genomic_signals(self, stranded=True, func=None, use_wrappers=True):
    """Prepare ``self.coverage`` and ``self.library_size`` from ``self.filepath``.

    The input format is taken from ``self.filetype`` (case-insensitive).

    :param stranded: passed to ``HTSeq.GenomicArray`` for the coverage array
    :param func: reader factory used when the file type is "OTHER"; called
        with ``self.filepath`` and expected to yield records with an ``iv``
    :param use_wrappers: for BED, use ``BedWrapper`` instead of building the
        coverage record by record; for BAM the wrapper is not implemented
    :raises NotImplementedError: for BAM with ``use_wrappers`` and for BedGraph
    :raises AssertionError: for an unknown file type
    """
    stderr.write("Creating %s signal. It may take few minutes...\n" % self.name)
    self.coverage = HTSeq.GenomicArray("auto", stranded=stranded, typecode="d")
    self.library_size = 0
    filetype = self.filetype.upper()

    if filetype == "BED":
        if use_wrappers:
            self.coverage = BedWrapper(self.filepath)
            return
        records = HTSeq.BED_Reader(self.filepath)
    elif filetype in ("GFF", "GTF"):
        records = HTSeq.GFF_Reader(self.filepath)
    elif filetype == "SAM":
        records = HTSeq.SAM_Reader(self.filepath)
    elif filetype == "BAM":
        if use_wrappers:
            # The original had an unreachable BamWrapper assignment after
            # this raise; it has been dropped.
            raise NotImplementedError(
                "Bam wrapper is not yet implemented!")
        records = HTSeq.BAM_Reader(self.filepath)
    elif filetype in ("BG", "BEDGRAPH"):
        raise NotImplementedError("BedGraph is not yet implemented!")
    elif filetype in ("BW", "BIGWIG"):
        self.coverage = BigWigWrapper(self.filepath)
        return
    elif filetype == "OTHER":
        records = func(self.filepath)
    else:
        # Explicit raise: a bare `assert False` disappears under `python -O`.
        raise AssertionError("I should not be here!")

    # Every record adds one unit of coverage over its interval.
    for record in records:
        self.coverage[record.iv] += 1
        self.library_size += 1
def set_up_IO(fileIN, fileOUT, gff, downstream, upstream):
    '''Open every file needed for the alignment processing.

    Returns a tuple (alignments bundled per read, GFF reader, output handle).
    The output handle is returned open, with a header row already written:
    "name" followed by the positions -upstream .. downstream-1.
    '''
    ## Alignments, with the multiple alignments of a read bundled together
    alignIN = HTSeq.bundle_multiple_alignments(HTSeq.SAM_Reader(fileIN))
    ## Annotation
    annotation = HTSeq.GFF_Reader(gff, end_included=True)
    ## Output table and its header
    countTable = open(fileOUT, 'w')
    positions = [str(offset) for offset in range(-upstream, downstream)]
    countTable.write('name\t{coord}\n'.format(coord='\t'.join(positions)))
    return alignIN, annotation, countTable
def aln_generator_from_single_samfile(samfile):
    """ Generator over a SAM-format file: yield (readname, alignment_object_list) tuples.

    Consecutive records sharing a read name are grouped into one list, so a
    read with several alignments is reported once if its lines are adjacent.
    """
    pending_name = ''
    pending_alns = []
    for record in HTSeq.SAM_Reader(samfile):
        name = record.read.name
        if name != pending_name:
            # Read name changed: flush the finished group, start a new one.
            if pending_name or pending_alns:
                yield (pending_name, pending_alns)
            pending_name = name
            pending_alns = [record]
            continue
        pending_alns.append(record)
    # The last group is still pending once the file is exhausted.
    if pending_name or pending_alns:
        yield (pending_name, pending_alns)
def analyze_single_deletion_reads(input_sam_file):
    """Return "@"-prefixed ids of read pairs that show a gap in either mate.

    Only pairs with both mates aligned are considered. A mate is "gapped"
    when its CIGAR has an N operation or a D operation longer than 2.
    The result is (ids gapped in mate 1 or mate 2) minus (ids ungapped in
    both mates).
    """

    def has_gap(aln):
        return any(op.type == "N" or (op.type == "D" and op.size > 2)
                   for op in aln.cigar)

    def read_id(aln):
        return "@" + aln.get_sam_line().split("\t")[0]

    # Index 0 holds mate 1, index 1 holds mate 2.
    gapped = (set(), set())
    ungapped = (set(), set())

    pairs = HTSeq.pair_SAM_alignments(HTSeq.SAM_Reader(input_sam_file))
    for pair in pairs:
        first = pair[0]
        if first is None or not first.aligned:
            continue
        second = pair[1]
        if second is None or not second.aligned:
            continue
        for mate_index, mate in enumerate((first, second)):
            bucket = gapped if has_gap(mate) else ungapped
            bucket[mate_index].add(read_id(mate))

    return (gapped[0] | gapped[1]) - (ungapped[0] & ungapped[1])
def get_aligned_reads(samfile, collapsed=None, readcounts=None):
    """Get all aligned reads from a sam file into a pandas dataframe.

    Args:
        samfile: path of the SAM file (converted with ``str``)
        collapsed: optional collapsed-reads file; when given, read counts are
            loaded from it with ``read_collapsed_file``
        readcounts: optional dataframe with a ``seq`` column, merged onto the
            result (replaced by the `collapsed` data when that is given)
    Returns:
        dataframe with columns seq, name, start, end, strand, length, any
        merged read-count columns, and align_id (the row index)
    """
    sam = HTSeq.SAM_Reader(str(samfile))
    rows = [(a.read.seq.decode(), a.read.name, a.iv.chrom,
             a.iv.start, a.iv.end, a.iv.strand)
            for a in sam if a.aligned]
    counts = pd.DataFrame(
        rows, columns=['seq', 'read', 'name', 'start', 'end', 'strand'])
    counts['length'] = counts.seq.str.len()
    # Keyword `axis`: the positional form drop(labels, 1) was removed in
    # pandas 2.
    counts = counts.drop(['read'], axis=1)
    if collapsed is not None:
        readcounts = read_collapsed_file(collapsed)
    if readcounts is not None:
        counts = counts.merge(readcounts, on='seq')
    counts['align_id'] = counts.index
    return counts
def getAllMappedReadsSam(annot_reads, htseq_no_ambiguous = False): ''' creates a map with the read names that are annotated and mapped and their mapping scores,chromosome and gene We assume the gtf file has its gene ids replaced by gene names ''' filter = ["no_feature","ambiguous", "too_low_aQual","not_aligned", "alignment_not_unique"] mapped = dict() sam = HTSeq.SAM_Reader(annot_reads) for alig in sam: gene_name = str(alig.optional_field("XF")) if gene_name in filter or not alig.aligned or \ (htseq_no_ambiguous and gene_name.find("ambiguous") != -1): continue strand = str(alig.pe_which) name = str(alig.read.name) #seq = str(alig.read.seq) #qual = str(alig.read.qualstr) mapping_quality = int(alig.aQual) if alig.mate_start: chromosome = alig.mate_start.chrom else: chromosome = "Unknown" if strand == "first": name += "/1" elif strand == "second": name += "/2" else: print "Warning : un-strander read " + str(name) continue ## not possible mapped[name] = (mapping_quality,gene_name,chromosome) # there should not be collisions return mapped
def readcount(myfile):
    """Count the alignments of `myfile` per gene; return ``(myfile, Counter)``.

    Uses the module-level ``exons`` feature array. Unaligned records go to
    "_unmapped", alignments over no gene to "_no_feature", and alignments
    over more than one gene to "_ambiguous".
    """
    counts = collections.Counter()
    for almnt in HTSeq.SAM_Reader(myfile):
        if almnt.aligned:
            gene_ids = set()
            for iv, val in exons[almnt.iv].steps():
                gene_ids |= val
            if not gene_ids:
                key = "_no_feature"
            elif len(gene_ids) == 1:
                key = list(gene_ids)[0]
            else:
                key = "_ambiguous"
        else:
            key = "_unmapped"
        counts[key] += 1
    return myfile, counts
def analyze_merged_deletion_reads(input_sam_file):
    """Return "@"-prefixed ids of aligned reads showing a gap.

    A record is "gapped" when its CIGAR has an N operation or a D operation
    longer than 2. Ids that also occur on an ungapped aligned record are
    removed from the result.
    """
    gapped_ids = set()
    plain_ids = set()
    for record in HTSeq.SAM_Reader(input_sam_file):
        if not record.aligned:
            continue
        read_id = "@" + record.get_sam_line().split("\t")[0]
        is_gapped = any(op.type == "N" or (op.type == "D" and op.size > 2)
                        for op in record.cigar)
        if is_gapped:
            gapped_ids.add(read_id)
        else:
            plain_ids.add(read_id)
    return gapped_ids - plain_ids
def count_tRNAs(samples, tRNA_features, output_path='tRNA-counts.tsv'):
    """Count alignments per tRNA for each sample and write a TSV summary.

    Args:
        samples: iterable of dicts with keys ``'sample'`` (label) and
            ``'sam_path'`` (SAM file). Each dict is updated in place with
            ``'counts'`` (a Counter) and ``'fractions'`` (a dict).
        tRNA_features: feature array indexed by an alignment interval, whose
            steps yield sets of tRNA ids.
        output_path: TSV file to (over)write, with columns
            sample, tRNA, count, fraction.

    Returns:
        list of the updated sample dicts, in input order.

    Alignments hitting exactly one tRNA are counted under its id, alignments
    hitting several under ``'unk'``; both contribute to the denominator of
    ``fraction``. ``'_unmapped'`` and ``'_not_tRNA'`` are tracked in the
    sample dict but not written to the file.
    """
    samples_with_counts = []
    # A single handle for the whole run: the original re-opened the file in
    # append mode for every row, which could interleave with the header still
    # buffered in the 'w' handle.
    with open(output_path, 'w') as f:
        f.write('sample\ttRNA\tcount\tfraction\n')
        for sample in samples:
            print(f'counting tRNAs in {sample["sample"]}')
            counts = collections.Counter()
            sample['counts'] = counts
            tRNA_count_total = 0
            for almnt in HTSeq.SAM_Reader(sample['sam_path']):
                if not almnt.aligned:
                    counts['_unmapped'] += 1
                    continue
                # All tRNA ids the alignment could map to.
                aligned_tRNA_IDs = set()
                for iv, val in tRNA_features[almnt.iv].steps():
                    aligned_tRNA_IDs |= val
                if len(aligned_tRNA_IDs) == 1:
                    counts[next(iter(aligned_tRNA_IDs))] += 1
                    tRNA_count_total += 1
                elif not aligned_tRNA_IDs:
                    counts['_not_tRNA'] += 1
                else:
                    counts['unk'] += 1  # ambiguous
                    tRNA_count_total += 1

            sample['fractions'] = {}
            for tRNA, count in sorted(counts.items()):
                # No tRNA alignments at all: report 0.0 rather than dividing
                # by zero.
                if tRNA_count_total:
                    fraction = round(count / tRNA_count_total, 5)
                else:
                    fraction = 0.0
                sample['fractions'][tRNA] = fraction
                if tRNA not in ('_unmapped', '_not_tRNA'):
                    f.write(
                        f'{sample["sample"]}\t{tRNA}\t{count}\t{fraction}\n')
            samples_with_counts.append(sample)
    return samples_with_counts
def count_reads(start_codon_sites, stop_codon_sites, ORF_features, counts,
                map_file, stranded, min_quality, count_mode,
                first_exclude_codons, last_exclude_codons, min_read, max_read,
                exclude_min_ORF):
    """Count alignments per ORF and add the tallies to `counts`.

    Records of `map_file` (BAM or SAM, detected with pysam) are skipped and
    tallied when unaligned, when their NH field is > 1, when their alignment
    quality is below `min_quality`, or when the read length lies outside
    [`min_read`, `max_read`]. The remaining records are assigned to an ORF
    through the "M" CIGAR intervals looked up in `ORF_features`, using
    `count_mode` ("intersection-strict" or "union"); only records matching
    exactly one ORF are counted.

    A record is not counted when its directional start lies within
    `first_exclude_codons` codons of the ORF's entry in `start_codon_sites`,
    or its directional end within `last_exclude_codons` codons of the entry
    in `stop_codon_sites`. That exclusion is bypassed when the read length
    is below `exclude_min_ORF`.

    Returns `counts` (updated in place), including the ``__``-prefixed
    bookkeeping keys.
    """
    lowqual = 0
    notaligned = 0
    nonunique = 0
    too_short = 0
    too_long = 0
    min_read_string = "__too_short(<%i)" % min_read
    # NOTE(review): the label says "<" for the too-long bucket as well; it is
    # a runtime key, so it is left untouched here.
    max_read_string = "__too_long(<%i)" % max_read
    # Codons -> nucleotides.
    first_exclude_nt = first_exclude_codons * 3
    last_exclude_nt = last_exclude_codons * 3
    # pysam is only used to detect the container format.
    pysam_fh = pysam.AlignmentFile(map_file)
    is_bam = pysam_fh.is_bam
    pysam_fh.close()
    if is_bam:
        tracks = HTSeq.BAM_Reader(map_file)
    else:
        tracks = HTSeq.SAM_Reader(map_file)
    for r in tracks:
        if not r.aligned:
            notaligned += 1
            continue
        try:
            if r.optional_field("NH") > 1:
                nonunique += 1
                continue
        except KeyError:
            # No NH field: the record is kept.
            pass
        if r.aQual < min_quality:
            lowqual += 1
            continue
        read_len = len(r.read.seq)
        if read_len < min_read:
            too_short += 1
            continue
        if read_len > max_read:
            too_long += 1
            continue
        # Intervals of the matched CIGAR blocks, strand-flipped for
        # "reverse" libraries.
        if stranded != "reverse":
            iv_seq = (co.ref_iv for co in r.cigar if co.type == "M" and co.size > 0)
        else:
            iv_seq = (invert_strand(co.ref_iv) for co in r.cigar if co.type == "M" and co.size > 0)
        try:
            if count_mode == "intersection-strict":
                # ORFs covering every step of every matched block.
                fs = None
                for iv in iv_seq:
                    for iv2, fs2 in ORF_features[iv].steps():
                        if fs is None:
                            fs = fs2.copy()
                        else:
                            fs = fs.intersection(fs2)
            elif count_mode == "union":
                # ORFs covering any step of any matched block.
                fs = set()
                for iv in iv_seq:
                    for iv2, fs2 in ORF_features[iv].steps():
                        fs = fs.union(fs2)
            # NOTE(review): for any other count_mode `fs` is never assigned;
            # the resulting error is swallowed by the bare `except` below and
            # reported once per record.
            if fs is None or len(fs) == 0:
                continue
            elif len(fs) > 1:
                continue
            else:
                orf_id = list(fs)[0]
                # NOTE(review): the read length is compared with a parameter
                # named after the ORF -- confirm this is intended.
                if read_len < exclude_min_ORF:
                    counts[orf_id] += 1
                    continue
                try:
                    if abs(start_codon_sites[orf_id] - r.iv.start_d) < first_exclude_nt:
                        continue
                    elif abs(r.iv.end_d - stop_codon_sites[orf_id]) < last_exclude_nt:
                        continue
                    else:
                        counts[orf_id] += 1
                except:
                    # Any failure in the exclusion test (e.g. an ORF missing
                    # from the codon-site tables) counts the record anyway.
                    counts[orf_id] += 1
        except:
            sys.stderr.write(
                "Error occurred when processing mapping file in line:%s\n" %
                r.get_sam_line())
    counts["__too_low_quality"] += lowqual
    counts["__not_aligned"] += notaligned
    counts[min_read_string] += too_short
    counts[max_read_string] += too_long
    counts["__alignment_not_unique"] += nonunique
    return counts
def sciRNAseq_count(sample, input_folder, exons, genes, gene_end, gene_annotat, sample_ID):
    """Assign the alignments of one sample to genes and write two reports.

    Reads ``<input_folder>/<sample>.sam`` and assigns each alignment through
    a cascade: exon intersection -> exon union -> gene-body intersection ->
    gene-body union -> "_no_feature". When a set holds several genes,
    ``find_nearest_gene`` picks one using the alignment's directional end and
    `gene_end`. Gene-body matches are stored under ``<gene>_intron``.

    Writes ``<sample>.report`` (one ``category,cell_ID,count`` line for each
    of 11 categories) and ``<sample>.count`` (one
    ``annotation,cell_ID,count`` line per counted gene key). ``cell_ID`` is
    the 1-based position of `sample` in `sample_ID`.

    Returns 0.
    """
    input_sam = input_folder + "/" + sample + ".sam"
    report = input_folder + "/" + sample + ".report"
    count_output = input_folder + "/" + sample + ".count"
    counts = collections.Counter()
    sam_file = input_sam
    almnt_file = HTSeq.SAM_Reader(sam_file)
    sam_name = sample  # not used below
    cell_ID = sample_ID.index(sample) + 1
    # One tally per cascade stage.
    perfect_inter_exon = 0
    nearest_inter_exon = 0
    perfect_combine_exon = 0
    nearest_combine_exon = 0
    perfect_inter_gene = 0
    nearest_inter_gene = 0
    perfect_combine_gene = 0
    nearest_combine_gene = 0
    print("Start read the input file: " + sam_file + "....")
    for alnmt in almnt_file:
        if not alnmt.aligned:
            counts["_unmapped"] += 1
            continue
        # Alignments on a chromosome absent from `genes` also count as unmapped.
        if alnmt.iv.chrom not in genes.chrom_vectors:
            counts["_unmapped"] += 1
            continue
        # First check the intersection with exons
        gene_id_intersect = set()
        gene_id_combine = set()
        inter_count = 0
        for cigop in alnmt.cigar:
            if cigop.type != "M":
                continue
            for iv, val in exons[cigop.ref_iv].steps():
                gene_id_combine |= val
                # The first step seeds the intersection; later steps narrow it.
                if inter_count == 0:
                    gene_id_intersect |= val
                    inter_count += 1
                else:
                    gene_id_intersect &= val
        # first check the intersection set
        if len(gene_id_intersect) == 1:
            gene_id = list(gene_id_intersect)[0]
            counts[gene_id] += 1
            perfect_inter_exon += 1
        elif len(gene_id_intersect) > 1:
            gene_id = find_nearest_gene(alnmt.iv.end_d, gene_id_intersect, gene_end)
            counts[gene_id] += 1
            nearest_inter_exon += 1
        else:
            # if there is no intersection match, then use the union set
            if len(gene_id_combine) == 1:
                gene_id = list(gene_id_combine)[0]
                counts[gene_id] += 1
                perfect_combine_exon += 1
            elif len(gene_id_combine) > 1:
                gene_id = find_nearest_gene(alnmt.iv.end_d, gene_id_combine, gene_end)
                counts[gene_id] += 1
                nearest_combine_exon += 1
            else:
                # if there is no intersection match or union match, then search for genes to find the intronic match
                gene_id_intersect = set()
                gene_id_combine = set()
                inter_count = 0
                for cigop in alnmt.cigar:
                    if cigop.type != "M":
                        continue
                    for iv, val in genes[cigop.ref_iv].steps():
                        gene_id_combine |= val
                        if inter_count == 0:
                            gene_id_intersect |= val
                            inter_count += 1
                        else:
                            gene_id_intersect &= val
                if len(gene_id_intersect) == 1:
                    gene_id = list(gene_id_intersect)[0] + "_intron"
                    counts[gene_id] += 1
                    perfect_inter_gene += 1
                elif len(gene_id_intersect) > 1:
                    gene_id = find_nearest_gene(alnmt.iv.end_d, gene_id_intersect, gene_end) + "_intron"
                    counts[gene_id] += 1
                    nearest_inter_gene += 1
                else:
                    # if there is no intersection match, then use the union set
                    if len(gene_id_combine) == 1:
                        gene_id = list(gene_id_combine)[0] + "_intron"
                        counts[gene_id] += 1
                        perfect_combine_gene += 1
                    elif len(gene_id_combine) > 1:
                        gene_id = find_nearest_gene(alnmt.iv.end_d, gene_id_combine, gene_end) + "_intron"
                        counts[gene_id] += 1
                        nearest_combine_gene += 1
                    else:
                        counts["_no_feature"] += 1
    print("File name: ", sam_file)
    print("1: Perfect intersect exon match: ", perfect_inter_exon)
    print("2: Nearest intersect exon match: ", nearest_inter_exon)
    print("3: Perfect combine exon match: ", perfect_combine_exon)
    print("4: Nearest combine exon match: ", nearest_combine_exon)
    print("5: Perfect intersect gene match: ", perfect_inter_gene)
    print("6: Nearest intersect gene match: ", nearest_inter_gene)
    print("7: Perfect combine gene match: ", perfect_combine_gene)
    print("8: Nearest combine gene match: ", nearest_combine_gene)
    # Nothing in this function increments the two "_ambiguous*" keys, so the
    # Counter reports 0 for them.
    print("9: ambiguous match for exons: ", counts["_ambiguous"])
    print("10: ambiguous match for genes: ", counts["_ambiguous_intron"])
    print("11: No match: ", counts["_no_feature"])
    print("Sam file analysis finished~")
    # `report` is rebound from the path string to the open file handle.
    with open(report, 'w') as report:
        report.write("1" + "," + str(cell_ID) + "," + str(perfect_inter_exon) + "\n")
        report.write("2" + "," + str(cell_ID) + "," + str(nearest_inter_exon) + "\n")
        report.write("3" + "," + str(cell_ID) + "," + str(perfect_combine_exon) + "\n")
        report.write("4" + "," + str(cell_ID) + "," +
                     str(nearest_combine_exon) + "\n")
        report.write("5" + "," + str(cell_ID) + "," + str(perfect_inter_gene) + "\n")
        report.write("6" + "," + str(cell_ID) + "," + str(nearest_inter_gene) + "\n")
        report.write("7" + "," + str(cell_ID) + "," + str(perfect_combine_gene) + "\n")
        report.write("8" + "," + str(cell_ID) + "," + str(nearest_combine_gene) + "\n")
        report.write("9" + "," + str(cell_ID) + "," + str(counts["_ambiguous"]) + "\n")
        report.write("10" + "," + str(cell_ID) + "," + str(counts["_ambiguous_intron"]) + "\n")
        report.write("11" + "," + str(cell_ID) + "," + str(counts["_no_feature"]) + "\n")
    # `count_output` is likewise rebound to the open handle.
    with open(count_output, 'w') as count_output:
        for gene in counts:
            if (gene in [ "_unmapped", "_ambiguous", "_ambiguous_intron", "_no_feature" ]):
                continue
            else:
                # presumably gene_annotat is a pandas DataFrame indexed by
                # gene key, with the reported annotation in column 4 -- verify.
                line = str(gene_annotat.loc[gene, 4]) + "," + str(
                    cell_ID) + "," + str(counts[gene]) + "\n"
                count_output.write(line)
    return 0
# Sort your SAM file by read ID, so that multiple mappings are in adjacent
# lines, and then use this script to keep only the best one.
# Written by Simon Anders
#
# Call this script with:
#   sort samfile.sam | python chooseBest.py > filtered.sam
import re
import sys

import HTSeq

insam = HTSeq.SAM_Reader(sys.stdin)

# Go through all reads, with their alignments bundled up:
for bundle in HTSeq.bundle_multiple_alignments(insam):
    bestAlmt = None
    tied = False
    # Go through all alignments of a given read, looking
    # for the one with the best alignment score
    for almt in bundle:
        if bestAlmt is None or almt.aQual > bestAlmt.aQual:
            bestAlmt = almt
            tied = False
        elif almt.aQual == bestAlmt.aQual:
            # More than one alignment shares the best score so far. The
            # original compared the score with the alignment object itself
            # (never equal), and resetting bestAlmt to None let a later,
            # worse alignment win; a flag keeps the tie visible instead.
            tied = True
    # If there is more than one best alignment, skip the read.
    if bestAlmt is not None and not tied:
        # Change the NH field to 1 and print the line
        sys.stdout.write(
            re.sub(r"NH:i:\d+", "NH:i:1", bestAlmt.original_sam_line) + "\n")
# Fragment of a counting script: build the feature lookup from `gtf`, open
# the alignment file, and count reads with the single- or paired-end counter.
sys.stdout.flush()
# NOTE(review): `feature_array` is only assigned for 'yes' and 'no'; any
# other value of args.stranded leaves it undefined for the loop below.
if args.stranded == 'yes':
    feature_array = hts.GenomicArrayOfSets("auto", stranded=True)
elif args.stranded == 'no':
    feature_array = hts.GenomicArrayOfSets("auto", stranded=False)
# Register the name of every feature of the requested type on its interval.
for feature in gtf:
    if feature.type == args.type:
        feature_array[feature.iv] += feature.name
print "done.\n\n"
# create Reader class for samfile:
# (any format other than 'sam' is read as BAM)
if args.format == 'sam':
    alnmt_file = hts.SAM_Reader(args.alignment_file[0])
else:
    alnmt_file = hts.BAM_Reader(args.alignment_file[0])
# count reads:
print "Counting reads..."
if args.read_type == 'single_end':
    counts = ungapped_se_counter(alnmt_file, feature_array)
    print "\nSample output for ungapped SE counts:"
    # Show the last ten entries in key order.
    countlist = sorted(counts.items())
    for g, c in countlist[-10:]:
        print "%-10s %d" % (g, c)
else:
    counts = ungapped_pe_counter(alnmt_file, feature_array)
"""Write long unaligned reads of a SAM file to FASTQ and plot their lengths.

Usage: <script> input.sam output.fastq

Unaligned reads longer than 200 bases are written to the FASTQ file; a
10-bin histogram of the lengths of all unaligned reads is saved next to it as
``<output>.png``.
"""
import sys

import matplotlib.pyplot as plt

USAGE = "Please enter input file (.sam) and output file (.fastq)!"

if len(sys.argv) < 3:
    print(USAGE)
    # Non-zero status so that calling scripts can detect the usage error.
    sys.exit(1)

input_file = sys.argv[1]
output_file = sys.argv[2]

if not (input_file.endswith(".sam") and output_file.endswith(".fastq")):
    print(USAGE)
    sys.exit(1)

# Imported after argument validation, as in the original, so that a usage
# error is reported before the heavier imports are attempted.
import HTSeq
import numpy as np

alignment_file = HTSeq.SAM_Reader(input_file)
len_reads = []

# `with` closes the FASTQ file even if reading the SAM file fails.
with open(output_file, "w") as my_fastq_file:
    for aln in alignment_file:
        if aln.aligned:
            continue
        read_length = len(aln.read.seq)
        len_reads.append(read_length)
        if read_length > 200:
            myread = HTSeq.SequenceWithQualities(
                aln.read.seq, aln.read.name, aln.read.qualstr)
            myread.write_to_fastq_file(my_fastq_file)

# The IPython magic `%matplotlib inline` was removed: it is a syntax error
# when this file is run as a plain Python script.
plt.hist(len_reads, bins=10)
plt.savefig(output_file + ".png")
#We need this little helper below: def reverse_strand(s): if s == "+": return "-" elif s == "-": return "+" else: raise SystemError, "illegal strand" # Now go through the aligned reads if not is_BAM: tmp_obj = HTSeq.SAM_Reader(sam_file) else: tmp_obj = HTSeq.BAM_Reader(sam_file) if not is_PE: num_reads = 0 # for a in HTSeq.SAM_Reader( sam_file ): for a in tmp_obj: if not a.aligned: counts['_notaligned'] += 1 continue if a.aQual < minaqual: counts['_lowaqual'] += 1 continue rs = set()
#We need this little helper below: def reverse_strand( s ): if s == "+": return "-" elif s == "-": return "+" else: raise SystemError, "illegal strand" # Now go through the aligned reads if not is_PE: num_reads = 0 for a in HTSeq.SAM_Reader( sam_file ): if not a.aligned: counts[ '_notaligned' ] += 1 continue if a.aQual < minaqual: counts[ '_lowaqual' ] += 1 continue rs = set() for cigop in a.cigar: if cigop.type != "M": continue if reverse: cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) for iv, s in features[cigop.ref_iv].steps( ): rs = rs.union( s ) set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] )