def parse_fastx_sam_parallel(fastx_infile, sam_infile):
    """ Parse fastx and resulting sam file in parallel - generator yielding (name, seq, alignment_list) tuples.

    The sam file may contain multiple alignments per read; they are bundled
    with HTSeq.bundle_multiple_alignments, so each yielded alignment_list
    holds one bundle.  Read names are compared on their first
    whitespace-delimited token only, and that shortened name is what gets
    yielded.

    Arguments:
     - fastx_infile: fasta/fastq input, passed to
       basic_seq_utilities.name_seq_generator_from_fasta_fastq.
     - sam_infile: SAM input, passed to HTSeq.SAM_Reader.

    Raises DeepseqError if one file runs out of records before the other,
    or if the read names of a fastx record and the matching alignment
    bundle differ.
    """
    fastx_generator = basic_seq_utilities.name_seq_generator_from_fasta_fastq(fastx_infile)
    sam_generator = iter(HTSeq.bundle_multiple_alignments(HTSeq.SAM_Reader(sam_infile)))
    if_finished_fastx, if_finished_sam = False, False
    while True:
        # Always advance BOTH inputs before deciding anything, so that an
        # early end of one file is detected as an inconsistency.
        # The builtin next() works on both Python 2.6+ and Python 3
        # (the .next() method only exists on Python 2 iterators).
        try:
            name, seq = next(fastx_generator)
        except StopIteration:
            if_finished_fastx = True
        try:
            alns = next(sam_generator)
        except StopIteration:
            if_finished_sam = True
        # if both finished, good, we're done
        if if_finished_fastx and if_finished_sam:
            # A plain return ends the generator; raising StopIteration here
            # would be converted into RuntimeError under PEP 479.
            return
        # if one file was finished but the other wasn't, error!
        if if_finished_fastx or if_finished_sam:
            raise DeepseqError("Parsing seq/aln files in parallel - inconsistent finished states! "
                              +"(If finished: %s %s, %s %s)"%(fastx_infile, if_finished_fastx,
                                                              sam_infile, if_finished_sam))
        # if all the files still contained data, yield it
        name = name.split()[0]
        name2 = alns[0].read.name.split()[0]
        if name2 != name:
            # Arguments are ordered as "<readname> in <file>" to match the message text.
            raise DeepseqError("Non-matching readnames between files! %s in %s, %s in %s"
                               %(name, fastx_infile, name2, sam_infile))
        yield (name, seq, alns)
def set_up_IO(fileIN, fileOUT, gff, downstream, upstream):
    '''Open every file needed for the alignment processing.

    fileIN   -- SAM alignment file, read through HTSeq.SAM_Reader and wrapped
                in HTSeq.bundle_multiple_alignments.
    fileOUT  -- path of the count table; opened for writing and given a
                header line here.
    gff      -- annotation file, read with HTSeq.GFF_Reader(end_included=True).
    downstream, upstream -- the header lists one column per integer
                position from -upstream up to (not including) downstream.

    Returns (bundled alignments, GFF reader, open output file).  The output
    file is returned still open; closing it is left to the caller.
    '''
    # Alignment input, grouped through HTSeq's bundling helper
    sam_reader = HTSeq.SAM_Reader(fileIN)
    bundled_reads = HTSeq.bundle_multiple_alignments(sam_reader)

    # Annotation input
    gff_reader = HTSeq.GFF_Reader(gff, end_included=True)

    # Output table: tab-separated header, 'name' followed by the positions
    out_handle = open(fileOUT, 'w')
    position_labels = [str(position) for position in range(-upstream, downstream)]
    header_line = 'name\t{coord}\n'.format(coord='\t'.join(position_labels))
    out_handle.write(header_line)

    return bundled_reads, gff_reader, out_handle
def set_up_IO(fileIN, fileOUT, gff, downstream, upstream):
    '''Prepare the three I/O objects used by the alignment processing.

    The SAM file `fileIN` is opened with HTSeq.SAM_Reader and passed through
    HTSeq.bundle_multiple_alignments; the annotation `gff` is opened with
    HTSeq.GFF_Reader (end_included=True); `fileOUT` is opened for writing
    and receives a tab-separated header made of "name" followed by every
    integer in range(-upstream, downstream).

    Returns the tuple (alignment bundles, annotation reader, output handle).
    The output handle is not closed here.
    '''
    ## Alignment stream
    alignment_bundles = HTSeq.bundle_multiple_alignments(HTSeq.SAM_Reader(fileIN))

    ## Annotation stream
    gff_features = HTSeq.GFF_Reader(gff, end_included=True)

    ## Count table, with its header written immediately
    table_handle = open(fileOUT, 'w')
    column_names = '\t'.join(map(str, range(-upstream, downstream)))
    table_handle.write('name\t{coord}\n'.format(coord=column_names))

    return alignment_bundles, gff_features, table_handle
def test_sam_parser_comparison(self):
    """Check our SAM bundling against HTSeq's on the Lib304 test file.

    Bundles are compared pairwise: same number of alignments per bundle,
    then chrom, start, end, name, 5' nucleotide, strand and sequence for
    each alignment.
    """
    sam_path = f"{resources}/Lib304_test.sam"
    our_bundles = SAM_reader().bundle_multi_alignments(sam_path)
    htseq_bundles = HTSeq.bundle_multiple_alignments(HTSeq.BAM_Reader(sam_path))

    for our_bundle, their_bundle in zip(our_bundles, htseq_bundles):
        self.assertEqual(len(our_bundle), len(their_bundle))

        for our, their in zip(our_bundle, their_bundle):
            interval = their.iv
            htseq_seq = their.read.seq

            self.assertEqual(our['chrom'], interval.chrom)
            self.assertEqual(our['start'], interval.start)
            self.assertEqual(our['end'], interval.end)
            self.assertEqual(our['name'], their.read.name)
            # The first element of HTSeq's sequence is passed through chr()
            # before comparing (the original comment points to a note
            # earlier in the file for the reason).
            self.assertEqual(our['nt5'], chr(htseq_seq[0]))
            self.assertEqual(our['strand'], interval.strand)

            # On the minus strand our sequence is reversed and translated
            # with helpers.complement before the comparison (same note).
            if our['strand'] == '-':
                comparable_seq = our['seq'][::-1].translate(helpers.complement)
            else:
                comparable_seq = our['seq']
            self.assertEqual(comparable_seq, htseq_seq)
# Sort your SAM file by read ID, so that multiple mappings of a read are on
# adjacent lines, then use this script to keep only the best one.
# Written by Simon Anders

import sys, re
import HTSeq


def _unique_best_alignment(bundle):
    """Return the alignment with the strictly highest aQual, or None.

    None is returned when the bundle is empty or when two or more
    alignments share the highest aQual (ambiguous reads are skipped).
    """
    best = None
    tied = False
    for almt in bundle:
        if best is None or almt.aQual > best.aQual:
            # New strict maximum: any earlier tie was at a lower score.
            best = almt
            tied = False
        elif almt.aQual == best.aQual:
            # Compare the scores (not score vs. alignment object), and keep
            # `best` so later, worse alignments cannot replace it.
            tied = True
    if tied:
        return None
    return best


insam = HTSeq.SAM_Reader(sys.stdin)

# Go through all reads, with their alignments bundled up:
for bundle in HTSeq.bundle_multiple_alignments(insam):
    bestAlmt = _unique_best_alignment(bundle)
    if bestAlmt is not None:
        # Change the NH field to 1 and print the line.
        # print(...) with one argument behaves the same on Python 2 and 3.
        print(re.sub(r"NH:i:\d+", "NH:i:1", bestAlmt.original_sam_line))

# Call this script with the command:
#   sort samfile.sam | python chooseBest.py > filtered.sam
# Sort your SAM file by read ID, so that multiple mappings of a read are on
# adjacent lines, then run this script to filter out all but the best one.
# Written by Simon Anders

import sys, re
import HTSeq

insam = HTSeq.SAM_Reader( sys.stdin )

# Go through all reads, with their alignments bundled up:
for bundle in HTSeq.bundle_multiple_alignments( insam ):
    bestAlmt = None
    # True while more than one alignment shares the current best aQual
    bestIsTied = False
    # Go through all alignments of a given read, looking
    # for the one with the best alignment quality (aQual)
    for almt in bundle:
        if bestAlmt is None or almt.aQual > bestAlmt.aQual:
            # Strictly better than everything seen so far: this also
            # clears a tie that happened at a lower score.
            bestAlmt = almt
            bestIsTied = False
        elif almt.aQual == bestAlmt.aQual:
            # More than one best alignment. Only flag it here: resetting
            # bestAlmt to None would let a later, worse alignment win.
            bestIsTied = True
    # If there is more than one best alignment, better skip the read
    if bestAlmt is not None and not bestIsTied:
        # Change the NH field to 1 and print the line.
        # print(...) with one argument behaves the same on Python 2 and 3.
        print( re.sub( r"NH:i:\d+", "NH:i:1", bestAlmt.original_sam_line ) )

# Call this script with the command:
#   sort samfile.sam | python chooseBest.py > filtered.sam