コード例 #1
0
def parse_fastx_sam_parallel(fastx_infile, sam_infile):
    """ Parse fastx and resulting sam file in parallel - generator yielding (name, seq, alignment_list) tuples.

    The sam file may contain multiple alignments per read; they are bundled per read,
    so each yielded alignment_list holds every alignment of one read.

    The readname from each file is cut at the first whitespace before comparison,
    and the cut fastx name is the one yielded.

    Raises DeepseqError if one file runs out of reads before the other,
    or if the readnames at the same position in the two files don't match.
    """
    fastx_generator = basic_seq_utilities.name_seq_generator_from_fasta_fastq(fastx_infile)
    sam_generator = iter(HTSeq.bundle_multiple_alignments(HTSeq.SAM_Reader(sam_infile)))
    if_finished_fastx, if_finished_sam = False, False
    while True:
        # advance BOTH generators before checking either flag, so that a length mismatch is detected
        #  (the next() builtin works on both python2.6+ and python3, unlike the .next() method)
        try:
            name, seq = next(fastx_generator)
        except StopIteration:
            if_finished_fastx = True
        try:
            alns = next(sam_generator)
        except StopIteration:
            if_finished_sam = True
        # if both finished, good, we're done
        #  (plain return, not "raise StopIteration" - PEP 479 turns the latter into a RuntimeError inside generators)
        if if_finished_fastx and if_finished_sam:
            return
        # if one file was finished but the other wasn't, error!
        if if_finished_fastx or if_finished_sam:
            raise DeepseqError("Parsing seq/aln files in parallel - inconsistent finished states! "
                              +"(If finished: %s %s, %s %s)"%(fastx_infile, if_finished_fastx, sam_infile, if_finished_sam))
        # if all the files still contained data, check that the readnames agree and yield the data
        name = name.split()[0]
        name2 = alns[0].read.name.split()[0]
        if name2 != name:
            # argument order matches the "<readname> in <file>" wording of the message
            raise DeepseqError("Non-matching readnames between files! %s in %s, %s in %s"%(name, fastx_infile,
                                                                                           name2, sam_infile))
        yield (name, seq, alns)
コード例 #2
0
def set_up_IO(fileIN,fileOUT,gff,downstream,upstream):
    '''Open every file needed for the alignment processing.

    fileIN is read with HTSeq.SAM_Reader and wrapped in
    HTSeq.bundle_multiple_alignments; gff is read with HTSeq.GFF_Reader
    (end_included=True); fileOUT is opened for writing and receives a
    tab-separated header line: "name" followed by every integer position
    from -upstream up to downstream - 1.

    Returns the tuple (bundled alignments, GFF reader, output file handle).
    The output handle is left open - closing it is up to the caller.
    '''
    ## Alignments, bundled
    bundled_alignments = HTSeq.bundle_multiple_alignments(HTSeq.SAM_Reader(fileIN))

    ## Annotation
    gff_reader = HTSeq.GFF_Reader(gff, end_included=True)

    ## Output table, starting with its header line
    out_handle = open(fileOUT, 'w')
    position_labels = [str(position) for position in range(-upstream, downstream)]
    out_handle.write('name\t' + '\t'.join(position_labels) + '\n')

    return bundled_alignments, gff_reader, out_handle
コード例 #3
0
def set_up_IO(fileIN, fileOUT, gff, downstream, upstream):
    '''Prepare the three I/O objects used by the alignment processing.

    Returns (alignIN, annotation, countTable) where
     - alignIN is HTSeq.bundle_multiple_alignments applied to a
       HTSeq.SAM_Reader over fileIN,
     - annotation is a HTSeq.GFF_Reader over gff with end_included=True,
     - countTable is fileOUT opened in 'w' mode with the header line
       already written ("name", then the positions -upstream .. downstream-1,
       all tab-separated). The caller is responsible for closing it.
    '''
    ## Alignment input
    sam_reader = HTSeq.SAM_Reader(fileIN)
    alignment_bundles = HTSeq.bundle_multiple_alignments(sam_reader)

    ## GFF annotation input
    gff_features = HTSeq.GFF_Reader(gff, end_included=True)

    ## Output table: open it and write the header
    table_handle = open(fileOUT, 'w')
    header_positions = '\t'.join(map(str, range(-upstream, downstream)))
    table_handle.write('name\t%s\n' % header_positions)

    return alignment_bundles, gff_features, table_handle
コード例 #4
0
    def test_sam_parser_comparison(self):
        """Our SAM parser must yield the same bundles and fields as HTSeq.

        Both parsers read the same file; the bundles are compared pairwise,
        and so are the alignments inside each bundle. The test fails if
        either parser yields more bundles than the other.
        """
        from itertools import zip_longest

        sam_path = f"{resources}/Lib304_test.sam"
        ours = SAM_reader().bundle_multi_alignments(sam_path)
        theirs = HTSeq.bundle_multiple_alignments(HTSeq.BAM_Reader(sam_path))

        # Plain zip() stops at the shorter input, which would hide bundles
        # that only one parser produced; pad with a sentinel and check for it.
        exhausted = object()
        for our_bundle, their_bundle in zip_longest(ours, theirs, fillvalue=exhausted):
            self.assertIsNot(our_bundle, exhausted,
                             "HTSeq yielded more bundles than our parser")
            self.assertIsNot(their_bundle, exhausted,
                             "Our parser yielded more bundles than HTSeq")

            # Equal lengths make the inner zip() safe from truncation
            self.assertEqual(len(our_bundle), len(their_bundle))
            for our, their in zip(our_bundle, their_bundle):
                self.assertEqual(our['chrom'], their.iv.chrom)
                self.assertEqual(our['start'], their.iv.start)
                self.assertEqual(our['end'], their.iv.end)
                self.assertEqual(our['name'], their.read.name)
                # Indexing their.read.seq gives an int here, hence chr()
                self.assertEqual(our['nt5'], chr(their.read.seq[0]))
                self.assertEqual(our['strand'], their.iv.strand)

                if our['strand'] == '-':
                    # Reverse-complement our sequence before comparing it
                    # with the sequence reported by HTSeq
                    our_seq = our['seq'][::-1].translate(helpers.complement)
                else:
                    our_seq = our['seq']
                self.assertEqual(our_seq, their.read.seq)
コード例 #5
0
# Sort your SAM file by read ID, so that multiple mappings of a read are on adjacent lines, then use this script to keep only the best one
#Written by Simon Anders
import sys, re
import HTSeq

# Read SAM records from standard input
insam = HTSeq.SAM_Reader(sys.stdin)

# Go through all reads, with their alignments bundled up:
for bundle in HTSeq.bundle_multiple_alignments(insam):
    bestAlmt = None
    bestIsTied = False
    # Go through all alignments of a given read, looking
    # for the one with the best alignment score
    for almt in bundle:
        if bestAlmt is None or almt.aQual > bestAlmt.aQual:
            # New strictly-best alignment: any earlier tie no longer matters
            bestAlmt = almt
            bestIsTied = False
        elif almt.aQual == bestAlmt.aQual:
            # More than one alignment shares the best score so far.
            # Keep bestAlmt (so that a later, worse alignment cannot
            # take its place) and just remember the tie.
            bestIsTied = True
    # If there is more than one best alignment, better skip the read
    if bestAlmt is not None and not bestIsTied:
        # Change the NH field to 1 and print the line
        print(re.sub(r"NH:i:\d+", "NH:i:1", bestAlmt.original_sam_line))

#call this script with the command sort samfile.sam | python chooseBest.py > filtered.sam
コード例 #6
0
# Sort your SAM file by read ID, so that multiple mappings of a read are on adjacent lines, then use this script to keep only the best one
#Written by Simon Anders
import sys, re
import HTSeq

# Read SAM records from standard input
insam = HTSeq.SAM_Reader( sys.stdin )

# Go through all reads, with their alignments bundled up:
for bundle in HTSeq.bundle_multiple_alignments( insam ):
   bestAlmt = None
   tieForBest = False
   # Go through all alignments of a given read, looking
   # for the one with the best alignment score
   for almt in bundle:
      if bestAlmt is None:
         bestAlmt = almt
      elif almt.aQual > bestAlmt.aQual:
         # Strictly better than everything seen so far,
         # so an earlier tie is no longer relevant
         bestAlmt = almt
         tieForBest = False
      elif almt.aQual == bestAlmt.aQual:
         # Same score as the current best: remember the tie, but keep
         # bestAlmt so that a later alignment with a lower score
         # is not mistaken for the best one
         tieForBest = True
   # If there are more than one best alignment,
   # better skip the read
   if bestAlmt is not None and not tieForBest:
      # Change the NH field to 1 and print the line
      print( re.sub( r"NH:i:\d+", "NH:i:1", bestAlmt.original_sam_line ) )
      
#call this script with the command sort samfile.sam | python chooseBest.py > filtered.sam