def get_cds(self, seq_dict):
    """
    Build the coding sequence (CDS) of this transcript as a string.

    Exon slices are taken from seq_dict[self.chromosome], trimmed to the
    thick_start/thick_stop window and concatenated in exon order. For a
    transcript on the '-' strand the joined sequence is reverse
    complemented, so the result always reads 5'-3'. A transcript whose
    thick_start and thick_stop are both 0 is treated as non-coding and
    yields an empty string.
    """
    chrom_seq = seq_dict[self.chromosome]
    assert self.stop <= len(chrom_seq) + 1
    cds_start, cds_stop = self.thick_start, self.thick_stop
    # thick_start == thick_stop == 0 flags a non-coding gene
    if cds_start == cds_stop == 0:
        return ''
    pieces = []
    for exon in self.exon_intervals:
        if cds_start < exon.start and exon.stop < cds_stop:
            # exon lies strictly inside the CDS: take all of it
            lo, hi = exon.start, exon.stop
        elif exon.start <= cds_start < exon.stop < cds_stop:
            # the CDS begins inside this exon
            lo, hi = cds_start, exon.stop
        elif exon.start <= cds_start and cds_stop <= exon.stop:
            # the whole CDS sits within this single exon
            lo, hi = cds_start, cds_stop
        elif cds_start < exon.start < cds_stop <= exon.stop:
            # the CDS ends inside this exon
            lo, hi = exon.start, cds_stop
        else:
            # none of the overlap cases matched: exon contributes nothing
            continue
        pieces.append(chrom_seq[lo:hi])
    joined = ''.join(pieces)
    if self.strand == '-':
        joined = reverse_complement(joined)
    return str(joined)
Example #2
0
 def get_cds(self, seq_dict):
     """
     Return this transcript's CDS as a string, read 5'-3'.

     Each exon is clipped to the thick_start/thick_stop window using the
     chromosome sequence found in seq_dict, the clipped pieces are joined
     in exon order, and the result is reverse complemented for '-' strand
     transcripts. When thick_start and thick_stop are both 0 the transcript
     is considered non-coding and '' is returned.
     """
     chrom_seq = seq_dict[self.chromosome]
     assert self.stop <= len(chrom_seq) + 1
     coding_from, coding_to = self.thick_start, self.thick_stop
     # both thick coordinates at 0 means there is no CDS at all
     if coding_from == coding_to == 0:
         return ''
     fragments = []
     for exon in self.exon_intervals:
         if coding_from < exon.start and exon.stop < coding_to:
             # exon is squarely inside the CDS
             begin, end = exon.start, exon.stop
         elif exon.start <= coding_from < exon.stop < coding_to:
             # thick_start falls in this exon: CDS starts here
             begin, end = coding_from, exon.stop
         elif exon.start <= coding_from and coding_to <= exon.stop:
             # thick_start and thick_stop both fall in this exon
             begin, end = coding_from, coding_to
         elif coding_from < exon.start < coding_to <= exon.stop:
             # thick_stop falls in this exon: CDS ends here
             begin, end = exon.start, coding_to
         else:
             # no overlap case matched, nothing to add for this exon
             continue
         fragments.append(chrom_seq[begin:end])
     cds = ''.join(fragments)
     if self.strand == '-':
         cds = reverse_complement(cds)
     return str(cds)
 def get_sequence(self, seq_dict, stranded=True):
     """
     Returns the sequence for this ChromosomeInterval. If stranded is True, reverse complements as necessary.
     :param seq_dict: Dictionary-like object with DNA sequences.
     :param stranded: Should we reverse complement negative strand sequences?
     :return: A sequence string, or None if a stranded sequence was requested
              and self.strand is neither '+' nor '-'.
     """
     # Strands are compared by value. Identity (`is`) against a string literal
     # only succeeds when the interpreter happens to share the string object,
     # so it can silently fail for equal strings built at runtime.
     if stranded is False or self.strand == '+':
         return seq_dict[self.chromosome][self.start: self.stop]
     elif self.strand == '-':
         return reverse_complement(seq_dict[self.chromosome][self.start: self.stop])
     # Unknown strand value: keep the historical behaviour of returning None.
     return None
 def get_sequence(self, seq_dict, stranded=True):
     """
     Returns the sequence for this ChromosomeInterval. If stranded is True, reverse complements as necessary.
     :param seq_dict: Dictionary-like object with DNA sequences.
     :param stranded: Should we reverse complement negative strand sequences?
     :return: A sequence string, or None if a stranded sequence was requested
              and self.strand is neither '+' nor '-'.
     """
     # Use equality, not identity, for the strand test: `is` against a string
     # literal depends on object sharing inside the interpreter and is not a
     # reliable way to compare string values.
     if stranded is False or self.strand == '+':
         return seq_dict[self.chromosome][self.start:self.stop]
     elif self.strand == '-':
         return reverse_complement(
             seq_dict[self.chromosome][self.start:self.stop])
     # Unknown strand value: keep the historical behaviour of returning None.
     return None
 def get_mrna(self, seq_dict):
     """
     Return the spliced mRNA sequence of this transcript as a string.

     The exon slices of seq_dict[self.chromosome] are concatenated in exon
     order; for any strand other than '+' the concatenation is reverse
     complemented so the result is in 5'-3' transcript orientation.
     """
     chrom_seq = seq_dict[self.chromosome]
     assert self.stop <= len(chrom_seq) + 1
     spliced = ''.join(
         [chrom_seq[exon.start:exon.stop] for exon in self.exon_intervals])
     if self.strand != '+':
         spliced = reverse_complement(spliced)
     return str(spliced)
Example #6
0
 def get_mrna(self, seq_dict):
     """
     Assemble the mRNA of this transcript from its exons.

     Looks up the chromosome sequence in seq_dict, joins the slice of every
     exon interval in order, and reverse complements the result unless the
     transcript is on the '+' strand. The returned string is therefore in
     5'-3' transcript orientation.
     """
     chromosome_sequence = seq_dict[self.chromosome]
     assert self.stop <= len(chromosome_sequence) + 1
     exon_sequences = [
         chromosome_sequence[interval.start:interval.stop]
         for interval in self.exon_intervals
     ]
     transcript = ''.join(exon_sequences)
     if self.strand == '+':
         return str(transcript)
     return str(reverse_complement(transcript))
Example #7
0
  def _write( self, pos, sequence, quality, variation_map, target_fh, variations=set(), offset=0, length=0, debug='', inversion=False ):
    '''
      Emit one four-line FASTQ-style record (header, sequence, '+', quality)
      to target_fh, named after pos, offset and length.

      When variations is non-empty, a matching line listing them is also
      written to variation_map. With inversion set, the sequence is reverse
      complemented and the quality string reversed before writing.
      debug is only appended to the variation_map line, for human readers.
    '''
    inversion_text = ''
    if inversion:
      inversion_text = 'inversion_'
      sequence = bio.reverse_complement( sequence )
      quality = quality[::-1]

    coords = ( pos, offset, length )
    target_fh.write( '@mgsa_seq_%i~%i~%i\n' % coords ) # sam is 0 indexed
    if len(variations) != 0:
      # same record name as the header line, followed by the variation list
      details = ( ','.join( variations ), inversion_text, debug )
      variation_map.write( '@mgsa_seq_%i~%i~%i: %s_%s%s\n' % ( coords + details ) ) # sam is 0 indexed

    # remaining three lines of the record: bases, separator, qualities
    for chunk in ( sequence, '\n+\n', quality, '\n' ):
      target_fh.write( chunk )
import sys

from bio import reverse_complement

# Read the whole file named by the first command-line argument, strip the
# surrounding whitespace and print the reverse complement of what is left.
with open(sys.argv[1]) as handle:
    raw_sequence = handle.read().strip()
print(reverse_complement(raw_sequence))
Example #9
0
def main(args):
    used_only, args = grace.get_option_value(args, '--used-only',
                                             grace.as_bool, False)

    grace.expect_no_further_options(args)

    if len(args) != 1:
        sys.stderr.write(USAGE)
        return 1

    working_dir = args[0]

    print
    print 'Note: This is still under development'
    print '      Pairing information is not included'
    print '      Only the part of the read that was aligned is included'
    print

    if used_only:
        hit_filename = 'used_shrimp_hits.txt.gz'
        output_prefix = 'used_hits'
    else:
        hit_filename = 'shrimp_hits.txt.gz'
        output_prefix = 'hits'

    reference_filename = os.path.join(working_dir, 'reference.fa')
    references = dict(io.read_fasta(reference_filename))
    for name in references:
        references[name] = references[name].upper()

    bam_filename = safe_filename(working_dir, output_prefix + '_unsorted.bam')
    bam_sorted_prefix = safe_filename(working_dir, output_prefix)

    f = open(bam_filename, 'wb')
    sam_eater = run(['samtools', 'view', '-S', '-b', '-'],
                    stdin=subprocess.PIPE,
                    stdout=f.fileno())
    f.close()

    for name in references:
        print >> sam_eater.stdin, '@SQ\tSN:%s\tLN:%d' % (name,
                                                         len(references[name]))

    for i, (read, hits) in enumerate(
            shrimp.iter_read_hits(working_dir, hit_filename, qualities=True)):
        if (i % 10000) == 0:
            grace.status('Processing read %s' % grace.pretty_number(i))

        for line in hits:
            parts = line.rstrip('\n').split('\t')
            read_name = parts[0]
            ref_name = parts[1]
            ref_start = int(parts[3]) - 1
            ref_end = int(parts[4])
            read_start = int(parts[5]) - 1
            read_end = int(parts[6])
            read_length = int(parts[7])
            score = int(parts[8])
            forward = (parts[2] == '+')
            edit_string = parts[9]

            corresp_seq = references[ref_name][ref_start:ref_end]

            if not forward:
                corresp_seq = bio.reverse_complement(corresp_seq)

            hit_ref_ali, hit_read_ali = consensus.edit_string_to_alignment(
                edit_string, corresp_seq)

            if not forward:
                hit_ref_ali = bio.reverse_complement(hit_ref_ali)
                hit_read_ali = bio.reverse_complement(hit_read_ali)

            #Normalization -- move "-"s as far right as possible
            hit_read_ali = consensus.roll_alignment(hit_read_ali, hit_ref_ali)
            hit_ref_ali = consensus.roll_alignment(hit_ref_ali, hit_read_ali)

            if len(read) > 2:
                qual = read[2][read_start:read_end]
            else:
                qual = '*'

            if forward:
                forwardized_seq = read[1][read_start:read_end]
                forwardized_qual = qual
            else:
                forwardized_seq = bio.reverse_complement(
                    read[1][read_start:read_end])
                forwardized_qual = qual[::-1]

            #if forwardized_seq != hit_read_ali.replace('-',''):
            #    print line
            #    print read
            #    print corresp_seq
            assert forwardized_seq == hit_read_ali.replace(
                '-', ''), forwardized_seq + ' ' + hit_read_ali

            cigar_items = []
            for i in xrange(len(hit_ref_ali)):
                if hit_ref_ali[i] == '-':
                    cigar_items.append('I')
                elif hit_read_ali[i] == '-':
                    cigar_items.append('D')
                else:
                    cigar_items.append('M')

            cigar = ''.join(
                str(len(list(subiterator))) + key
                for key, subiterator in itertools.groupby(cigar_items))

            flags = 0
            if not forward: flags += 0x0010

            sam_line = '\t'.join([
                read_name,
                str(flags),
                ref_name,
                str(ref_start + 1),
                '255',
                cigar,
                '=',
                '0',
                '0',
                forwardized_seq,
                forwardized_qual,
            ])

            print >> sam_eater.stdin, sam_line

    sam_eater.stdin.close()
    assert sam_eater.wait() == 0, 'samtools failed'

    grace.status('')

    execute(['samtools', 'sort', bam_filename, bam_sorted_prefix])

    os.unlink(bam_filename)

    execute(['samtools', 'index', bam_sorted_prefix + '.bam'])

    print
    print working_dir + '/' + output_prefix + '.bam and index created'
    print
Example #10
0
def main(args):
    used_only, args = grace.get_option_value(args,'--used-only', grace.as_bool, False)
        
    grace.expect_no_further_options(args)

    if len(args) != 1:
        sys.stderr.write( USAGE )
        return 1
    
    working_dir = args[0]
    
    print
    print 'Note: This is still under development'
    print '      Pairing information is not included'
    print '      Only the part of the read that was aligned is included'
    print
    
    if used_only:
        hit_filename = 'used_shrimp_hits.txt.gz'
        output_prefix = 'used_hits'
    else:
        hit_filename = 'shrimp_hits.txt.gz'
        output_prefix = 'hits'

    reference_filename = os.path.join(working_dir,'reference.fa')
    references = dict( io.read_fasta(reference_filename) )
    for name in references:
        references[name] = references[name].upper()
    
    bam_filename = safe_filename(working_dir, output_prefix+'_unsorted.bam')
    bam_sorted_prefix = safe_filename(working_dir, output_prefix)

    f = open(bam_filename, 'wb')
    sam_eater = run(['samtools', 'view', '-S', '-b', '-'],
                    stdin=subprocess.PIPE,
                    stdout=f.fileno())
    f.close()
    
    
    for name in references:
        print >> sam_eater.stdin, '@SQ\tSN:%s\tLN:%d' % (name, len(references[name]))
        
    for i, (read, hits) in enumerate(shrimp.iter_read_hits(working_dir, hit_filename, qualities=True)):
        if (i % 10000) == 0: 
            grace.status('Processing read %s' % grace.pretty_number(i))
    
        for line in hits:
            parts = line.rstrip('\n').split('\t')
            read_name = parts[0]
            ref_name = parts[1]
            ref_start = int(parts[3])-1
            ref_end = int(parts[4])
            read_start = int(parts[5])-1
            read_end = int(parts[6])
            read_length = int(parts[7])
            score = int(parts[8])
            forward = (parts[2] == '+')
            edit_string = parts[9]

            corresp_seq = references[ref_name][ref_start:ref_end]
            
            if not forward:
                corresp_seq = bio.reverse_complement(corresp_seq)
            
            hit_ref_ali, hit_read_ali = consensus.edit_string_to_alignment(edit_string, corresp_seq)	    
            
            if not forward:
                hit_ref_ali = bio.reverse_complement(hit_ref_ali)
                hit_read_ali = bio.reverse_complement(hit_read_ali)
            
            #Normalization -- move "-"s as far right as possible
            hit_read_ali = consensus.roll_alignment(hit_read_ali, hit_ref_ali)
            hit_ref_ali = consensus.roll_alignment(hit_ref_ali, hit_read_ali)
            
            if len(read) > 2:
                qual = read[2][read_start:read_end]
            else:
                qual = '*'
            
            if forward:
                forwardized_seq = read[1][read_start:read_end]
                forwardized_qual = qual
            else:
                forwardized_seq = bio.reverse_complement(read[1][read_start:read_end])
                forwardized_qual = qual[::-1]
            
            #if forwardized_seq != hit_read_ali.replace('-',''):
            #    print line
            #    print read
            #    print corresp_seq
            assert forwardized_seq == hit_read_ali.replace('-',''), forwardized_seq + ' ' + hit_read_ali
            
            cigar_items = [ ]
            for i in xrange(len(hit_ref_ali)):
                if hit_ref_ali[i] == '-':
                    cigar_items.append('I')
                elif hit_read_ali[i] == '-':
                    cigar_items.append('D')
                else:
                    cigar_items.append('M')
            
            cigar = ''.join(
                str(len(list(subiterator))) + key
                for key, subiterator in itertools.groupby(cigar_items)
            )
            
            flags = 0
            if not forward: flags += 0x0010
            
            sam_line = '\t'.join([
                read_name,
                str(flags),
                ref_name,
                str(ref_start+1),
                '255',
                cigar,
                '=',
                '0',
                '0',
                forwardized_seq,
                forwardized_qual,
            ])
            
            print >> sam_eater.stdin, sam_line

    sam_eater.stdin.close()
    assert sam_eater.wait() == 0, 'samtools failed'
    
    grace.status('')
    
    execute([
        'samtools', 'sort', bam_filename, bam_sorted_prefix
    ])
    
    os.unlink(bam_filename)
    
    execute([
        'samtools', 'index', bam_sorted_prefix + '.bam'
    ])
    
    print
    print working_dir + '/' + output_prefix + '.bam and index created'
    print