def get_cds(self, seq_dict):
    """
    Build the coding sequence (CDS) of this transcript as a plain string.

    Every exon is clipped to the [thick_start, thick_stop) window and the
    surviving pieces are concatenated in exon order. The result is reverse
    complemented for '-' strand transcripts so it always reads 5'-3'.

    :param seq_dict: Dictionary-like object mapping chromosome name to
        sequence.
    :return: The CDS as a string; '' when thick_start and thick_stop are
        both 0.
    """
    chrom_seq = seq_dict[self.chromosome]
    assert self.stop <= len(chrom_seq) + 1
    # thick_start == thick_stop == 0 marks a non-coding transcript
    if self.thick_start == 0 and self.thick_stop == 0:
        return ''
    cds_start, cds_stop = self.thick_start, self.thick_stop
    pieces = []
    for exon in self.exon_intervals:
        # intersect the exon with the coding window
        lo = max(exon.start, cds_start)
        hi = min(exon.stop, cds_stop)
        if lo < hi:
            pieces.append(chrom_seq[lo:hi])
    joined = ''.join(pieces)
    if self.strand == '-':
        joined = reverse_complement(joined)
    return str(joined)
def get_cds(self, seq_dict):
    """
    Return the transcript's coding sequence as a string, in 5'-3' orientation.

    The CDS is the part of each exon that lies between thick_start and
    thick_stop; the per-exon parts are joined in exon order and reverse
    complemented when the transcript is on the '-' strand.

    :param seq_dict: Dictionary-like object with DNA sequences keyed by
        chromosome name.
    :return: The CDS string, or '' if thick_start and thick_stop are both 0.
    """
    sequence = seq_dict[self.chromosome]
    assert self.stop <= len(sequence) + 1
    # both thick coordinates at 0 means there is no coding region at all
    if self.thick_start == 0 and self.thick_stop == 0:
        return ''
    # clip each exon to the coding window; empty windows contribute nothing
    windows = [(max(e.start, self.thick_start), min(e.stop, self.thick_stop))
               for e in self.exon_intervals]
    coding = ''.join([sequence[begin:end] for begin, end in windows
                      if begin < end])
    return str(reverse_complement(coding) if self.strand == '-' else coding)
def get_sequence(self, seq_dict, stranded=True):
    """
    Returns the sequence for this ChromosomeInterval. If stranded is True,
    reverse complements as necessary.

    :param seq_dict: Dictionary-like object with DNA sequences.
    :param stranded: Should we reverse complement negative strand sequences?
    :return: A sequence string, or None when a stranded sequence is requested
        and the strand is neither '+' nor '-'.
    """
    # Strands are compared by value: identity ('is') on strings only works by
    # accident of interning and fails for equal strings that are distinct
    # objects.
    if stranded is False or self.strand == '+':
        return seq_dict[self.chromosome][self.start:self.stop]
    if self.strand == '-':
        return reverse_complement(
            seq_dict[self.chromosome][self.start:self.stop])
    return None
def get_sequence(self, seq_dict, stranded=True):
    """
    Returns the sequence for this ChromosomeInterval. If stranded is True,
    reverse complements as necessary.

    :param seq_dict: Dictionary-like object with DNA sequences.
    :param stranded: Should we reverse complement negative strand sequences?
    :return: A sequence string; None if stranded output was requested but the
        strand is neither '+' nor '-'.
    """
    # Equality, not identity: 'is' against a string literal depends on
    # interpreter interning and breaks for equal-but-distinct string objects.
    wants_forward = stranded is False or self.strand == '+'
    if not wants_forward and self.strand != '-':
        return None
    interval_seq = seq_dict[self.chromosome][self.start:self.stop]
    if wants_forward:
        return interval_seq
    return reverse_complement(interval_seq)
def get_mrna(self, seq_dict):
    """
    Return the spliced mRNA sequence of this transcript as a string.

    The exon slices of the chromosome sequence are concatenated in exon
    order; anything other than a '+' strand is reverse complemented so the
    result is in 5'-3' transcript orientation.

    :param seq_dict: Dictionary-like object with DNA sequences keyed by
        chromosome name.
    :return: The mRNA sequence string.
    """
    chrom_seq = seq_dict[self.chromosome]
    assert self.stop <= len(chrom_seq) + 1
    spliced = ''.join([chrom_seq[exon.start:exon.stop]
                       for exon in self.exon_intervals])
    if self.strand != '+':
        spliced = reverse_complement(spliced)
    return str(spliced)
def _write(self, pos, sequence, quality, variation_map, target_fh,
           variations=set(), offset=0, length=0, debug='', inversion=False):
    '''
      Emit one four-line read record to target_fh and, when variations are
      given, one annotation line to variation_map.

      pos, offset and length form the read identifier.
      With inversion set, the sequence is reverse complemented and the
      quality string reversed before writing.
      debug is only appended to the variation_map line, for human readers.
    '''
    if inversion:
        sequence = bio.reverse_complement(sequence)
        quality = quality[::-1]
    marker = 'inversion_' if inversion else ''

    # record header; sam is 0 indexed
    target_fh.write('@mgsa_seq_%i~%i~%i\n' % (pos, offset, length))
    if len(variations) > 0:
        joined_variations = ','.join(variations)
        variation_map.write('@mgsa_seq_%i~%i~%i: %s_%s%s\n' % (
            pos, offset, length, joined_variations, marker, debug))
    # sequence, separator, quality, terminating newline
    for chunk in (sequence, '\n+\n', quality, '\n'):
        target_fh.write(chunk)
import sys

from bio import reverse_complement

# Read the whole file named by the first command-line argument.
with open(sys.argv[1]) as handle:
    contents = handle.read()

# Strip surrounding whitespace, then print the reverse complement.
print(reverse_complement(contents.strip()))
def main(args):
    """
    Convert the SHRiMP hits of a working directory into a sorted, indexed BAM.

    Reads reference.fa and the (optionally "used") hit file from the working
    directory, streams one SAM record per hit into `samtools view`, then runs
    `samtools sort` and `samtools index` on the result.

    :param args: Argument list: an optional --used-only option followed by
        exactly one working directory.
    :return: 1 after writing USAGE to stderr when the argument count is
        wrong; otherwise None.
    :raises AssertionError: if a read does not match its alignment or
        samtools exits with a non-zero status (when assertions are enabled).
    """
    used_only, args = grace.get_option_value(args, '--used-only',
                                             grace.as_bool, False)
    grace.expect_no_further_options(args)

    if len(args) != 1:
        sys.stderr.write(USAGE)
        return 1

    working_dir = args[0]

    print('')
    print('Note: This is still under development')
    print(' Pairing information is not included')
    print(' Only the part of the read that was aligned is included')
    print('')

    if used_only:
        hit_filename = 'used_shrimp_hits.txt.gz'
        output_prefix = 'used_hits'
    else:
        hit_filename = 'shrimp_hits.txt.gz'
        output_prefix = 'hits'

    reference_filename = os.path.join(working_dir, 'reference.fa')
    references = dict(io.read_fasta(reference_filename))
    for name in references:
        references[name] = references[name].upper()

    bam_filename = safe_filename(working_dir, output_prefix + '_unsorted.bam')
    bam_sorted_prefix = safe_filename(working_dir, output_prefix)

    # Only the descriptor is handed to samtools, so our own handle is closed
    # as soon as the child has been started -- also if starting it fails.
    with open(bam_filename, 'wb') as f:
        sam_eater = run(['samtools', 'view', '-S', '-b', '-'],
                        stdin=subprocess.PIPE, stdout=f.fileno())

    # SAM header: one @SQ line per reference sequence
    for name in references:
        sam_eater.stdin.write(
            '@SQ\tSN:%s\tLN:%d' % (name, len(references[name])) + '\n')

    for read_index, (read, hits) in enumerate(
            shrimp.iter_read_hits(working_dir, hit_filename, qualities=True)):
        if (read_index % 10000) == 0:
            grace.status('Processing read %s'
                         % grace.pretty_number(read_index))

        for line in hits:
            parts = line.rstrip('\n').split('\t')
            read_name = parts[0]
            ref_name = parts[1]
            # start columns are converted from 1-based to 0-based for slicing
            ref_start = int(parts[3]) - 1
            ref_end = int(parts[4])
            read_start = int(parts[5]) - 1
            read_end = int(parts[6])
            # read length and score are not used below; int() still rejects
            # rows where these columns are malformed
            read_length = int(parts[7])
            score = int(parts[8])
            forward = (parts[2] == '+')
            edit_string = parts[9]

            corresp_seq = references[ref_name][ref_start:ref_end]
            if not forward:
                corresp_seq = bio.reverse_complement(corresp_seq)

            hit_ref_ali, hit_read_ali = consensus.edit_string_to_alignment(
                edit_string, corresp_seq)
            if not forward:
                hit_ref_ali = bio.reverse_complement(hit_ref_ali)
                hit_read_ali = bio.reverse_complement(hit_read_ali)

            # Normalization -- move "-"s as far right as possible
            hit_read_ali = consensus.roll_alignment(hit_read_ali, hit_ref_ali)
            hit_ref_ali = consensus.roll_alignment(hit_ref_ali, hit_read_ali)

            # '*' stands in for the quality string when the read has none
            if len(read) > 2:
                qual = read[2][read_start:read_end]
            else:
                qual = '*'

            if forward:
                forwardized_seq = read[1][read_start:read_end]
                forwardized_qual = qual
            else:
                forwardized_seq = bio.reverse_complement(
                    read[1][read_start:read_end])
                forwardized_qual = qual[::-1]

            # sanity check: the ungapped read alignment must equal the read
            assert forwardized_seq == hit_read_ali.replace('-', ''), \
                forwardized_seq + ' ' + hit_read_ali

            cigar = _cigar_from_alignment(hit_ref_ali, hit_read_ali)

            # 0x0010 is set for hits that are not on the forward strand
            flags = 0
            if not forward:
                flags += 0x0010

            sam_line = '\t'.join([
                read_name,
                str(flags),
                ref_name,
                str(ref_start + 1),
                '255',
                cigar,
                '=',
                '0',
                '0',
                forwardized_seq,
                forwardized_qual,
            ])
            sam_eater.stdin.write(sam_line + '\n')

    sam_eater.stdin.close()
    # wait() is called outside the assert so that samtools is always waited
    # for before sorting, even when assertions are disabled with -O.
    exit_status = sam_eater.wait()
    assert exit_status == 0, 'samtools failed'

    grace.status('')

    execute(['samtools', 'sort', bam_filename, bam_sorted_prefix])
    os.unlink(bam_filename)
    execute(['samtools', 'index', bam_sorted_prefix + '.bam'])

    print('')
    print(working_dir + '/' + output_prefix + '.bam and index created')
    print('')


def _cigar_from_alignment(ref_ali, read_ali):
    """Run-length encode a gapped alignment pair into a CIGAR string."""
    ops = []
    for column, ref_char in enumerate(ref_ali):
        if ref_char == '-':
            ops.append('I')
        elif read_ali[column] == '-':
            ops.append('D')
        else:
            ops.append('M')
    return ''.join(
        str(len(list(group))) + op
        for op, group in itertools.groupby(ops))
def main(args):
    """
    Build a sorted, indexed BAM file from the SHRiMP hits in a working
    directory.

    The reference sequences come from reference.fa in the working directory.
    Each hit is turned into one SAM line that is piped into `samtools view`;
    the unsorted BAM is then sorted, removed, and the sorted BAM indexed.

    :param args: Argument list: an optional --used-only option followed by
        exactly one working directory.
    :return: 1 after writing USAGE to stderr when the argument count is
        wrong; otherwise None.
    :raises AssertionError: if a read does not match its alignment or
        samtools exits with a non-zero status (when assertions are enabled).
    """
    used_only, args = grace.get_option_value(args, '--used-only',
                                             grace.as_bool, False)
    grace.expect_no_further_options(args)

    if len(args) != 1:
        sys.stderr.write(USAGE)
        return 1

    working_dir = args[0]

    print('')
    print('Note: This is still under development')
    print(' Pairing information is not included')
    print(' Only the part of the read that was aligned is included')
    print('')

    if used_only:
        hit_filename = 'used_shrimp_hits.txt.gz'
        output_prefix = 'used_hits'
    else:
        hit_filename = 'shrimp_hits.txt.gz'
        output_prefix = 'hits'

    reference_filename = os.path.join(working_dir, 'reference.fa')
    references = dict(io.read_fasta(reference_filename))
    for name in references:
        references[name] = references[name].upper()

    bam_filename = safe_filename(working_dir, output_prefix + '_unsorted.bam')
    bam_sorted_prefix = safe_filename(working_dir, output_prefix)

    # samtools receives the raw descriptor; the with-block closes our handle
    # once the child is started, and also if starting it raises.
    with open(bam_filename, 'wb') as f:
        sam_eater = run(['samtools', 'view', '-S', '-b', '-'],
                        stdin=subprocess.PIPE, stdout=f.fileno())

    sam_out = sam_eater.stdin

    # SAM header: one @SQ line per reference sequence
    for name in references:
        sam_out.write(
            '@SQ\tSN:%s\tLN:%d' % (name, len(references[name])) + '\n')

    all_hits = shrimp.iter_read_hits(working_dir, hit_filename,
                                     qualities=True)
    for read_number, (read, hits) in enumerate(all_hits):
        if (read_number % 10000) == 0:
            grace.status('Processing read %s'
                         % grace.pretty_number(read_number))

        for line in hits:
            parts = line.rstrip('\n').split('\t')
            read_name = parts[0]
            ref_name = parts[1]
            # start columns are converted from 1-based to 0-based for slicing
            ref_start = int(parts[3]) - 1
            ref_end = int(parts[4])
            read_start = int(parts[5]) - 1
            read_end = int(parts[6])
            # parsed but unused below; int() still rejects malformed columns
            read_length = int(parts[7])
            score = int(parts[8])
            forward = (parts[2] == '+')
            edit_string = parts[9]

            corresp_seq = references[ref_name][ref_start:ref_end]
            if not forward:
                corresp_seq = bio.reverse_complement(corresp_seq)

            hit_ref_ali, hit_read_ali = consensus.edit_string_to_alignment(
                edit_string, corresp_seq)
            if not forward:
                hit_ref_ali = bio.reverse_complement(hit_ref_ali)
                hit_read_ali = bio.reverse_complement(hit_read_ali)

            # Normalization -- move "-"s as far right as possible
            hit_read_ali = consensus.roll_alignment(hit_read_ali, hit_ref_ali)
            hit_ref_ali = consensus.roll_alignment(hit_ref_ali, hit_read_ali)

            # '*' stands in for the quality string when the read has none
            if len(read) > 2:
                qual = read[2][read_start:read_end]
            else:
                qual = '*'

            if forward:
                forwardized_seq = read[1][read_start:read_end]
                forwardized_qual = qual
            else:
                forwardized_seq = bio.reverse_complement(
                    read[1][read_start:read_end])
                forwardized_qual = qual[::-1]

            # sanity check: the ungapped read alignment must equal the read
            assert forwardized_seq == hit_read_ali.replace('-', ''), \
                forwardized_seq + ' ' + hit_read_ali

            # One operation per alignment column: gap in the reference is an
            # insertion, gap in the read is a deletion, anything else a match.
            # A dedicated index name keeps the read counter above untouched.
            cigar_items = []
            for column, ref_char in enumerate(hit_ref_ali):
                if ref_char == '-':
                    cigar_items.append('I')
                elif hit_read_ali[column] == '-':
                    cigar_items.append('D')
                else:
                    cigar_items.append('M')
            # run-length encode the per-column operations
            cigar = ''.join(
                str(len(list(subiterator))) + key
                for key, subiterator in itertools.groupby(cigar_items))

            # 0x0010 is set for hits that are not on the forward strand
            flags = 0
            if not forward:
                flags += 0x0010

            sam_line = '\t'.join([
                read_name,
                str(flags),
                ref_name,
                str(ref_start + 1),
                '255',
                cigar,
                '=',
                '0',
                '0',
                forwardized_seq,
                forwardized_qual,
            ])
            sam_out.write(sam_line + '\n')

    sam_out.close()
    # The wait must not live inside the assert: with -O the assert statement
    # disappears, and samtools sort would start on a possibly unfinished file.
    exit_status = sam_eater.wait()
    assert exit_status == 0, 'samtools failed'

    grace.status('')

    execute(['samtools', 'sort', bam_filename, bam_sorted_prefix])
    os.unlink(bam_filename)
    execute(['samtools', 'index', bam_sorted_prefix + '.bam'])

    print('')
    print(working_dir + '/' + output_prefix + '.bam and index created')
    print('')