def extractSeq(feature, blastDb, dx, dy, blockLength=5000000):
    """Extract the translated sequence of a feature and the DNA sequence of
    the surrounding genomic region.

    The flanks are strand-aware: on the '+' strand dx is subtracted from
    sStart and dy added to sEnd; for any other strand value dy is subtracted
    from sStart and dx added to sEnd.

    @param feature: Feature object. Mandatory attributes: accession, sStart,
        sEnd, strand.
    @param blastDb: Blast database.
    @param dx: Length of sequence to extract upstream.
    @param dy: Length of sequence to extract downstream.
    @param blockLength: Largest end coordinate accepted for the flanked
        region (default 5000000).
    @returns: tuple of fasta strings (DNA, protein).
    @raises ValueError: if the flanked region starts below 0 or ends beyond
        blockLength.
    """
    # Work out the flanked region first, so that an out-of-bounds request is
    # rejected before any sequence is fetched or translated.
    if feature.strand=='+':
        start = feature.sStart-dx
        end = feature.sEnd+dy
    else:
        start = feature.sStart-dy
        end = feature.sEnd+dx
    if start<0 or end>blockLength:
        # ValueError is a subclass of Exception, so callers that catch
        # Exception keep working.
        raise ValueError('Out of block bounds.')

    # Extract hmmer hit & translate
    header,seq = fasta.getSequence(blastDb, feature.accession,
        start=feature.sStart, end=feature.sEnd, strand=feature.strand)
    protein = '\n'.join(['>' + header, sequence.translate(seq)])

    # Extract surrounding DNA sequence (requested without a strand argument).
    header,seq = fasta.getSequence(blastDb, feature.accession, start=start, end=end)
    # The DNA header carries the feature's own coordinates, not the flanked ones.
    header = '%(accession)s:%(sStart)s-%(sEnd)s' % feature.__dict__
    dna = '\n'.join(['>' + header, seq])
    return dna, protein
# Rename, annotate and summarise each entry of `domains`.
# NOTE(review): the indentation here was reconstructed from a
# whitespace-collapsed source; confirm the nesting against the original file.
for i, domain in enumerate(domains):
    # Only indices 0-99 are processed.
    if i > 99:
        break
    # Sequential name built from the 1-based position: DEFB_01, DEFB_02, ...
    domain.domain = 'DEFB_%0.2i' % (i + 1)
    domain.toGenomic(relative=True)
    # Extra per-domain fields, initialised to 'N' / 'N' / 0.
    domain.addField('lowScoring', 'N')
    domain.addField('pseudogene', 'N')
    domain.addField('nCysteines', 0)
    # Remarks collected for this domain's line in the summary file.
    summary = []
    # Fetch the domain's sequence and translate it.
    h, s = fasta.getSequence(blastdb, domain.accession, start=domain.sStart, end=domain.sEnd, strand=domain.strand)
    pep = sequence.translate(s)
    # Every entry from index 60 onwards is flagged as low scoring.
    if i > 59:
        domain.lowScoring = 'Y'
    # A '*' in the translation marks the domain as a pseudogene.
    if '*' in pep:
        domain.pseudogene = 'Y'
        summary.append('Contains stops')
    # Count the 'C' residues of the translation.
    for aa in pep:
        if aa == 'C':
            domain.nCysteines += 1
    # Any count other than 6 is noted in the summary.
    if domain.nCysteines != 6:
        summary.append('Has %i cysteines' % domain.nCysteines)
    # One tab-separated line per domain: name, then the joined remarks.
    print >> summaryFile, '%s\t%s' % (domain.domain, '; '.join(summary))
    # Pseudogenes and cysteine counts outside 5-7 are singled out here; the
    # body of this condition lies beyond the visible fragment.
    if domain.pseudogene == 'Y' or domain.nCysteines < 5 or domain.nCysteines > 7:
i = 0 writer = fasta.MfaWriter('ORFs.fa') filename = sys.argv[1] header,dna = fasta.load(filename) header = header.strip() orfIter = sequence.extractOrfsIter(dna, minLen=minLen, pattern=pattern) for i,gStart,gEnd,orf in orfIter: h = '%s.%i.%i-%i Length=%i' % (header,i,gStart,gEnd,len(orf)) writer.write(h, orf) fasta.pretty(h, orf) if gStart<gEnd: s = dna[gStart-1:gEnd] print gStart, gEnd, len(s), len(s) % 3==0 print sequence.codons(s, remainder=True) print sequence.translate(s) else: gStart,gEnd = gEnd,gStart s = dna[gStart-1:gEnd] s = sequence.reverseComplement(s) print gStart, gEnd, len(s), len(s) % 3==0 print sequence.codons(s, remainder=True) print sequence.translate(s) print writer.close()
from aaiter import find_orfs from fastamasta import FastaReader from sequence import translate, rev_complement if __name__ == "__main__": reader = FastaReader("data/12.dat") seq = reader.readnext()[1] rcseq = rev_complement(seq) orfs = find_orfs(seq) rorfs = find_orfs(rcseq) candidates = set() for orf in orfs: candidates.add(translate(seq[orf[0]:orf[1]])) for rorf in rorfs: candidates.add(translate(rcseq[rorf[0]:rorf[1]])) for candidate in candidates: print candidate
# Load the domains listed in 'DEFB.txt', then rename, annotate and summarise
# each of them.
# NOTE(review): the indentation here was reconstructed from a
# whitespace-collapsed source; confirm the nesting against the original file.
domains = hmmer.loadDomains('DEFB.txt', seqType='BlockSixFrame')
# Column header row: the first domain's field names plus four extra columns.
print >> genomicFile, '\t'.join(domains[0].fields + ['strand', 'lowScoring', 'pseudogene', 'nCysteines'])
for i,domain in enumerate(domains):
    # Only indices 0-99 are processed.
    if i>99:
        break
    # Sequential name built from the 1-based position: DEFB_01, DEFB_02, ...
    domain.domain = 'DEFB_%0.2i' % (i+1)
    domain.toGenomic(relative=True)
    # Extra per-domain fields, initialised to 'N' / 'N' / 0.
    domain.addField('lowScoring', 'N')
    domain.addField('pseudogene', 'N')
    domain.addField('nCysteines', 0)
    # Remarks collected for this domain's line in the summary file.
    summary = []
    # Fetch the domain's sequence and translate it.
    h,s = fasta.getSequence(blastdb, domain.accession, start=domain.sStart, end=domain.sEnd, strand=domain.strand)
    pep = sequence.translate(s)
    # Every entry from index 60 onwards is flagged as low scoring.
    if i>59:
        domain.lowScoring = 'Y'
    # A '*' in the translation marks the domain as a pseudogene.
    if '*' in pep:
        domain.pseudogene = 'Y'
        summary.append('Contains stops')
    # Count the 'C' residues of the translation.
    for aa in pep:
        if aa=='C':
            domain.nCysteines += 1
    # Any count other than 6 is noted in the summary.
    if domain.nCysteines!=6:
        summary.append('Has %i cysteines' % domain.nCysteines)
    # One tab-separated line per domain: name, then the joined remarks.
    print >> summaryFile, '%s\t%s' % (domain.domain, '; '.join(summary))
    # Pseudogenes and cysteine counts outside 5-7 are singled out here; the
    # body of this condition lies beyond the visible fragment.
    if domain.pseudogene=='Y' or domain.nCysteines<5 or domain.nCysteines>7:
Date: Tue Aug 22 20:14:57 EST 2006 """ import os, sys import fasta, sequence header,seq = fasta.load('NKC.fa') orfIterator = fasta.load_iter('ORFs.fa') writer = fasta.MfaWriter('ORFs2.fa') for h,orf in orfIterator: chrom,block,orfId,limits = h.split()[0].split('.') start,end = limits.split('-') start = int(start) end = int(end) if start>end: strand = '-' start,end = end,start s = sequence.translate(sequence.reverseComplement(seq[start-1:end])) else: strand = '+' s = sequence.translate(seq[start-1:end]) if s!=orf: print h writer.write(h,s + '\n') writer.close()
i = 0 writer = fasta.MfaWriter('ORFs.fa') filename = sys.argv[1] header, dna = fasta.load(filename) header = header.strip() orfIter = sequence.extractOrfsIter(dna, minLen=minLen, pattern=pattern) for i, gStart, gEnd, orf in orfIter: h = '%s.%i.%i-%i Length=%i' % (header, i, gStart, gEnd, len(orf)) writer.write(h, orf) fasta.pretty(h, orf) if gStart < gEnd: s = dna[gStart - 1:gEnd] print gStart, gEnd, len(s), len(s) % 3 == 0 print sequence.codons(s, remainder=True) print sequence.translate(s) else: gStart, gEnd = gEnd, gStart s = dna[gStart - 1:gEnd] s = sequence.reverseComplement(s) print gStart, gEnd, len(s), len(s) % 3 == 0 print sequence.codons(s, remainder=True) print sequence.translate(s) print writer.close()
# Map domain coordinates loaded through hmmer4 back onto the sequence in
# 'seq/HLA-A.fa' and print the matching DNA slice, its codons and its
# translation for inspection.
# NOTE(review): the indentation here was reconstructed from a
# whitespace-collapsed source; confirm the nesting against the original file.
h,s = fasta.load('seq/HLA-A.fa')
# Sequence length, passed to the six-frame coordinate conversion below.
L = len(s)
if False:
    # Disabled branch (the condition is the literal False): domains from
    # 'hmmer/6frames.txt', whose accession is parsed as a six-frame header.
    domains = hmmer4.load_domains('hmmer/6frames.txt')
    for d in domains:
        p = hmmer4.parseSixFrameHeader(d.accession)
        print d
        print p.name, p.frame
        gStart,gEnd,strand = hmmer4.convert6FrameToGenomic(d.sStart,d.sEnd,p.frame,L)
        print gStart,gEnd,strand
        if strand=='+':
            dna = s[gStart-1:gEnd]
            # Length, whether it is a multiple of three, codons, translation.
            print len(dna), len(dna) % 3==0
            print sequence.codons(dna, remainder=True)
            print sequence.translate(dna)
        else:
            # Other strand: swap the coordinates, then reverse-complement
            # the slice before printing the same checks.
            gStart,gEnd = gEnd,gStart
            dna = sequence.reverseComplement(s[gStart-1:gEnd])
            print len(dna), len(dna) % 3==0
            print sequence.codons(dna, remainder=True)
            print sequence.translate(dna)
        # Blank line between domains.
        print
else:
    # Active branch: domains from 'hmmer/ORFs.txt', whose accession is
    # parsed as an ORF header.
    domains = hmmer4.load_domains('hmmer/ORFs.txt')
    for d in domains:
        o = hmmer4.parseOrfHeader(d.accession)
        print d
        print o
        gStart,gEnd = hmmer4.convertOrfToGenomic(d.sStart,d.sEnd,o.strand,o.start)
        # The body of this condition lies beyond the visible fragment.
        if o.strand=='+':