Esempio n. 1
0
def extractSeq(feature, blastDb, dx, dy, blockSize=5000000):
    """Extract the translated sequence of a feature and the DNA sequence
    of the surrounding genomic region.
    
    The flanking window depends on the strand: for a '+' feature dx is
    subtracted from sStart and dy added to sEnd; for any other strand the
    two lengths are swapped.
    
    @param feature: Feature object. Mandatory attributes: accession, sStart,
        sEnd, strand.
    @param blastDb: Blast database.
    @param dx: Length of sequence to extract upstream.
    @param dy: Length of sequence to extract downstream.
    @param blockSize: Largest end coordinate the window may reach
        (defaults to 5000000, the limit that used to be hard-coded).
    @returns: tuple of fasta strings (DNA, protein).
    @raises ValueError: if the window starts below 0 or ends past blockSize.
    """
    # Work out the window first so that an out-of-range request fails
    # before any sequence is fetched from the database.
    if feature.strand=='+':
        start = feature.sStart-dx
        end = feature.sEnd+dy
    else:
        start = feature.sStart-dy
        end = feature.sEnd+dx
    
    if start<0 or end>blockSize:
        # ValueError is a subclass of Exception, so existing handlers still match.
        raise ValueError('Out of block bounds.')
    
    # Extract hmmer hit & translate
    header,seq = fasta.getSequence(blastDb, feature.accession, 
        start=feature.sStart, end=feature.sEnd, strand=feature.strand)
    protein = '\n'.join(['>' + header, sequence.translate(seq)])
    
    # Extract surrounding DNA sequence. No strand is passed here, so
    # getSequence's default orientation applies; its header is not used.
    _,seq = fasta.getSequence(blastDb, feature.accession, 
        start=start, end=end)
    # The DNA record is labelled with the feature's own coordinates, not
    # with the (wider) window that was actually extracted.
    header = '%s:%s-%s' % (feature.accession, feature.sStart, feature.sEnd)
    dna = '\n'.join(['>' + header, seq])
    
    return dna, protein
Esempio n. 2
0
# Annotate the first 100 domains: give each a DEFB_nn name, call toGenomic
# on it, and record low-scoring / pseudogene / cysteine-count fields.
for i, domain in enumerate(domains):
    if i > 99: break  # stop after 100 domains
    domain.domain = 'DEFB_%0.2i' % (i + 1)  # 1-based index, zero-padded to at least 2 digits
    domain.toGenomic(relative=True)
    # The extra fields start at their "negative" defaults and are updated below.
    domain.addField('lowScoring', 'N')
    domain.addField('pseudogene', 'N')
    domain.addField('nCysteines', 0)
    summary = []  # human-readable notes about this domain

    # Fetch the domain's sequence from the blast database and translate it.
    h, s = fasta.getSequence(blastdb,
                             domain.accession,
                             start=domain.sStart,
                             end=domain.sEnd,
                             strand=domain.strand)
    pep = sequence.translate(s)

    # Domains from index 60 onwards are flagged as low scoring
    # (presumably `domains` is ordered by score -- TODO confirm).
    if i > 59: domain.lowScoring = 'Y'
    # A '*' in the translation is reported as a stop and marks a pseudogene.
    if '*' in pep:
        domain.pseudogene = 'Y'
        summary.append('Contains stops')

    # Count cysteine residues; any count other than 6 is noted in the summary.
    for aa in pep:
        if aa == 'C':
            domain.nCysteines += 1
    if domain.nCysteines != 6:
        summary.append('Has %i cysteines' % domain.nCysteines)

    # One tab-separated line per domain: its name, then the notes joined by '; '.
    print >> summaryFile, '%s\t%s' % (domain.domain, '; '.join(summary))

    # NOTE: the body of the following condition is not part of this excerpt.
    if domain.pseudogene == 'Y' or domain.nCysteines < 5 or domain.nCysteines > 7:
Esempio n. 3
0
i = 0
writer = fasta.MfaWriter('ORFs.fa')

filename = sys.argv[1]
header,dna = fasta.load(filename)
header = header.strip()

orfIter = sequence.extractOrfsIter(dna, minLen=minLen, pattern=pattern)
for i,gStart,gEnd,orf in orfIter:
    h = '%s.%i.%i-%i  Length=%i' % (header,i,gStart,gEnd,len(orf))
    writer.write(h, orf)
    
    fasta.pretty(h, orf)
    
    if gStart<gEnd:
        s = dna[gStart-1:gEnd]
        print gStart, gEnd, len(s), len(s) % 3==0
        print sequence.codons(s, remainder=True)
        print sequence.translate(s)
    else:
        gStart,gEnd = gEnd,gStart
        s = dna[gStart-1:gEnd]
        s = sequence.reverseComplement(s)
        print gStart, gEnd, len(s), len(s) % 3==0
        print sequence.codons(s, remainder=True)
        print sequence.translate(s)
        
    print
writer.close()
Esempio n. 4
0
from aaiter import find_orfs
from fastamasta import FastaReader
from sequence import translate, rev_complement

if __name__ == "__main__":
    reader = FastaReader("data/12.dat")
    seq = reader.readnext()[1]
    rcseq = rev_complement(seq)

    orfs = find_orfs(seq)
    rorfs = find_orfs(rcseq)

    candidates = set()
    for orf in orfs:
        candidates.add(translate(seq[orf[0]:orf[1]]))

    for rorf in rorfs:
        candidates.add(translate(rcseq[rorf[0]:rorf[1]]))

    for candidate in candidates:
        print candidate
Esempio n. 5
0
# Load the domains from DEFB.txt and write the header row of the genomic
# table: the first domain's field names plus four extra column names.
# NOTE(review): domains[0] fails if no domains were loaded.
domains = hmmer.loadDomains('DEFB.txt', seqType='BlockSixFrame')
print >> genomicFile, '\t'.join(domains[0].fields 
    + ['strand', 'lowScoring', 'pseudogene', 'nCysteines'])

# Annotate the first 100 domains: give each a DEFB_nn name, call toGenomic
# on it, and record low-scoring / pseudogene / cysteine-count fields.
for i,domain in enumerate(domains):
    if i>99: break  # stop after 100 domains
    domain.domain = 'DEFB_%0.2i' % (i+1)  # 1-based index, zero-padded to at least 2 digits
    domain.toGenomic(relative=True)
    # The extra fields start at their "negative" defaults and are updated below.
    domain.addField('lowScoring', 'N')
    domain.addField('pseudogene', 'N')
    domain.addField('nCysteines', 0)
    summary = []  # human-readable notes about this domain
    
    # Fetch the domain's sequence from the blast database and translate it.
    h,s = fasta.getSequence(blastdb, domain.accession, 
        start=domain.sStart, end=domain.sEnd, strand=domain.strand)
    pep = sequence.translate(s)
    
    # Domains from index 60 onwards are flagged as low scoring
    # (presumably `domains` is ordered by score -- TODO confirm).
    if i>59: domain.lowScoring = 'Y'
    # A '*' in the translation is reported as a stop and marks a pseudogene.
    if '*' in pep:
        domain.pseudogene = 'Y'
        summary.append('Contains stops')
    
    # Count cysteine residues; any count other than 6 is noted in the summary.
    for aa in pep:
        if aa=='C':
            domain.nCysteines += 1
    if domain.nCysteines!=6:
        summary.append('Has %i cysteines' % domain.nCysteines)
    
    # One tab-separated line per domain: its name, then the notes joined by '; '.
    print >> summaryFile, '%s\t%s' % (domain.domain, '; '.join(summary))
    
    # NOTE: the body of the following condition is not part of this excerpt.
    if domain.pseudogene=='Y' or domain.nCysteines<5 or domain.nCysteines>7:
Esempio n. 6
0
Date: Tue Aug 22 20:14:57 EST 2006

"""

import os, sys
import fasta, sequence


header,seq = fasta.load('NKC.fa')
orfIterator = fasta.load_iter('ORFs.fa')
writer = fasta.MfaWriter('ORFs2.fa')

for h,orf in orfIterator:
    chrom,block,orfId,limits = h.split()[0].split('.')
    start,end = limits.split('-')
    start = int(start)
    end = int(end)
    
    if start>end:
        strand = '-'
        start,end = end,start
        s = sequence.translate(sequence.reverseComplement(seq[start-1:end]))
    else:
        strand = '+'
        s = sequence.translate(seq[start-1:end])
    
    if s!=orf: print h
    
    writer.write(h,s + '\n')
writer.close()
Esempio n. 7
0
i = 0
writer = fasta.MfaWriter('ORFs.fa')

filename = sys.argv[1]
header, dna = fasta.load(filename)
header = header.strip()

orfIter = sequence.extractOrfsIter(dna, minLen=minLen, pattern=pattern)
for i, gStart, gEnd, orf in orfIter:
    h = '%s.%i.%i-%i  Length=%i' % (header, i, gStart, gEnd, len(orf))
    writer.write(h, orf)

    fasta.pretty(h, orf)

    if gStart < gEnd:
        s = dna[gStart - 1:gEnd]
        print gStart, gEnd, len(s), len(s) % 3 == 0
        print sequence.codons(s, remainder=True)
        print sequence.translate(s)
    else:
        gStart, gEnd = gEnd, gStart
        s = dna[gStart - 1:gEnd]
        s = sequence.reverseComplement(s)
        print gStart, gEnd, len(s), len(s) % 3 == 0
        print sequence.codons(s, remainder=True)
        print sequence.translate(s)

    print
writer.close()
Esempio n. 8
0
# Load the HLA-A sequence; its length L is passed to the six-frame
# coordinate conversion below.
h,s = fasta.load('seq/HLA-A.fa')
L = len(s)

# The six-frame branch is switched off by the literal False, so only the
# ORF branch under `else` runs.
if False:
    domains = hmmer4.load_domains('hmmer/6frames.txt')
    for d in domains:
        p = hmmer4.parseSixFrameHeader(d.accession)
        print d
        print p.name, p.frame
        # Convert the hit's coordinates in the given frame to genomic ones.
        gStart,gEnd,strand = hmmer4.convert6FrameToGenomic(d.sStart,d.sEnd,p.frame,L)
        print gStart,gEnd,strand
        if strand=='+':
            # The slice treats gStart..gEnd as 1-based and inclusive.
            dna = s[gStart-1:gEnd]
            # Length, and whether it is a whole number of codons.
            print len(dna), len(dna) % 3==0
            print sequence.codons(dna, remainder=True)
            print sequence.translate(dna)
        else:
            # Coordinates are swapped before slicing (presumably gStart > gEnd
            # on this strand -- TODO confirm), then reverse-complemented.
            gStart,gEnd = gEnd,gStart
            dna = sequence.reverseComplement(s[gStart-1:gEnd])
            print len(dna), len(dna) % 3==0
            print sequence.codons(dna, remainder=True)
            print sequence.translate(dna)
        print
else:
    domains = hmmer4.load_domains('hmmer/ORFs.txt')
    for d in domains:
        # Parse the ORF header stored in the accession; strand and start
        # from it feed the coordinate conversion.
        o = hmmer4.parseOrfHeader(d.accession)
        print d
        print o
        gStart,gEnd = hmmer4.convertOrfToGenomic(d.sStart,d.sEnd,o.strand,o.start)
        # NOTE: the body of the following condition is not part of this excerpt.
        if o.strand=='+':