Ejemplo n.º 1
0
def extractSeq(feature, blastDb, dx, dy, maxEnd=5000000):
    """Extract the translated sequence of a feature and the DNA
    sequence of the surrounding genomic region.

    The surrounding window depends on the strand: on the '+' strand dx is
    taken before sStart and dy after sEnd; on any other strand the two
    lengths are swapped.

    @param feature: Feature object. Mandatory attributes: accession, sStart,
        sEnd, strand.
    @param blastDb: Blast database.
    @param dx: Length of sequence to extract upstream.
    @param dy: Length of sequence to extract downstream.
    @param maxEnd: Largest end coordinate accepted for the window. Defaults
        to 5000000, the limit that used to be hard-coded.
    @returns: tuple of fasta strings (DNA, protein).
    @raises ValueError: if the window starts below 0 or ends beyond maxEnd.
    """
    if feature.strand == '+':
        start = feature.sStart - dx
        end = feature.sEnd + dy
    else:
        start = feature.sStart - dy
        end = feature.sEnd + dx

    # Validate the window before touching the database, so an out-of-range
    # request fails without performing any sequence extraction.
    if start < 0 or end > maxEnd:
        raise ValueError('Out of block bounds.')

    # Extract hmmer hit & translate
    header, seq = fasta.getSequence(blastDb, feature.accession,
        start=feature.sStart, end=feature.sEnd, strand=feature.strand)
    protein = '\n'.join(['>' + header, sequence.translate(seq)])

    # Extract surrounding DNA sequence. No strand is passed here, and the
    # header returned by the lookup is discarded: the DNA record is labelled
    # with the feature's own coordinates, not the padded window.
    _, seq = fasta.getSequence(blastDb, feature.accession,
        start=start, end=end)
    # Read the attributes directly rather than via feature.__dict__, which
    # fails for slots, properties and class-level attributes.
    header = '%s:%s-%s' % (feature.accession, feature.sStart, feature.sEnd)
    dna = '\n'.join(['>' + header, seq])

    return dna, protein
Ejemplo n.º 2
0
# Load the DEFB HMMER hits and write the header row of the genomic table:
# the hits' own field names followed by the four annotation columns.
domains = hmmer.loadDomains('DEFB.txt', seqType='BlockSixFrame')
print >> genomicFile, '\t'.join(
    domains[0].fields + ['strand', 'lowScoring', 'pseudogene', 'nCysteines']
)

for i, domain in enumerate(domains):
    # Only the first 100 hits are processed.
    if i >= 100:
        break

    # Hits are named DEFB_01, DEFB_02, ... in load order.
    domain.domain = 'DEFB_%0.2i' % (i + 1)
    # NOTE(review): presumably maps the hit onto genomic coordinates;
    # confirm against the hmmer domain class.
    domain.toGenomic(relative=True)

    # Annotation fields start from their defaults; hits from index 60
    # onward are then flagged as low scoring.
    domain.addField('lowScoring', 'N')
    domain.addField('pseudogene', 'N')
    domain.addField('nCysteines', 0)
    if i >= 60:
        domain.lowScoring = 'Y'
    summary = []

    # Fetch the hit's sequence on its own strand and translate it.
    h, s = fasta.getSequence(
        blastdb, domain.accession,
        start=domain.sStart, end=domain.sEnd, strand=domain.strand)
    pep = sequence.translate(s)

    # A '*' in the translation flags the hit as a pseudogene.
    if '*' in pep:
        summary.append('Contains stops')
        domain.pseudogene = 'Y'

    # Tally the cysteines; any count other than six is noted.
    domain.nCysteines += pep.count('C')
    if domain.nCysteines != 6:
        summary.append('Has %i cysteines' % domain.nCysteines)
Ejemplo n.º 3
0
# Multi-fasta writer for the extracted peptides (not used within this
# fragment).
pepWriter = fasta.MfaWriter('DEFB_extracted_pep.fa')

# Header row of the genomic table: the hits' own fields plus the
# annotation columns filled in below.
domains = hmmer.loadDomains('DEFB.txt', seqType='BlockSixFrame')
print >> genomicFile, '\t'.join(
    domains[0].fields + ['strand', 'lowScoring', 'pseudogene', 'nCysteines'])

for i, domain in enumerate(domains):
    # Stop after the first 100 hits.
    if not i < 100:
        break

    domain.domain = 'DEFB_%0.2i' % (i + 1)
    domain.toGenomic(relative=True)
    summary = []

    # Register the annotation fields with their default values.
    domain.addField('lowScoring', 'N')
    domain.addField('pseudogene', 'N')
    domain.addField('nCysteines', 0)

    # Translate the hit's sequence, read from the hit's own strand.
    h, s = fasta.getSequence(
        blastdb,
        domain.accession,
        start=domain.sStart,
        end=domain.sEnd,
        strand=domain.strand,
    )
    pep = sequence.translate(s)

    # Hits from index 60 onward are flagged as low scoring.
    if i > 59:
        domain.lowScoring = 'Y'

    # A '*' in the translation flags the hit as a pseudogene.
    if '*' in pep:
        domain.pseudogene = 'Y'
        summary.append('Contains stops')

    # Count the cysteines; anything other than six goes into the summary.
    domain.nCysteines += len([aa for aa in pep if aa == 'C'])
    if not domain.nCysteines == 6:
        summary.append('Has %i cysteines' % domain.nCysteines)

    # One summary line per hit: its name, then the notes joined by '; '.
    print >> summaryFile, '%s\t%s' % (domain.domain, '; '.join(summary))
    
Ejemplo n.º 4
0
 def getSequence(self, blastDb, padFivePrime=0, padThreePrime=0):
     """Fetch this feature's sequence, optionally padded on both sides.

     Padding is strand-relative: on the '-' strand the 5' pad is applied
     after self.end and the 3' pad before self.start. Any other strand
     value pads exactly as before (5' before start, 3' after end).

     @param blastDb: Blast database handed to fasta.getSequence.
     @param padFivePrime: Extra length to include on the 5' side.
     @param padThreePrime: Extra length to include on the 3' side.
     @returns: (header, sequence) tuple from fasta.getSequence.
     """
     # NOTE(review): assumes the minus strand is encoded as '-'; confirm
     # against the class that owns this method.
     if self.strand == '-':
         padBefore, padAfter = padThreePrime, padFivePrime
     else:
         padBefore, padAfter = padFivePrime, padThreePrime
     # The start is clamped at 1 so padding never requests a coordinate
     # below 1.
     start = max(1, self.start - padBefore)
     end = self.end + padAfter
     h, s = fasta.getSequence(blastDb, self.reference, start, end, self.strand)
     return h, s
Ejemplo n.º 5
0
 def getSequence(self, blastDb, padFivePrime=0, padThreePrime=0):
     """Return the (header, sequence) of this feature with optional padding.

     The two pads are interpreted relative to the feature's strand: for a
     '-' strand feature the 5' pad extends self.end and the 3' pad extends
     self.start. For every other strand value the 5' pad extends
     self.start and the 3' pad extends self.end, as before.

     @param blastDb: Blast database handed to fasta.getSequence.
     @param padFivePrime: Extra length to include on the 5' side.
     @param padThreePrime: Extra length to include on the 3' side.
     @returns: (header, sequence) tuple from fasta.getSequence.
     """
     # NOTE(review): assumes the minus strand is encoded as '-'; confirm
     # against the class that owns this method.
     minusStrand = self.strand == '-'
     padStart = padThreePrime if minusStrand else padFivePrime
     padEnd = padFivePrime if minusStrand else padThreePrime
     # Clamp the start at 1 so padding never requests a coordinate below 1.
     start = max(1, self.start - padStart)
     end = self.end + padEnd
     h, s = fasta.getSequence(blastDb, self.chrom, start, end, self.strand)
     return h, s