def extractSeq(feature, blastDb, dx, dy, blockSize=5000000):
    """Extract the translated sequence of a feature and the DNA sequence of
    the surrounding genomic region.

    The DNA record's header carries the feature's own coordinates
    (accession:sStart-sEnd), not the coordinates of the padded window.

    @param feature: Feature object. Mandatory attributes: accession, sStart,
        sEnd, strand.
    @param blastDb: Blast database.
    @param dx: Length of sequence to extract upstream.
    @param dy: Length of sequence to extract downstream.
    @param blockSize: Largest end coordinate allowed for the padded region.
        Defaults to 5000000, the limit that used to be hard-coded.
    @returns: tuple of fasta strings (DNA, protein).
    @raises Exception: if the padded region starts below 0 or ends beyond
        blockSize.
    """
    # Upstream/downstream are relative to the feature's strand, so the two
    # pads swap ends for anything that is not on the '+' strand.
    if feature.strand == '+':
        start = feature.sStart - dx
        end = feature.sEnd + dy
    else:
        start = feature.sStart - dy
        end = feature.sEnd + dx

    # Validate the window before touching the database so an out-of-range
    # request fails without doing any I/O.
    if start < 0 or end > blockSize:
        raise Exception('Out of block bounds.')

    # Extract hmmer hit & translate
    header, seq = fasta.getSequence(
        blastDb, feature.accession,
        start=feature.sStart, end=feature.sEnd, strand=feature.strand)
    protein = '\n'.join(['>' + header, sequence.translate(seq)])

    # Extract surrounding DNA sequence. No strand is passed here, unlike the
    # hit extraction above; the header returned by the fetch is not used.
    _, seq = fasta.getSequence(
        blastDb, feature.accession, start=start, end=end)
    header = '%(accession)s:%(sStart)s-%(sEnd)s' % feature.__dict__
    dna = '\n'.join(['>' + header, seq])
    return dna, protein
# Load the DEFB hmmer hits and write the header row of the genomic table:
# the fields of the first hit followed by the columns added below.
domains = hmmer.loadDomains('DEFB.txt', seqType='BlockSixFrame')
print >> genomicFile, '\t'.join(
    domains[0].fields
    + ['strand', 'lowScoring', 'pseudogene', 'nCysteines']
)

for i, domain in enumerate(domains):
    # Only the first 100 hits are annotated.
    if i >= 100:
        break

    # Name hits DEFB_01, DEFB_02, ... in the order they were loaded.
    domain.domain = 'DEFB_%0.2i' % (i + 1)
    # NOTE(review): presumably converts the hit to genomic coordinates --
    # confirm against hmmer's toGenomic.
    domain.toGenomic(relative=True)

    # Annotation columns start from their defaults and are updated below.
    domain.addField('lowScoring', 'N')
    domain.addField('pseudogene', 'N')
    domain.addField('nCysteines', 0)
    summary = []

    # Fetch the hit's sequence on its own strand and translate it.
    h, s = fasta.getSequence(
        blastdb,
        domain.accession,
        start=domain.sStart,
        end=domain.sEnd,
        strand=domain.strand,
    )
    pep = sequence.translate(s)

    # Hits from position 60 onwards in the loaded order are flagged.
    if i >= 60:
        domain.lowScoring = 'Y'

    # A '*' in the translation marks the hit as a pseudogene.
    if '*' in pep:
        domain.pseudogene = 'Y'
        summary.append('Contains stops')

    # Tally cysteines; any count other than 6 is reported in the summary.
    domain.nCysteines += pep.count('C')
    if domain.nCysteines != 6:
        summary.append('Has %i cysteines' % domain.nCysteines)
# Writer for the extracted peptide sequences (not used in this part of the
# script).
pepWriter = fasta.MfaWriter('DEFB_extracted_pep.fa')

# Load the DEFB hmmer hits and write the header row of the genomic table:
# the fields of the first hit followed by the columns added below.
domains = hmmer.loadDomains('DEFB.txt', seqType='BlockSixFrame')
print >> genomicFile, '\t'.join(
    domains[0].fields
    + ['strand', 'lowScoring', 'pseudogene', 'nCysteines']
)

for i, domain in enumerate(domains):
    # Only the first 100 hits are annotated.
    if i >= 100:
        break

    # Name hits DEFB_01, DEFB_02, ... in the order they were loaded.
    domain.domain = 'DEFB_%0.2i' % (i + 1)
    # NOTE(review): presumably converts the hit to genomic coordinates --
    # confirm against hmmer's toGenomic.
    domain.toGenomic(relative=True)

    # Annotation columns start from their defaults and are updated below.
    domain.addField('lowScoring', 'N')
    domain.addField('pseudogene', 'N')
    domain.addField('nCysteines', 0)
    summary = []

    # Fetch the hit's sequence on its own strand and translate it.
    h, s = fasta.getSequence(
        blastdb,
        domain.accession,
        start=domain.sStart,
        end=domain.sEnd,
        strand=domain.strand,
    )
    pep = sequence.translate(s)

    # Hits from position 60 onwards in the loaded order are flagged.
    if i >= 60:
        domain.lowScoring = 'Y'

    # A '*' in the translation marks the hit as a pseudogene.
    if '*' in pep:
        domain.pseudogene = 'Y'
        summary.append('Contains stops')

    # Tally cysteines; any count other than 6 is reported in the summary.
    domain.nCysteines += pep.count('C')
    if domain.nCysteines != 6:
        summary.append('Has %i cysteines' % domain.nCysteines)

    # One summary line per hit: its name, a tab, then the '; '-joined notes
    # (empty when nothing was flagged).
    print >> summaryFile, '%s\t%s' % (domain.domain, '; '.join(summary))
def getSequence(self, blastDb, padFivePrime=0, padThreePrime=0):
    """Fetch this feature's sequence from blastDb, optionally padded.

    The padding is strand-relative: for a '-' strand feature the 5' pad
    extends the end coordinate and the 3' pad extends the start coordinate,
    so that the extra bases sit on the requested side of the feature in the
    strand-specific sequence that is returned.

    @param blastDb: Blast database passed on to fasta.getSequence.
    @param padFivePrime: Number of extra bases to include 5' of the feature.
    @param padThreePrime: Number of extra bases to include 3' of the feature.
    @returns: tuple (header, sequence) as returned by fasta.getSequence.
    """
    # NOTE(review): assumes self.start <= self.end and that the reverse
    # strand is encoded as '-' -- confirm. Any other strand value keeps the
    # forward-strand layout.
    if self.strand == '-':
        padLow, padHigh = padThreePrime, padFivePrime
    else:
        padLow, padHigh = padFivePrime, padThreePrime

    # Clamp the lower bound to 1 so the padding cannot run off the start of
    # the reference; the upper bound is not clamped here.
    start = max(1, self.start - padLow)
    end = self.end + padHigh
    return fasta.getSequence(blastDb, self.reference, start, end, self.strand)
def getSequence(self, blastDb, padFivePrime=0, padThreePrime=0):
    """Fetch this feature's sequence from blastDb, optionally padded.

    The padding is strand-relative: for a '-' strand feature the 5' pad
    extends the end coordinate and the 3' pad extends the start coordinate,
    so that the extra bases sit on the requested side of the feature in the
    strand-specific sequence that is returned.

    @param blastDb: Blast database passed on to fasta.getSequence.
    @param padFivePrime: Number of extra bases to include 5' of the feature.
    @param padThreePrime: Number of extra bases to include 3' of the feature.
    @returns: tuple (header, sequence) as returned by fasta.getSequence.
    """
    # NOTE(review): assumes self.start <= self.end and that the reverse
    # strand is encoded as '-' -- confirm. Any other strand value keeps the
    # forward-strand layout.
    if self.strand == '-':
        padLow, padHigh = padThreePrime, padFivePrime
    else:
        padLow, padHigh = padFivePrime, padThreePrime

    # Clamp the lower bound to 1 so the padding cannot run off the start of
    # the chromosome; the upper bound is not clamped here.
    start = max(1, self.start - padLow)
    end = self.end + padHigh
    return fasta.getSequence(blastDb, self.chrom, start, end, self.strand)