def reverseComplement(self, preserveHeader=False):
    """
    Build a new FastaRecord whose sequence is the reverse complement of
    this record's sequence.

    If ``preserveHeader`` is true, the new record reuses this record's
    header unchanged.  Otherwise the header is stripped of surrounding
    whitespace and suffixed with `` [revcomp]``.
    """
    flipped = sequence.reverseComplement(self.sequence)
    newHeader = (self.header if preserveHeader
                 else '{0} [revcomp]'.format(self.header.strip()))
    return FastaRecord(newHeader, flipped)
def reference(self, aligned=True, orientation="native"):
    """
    Return the stretch of reference sequence covered by this record.

    The sequence is sliced from ``self.bam.referenceFasta[self.referenceName]``
    between ``self.tStart`` and ``self.tEnd``.

    aligned:     when true, the oriented sequence is converted to an int8
                 array, passed through ``self._gapifyRef`` and serialized
                 back; when false, the oriented sequence is returned as-is.
    orientation: "native" or "genomic".  With "native", a record on the
                 reverse strand gets its reference reverse-complemented.

    Raises ValueError for any other ``orientation`` value.
    """
    if not (orientation == "native" or orientation == "genomic"):
        # Call form of raise: valid on both Python 2 and Python 3 (the
        # `raise E, "msg"` statement form is a SyntaxError on Python 3).
        raise ValueError("Bad `orientation` value")
    tSeq = self.bam.referenceFasta[self.referenceName].sequence[self.tStart:self.tEnd]
    shouldRC = orientation == "native" and self.isReverseStrand
    tSeqOriented = reverseComplement(tSeq) if shouldRC else tSeq
    if not aligned:
        return tSeqOriented
    # NOTE(review): np.fromstring (binary mode) and ndarray.tostring are
    # deprecated in recent NumPy releases; they are left untouched because
    # swapping them changes the str/bytes type handed back to callers --
    # confirm the target Python/NumPy versions before migrating.
    x = np.fromstring(tSeqOriented, dtype=np.int8)
    y = self._gapifyRef(x, orientation)
    return y.tostring()
def reference(self, aligned=True, orientation="native"):
    """
    Return the stretch of reference sequence covered by this record.

    The sequence is sliced from ``self.bam.referenceFasta[self.referenceName]``
    between ``self.tStart`` and ``self.tEnd``.

    aligned:     when true, the oriented sequence is converted to an int8
                 array, passed through ``self._gapifyRef`` and serialized
                 back; when false, the oriented sequence is returned as-is.
    orientation: "native" or "genomic".  With "native", a record on the
                 reverse strand gets its reference reverse-complemented.

    Raises ValueError for any other ``orientation`` value.
    """
    if not (orientation == "native" or orientation == "genomic"):
        # Call form of raise: valid on both Python 2 and Python 3 (the
        # `raise E, "msg"` statement form is a SyntaxError on Python 3).
        raise ValueError("Bad `orientation` value")
    tSeq = self.bam.referenceFasta[self.referenceName].sequence[self.tStart:self.tEnd]
    shouldRC = orientation == "native" and self.isReverseStrand
    tSeqOriented = reverseComplement(tSeq) if shouldRC else tSeq
    if not aligned:
        return tSeqOriented
    # NOTE(review): np.fromstring (binary mode) and ndarray.tostring are
    # deprecated in recent NumPy releases; they are left untouched because
    # swapping them changes the str/bytes type handed back to callers --
    # confirm the target Python/NumPy versions before migrating.
    x = np.fromstring(tSeqOriented, dtype=np.int8)
    y = self._gapifyRef(x, orientation)
    return y.tostring()
def test_reverse_complement_error(self):
    # Push the known-bad fixture through sequence.reverseComplement.
    # Nothing is asserted inside this body.
    # NOTE(review): presumably paired with an expected-exception decorator
    # that is not visible here -- confirm at the definition site.
    reverse_complement = sequence.reverseComplement
    reverse_complement(self.bad_sequence)
def test_iupac(self):
    # The IUPAC fixture must map onto its stored complement ...
    complemented = sequence.complement(self.iupac_sequence)
    assert_equal(self.iupac_complement, complemented)

    # ... and onto its stored reverse complement.
    reverse_complemented = sequence.reverseComplement(self.iupac_sequence)
    assert_equal(self.iupac_reverse_complement, reverse_complemented)
def test_reverseComplement(self):
    # Forward direction: fixture sequence -> stored reverse complement.
    forward_result = sequence.reverseComplement(self.sequence)
    assert_equal(self.reverse_complement, forward_result)

    # Round trip: reverse-complementing the stored answer must give the
    # fixture sequence back.
    round_trip = sequence.reverseComplement(self.reverse_complement)
    assert_equal(self.sequence, round_trip)
def test_reverse_complement_error(self):
    # reverseComplement must raise ValueError when given the bad fixture.
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        sequence.reverseComplement(self.BAD_SEQUENCE)
def test_iupac(self):
    # The IUPAC fixture must map onto its stored complement ...
    complemented = sequence.complement(self.IUPAC_SEQUENCE)
    assert self.IUPAC_COMPLEMENT == complemented

    # ... and onto its stored reverse complement.
    reverse_complemented = sequence.reverseComplement(self.IUPAC_SEQUENCE)
    assert self.IUPAC_REVERSE_COMPLEMENT == reverse_complemented
def test_reverseComplement(self):
    # Forward direction: fixture sequence -> stored reverse complement.
    forward_result = sequence.reverseComplement(self.SEQUENCE)
    assert self.REVERSE_COMPLEMENT == forward_result

    # Round trip: reverse-complementing the stored answer must give the
    # fixture sequence back.
    round_trip = sequence.reverseComplement(self.REVERSE_COMPLEMENT)
    assert self.SEQUENCE == round_trip
def extractIPDonlyTgenomic(cbamfile, alncbamfile, sbamfile, zmwqueue, outqueue):
    """
    Queue worker: compute per-position mean IPD for each ZMW it is handed.

    ZMW hole numbers are pulled from ``zmwqueue`` until a ``None`` sentinel
    arrives.  For every ZMW the CCS read (``cbamfile``), its subreads
    (``sbamfile``) and its CCS alignments (``alncbamfile``) are looked up,
    each subread is globally aligned to the CCS with edlib in both
    orientations, and the 'Ipd' value of every exactly matching base is
    placed at the CCS position it aligns to.

    One result dict per ZMW is put on ``outqueue`` with the keys:
      zmw, cclen, nsubr, naln            -- identifiers and counts
      chr, refStart, refEnd,
      alnStart, alnEnd                   -- taken from the longest CCS
                                            alignment, or "noAlignment" / -1
                                            when the ZMW has none
      basemeanA/C/G/T                    -- mean normalized IPD per base
      onlyt                              -- forward-strand mean at T
                                            positions, reverse-strand mean at
                                            A positions, NaN elsewhere
      bingmm, gmmlowmean, gmmlowweight,
      gmmhighmean, gmmhighweight,
      aicDiff, convergeInf               -- values returned by fitGaussian

    ``zmwqueue.task_done()`` is called once per item taken (including the
    sentinel), and a final ``None`` is put on ``outqueue`` when finished.
    """
    with pb.IndexedBamReader(alncbamfile) as alncbam, pb.IndexedBamReader(sbamfile) as sbam,\
            pb.IndexedBamReader(cbamfile) as cbam:
        zmw = zmwqueue.get()
        while zmw is not None:
            cc = cbam.readsByHoleNumber(zmw)[0]
            ccread = cc.read(aligned=False, orientation='native')
            subrs = sbam.readsByHoleNumber(zmw)
            ccalns = alncbam.readsByHoleNumber(zmw)
            zmres = {}

            # Report coordinates from the longest CCS alignment (the first
            # one on ties); None marks "this ZMW has no alignment".
            if len(ccalns) > 0:
                alnlen = np.array([ccal.readLength for ccal in ccalns])
                usealn = int(np.argmax(alnlen))
            else:
                usealn = None

            # Rows are subreads, columns are CCS positions; NaN = no value.
            allipds = np.full((len(subrs), len(ccread)), np.nan, dtype='float32')
            subrOrient = np.empty(len(subrs), dtype='bool')
            for index, sr in enumerate(subrs):
                # Test if this subread aligns to the forward or reverse of the CCS
                forwardSread = sr.read(aligned=False, orientation='native')
                reverseSread = reverseComplement(forwardSread)
                faln = edlib.align(forwardSread, ccread, mode='NW', task='path')
                raln = edlib.align(reverseSread, ccread, mode='NW', task='path')
                isForward = faln['editDistance'] < raln['editDistance']
                subrOrient[index] = isForward
                if isForward:
                    alndir, useread = faln, forwardSread
                else:
                    alndir, useread = raln, reverseSread

                # Use the alignment information to extract IPD at each base
                # that aligns to the CCS.  Slots stay at -1 unless the base is
                # an exact match.  A signed pointer-sized integer is used
                # because an integer array cannot hold NaN (so NaN cannot act
                # as the "unset" marker) and int16 would overflow on reads
                # longer than 32767 bases.
                origb = np.full(len(useread), -1, dtype=np.intp)
                ccb = np.full(len(useread), -1, dtype=np.intp)
                subI = 0
                ccI = 0
                for m in p.finditer(alndir['cigar']):
                    op = m.group()
                    lg = int(op[:-1])
                    mtype = op[-1]
                    if mtype == '=':
                        origb[subI:subI + lg] = np.arange(subI, subI + lg)
                        ccb[subI:subI + lg] = np.arange(ccI, ccI + lg)
                        subI += lg
                        ccI += lg
                    elif mtype == 'X':
                        subI += lg
                        ccI += lg
                    elif mtype == 'I':
                        subI += lg
                    elif mtype == 'D':
                        ccI += lg

                # Keep only the exactly matching positions.
                matched = origb >= 0
                ccb = ccb[matched]
                origb = origb[matched]
                if not isForward:
                    # The alignment used the reverse complement, so count
                    # positions from the end of the native-orientation read.
                    origb = -1 - origb
                ipds = sr.baseFeature('Ipd', aligned=False, orientation="native")
                allipds[index, ccb] = ipds[origb]

            # Normalize the IPD values by the mean of the chosen percentiles
            # of all values collected for this molecule.
            # NOTE(review): the collapsed source does not show whether this
            # statement sat inside or after the subread loop; it is applied
            # once, after the loop -- confirm against the original layout.
            allipds = allipds / np.mean(np.percentile(allipds[~np.isnan(allipds)], usepercentiles))

            # CCS positions of each base
            readisb = {refb: np.where([b == refb for b in ccread])[0]
                       for refb in ['A', 'C', 'G', 'T']}

            # Take the mean IPD at each position
            with warnings.catch_warnings():
                # ignoring warnings from taking the mean of columns that are all NaN
                warnings.simplefilter("ignore", category=RuntimeWarning)
                forwardMean = np.nanmean(allipds[subrOrient, :], axis=0)
                reverseMean = np.nanmean(allipds[~subrOrient, :], axis=0)

            # get the mean at just Ts: a T on the CCS is read from forward
            # subreads, an A on the CCS is a T on the opposite strand and is
            # read from reverse subreads
            tonlyMean = np.full(len(ccread), np.nan, dtype='float32')
            tonlyMean[readisb['T']] = forwardMean[readisb['T']]
            tonlyMean[readisb['A']] = reverseMean[readisb['A']]

            # Save useful information about this molecule
            zmres['zmw'] = zmw
            zmres['cclen'] = len(ccread)
            zmres['nsubr'] = len(subrs)
            zmres['naln'] = len(ccalns)
            if usealn is not None:
                bestaln = ccalns[usealn]
                zmres['chr'] = bestaln.referenceName
                zmres['refStart'] = bestaln.referenceStart
                zmres['refEnd'] = bestaln.referenceEnd
                zmres['alnStart'] = bestaln.aStart
                zmres['alnEnd'] = bestaln.aEnd
            else:
                zmres['chr'] = "noAlignment"
                zmres['refStart'] = -1
                zmres['refEnd'] = -1
                zmres['alnStart'] = -1
                zmres['alnEnd'] = -1

            with warnings.catch_warnings():
                # ignoring warnings from taking the mean of all NaN
                warnings.simplefilter("ignore", category=RuntimeWarning)
                # Something is wrong with the read if these throw a warning,
                # but still no need to print on command line.
                # The stored value will be NaN
                zmres['basemeanA'] = np.nanmean(np.concatenate(
                    [forwardMean[readisb['A']], reverseMean[readisb['T']]], axis=None))
                zmres['basemeanC'] = np.nanmean(np.concatenate(
                    [forwardMean[readisb['C']], reverseMean[readisb['G']]], axis=None))
                zmres['basemeanG'] = np.nanmean(np.concatenate(
                    [forwardMean[readisb['G']], reverseMean[readisb['C']]], axis=None))
                zmres['basemeanT'] = np.nanmean(np.concatenate(
                    [forwardMean[readisb['T']], reverseMean[readisb['A']]], axis=None))
            zmres['onlyt'] = tonlyMean

            # binarize the onlyT ipds
            respf, gmmeans, gmweights, aic, convinf = fitGaussian(tonlyMean, initmean=True, zmw=zmw)
            zmres['bingmm'] = respf
            zmres['gmmlowmean'] = gmmeans[0]
            zmres['gmmlowweight'] = gmweights[0]
            zmres['gmmhighmean'] = gmmeans[1]
            zmres['gmmhighweight'] = gmweights[1]
            zmres['aicDiff'] = aic
            zmres['convergeInf'] = convinf

            outqueue.put(zmres)  # put the results in the output queue
            zmwqueue.task_done()
            zmw = zmwqueue.get()
        # Acknowledge the None sentinel and tell the consumer we are done.
        zmwqueue.task_done()
        outqueue.put(None)