def constructAdjacencies(self, seq):
    """
    Constructs adjacency edges in the graph for every pair of consecutive
    kmers in seq.

    Kmers are upper-cased windows of length self.kmer_size. The kmer being
    left contributes the node "<kmer>_R" when it equals its strandless form
    and "<reverse complement>_L" otherwise; the kmer being entered
    contributes "<kmer>_L" when it equals its strandless form and
    "<reverse complement>_R" otherwise.

    A pair in which either kmer contains an "N" gets no edge.

    :param seq: the sequence whose consecutive kmers are linked in self.G.
    """
    kmer_size = self.kmer_size
    prev = seq[:kmer_size].upper()
    # None marks a kmer containing N; such kmers are never oriented
    prev_strandless = None if "N" in prev else strandless(prev)
    for i in xrange(1, len(seq) - kmer_size + 1):
        kmer = seq[i:i + kmer_size].upper()
        kmer_strandless = None if "N" in kmer else strandless(kmer)
        if prev_strandless is not None and kmer_strandless is not None:
            prev_size = len(self.G)
            if prev == prev_strandless:
                # exiting right side of previous kmer
                source = prev + "_R"
            else:
                # exiting left side of previous kmer
                source = reverseComplement(prev) + "_L"
            if kmer == kmer_strandless:
                # entering left side of next kmer
                target = kmer + "_L"
            else:
                # entering right side of next kmer
                target = reverseComplement(kmer) + "_R"
            self.G.add_edge(source, target)
            # adding the edge must not have changed len(self.G)
            # NOTE(review): presumably this is the node count, i.e. both
            # endpoints already existed - confirm against the graph type
            assert prev_size == len(self.G)
        # Always advance, even across kmers containing N. Otherwise one N
        # stalls the rest of the sequence, or an edge ends up joining two
        # kmers that are not adjacent in seq.
        prev = kmer
        prev_strandless = kmer_strandless
def constructAdjacencies(self, seq):
    """
    Constructs adjacency edges in the graph for every pair of consecutive
    kmers in seq.

    Kmers are upper-cased windows of length self.kmer_size. The kmer being
    left contributes the node "<kmer>_R" when it equals its strandless form
    and "<reverse complement>_L" otherwise; the kmer being entered
    contributes "<kmer>_L" when it equals its strandless form and
    "<reverse complement>_R" otherwise.

    A pair in which either kmer contains an "N" gets no edge.

    :param seq: the sequence whose consecutive kmers are linked in self.G.
    """
    kmer_size = self.kmer_size
    prev = seq[:kmer_size].upper()
    # None marks a kmer containing N; such kmers are never oriented
    prev_strandless = None if "N" in prev else strandless(prev)
    for i in xrange(1, len(seq) - kmer_size + 1):
        kmer = seq[i:i + kmer_size].upper()
        kmer_strandless = None if "N" in kmer else strandless(kmer)
        if prev_strandless is not None and kmer_strandless is not None:
            prev_size = len(self.G)
            if prev == prev_strandless:
                # exiting right side of previous kmer
                source = prev + "_R"
            else:
                # exiting left side of previous kmer
                source = reverseComplement(prev) + "_L"
            if kmer == kmer_strandless:
                # entering left side of next kmer
                target = kmer + "_L"
            else:
                # entering right side of next kmer
                target = reverseComplement(kmer) + "_R"
            self.G.add_edge(source, target)
            # adding the edge must not have changed len(self.G)
            # NOTE(review): presumably this is the node count, i.e. both
            # endpoints already existed - confirm against the graph type
            assert prev_size == len(self.G)
        # Always advance, even across kmers containing N. Otherwise one N
        # stalls the rest of the sequence, or an edge ends up joining two
        # kmers that are not adjacent in seq.
        prev = kmer
        prev_strandless = kmer_strandless
def countKmers(self):
    """Counts the kmers of the reference and of the reads.

    Every window of length self.kmerSize that does not contain an "N" is
    counted once as read and once as its reverse complement.

    Returns a tuple (refKmers, readKmers) of Counters, built from the
    sequences in self.referenceFastaFile and self.readFastqFile
    respectively.
    """
    kmerSize = self.kmerSize

    def tally(counts, seq):
        #Adds the kmers of seq, and their reverse complements, to counts.
        #i is the exclusive end of the window, so it must run up to and
        #including len(seq) for the last kmer of the sequence to be counted
        for i in xrange(kmerSize, len(seq) + 1):
            s = seq[i - kmerSize:i]
            if "N" not in s:
                counts[s] += 1
                counts[reverseComplement(s)] += 1

    refKmers, readKmers = Counter(), Counter()
    for name, seq in fastaRead(self.referenceFastaFile):
        tally(refKmers, seq)
    for name, seq, qual in fastqRead(self.readFastqFile):
        tally(readKmers, seq)
    return (refKmers, readKmers)
def mergeChainedAlignedSegments(chainedAlignedSegments, refSequence, readSequence):
    """Merges a chain of aligned segments into a single pysam.AlignedSegment.

    The merged alignment starts at the reference start of the first segment.
    Reference positions skipped between segments become deletions and read
    positions skipped between segments become insertions. Read positions
    before the first segment and after the last one are soft clipped.
    Clipping operations inside the individual segments are dropped.

    The query name, strand and reference id are taken from the first segment.
    The other fields (mapq, mate information, qualities, tags) are not set.
    """
    #Cigar operation codes used below
    MATCH, INSERTION, DELETION, SOFT_CLIP, HARD_CLIP = 0, 1, 2, 4, 5

    merged = pysam.AlignedSegment()
    firstSegment = chainedAlignedSegments[0]
    merged.query_name = firstSegment.query_name
    merged.next_reference_id = -1
    merged.reference_start = firstSegment.reference_start
    merged.is_reverse = firstSegment.is_reverse
    if merged.is_reverse:
        merged.query_sequence = reverseComplement(readSequence)
    else:
        merged.query_sequence = readSequence
    merged.reference_id = firstSegment.reference_id

    cigarOps = []
    refPos = firstSegment.reference_start
    if merged.is_reverse:
        #Read coordinates run from the other end of the sequence if reversed
        readPos = -(len(readSequence) - 1)
    else:
        readPos = 0

    for segment in chainedAlignedSegments:
        assert merged.is_reverse == segment.is_reverse

        #Reference positions skipped before this segment become a deletion
        assert segment.reference_start >= refPos
        refGap = segment.reference_start - refPos
        if refGap > 0:
            cigarOps.append((DELETION, refGap))
            refPos = segment.reference_start

        #Read positions skipped before this segment become a soft clip for
        #the first segment of the chain and an insertion for the others
        segmentReadStart = getFirstNonClippedPositionInRead(segment, readSequence)
        assert segmentReadStart >= readPos
        readGap = segmentReadStart - readPos
        if readGap > 0:
            gapOp = SOFT_CLIP if segment == firstSegment else INSERTION
            cigarOps.append((gapOp, readGap))
            readPos = segmentReadStart

        #Copy the segment's own operations, leaving out its clipping
        for op, length in segment.cigar:
            assert op in (MATCH, INSERTION, DELETION, SOFT_CLIP, HARD_CLIP)
            if op == MATCH:
                cigarOps.append((op, length))
                refPos += length
                readPos += length
            elif op == INSERTION:
                cigarOps.append((op, length))
                readPos += length
            elif op == DELETION:
                cigarOps.append((op, length))
                refPos += length

    assert refPos <= len(refSequence)

    #The reference end is not set explicitly, it follows from the cigar

    #Soft clip whatever is left of the read after the last segment
    if merged.is_reverse:
        assert readPos <= 1
        if readPos < 1:
            cigarOps.append((SOFT_CLIP, -readPos + 1))
    else:
        assert readPos <= len(readSequence)
        if readPos < len(readSequence):
            cigarOps.append((SOFT_CLIP, len(readSequence) - readPos))

    merged.cigar = tuple(cigarOps)

    #No hard clipping may remain in the merged cigar
    for op, length in merged.cigar:
        assert op in (MATCH, INSERTION, DELETION, SOFT_CLIP)

    #Check the reference coordinates
    alignedRefLength = sum(length for op, length in cigarOps if op in (MATCH, DELETION))
    assert alignedRefLength == merged.reference_end - merged.reference_start
    assert 0 <= merged.reference_start < len(refSequence)
    assert 0 <= merged.reference_end <= len(refSequence)

    #Check the read coordinates
    assert 0 <= merged.query_alignment_start < len(readSequence)
    assert 0 <= merged.query_alignment_end <= len(readSequence)
    alignedReadLength = sum(length for op, length in cigarOps if op in (MATCH, INSERTION))
    assert merged.query_alignment_start + alignedReadLength == merged.query_alignment_end

    return merged
def strandless(k):
    """
    Returns the strandless version of this kmer: whichever of the kmer and
    its reverse complement comes first lexicographically.
    """
    return min(k, reverseComplement(k))
def learnModelFromSamFileTargetFn(target, samFile, readFastqFile, referenceFastaFile, outputModel):
    """Does expectation maximisation on sam file to learn the hmm for the sam file.

    Writes two files into the global temp dir of target: "temp.fa", holding
    every read together with its reverse complement (named <read>_reverse),
    and "temp.cigar", holding one exonerate format cigar line per alignment
    in samFile. Unless outputModel + "_unnormalised" already exists, a child
    target running the expectation maximisation trials is added. The
    follow-on learnModelFromSamFileTargetFn2 is then set with the
    unnormalised and final model paths.

    The alignments in samFile are asserted to be global: they must start at
    position 0 of both sequences and span all of the read and the reference.

    target: the target used for the temp dir and to schedule further work.
    samFile: path of the sam file holding the alignments.
    readFastqFile: path of the fastq file holding the reads.
    referenceFastaFile: path of the fasta file holding the reference.
    outputModel: path of the final model, also used as the prefix of the
        ".xml" and "_unnormalised" files.
    """
    refSequences = getFastaDictionary(referenceFastaFile) #Hash of names to sequences
    readSequences = getFastqDictionary(readFastqFile) #Hash of names to sequences

    #Convert the read file to fasta, adding the reverse complement of each read
    reads = os.path.join(target.getGlobalTempDir(), "temp.fa")
    with open(reads, 'w') as fH: #Closed even if writing fails
        for name, seq in readSequences.items():
            fastaWrite(fH, name, seq)
            fastaWrite(fH, name + "_reverse", reverseComplement(seq))

    #Get cigars file
    cigars = os.path.join(target.getGlobalTempDir(), "temp.cigar")
    with open(cigars, 'w') as fH:
        sam = pysam.Samfile(samFile, "r")
        try:
            for aR in sam:
                #Because these are global alignments with reverse complement
                #coordinates reversed the following should all be true
                assert aR.pos == 0
                assert aR.qstart == 0
                readSeq = readSequences[aR.qname]
                assert aR.qend == len(readSeq)
                assert aR.aend == len(refSequences[sam.getrname(aR.rname)])
                assert len(aR.query) == len(readSeq)
                if aR.is_reverse: #Deal with reverse complements
                    assert aR.query.upper() == reverseComplement(readSeq).upper()
                    #Point the alignment at the reverse complement entry of temp.fa
                    aR.qname += "_reverse"
                else:
                    assert aR.query.upper() == readSeq.upper()
                #Exonerate format Cigar string
                fH.write(getExonerateCigarFormatString(aR, sam) + "\n")
        finally:
            #The sam file handle was previously never released
            sam.close()

    #Run cactus_expectationMaximisation
    options = cactus_expectationMaximisation.Options()
    options.modelType = "fiveStateAsymmetric" #"threeStateAsymmetric"
    options.optionsToRealign = "--diagonalExpansion=10 --splitMatrixBiggerThanThis=300"
    options.randomStart = True
    options.trials = 3
    options.outputTrialHmms = True
    options.iterations = 100
    options.maxAlignmentLengthPerJob = 700000
    options.maxAlignmentLengthToSample = 50000000
    options.outputXMLModelFile = outputModel + ".xml"
    options.trainEmissions = True

    unnormalisedOutputModel = outputModel + "_unnormalised"
    #Do training if necessary
    if not os.path.exists(unnormalisedOutputModel):
        target.addChildTargetFn(
            cactus_expectationMaximisation.expectationMaximisationTrials,
            args=(" ".join([reads, referenceFastaFile]), cigars,
                  unnormalisedOutputModel, options))

    #Now set up normalisation
    target.setFollowOnTargetFn(learnModelFromSamFileTargetFn2,
                               args=(unnormalisedOutputModel, outputModel))
def mergeChainedAlignedReads(chainedAlignedReads, refSequence, readSequence):
    """Merges a chain of aligned reads into one global alignment.

    The returned pysam.AlignedRead starts at reference position 0 and its
    cigar accounts for all of refSequence and all of readSequence. Reference
    positions not covered by the chain become deletions and read positions
    not covered by the chain become insertions. Clipping operations inside
    the individual alignments are dropped.

    The query name, strand and reference id are taken from the first
    alignment of the chain. The other fields (mapq, mate information,
    qualities, tags) are not set.
    """
    #Cigar operation codes used below
    MATCH, INSERTION, DELETION, SOFT_CLIP, HARD_CLIP = 0, 1, 2, 4, 5

    merged = pysam.AlignedRead()
    firstRead = chainedAlignedReads[0]
    merged.qname = firstRead.qname
    merged.rnext = -1
    merged.pos = 0
    merged.is_reverse = firstRead.is_reverse
    merged.seq = reverseComplement(readSequence) if merged.is_reverse else readSequence
    merged.rname = firstRead.rname

    cigarOps = []
    refPos = 0
    #Read coordinates run from the other end of the sequence if reversed
    readPos = -(len(readSequence) - 1) if merged.is_reverse else 0

    for alignedRead in chainedAlignedReads:
        assert merged.is_reverse == alignedRead.is_reverse

        #Reference positions skipped before this alignment become a deletion
        assert alignedRead.pos >= refPos
        refGap = alignedRead.pos - refPos
        if refGap > 0:
            cigarOps.append((DELETION, refGap))
            refPos = alignedRead.pos

        #Read positions skipped before this alignment become an insertion
        readStart = getAbsoluteReadOffset(alignedRead, refSequence, readSequence)
        assert readStart >= readPos
        readGap = readStart - readPos
        if readGap > 0:
            cigarOps.append((INSERTION, readGap))
            readPos = readStart

        #Copy the alignment's own operations, leaving out its clipping
        for op, length in alignedRead.cigar:
            assert op in (MATCH, INSERTION, DELETION, SOFT_CLIP, HARD_CLIP)
            if op == MATCH:
                cigarOps.append((op, length))
                refPos += length
                readPos += length
            elif op == INSERTION:
                cigarOps.append((op, length))
                readPos += length
            elif op == DELETION:
                cigarOps.append((op, length))
                refPos += length

    #Delete whatever is left of the reference after the chain
    assert refPos <= len(refSequence)
    if refPos < len(refSequence):
        cigarOps.append((DELETION, len(refSequence) - refPos))

    #Insert whatever is left of the read after the chain
    if merged.is_reverse:
        assert readPos <= 1
        if readPos < 1:
            cigarOps.append((INSERTION, -readPos + 1))
    else:
        assert readPos <= len(readSequence)
        if readPos < len(readSequence):
            cigarOps.append((INSERTION, len(readSequence) - readPos))

    #Check that the cigar covers both sequences completely
    coveredRefLength = sum(length for op, length in cigarOps if op in (MATCH, DELETION))
    assert coveredRefLength == len(refSequence)
    coveredReadLength = sum(length for op, length in cigarOps if op in (MATCH, INSERTION))
    assert coveredReadLength == len(readSequence)

    merged.cigar = tuple(cigarOps)
    return merged
def getReadBase(self):
    """Returns the base of self.readSeq at self.readPos, reverse
    complemented when self.isReversed is set.
    """
    onReverseStrand = self.isReversed
    base = self.readSeq[self.readPos]
    return reverseComplement(base) if onReverseStrand else base
def learnModelFromSamFileTargetFn(target, samFile, readFastqFile, referenceFastaFile, outputModel):
    """Does expectation maximisation on sam file to learn the hmm for the sam file.

    Writes two files into the global temp dir of target: "temp.fa", holding
    every read together with its reverse complement (named <read>_reverse),
    and "temp.cigar", holding one exonerate format cigar line per alignment
    in samFile. Unless outputModel + "_unnormalised" already exists, a child
    target running the expectation maximisation trials is added. The
    follow-on learnModelFromSamFileTargetFn2 is then set with the
    unnormalised and final model paths.

    The alignments in samFile are asserted to be global: they must start at
    position 0 of both sequences and span all of the read and the reference.

    target: the target used for the temp dir and to schedule further work.
    samFile: path of the sam file holding the alignments.
    readFastqFile: path of the fastq file holding the reads.
    referenceFastaFile: path of the fasta file holding the reference.
    outputModel: path of the final model, also used as the prefix of the
        ".xml" and "_unnormalised" files.
    """
    refSequences = getFastaDictionary(referenceFastaFile) #Hash of names to sequences
    readSequences = getFastqDictionary(readFastqFile) #Hash of names to sequences

    #Convert the read file to fasta, adding the reverse complement of each read
    reads = os.path.join(target.getGlobalTempDir(), "temp.fa")
    with open(reads, 'w') as fH: #Closed even if writing fails
        for name, seq in readSequences.items():
            fastaWrite(fH, name, seq)
            fastaWrite(fH, name + "_reverse", reverseComplement(seq))

    #Get cigars file
    cigars = os.path.join(target.getGlobalTempDir(), "temp.cigar")
    with open(cigars, 'w') as fH:
        sam = pysam.Samfile(samFile, "r")
        try:
            for aR in sam:
                #Because these are global alignments with reverse complement
                #coordinates reversed the following should all be true
                assert aR.pos == 0
                assert aR.qstart == 0
                readSeq = readSequences[aR.qname]
                assert aR.qend == len(readSeq)
                assert aR.aend == len(refSequences[sam.getrname(aR.rname)])
                assert len(aR.query) == len(readSeq)
                if aR.is_reverse: #Deal with reverse complements
                    assert aR.query.upper() == reverseComplement(readSeq).upper()
                    #Point the alignment at the reverse complement entry of temp.fa
                    aR.qname += "_reverse"
                else:
                    assert aR.query.upper() == readSeq.upper()
                #Exonerate format Cigar string
                fH.write(getExonerateCigarFormatString(aR, sam) + "\n")
        finally:
            #The sam file handle was previously never released
            sam.close()

    #Run cactus_expectationMaximisation
    options = cactus_expectationMaximisation.Options()
    options.modelType = "fiveStateAsymmetric" #"threeStateAsymmetric"
    options.optionsToRealign = "--diagonalExpansion=10 --splitMatrixBiggerThanThis=300"
    options.randomStart = True
    options.trials = 3
    options.outputTrialHmms = True
    options.iterations = 100
    options.maxAlignmentLengthPerJob = 700000
    options.maxAlignmentLengthToSample = 50000000
    options.outputXMLModelFile = outputModel + ".xml"
    options.trainEmissions = True

    unnormalisedOutputModel = outputModel + "_unnormalised"
    #Do training if necessary
    if not os.path.exists(unnormalisedOutputModel):
        target.addChildTargetFn(
            cactus_expectationMaximisation.expectationMaximisationTrials,
            args=(" ".join([reads, referenceFastaFile]), cigars,
                  unnormalisedOutputModel, options))

    #Now set up normalisation
    target.setFollowOnTargetFn(learnModelFromSamFileTargetFn2,
                               args=(unnormalisedOutputModel, outputModel))
def mergeChainedAlignedReads(chainedAlignedReads, refSequence, readSequence):
    """Merges a chain of aligned reads into one global alignment.

    The returned pysam.AlignedRead starts at reference position 0 and its
    cigar accounts for all of refSequence and all of readSequence. Reference
    positions not covered by the chain become deletions and read positions
    not covered by the chain become insertions. Clipping operations inside
    the individual alignments are dropped.

    The query name, strand and reference id are taken from the first
    alignment of the chain. The other fields (mapq, mate information,
    qualities, tags) are not set.
    """
    #Cigar operation codes used below
    MATCH, INSERTION, DELETION, SOFT_CLIP, HARD_CLIP = 0, 1, 2, 4, 5

    merged = pysam.AlignedRead()
    firstRead = chainedAlignedReads[0]
    merged.qname = firstRead.qname
    merged.rnext = -1
    merged.pos = 0
    merged.is_reverse = firstRead.is_reverse
    merged.seq = reverseComplement(readSequence) if merged.is_reverse else readSequence
    merged.rname = firstRead.rname

    cigarOps = []
    refPos = 0
    #Read coordinates run from the other end of the sequence if reversed
    readPos = -(len(readSequence) - 1) if merged.is_reverse else 0

    for alignedRead in chainedAlignedReads:
        assert merged.is_reverse == alignedRead.is_reverse

        #Reference positions skipped before this alignment become a deletion
        assert alignedRead.pos >= refPos
        refGap = alignedRead.pos - refPos
        if refGap > 0:
            cigarOps.append((DELETION, refGap))
            refPos = alignedRead.pos

        #Read positions skipped before this alignment become an insertion
        readStart = getAbsoluteReadOffset(alignedRead, refSequence, readSequence)
        assert readStart >= readPos
        readGap = readStart - readPos
        if readGap > 0:
            cigarOps.append((INSERTION, readGap))
            readPos = readStart

        #Copy the alignment's own operations, leaving out its clipping
        for op, length in alignedRead.cigar:
            assert op in (MATCH, INSERTION, DELETION, SOFT_CLIP, HARD_CLIP)
            if op == MATCH:
                cigarOps.append((op, length))
                refPos += length
                readPos += length
            elif op == INSERTION:
                cigarOps.append((op, length))
                readPos += length
            elif op == DELETION:
                cigarOps.append((op, length))
                refPos += length

    #Delete whatever is left of the reference after the chain
    assert refPos <= len(refSequence)
    if refPos < len(refSequence):
        cigarOps.append((DELETION, len(refSequence) - refPos))

    #Insert whatever is left of the read after the chain
    if merged.is_reverse:
        assert readPos <= 1
        if readPos < 1:
            cigarOps.append((INSERTION, -readPos + 1))
    else:
        assert readPos <= len(readSequence)
        if readPos < len(readSequence):
            cigarOps.append((INSERTION, len(readSequence) - readPos))

    #Check that the cigar covers both sequences completely
    coveredRefLength = sum(length for op, length in cigarOps if op in (MATCH, DELETION))
    assert coveredRefLength == len(refSequence)
    coveredReadLength = sum(length for op, length in cigarOps if op in (MATCH, INSERTION))
    assert coveredReadLength == len(readSequence)

    merged.cigar = tuple(cigarOps)
    return merged