def random_mutations(self):
    """Place random indels and SNPs on every ploid, apply them, and report them.

    What the method does, in order:

    1. For each ploid ``i`` it makes ``self.indelsToAdd[i]`` attempts to place
       a random insertion or deletion.  Positions already marked in
       ``self.blackList`` are avoided; accepted indels mark the positions
       ``eventPos .. eventPos + inLen`` with ``1``.
    2. The random indels are merged with ``self.indelList`` and sorted by
       descending position.
    3. The same is done for SNPs (``self.snpsToAdd`` / ``self.snpList``);
       accepted SNPs mark their position with ``2``.  Any SNP whose position
       is marked ``1`` (covered by an indel) is dropped.
    4. SNPs, then indels, are written into ``self.sequences``.  For every
       ploid ``self.adj[i]`` is rebuilt and, unless ``self.onlyVCF`` is set,
       ``self.allCigar[i]``, ``self.FM_pos[i]`` and ``self.FM_span[i]`` get
       one entry appended per read-length window.

    Returns:
        A list of tuples, sorted, one per distinct variant:
        ``(pos + self.x, ref, alt, fraction_of_ploids, 'WP=a/b/...')`` where
        the ``WP`` string has a ``'1'`` for each ploid carrying the variant.

    Raises:
        SystemExit: with status 1 (after printing a diagnostic) when a
            variant's reference allele does not match the sequence it is
            about to be applied to.
    """
    # Local import: sys.exit is always available, whereas the bare ``exit``
    # helper is only injected by the ``site`` module.
    import sys

    # ---- add random indels -------------------------------------------------
    all_indels = [[] for n in self.sequences]
    for i in xrange(self.ploidy):
        for _ in xrange(self.indelsToAdd[i]):
            if random.random() <= self.models[i][1]:
                # homozygous indel: goes on every ploid
                whichPloid = range(self.ploidy)
            else:
                # heterozygous indel: goes on one sampled ploid
                whichPloid = [self.ploidMutPrior.sample()]

            # try to find a position that is free on all affected ploids
            eventPos = -1
            for _attempt in xrange(MAX_ATTEMPTS):
                candidate = random.randint(self.winBuffer, self.seqLen - 1)
                if not any(self.blackList[p][candidate] for p in whichPloid):
                    eventPos = candidate
                    break
            if eventPos == -1:
                continue

            if random.random() <= self.models[i][3]:
                # insertion: inserted bases are uniformly random (change this later)
                inLen = self.models[i][4].sample()
                inSeq = ''.join([random.choice(NUCL) for n in xrange(inLen)])
                refNucl = chr(self.sequences[i][eventPos])
                myIndel = (eventPos, refNucl, refNucl + inSeq)
            else:
                # deletion
                inLen = self.models[i][5].sample()
                if eventPos + inLen + 1 >= len(self.sequences[i]):
                    # deletion would run too close to the sequence boundary
                    continue
                if inLen == 1:
                    inSeq = chr(self.sequences[i][eventPos + 1])
                else:
                    inSeq = str(self.sequences[i][eventPos + 1:eventPos + inLen + 1])
                refNucl = chr(self.sequences[i][eventPos])
                myIndel = (eventPos, refNucl + inSeq, refNucl)

            # skip events whose reference allele reaches into the end buffer
            if eventPos + len(myIndel[1]) >= self.seqLen - self.winBuffer - 1:
                continue

            # skip events that collide with something already placed
            spanEnd = eventPos + inLen + 1
            if any(self.blackList[p][k]
                   for p in whichPloid
                   for k in xrange(eventPos, spanEnd)):
                continue

            for p in whichPloid:
                for k in xrange(eventPos, spanEnd):
                    self.blackList[p][k] = 1
                all_indels[p].append(myIndel)

    # merge with the indels supplied in self.indelList; descending position
    # order lets them be applied later without shifting the remaining ones
    for i, ploidIndels in enumerate(all_indels):
        ploidIndels.extend(self.indelList[i])
    all_indels = [sorted(n, reverse=True) for n in all_indels]

    # ---- add random snps ---------------------------------------------------
    all_snps = [[] for n in self.sequences]
    for i in xrange(self.ploidy):
        for _ in xrange(self.snpsToAdd[i]):
            if random.random() <= self.models[i][1]:
                # homozygous SNP
                whichPloid = range(self.ploidy)
            else:
                # heterozygous SNP
                whichPloid = [self.ploidMutPrior.sample()]

            # try to find a position that is free on all affected ploids
            eventPos = -1
            for _attempt in xrange(MAX_ATTEMPTS):
                if IGNORE_TRINUC:
                    candidate = random.randint(self.winBuffer + 1, self.seqLen - 2)
                else:
                    # choose the location from the trinuc bias of one of the
                    # affected ploids (picked at random if there are several)
                    ploid_to_use = whichPloid[random.randint(0, len(whichPloid) - 1)]
                    candidate = self.trinuc_bias[ploid_to_use].sample()
                if not any(self.blackList[p][candidate] for p in whichPloid):
                    eventPos = candidate
                    break
            if eventPos == -1:
                continue

            refNucl = chr(self.sequences[i][eventPos])
            context = str(chr(self.sequences[i][eventPos - 1]) +
                          chr(self.sequences[i][eventPos + 1]))
            # sample from tri-nucleotide substitution matrices to get the alt allele
            newNucl = self.models[i][6][TRI_IND[context]][NUC_IND[refNucl]].sample()
            mySNP = (eventPos, refNucl, newNucl)

            for p in whichPloid:
                all_snps[p].append(mySNP)
                self.blackList[p][mySNP[0]] = 2

    # combine random snps with inserted snps, remove any snps that overlap indels
    for p in xrange(len(all_snps)):
        all_snps[p].extend(self.snpList[p])
        all_snps[p] = [n for n in all_snps[p] if self.blackList[p][n[0]] != 1]

    # ---- modify reference sequences ----------------------------------------
    for i, ploidSnps in enumerate(all_snps):
        for snp in ploidSnps:
            vPos = snp[0]
            found = chr(self.sequences[i][vPos])
            if snp[1] != found:
                # sanity check: the recorded ref allele must match the sequence
                print('\nError: Something went wrong!\n%s %s \n' % (snp, found))
                sys.exit(1)
            self.sequences[i][vPos] = snp[2]

    adjToAdd = [[] for n in xrange(self.ploidy)]
    for i in xrange(len(all_indels)):
        for indel in all_indels[i]:
            vPos = indel[0]
            vPos2 = vPos + len(indel[1])
            found = str(self.sequences[i][vPos:vPos2])
            if indel[1] != found:
                # sanity check: the recorded ref allele must match the sequence
                print('\nError: Something went wrong!\n%s %s \n' % (indel, found))
                sys.exit(1)
            self.sequences[i] = (self.sequences[i][:vPos] +
                                 bytearray(indel[2]) +
                                 self.sequences[i][vPos2:])
            adjToAdd[i].append((indel[0], len(indel[2]) - len(indel[1])))
        adjToAdd[i].sort()

        # adj[i][j] = summed length change of every indel with position < j
        self.adj[i] = np.zeros(len(self.sequences[i]), dtype='<i4')
        indSoFar = 0
        valSoFar = 0
        for j in xrange(len(self.adj[i])):
            if indSoFar < len(adjToAdd[i]) and j >= adjToAdd[i][indSoFar][0] + 1:
                valSoFar += adjToAdd[i][indSoFar][1]
                indSoFar += 1
            self.adj[i][j] = valSoFar

        # precompute cigar strings (skipped when only vcf output is wanted)
        if not self.onlyVCF:
            tempSymbolString = ['M']
            prevVal = self.adj[i][0]
            j = 1
            while j < len(self.adj[i]):
                diff = self.adj[i][j] - prevVal
                prevVal = self.adj[i][j]
                if diff > 0:
                    # insertion: one 'I' per inserted base, and jump past them
                    tempSymbolString.extend(['I'] * abs(diff))
                    j += abs(diff)
                elif diff < 0:
                    # deletion: the 'D's are attached to the following match
                    tempSymbolString.append('D' * abs(diff) + 'M')
                    j += 1
                else:
                    tempSymbolString.append('M')
                    j += 1

            for j in xrange(len(tempSymbolString) - self.readLen):
                self.allCigar[i].append(
                    CigarString(listIn=tempSymbolString[j:j + self.readLen]).getString())
                # pre-compute reference position of first matching base
                my_fm_pos = None
                for k in xrange(self.readLen):
                    if 'M' in tempSymbolString[j + k]:
                        my_fm_pos = j + k
                        break
                if my_fm_pos is None:
                    self.FM_pos[i].append(None)
                    self.FM_span[i].append(None)
                else:
                    self.FM_pos[i].append(my_fm_pos - self.adj[i][my_fm_pos])
                    span_dif = len([nnn for nnn in tempSymbolString[j:j + self.readLen]
                                    if 'M' in nnn])
                    self.FM_span[i].append(self.FM_pos[i][-1] + span_dif)

    # ---- tally up variants implemented -------------------------------------
    # countDict maps each variant (position shifted by self.x) to the list of
    # ploids that carry it
    countDict = {}
    all_variants = [sorted(all_snps[i] + all_indels[i]) for i in xrange(self.ploidy)]
    for i, ploidVariants in enumerate(all_variants):
        for variant in ploidVariants:
            t = (variant[0] + self.x,) + variant[1:]
            countDict.setdefault(t, []).append(i)

    #
    # TODO: combine multiple variants that happened to occur at same position
    #       into single vcf entry
    #

    output_variants = []
    for k in sorted(countDict.keys()):
        carriers = countDict[k]
        ploid_string = ['0' for n in xrange(self.ploidy)]
        for k2 in carriers:
            ploid_string[k2] = '1'
        output_variants.append(
            k + (len(carriers) / float(self.ploidy), 'WP=' + '/'.join(ploid_string)))
    return output_variants
def sample_read(self, sequencingModel, fragLen=None):
    """Sample one read (or one read pair) and apply sequencing errors to it.

    A ploid is chosen uniformly at random.  The start position is drawn from
    ``self.coverage_distribution[ploid]`` (single read) or from
    ``self.coverage_distribution[ploid][self.fraglens_indMap[fragLen]]``
    (read pair; the second read starts at ``rPos1 + fragLen - self.readLen``
    and its errors are requested with ``isReverseStrand=True``).

    Errors come from ``sequencingModel.getSequencingErrors`` as tuples
    ``(type, len, pos, ref, alt)`` with ``type`` in ``'D'``, ``'I'``, ``'S'``.
    Deletions and insertions are applied first (each group ordered by
    position), then substitutions; the CIGAR string is rebuilt whenever an
    indel error was applied.

    Args:
        sequencingModel: object providing ``getSequencingErrors(data, ...)``
            and ``errP`` (``max(errP[3])`` sizes the per-position offset table).
        fragLen: fragment length for paired reads, or ``None`` for a single read.

    Returns:
        A list with one entry per read:
        ``[pos, cigar, read_string, qual_string]`` where ``pos`` is
        ``self.FM_pos[ploid][read_start]``.

    Raises:
        SystemExit: with status 1 (after printing a diagnostic) when no
            precomputed CIGAR exists for the sampled position, or when an
            error's reference allele does not match the read data.

    NOTE(review): a second ``sample_read`` with the same signature appears
    later in this file; if both belong to the same class the later one is
    the definition Python keeps -- confirm which is intended.
    """
    # Local import: sys.exit is always available, whereas the bare ``exit``
    # helper is only injected by the ``site`` module.
    import sys

    # choose a ploid
    myPloid = random.randint(0, self.ploidy - 1)

    # choose a position within the ploid, and generate quality scores /
    # sequencing errors for the read data found there
    readsToSample = []
    if fragLen is None:
        rPos = self.coverage_distribution[myPloid].sample()
        rDat = self.sequences[myPloid][rPos:rPos + self.readLen]
        (myQual, myErrors) = sequencingModel.getSequencingErrors(rDat)
        readsToSample.append([rPos, myQual, myErrors, rDat])
    else:
        # EXPERIMENTAL: one coverage distribution per fragment length
        rPos1 = self.coverage_distribution[myPloid][self.fraglens_indMap[fragLen]].sample()
        rPos2 = rPos1 + fragLen - self.readLen
        rDat1 = self.sequences[myPloid][rPos1:rPos1 + self.readLen]
        rDat2 = self.sequences[myPloid][rPos2:rPos2 + self.readLen]
        (myQual1, myErrors1) = sequencingModel.getSequencingErrors(rDat1)
        (myQual2, myErrors2) = sequencingModel.getSequencingErrors(rDat2, isReverseStrand=True)
        readsToSample.append([rPos1, myQual1, myErrors1, rDat1])
        readsToSample.append([rPos2, myQual2, myErrors2, rDat2])

    # error format:
    # myError[i] = (type, len, pos, ref, alt)

    # examine sequencing errors to-be-inserted.
    #   - skip indels when there is not enough bordering sequence to "fill in"
    # if error is valid, make the changes to the read data
    rOut = []
    for r in readsToSample:
        try:
            myCigar = self.allCigar[myPloid][r[0]]
        except IndexError:
            print('Index error when attempting to find cigar string.')
            print('%s %s' % (len(self.allCigar[myPloid]), r[0]))
            if fragLen is not None:
                print(str((rPos1, rPos2)))
                print('%s %s %s' % (myPloid, fragLen, self.fraglens_indMap[fragLen]))
            sys.exit(1)

        totalD = sum([error[1] for error in r[2] if error[0] == 'D'])
        totalI = sum([error[1] for error in r[2] if error[0] == 'I'])
        availB = len(self.sequences[myPloid]) - r[0] - self.readLen - 1
        # add buffer sequence to fill in positions that get deleted
        r[3] += self.sequences[myPloid][r[0] + self.readLen:r[0] + self.readLen + totalD]

        expandedCigar = []
        # Fill symbols taken from beyond the read end.  Initialised once per
        # read (not once per error) so the value computed for the first indel
        # is still there when the CIGAR is rebuilt after several indels.
        extraCigarVal = []
        # sse_adj[p] = net shift of read position p caused by indels applied so far
        sse_adj = [0] * (self.readLen + max(sequencingModel.errP[3]))
        anyIndelErr = False
        skipIndels = False

        # sort by letter (D > I > S) such that we introduce all indel errors
        # before substitution errors; secondarily, sort by index
        arrangedErrors = {'D': [], 'I': [], 'S': []}
        for error in r[2]:
            arrangedErrors[error[0]].append((error[2], error))
        sortedErrors = []
        for k in sorted(arrangedErrors.keys()):
            sortedErrors.extend([n[1] for n in sorted(arrangedErrors[k])])

        for error in sortedErrors:
            eLen = error[1]
            ePos = error[2]

            if error[0] == 'S':
                # substitution errors, much easier by comparison...
                target = ePos + sse_adj[ePos]
                if chr(r[3][target]) != error[3]:
                    print('\nError, ref does not match alt while attempting to insert substitution error!\n')
                    sys.exit(1)
                r[3][target] = error[4]
                continue

            # ---- indel error ('D' or 'I') ----
            anyIndelErr = True
            if totalD > availB:
                # not enough bases to fill-in deletions: skip all indel errors
                continue
            if not expandedCigar:
                expandedCigar = CigarString(stringIn=myCigar).getList()
                fillToGo = totalD - totalI + 1
                if fillToGo > 0:
                    try:
                        extraCigarVal = CigarString(
                            stringIn=self.allCigar[myPloid][r[0] + fillToGo]
                        ).getList()[-fillToGo:]
                    except IndexError:
                        # applying the deletions we want requires going beyond
                        # region boundaries. skip all indel errors
                        skipIndels = True
            if skipIndels:
                continue

            myadj = sse_adj[ePos]
            if error[0] == 'D':
                # insert deletion error into read and update cigar accordingly
                pi = ePos + myadj
                pf = ePos + myadj + eLen + 1
                if str(r[3][pi:pf]) != str(error[3]):
                    print('\nError, ref does not match alt while attempting to insert deletion error!\n')
                    sys.exit(1)
                r[3] = r[3][:pi + 1] + r[3][pf:]
                expandedCigar = expandedCigar[:pi + 1] + expandedCigar[pf:]
                if pi + 1 == len(expandedCigar):
                    # weird edge case with del at very end of region.
                    # Make a guess and add a "M"
                    expandedCigar.append('M')
                expandedCigar[pi + 1] = 'D' * eLen + expandedCigar[pi + 1]
                for i in xrange(ePos, len(sse_adj)):
                    sse_adj[i] -= eLen
            else:
                # insert insertion error into read and update cigar accordingly
                target = ePos + myadj
                if chr(r[3][target]) != error[3]:
                    print('\nError, ref does not match alt while attempting to insert insertion error!\n')
                    print('--- %s != %s' % (chr(r[3][target]), error[3]))
                    sys.exit(1)
                r[3] = r[3][:target] + error[4] + r[3][target + 1:]
                expandedCigar = expandedCigar[:target] + ['I'] * eLen + expandedCigar[target:]
                for i in xrange(ePos, len(sse_adj)):
                    sse_adj[i] += eLen

        if anyIndelErr:
            if len(expandedCigar):
                relevantCigar = (expandedCigar + extraCigarVal)[:self.readLen]
                myCigar = CigarString(listIn=relevantCigar).getString()
            # drop whatever is left of the fill-in buffer
            r[3] = r[3][:self.readLen]

        rOut.append([self.FM_pos[myPloid][r[0]], myCigar, str(r[3]), str(r[1])])

    # rOut[i] = (pos, cigar, read_string, qual_string)
    return rOut
def sample_read(self, sequencingModel, fragLen=None):
    """Sample one read (or one read pair) and apply sequencing errors to it.

    A ploid is chosen uniformly at random.  A window is then drawn with
    ``self.which_bucket.sample()`` (shifted back by ``self.win_per_read``,
    floored at 0), its bounds are shifted by ``self.adj`` and a start
    position is drawn uniformly inside it.  Windows are redrawn until the
    read (or fragment) fits inside the ploid's sequence.

    Errors come from ``sequencingModel.getSequencingErrors`` as tuples
    ``(type, len, pos, ref, alt)`` with ``type`` in ``'D'``, ``'I'``, ``'S'``.
    Deletions and insertions are applied first (each group ordered by
    position), then substitutions; the CIGAR string is rebuilt whenever an
    indel error was applied.

    Args:
        sequencingModel: object providing ``getSequencingErrors(data, ...)``.
        fragLen: fragment length for paired reads, or ``None`` for a single read.

    Returns:
        A list with one entry per read:
        ``[pos, cigar, read_string, qual_string]`` where ``pos`` is the read
        start minus ``self.adj[ploid][read_start]``.

    Raises:
        SystemExit: with status 1 (after printing a diagnostic) when an
            error's reference allele does not match the read data.

    NOTE(review): another ``sample_read`` with the same signature appears
    earlier in this file.  If both belong to the same class, this later
    definition is the one Python keeps -- confirm which is intended.
    """
    # Local import: sys.exit is always available, whereas the bare ``exit``
    # helper is only injected by the ``site`` module.
    import sys

    # choose a ploid
    myPloid = random.randint(0, self.ploidy - 1)
    seqLen = len(self.sequences[myPloid])

    # choose a random position within the ploid, and generate quality scores /
    # sequencing errors
    readsToSample = []
    if fragLen is None:
        # decide which subsection of the sequence to sample from using
        # coverage probabilities
        while True:
            myBucket = max([self.which_bucket.sample() - self.win_per_read, 0])
            coords_to_select_from = [myBucket * self.windowSize,
                                     (myBucket + 1) * self.windowSize]
            coords_to_select_from[0] += self.adj[myPloid][coords_to_select_from[0]]
            coords_to_select_from[1] += self.adj[myPloid][coords_to_select_from[1]]
            if coords_to_select_from[1] < seqLen - self.readLen:
                break
        rPos = random.randint(coords_to_select_from[0], coords_to_select_from[1] - 1)

        # sample read position and call function to compute quality scores /
        # sequencing errors
        rDat = self.sequences[myPloid][rPos:rPos + self.readLen]
        (myQual, myErrors) = sequencingModel.getSequencingErrors(rDat)
        readsToSample.append([rPos, myQual, myErrors, rDat])
    else:
        # decide which subsection of the sequence to sample from using
        # coverage probabilities
        while True:
            myBucket = max([self.which_bucket.sample() - self.win_per_read, 0])
            coords_to_select_from = [myBucket * self.windowSize,
                                     (myBucket + 1) * self.windowSize]
            coords_to_select_from[0] += self.adj[myPloid][coords_to_select_from[0]]
            # both ends use index of starting position to avoid issues with
            # reads spanning breakpoints of large events
            coords_to_select_from[1] += self.adj[myPloid][coords_to_select_from[0]]
            rPos1 = random.randint(coords_to_select_from[0], coords_to_select_from[1] - 1)
            # for PE-reads, flip a coin to decide if R1 or R2 will be the
            # "covering" read
            if random.randint(1, 2) == 1 and rPos1 > fragLen - self.readLen:
                rPos1 -= fragLen - self.readLen
            if rPos1 < seqLen - fragLen:
                break
        rPos2 = rPos1 + fragLen - self.readLen
        rDat1 = self.sequences[myPloid][rPos1:rPos1 + self.readLen]
        rDat2 = self.sequences[myPloid][rPos2:rPos2 + self.readLen]
        (myQual1, myErrors1) = sequencingModel.getSequencingErrors(rDat1)
        (myQual2, myErrors2) = sequencingModel.getSequencingErrors(rDat2, isReverseStrand=True)
        readsToSample.append([rPos1, myQual1, myErrors1, rDat1])
        readsToSample.append([rPos2, myQual2, myErrors2, rDat2])

    # error format:
    # myError[i] = (type, len, pos, ref, alt)

    # examine sequencing errors to-be-inserted.
    #   - skip indels when there is not enough bordering sequence to "fill in"
    # if error is valid, make the changes to the read data
    rOut = []
    for r in readsToSample:
        myCigar = self.allCigar[myPloid][r[0]]
        totalD = sum([error[1] for error in r[2] if error[0] == 'D'])
        totalI = sum([error[1] for error in r[2] if error[0] == 'I'])
        availB = len(self.sequences[myPloid]) - r[0] - self.readLen - 1
        # add buffer sequence to fill in positions that get deleted
        r[3] += self.sequences[myPloid][r[0] + self.readLen:r[0] + self.readLen + totalD]

        expandedCigar = []
        # Fill symbols taken from beyond the read end.  Initialised once per
        # read (not once per error) so the value computed for the first indel
        # is still there when the CIGAR is rebuilt after several indels.
        extraCigarVal = []
        adj = 0                       # net length change from indels applied so far
        sse_adj = [0] * self.readLen  # per-position shift used by substitutions
        anyIndelErr = False

        # sort by letter (D > I > S) such that we introduce all indel errors
        # before substitution errors; secondarily, sort by index
        arrangedErrors = {'D': [], 'I': [], 'S': []}
        for error in r[2]:
            arrangedErrors[error[0]].append((error[2], error))
        sortedErrors = []
        for k in sorted(arrangedErrors.keys()):
            sortedErrors.extend([n[1] for n in sorted(arrangedErrors[k])])

        for error in sortedErrors:
            eLen = error[1]
            ePos = error[2]

            if error[0] == 'S':
                # substitution errors, much easier by comparison...
                target = ePos + sse_adj[ePos]
                if chr(r[3][target]) != error[3]:
                    print('\nError, ref does not match alt while attempting to insert substitution error!\n')
                    sys.exit(1)
                r[3][target] = error[4]
                continue

            # ---- indel error ('D' or 'I') ----
            anyIndelErr = True
            if totalD > availB:
                # not enough bases to fill-in deletions: skip all indel errors
                continue
            if not expandedCigar:
                expandedCigar = CigarString(stringIn=myCigar).getList()
                fillToGo = totalD - totalI
                if fillToGo > 0:
                    extraCigarVal = CigarString(
                        stringIn=self.allCigar[myPloid][r[0] + fillToGo]
                    ).getList()[-fillToGo:]

            if error[0] == 'D':
                # insert deletion error into read and update cigar accordingly
                pi = ePos + adj
                pf = ePos + adj + eLen + 1
                if str(r[3][pi:pf]) != str(error[3]):
                    print('\nError, ref does not match alt while attempting to insert deletion error!\n')
                    sys.exit(1)
                r[3] = r[3][:pi + 1] + r[3][pf:]
                expandedCigar = expandedCigar[:pi + 1] + expandedCigar[pf:]
                expandedCigar[pi + 1] = 'D' * eLen + expandedCigar[pi + 1]
                adj -= eLen
                for i in xrange(ePos, len(sse_adj)):
                    sse_adj[i] -= eLen
            else:
                # insert insertion error into read and update cigar accordingly
                target = ePos + adj
                if chr(r[3][target]) != error[3]:
                    print('\nError, ref does not match alt while attempting to insert insertion error!\n')
                    sys.exit(1)
                r[3] = r[3][:target] + error[4] + r[3][target + 1:]
                # NOTE(review): this removes one cigar symbol while adding
                # eLen 'I's; the earlier sample_read in this file keeps that
                # symbol (slices from target, not target + 1) -- confirm the
                # cigar and read lengths stay in step here.
                expandedCigar = (expandedCigar[:target] + ['I'] * eLen +
                                 expandedCigar[target + 1:])
                adj += eLen
                for i in xrange(ePos, len(sse_adj)):
                    sse_adj[i] += eLen

        if anyIndelErr:
            if len(expandedCigar):
                relevantCigar = (expandedCigar + extraCigarVal)[:self.readLen]
                myCigar = CigarString(listIn=relevantCigar).getString()
            # drop whatever is left of the fill-in buffer
            r[3] = r[3][:self.readLen]

        rOut.append([r[0] - self.adj[myPloid][r[0]], myCigar, str(r[3]), str(r[1])])

    # rOut[i] = (pos, cigar, read_string, qual_string)
    return rOut