def clip_test(): filename = programName + ".wav" print >>stderr, "writing audio output to %s" % filename output = WavOut(filename=filename,channels=2) clip = Clip(source=os.path.join(clipPath,clipFilename)) clip.gain = gain clip >> output rate = 2.0 numSteps = 10 for ix in xrange(numSteps): clip.rate = rate duration = clip.duration() print "playing at rate %s for %s msec" % (rate,duration/zook.msec) clip.trigger() yield duration rate /= 2.0**(1.0/numSteps) for rate in [1.0,-1.0,2.0,-2.0,0.5,-0.5]: clip.rate = rate duration = clip.duration() * 1.10 print "playing at rate %s for %s msec" % (rate,duration/zook.msec) clip.trigger() yield duration for ix in xrange(10): rate = (2*randint(0,1)-1) * (.8+.4*unit_random()) clip.rate = rate print "playing at rate %s for %s msec" % (rate,250) clip.trigger() yield 250*zook.msec output.close()
def apply_errors(profile,seq,catalog):
    """Apply random mismatch/insertion/deletion errors to a sequence.

    profile: dict with per-base probabilities under keys "mm" (mismatch),
             "i" (insertion) and "d" (deletion).
    seq:     the sequence to corrupt (indexable, one nucleotide per position).
    catalog: None, or a list of entries with .start and .end attributes that
             are positions in seq; intervals are assumed not to overlap (they
             may abut).

    Returns (newSeq,newCatalog,events):
      newSeq     -- the corrupted sequence, as a string
      newCatalog -- a deep copy of catalog with start/end mapped into newSeq
                    coordinates (None if catalog is None); the caller's
                    catalog is left untouched
      events     -- dict mapping catalog index to (m,mm,i,d), the counts of
                    matches, mismatches, insertions and deletions inside that
                    interval (None if catalog is None)

    Exactly one unit_random() draw is made per input position, so PRNG
    consumption does not depend on the catalog.
    """
    pMm = profile["mm"]
    pI = profile["i"]
    pD = profile["d"]

    # cumulative thresholds for the single draw made at each position
    mismatchBelow = pMm
    insertBelow = pMm + pI
    deleteBelow = pMm + pI + pD

    if (catalog is None):
        newCatalog = None
        events = None
    else:
        newCatalog = deepcopy(catalog)
        startToIx = {}
        endToIx = {}
        # index the *copy*, and clear its coordinates so that an interval we
        # fail to remap shows up as None rather than as a stale position
        for (catIx,c) in enumerate(newCatalog):
            startToIx[c.start] = catIx
            endToIx[c.end] = catIx
            c.start = c.end = None
        events = {}

    newSeq = []
    newPos = 0
    m = None        # None means "not currently inside a catalog interval"
    for pos in xrange(len(seq)+1):
        if (newCatalog is not None):
            # ends are handled before starts so that abutting intervals close
            # the left interval before opening the right one
            if (pos in endToIx):
                catIx = endToIx[pos]
                newCatalog[catIx].end = newPos
                events[catIx] = (m,mm,i,d)
                m = None
            if (pos in startToIx):
                catIx = startToIx[pos]
                newCatalog[catIx].start = newPos
                m = mm = i = d = 0

        # the extra iteration at len(seq) exists only to close an interval
        # that ends at the end of the sequence
        if (pos == len(seq)):
            break

        nuc = seq[pos]
        r = unit_random()
        if (r < mismatchBelow):
            newSeq += [choice(mismatchLookup[nuc])]
            newPos += 1
            if (m is not None): mm += 1
        elif (r < insertBelow):
            # insertion: a random base is placed before the original one
            newSeq += [choice("ACGT")+nuc]
            newPos += 2
            if (m is not None): i += 1
        elif (r < deleteBelow):
            if (m is not None): d += 1
        else:
            newSeq += [nuc]
            newPos += 1
            if (m is not None): m += 1

    return ("".join(newSeq),newCatalog,events)
def choice(self,count=None,randVal=None):
    """Draw one symbol, or a list of symbols, from self.table.

    Each table entry is a tuple (p,sym1,sym2).  A value in the unit interval
    is scaled by the table length; its integer part selects the entry, and
    sym1 is chosen if the scaled value is below that entry's p, otherwise
    sym2.

    count:   None to return a single symbol; otherwise the number of symbols
             to return, as a list (must not be negative).
    randVal: optional unit-interval value to use instead of unit_random();
             only allowed when count is None.

    Raises ValueError if randVal is given together with count, or if count is
    negative.
    """
    def pick(unitVal):
        scaled = unitVal * len(self.table)
        (p,sym1,sym2) = self.table[int(scaled)]
        return sym1 if (scaled < p) else sym2

    # single-symbol form
    if (count == None):
        if (randVal == None):
            randVal = unit_random()
        return pick(randVal)

    # list form; a caller-supplied random value makes no sense here
    if (randVal != None) or (count < 0):
        raise ValueError
    return [pick(unit_random()) for _ in xrange(count)]
def generate(self):
    """Generate an error indicator sequence with a fixed number of errors.

    Produces a list of self.ntSequenceLength values, each 0 (no error) or 1
    (error), containing exactly round(self.pSubstitution *
    self.ntSequenceLength) ones.  Positions are visited once; each becomes an
    error with probability (errors still needed)/(positions still left), so
    the required count is always reached.

    The model parameters are read from the instance, as the sibling
    generate() methods in this file do.

    The result is stored in self.errorSeq and also returned.
    """
    self.errorSeq = []
    remainingErrors = round(self.pSubstitution * self.ntSequenceLength)
    for remainingSeqLen in range(self.ntSequenceLength, 0, -1):
        # unit_random() is below 1, so when every remaining position must be
        # an error this test is always false; when no errors remain it is
        # always true
        if (remainingSeqLen * unit_random() >= remainingErrors):
            self.errorSeq.append(0)
        else:
            self.errorSeq.append(1)
            remainingErrors -= 1
    return self.errorSeq
def generate(self):
    """Generate and return a mutated copy of self.seq.

    One unit_random() draw is made for every position of self.seq, in order;
    a position is flagged when its draw is below self.pSubstitution.  Flagged
    positions whose nucleotide is a key of ntToMutations are then passed to
    self.apply_errors(), and its result is stored in self.mutatedSeq and
    returned.
    """
    # draw for every position first, so PRNG use doesn't depend on content
    flagged = [(unit_random() < self.pSubstitution)
               for _ in range(len(self.seq))]

    errorPositions = []
    for (pos, isError) in enumerate(flagged):
        if isError and (self.seq[pos] in ntToMutations):
            errorPositions.append(pos)

    self.mutatedSeq = self.apply_errors(errorPositions)
    return self.mutatedSeq
def random_cigar(readLen, openProb, extendProb):
    """Build a random list of cigar operations covering readLen read bases.

    readLen:    number of read bases the cigar must account for.
    openProb:   probability of an indel on the first pass of the loop; None
                is treated as 0 (no indels).
    extendProb: parameter handed to geometric_distribution_func() to build
                the insertion and deletion run-length samplers.

    Returns (seqNeeded,cigar).  cigar is a list of (op,length) pairs with op
    in "M", "I", "D"; seqNeeded is readLen minus the inserted bases plus the
    deleted bases.

    The run-length samplers are cached in module globals and rebuilt only
    when openProb or extendProb differs from the previous call.
    """
    global prevOpenProb, prevExtendProb
    global matchLenFunc, insertionLenFunc, deletionLenFunc

    if (openProb == None):
        openProb = 0

    # degenerate cases need no random draws
    if (readLen <= 0):
        return (readLen, [])
    if (openProb == 0):
        return (readLen, [("M", readLen)])

    if (openProb != prevOpenProb):
        prevOpenProb = openProb
        matchLenFunc = geometric_distribution_func(1 - openProb)

    if (extendProb != prevExtendProb):
        prevExtendProb = extendProb
        insertionLenFunc = geometric_distribution_func(extendProb)
        deletionLenFunc = geometric_distribution_func(extendProb)

    ops = []
    readBasesLeft = readLen
    refBases = readLen
    while (readBasesLeft > 0):
        # once any read bases have been consumed, every pass begins with an
        # indel; before that an indel only occurs with probability openProb
        wantIndel = (readBasesLeft < readLen) or (unit_random() < openProb)
        if wantIndel:
            if (unit_random() < 0.5):
                runLen = min(readBasesLeft, insertionLenFunc())
                ops.append(("I", runLen))
                readBasesLeft -= runLen
                refBases -= runLen
            elif (len(ops) > 0):
                # a deletion chosen while the cigar is still empty is simply
                # not recorded
                runLen = deletionLenFunc()
                ops.append(("D", runLen))
                refBases += runLen

        if (readBasesLeft > 0):
            runLen = min(readBasesLeft, matchLenFunc())
            ops.append(("M", runLen))
            readBasesLeft -= runLen

    return (refBases, ops)
def random_subs(seq, prob):
    """Return seq with random substitutions applied.

    Every position is independently selected with probability prob (one
    unit_random() draw per position, made before any substitution is chosen).
    A selected position is replaced by random_choice() over
    nucToSubstitutions[nuc]; selected positions whose nucleotide is not a key
    of nucToSubstitutions are left unchanged.

    If no position is selected, seq itself is returned; otherwise a new
    string is returned.
    """
    errors = [ix for ix in xrange(len(seq)) if (unit_random() < prob)]
    if (not errors):
        return seq

    seq = list(seq)
    for ix in errors:
        nuc = seq[ix]
        if (nuc in nucToSubstitutions):
            seq[ix] = random_choice(nucToSubstitutions[nuc])
    return "".join(seq)
def my_random(u=None, v=None):
    """Multi-purpose random value, selected by the argument types.

      my_random()             --> real value from unit_random()
      my_random(int u)        --> integer value in 1..u
      my_random(int u, int v) --> integer value in u..v
      my_random(str u)        --> char value in u
      my_random(str u, str v) --> str value u or v
      my_random(list u)       --> choice from u
      my_random(tuple u)      --> choice from u
    """
    # no arguments
    if (u == None):
        return unit_random()

    # one argument: sequences are sampled, anything else is an upper bound
    if (v == None):
        if (type(u) in (str, list, tuple)):
            return choice(u)
        return randint(1, u)

    # two arguments: two strings are alternatives, otherwise an integer range
    if (type(u) == str) and (type(v) == str):
        return choice([u, v])
    return randint(u, v)
def main(): # parse the command line arraysFilename = None motifs = [] sequenceName = None sequenceLen = 0 numRepeats = None genNeighbors = 0.0 genMixture = 0.0 lengthsFilename = None minFill = None errorProfile = None catalogFilename = None wrapLength = 100 for arg in argv[1:]: if ("=" in arg): argVal = arg.split("=",1)[1] if (arg.startswith("--arrays=")): arraysFilename = argVal elif (arg.startswith("--name=")): sequenceName = argVal elif (arg.startswith("--length=")) or (arg.startswith("--len=")) or (arg.startswith("L=")): if (argVal.endswith("%")): sequenceLen = float(argVal[:-1]) / 100.0 assert (sequenceLen >= 1.0) sequenceLen = ("%",sequenceLen) elif (argVal.startswith("+")): sequenceLen = int_with_unit(argVal[1:]) assert (sequenceLen >= 0) sequenceLen = ("+",sequenceLen) else: sequenceLen = int_with_unit(argVal) assert (sequenceLen >= 0) elif (arg.startswith("--repeats=")) or (arg.startswith("N=")): numRepeats = int_with_unit(argVal) assert (numRepeats > 0) elif (arg.startswith("--motif:neighbor=")): genNeighbors = parse_probability(argVal) elif (arg.startswith("--motif:mixture=")): genMixture = parse_probability(argVal) elif (arg.startswith("--lengths=")): lengthsFilename = argVal elif (arg.startswith("--minfill=")) or (arg.startswith("F=")): minFill = int(argVal) if (minFill < 0): print >>stderr, "WARNING: \"%s\" interpreted as no minimum fill" % argVal minFill = None if (minFill == 0): minFill = None elif (arg.startswith("--errors=")): errorProfile = None if (argVal in ["pacbio","pacbio.v3","pacbio.GIAB","pacbio.giab"]): errorProfile = errorProfilePacbioV3 elif (argVal == "pacbio.v2"): # for historical reasons, v2 is an alias for v3 errorProfile = errorProfilePacbioV3 elif (argVal in ["pacbio.v1","pacbio.Guiblet","pacbio.guiblet"]): errorProfile = errorProfilePacbioV1 elif (argVal in ["pacbio.readsim"]): errorProfile = errorProfilePacbioReadsim elif (argVal in ["nanopore","nanopore.v3","nanopore.GIAB","nanopore.giab"]): errorProfile = 
errorProfileNanoporeV3 elif (argVal == "nanopore.v2"): # for historical reasons, v2 is an alias for v3 errorProfile = errorProfileNanoporeV3 elif (argVal in ["nanopore.v1","nanopore.Jain","nanopore.jain"]): errorProfile = errorProfileNanoporeV1 elif (argVal in ["nanopore.readsim"]): errorProfile = errorProfileNanoporeReadSim elif (":" in argVal): try: errorProfile = parse_error_spec(argVal) except ValueError: pass else: p = parse_probability(argVal) errorProfile = {"mm":p, "i":p, "d":p } if (errorProfile == None): usage("\"%s\" is not a valid error spec" % argVal) subProb = errorProfile["mm"] insOpenProb = errorProfile["i"] delOpenProb = errorProfile["d"] insExtendProb = delExtendProb = 0.0 elif (arg.startswith("--catalog=")): catalogFilename = argVal elif (arg.startswith("--wrap=")): wrapLength = int(argVal) if (wrapLength <= 0): wrapLength = None elif (arg.startswith("--seed=")): # nota bene: if the seed is a number, use it as a number, since # string seeds can produce different sequences on # different versions/builds of python seed = argVal try: seed = int(seed) except ValueError: try: seed = float(seed) except ValueError: pass random_seed(seed) elif (arg.startswith("--")): usage("unrecognized option: %s" % arg) elif (is_nucleotide_string(arg)): motifs += [arg.upper()] else: usage("unrecognized option: %s" % arg) if (arraysFilename != None): if (motifs != []): usage("command line <motif>s cannot be used with --arrays") if (numRepeats != None): usage("--repeats cannot be used with --arrays") if (lengthsFilename != None): usage("--lengths cannot be used with --arrays") if (genNeighbors != 0.0): usage("--motif:neighbor cannot be used with --arrays") if (genMixture != 0.0): usage("--motif:mixture cannot be used with --arrays") elif (motifs == []): usage("you have to give me at least one motif") if (numRepeats == None) and (arraysFilename != None): numRepeats = 1 # read the arrays file, if we have one repeatLengths = {} haveSpecificArrays = False if (arraysFilename 
!= None): haveSpecificArrays = True f = file(arraysFilename,"rt") numRepeats = 0 for (length,motif,_) in read_arrays(f,arraysFilename): numRepeats += 1 if (motif not in repeatLengths): motifs += [(motif)] repeatLengths[motif] = [length] else: repeatLengths[motif] += [length] f.close() if (motifs == []): usage("array file \"%s\" contains no arrays" % arraysFilename) # read the lengths file if (repeatLengths == {}): if (lengthsFilename == None): lengths = read_integers(stdin) for motif in motifs: repeatLengths[motif] = lengths elif ("{motif}" not in lengthsFilename): f = file(lengthsFilename,"rt") lengths = read_integers(f,lengthsFilename) f.close() for motif in motifs: repeatLengths[motif] = lengths else: for motif in motifs: motifLengthsFilename = lengthsFilename.replace("{motif}",motif) f = file(motifLengthsFilename,"rt") lengths = read_integers(f,motifLengthsFilename) f.close() repeatLengths[motif] = lengths # generate the number and type of motifs we'll embed # # note: to satisfy the requirement that the same seed generates the same # pre-error sequence, we should have no variance in the use of the # PRNG until after we've generated that sequence; see "point A" below embeddings = [] if (haveSpecificArrays): for motif in motifs: for length in repeatLengths[motif]: strand = choice(["+","-"]) offset = choice(xrange(len(motif))) embeddings += [(1.0,motif,motif,strand,offset,length)] shuffle(embeddings) else: for _ in xrange(numRepeats): motif = choice(motifs) length = choice(repeatLengths[motif]) u = unit_random() if (genNeighbors > 0) and (u < genNeighbors): motif = motif_neighbor(motif) (mix,motif2) = (1.0,motif) elif (genMixture > 0) and (u < genNeighbors+genMixture): (mix,motif2) = (0.5,motif_neighbor(motif)) else: (mix,motif2) = (1.0,motif) strand = choice(["+","-"]) offset = choice(xrange(len(motif))) embeddings += [(mix,motif,motif2,strand,offset,length)] totalRepeatBp = sum([length for (_,_,_,_,_,length) in embeddings]) # assign each repeat a position within 
the "fill" sequence; note that we # might have more than one repeat assigned to the same position, in which # case they will be back-to-back with no fill between them if (type(sequenceLen) == tuple): (op,sequenceLen) = sequenceLen if (op == "%"): sequenceLen = int(round(totalRepeatBp*sequenceLen)) else: # if (op == "+"): sequenceLen = totalRepeatBp + sequenceLen if (totalRepeatBp > sequenceLen): fillBp = 0 if (sequenceLen > 0): print >>stderr, "WARNING: length of embedded repeats (%d) exceeds specified" % totalRepeatBp print >>stderr, " sequence length (%d); there will be no fill DNA" % sequenceLen elif (minFill != None): fillBp = sequenceLen - totalRepeatBp totalMinFill = (numRepeats+1) * minFill if (totalMinFill > fillBp): print >>stderr, "WARNING: minimum fill of %d cannot be achieved" % minFill print >>stderr, " total minimum fill (%d) exceeds total fill (%d)" % (totalMinFill,fillBp) minFill = fillBp / (numRepeats+1) fillBp -= minFill * (numRepeats+1) else: fillBp = sequenceLen - totalRepeatBp fillPositions = [randint(0,fillBp) for _ in xrange(numRepeats)] fillPositions.sort() if (minFill != None): fillBp += minFill * (numRepeats+1) for rptNum in xrange(numRepeats): fillPositions[rptNum] += (rptNum+1) * minFill # generate the sequence catalog = None if (catalogFilename != None): catalog = [] fillSeq = str(EchyDna(fillBp)) seq = [] seqPos = 0 prevEnd = 0 fillPos = 0 for (ix,pos) in enumerate(fillPositions): if (fillPos < pos): seq += [fillSeq[fillPos:pos]] seqPos += pos - fillPos fillPos = pos (mix,motif,motif2,strand,offset,length) = embeddings[ix] if (catalog != None): c = CatalogEntry() c.start = seqPos c.end = seqPos+length c.mix = mix c.motif = motif c.motif2 = motif2 c.strand = strand c.repeatLength = length c.offset = offset catalog += [c] enoughCopies = (length+offset+len(motif)-1) / len(motif) if (strand == "-"): motif = reverse_complement(motif) if (mix >= 1.0): repeat = motif * enoughCopies else: repeat = [] for _ in xrange(enoughCopies): if 
(unit_random() < mix): repeat += [motif] else: repeat += [motif2] repeat = "".join(repeat) seq += repeat[offset:offset+length] seqPos += length prevEnd = seqPos if (fillPos < fillBp): seq += [fillSeq[fillPos:fillBp]] seq = "".join(seq) #=== point A: it's now safe to make additional use of the PRNG === # apply error profile events = profile = None if (argVal in ["pacbio","pacbio.v3","pacbio.GIAB","pacbio.giab"]): errorProfile = errorProfilePacbioV3 elif (argVal == "pacbio.v2"): # for historical reasons, v2 is an alias for v3 errorProfile = errorProfilePacbioV3 elif (argVal in ["pacbio.v1","pacbio.Guiblet","pacbio.guiblet"]): errorProfile = errorProfilePacbioV1 elif (argVal in ["pacbio.readsim"]): errorProfile = errorProfilePacbioReadsim elif (argVal in ["nanopore","nanopore.v3","nanopore.GIAB","nanopore.giab"]): errorProfile = errorProfileNanoporeV3 elif (argVal == "nanopore.v2"): # for historical reasons, v2 is an alias for v3 errorProfile = errorProfileNanoporeV3 elif (argVal in ["nanopore.v1","nanopore.Jain","nanopore.jain"]): errorProfile = errorProfileNanoporeV1 elif (argVal in ["nanopore.readsim"]): errorProfile = errorProfileNanoporeReadSim elif (type(errorProfile) == float): eRate = errorProfile / 3.0; profile = {"mm":eRate, "i":eRate, "d":eRate } elif (type(errorProfile) == dict): profile = dict(errorProfile) if (profile != None): print >>stderr, "(applying error profile mm=%.2f%% i=%.2f%% d=%.2f%%)" \ % (100*profile["mm"],100*profile["i"],100*profile["d"]) (seq,catalog,events) = apply_errors(profile,seq,catalog) # write the sequence if (sequenceName != None): print ">%s" % sequenceName if (wrapLength == None): print seq else: for i in range(0,len(seq),wrapLength): print seq[i:i+wrapLength] # write the catalog if (catalogFilename != None): catalogF = file(catalogFilename,"wt") if (sequenceName in [None,""]): seqNameForCatalog = "seq" else: seqNameForCatalog = sequenceName if (events == None): print >>catalogF, "#%s\t%s\t%s\t%s\t%s\t%s\t%s" \ % 
("chrom","start","end","motif","rptLen","len","fill") else: print >>catalogF, "#%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" \ % ("chrom","start","end","motif","rptLen","len","fill", "mRatio","m","mm","i","d") prevEnd = 0 for (catIx,c) in enumerate(catalog): motifStr = c.motif if (c.mix < 1.0): motifStr += "," + c.motif2 motifStr += ".%s%s" % (c.offset,c.strand) if (events == None): print >>catalogF, "%s\t%s\t%s\t%s\t%s\t%s\t%s" \ % (seqNameForCatalog,c.start,c.end,motifStr, c.repeatLength,c.end-c.start,c.start-prevEnd) else: if (catIx in events): (m,mm,i,d) = events[catIx] mRatio = "%.1f%%" % (100.0*m/(m+mm+i+d)) else: mRatio = m = mm = i = d = "NA" print >>catalogF, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" \ % (seqNameForCatalog,c.start,c.end,motifStr, c.repeatLength,c.end-c.start,c.start-prevEnd, mRatio,m,mm,i,d) prevEnd = c.end catalogF.close()
def main(): assert (len(argv) == 1), "give me no arguments" numTrials = 1000 random_seed("acorn") explainFailure = False path = "kmer_histograms" #sampleId = "mixedB" #defaultParams = {"zp.copy.y" : 3.000, # "zp.copy.hom" : 3.000, # "zp.copy.het" : 3.000, # "p.e" : 0.942, # "shape.e" : 3.000, # "scale.e" : 1.000, # "p.y" : 0.900, # "u.y" : 64.000, # "sd.y" : 14.826, # "shape.y" : 0.000, # "p.hom" : 0.800, # "u.hom" : 5.120, # "sd.hom" : 1.186, # "var.het" : 1.407} #goodParams = {"zp.copy.y" : 2.042, # "zp.copy.hom" : 3.157, # "zp.copy.het" : 17.795, # "p.e" : 0.935, # "shape.e" : 0.096, # "scale.e" : 0.465, # "p.y" : 0.621, # "u.y" : 68.084, # "sd.y" : 8.626, # "shape.y" : 0.057, # "p.hom" : 0.853, # "u.hom" : 11.101, # "sd.hom" : 3.600, # "var.het" : 10.916} sampleId = "apple_E12_L150_D80_K25" defaultParams = {"zp.copy.y" : 3.000, "zp.copy.hom" : 3.000, "zp.copy.het" : 3.000, "p.e" : 0.940, "shape.e" : 3.000, "scale.e" : 1.000, "p.y" : 0.900, "u.y" : 62.000, "sd.y" : 16.309, "shape.y" : 0.000, "p.hom" : 0.800, "u.hom" : 4.960, "sd.hom" : 1.305, "var.het" : 1.702} goodParams = {"zp.copy.y" : 2.047, "zp.copy.hom" : 3.390, "zp.copy.het" : 1.137, "p.e" : 0.937, "shape.e" : 0.114, "scale.e" : 0.452, "p.y" : 0.630, "u.y" : 65.974, "sd.y" : 8.666, "shape.y" : 0.228, "p.hom" : 0.818, "u.hom" : 13.622, "sd.hom" : 4.086, "var.het" : 15.274} fitter = EnrichedHapDipFitter(path+"/"+sampleId+".mixed.kmer_dist") paramNames = fitter.paramNames convergenceCount = 0 for trialNumber in xrange(numTrials): print "=== trial %d of %d ===" \ % (1+trialNumber,numTrials) # choose initial params as a random point in hypercube between "good" # and "bad" initParams = dict(goodParams) norm2Init = 0.0 for (paramIx,name) in enumerate(paramNames): step = unit_random() initParams[name] += step*(defaultParams[name]-goodParams[name]) norm2Init += step*step normInit = sqrt(norm2Init) / len(paramNames) fitter.set_params(initParams) fitParams = fitter.fit() if (fitParams == None): print 
params_to_text(paramNames,initParams,prefix="init-[%d]:" % trialNumber) print "normInit: %.8f" % normInit print "(failure or non-convergence)" if (explainFailure): print "... return code ..." print fitter.retCode print "... stdout ..." print fitter.stdout print "... stderr ..." print fitter.stderr continue print params_to_text(paramNames,initParams,fitParams, prefix="init+[%d]:" % trialNumber, prefix2="cvrg[%d]:" % trialNumber) fitParams = params_to_float(fitParams) dGood = vector_distance(fitParams,goodParams) print "normInit: %.8f" % normInit print "dGood: %.8f" % dGood convergenceCount += 1 print "%d of %d trials converged" % (convergenceCount,numTrials)
def generate_read(readLength,errorRate):
    """Return a string of readLength characters marking simulated errors.

    Each position is "x" if its unit_random() draw is below errorRate, and
    "-" otherwise; one draw is made per position, in order.
    """
    return "".join(["x" if (unit_random() < errorRate) else "-"
                    for _ in xrange(readLength)])
def generate(self):
    """Generate an error indicator sequence with independent errors.

    Builds a list of self.ntSequenceLength values; each is 1 if its
    unit_random() draw is below self.pSubstitution and 0 otherwise.  The list
    is stored in self.errorSeq and also returned.
    """
    indicators = []
    for _ in range(self.ntSequenceLength):
        if (unit_random() < self.pSubstitution):
            indicators.append(1)
        else:
            indicators.append(0)
    self.errorSeq = indicators
    return self.errorSeq
def main(): assert (len(argv) == 3), "need the sampleID and number of trials, and nothing else" sampleId = argv[1] numTrials = int(argv[2]) random_seed("acorn") explainFailure = False path = "kmer_histograms" # ask the curve fitter what the default paramters are fitter = EnrichedHapDipFitter(path+"/"+sampleId+".mixed.kmer_dist") paramNames = fitter.paramNames defaultParams = fitter.default_params() if (defaultParams == None): print "(failed to get default params)" if (explainFailure): print "... return code ..." print hdFitter.retCode print "... stdout ..." print hdFitter.stdout print "... stderr ..." print hdFitter.stderr assert (False) defaultParams = params_to_float(defaultParams) # read the "good" parameters (usually produced by explore3_hap_dip) fitFilename = path+"/"+sampleId+".mixed.fit" f = file(fitFilename,"rt") goodParams = params_from_text([line for line in f]) f.close() for name in defaultParams: assert (name in goodParams), \ "parameter \"%s\" missing from %s" % (name,fitFilename) for name in goodParams: assert (name in defaultParams), \ "extra parameter \"%s\" in %s" % (name,fitFilename) goodParams = params_to_float(goodParams) print params_to_text(paramNames,goodParams,defaultParams, prefix="good:",prefix2="dflt:") # run the convergence trials convergenceCount = 0 for trialNumber in xrange(numTrials): print "=== trial %d of %d ===" \ % (1+trialNumber,numTrials) # choose initial params as a random point in hypercube between "good" # and "bad" initParams = dict(goodParams) norm2Init = 0.0 for (paramIx,name) in enumerate(paramNames): step = unit_random() initParams[name] += step*(defaultParams[name]-goodParams[name]) norm2Init += step*step normInit = sqrt(norm2Init) / len(paramNames) fitter.set_params(initParams) fitParams = fitter.fit() if (fitParams == None): print params_to_text(paramNames,initParams,prefix="init-[%d]:" % trialNumber) print "normInit: %.8f" % normInit print "(failure or non-convergence)" if (explainFailure): print "... 
return code ..." print fitter.retCode print "... stdout ..." print fitter.stdout print "... stderr ..." print fitter.stderr continue print params_to_text(paramNames,initParams,fitParams, prefix="init+[%d]:" % trialNumber, prefix2="cvrg[%d]:" % trialNumber) fitParams = params_to_float(fitParams) dGood = vector_distance(fitParams,goodParams) print "normInit: %.8f" % normInit print "dGood: %.8f" % dGood convergenceCount += 1 print "%d of %d trials converged" % (convergenceCount,numTrials)
def main():
    """Simulate substitution errors on one sequence and report k-mer stats.

    Reads a single fasta sequence from stdin, generates numSequences mutated
    copies of it with the selected noise model, and for each copy collects
    the number of errors, the number of mutated k-mers, distinct k-mer set
    sizes, and (optionally) sketch intersections.  Per-trial rows go to
    stdout; summary statistics go to the --stats file, or to stderr if none
    was given.  Mutated sequences are optionally written as fasta.

    Sets the module globals reportProgress, debug and hasherFmt.
    """
    global reportProgress, debug, hasherFmt

    # parse the command line

    kmerSize = 28
    sketchSizes = None
    numSequences = None
    noiseKind = None
    pSubstitution = None
    sequenceType = "linear"
    sortBy = "nMutated"
    statsFilename = None
    mutatedFilename = None
    mutateOnly = False
    prngSeed = None
    hashSeed = None
    hashBits = None
    reportProgress = None
    debug = []

    statsOfInterest = [
        "name", "r1", "k", "L", "trials", "q",
        "Mean[|A|].obs", "Mean[|B|].obs",
        "Mean[|A^B|].obs", "Mean[|AuB|].obs",
        "Mean[nMut.A,B].obs", "Mean[L.A,B].obs",
        "Mean[r1est.A,B].obs"
    ]

    for arg in argv[1:]:
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg in ["--help", "-help", "--h", "-h"]):
            usage()
        elif (arg.startswith("--kmer=")) or (arg.startswith("K=")):
            kmerSize = int(argVal)
        elif (arg.startswith("--sketch=")) or (arg.startswith("S=")):
            if (sketchSizes == None): sketchSizes = []
            sketchSizes += map(int_with_unit, argVal.split(","))
        elif (arg.startswith("--sequences=")) or (arg.startswith("T=")):
            numSequences = int_with_unit(argVal)
        elif (arg.startswith("--poisson=")) or (
                arg.startswith("--noise=")) or (arg.startswith("P=")):
            noiseKind = "poisson"
            pSubstitution = parse_probability(argVal)
        elif (arg.startswith("--bernoulli=")) or (
                arg.startswith("--error=")) or (arg.startswith("B=")) or (
                arg.startswith("E=")):
            noiseKind = "bernoulli"
            pSubstitution = parse_probability(argVal)
        elif (arg == "--linear"):
            sequenceType = "linear"
        elif (arg == "--circular"):
            sequenceType = "circular"
        elif (arg == "--nosort"):
            sortBy = None
        elif (arg.startswith("--stats=")):
            statsFilename = argVal
        elif (arg.startswith("--mutated=")):
            mutatedFilename = argVal
        elif (arg == "--mutateonly"):
            mutateOnly = True
        elif (arg.startswith("--seed=")):
            prngSeed = argVal
        elif (arg in ["--hashbits=none", "--hash=none"]):
            hashBits = None
        elif (arg.startswith("--hash=")) or (arg.startswith("--hashseed=")):
            hashSeed = int(argVal)
        elif (arg.startswith("--hashbits=")):
            hashBits = int(argVal)
        elif (arg.startswith("--progress=")):
            reportProgress = int_with_unit(argVal)
        elif (arg == "--debug"):
            debug += ["debug"]
        elif (arg.startswith("--debug=")):
            debug += argVal.split(",")
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        else:
            usage("unrecognized option: %s" % arg)

    if (pSubstitution == None):
        usage("you have to tell me the mutation probability")

    if (numSequences == None):
        numSequences = 1

    if (noiseKind == None):
        usage("you must specify either poisson or bernoulli error model")

    if (noiseKind == "bernoulli"):
        # the presence of non-ACGT nucleotides isn't considered
        usage("the bernoulli noise model is not currently supported")

    if (sequenceType == "circular") and (sketchSizes != None):
        # sketch_intersection() assumes linear sequences
        usage("sketches are not currently supported for circular sequences")

    if (sequenceType == "circular"):
        # all the estimator code assumes linear sequences
        usage("circular sequences are not currently supported")

    if (hashBits == None) and (hashSeed != None):
        print(
            "WARNING, hash seed is ignored, since no hashing is being performed",
            file=stderr)

    if (hashBits != None) and (not haveHashers):
        usage(
            "was unable to import module mmh3, so hashing can't be supported")

    if (sketchSizes != None):
        sketchSizes = list(set(sketchSizes))  # (remove duplicates)
        sketchSizes.sort()

    if (sketchSizes != None):
        for sketchSize in sketchSizes:
            statsOfInterest += [
                "Mean[nIntersection(S=%d)].obs" % sketchSize,
                "Mean[Jaccard(S=%d)].obs" % sketchSize,
                "StDev[Jaccard(S=%d)].obs" % sketchSize
            ]

    # set up randomness
    #
    # note that we choose the hash seed randomly *before* seeding the PRNG, so
    # that we (allegedly) get a randomly chosen hash; but users will be better
    # off specifically choosing the hash seed

    if (hashSeed != None):
        hashSeed = hashSeed & 0xFFFFFFFF  # (mmh3 seeds are limited to 32 bits)
    else:
        hashSeed = int(0x100000000 * unit_random())

    if (prngSeed != None):
        random_seed(prngSeed.encode("utf-8"))

    # NOTE(review): the 32- and 16-bit hashers call hash() with a seed and a
    # signed= keyword, so hash is presumably mmh3's hash rather than the
    # builtin -- confirm against the file's imports
    if (hashBits == 128):
        hasher = lambda kmer: hash128(kmer, hashSeed, signed=False)
        hasherFmt = "%032X"
    elif (hashBits == 64):
        hasher = lambda kmer: hash64(kmer, hashSeed, signed=False)[0]
        hasherFmt = "%016X"
    elif (hashBits == 32):
        hasher = lambda kmer: hash(kmer, hashSeed, signed=False)
        hasherFmt = "%08X"
    elif (hashBits == 16):
        hasher = lambda kmer: hash(kmer, hashSeed, signed=False) & 0xFFFF
        hasherFmt = "%04X"
    elif (hashBits == None):
        hasher = lambda kmer: kmer
        hasherFmt = "%s"
    else:
        raise ValueError

    # open a file to receive the mutated sequences

    mutatedF = None
    if (mutateOnly) and (mutatedFilename == None):
        mutatedF = stdout
    else:
        if (mutatedFilename != None):
            if (mutatedFilename.endswith(".gz")) or (
                    mutatedFilename.endswith(".gzip")):
                mutatedF = gzip_open(mutatedFilename, "wt")
            else:
                mutatedF = open(mutatedFilename, "wt")

    # fetch the *single* input sequence

    numSequencesSeen = 0
    for (seqName, seq) in fasta_sequences(stdin):
        numSequencesSeen += 1
        assert (numSequencesSeen < 2), "there was more than one sequence in the input"

    assert (numSequencesSeen == 1), "there were no sequences in the input"

    ntSequenceLength = len(seq)
    assert (
        ntSequenceLength >= kmerSize
    ), "input sequence length (%d) is shorter than the kmer size (%d)" % (
        ntSequenceLength, kmerSize)

    distinctKmersA = kmer_set(seq, kmerSize, hasher)
    numDistinctKmersA = len(distinctKmersA)

    # set up model/generator

    if (noiseKind == "poisson") and (sequenceType == "linear"):
        kmerSequenceLength = ntSequenceLength - (kmerSize - 1)
        mutationModel = PoissonModel \
                          (seq,kmerSize,pSubstitution,
                           count_mutated_kmers_linear,
                           hashBits=hashBits)
    elif (noiseKind == "bernoulli") and (sequenceType == "linear"):
        kmerSequenceLength = ntSequenceLength - (kmerSize - 1)
        mutationModel = BernoulliModel \
                          (seq,kmerSize,pSubstitution,
                           count_mutated_kmers_linear,
                           hashBits=hashBits)
    elif (noiseKind == "poisson") and (sequenceType == "circular"):
        kmerSequenceLength = ntSequenceLength
        mutationModel = PoissonModel \
                          (seq,kmerSize,pSubstitution,
                           count_mutated_kmers_circular,
                           hashBits=hashBits)
    elif (noiseKind == "bernoulli") and (sequenceType == "circular"):
        kmerSequenceLength = ntSequenceLength
        mutationModel = BernoulliModel \
                          (seq,kmerSize,pSubstitution,
                           count_mutated_kmers_circular,
                           hashBits=hashBits)
    else:
        assert (False), "internal error"

    # generate mutated sequences and collect stats

    q = p_mutated(kmerSize, pSubstitution)

    nErrorsObserved = []
    nMutatedObserved = []
    r1EstNMutatedObserved = []
    nDistinctAObserved = []
    nDistinctBObserved = []
    nDistinctIntersectionObserved = []
    nDistinctUnionObserved = []
    nMutatedABObserved = []
    kmerSequenceLengthABObserved = []
    r1EstABObserved = []

    if (sketchSizes != None):
        nIntersectionObserved = {}
        jaccardObserved = {}
        for sketchSize in sketchSizes:
            nIntersectionObserved[sketchSize] = []
            jaccardObserved[sketchSize] = []

    for seqNum in range(numSequences):
        if (reportProgress != None):
            if (1 + seqNum <= 2) or ((1 + seqNum) % reportProgress == 0):
                print("testing mutated sequence %d" % (1 + seqNum),
                      file=stderr)

        # generate a mutated sequence and collect stats

        mutatedSeq = mutationModel.generate()
        if (mutatedF != None):
            # NOTE(review): the ")" at the end of the name suffix looks like
            # a typo, but it is part of the emitted sequence names, so it is
            # left as is -- confirm before changing
            write_fasta(mutatedF,
                        seqName + "_mutation_%d)" % (1 + seqNum),
                        mutatedSeq)

        (nErrors, nMutated) = mutationModel.count()
        nErrorsObserved += [nErrors]
        nMutatedObserved += [nMutated]

        r1EstNMutated = estimate_r1_from_n_mutated(kmerSequenceLength,
                                                   kmerSize, nMutated)
        r1EstNMutatedObserved += [r1EstNMutated]

        distinctKmersB = kmer_set(mutatedSeq, kmerSize, hasher)
        numDistinctKmersB = len(distinctKmersB)

        if ("kmers" in debug):
            print("=== trial %d ===" % seqNum, file=stderr)
            numKmers = len(seq) - (kmerSize - 1)
            for pos in range(numKmers):
                sKmer = seq[pos:pos + kmerSize]
                if (not is_valid_kmer(sKmer)): continue
                mKmer = mutatedSeq[pos:pos + kmerSize]
                sH = hasher(sKmer)
                mH = hasher(mKmer)
                print(("[%3d] %s %s %s "+hasherFmt+" "+hasherFmt) \
                    % (pos,sKmer,mKmer,"-" if (sKmer==mKmer) else "X",sH,mH),
                      file=stderr)

        nDistinctKmersIntersection = len(
            distinctKmersA.intersection(distinctKmersB))
        nDistinctKmersUnion = len(distinctKmersA.union(distinctKmersB))
        nDistinctAObserved += [numDistinctKmersA]
        nDistinctBObserved += [numDistinctKmersB]
        nDistinctIntersectionObserved += [nDistinctKmersIntersection]
        nDistinctUnionObserved += [nDistinctKmersUnion]

        kmerSequenceLengthAB = (numDistinctKmersA + numDistinctKmersB) / 2.0
        nMutatedAB = kmerSequenceLengthAB - nDistinctKmersIntersection
        r1EstAB = estimate_r1_from_n_mutated(kmerSequenceLengthAB,
                                             kmerSize, nMutatedAB)

        nMutatedABObserved += [nMutatedAB]
        kmerSequenceLengthABObserved += [kmerSequenceLengthAB]
        r1EstABObserved += [r1EstAB]

        # generate sketches and collect basic stats

        if (sketchSizes != None):
            mutationModel.compute_sketches(distinctKmersA, distinctKmersB,
                                           sketchSizes)
            for sketchSize in sketchSizes:
                nIntersection = mutationModel.sketch_intersection(sketchSize)
                nIntersectionObserved[sketchSize] += [nIntersection]
                jaccardObserved[sketchSize] += [
                    float(nIntersection) / sketchSize
                ]

        #if ("kmers" in debug):
        #    assert (False)

    # report per-trial results

    # NOTE(review): the "nMutated" ordering sorts by the observed k-mer
    # intersection size (descending), not by nMutatedObserved -- confirm
    # that is intended
    if (sortBy == "nMutated"):
        order = [(nDistinctIntersectionObserved[ix], ix)
                 for ix in range(numSequences)]
        order.sort()
        order.reverse()
        order = [ix for (_, ix) in order]
    else:  # if (sortBy == None):
        order = list(range(numSequences))

    header = [
        "L", "K", "r", "trial", "q", "nErr", "nMut", "r1est.nMut",
        "|A|", "|B|", "|A^B|", "|AuB|", "nMut.A,B", "L.A,B", "r1est.A,B"
    ]
    if (sketchSizes != None):
        for sketchSize in sketchSizes:
            header += ["nIntersection(s=%d)" % sketchSize]
            header += ["j.est(nMut,s=%d)" % sketchSize]
    print("#%s" % "\t".join(header))

    for ix in range(numSequences):
        line = "\t".join(["%d","%d","%0.3f","%d","%0.9f","%d","%d","%0.9f","%d","%d","%d","%d","%0.1f","%0.1f","%0.9f"]) \
             % (kmerSequenceLength,                         # L
                kmerSize,                                   # K
                pSubstitution,                              # r
                1+order[ix],                                # trial
                q,                                          # q
                nErrorsObserved[order[ix]],                 # nErr
                nMutatedObserved[order[ix]],                # nMut
                r1EstNMutatedObserved[order[ix]],           # r1est.nMut
                nDistinctAObserved[order[ix]],              # |A|
                nDistinctBObserved[order[ix]],              # |B|
                nDistinctIntersectionObserved[order[ix]],   # |A^B|
                nDistinctUnionObserved[order[ix]],          # |AuB|
                nMutatedABObserved[order[ix]],              # nMut.A,B
                kmerSequenceLengthABObserved[order[ix]],    # L.A,B
                r1EstABObserved[order[ix]])                 # r1est.A,B
        if (sketchSizes != None):
            for sketchSize in sketchSizes:
                line += "\t%d" % nIntersectionObserved[sketchSize][order[ix]]
                line += "\t%0.9f" % jaccardObserved[sketchSize][order[ix]]
        print(line)

    if (mutatedF != None) and (mutatedF != stdout):
        mutatedF.close()

    if (mutateOnly):
        exit()

    # compute stats

    q = p_mutated(kmerSize, pSubstitution)

    nMutatedMean = sample_mean(nMutatedObserved)
    nMutatedStDev = sqrt(sample_variance(nMutatedObserved))
    predNMutatedMean = exp_n_mutated(kmerSequenceLength, kmerSize,
                                     pSubstitution)
    predNMutatedStDev = sqrt(
        var_n_mutated(kmerSequenceLength, kmerSize, pSubstitution))
    rmseNMutatedStDev = abs(nMutatedStDev - predNMutatedStDev)
    rmseR1EstNMutated = sqrt(
        mean_squared_error(r1EstNMutatedObserved, pSubstitution))

    nDistinctAMean = sample_mean(nDistinctAObserved)
    nDistinctBMean = sample_mean(nDistinctBObserved)
    nDistinctIntersectionMean \
                   = sample_mean(nDistinctIntersectionObserved)
    nDistinctUnionMean = sample_mean(nDistinctUnionObserved)
    nMutatedABMean = sample_mean(nMutatedABObserved)
    kmerSequenceLengthABMean \
                   = sample_mean(kmerSequenceLengthABObserved)
    r1EstABMean = sample_mean(r1EstABObserved)

    if (sketchSizes != None):
        nIntersectionMean = {}
        jaccardEstMean = {}
        jaccardEstStDev = {}
        for sketchSize in sketchSizes:
            nIntersectionMean[sketchSize] = sample_mean(
                nIntersectionObserved[sketchSize])
            jaccardEstMean[sketchSize] = sample_mean(
                jaccardObserved[sketchSize])
            jaccardEstStDev[sketchSize] = sqrt(
                sample_variance(jaccardObserved[sketchSize]))

    # report stats

    statToText = {}
    statToText["name"] = seqName
    statToText["r1"] = "%0.3f" % pSubstitution
    statToText["k"] = "%d" % kmerSize
    statToText["L"] = "%d" % kmerSequenceLength
    statToText["trials"] = "%d" % numSequences
    statToText["q"] = "%0.9f" % q
    statToText["E[nMut].theory"] = "%0.9f" % predNMutatedMean
    statToText["StDev[nMut].theory"] = "%0.9f" % predNMutatedStDev
    statToText["Mean[nMut].obs"] = "%0.9f" % nMutatedMean
    statToText["StDev[nMut].obs"] = "%0.9f" % nMutatedStDev
    statToText["RMSE(StDev[nMut])"] = "%0.9f" % rmseNMutatedStDev
    statToText["RMSE(r1est.nMut)"] = "%0.9f" % rmseR1EstNMutated
    statToText["Mean[|A|].obs"] = "%d" % nDistinctAMean
    statToText["Mean[|B|].obs"] = "%d" % nDistinctBMean
    statToText["Mean[|A^B|].obs"] = "%d" % nDistinctIntersectionMean
    statToText["Mean[|AuB|].obs"] = "%d" % nDistinctUnionMean
    statToText["Mean[nMut.A,B].obs"] = "%d" % nMutatedABMean
    statToText["Mean[L.A,B].obs"] = "%d" % kmerSequenceLengthABMean
    statToText["Mean[r1est.A,B].obs"] = "%0.9f" % r1EstABMean

    if (sketchSizes != None):
        for sketchSize in sketchSizes:
            statToText["Mean[nIntersection(S=%d)].obs" % sketchSize] = "%0.9f" % nIntersectionMean[sketchSize]
            statToText["Mean[Jaccard(S=%d)].obs" % sketchSize] = "%0.9f" % jaccardEstMean[sketchSize]
            statToText["StDev[Jaccard(S=%d)].obs" % sketchSize] = "%0.9f" % jaccardEstStDev[sketchSize]

    if (statsFilename != None):
        if (statsFilename.endswith(".gz")) or (
                statsFilename.endswith(".gzip")):
            statsF = gzip_open(statsFilename, "wt")
        else:
            statsF = open(statsFilename, "wt")

        try:
            print("#%s" % "\t".join(statsOfInterest), file=statsF)
            statsLine = [statToText[stat] for stat in statsOfInterest]
            print("\t".join(statsLine), file=statsF)
        finally:
            statsF.close()
    else:
        statW = max(len(stat) for stat in statsOfInterest)
        for stat in statsOfInterest:
            print("%*s = %s" % (statW, stat, statToText[stat]), file=stderr)
def geometric_distribution(pExtend):
    """Draw a random run length that is at least 1.

    pExtend is the probability of extending the run by one more step.  When
    it is zero the length is always 1 and no random draw is made; otherwise
    a single unit_random() draw u is converted to a length as
    floor(1 + log(1-u)/log(pExtend)).
    """
    if (pExtend == 0):
        return 1

    u = unit_random()
    extension = log(1 - u) / log(pExtend)
    return int(floor(1 + extension))