Ejemplo n.º 1
0
def clip_test():
	filename = programName + ".wav"
	print >>stderr, "writing audio output to %s" % filename
	output = WavOut(filename=filename,channels=2)

	clip = Clip(source=os.path.join(clipPath,clipFilename))
	clip.gain = gain
	clip >> output

	rate = 2.0
	numSteps = 10
	for ix in xrange(numSteps):
		clip.rate = rate
		duration = clip.duration()
		print "playing at rate %s for %s msec" % (rate,duration/zook.msec)
		clip.trigger()
		yield duration
		rate /= 2.0**(1.0/numSteps)

	for rate in [1.0,-1.0,2.0,-2.0,0.5,-0.5]:
		clip.rate = rate
		duration = clip.duration() * 1.10
		print "playing at rate %s for %s msec" % (rate,duration/zook.msec)
		clip.trigger()
		yield duration

	for ix in xrange(10):
		rate = (2*randint(0,1)-1) * (.8+.4*unit_random())
		clip.rate = rate
		print "playing at rate %s for %s msec" % (rate,250)
		clip.trigger()
		yield 250*zook.msec

	output.close()
Ejemplo n.º 2
0
def apply_errors(profile,seq,catalog):
	"""Apply random mismatches, insertions and deletions to a sequence.

	profile: mapping with per-position probabilities under the keys "mm"
	         (mismatch), "i" (insertion) and "d" (deletion).
	seq:     the sequence to modify; it is indexed one position at a time.
	catalog: None, or a list of objects with .start and .end positions in
	         seq.  The caller's catalog is not modified.

	Returns (newSeq,newCatalog,events):
	  newSeq     is the modified sequence, as a string.
	  newCatalog is None if catalog is None;  otherwise it is a deep copy of
	             catalog with .start and .end mapped into newSeq.  An entry
	             whose boundary was never encountered is left with None.
	  events     is None if catalog is None;  otherwise it maps a catalog
	             index to (m,mm,i,d), the number of unchanged, mismatched,
	             inserted-before and deleted positions in that interval.
	"""
	pMm = profile["mm"]
	pI  = profile["i"]
	pD  = profile["d"]

	if (catalog is None):
		newCatalog = None
		events     = None
	else:
		newCatalog = deepcopy(catalog)
		startToIx = {}
		endToIx   = {}
		# record the original boundaries, then clear them in the *copy*;  the
		# copy is what gets new coordinates below, and the caller's catalog
		# must be left as it was
		for (catIx,c) in enumerate(newCatalog):
			startToIx[c.start] = catIx
			endToIx  [c.end  ] = catIx
			c.start = c.end = None  # (so we'll know if we failed to change them)
		events = {}

	newSeq = []
	newPos = 0
	m = None   # None means "not currently inside a catalog interval"
	# we go one position past the end so that an interval ending at len(seq)
	# still gets its end mapped
	for pos in xrange(len(seq)+1):
		if (newCatalog is not None):
			# nota bene: we assume catalog intervals don't overlap, but they
			#            may abut;  ends are handled before starts so that an
			#            abutting pair closes one interval and then opens the
			#            next
			if (pos in endToIx):
				catIx = endToIx[pos]
				newCatalog[catIx].end = newPos
				events[catIx] = (m,mm,i,d)
				m = None
			if (pos in startToIx):
				catIx = startToIx[pos]
				newCatalog[catIx].start = newPos
				m = mm = i = d = 0

		if (pos == len(seq)):
			break

		nuc = seq[pos]
		r = unit_random()
		if (r < pMm):
			# mismatch: replace the nucleotide
			newSeq += [choice(mismatchLookup[nuc])]
			newPos += 1
			if (m is not None): mm += 1
		elif (r < pMm+pI):
			# insertion: a random nucleotide goes in front of the original
			newSeq += [choice("ACGT")+nuc]
			newPos += 2
			if (m is not None): i += 1
		elif (r < pMm+pI+pD):
			# deletion: emit nothing
			if (m is not None): d += 1
		else:
			newSeq += [nuc]
			newPos += 1
			if (m is not None): m += 1

	return ("".join(newSeq),newCatalog,events)
Ejemplo n.º 3
0
	def choice(self,count=None,randVal=None):
		"""Pick one symbol, or a list of count symbols, using self.table.

		Each table entry is a (p,sym1,sym2) triple.  A random value is scaled
		by the number of entries;  its integer part selects an entry, and
		sym1 is chosen if the scaled value is less than that entry's p,
		otherwise sym2.  (Presumably p is expressed on that same scaled
		range -- confirm against whatever builds the table.)

		count:   None to return a single symbol;  otherwise the number of
		         symbols to return, as a list.
		randVal: optional value to use in place of a fresh random number;
		         only allowed when count is None.

		Raises ValueError if randVal is given along with count, or if count
		is negative.
		"""
		if (count != None):
			if (randVal != None) or (count < 0): raise ValueError
			picks = []
			for _ in xrange(count):
				scaled = unit_random() * len(self.table)
				(threshold,below,atOrAbove) = self.table[int(scaled)]
				picks.append(below if (scaled < threshold) else atOrAbove)
			return picks

		if (randVal == None): randVal = unit_random()
		scaled = randVal * len(self.table)
		(threshold,below,atOrAbove) = self.table[int(scaled)]
		return below if (scaled < threshold) else atOrAbove
 def generate(self):
     """Build self.errorSeq, a list of 0/1 flags with a fixed number of 1s.

     The number of 1s is round(pSubstitution * ntSequenceLength).  Walking
     the positions in order, each position is flagged with probability
     (errors still to place) / (positions still to fill).

     NOTE(review): pSubstitution and ntSequenceLength are read as free names
     here, not as attributes of self -- confirm they are meant to be
     module-level values.

     Returns self.errorSeq.
     """
     self.errorSeq = []
     errorsLeft = round(pSubstitution * ntSequenceLength)
     for positionsLeft in reversed(range(1, ntSequenceLength + 1)):
         if (positionsLeft * unit_random() < errorsLeft):
             self.errorSeq.append(1)
             errorsLeft -= 1
         else:
             self.errorSeq.append(0)
     return self.errorSeq
 def generate(self):
     """Choose random error positions in self.seq and apply them.

     Every position of self.seq gets one random draw and is selected when
     the draw is below self.pSubstitution;  selected positions whose symbol
     is not a key of ntToMutations are then dropped.  The surviving
     positions are passed to self.apply_errors, and the result is stored in
     self.mutatedSeq and returned.
     """
     threshold = self.pSubstitution
     errorPositions = []
     for pos in range(len(self.seq)):
         # the draw is made for every position, whether or not the symbol
         # there can be mutated
         if (unit_random() >= threshold):
             continue
         if (self.seq[pos] in ntToMutations):
             errorPositions.append(pos)
     self.mutatedSeq = self.apply_errors(errorPositions)
     return self.mutatedSeq
Ejemplo n.º 6
0
def random_cigar(readLen, openProb, extendProb):
    """Generate a random list of (op, length) pairs covering readLen bases.

    readLen:    number of read bases to cover with "M" and "I" runs.
    openProb:   probability used to decide whether the read begins with an
                indel, and (as 1-openProb) to build the match-length
                sampler;  None is treated as 0.
    extendProb: probability used to build the insertion- and
                deletion-length samplers.

    Returns (seqNeeded, cigar).  seqNeeded starts at readLen, is reduced by
    every insertion run and increased by every deletion run.  If readLen is
    not positive the result is (readLen, []);  if openProb is 0 the result
    is a single match run.

    The length samplers are kept in module-level globals and rebuilt only
    when openProb or extendProb differs from the previous call.
    """
    global prevOpenProb, prevExtendProb
    global matchLenFunc, insertionLenFunc, deletionLenFunc

    if (openProb == None):
        openProb = 0

    # trivial cases, which need no random draws

    if (readLen <= 0):
        return (readLen, [])
    if (openProb == 0):
        return (readLen, [("M", readLen)])

    # refresh the cached samplers if the probabilities have changed

    if (prevOpenProb != openProb):
        prevOpenProb = openProb
        matchLenFunc = geometric_distribution_func(1 - openProb)
    if (prevExtendProb != extendProb):
        prevExtendProb = extendProb
        insertionLenFunc = geometric_distribution_func(extendProb)
        deletionLenFunc = geometric_distribution_func(extendProb)

    ops = []
    remaining = readLen
    refNeeded = readLen

    while (remaining > 0):
        # an indel is optional only while nothing has been consumed yet;
        # once some bases are placed, every pass starts with an indel

        indelHere = (remaining < readLen) or (unit_random() < openProb)
        if (indelHere):
            if (unit_random() < 0.5):
                length = min(remaining, insertionLenFunc())
                ops.append(("I", length))
                remaining -= length
                refNeeded -= length
            elif (ops):
                # a deletion is recorded only if something precedes it;  a
                # deletion that would start the cigar is simply dropped
                length = deletionLenFunc()
                ops.append(("D", length))
                refNeeded += length

        if (remaining <= 0):
            continue

        length = min(remaining, matchLenFunc())
        ops.append(("M", length))
        remaining -= length

    return (refNeeded, ops)
Ejemplo n.º 7
0
def random_subs(seq, prob):
    """Return seq with random substitutions applied.

    Each position is selected independently with probability prob (one
    random draw per position, all made up front).  A selected position is
    replaced by a random choice from nucToSubstitutions for the symbol
    there;  a selected position whose symbol is not a key of
    nucToSubstitutions is left unchanged.  If no position is selected, seq
    itself is returned.
    """
    hits = [pos for pos in xrange(len(seq)) if (unit_random() < prob)]
    if (not hits):
        return seq

    nucs = list(seq)
    for pos in hits:
        original = nucs[pos]
        if (original not in nucToSubstitutions):
            continue
        nucs[pos] = random_choice(nucToSubstitutions[original])

    return "".join(nucs)
def my_random(u=None, v=None):
    """Return a random value whose kind depends on the arguments.

    my_random()             --> real value from unit_random()
    my_random(int u)        --> integer value in 1..u
    my_random(int u, int v) --> integer value in u..v
    my_random(str u)        --> one character of u
    my_random(str u, str v) --> either u or v
    my_random(list u)       --> one element of u
    my_random(tuple u)      --> one element of u

    Types are matched exactly, so subclasses of str, list or tuple are
    handled as if they were integers.
    """
    if (u == None):
        return unit_random()

    if (v == None):
        # one argument: choose from a sequence, or from 1..u
        if (type(u) in (str, list, tuple)):
            return choice(u)
        return randint(1, u)

    # two arguments: choose one of two strings, or from u..v
    bothStrings = (type(u) == str) and (type(v) == str)
    if (bothStrings):
        return choice([u, v])
    return randint(u, v)
Ejemplo n.º 9
0
def main():

	# parse the command line

	arraysFilename   = None
	motifs          = []
	sequenceName    = None
	sequenceLen     = 0
	numRepeats      = None
	genNeighbors    = 0.0
	genMixture      = 0.0
	lengthsFilename = None
	minFill         = None
	errorProfile    = None
	catalogFilename = None
	wrapLength      = 100

	for arg in argv[1:]:
		if ("=" in arg):
			argVal = arg.split("=",1)[1]

		if (arg.startswith("--arrays=")):
			arraysFilename = argVal
		elif (arg.startswith("--name=")):
			sequenceName = argVal
		elif (arg.startswith("--length=")) or (arg.startswith("--len=")) or (arg.startswith("L=")):
			if (argVal.endswith("%")):
				sequenceLen = float(argVal[:-1]) / 100.0
				assert (sequenceLen >= 1.0)
				sequenceLen = ("%",sequenceLen)
			elif (argVal.startswith("+")):
				sequenceLen = int_with_unit(argVal[1:])
				assert (sequenceLen >= 0)
				sequenceLen = ("+",sequenceLen)
			else:
				sequenceLen = int_with_unit(argVal)
				assert (sequenceLen >= 0)
		elif (arg.startswith("--repeats=")) or (arg.startswith("N=")):
			numRepeats = int_with_unit(argVal)
			assert (numRepeats > 0)
		elif (arg.startswith("--motif:neighbor=")):
			genNeighbors = parse_probability(argVal)
		elif (arg.startswith("--motif:mixture=")):
			genMixture = parse_probability(argVal)
		elif (arg.startswith("--lengths=")):
			lengthsFilename = argVal
		elif (arg.startswith("--minfill=")) or (arg.startswith("F=")):
			minFill = int(argVal)
			if (minFill < 0):
				print >>stderr, "WARNING: \"%s\" interpreted as no minimum fill" % argVal
				minFill = None
			if (minFill == 0):
				minFill = None
		elif (arg.startswith("--errors=")):
			errorProfile = None
			if (argVal in ["pacbio","pacbio.v3","pacbio.GIAB","pacbio.giab"]):
				errorProfile = errorProfilePacbioV3
			elif (argVal == "pacbio.v2"):  # for historical reasons, v2 is an alias for v3
				errorProfile = errorProfilePacbioV3
			elif (argVal in ["pacbio.v1","pacbio.Guiblet","pacbio.guiblet"]):
				errorProfile = errorProfilePacbioV1
			elif (argVal in ["pacbio.readsim"]):
				errorProfile = errorProfilePacbioReadsim
			elif (argVal in ["nanopore","nanopore.v3","nanopore.GIAB","nanopore.giab"]):
				errorProfile = errorProfileNanoporeV3
			elif (argVal == "nanopore.v2"):  # for historical reasons, v2 is an alias for v3
				errorProfile = errorProfileNanoporeV3
			elif (argVal in ["nanopore.v1","nanopore.Jain","nanopore.jain"]):
				errorProfile = errorProfileNanoporeV1
			elif (argVal in ["nanopore.readsim"]):
				errorProfile = errorProfileNanoporeReadSim
			elif (":" in argVal):
				try:
					errorProfile = parse_error_spec(argVal)
				except ValueError:
					pass
			else:
				p = parse_probability(argVal)
				errorProfile = {"mm":p, "i":p, "d":p }
			if (errorProfile == None):
				usage("\"%s\" is not a valid error spec" % argVal)
			subProb       = errorProfile["mm"]
			insOpenProb   = errorProfile["i"]
			delOpenProb   = errorProfile["d"]
			insExtendProb = delExtendProb = 0.0
		elif (arg.startswith("--catalog=")):
			catalogFilename = argVal
		elif (arg.startswith("--wrap=")):
			wrapLength = int(argVal)
			if (wrapLength <= 0): wrapLength = None
		elif (arg.startswith("--seed=")):
			# nota bene: if the seed is a number, use it as a number, since
			#            string seeds can produce different sequences on
			#            different versions/builds of python
			seed = argVal
			try:
				seed = int(seed)
			except ValueError:
				try:               seed = float(seed)
				except ValueError: pass
			random_seed(seed)
		elif (arg.startswith("--")):
			usage("unrecognized option: %s" % arg)
		elif (is_nucleotide_string(arg)):
			motifs += [arg.upper()]
		else:
			usage("unrecognized option: %s" % arg)

	if (arraysFilename != None):
		if (motifs != []):
			usage("command line <motif>s cannot be used with --arrays")
		if (numRepeats != None):
			usage("--repeats cannot be used with --arrays")
		if (lengthsFilename != None):
			usage("--lengths cannot be used with --arrays")
		if (genNeighbors != 0.0):
			usage("--motif:neighbor cannot be used with --arrays")
		if (genMixture != 0.0):
			usage("--motif:mixture cannot be used with --arrays")
	elif (motifs == []):
		usage("you have to give me at least one motif")

	if (numRepeats == None) and (arraysFilename != None):
		numRepeats = 1
	
	# read the arrays file, if we have one

	repeatLengths = {}
	haveSpecificArrays = False

	if (arraysFilename != None):
		haveSpecificArrays = True
		f = file(arraysFilename,"rt")
		numRepeats = 0
		for (length,motif,_) in read_arrays(f,arraysFilename):
			numRepeats += 1
			if (motif not in repeatLengths):
				motifs += [(motif)]
				repeatLengths[motif] =  [length]
			else:
				repeatLengths[motif] += [length]
		f.close()

		if (motifs == []):
			usage("array file \"%s\" contains no arrays" % arraysFilename)

	# read the lengths file

	if (repeatLengths == {}):
		if (lengthsFilename == None):
			lengths = read_integers(stdin)
			for motif in motifs:
				repeatLengths[motif] = lengths
		elif ("{motif}" not in lengthsFilename):
			f = file(lengthsFilename,"rt")
			lengths = read_integers(f,lengthsFilename)
			f.close()
			for motif in motifs:
				repeatLengths[motif] = lengths
		else:
			for motif in motifs:
				motifLengthsFilename = lengthsFilename.replace("{motif}",motif)
				f = file(motifLengthsFilename,"rt")
				lengths = read_integers(f,motifLengthsFilename)
				f.close()
				repeatLengths[motif] = lengths

	# generate the number and type of motifs we'll embed
	#
	# note: to satisfy the requirement that the same seed generates the same
	#       pre-error sequence, we should have no variance in the use of the
	#       PRNG until after we've generated that sequence; see "point A" below

	embeddings = []

	if (haveSpecificArrays):
		for motif in motifs:
			for length in repeatLengths[motif]:
				strand = choice(["+","-"])
				offset = choice(xrange(len(motif)))
				embeddings += [(1.0,motif,motif,strand,offset,length)]
		shuffle(embeddings)
	else:
		for _ in xrange(numRepeats):
			motif = choice(motifs)
			length = choice(repeatLengths[motif])
			u = unit_random()
			if (genNeighbors > 0) and (u < genNeighbors):
				motif = motif_neighbor(motif)
				(mix,motif2) = (1.0,motif)
			elif (genMixture > 0) and (u < genNeighbors+genMixture):
				(mix,motif2) = (0.5,motif_neighbor(motif))
			else:
				(mix,motif2) = (1.0,motif)
			strand = choice(["+","-"])
			offset = choice(xrange(len(motif)))
			embeddings += [(mix,motif,motif2,strand,offset,length)]

	totalRepeatBp = sum([length for (_,_,_,_,_,length) in embeddings])

	# assign each repeat a position within the "fill" sequence;  note that we
	# might have more than one repeat assigned to the same position, in which
	# case they will be back-to-back with no fill between them

	if (type(sequenceLen) == tuple):
		(op,sequenceLen) = sequenceLen
		if (op == "%"):
			sequenceLen = int(round(totalRepeatBp*sequenceLen))
		else: # if (op == "+"):
			sequenceLen = totalRepeatBp + sequenceLen

	if (totalRepeatBp > sequenceLen):
		fillBp = 0
		if (sequenceLen > 0):
			print >>stderr, "WARNING: length of embedded repeats (%d) exceeds specified" % totalRepeatBp
			print >>stderr, "         sequence length (%d); there will be no fill DNA"   % sequenceLen
	elif (minFill != None):
		fillBp = sequenceLen - totalRepeatBp
		totalMinFill = (numRepeats+1) * minFill
		if (totalMinFill > fillBp):
			print >>stderr, "WARNING: minimum fill of %d cannot be achieved"           % minFill
			print >>stderr, "         total minimum fill (%d) exceeds total fill (%d)" % (totalMinFill,fillBp)
			minFill = fillBp / (numRepeats+1)
		fillBp -= minFill * (numRepeats+1)
	else:
		fillBp = sequenceLen - totalRepeatBp

	fillPositions = [randint(0,fillBp) for _ in xrange(numRepeats)]
	fillPositions.sort()

	if (minFill != None):
		fillBp += minFill * (numRepeats+1)
		for rptNum in xrange(numRepeats):
			fillPositions[rptNum] += (rptNum+1) * minFill

	# generate the sequence

	catalog = None
	if (catalogFilename != None):
		catalog = []

	fillSeq = str(EchyDna(fillBp))
	seq = []
	seqPos  = 0
	prevEnd = 0
	fillPos = 0
	for (ix,pos) in enumerate(fillPositions):
		if (fillPos < pos):
			seq += [fillSeq[fillPos:pos]]
			seqPos  += pos - fillPos
			fillPos =  pos

		(mix,motif,motif2,strand,offset,length) = embeddings[ix]
		if (catalog != None):
			c = CatalogEntry()
			c.start        = seqPos
			c.end          = seqPos+length
			c.mix          = mix
			c.motif        = motif
			c.motif2       = motif2
			c.strand       = strand
			c.repeatLength = length
			c.offset       = offset
			catalog += [c]

		enoughCopies = (length+offset+len(motif)-1) / len(motif)
		if (strand == "-"): motif = reverse_complement(motif)

		if (mix >= 1.0):
			repeat = motif * enoughCopies
		else:
			repeat = []
			for _ in xrange(enoughCopies):
				if (unit_random() < mix): repeat += [motif]
				else:                     repeat += [motif2]
			repeat = "".join(repeat)

		seq += repeat[offset:offset+length]
		seqPos += length
		prevEnd = seqPos

	if (fillPos < fillBp):
		seq += [fillSeq[fillPos:fillBp]]

	seq = "".join(seq)

	#=== point A: it's now safe to make additional use of the PRNG ===

	# apply error profile

	events = profile = None
	if (argVal in ["pacbio","pacbio.v3","pacbio.GIAB","pacbio.giab"]):
		errorProfile = errorProfilePacbioV3
	elif (argVal == "pacbio.v2"):  # for historical reasons, v2 is an alias for v3
		errorProfile = errorProfilePacbioV3
	elif (argVal in ["pacbio.v1","pacbio.Guiblet","pacbio.guiblet"]):
		errorProfile = errorProfilePacbioV1
	elif (argVal in ["pacbio.readsim"]):
		errorProfile = errorProfilePacbioReadsim
	elif (argVal in ["nanopore","nanopore.v3","nanopore.GIAB","nanopore.giab"]):
		errorProfile = errorProfileNanoporeV3
	elif (argVal == "nanopore.v2"):  # for historical reasons, v2 is an alias for v3
		errorProfile = errorProfileNanoporeV3
	elif (argVal in ["nanopore.v1","nanopore.Jain","nanopore.jain"]):
		errorProfile = errorProfileNanoporeV1
	elif (argVal in ["nanopore.readsim"]):
		errorProfile = errorProfileNanoporeReadSim
	elif (type(errorProfile) == float):
		eRate = errorProfile / 3.0;
		profile = {"mm":eRate, "i":eRate, "d":eRate }
	elif (type(errorProfile) == dict):
		profile = dict(errorProfile)

	if (profile != None):
		print >>stderr, "(applying error profile mm=%.2f%% i=%.2f%% d=%.2f%%)" \
		             % (100*profile["mm"],100*profile["i"],100*profile["d"])
		(seq,catalog,events) = apply_errors(profile,seq,catalog)

	# write the sequence

	if (sequenceName != None):
		print ">%s" % sequenceName

	if (wrapLength == None):
		print seq
	else:
		for i in range(0,len(seq),wrapLength):
			print seq[i:i+wrapLength]

	# write the catalog

	if (catalogFilename != None):
		catalogF = file(catalogFilename,"wt")
		if (sequenceName in [None,""]): seqNameForCatalog = "seq"
		else:                           seqNameForCatalog = sequenceName

		if (events == None):
			print >>catalogF, "#%s\t%s\t%s\t%s\t%s\t%s\t%s" \
			                % ("chrom","start","end","motif","rptLen","len","fill")
		else:
			print >>catalogF, "#%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" \
			                % ("chrom","start","end","motif","rptLen","len","fill",
			                   "mRatio","m","mm","i","d")

		prevEnd = 0
		for (catIx,c) in enumerate(catalog):
			motifStr = c.motif
			if (c.mix < 1.0): motifStr += "," + c.motif2
			motifStr += ".%s%s" % (c.offset,c.strand)
			if (events == None):
				print >>catalogF, "%s\t%s\t%s\t%s\t%s\t%s\t%s" \
				                % (seqNameForCatalog,c.start,c.end,motifStr,
				                   c.repeatLength,c.end-c.start,c.start-prevEnd)
			else:
				if (catIx in events):
					(m,mm,i,d) = events[catIx]
					mRatio = "%.1f%%" % (100.0*m/(m+mm+i+d))
				else:
					mRatio = m = mm = i = d = "NA"
				print >>catalogF, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" \
				                % (seqNameForCatalog,c.start,c.end,motifStr,
				                   c.repeatLength,c.end-c.start,c.start-prevEnd,
				                   mRatio,m,mm,i,d)
			prevEnd = c.end

		catalogF.close()
Ejemplo n.º 10
0
def main():
	assert (len(argv) == 1), "give me no arguments"

	numTrials = 1000
	random_seed("acorn")
	explainFailure = False
	path = "kmer_histograms"

	#sampleId = "mixedB"
	#defaultParams = {"zp.copy.y"   :  3.000,
	#                 "zp.copy.hom" :  3.000,
	#                 "zp.copy.het" :  3.000,
	#                 "p.e"         :  0.942,
	#                 "shape.e"     :  3.000,
	#                 "scale.e"     :  1.000,
	#                 "p.y"         :  0.900,
	#                 "u.y"         : 64.000,
	#                 "sd.y"        : 14.826,
	#                 "shape.y"     :  0.000,
	#                 "p.hom"       :  0.800,
	#                 "u.hom"       :  5.120,
	#                 "sd.hom"      :  1.186,
	#                 "var.het"     :  1.407}
	#goodParams    = {"zp.copy.y"   :  2.042,
	#                 "zp.copy.hom" :  3.157,
	#                 "zp.copy.het" : 17.795,
	#                 "p.e"         :  0.935,
	#                 "shape.e"     :  0.096,
	#                 "scale.e"     :  0.465,
	#                 "p.y"         :  0.621,
	#                 "u.y"         : 68.084,
	#                 "sd.y"        :  8.626,
	#                 "shape.y"     :  0.057,
	#                 "p.hom"       :  0.853,
	#                 "u.hom"       : 11.101,
	#                 "sd.hom"      :  3.600,
	#                 "var.het"     : 10.916}

	sampleId = "apple_E12_L150_D80_K25"
	defaultParams = {"zp.copy.y"   :  3.000,
	                 "zp.copy.hom" :  3.000,
	                 "zp.copy.het" :  3.000,
	                 "p.e"         :  0.940,
	                 "shape.e"     :  3.000,
	                 "scale.e"     :  1.000,
	                 "p.y"         :  0.900,
	                 "u.y"         : 62.000,
	                 "sd.y"        : 16.309,
	                 "shape.y"     :  0.000,
	                 "p.hom"       :  0.800,
	                 "u.hom"       :  4.960,
	                 "sd.hom"      :  1.305,
	                 "var.het"     :  1.702}
	goodParams    = {"zp.copy.y"   :  2.047,
	                 "zp.copy.hom" :  3.390,
	                 "zp.copy.het" :  1.137,
	                 "p.e"         :  0.937,
	                 "shape.e"     :  0.114,
	                 "scale.e"     :  0.452,
	                 "p.y"         :  0.630,
	                 "u.y"         : 65.974,
	                 "sd.y"        :  8.666,
	                 "shape.y"     :  0.228,
	                 "p.hom"       :  0.818,
	                 "u.hom"       : 13.622,
	                 "sd.hom"      :  4.086,
	                 "var.het"     : 15.274}

	fitter = EnrichedHapDipFitter(path+"/"+sampleId+".mixed.kmer_dist")
	paramNames = fitter.paramNames

	convergenceCount = 0
	for trialNumber in xrange(numTrials):
		print "=== trial %d of %d ===" \
		    % (1+trialNumber,numTrials)

		# choose initial params as a random point in hypercube between "good"
		# and "bad"

		initParams = dict(goodParams)
		norm2Init = 0.0
		for (paramIx,name) in enumerate(paramNames):
			step = unit_random()
			initParams[name] += step*(defaultParams[name]-goodParams[name])
			norm2Init += step*step
		normInit = sqrt(norm2Init) / len(paramNames)

		fitter.set_params(initParams)
		fitParams = fitter.fit()
		if (fitParams == None):
			print params_to_text(paramNames,initParams,prefix="init-[%d]:" % trialNumber)
			print "normInit: %.8f" % normInit
			print "(failure or non-convergence)"
			if (explainFailure):
				print "... return code ..."
				print fitter.retCode
				print "... stdout ..."
				print fitter.stdout
				print "... stderr ..."
				print fitter.stderr
			continue

		print params_to_text(paramNames,initParams,fitParams,
		                     prefix="init+[%d]:" % trialNumber,
		                     prefix2="cvrg[%d]:" % trialNumber)
		fitParams = params_to_float(fitParams)
		dGood = vector_distance(fitParams,goodParams)
		print "normInit: %.8f" % normInit
		print "dGood: %.8f" % dGood
		convergenceCount += 1

	print "%d of %d trials converged" % (convergenceCount,numTrials)
def generate_read(readLength,errorRate):
	"""Return a string of readLength characters marking random errors.

	Each position is "x" with probability errorRate (one random draw per
	position, in order) and "-" otherwise.
	"""
	marks = ["-"] * readLength
	errorIxs = [ix for ix in xrange(readLength) if (unit_random() < errorRate)]
	for ix in errorIxs:
		marks[ix] = "x"
	return "".join(marks)
 def generate(self):
     """Build self.errorSeq from independent random draws.

     The list has self.ntSequenceLength entries;  each is 1 when a fresh
     random draw is below self.pSubstitution, and 0 otherwise.

     Returns self.errorSeq.
     """
     flags = []
     for _ in range(self.ntSequenceLength):
         if (unit_random() < self.pSubstitution):
             flags.append(1)
         else:
             flags.append(0)
     self.errorSeq = flags
     return self.errorSeq
Ejemplo n.º 13
0
def main():
	assert (len(argv) == 3), "need the sampleID and number of trials, and nothing else"
	sampleId = argv[1]
	numTrials = int(argv[2])

	random_seed("acorn")
	explainFailure = False
	path = "kmer_histograms"

	# ask the curve fitter what the default paramters are

	fitter = EnrichedHapDipFitter(path+"/"+sampleId+".mixed.kmer_dist")
	paramNames = fitter.paramNames

	defaultParams = fitter.default_params()
	if (defaultParams == None):
		print "(failed to get default params)"
		if (explainFailure):
			print "... return code ..."
			print hdFitter.retCode
			print "... stdout ..."
			print hdFitter.stdout
			print "... stderr ..."
			print hdFitter.stderr
		assert (False)

	defaultParams = params_to_float(defaultParams)

	# read the "good" parameters (usually produced by explore3_hap_dip)

	fitFilename = path+"/"+sampleId+".mixed.fit"

	f = file(fitFilename,"rt")
	goodParams = params_from_text([line for line in f])
	f.close()

	for name in defaultParams:
		assert (name in goodParams), \
		       "parameter \"%s\" missing from %s" % (name,fitFilename)

	for name in goodParams:
		assert (name in defaultParams), \
		       "extra parameter \"%s\" in %s" % (name,fitFilename)

	goodParams = params_to_float(goodParams)

	print params_to_text(paramNames,goodParams,defaultParams,
	                     prefix="good:",prefix2="dflt:")

	# run the convergence trials

	convergenceCount = 0
	for trialNumber in xrange(numTrials):
		print "=== trial %d of %d ===" \
		    % (1+trialNumber,numTrials)

		# choose initial params as a random point in hypercube between "good"
		# and "bad"

		initParams = dict(goodParams)
		norm2Init = 0.0
		for (paramIx,name) in enumerate(paramNames):
			step = unit_random()
			initParams[name] += step*(defaultParams[name]-goodParams[name])
			norm2Init += step*step
		normInit = sqrt(norm2Init) / len(paramNames)

		fitter.set_params(initParams)
		fitParams = fitter.fit()
		if (fitParams == None):
			print params_to_text(paramNames,initParams,prefix="init-[%d]:" % trialNumber)
			print "normInit: %.8f" % normInit
			print "(failure or non-convergence)"
			if (explainFailure):
				print "... return code ..."
				print fitter.retCode
				print "... stdout ..."
				print fitter.stdout
				print "... stderr ..."
				print fitter.stderr
			continue

		print params_to_text(paramNames,initParams,fitParams,
		                     prefix="init+[%d]:" % trialNumber,
		                     prefix2="cvrg[%d]:" % trialNumber)
		fitParams = params_to_float(fitParams)
		dGood = vector_distance(fitParams,goodParams)
		print "normInit: %.8f" % normInit
		print "dGood: %.8f" % dGood
		convergenceCount += 1

	print "%d of %d trials converged" % (convergenceCount,numTrials)
def main():
    """Simulate mutated copies of one input sequence and report k-mer stats.

    Command-line options are read from ``argv``; exactly one FASTA sequence
    is read from ``stdin``.  For each of ``numSequences`` trials a mutated
    copy is generated by the selected mutation model, and the distinct k-mer
    sets of the original (A) and the mutated copy (B) are compared.

    Output:
      * a tab-delimited per-trial table on stdout (one row per trial);
      * optionally, the mutated sequences in FASTA form (``--mutated=`` or
        ``--mutateonly``);
      * summary statistics, written to the ``--stats=`` file if given,
        otherwise to stderr.

    Sets the module globals ``reportProgress``, ``debug`` and ``hasherFmt``.

    NOTE(review): ``usage()`` is called for every invalid option
    combination and the code after each call assumes it does not return --
    presumably it terminates the program; confirm against its definition.
    """
    global reportProgress, debug, hasherFmt

    # parse the command line

    kmerSize = 28
    sketchSizes = None
    numSequences = None
    noiseKind = None
    pSubstitution = None
    sequenceType = "linear"
    sortBy = "nMutated"
    statsFilename = None
    mutatedFilename = None
    mutateOnly = False
    prngSeed = None
    hashSeed = None
    hashBits = None
    reportProgress = None
    debug = []

    statsOfInterest = [
        "name", "r1", "k", "L", "trials", "q", "Mean[|A|].obs",
        "Mean[|B|].obs", "Mean[|A^B|].obs", "Mean[|AuB|].obs",
        "Mean[nMut.A,B].obs", "Mean[L.A,B].obs", "Mean[r1est.A,B].obs"
    ]

    for arg in argv[1:]:
        # argVal is only refreshed for "name=value" arguments; every branch
        # below that reads it matches on a prefix ending in "="
        if ("=" in arg):
            argVal = arg.split("=", 1)[1]

        if (arg in ["--help", "-help", "--h", "-h"]):
            usage()
        elif (arg.startswith("--kmer=")) or (arg.startswith("K=")):
            kmerSize = int(argVal)
        elif (arg.startswith("--sketch=")) or (arg.startswith("S=")):
            if (sketchSizes == None): sketchSizes = []
            sketchSizes += map(int_with_unit, argVal.split(","))
        elif (arg.startswith("--sequences=")) or (arg.startswith("T=")):
            numSequences = int_with_unit(argVal)
        elif (arg.startswith("--poisson=")) or (
                arg.startswith("--noise=")) or (arg.startswith("P=")):
            noiseKind = "poisson"
            pSubstitution = parse_probability(argVal)
        elif (arg.startswith("--bernoulli=")) or (
                arg.startswith("--error=")) or (arg.startswith("B=")) or (
                    arg.startswith("E=")):
            noiseKind = "bernoulli"
            pSubstitution = parse_probability(argVal)
        elif (arg == "--linear"):
            sequenceType = "linear"
        elif (arg == "--circular"):
            sequenceType = "circular"
        elif (arg == "--nosort"):
            sortBy = None
        elif (arg.startswith("--stats=")):
            statsFilename = argVal
        elif (arg.startswith("--mutated=")):
            mutatedFilename = argVal
        elif (arg == "--mutateonly"):
            mutateOnly = True
        elif (arg.startswith("--seed=")):
            prngSeed = argVal
        elif (arg in ["--hashbits=none", "--hash=none"]):
            hashBits = None
        elif (arg.startswith("--hash=")) or (arg.startswith("--hashseed=")):
            hashSeed = int(argVal)
        elif (arg.startswith("--hashbits=")):
            hashBits = int(argVal)
        elif (arg.startswith("--progress=")):
            reportProgress = int_with_unit(argVal)
        elif (arg == "--debug"):
            debug += ["debug"]
        elif (arg.startswith("--debug=")):
            debug += argVal.split(",")
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        else:
            usage("unrecognized option: %s" % arg)

    # validate the option combination

    if (pSubstitution == None):
        usage("you have to tell me the mutation probability")

    if (numSequences == None):
        numSequences = 1

    if (noiseKind == None):
        usage("you must specify either poisson or bernoulli error model")

    if (noiseKind == "bernoulli"):
        # the presence of non-ACGT nucleotides isn't considered
        usage("the bernoulli noise model is not currently supported")

    if (sequenceType == "circular") and (sketchSizes != None):
        # sketch_intersection() assumes linear sequences
        usage("sketches are not currently supported for circular sequences")

    if (sequenceType == "circular"):
        # all the estimator code assumes linear sequences
        usage("circular sequences are not currently supported")

    if (hashBits == None) and (hashSeed != None):
        print(
            "WARNING, hash seed is ignored, since no hashing is being performed",
            file=stderr)

    if (hashBits != None) and (not haveHashers):
        usage(
            "was unable to import module mmh3, so hashing can't be supported")

    if (sketchSizes != None):
        sketchSizes = list(set(sketchSizes))  # (remove duplicates)
        sketchSizes.sort()

    if (sketchSizes != None):
        for sketchSize in sketchSizes:
            statsOfInterest += [
                "Mean[nIntersection(S=%d)].obs" % sketchSize,
                "Mean[Jaccard(S=%d)].obs" % sketchSize,
                "StDev[Jaccard(S=%d)].obs" % sketchSize
            ]

    # set up randomness
    #
    # note that we choose the hash seed randomly *before* seeding the PRNG, so
    # that we (allegedly) get a randomly chosen hash; but users will be better
    # off specifically choosing the hash seed

    if (hashSeed != None):
        # a user-supplied seed is kept, truncated to what mmh3 accepts
        hashSeed = hashSeed & 0xFFFFFFFF  # (mmh3 seeds are limited to 32 bits)
    else:
        # no seed given: pick a random 32-bit one
        hashSeed = int(0x100000000 * unit_random())

    if (prngSeed != None):
        random_seed(prngSeed.encode("utf-8"))

    # choose the k-mer hasher, and the format used to print its values in
    # debug output; with no hashing the k-mer itself serves as its "hash"

    if (hashBits == 128):
        hasher = lambda kmer: hash128(kmer, hashSeed, signed=False)
        hasherFmt = "%032X"
    elif (hashBits == 64):
        hasher = lambda kmer: hash64(kmer, hashSeed, signed=False)[0]
        hasherFmt = "%016X"
    elif (hashBits == 32):
        hasher = lambda kmer: hash(kmer, hashSeed, signed=False)
        hasherFmt = "%08X"
    elif (hashBits == 16):
        hasher = lambda kmer: hash(kmer, hashSeed, signed=False) & 0xFFFF
        hasherFmt = "%04X"
    elif (hashBits == None):
        hasher = lambda kmer: kmer
        hasherFmt = "%s"
    else:
        raise ValueError

    # open a file to receive the mutated sequences

    mutatedF = None
    if (mutateOnly) and (mutatedFilename == None):
        mutatedF = stdout
    else:
        if (mutatedFilename != None):
            if (mutatedFilename.endswith(".gz")) or (
                    mutatedFilename.endswith(".gzip")):
                mutatedF = gzip_open(mutatedFilename, "wt")
            else:
                mutatedF = open(mutatedFilename, "wt")

    # fetch the *single* input sequence

    numSequencesSeen = 0
    for (seqName, seq) in fasta_sequences(stdin):
        numSequencesSeen += 1
        assert (numSequencesSeen <
                2), "there was more than one sequence in the input"

    assert (numSequencesSeen == 1), "there were no sequences in the input"

    ntSequenceLength = len(seq)
    assert (
        ntSequenceLength >= kmerSize
    ), "input sequence length (%d) is shorter than the kmer size (%d)" % (
        ntSequenceLength, kmerSize)

    distinctKmersA = kmer_set(seq, kmerSize, hasher)
    numDistinctKmersA = len(distinctKmersA)

    # set up model/generator
    #
    # a linear sequence of N nucleotides has N-(k-1) k-mers; a circular one
    # has N

    if (noiseKind == "poisson") and (sequenceType == "linear"):
        kmerSequenceLength = ntSequenceLength - (kmerSize - 1)
        mutationModel = PoissonModel \
                          (seq,kmerSize,pSubstitution,
                           count_mutated_kmers_linear,
                           hashBits=hashBits)
    elif (noiseKind == "bernoulli") and (sequenceType == "linear"):
        kmerSequenceLength = ntSequenceLength - (kmerSize - 1)
        mutationModel = BernoulliModel \
                          (seq,kmerSize,pSubstitution,
                           count_mutated_kmers_linear,
                           hashBits=hashBits)
    elif (noiseKind == "poisson") and (sequenceType == "circular"):
        kmerSequenceLength = ntSequenceLength
        mutationModel = PoissonModel \
                          (seq,kmerSize,pSubstitution,
                           count_mutated_kmers_circular,
                           hashBits=hashBits)
    elif (noiseKind == "bernoulli") and (sequenceType == "circular"):
        kmerSequenceLength = ntSequenceLength
        mutationModel = BernoulliModel \
                          (seq,kmerSize,pSubstitution,
                           count_mutated_kmers_circular,
                           hashBits=hashBits)
    else:
        assert (False), "internal error"

    # generate mutated sequences and collect stats

    q = p_mutated(kmerSize, pSubstitution)

    nErrorsObserved = []
    nMutatedObserved = []
    r1EstNMutatedObserved = []
    nDistinctAObserved = []
    nDistinctBObserved = []
    nDistinctIntersectionObserved = []
    nDistinctUnionObserved = []
    nMutatedABObserved = []
    kmerSequenceLengthABObserved = []
    r1EstABObserved = []

    if (sketchSizes != None):
        nIntersectionObserved = {}
        jaccardObserved = {}
        for sketchSize in sketchSizes:
            nIntersectionObserved[sketchSize] = []
            jaccardObserved[sketchSize] = []

    for seqNum in range(numSequences):
        if (reportProgress != None):
            # always announce the first two trials, then every
            # reportProgress-th one
            if (1 + seqNum <= 2) or ((1 + seqNum) % reportProgress == 0):
                print("testing mutated sequence %d" % (1 + seqNum),
                      file=stderr)

        # generate a mutated sequence and collect stats

        mutatedSeq = mutationModel.generate()
        if (mutatedF != None):
            # NOTE(review): the trailing ")" in the sequence name looks
            # unintentional; left unchanged because consumers of the output
            # may depend on it -- confirm before removing
            write_fasta(mutatedF, seqName + "_mutation_%d)" % (1 + seqNum),
                        mutatedSeq)
        (nErrors, nMutated) = mutationModel.count()
        nErrorsObserved += [nErrors]
        nMutatedObserved += [nMutated]

        r1EstNMutated = estimate_r1_from_n_mutated(kmerSequenceLength,
                                                   kmerSize, nMutated)
        r1EstNMutatedObserved += [r1EstNMutated]

        distinctKmersB = kmer_set(mutatedSeq, kmerSize, hasher)
        numDistinctKmersB = len(distinctKmersB)

        if ("kmers" in debug):
            # dump every valid original k-mer beside the k-mer at the same
            # position of the mutated sequence, with both hash values
            print("=== trial %d ===" % seqNum, file=stderr)
            numKmers = len(seq) - (kmerSize - 1)
            for pos in range(numKmers):
                sKmer = seq[pos:pos + kmerSize]
                if (not is_valid_kmer(sKmer)): continue
                mKmer = mutatedSeq[pos:pos + kmerSize]
                sH = hasher(sKmer)
                mH = hasher(mKmer)
                print(("[%3d] %s %s %s "+hasherFmt+" "+hasherFmt) \
                     % (pos,sKmer,mKmer,"-" if (sKmer==mKmer) else "X",sH,mH),
                        file=stderr)

        nDistinctKmersIntersection = len(
            distinctKmersA.intersection(distinctKmersB))
        nDistinctKmersUnion = len(distinctKmersA.union(distinctKmersB))
        nDistinctAObserved += [numDistinctKmersA]
        nDistinctBObserved += [numDistinctKmersB]
        nDistinctIntersectionObserved += [nDistinctKmersIntersection]
        nDistinctUnionObserved += [nDistinctKmersUnion]

        # set-based estimate: the "length" is the mean of |A| and |B|, and
        # the k-mers counted as mutated are those of that length not shared
        kmerSequenceLengthAB = (numDistinctKmersA + numDistinctKmersB) / 2.0
        nMutatedAB = kmerSequenceLengthAB - nDistinctKmersIntersection
        r1EstAB = estimate_r1_from_n_mutated(kmerSequenceLengthAB, kmerSize,
                                             nMutatedAB)
        nMutatedABObserved += [nMutatedAB]
        kmerSequenceLengthABObserved += [kmerSequenceLengthAB]
        r1EstABObserved += [r1EstAB]

        # generate sketches and collect basic stats

        if (sketchSizes != None):
            mutationModel.compute_sketches(distinctKmersA, distinctKmersB,
                                           sketchSizes)
            for sketchSize in sketchSizes:
                nIntersection = mutationModel.sketch_intersection(sketchSize)
                nIntersectionObserved[sketchSize] += [nIntersection]
                jaccardObserved[sketchSize] += [
                    float(nIntersection) / sketchSize
                ]

        #if ("kmers" in debug):
        #	assert (False)

    # report per-trial results

    if (sortBy == "nMutated"):
        # rows are listed by decreasing |A^B|; ties are broken by decreasing
        # trial index
        order = [(nDistinctIntersectionObserved[ix], ix)
                 for ix in range(numSequences)]
        order.sort()
        order.reverse()
        order = [ix for (_, ix) in order]
    else:  # if (sortBy == None):
        order = list(range(numSequences))

    header = [
        "L", "K", "r", "trial", "q", "nErr", "nMut", "r1est.nMut", "|A|",
        "|B|", "|A^B|", "|AuB|", "nMut.A,B", "L.A,B", "r1est.A,B"
    ]
    if (sketchSizes != None):
        for sketchSize in sketchSizes:
            header += ["nIntersection(s=%d)" % sketchSize]
            header += ["j.est(nMut,s=%d)" % sketchSize]
    print("#%s" % "\t".join(header))

    for ix in range(numSequences):
        line = "\t".join(["%d","%d","%0.3f","%d","%0.9f","%d","%d","%0.9f","%d","%d","%d","%d","%0.1f","%0.1f","%0.9f"]) \
             % (kmerSequenceLength,                       # L
                kmerSize,                                 # K
                pSubstitution,                            # r
                1+order[ix],                              # trial
                q,                                        # q
                nErrorsObserved[order[ix]],               # nErr
                nMutatedObserved[order[ix]],              # nMut
                r1EstNMutatedObserved[order[ix]],         # r1est.nMut
                nDistinctAObserved[order[ix]],            # |A|
                nDistinctBObserved[order[ix]],            # |B|
                nDistinctIntersectionObserved[order[ix]], # |A^B|
                nDistinctUnionObserved[order[ix]],        # |AuB|
                nMutatedABObserved[order[ix]],            # nMut.A,B
                kmerSequenceLengthABObserved[order[ix]],  # L.A,B
                r1EstABObserved[order[ix]])               # r1est.A,B
        if (sketchSizes != None):
            for sketchSize in sketchSizes:
                line += "\t%d" % nIntersectionObserved[sketchSize][order[ix]]
                line += "\t%0.9f" % jaccardObserved[sketchSize][order[ix]]
        print(line)

    if (mutatedF != None) and (mutatedF != stdout):
        mutatedF.close()

    if (mutateOnly):
        exit()

    # compute stats

    q = p_mutated(kmerSize, pSubstitution)

    nMutatedMean = sample_mean(nMutatedObserved)
    nMutatedStDev = sqrt(sample_variance(nMutatedObserved))
    predNMutatedMean = exp_n_mutated(kmerSequenceLength, kmerSize,
                                     pSubstitution)
    predNMutatedStDev = sqrt(
        var_n_mutated(kmerSequenceLength, kmerSize, pSubstitution))
    rmseNMutatedStDev = abs(nMutatedStDev - predNMutatedStDev)
    rmseR1EstNMutated = sqrt(
        mean_squared_error(r1EstNMutatedObserved, pSubstitution))

    nDistinctAMean = sample_mean(nDistinctAObserved)
    nDistinctBMean = sample_mean(nDistinctBObserved)
    nDistinctIntersectionMean \
                       = sample_mean(nDistinctIntersectionObserved)
    nDistinctUnionMean = sample_mean(nDistinctUnionObserved)
    nMutatedABMean = sample_mean(nMutatedABObserved)
    kmerSequenceLengthABMean \
                       = sample_mean(kmerSequenceLengthABObserved)
    r1EstABMean = sample_mean(r1EstABObserved)

    if (sketchSizes != None):
        nIntersectionMean = {}
        jaccardEstMean = {}
        jaccardEstStDev = {}
        for sketchSize in sketchSizes:
            nIntersectionMean[sketchSize] = sample_mean(
                nIntersectionObserved[sketchSize])
            jaccardEstMean[sketchSize] = sample_mean(
                jaccardObserved[sketchSize])
            jaccardEstStDev[sketchSize] = sqrt(
                sample_variance(jaccardObserved[sketchSize]))

    # report stats
    #
    # every computed stat is rendered to text here, but only the names listed
    # in statsOfInterest are actually written out

    statToText = {}
    statToText["name"] = seqName
    statToText["r1"] = "%0.3f" % pSubstitution
    statToText["k"] = "%d" % kmerSize
    statToText["L"] = "%d" % kmerSequenceLength
    statToText["trials"] = "%d" % numSequences
    statToText["q"] = "%0.9f" % q
    statToText["E[nMut].theory"] = "%0.9f" % predNMutatedMean
    statToText["StDev[nMut].theory"] = "%0.9f" % predNMutatedStDev
    statToText["Mean[nMut].obs"] = "%0.9f" % nMutatedMean
    statToText["StDev[nMut].obs"] = "%0.9f" % nMutatedStDev
    statToText["RMSE(StDev[nMut])"] = "%0.9f" % rmseNMutatedStDev
    statToText["RMSE(r1est.nMut)"] = "%0.9f" % rmseR1EstNMutated
    statToText["Mean[|A|].obs"] = "%d" % nDistinctAMean
    statToText["Mean[|B|].obs"] = "%d" % nDistinctBMean
    statToText["Mean[|A^B|].obs"] = "%d" % nDistinctIntersectionMean
    statToText["Mean[|AuB|].obs"] = "%d" % nDistinctUnionMean
    statToText["Mean[nMut.A,B].obs"] = "%d" % nMutatedABMean
    statToText["Mean[L.A,B].obs"] = "%d" % kmerSequenceLengthABMean
    statToText["Mean[r1est.A,B].obs"] = "%0.9f" % r1EstABMean

    if (sketchSizes != None):
        for sketchSize in sketchSizes:
            statToText["Mean[nIntersection(S=%d)].obs" %
                       sketchSize] = "%0.9f" % nIntersectionMean[sketchSize]
            statToText["Mean[Jaccard(S=%d)].obs" %
                       sketchSize] = "%0.9f" % jaccardEstMean[sketchSize]
            statToText["StDev[Jaccard(S=%d)].obs" %
                       sketchSize] = "%0.9f" % jaccardEstStDev[sketchSize]

    if (statsFilename != None):
        if (statsFilename.endswith(".gz")) or (
                statsFilename.endswith(".gzip")):
            statsF = gzip_open(statsFilename, "wt")
        else:
            statsF = open(statsFilename, "wt")

        # the with-block closes the file even if writing a line fails
        with statsF:
            print("#%s" % "\t".join(statsOfInterest), file=statsF)
            statsLine = [statToText[stat] for stat in statsOfInterest]
            print("\t".join(statsLine), file=statsF)
    else:
        statW = max(len(stat) for stat in statsOfInterest)
        for stat in statsOfInterest:
            print("%*s = %s" % (statW, stat, statToText[stat]), file=stderr)
Ejemplo n.º 15
0
def geometric_distribution(pExtend):
    """Return a random positive integer drawn by inverse-transform sampling.

    With u taken from unit_random(), the result is
    floor(1 + log(1-u)/log(pExtend)).  If unit_random() is uniform on [0,1)
    (presumably it is -- confirm against its import), this is a geometric
    variate on {1,2,3,...} in which each further step occurs with
    probability pExtend.

    pExtend == 0 short-circuits to 1 without consuming a random number.
    pExtend == 1 is not handled: log(1) is 0, so the division raises
    ZeroDivisionError.
    """
    if (pExtend != 0):
        draw = unit_random()
        ratio = log(1 - draw) / log(pExtend)
        return int(floor(ratio + 1))
    return 1