# NOTE(review): this fragment reads g, u, i_N, i_r and COL_MAP, none of which
# are assigned in the visible lines; presumably they are set earlier in the
# file -- confirm.

# Initial coalescence rate, computed as 1 / (2 * g * u * i_N).
i_c = 1.0 / (2 * g * u * i_N)
# NOTE(review): i_m and name are assigned here but not read in the visible
# lines.
i_m = 0.1
# Passed twice in the last argument of build_epoch_seperated_model below.
estates = 10
name = "ILS.test"
# Input files handed one at a time to readObservations below.
seqs = ["example_data.fa"]

# Creating an isolation model with three species is done by merging species
# pairwise.
modelILS = build_epoch_seperated_model(
    3,  # we start with a, b, c.
    [
        # merge a and b into '0'; c continues as '1'
        [0, 0, 1],
        # and now '0' and '1' are merged into a new '0'
        [0, 0]
    ],
    [1, estates, estates])
nstates = len(modelILS.tree_map)
# Names passed to readObservations for every input file.
names = ["bonobo", "pantro2", "hg18"]
# Tuple of start values: two values scaled by u, then i_c and i_r.
init_ILS = (1.0e6 * u, 4.5e6 * u, i_c, i_r)
all_obs = []
total_L = 0
for seq in seqs:
    # Each sequence creates a new column map extended with any new columns
    # that might be seen; it replaces COL_MAP so the next file starts from it.
    # NOTE(review): the loop body may continue past the visible lines (obs is
    # not used here) -- confirm against the full file.
    obs, colmap = readObservations(seq, names, COL_MAP)
    COL_MAP = colmap
def runILSctmc(seqs, **args):
    """Set up the three-species ILS model for *seqs* and pass it to a hook.

    Keyword arguments read from ``args``:
        NeRef, g, u    -- used to scale the start values below
        r              -- start value ``i_r``
        N1, N12, N123  -- each N becomes a rate ``1 / (2 * g * u * N)``
                          (``i_c1``, ``i_c2``, ``i_c3``)
        T12, T123      -- each T becomes ``u * T / g / (2 * NeRef)``
                          (``i_t1``, ``i_t2``)
        hook           -- optional; when present, ``hook.run`` is called with
                          the model, the column map, the observations and the
                          current parameter values

    Side effect: the module-level ``COL_MAP`` is replaced by a mapping from
    every triple over 'ACGT' (as produced by ``product``) to its index.

    Returns ``(L, estimates, startvalues)``.  ``startvalues`` is a dict keyed
    by 'i_t1', 'i_t2', 'i_c1', 'i_c2', 'i_c3' and 'i_r'.  Estimation is
    switched off by the local ``doEstimate`` flag, so ``L`` is ``None`` and
    ``estimates`` is an empty list; with the flag on, ``estimates`` is a dict
    with the same keys as ``startvalues``.

    Raises KeyError if one of the required keyword arguments is missing.
    """
    global COL_MAP

    param_names = ('i_t1', 'i_t2', 'i_c1', 'i_c2', 'i_c3', 'i_r')

    NeRef = args["NeRef"]
    g = args["g"]
    u = args["u"]
    i_r = args["r"]

    i_c1 = 1.0 / (2 * g * u * args["N1"])
    i_c2 = 1.0 / (2 * g * u * args["N12"])
    i_c3 = 1.0 / (2 * g * u * args["N123"])
    i_t1 = u * args["T12"] / g / (2 * NeRef)
    i_t2 = u * args["T123"] / g / (2 * NeRef)

    # Built explicitly instead of eval()-ing local variable names.
    startvalues = dict(zip(param_names, (i_t1, i_t2, i_c1, i_c2, i_c3, i_r)))

    estates = 4
    sys.stderr.write('running with %d epoch states\n' % estates)

    modelILS = build_epoch_seperated_model(
        3, [[0, 0, 1], [0, 0]], [1, estates, estates])
    names = ["'1'", "'2'", "'0'"]

    COL_MAP = dict((v, i) for i, v in enumerate(product('ACGT', repeat=3)))

    all_obs = []
    for seq in seqs:
        # NOTE(review): the column map returned by readObservations is
        # discarded, exactly as before; COL_MAP keeps the fixed table built
        # above unless readObservations changes it in place -- confirm.
        obs, _colmap = readObservations(seq, names, COL_MAP)
        all_obs.append(obs)

    doEstimate = False
    L = None
    estimates = list()
    if doEstimate:
        forwarders = []
        for obs in all_obs:
            sys.stdout.write('next obs:\n')
            ffd, foutname = tempfile.mkstemp()
            try:
                sys.stdout.write(' temp fd/name: %s %s\n' % (ffd, foutname))
                # One line holding every observation, space separated.  The
                # previous loop repeated the second-to-last element and never
                # wrote the last one.
                with os.fdopen(ffd, 'w') as fout:
                    fout.write(' '.join(map(str, obs)))
                    fout.write('\n')
                sys.stdout.write(' written, creating forwarder\n')
                forwarders.append(
                    Forwarder.fromSequence(seqFilename=foutname,
                                           alphabetSize=len(COL_MAP)))
                sys.stdout.write(' - done.\n')
            finally:
                # Remove the temp file without going through a shell, and
                # also when writing or loading fails.
                os.remove(foutname)

        L, fitted = estimate_ILS(modelILS, forwarders,
                                 i_t1, i_t2, i_c1, i_c2, i_c3, i_r,
                                 outfile="/dev/null")
        # The hook below must see the fitted values, so rebind the locals.
        i_t1, i_t2, i_c1, i_c2, i_c3, i_r = fitted
        estimates = dict(zip(param_names,
                             (i_t1, i_t2, i_c1, i_c2, i_c3, i_r)))

    if "hook" in args:
        args["hook"].run(modelILS, COL_MAP, all_obs,
                         [i_c1, i_c2, i_c3], [i_r] * 3, [0] * 3,
                         [0, i_t1, i_t2])

    return L, estimates, startvalues
def main():
    """Command-line entry point: posterior decoding for the isolation model.

    Parses the command line, builds the two-species model with
    ``--intervals`` sub intervals, and writes the posterior table for the
    single ``<forwarder dir>`` argument to ``--out``, preceded by a commented
    header unless ``--no-header`` is given.

    Exit status:
        2 -- usage error (wrong number of arguments, or -T/-C/-R missing);
             the message goes to stderr so it cannot end up in the output
             stream, which defaults to /dev/stdout
        0 -- the reader closed the output pipe (EPIPE) while decoding

    Any other IOError raised while decoding is re-raised unchanged.
    """
    usage = ("%prog [options] <forwarder dir> "
             "This program calculates the posterior state probabilities of an "
             "isolation model with two species and uniform "
             "coalescence/recombination rate.")

    parser = OptionParser(usage=usage, version="%prog 1.0")

    parser.add_option("-o", "--out", dest="outfile", type="string",
                      default="/dev/stdout",
                      help="Output file for the estimate (/dev/stdout)")
    parser.add_option("-n", "--seq-name", dest="seq_name", type="string",
                      default="sequence",
                      help="Name of the sequence. "
                           "Used in the output for tabix indexing")
    # The help text now names the real default (0) and keeps a space between
    # its two sentences.
    parser.add_option("-p", "--first-pos", dest="first_pos", type="int",
                      default=0,
                      help="Position in the sequence where the first element "
                           "is. Used in the output for tabix indexing. "
                           "Default 0.")

    model_params = [
        ('T', 'split time'),
        ('C', 'Coalescence rate'),
        ('R', 'recombination rate'),
    ]
    for (cname, desc) in model_params:
        parser.add_option("-%s" % cname, dest=cname, type="float",
                          help="Model parameter %s" % desc)

    parser.add_option("--intervals", dest="intervals", type="int", default=10,
                      help="Number of sub intervals used to discretize the "
                           "time (10)")
    parser.add_option("--no-header", dest="no_header", action="store_true",
                      default=False,
                      help="Do not include the header in the output")

    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("Needs a preprocessed sequence to work on")

    # All three model parameters are mandatory; parser.error prints to stderr
    # and exits with status 2, the same status as before.
    for (cname, desc) in model_params:
        if getattr(options, cname) is None:
            parser.error('You must specify the %s.' % desc.lower())

    split_time = options.T
    coal_rate = options.C
    rec_rate = options.R
    intervals = options.intervals

    model = build_epoch_seperated_model(2, [[0, 0]], [1, intervals])
    mpi, mT, mE, time_breaks = get_model_matrices(model, coal_rate, rec_rate,
                                                  split_time)
    pi, T, E = zipHMM_prepare_matrices(mpi, mT, mE)

    with open(options.outfile, 'w') as fout:
        if not options.no_header:
            header = [
                '## Posterior probabilities for isolation model.',
                '# intervals = %s' % (intervals,),
                '# T = %s' % (split_time,),
                '# C = %s' % (coal_rate,),
                '# R = %s' % (rec_rate,),
                '# time_breaks = %s' % ' '.join(map(str, time_breaks)),
            ]
            fout.write('\n'.join(header) + '\n')

        try:
            for ziphmmdir in args:
                seqfile = os.path.join(ziphmmdir, 'original_sequence')
                _, pdTable = posteriorDecoding(seqfile, pi, T, E)
                print_matrix(pdTable, options.seq_name, options.first_pos,
                             fout)
        except IOError as e:
            if e.errno == errno.EPIPE:
                # The pipe died, probably because we are in a shell pipe
                # whose reader has stopped reading.
                sys.exit(0)
            # Bare raise keeps the original traceback.
            raise
# NOTE(review): this fragment reads g, u, i_N, i_r and COL_MAP, none of which
# are assigned in the visible lines; presumably they are set earlier in the
# file -- confirm.

# Initial coalescence rate, computed as 1 / (2 * g * u * i_N).
i_c = 1.0/(2*g*u*i_N)
# NOTE(review): i_m and name are assigned here but not read in the visible
# lines.
i_m = 0.1
# Passed twice in the last argument of build_epoch_seperated_model below.
estates = 10
name = "ILS.test"
# Input files handed one at a time to readObservations below.
seqs = ["example_data.fa"]

# Creating an isolation model with three species is done by merging species
# pairwise.
modelILS = build_epoch_seperated_model(
    3, # we start with a, b, c.
    [
        # merge a and b into '0'; c continues as '1'
        [0,0,1],
        # and now '0' and '1' are merged into a new '0'
        [0,0]
    ],
    [1,estates,estates])
nstates = len(modelILS.tree_map)
# Names passed to readObservations for every input file.
names = ["bonobo", "pantro2", "hg18"]
# Tuple of start values: two values scaled by u, then i_c and i_r.
init_ILS = (1.0e6*u, 4.5e6*u, i_c, i_r)
all_obs = []
total_L = 0
for seq in seqs:
    # Each sequence creates a new column map extended with any new columns
    # that might be seen; it replaces COL_MAP so the next file starts from it.
    # NOTE(review): the loop body may continue past the visible lines (obs is
    # not used here) -- confirm against the full file.
    obs, colmap = readObservations(seq, names, COL_MAP)
    COL_MAP = colmap
def main():
    """Command-line entry point: estimate the isolation-model parameters.

    Loads one forwarder per ``<forwarder dir>`` argument, builds the
    two-species model with ``--intervals`` sub intervals, and runs
    ``estimate_I`` from these start values:

        T = splittime * mu
        C = 1 / (g * mu * 2 * Ne)
        R = recomb

    ``--tmpfile`` is opened for writing and passed to ``estimate_I`` as
    ``outfile``.  The result is written to ``--out`` as one tab-separated
    line (log likelihood, then the estimates), preceded by a header line
    when ``--header`` is given.

    Exits with status 2 on usage errors: no forwarder directory given, or a
    non-positive ``--mu``, ``--g`` or ``--Ne`` (their product is a divisor).
    """
    usage = ("%prog [options] <forwarder dirs> "
             "This program estimates the parameters of an isolation model "
             "with two species and uniform coalescence/recombination rate.")

    parser = OptionParser(usage=usage, version="%prog 1.0")

    parser.add_option("-o", "--out", dest="outfile", type="string",
                      default="/dev/stdout",
                      help="Output file for the estimate (/dev/stdout)")
    parser.add_option("--tmpfile", dest="tmpfile", type="string",
                      default="/dev/null",
                      help="Log for all points estimated in the optimization "
                           "(/dev/null)")

    optimized_params = [
        ('splittime', 'split time', 1e6),
        ('Ne', 'effective population size', 20e3),
        ('recomb', 'recombination rate', 0.1),
    ]
    for (cname, desc, default) in optimized_params:
        parser.add_option("--%s" % cname, dest=cname, type="float",
                          default=default,
                          help="Initial guess at the %s (%g)" % (desc, default))

    fixed_params = [
        ('mu', 'mutation rate', 1e-9),
        ('g', 'generation time', 20),
    ]
    for (cname, desc, default) in fixed_params:
        parser.add_option("--%s" % cname, dest=cname, type="float",
                          default=default,
                          help="Value of the %s (%g)" % (desc, default))

    parser.add_option("--intervals", dest="intervals", type="int", default=10,
                      help="Number of sub intervals used to discretize the "
                           "time (10)")
    parser.add_option("--header", dest="include_header", action="store_true",
                      default=False,
                      help="Include a header on the output")
    # The flag switches the progress log on; the old help text said
    # "Print help".
    parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                      default=False,
                      help="Print progress messages while running")

    (options, args) = parser.parse_args()
    if len(args) < 1:
        parser.error("Needs at least one preprocessed sequence to work on")

    # Fail fast, before any forwarder is loaded, instead of hitting a
    # ZeroDivisionError when C is computed below.
    for cname in ('mu', 'g', 'Ne'):
        if getattr(options, cname) <= 0:
            parser.error("--%s must be positive" % cname)

    if options.verbose:
        logu = log_unfinished_line
        log = log_finished_line
    else:
        logu = log = lambda s: None

    logu("Loading forwarders...")
    forwarders = [Forwarder.fromDirectory(directory) for directory in args]
    log("done")

    logu("Constructing model...")
    intervals = options.intervals
    modelI = build_epoch_seperated_model(2, [[0, 0]], [1, intervals])
    log("done")

    mu = options.mu
    g = options.g
    T = options.splittime * mu
    C = 1.0 / (g * mu * 2 * options.Ne)
    R = options.recomb

    with open(options.tmpfile, 'w') as tmpfile:
        L, est = estimate_I(modelI, forwarders, T, C, R, outfile=tmpfile)

    vals = "\t".join(map(str, est))
    with open(options.outfile, 'w') as outfile:
        if options.include_header:
            outfile.write('logL\tT\tC\tR\n')
        outfile.write("%f\t%s\n" % (L, vals))
def main():
    """Command-line entry point: estimate the isolation-model parameters.

    Loads one forwarder per ``<forwarder dir>`` argument, builds the
    two-species model with ``--intervals`` sub intervals, and runs
    ``estimate_I`` from these start values:

        T = splittime * mu
        C = 1 / (g * mu * 2 * Ne)
        R = recomb

    ``--tmpfile`` is opened for writing and passed to ``estimate_I`` as
    ``outfile``.  The result is written to ``--out`` as one tab-separated
    line (log likelihood, then the estimates), preceded by a header line
    when ``--header`` is given.

    Exits with status 2 on usage errors: no forwarder directory given, or a
    non-positive ``--mu``, ``--g`` or ``--Ne`` (their product is a divisor).
    """
    usage = ("%prog [options] <forwarder dirs> "
             "This program estimates the parameters of an isolation model "
             "with two species and uniform coalescence/recombination rate.")

    parser = OptionParser(usage=usage, version="%prog 1.0")

    parser.add_option("-o", "--out", dest="outfile", type="string",
                      default="/dev/stdout",
                      help="Output file for the estimate (/dev/stdout)")
    parser.add_option("--tmpfile", dest="tmpfile", type="string",
                      default="/dev/null",
                      help="Log for all points estimated in the optimization "
                           "(/dev/null)")

    optimized_params = [
        ('splittime', 'split time', 1e6),
        ('Ne', 'effective population size', 20e3),
        ('recomb', 'recombination rate', 0.1),
    ]
    for (cname, desc, default) in optimized_params:
        parser.add_option("--%s" % cname, dest=cname, type="float",
                          default=default,
                          help="Initial guess at the %s (%g)" % (desc, default))

    fixed_params = [
        ('mu', 'mutation rate', 1e-9),
        ('g', 'generation time', 20),
    ]
    for (cname, desc, default) in fixed_params:
        parser.add_option("--%s" % cname, dest=cname, type="float",
                          default=default,
                          help="Value of the %s (%g)" % (desc, default))

    parser.add_option("--intervals", dest="intervals", type="int", default=10,
                      help="Number of sub intervals used to discretize the "
                           "time (10)")
    parser.add_option("--header", dest="include_header", action="store_true",
                      default=False,
                      help="Include a header on the output")
    # The flag switches the progress log on; the old help text said
    # "Print help".
    parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                      default=False,
                      help="Print progress messages while running")

    (options, args) = parser.parse_args()
    if len(args) < 1:
        parser.error("Needs at least one preprocessed sequence to work on")

    # Fail fast, before any forwarder is loaded, instead of hitting a
    # ZeroDivisionError when C is computed below.
    for cname in ('mu', 'g', 'Ne'):
        if getattr(options, cname) <= 0:
            parser.error("--%s must be positive" % cname)

    if options.verbose:
        logu = log_unfinished_line
        log = log_finished_line
    else:
        logu = log = lambda s: None

    logu("Loading forwarders...")
    forwarders = [Forwarder.fromDirectory(directory) for directory in args]
    log("done")

    logu("Constructing model...")
    intervals = options.intervals
    modelI = build_epoch_seperated_model(2, [[0,0]], [1,intervals])
    log("done")

    mu = options.mu
    g = options.g
    T = options.splittime * mu
    C = 1.0/(g*mu*2*options.Ne)
    R = options.recomb

    with open(options.tmpfile, 'w') as tmpfile:
        L, est = estimate_I(modelI, forwarders, T, C, R, outfile=tmpfile)

    vals = "\t".join(map(str, est))
    with open(options.outfile, 'w') as outfile:
        if options.include_header:
            outfile.write('logL\tT\tC\tR\n')
        outfile.write("%f\t%s\n" % (L, vals))
def main():
    """Command-line entry point: posterior decoding for the isolation model.

    Parses the command line, builds the two-species model with
    ``--intervals`` sub intervals, and writes the posterior table for the
    single ``<forwarder dir>`` argument to ``--out``, preceded by a commented
    header unless ``--no-header`` is given.

    Exit status:
        2 -- usage error (wrong number of arguments, or -T/-C/-R missing);
             the message goes to stderr so it cannot end up in the output
             stream, which defaults to /dev/stdout
        0 -- the reader closed the output pipe (EPIPE) while decoding

    Any other IOError raised while decoding is re-raised unchanged.
    """
    usage = ("%prog [options] <forwarder dir> "
             "This program calculates the posterior state probabilities of an "
             "isolation model with two species and uniform "
             "coalescence/recombination rate.")

    parser = OptionParser(usage=usage, version="%prog 1.0")

    parser.add_option("-o", "--out", dest="outfile", type="string",
                      default="/dev/stdout",
                      help="Output file for the estimate (/dev/stdout)")
    parser.add_option("-n", "--seq-name", dest="seq_name", type="string",
                      default="sequence",
                      help="Name of the sequence. "
                           "Used in the output for tabix indexing")
    # The help text now names the real default (0) and keeps a space between
    # its two sentences.
    parser.add_option("-p", "--first-pos", dest="first_pos", type="int",
                      default=0,
                      help="Position in the sequence where the first element "
                           "is. Used in the output for tabix indexing. "
                           "Default 0.")

    model_params = [
        ('T', 'split time'),
        ('C', 'Coalescence rate'),
        ('R', 'recombination rate'),
    ]
    for (cname, desc) in model_params:
        parser.add_option("-%s" % cname, dest=cname, type="float",
                          help="Model parameter %s" % desc)

    parser.add_option("--intervals", dest="intervals", type="int", default=10,
                      help="Number of sub intervals used to discretize the "
                           "time (10)")
    parser.add_option("--no-header", dest="no_header", action="store_true",
                      default=False,
                      help="Do not include the header in the output")

    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("Needs a preprocessed sequence to work on")

    # All three model parameters are mandatory; parser.error prints to stderr
    # and exits with status 2, the same status as before.
    for (cname, desc) in model_params:
        if getattr(options, cname) is None:
            parser.error('You must specify the %s.' % desc.lower())

    split_time = options.T
    coal_rate = options.C
    rec_rate = options.R
    intervals = options.intervals

    model = build_epoch_seperated_model(2, [[0,0]], [1,intervals])
    mpi, mT, mE, time_breaks = get_model_matrices(model, coal_rate, rec_rate,
                                                  split_time)
    pi, T, E = zipHMM_prepare_matrices(mpi, mT, mE)

    with open(options.outfile, 'w') as fout:
        if not options.no_header:
            header = [
                '## Posterior probabilities for isolation model.',
                '# intervals = %s' % (intervals,),
                '# T = %s' % (split_time,),
                '# C = %s' % (coal_rate,),
                '# R = %s' % (rec_rate,),
                '# time_breaks = %s' % ' '.join(map(str, time_breaks)),
            ]
            fout.write('\n'.join(header) + '\n')

        try:
            for ziphmmdir in args:
                seqfile = os.path.join(ziphmmdir, 'original_sequence')
                _, pdTable = posteriorDecoding(seqfile, pi, T, E)
                print_matrix(pdTable, options.seq_name, options.first_pos,
                             fout)
        except IOError as e:
            if e.errno == errno.EPIPE:
                # The pipe died, probably because we are in a shell pipe
                # whose reader has stopped reading.
                sys.exit(0)
            # Bare raise keeps the original traceback.
            raise