# Module-level set-up for a three-species ILS model run.
#
# NOTE(review): g, u, i_N, i_r and COL_MAP are used below but are not defined
# in the visible part of the file -- confirm they are set before this point.

# Initial coalescence rate computed from g, u and i_N.
# NOTE(review): presumably g is the generation time, u the mutation rate and
# i_N the initial population size -- confirm against their definitions.
i_c = 1.0 / (2 * g * u * i_N)
# NOTE(review): presumably an initial migration rate; it is not used in the
# visible code.
i_m = 0.1

# Used twice in the last argument of build_epoch_seperated_model below.
estates = 10

name = "ILS.test"

# Input files handed to readObservations, one at a time.
seqs = ["example_data.fa"]

# Creating an isolation model with three species is done by merging species
# pairwise.
modelILS = build_epoch_seperated_model(
    3,
    # we start with a,b,c.
    [
        # merge a and b to '0', c continues as '1'
        [0, 0, 1],
        # and now '0' and '1' are merged to a new '0'
        [0, 0]
    ],
    [1, estates, estates])
nstates = len(modelILS.tree_map)
# Names passed to readObservations for every input file.
names = ["bonobo", "pantro2", "hg18"]
# Tuple of starting values built from u, i_c and i_r.
init_ILS = (1.0e6 * u, 4.5e6 * u, i_c, i_r)

all_obs = []
total_L = 0
for seq in seqs:
    # each sequence creates a new column map extended with any new columns that
    # might be seen.
    # The returned map replaces COL_MAP so the next file is read with it.
    # NOTE(review): all_obs and total_L are initialised above but never
    # updated in the visible loop body, and obs is overwritten on every
    # iteration -- the loop may be truncated in this excerpt.
    obs, colmap = readObservations(seq, names, COL_MAP)
    COL_MAP = colmap
def runILSctmc(seqs, **args):
    """Set up, and optionally fit, the three-species ILS model for `seqs`.

    Required keyword arguments (all read from ``args``):
        NeRef, g, u, r, N1, N12, N123, T12, T123
    The commented-out defaults that used to live in this function described
    g as the generation time (20), u as the mutation rate (1e-9), r as the
    recombination rate (0.4) and the N values as population sizes (100e3).

    Optional keyword arguments:
        hook       -- object whose ``run`` method is called with the model,
                      the column map, the observations and the rate/time
                      lists once the parameters are final.
        estates    -- number of epoch states per epoch (default 4).
        doEstimate -- when true, run estimate_ILS starting from the initial
                      values (default False, i.e. only the initial values
                      are reported).

    Side effect: rebinds the module-level COL_MAP to the mapping of all
    'ACGT' triples.

    Returns a tuple ``(L, estimates, startvalues)``.  ``L`` is None unless
    estimation ran, in which case it is the first value returned by
    estimate_ILS.  ``estimates`` and ``startvalues`` are dicts keyed by
    'i_t1', 'i_t2', 'i_c1', 'i_c2', 'i_c3' and 'i_r'.
    """
    global COL_MAP

    NeRef = args["NeRef"]
    g = args["g"]
    u = args["u"]
    i_r = args["r"]
    i_N1 = args["N1"]
    i_N2 = args["N12"]
    i_N3 = args["N123"]

    # One coalescence rate per population size, two scaled split times.
    i_c1 = 1.0 / (2 * g * u * i_N1)
    i_c2 = 1.0 / (2 * g * u * i_N2)
    i_c3 = 1.0 / (2 * g * u * i_N3)
    i_t1 = u * args["T12"] / g / (2 * NeRef)
    i_t2 = u * args["T123"] / g / (2 * NeRef)

    # Built explicitly instead of eval()-ing local variable names.
    startvalues = {
        'i_t1': i_t1,
        'i_t2': i_t2,
        'i_c1': i_c1,
        'i_c2': i_c2,
        'i_c3': i_c3,
        'i_r': i_r,
    }

    estates = args.get("estates", 4)
    sys.stderr.write('running with %d epoch states\n' % estates)

    modelILS = build_epoch_seperated_model(
        3, [[0, 0, 1], [0, 0]], [1, estates, estates])
    names = ["'1'", "'2'", "'0'"]
    # names = ["'0'", "'2'", "'1'"]

    all_obs = []
    forwarders = []

    COL_MAP = dict((v, i) for i, v in enumerate(product('ACGT', repeat=3)))
    for seq in seqs:
        # NOTE(review): the column map returned by readObservations is
        # discarded, so COL_MAP stays the plain 'ACGT' triple map unless
        # readObservations updates it in place -- confirm this is intended.
        obs, _ = readObservations(seq, names, COL_MAP)
        all_obs.append(obs)

    doEstimate = args.get("doEstimate", False)

    L = None
    if doEstimate:
        for obs in all_obs:
            sys.stdout.write('next obs:\n')
            ffd, foutname = tempfile.mkstemp()
            sys.stdout.write('  temp fd/name: %s %s\n' % (ffd, foutname))
            try:
                with os.fdopen(ffd, 'w') as fout:
                    # Write every observation, space separated, on one line.
                    # The previous loop reused its index after the loop and
                    # so repeated obs[L-2] and never wrote the last element.
                    fout.write(' '.join(map(str, obs)))
                    fout.write('\n')
                sys.stdout.write('  written, creating forwarder\n')
                f = Forwarder.fromSequence(seqFilename=foutname,
                                           alphabetSize=len(COL_MAP))
                sys.stdout.write('  - done.\n')
            finally:
                # Remove the temp file even if the forwarder cannot be
                # built; os.remove avoids spawning a shell for "rm".
                os.remove(foutname)
            forwarders.append(f)

        L, fitted = estimate_ILS(modelILS, forwarders,
                                 i_t1, i_t2, i_c1, i_c2, i_c3, i_r,
                                 outfile="/dev/null")
        i_t1, i_t2, i_c1, i_c2, i_c3, i_r = fitted

    # Equal to startvalues when no estimation was run.
    estimates = {
        'i_t1': i_t1,
        'i_t2': i_t2,
        'i_c1': i_c1,
        'i_c2': i_c2,
        'i_c3': i_c3,
        'i_r': i_r,
    }

    if "hook" in args:
        args["hook"].run(modelILS, COL_MAP, all_obs,
                         [i_c1, i_c2, i_c3], [i_r] * 3, [0] * 3,
                         [0, i_t1, i_t2])

    return L, estimates, startvalues
# Example #3
# 0
def main():
    """Command-line entry point: posterior decoding for the isolation model.

    Parses the options, builds the two-species model with the requested
    number of intervals, prepares the HMM matrices and writes the posterior
    decoding table of the single positional argument (a directory holding
    an 'original_sequence' file) to the output file.

    Exits with status 2 through parser.error (message and usage on stderr)
    when the positional argument or one of -T/-C/-R is missing, and with
    status 0 if the output pipe is closed by the reader.
    """
    usage = """%prog [options] <forwarder dir>

This program calculates the posterior state probabilities of an isolation
model with two species and uniform coalescence/recombination rate."""

    parser = OptionParser(usage=usage, version="%prog 1.0")

    parser.add_option("-o",
                      "--out",
                      dest="outfile",
                      type="string",
                      default="/dev/stdout",
                      help="Output file for the estimate (/dev/stdout)")
    parser.add_option(
        "-n",
        "--seq-name",
        dest="seq_name",
        type="string",
        default="sequence",
        help="Name of the sequence. Used in the output for tabix indexing")
    # The help text used to claim "Default 1" although the default is 0,
    # and its two halves were joined without a space.
    parser.add_option(
        "-p",
        "--first-pos",
        dest="first_pos",
        type="int",
        default=0,
        help="Position in the sequence where the first element is. "
        "Used in the output for tabix indexing (0)")

    model_params = [
        ('T', 'split time'),
        ('C', 'Coalescence rate'),
        ('R', 'recombination rate'),
    ]
    for (cname, desc) in model_params:
        parser.add_option("-%s" % cname,
                          dest=cname,
                          type="float",
                          help="Model parameter %s" % desc)
    parser.add_option(
        "--intervals",
        dest="intervals",
        type="int",
        default=10,
        help="Number of sub intervals used to discretize the time (10)")
    parser.add_option("--no-header",
                      dest="no_header",
                      action="store_true",
                      default=False,
                      help="Do not include the header in the output")

    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("Needs a preprocessed sequence to work on")

    split_time = options.T
    coal_rate = options.C
    rec_rate = options.R

    # Report missing parameters on stderr (exit status 2, as before) so the
    # message cannot end up in the data stream when output goes to stdout.
    required = [
        (split_time, 'You must specify the split time.'),
        (coal_rate, 'You must specify the coalescence rate.'),
        (rec_rate, 'You must specify the recombination rate.'),
    ]
    for value, message in required:
        if value is None:
            parser.error(message)

    intervals = options.intervals
    model = build_epoch_seperated_model(2, [[0, 0]], [1, intervals])

    mpi, mT, mE, time_breaks = get_model_matrices(model, coal_rate, rec_rate,
                                                  split_time)
    pi, T, E = zipHMM_prepare_matrices(mpi, mT, mE)

    with open(options.outfile, 'w') as fout:

        if not options.no_header:
            fout.write('## Posterior probabilities for isolation model.\n')
            fout.write('# intervals = %s\n' % intervals)
            fout.write('# T = %s\n' % split_time)
            fout.write('# C = %s\n' % coal_rate)
            fout.write('# R = %s\n' % rec_rate)
            fout.write('# time_breaks = %s\n' %
                       ' '.join(map(str, time_breaks)))

        try:
            for ziphmmdir in args:
                seqfile = os.path.join(ziphmmdir, 'original_sequence')
                _, pdTable = posteriorDecoding(seqfile, pi, T, E)
                print_matrix(pdTable, options.seq_name, options.first_pos,
                             fout)

        except IOError as e:
            if e.errno != errno.EPIPE:
                # Bare raise keeps the original traceback.
                raise
            # the pipe died, this is probably because it is in a shell pipe
            # where we have stopped reading
            sys.exit(0)
# Module-level set-up for a three-species ILS model run.
#
# NOTE(review): g, u, i_N, i_r and COL_MAP are used below but are not defined
# in the visible part of the file -- confirm they are set before this point.

# Initial coalescence rate computed from g, u and i_N.
# NOTE(review): presumably g is the generation time, u the mutation rate and
# i_N the initial population size -- confirm against their definitions.
i_c = 1.0/(2*g*u*i_N)
# NOTE(review): presumably an initial migration rate; it is not used in the
# visible code.
i_m = 0.1

# Used twice in the last argument of build_epoch_seperated_model below.
estates = 10

name = "ILS.test"

# Input files handed to readObservations, one at a time.
seqs = ["example_data.fa"]

# Creating an isolation model with three species is done by merging species
# pairwise.
modelILS = build_epoch_seperated_model(
        3,
        # we start with a,b,c.
        [
            # merge a and b to '0', c continues as '1'
            [0,0,1],
            # and now '0' and '1' are merged to a new '0'
            [0,0]
        ],
        [1,estates,estates])
nstates = len(modelILS.tree_map)
# Names passed to readObservations for every input file.
names = ["bonobo", "pantro2", "hg18"]
# Tuple of starting values built from u, i_c and i_r.
init_ILS = (1.0e6*u, 4.5e6*u, i_c, i_r)

all_obs = []
total_L = 0
for seq in seqs:
    # each sequence creates a new column map extended with any new columns that
    # might be seen.
    # The returned map replaces COL_MAP so the next file is read with it.
    # NOTE(review): all_obs and total_L are initialised above but never
    # updated in the visible loop body, and obs is overwritten on every
    # iteration -- the loop may be truncated in this excerpt.
    obs, colmap = readObservations(seq, names, COL_MAP)
    COL_MAP = colmap
def main():
    """Command-line entry point: fit the two-species isolation model.

    Loads one Forwarder per positional argument, builds the model with the
    requested number of intervals, turns the command-line guesses into the
    starting values T, C and R and hands them to estimate_I.  The value
    returned as L and the estimates are written as one tab-separated line
    to the output file, optionally preceded by a header line.

    Exits with status 2 through parser.error when no positional argument
    is given.
    """
    usage = """%prog [options] <forwarder dirs>

This program estimates the parameters of an isolation model with two species
and uniform coalescence/recombination rate."""

    parser = OptionParser(usage=usage, version="%prog 1.0")

    parser.add_option("-o",
                      "--out",
                      dest="outfile",
                      type="string",
                      default="/dev/stdout",
                      help="Output file for the estimate (/dev/stdout)")
    parser.add_option(
        "--tmpfile",
        dest="tmpfile",
        type="string",
        default="/dev/null",
        help="Log for all points estimated in the optimization (/dev/null)")

    # Starting points for the optimisation.
    optimized_params = [
        ('splittime', 'split time', 1e6),
        ('Ne', 'effective population size', 20e3),
        ('recomb', 'recombination rate', 0.1),
    ]
    for (cname, desc, default) in optimized_params:
        parser.add_option("--%s" % cname,
                          dest=cname,
                          type="float",
                          default=default,
                          help="Initial guess at the %s (%g)" %
                          (desc, default))

    # Values only used to rescale the starting points below.
    fixed_params = [
        ('mu', 'mutation rate', 1e-9),
        ('g', 'generation time', 20),
    ]
    for (cname, desc, default) in fixed_params:
        parser.add_option("--%s" % cname,
                          dest=cname,
                          type="float",
                          default=default,
                          help="Value of the %s (%g)" % (desc, default))
    parser.add_option(
        "--intervals",
        dest="intervals",
        type="int",
        default=10,
        help="Number of sub intervals used to discretize the time (10)")
    parser.add_option("--header",
                      dest="include_header",
                      action="store_true",
                      default=False,
                      help="Include a header on the output")
    # The help text used to read "Print help", which is what -h does.
    parser.add_option("-v",
                      "--verbose",
                      dest="verbose",
                      action="store_true",
                      default=False,
                      help="Log progress messages")

    (options, args) = parser.parse_args()
    if len(args) < 1:
        parser.error("Needs at least one preprocessed sequence to work on")

    if options.verbose:
        logu = log_unfinished_line
        log = log_finished_line
    else:
        # Silent stand-ins so the calls below need no verbosity checks.
        log = lambda s: None
        logu = lambda s: None

    logu("Loading forwarders...")
    # 'dirname' rather than 'dir' so the builtin is not shadowed.
    forwarders = [Forwarder.fromDirectory(dirname) for dirname in args]
    log("done")

    logu("Constructing model...")
    intervals = options.intervals
    modelI = build_epoch_seperated_model(2, [[0, 0]], [1, intervals])
    log("done")

    mu = options.mu
    g = options.g
    T = options.splittime * mu
    C = 1.0 / (g * mu * 2 * options.Ne)
    R = options.recomb

    with open(options.tmpfile, 'w') as tmpfile:
        L, est = estimate_I(modelI, forwarders, T, C, R, outfile=tmpfile)

    vals = "\t".join(map(str, est))
    with open(options.outfile, 'w') as outfile:
        if options.include_header:
            outfile.write('logL\tT\tC\tR\n')
        outfile.write("%f\t%s\n" % (L, vals))
def main():
    """Command-line entry point: fit the two-species isolation model.

    Loads one Forwarder per positional argument, builds the model with the
    requested number of intervals, turns the command-line guesses into the
    starting values T, C and R and hands them to estimate_I.  The value
    returned as L and the estimates are written as one tab-separated line
    to the output file, optionally preceded by a header line.

    Exits with status 2 through parser.error when no positional argument
    is given.
    """
    usage="""%prog [options] <forwarder dirs>

This program estimates the parameters of an isolation model with two species
and uniform coalescence/recombination rate."""

    parser = OptionParser(usage=usage, version="%prog 1.0")

    parser.add_option("-o", "--out",
                      dest="outfile",
                      type="string",
                      default="/dev/stdout",
                      help="Output file for the estimate (/dev/stdout)")
    parser.add_option("--tmpfile",
                      dest="tmpfile",
                      type="string",
                      default="/dev/null",
                      help="Log for all points estimated in the optimization (/dev/null)")

    # Starting points for the optimisation.
    optimized_params = [
            ('splittime', 'split time', 1e6),
            ('Ne', 'effective population size', 20e3),
            ('recomb', 'recombination rate', 0.1),
            ]
    for (cname, desc, default) in optimized_params:
        parser.add_option("--%s" % cname,
                          dest=cname,
                          type="float",
                          default=default,
                          help="Initial guess at the %s (%g)" % (desc, default))

    # Values only used to rescale the starting points below.
    fixed_params = [
            ('mu', 'mutation rate', 1e-9),
            ('g', 'generation time', 20),
            ]
    for (cname, desc, default) in fixed_params:
        parser.add_option("--%s" % cname,
                          dest=cname,
                          type="float",
                          default=default,
                          help="Value of the %s (%g)" % (desc, default))
    parser.add_option("--intervals",
                      dest="intervals",
                      type="int",
                      default=10,
                      help="Number of sub intervals used to discretize the time (10)")
    parser.add_option("--header",
                      dest="include_header",
                      action="store_true",
                      default=False,
                      help="Include a header on the output")
    # The help text used to read "Print help", which is what -h does.
    parser.add_option("-v", "--verbose",
                      dest="verbose",
                      action="store_true",
                      default=False,
                      help="Log progress messages")

    (options, args) = parser.parse_args()
    if len(args) < 1:
        parser.error("Needs at least one preprocessed sequence to work on")

    if options.verbose:
        logu = log_unfinished_line
        log = log_finished_line
    else:
        # Silent stand-ins so the calls below need no verbosity checks.
        log = lambda s: None
        logu = lambda s: None

    logu("Loading forwarders...")
    # 'dirname' rather than 'dir' so the builtin is not shadowed.
    forwarders = [Forwarder.fromDirectory(dirname) for dirname in args]
    log("done")

    logu("Constructing model...")
    intervals = options.intervals
    modelI = build_epoch_seperated_model(2, [[0,0]], [1,intervals])
    log("done")

    mu = options.mu
    g = options.g
    T = options.splittime * mu
    C = 1.0/(g*mu*2*options.Ne)
    R = options.recomb

    with open(options.tmpfile, 'w') as tmpfile:
        L, est = estimate_I(modelI, forwarders, T, C, R, outfile=tmpfile)

    vals = "\t".join(map(str,est))
    with open(options.outfile, 'w') as outfile:
        if options.include_header:
            outfile.write('logL\tT\tC\tR\n')
        outfile.write("%f\t%s\n" % (L,vals))
def main():
    """Command-line entry point: posterior decoding for the isolation model.

    Parses the options, builds the two-species model with the requested
    number of intervals, prepares the HMM matrices and writes the posterior
    decoding table of the single positional argument (a directory holding
    an 'original_sequence' file) to the output file.

    Exits with status 2 through parser.error (message and usage on stderr)
    when the positional argument or one of -T/-C/-R is missing, and with
    status 0 if the output pipe is closed by the reader.
    """
    usage="""%prog [options] <forwarder dir>

This program calculates the posterior state probabilities of an isolation
model with two species and uniform coalescence/recombination rate."""

    parser = OptionParser(usage=usage, version="%prog 1.0")

    parser.add_option("-o", "--out",
                      dest="outfile",
                      type="string",
                      default="/dev/stdout",
                      help="Output file for the estimate (/dev/stdout)")

    parser.add_option("-n", "--seq-name",
                      dest="seq_name",
                      type="string",
                      default="sequence",
                      help="Name of the sequence. Used in the output for tabix indexing")
    # The help text used to claim "Default 1" although the default is 0,
    # and its two halves were joined without a space.
    parser.add_option("-p", "--first-pos",
                      dest="first_pos",
                      type="int",
                      default=0,
                      help="Position in the sequence where the first element is. "
                        "Used in the output for tabix indexing (0)")

    model_params = [
            ('T', 'split time'),
            ('C', 'Coalescence rate'),
            ('R', 'recombination rate'),
            ]
    for (cname, desc) in model_params:
        parser.add_option("-%s" % cname,
                          dest=cname,
                          type="float",
                          help="Model parameter %s" % desc)
    parser.add_option("--intervals",
                      dest="intervals",
                      type="int",
                      default=10,
                      help="Number of sub intervals used to discretize the time (10)")
    parser.add_option("--no-header",
                      dest="no_header",
                      action="store_true",
                      default=False,
                      help="Do not include the header in the output")

    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("Needs a preprocessed sequence to work on")

    split_time = options.T
    coal_rate  = options.C
    rec_rate   = options.R

    # Report missing parameters on stderr (exit status 2, as before) so the
    # message cannot end up in the data stream when output goes to stdout.
    required = [
            (split_time, 'You must specify the split time.'),
            (coal_rate, 'You must specify the coalescence rate.'),
            (rec_rate, 'You must specify the recombination rate.'),
            ]
    for value, message in required:
        if value is None:
            parser.error(message)

    intervals = options.intervals
    model = build_epoch_seperated_model(2, [[0,0]], [1,intervals])

    mpi, mT, mE, time_breaks = get_model_matrices(model, coal_rate, rec_rate, split_time)
    pi,   T,  E = zipHMM_prepare_matrices(mpi, mT, mE)

    with open(options.outfile,'w') as fout:

        if not options.no_header:
            fout.write('## Posterior probabilities for isolation model.\n')
            fout.write('# intervals = %s\n' % intervals)
            fout.write('# T = %s\n' % split_time)
            fout.write('# C = %s\n' % coal_rate)
            fout.write('# R = %s\n' % rec_rate)
            fout.write('# time_breaks = %s\n' % ' '.join(map(str, time_breaks)))

        try:
            for ziphmmdir in args:
                seqfile = os.path.join(ziphmmdir, 'original_sequence')
                _, pdTable = posteriorDecoding(seqfile, pi, T, E)
                print_matrix(pdTable, options.seq_name, options.first_pos, fout)

        except IOError as e:
            if e.errno != errno.EPIPE:
                # Bare raise keeps the original traceback.
                raise
            # the pipe died, this is probably because it is in a shell pipe
            # where we have stopped reading
            sys.exit(0)