def _execute_model_iim(optimiser):
    """
    Execute an experiment for the isolation-with-migration model using the specified optimiser.
    :param optimiser: The optimiser to use while executing the model.
    :return: None
    """
    from IMCoalHMM.hmm import Forwarder
    from IMCoalHMM.isolation_with_migration_model import IsolationMigrationModel
    from IMCoalHMM.likelihood import Likelihood

    no_ancestral_states = _config.try_int('model.ancestral_states', 10)
    no_migration_states = _config.try_int('model.migration_states', 10)

    transformer = _Transformer()
    transformer.add('isolation_time',
                    _config.try_float('model.isolation_time.min', 0.0),
                    _config.try_float('model.isolation_time.max', 0.002))
    transformer.add('mig_time', _config.try_float('model.mig_time.min', 0.0),
                    _config.try_float('model.mig_time.max', 0.016))
    transformer.add('coal_rate', _config.try_float('model.coal_rate.min', 0.0),
                    _config.try_float('model.coal_rate.max', 2000.0))
    transformer.add('recomb_rate',
                    _config.try_float('model.recomb_rate.min', 0.0),
                    _config.try_float('model.recomb_rate.max', 0.8))
    transformer.add('mig_rate', _config.try_float('model.mig_rate.min', 0.0),
                    _config.try_float('model.mig_rate.max', 500.0))

    _log_header()

    forwarders = [Forwarder(arg, NSYM=3) for arg in _alignments]
    model = IsolationMigrationModel(no_migration_states, no_ancestral_states)
    log_likelihood = Likelihood(model, forwarders)

    def fitness_function(parameters):
        transformed_parameters = transformer.transform(parameters)
        return log_likelihood(numpy.array(transformed_parameters))

    def log_function(context):
        _process_logged_context(context, transformer)

    optimiser.log = log_function
    mle_context = optimiser.maximise(fitness_function, len(transformer.names))
    mle_parameters, mle_log_likelihood = _process_final_context(
        mle_context, transformer)

    mle_isolation_time, mle_mig_time, mle_coal_rate, mle_recomb_rate, mle_mig_rate = mle_parameters
    mle_theta = 2.0 / mle_coal_rate

    _log_comment('')
    _log_comment('mle_isolation_time = {0}'.format(mle_isolation_time))
    _log_comment('mle_mig_time       = {0}'.format(mle_mig_time))
    _log_comment('mle_coal_rate      = {0}'.format(mle_coal_rate))
    _log_comment('mle_recomb_rate    = {0}'.format(mle_recomb_rate))
    _log_comment('mle_mig_rate       = {0}'.format(mle_mig_rate))
    _log_comment('mle_theta          = {0}'.format(mle_theta))
    _log_comment('mle_log_likelihood = {0}'.format(mle_log_likelihood))
def _execute_model_iim_epochs(optimiser):
    """
    Execute an experiment for the isolation-with-migration-with-epochs model using the specified optimiser.
    :param optimiser: The optimiser to use while executing the model.
    :return: None
    """
    from IMCoalHMM.hmm import Forwarder
    from IMCoalHMM.isolation_with_migration_model_epochs import IsolationMigrationEpochsModel
    from IMCoalHMM.likelihood import Likelihood

    epoch_factor = _config.try_int('model.epoch_factor', 1)
    coal_count = 1 + (2 * epoch_factor)

    no_ancestral_states = _config.try_int('model.ancestral_states', 10)
    no_migration_states = _config.try_int('model.migration_states', 10)

    transformer = _Transformer()
    transformer.add('isolation_time',
                    _config.try_float('model.isolation_time.min', 0.0),
                    _config.try_float('model.isolation_time.max', 0.002))
    transformer.add('mig_time', _config.try_float('model.mig_time.min', 0.0),
                    _config.try_float('model.mig_time.max', 0.016))
    transformer.add('recomb_rate',
                    _config.try_float('model.recomb_rate.min', 0.0),
                    _config.try_float('model.recomb_rate.max', 0.8))
    for i in xrange(coal_count):
        key = 'coal_rate_{0}'.format(i + 1)
        transformer.add(key, _config.try_float('model.{0}.min'.format(key),
                                               0.0),
                        _config.try_float('model.{0}.max'.format(key), 2000.0))
    for i in xrange(epoch_factor):
        key = 'mig_rate_{0}'.format(i + 1)
        transformer.add(key, _config.try_float('model.{0}.min'.format(key),
                                               0.0),
                        _config.try_float('model.{0}.max'.format(key), 500.0))

    _log_header()

    forwarders = [Forwarder(arg, NSYM=3) for arg in _alignments]
    model = IsolationMigrationEpochsModel(epoch_factor, no_migration_states,
                                          no_ancestral_states)
    log_likelihood = Likelihood(model, forwarders)

    def fitness_function(parameters):
        transformed_parameters = transformer.transform(parameters)
        return log_likelihood(numpy.array(transformed_parameters))

    def log_function(context):
        _process_logged_context(context, transformer)

    optimiser.log = log_function
    mle_context = optimiser.maximise(fitness_function, len(transformer.names))
    mle_parameters, mle_log_likelihood = _process_final_context(
        mle_context, transformer)

    mle_isolation_time = mle_parameters[0]
    mle_mig_time = mle_parameters[1]
    mle_recomb_rate = mle_parameters[2]
    mle_coal_rates = mle_parameters[3:coal_count + 3]
    mle_mig_rates = mle_parameters[coal_count + 3:]

    _log_comment('')
    _log_comment('mle_isolation_time = {0}'.format(mle_isolation_time))
    _log_comment('mle_mig_time       = {0}'.format(mle_mig_time))
    _log_comment('mle_recomb_rate    = {0}'.format(mle_recomb_rate))
    for i, coal_rate in enumerate(mle_coal_rates):
        _log_comment('mle_coal_rate_{0}    = {1}'.format(i + 1, coal_rate))
    for i, mig_rate in enumerate(mle_mig_rates):
        _log_comment('mle_mig_rate_{0}     = {1}'.format(i + 1, mig_rate))
    _log_comment('mle_log_likelihood = {0}'.format(mle_log_likelihood))
def main():
    """
    Run the main script.
    """
    usage = """%(prog)s [options] <forwarder dirs>

This program estimates the parameters of an isolation model with an initial migration period with two species
and uniform coalescence and recombination rates."""

    parser = ArgumentParser(usage=usage, version="%(prog)s 1.2")

    parser.add_argument("--header",
                        action="store_true",
                        default=False,
                        help="Include a header on the output")
    parser.add_argument("-o",
                        "--outfile",
                        type=str,
                        default="/dev/stdout",
                        help="Output file for the estimate (/dev/stdout)")

    parser.add_argument(
        "--logfile",
        type=str,
        default=None,
        help="Log for all points estimated in the optimization")

    parser.add_argument(
        "--ancestral-states",
        type=int,
        default=10,
        help=
        "Number of intervals used to discretize the time in the ancestral population (10)"
    )
    parser.add_argument(
        "--migration-states",
        type=int,
        default=10,
        help=
        "Number of intervals used to discretize the time in the migration period (10)"
    )

    parser.add_argument(
        "--optimizer",
        type=str,
        default="Nelder-Mead",
        help=
        "Optimization algorithm to use for maximizing the likelihood (Nealder-Mead)",
        choices=['Nelder-Mead', 'Powell', 'L-BFGS-B', 'TNC'])

    optimized_params = [
        ('isolation-period', 'time where the populations have been isolated',
         1e6 / 1e9),
        ('migration-period',
         'time period where the populations exchanged genes', 1e6 / 1e9),
        ('theta', 'effective population size in 4Ne substitutions', 1e6 / 1e9),
        ('rho', 'recombination rate in substitutions', 0.4),
        ('migration-rate',
         'migration rate in number of migrations per substitution', 200.0)
    ]

    for parameter_name, description, default in optimized_params:
        parser.add_argument("--%s" % parameter_name,
                            type=float,
                            default=default,
                            help="Initial guess at the %s (%g)" %
                            (description, default))

    parser.add_argument('alignments',
                        nargs='+',
                        help='Alignments in ZipHMM format')

    options = parser.parse_args()
    if len(options.alignments) < 1:
        parser.error("Input alignment not provided!")

    # get options
    no_migration_states = options.migration_states
    no_ancestral_states = options.ancestral_states
    theta = options.theta
    rho = options.rho

    forwarders = [Forwarder(arg, NSYM=3) for arg in options.alignments]

    init_isolation_time = options.isolation_period
    init_migration_time = options.migration_period
    init_coal = 1 / (theta / 2)
    init_recomb = rho
    init_migration = options.migration_rate

    log_likelihood = Likelihood(
        IsolationMigrationModel(no_migration_states, no_ancestral_states),
        forwarders)
    initial_parameters = (init_isolation_time, init_migration_time, init_coal,
                          init_recomb, init_migration)

    if options.logfile:
        with open(options.logfile, 'w') as logfile:

            if options.header:
                print >> logfile, '\t'.join([
                    'isolation.period', 'migration.period', 'theta', 'rho',
                    'migration'
                ])

            mle_parameters = \
                maximum_likelihood_estimate(log_likelihood, initial_parameters,
                                            log_file=logfile, optimizer_method=options.optimizer,
                                            log_param_transform=transform)
    else:
        mle_parameters = \
            maximum_likelihood_estimate(log_likelihood, initial_parameters,
                                        optimizer_method=options.optimizer)

    max_log_likelihood = log_likelihood(mle_parameters)
    with open(options.outfile, 'w') as outfile:
        if options.header:
            print >> outfile, '\t'.join([
                'isolation.period', 'migration.period', 'theta', 'rho',
                'migration', 'log.likelihood'
            ])
        print >> outfile, '\t'.join(
            map(str,
                transform(mle_parameters) + (max_log_likelihood, )))
Exemple #4
0
 def _set_chain(self):
     forwarders = [Forwarder(arg, NSYM=3) for arg in self.input_files]
     log_likelihood = Likelihood(self.model, forwarders)
     self.chain = MCMC(priors=self.priors,
                       log_likelihood=log_likelihood,
                       thinning=self.thinning)
def main():
    """
    Run the main script.
    """
    usage = """%(prog)s [options] alignments...

This program estimates the parameters of an isolation model with an initial migration period with two species
and uniform coalescence and recombination rates."""

    parser = ArgumentParser(usage=usage, version="%(prog)s 1.7")

    parser.add_argument("-o",
                        "--outfile",
                        type=str,
                        default="/dev/stdout",
                        help="Output file for the estimate (/dev/stdout)")

    parser.add_argument("--logfile",
                        type=str,
                        default=None,
                        help="Log for sampled points in all chains for the MCMCMC during the run." \
                             "This parameter is only valid when running --mc3.")

    parser.add_argument(
        "--ancestral-states",
        type=int,
        default=10,
        help=
        "Number of intervals used to discretize the time in the ancestral population (10)"
    )
    parser.add_argument(
        "--migration-states",
        type=int,
        default=10,
        help=
        "Number of intervals used to discretize the time in the migration period (10)"
    )

    parser.add_argument("-n",
                        "--samples",
                        type=int,
                        default=500,
                        help="Number of samples to draw (500)")

    parser.add_argument("--mc3",
                        help="Run a Metropolis-Coupled MCMC",
                        action="store_true")
    parser.add_argument("--mc3-chains",
                        type=int,
                        default=3,
                        help="Number of MCMCMC chains")
    parser.add_argument("--temperature-scale", type=float, default=10.0,
                        help="The scale by which higher chains will have added temperature." \
                             "Chain i will have temperature scale*i.")
    parser.add_argument("-k",
                        "--thinning",
                        type=int,
                        default=100,
                        help="Number of MCMC steps between samples (100)")

    parser.add_argument("--sample-priors",
                        help="Sample independently from the priors",
                        action="store_true")
    parser.add_argument("--mcmc-priors",
                        help="Run the MCMC but use the prior as the posterior",
                        action="store_true")

    meta_params = [
        ('isolation-period', 'time where the populations have been isolated',
         1e6 / 1e9),
        ('migration-period',
         'time period where the populations exchanged genes', 1e6 / 1e9),
        ('theta', 'effective population size in 4Ne substitutions', 1e6 / 1e9),
        ('rho', 'recombination rate in substitutions', 0.4),
        ('migration-rate',
         'migration rate in number of migrations per substitution', 200.0)
    ]

    for parameter_name, description, default in meta_params:
        parser.add_argument("--%s" % parameter_name,
                            type=float,
                            default=default,
                            help="Meta-parameter mean of the %s (%g)" %
                            (description, default))

    parser.add_argument('alignments',
                        nargs='*',
                        help='Alignments in ZipHMM format')

    options = parser.parse_args()
    if len(options.alignments) < 1 and not (options.sample_priors
                                            or options.mcmc_priors):
        parser.error("Input alignment not provided!")

    if len(options.alignments) > 0 and options.mcmc_priors:
        parser.error(
            "You should not provide alignments to the command line when sampling from the prior"
        )

    if options.logfile and not options.mc3:
        parser.error(
            "the --logfile option is only valid together with the --mc3 option."
        )

    # Specify priors and proposal distributions...
    # I am sampling in log-space to make it easier to make a random walk
    isolation_period_prior = LogNormPrior(log(options.isolation_period))
    migration_period_prior = LogNormPrior(log(options.migration_period))
    coal_prior = LogNormPrior(log(1 / (options.theta / 2)))
    rho_prior = LogNormPrior(log(options.rho))
    migration_rate_prior = ExpLogNormPrior(options.migration_rate)
    priors = [
        isolation_period_prior, migration_period_prior, coal_prior, rho_prior,
        migration_rate_prior
    ]

    # If we only want to sample from the priors we simply collect random points from these
    if options.sample_priors:
        with open(options.outfile, 'w') as outfile:
            print >> outfile, '\t'.join([
                'isolation.period', 'migration.period', 'theta', 'rho',
                'migration', 'log.prior', 'log.likelihood', 'log.posterior'
            ])
            for _ in xrange(options.samples):
                params = [prior.sample() for prior in priors]
                prior = sum(
                    log(prior.pdf(p)) for prior, p in zip(priors, params))
                likelihood = 0.0
                posterior = prior + likelihood
                print >> outfile, '\t'.join(
                    map(str,
                        transform(params) + (prior, likelihood, posterior)))
                outfile.flush()

        sys.exit(0)  # Successful termination

    if options.mc3:
        mcmc = MC3(priors,
                   input_files=options.alignments,
                   model=IsolationMigrationModel(options.migration_states,
                                                 options.ancestral_states),
                   thinning=options.thinning,
                   no_chains=options.mc3_chains,
                   switching=options.thinning / 10,
                   temperature_scale=options.temperature_scale)
    else:
        forwarders = [Forwarder(arg, NSYM=3) for arg in options.alignments]
        log_likelihood = Likelihood(
            IsolationMigrationModel(options.migration_states,
                                    options.ancestral_states), forwarders)
        mcmc = MCMC(priors, log_likelihood, thinning=options.thinning)

    with open(options.outfile, 'w') as outfile:
        print >> outfile, '\t'.join([
            'isolation.period', 'migration.period', 'theta', 'rho',
            'migration', 'log.prior', 'log.likelihood', 'log.posterior'
        ])

        if options.logfile:
            with open(options.logfile, 'w') as logfile:
                print >> logfile, '\t'.join([
                    'chain', 'isolation.period', 'migration.period', 'theta',
                    'rho', 'migration', 'log.prior', 'log.likelihood',
                    'log.posterior'
                ])

                for _ in xrange(options.samples):
                    # Write main chain to output
                    params, prior, likelihood, posterior = mcmc.sample()
                    print >> outfile, '\t'.join(
                        map(str,
                            transform(params) +
                            (prior, likelihood, posterior)))
                    outfile.flush()

                    # All chains written to the log
                    for chain_no, chain in enumerate(mcmc.chains):
                        params = chain.current_theta
                        prior = chain.current_prior
                        likelihood = chain.current_likelihood
                        posterior = chain.current_posterior
                        print >> logfile, '\t'.join(
                            map(str, (chain_no, ) + transform(params) +
                                (prior, likelihood, posterior)))
                    logfile.flush()

        else:
            for _ in xrange(options.samples):
                params, prior, likelihood, posterior = mcmc.sample()
                print >> outfile, '\t'.join(
                    map(str,
                        transform(params) + (prior, likelihood, posterior)))
                outfile.flush()

    if options.mc3:
        mcmc.terminate()
Exemple #6
0
def main():
    usage = """%(prog)s [options] <input> <input format> <output dir>

This program reads in an input sequence in any format supported by BioPython
and writes out a preprocessed file ready for use with CoalHMM (python module ziphmm).
Also supports gzipped input files, if the name ends with `.gz`.

Assumption #1: Either the file is a pairwise alignment, or you have provided
exactly two names to the `--names` option.

Assumption #2: The file uses a simple ACGT format (and N/-). Anything else will
be interpreted as N and a warning will be given with all unknown symbols.

Warning: This program uses SeqIO.to_dict to read in the entire alignment, you
may want to split the alignment first if it's very large.
"""

    parser = ArgumentParser(usage=usage, version="%(prog)s 1.1")

    parser.add_argument("--names",
                        type=str,
                        default=None,
                        help="A comma-separated list of names to use from the source file")
    parser.add_argument("--verbose",
                        action="store_true",
                        default=False,
                        help="Print status information during processing")

    # positional arguments
    parser.add_argument("in_filename", type=str, help="Input file")
    parser.add_argument("in_format", type=str, help="The file format for the input")
    parser.add_argument("output_filename", type=str, help="Where to write the processed alignment")

    options = parser.parse_args()

    if not os.path.exists(options.in_filename):
        print 'The input file', options.in_filename, 'does not exists.'
        sys.exit(1)

    if os.path.exists(options.output_filename):
        print 'The output file', options.output_filename, 'already exists.'
        print 'If you want to replace it, please explicitly remove the current'
        print 'version first.'
        sys.exit(1)

    if options.in_filename.endswith('.gz'):
        if options.verbose:
            print "Assuming '%s' is a gzipped file." % options.in_filename
        inf = gzip.open(options.in_filename)
    else:
        inf = open(options.in_filename)

    if options.verbose:
        print "Loading data...",
        sys.stdout.flush()
    alignments = SeqIO.to_dict(SeqIO.parse(inf, options.in_format))
    if options.verbose:
        print "done"

    clean = set('ACGT')

    if options.names:
        names = options.names.split(',')
    else:
        names = list(alignments.keys())

    if len(names) == 2:
        # PAIRWISE ALIGNMENT ###########################################################################
        if options.verbose:
            print "Assuming pairwise alignment between '%s' and '%s'" % (names[0], names[1])
        srcs = [alignments[name].seq for name in names]

        sequence1 = srcs[0]
        sequence2 = srcs[1]
        assert len(sequence1) == len(sequence2)
        sequence_length = len(sequence1)
        
        if options.verbose:
            print "Writing file readable by ziphmm to '%s'..." % options.output_filename,
            sys.stdout.flush()

        seen = set()
        with open(options.output_filename, 'w', 64 * 1024) as f:
            for i in xrange(sequence_length):
                s1, s2 = sequence1[i].upper(), sequence2[i].upper()
                seen.add(s1)
                seen.add(s2)

                if s1 not in clean or s2 not in clean:
                    print >> f, 2,

                elif s1 == s2:
                    print >> f, 0,
                else:
                    print >> f, 1,

        if options.verbose:
            print "done"
        if len(seen - set('ACGTN-')) > 1:
            print >> sys.stderr, "I didn't understand the following symbols form the input sequence: %s" % (
                ''.join(list(seen - set('ACGTN-'))))


    elif len(names) == 3:
        # TRIPLET ALIGNMENT ###########################################################################
        if options.verbose:
            print "Assuming triplet alignment between '%s', '%s', and '%s'" % (names[0], names[1], names[2])
        srcs = [alignments[name].seq for name in names]

        sequence1 = srcs[0]
        sequence2 = srcs[1]
        sequence3 = srcs[2]
        assert len(sequence1) == len(sequence2)
        assert len(sequence1) == len(sequence3)

        sequence_length = len(sequence1)
        outname = options.output_filename

        if options.verbose:
            print "Writing file readable by ziphmm to '%s'..." % outname,
            sys.stdout.flush()

        seen = set()
        nuc_map = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        with open(outname, 'w', 64 * 1024) as f:
            for i in xrange(sequence_length):
                s1, s2, s3 = sequence1[i].upper(), sequence2[i].upper(), sequence3[i].upper()
                seen.add(s1)
                seen.add(s2)
                seen.add(s3)

                if s1 in clean and s2 in clean and s3 in clean:
                    i1, i2, i3 = nuc_map[s1], nuc_map[s2], nuc_map[s3]
                    print >> f, i1 + 4*i2 + 16*i3,
                else:
                    print >> f, 64,

        if options.verbose:
            print "done"
        if len(seen - set('ACGTN-')) > 1:
            print >> sys.stderr, "I didn't understand the following symbols form the input sequence: %s" % (
                ''.join(list(seen - set('ACGTN-'))))


    elif len(names) == 4:
        # QUARTET ALIGNMENT ###########################################################################
        if options.verbose:
            print "Assuming quartet alignment between '%s', '%s', '%s', and '%s'" % (names[0], names[1], names[2], names[3])
        srcs = [alignments[name].seq for name in names]

        sequence1 = srcs[0]
        sequence2 = srcs[1]
        sequence3 = srcs[2]
        sequence4 = srcs[3]
        assert len(sequence1) == len(sequence2)
        assert len(sequence1) == len(sequence3)
        assert len(sequence1) == len(sequence4)

        sequence_length = len(sequence1)
        outname = options.output_filename

        if options.verbose:
            print "Writing file readable by ziphmm to '%s'..." % outname,
            sys.stdout.flush()

        seen = set()
        nuc_map = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        with open(outname, 'w', 64 * 1024) as f:
            for i in xrange(sequence_length):
                s1, s2, s3, s4 = sequence1[i].upper(), sequence2[i].upper(), sequence3[i].upper(), sequence4[i].upper()
                seen.add(s1)
                seen.add(s2)
                seen.add(s3)
                seen.add(s4)

                if s1 in clean and s2 in clean and s3 in clean and s4 in clean:
                    i1, i2, i3, i4 = nuc_map[s1], nuc_map[s2], nuc_map[s3], nuc_map[s4]
                    print >> f, i1 + 4*i2 + 16*i3 + 32*i4,
                else:
                    print >> f, 128,

        if options.verbose:
            print "done"
        if len(seen - set('ACGTN-')) > 1:
            print >> sys.stderr, "I didn't understand the following symbols form the input sequence: %s" % (
                ''.join(list(seen - set('ACGTN-'))))

        if options.verbose:
            print "ZipHMM is pre-processing...",
            sys.stdout.flush()
        f = Forwarder.fromSequence(seqFilename=outname, alphabetSize=9, minNoEvals=500)
        if options.verbose:
            print "done"

    else:
        print 'There are', len(names), 'species identified. We do not know how to convert that into something'
        print 'that CoalHMM can handle, sorry.'
        sys.exit(1)