Ejemplo n.º 1
0
 def __init__(self, **kwargs):
     """Initialise the simulator state from keyword arguments.

     Recognised keywords (all optional):
       model      -- model object used for the simulation; when missing or
                     falsy, one is instantiated from the class named by the
                     profile's 'modeltype' attribute (looked up in `models`).
       profile    -- simulation profile; when the key is absent a new
                     IOsimul.SimulProfile(**kwargs) is created.
       times, eventsrecord, extincts, contempbranches
                  -- initial lists (default: new empty list).
       events, eventsmap
                  -- initial dicts (default: new empty dict).
       nodeClass  -- node class (default: tree2.AnnotatedNode).
       nodeAttr   -- list of node attribute names; 'extinct' is appended.
       ngen       -- stored as-is (None when absent).

     The full kwargs dict is forwarded to the profile and model constructors.

     Raises:
       ValueError -- no model was given and the profile carries no 'modeltype'.
     """
     print('invoke _BaseTreeSimulator__init__')
     self.model = kwargs.get('model')
     # only build the default profile when none was supplied, instead of
     # constructing (and discarding) one on every call
     if 'profile' in kwargs:
         self.profile = kwargs['profile']
     else:
         self.profile = IOsimul.SimulProfile(**kwargs)
     if not self.model:
         modeltype = getattr(self.profile, 'modeltype', False)
         if not modeltype:
             raise ValueError(
                 "must specify the model to simulate either through the 'profile' or the 'model' arguments"
             )
         # by default create a private model instance; these are better not shared between gene families
         # as their rates might have to be updated at different time points.
         # The class is resolved by attribute lookup on the `models` module
         # rather than eval(), so a profile string cannot execute arbitrary code.
         self.model = getattr(models, modeltype)(**kwargs)
     self.t = 0
     self.times = kwargs.get('times', [])
     self.events = kwargs.get('events', {})
     self.eventsrecord = kwargs.get('eventsrecord', [])
     self.eventsmap = kwargs.get('eventsmap', {})
     self.extincts = kwargs.get('extincts', [])
     self.contempbranches = kwargs.get('contempbranches', [])
     self.nodeClass = kwargs.get('nodeClass', tree2.AnnotatedNode)
     # concatenation builds a new list, so a caller-supplied 'nodeAttr' list is not mutated
     self.nodeAttr = kwargs.get('nodeAttr', []) + ['extinct']
     self.ngen = kwargs.get('ngen')
     # generator object providing serial unique identifiers for event objects
     self.eventidgen = models.eventIdGen()
     self.dumppickle_warning = '\n'.join([
         "Cannot pickle generator objects; to pickle simulator object instance %s," % repr(self),
         "you have to first delete its event id generator attribute 'eventidgen'",
         "(will discontinue numeration of events for further simulation).",
     ])
     self.logger = IOsimul.SimulLogger(simultype=type(self))
Ejemplo n.º 2
0
    def __init__(self,
                 noTrigger=False,
                 modeltype='BirthDeathDTLModel',
                 **kwargs):
        """Initialise a gene-tree simulator running within a reference tree set.

        Parameters:
          noTrigger  -- when false and a number of generations is known
                        (self.ngen), evolve() is launched at the end of
                        initialisation.
          modeltype  -- forwarded to the parent constructor as 'modeltype'.

        Keyword arguments read here:
          refsimul            -- reference simulator object; when truthy, the
                                 reference trees, contemporary branches,
                                 extinct list, times and ngen (refsimul.t - 1)
                                 are taken from it.
          reftrees, refcontempbranches, refextincts, times
                              -- required when no 'refsimul' is given.
          ngen                -- used only when no 'refsimul' is given.
          refnodeswithdescent -- precomputed node list; when absent it is
                                 obtained from refsimul.get_nodes_with_descendants().
          profile             -- defaults to IOsimul.DTLSimulProfile(type='core').
          allow_multiple      -- forwarded to the parent constructor and to
                                 pickgenelineages() (default False).
          connecttrees        -- forwarded to evolve() (default False).

        Raises:
          KeyError   -- a required reference keyword is missing when no
                        'refsimul' is given.
          ValueError -- neither 'refsimul' nor 'refnodeswithdescent' is given.
        """
        print('invoke _DTLtreeSimulator__init__')
        allow_multiple = kwargs.get('allow_multiple', False)
        # Merge the explicit settings into a copy of kwargs: passing
        # allow_multiple=... next to **kwargs raised "TypeError: got multiple
        # values for keyword argument" whenever the caller supplied it.
        parentkwargs = dict(kwargs)
        parentkwargs['allow_multiple'] = allow_multiple
        parentkwargs['modeltype'] = modeltype
        # automatic checkdata() and evolve(ngen) triggers from parent classes are deactivated by noTrigger=True
        parentkwargs['noTrigger'] = True
        super(DTLtreeSimulator, self).__init__(**parentkwargs)

        refsimul = kwargs.get('refsimul')
        if refsimul:
            # keep hard link to reference/species tree simulator object; better pickle them together!
            self.refsimul = refsimul
            # enforce existence of a root in reference simulation
            self.refroot = refsimul.connecttrees(returnCopy=False)
            self.reftrees = refsimul.trees
            self.refconbran = refsimul.contempbranches
            self.refextincts = refsimul.extincts
            self.ngen = refsimul.t - 1
            self.times = refsimul.times
        else:
            self.reftrees = kwargs['reftrees']
            self.refconbran = kwargs['refcontempbranches']
            self.refextincts = kwargs['refextincts']
            self.times = kwargs['times']
            self.ngen = kwargs.get('ngen')

        self.reftimeslices = _get_timeslices(len(self.times), self.times)

        # list of nodes of the reference tree; computed from the reference
        # simulator only when the caller did not supply it (the previous eager
        # default crashed on refsimul=None and recomputed the list needlessly)
        if 'refnodeswithdescent' in kwargs:
            self.refnodeswithdescent = kwargs['refnodeswithdescent']
        elif refsimul:
            self.refnodeswithdescent = refsimul.get_nodes_with_descendants()
        else:
            raise ValueError(
                "must provide 'refnodeswithdescent' when no 'refsimul' is given"
            )
        self.transferrec = {}
        self.extantevents = []

        if 'profile' in kwargs:
            self.profile = kwargs['profile']
        else:
            self.profile = IOsimul.DTLSimulProfile(type='core')
        self.genetrees = []
        self.pickgenelineages(allow_multiple=allow_multiple)

        # the checkdata() call is currently disabled for this class;
        # if ngen is known, launch simulation for ngen iterations
        if not noTrigger and self.ngen:
            self.evolve(self.ngen,
                        connecttrees=kwargs.get('connecttrees', False))
Ejemplo n.º 3
0
    def __init__(self, model, **kwargs):
        print 'invoke _MultipleTreeSimulator__init__'
        print 'kwargs:', kwargs
        super(MultipleTreeSimulator, self).__init__(model=model, **kwargs)
        self.popsize = self.model.popsize
        self.trees = [
            self.nodeClass(l=float(0), lab="Root_%d" % i)
            for i in range(self.popsize)
        ]
        self.profile = kwargs.get('profile', IOsimul.SimulProfile())

        if not kwargs.get('noTrigger'):
            self.checkdata()
            # if ngen specified, launch simulation for ngen iterations
            if self.ngen: self.evolve(self.ngen)
Ejemplo n.º 4
0
 def __init__(self, model, **kwargs):
     print 'invoke _BaseTreeSimulator__init__'
     print 'kwargs:', kwargs
     self.model = model
     self.t = 0
     self.times = kwargs.get('times', [])
     self.events = kwargs.get('events', {})
     self.eventsrecord = kwargs.get('eventsrecord', [])
     self.eventsmap = kwargs.get('eventsmap', {})
     self.extincts = kwargs.get('extincts', [])
     self.contempbranches = kwargs.get('contempbranches', [])
     self.nodeClass = kwargs.get('nodeClass', tree2.AnnotatedNode)
     self.ngen = kwargs.get('ngen')
     self.eventidgen = models.eventIdGen(
     )  # generator object providing serial unique identifiers for event objects
     self.dumppickle_warning = '\n'.join(["Cannot pickle generator objects; to pickle simulator object instance %s,"%repr(self), \
              "you have to first delete its event id generator attribute 'eventidgen'", \
              "(will discontinue numeration of events for further simulation)."])
     self.logger = IOsimul.SimulLogger(simultype=type(self))
Ejemplo n.º 5
0
 def get_extanttrees(self,
                     compute=True,
                     addextincts=[],
                     collapsenodes=False,
                     removelosses=True):
     """Return the gene trees pruned of their dead branches.

     Parameters:
       compute       -- when false, return the stored self._extanttrees
                        without doing any work.
       addextincts   -- not used in this method body.
       collapsenodes -- forwarded to copy_prune_dead_lineages(); also selects
                        how losses are removed (see removelosses).
       removelosses  -- when true, a second pass removes lineages listed in
                        self.extincts: by pruning again if collapsenodes is
                        true, otherwise via
                        IOsimul.annotateSpeciationLossEvents(trimLosses=True).

     Returns a new list; falsy entries (e.g. None) of self.genetrees and of
     the intermediate results are left out.
     NB: labeltreenodes() is called first, so the original trees get their
     nodes labelled as a side effect.
     """
     if not compute:
         return self._extanttrees
     # only works with all nodes being labelled, to use labels as references rather than
     # the node objects (which refer to the original tree) in Node.pop()
     self.labeltreenodes(silent=False)  # should not be necessary though

     # first pass: remove lineages associated to extinct species
     pruned = []
     for genetree in self.genetrees:
         if genetree:
             pruned.append(
                 self.copy_prune_dead_lineages(genetree,
                                               trimroot=False,
                                               collapsenodes=collapsenodes))
     if not removelosses:
         return pruned

     # second pass: remove lineages associated to gene loss
     cleaned = []
     if collapsenodes:
         for lineage in pruned:
             if lineage:
                 cleaned.append(
                     BaseTreeSimulator.copy_prune_dead_lineages(
                         lineage,
                         self.extincts,
                         trimroot=False,
                         collapsenodes=collapsenodes))
     else:
         # keep a "rosary" structure of branches where single-child nodes are annotated as speciation-loss events
         for lineage in pruned:
             if lineage:
                 cleaned.append(
                     IOsimul.annotateSpeciationLossEvents(
                         extanttree=lineage,
                         lossnodes=self.extincts,
                         trimLosses=True))
     return cleaned
Ejemplo n.º 6
0
def main():
    """Command-line entry point: simulate a species history, then gene families within it.

    Steps:
      1. parse the command-line options (see usage());
      2. build the gene-family profile set, either from -p/--profiles or from
         the global -r/--dtlrates and -f/--rootfreq options;
      3. create the output sub-directories under the output directory;
      4. simulate the reference (species) trees with a Moran process and
         write the extant reference tree(s);
      5. for each gene family, simulate a gene tree set under the reference
         trees, pickle both simulators together and write the gene trees.

    Exits with status 2 on an invalid option and 0 after printing help.
    Raises ValueError when more extant species are requested for sampling
    than the population size.
    """
    # option parsing
    try:
        # 's:' and 'g:' are declared here because -s and -g are read below;
        # without them getopt rejected those short options
        opts, args = getopt.getopt(
            sys.argv[1:], "o:n:s:g:r:f:e:l:c:p:vh",
            ["outputdir=", "ngenes=", "popsize=", "ngen=",
             "dtlrates=", "rootfreq=", "profiles=",
             "sample.larger.trees=", "connect.all.trees=",
             "sample.extant.species=", "help", "verbose"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err))  # will print something like "option -a not recognized"
        print(usage())
        sys.exit(2)

    dopt = dict(opts)
    if ('-h' in dopt) or ('--help' in dopt):
        print(usage())
        sys.exit(0)
    # NOTE(review): 'silent' is parsed but not used further down in this function
    silent = not (('-v' in dopt) or ('--verbose' in dopt))
    outdir = dopt.get('-o', dopt.get('--outputdir', os.getcwd()))
    popsize = int(dopt.get('-s', dopt.get('--popsize', 100)))
    ngen = int(dopt.get('-g', dopt.get('--ngen', 1000)))

    nfprofiles = dopt.get('-p', dopt.get('--profiles'))
    # define the evolution rates and original frequencies of gene families
    if nfprofiles is not None:
        if nfprofiles == '':
            exsampleprof = [(2, 'core'), (2, 'accessory-slow'),
                            (6, 'orfan-fast')]
            dtlprof = IOsimul.MetaSimulProfile(
                profiles=[(n, IOsimul.DTLSimulProfile(type=t))
                          for n, t in exsampleprof])
        else:
            # expects a JSON formating of simulator profiles conforming to the IOsimul.MetaSimulProfile class parsers
            # e.g. a list of dict objects representing the arguments of a simulators.DTLSimulProfile class instance
            dtlprof = IOsimul.MetaSimulProfile(json=nfprofiles)
    else:
        # default global parameters (no pangenome structure), overridden by any provided profile
        # command-line values arrive as strings, hence the float() conversion
        rootfreq = float(dopt.get('-f', dopt.get('--rootfreq', 0.5)))
        # .get() on the long option too: indexing dopt['--dtlrates'] raised
        # KeyError whenever the long form was not on the command line
        sdtlrates = dopt.get('-r', dopt.get('--dtlrates'))
        if sdtlrates:
            dtlrates = [float(s) for s in sdtlrates.split(',')]
        else:  # by default, rates are:
            dtlrates = [0.001]  # D = 1e-3
        # append(): '+=' with a bare float raised TypeError
        if len(dtlrates) < 2:
            dtlrates.append(dtlrates[0])  # T = D
        if len(dtlrates) < 3:
            dtlrates.append(sum(dtlrates))  # L = T+D
        dglobalprof = {
            0: {
                'rdup': dtlrates[0],
                'rtrans': dtlrates[1],
                'rloss': dtlrates[2]
            }
        }
        globalprof = IOsimul.DTLSimulProfile(rateschedule=dglobalprof,
                                             rootfreq=rootfreq)
        dtlprof = IOsimul.MetaSimulProfile(profiles=[(1, globalprof)])

    # number of gene families to simulate: taken from the profile set when it
    # defines more than one, else from option -n/--ngenes, else default 10
    if dtlprof.ngenes > 1:
        ngenes = dtlprof.ngenes
    else:
        ngenes = int(dopt.get('-n', dopt.get('--ngenes', 10)))

    nlargegenetrees = int(dopt.get('-l', dopt.get('--sample.larger.trees',
                                                  -1)))
    lentoroot = float(dopt.get('-c', dopt.get('--connect.all.trees', -1)))
    samplextant = int(dopt.get('-e', dopt.get('--sample.extant.species', 0)))
    # explicit check rather than assert, which is stripped under -O
    if samplextant > popsize:
        raise ValueError(
            "cannot sample %d extant species from a population of size %d" %
            (samplextant, popsize))

    # creating output directories (makedirs also creates a missing outdir)
    for d in ['logs', 'pickles', 'genetrees', 'reftrees']:
        outd = "%s/%s" % (outdir, d)
        if not os.path.exists(outd):
            os.makedirs(outd)

    # simulate species tree
    moranmodel = models.MoranProcess(popsize=popsize)
    moransim = simulators.MultipleTreeSimulator(moranmodel, ngen=ngen)
    if lentoroot >= 0:
        # connect all roots of the species lineage trees
        conrt = moransim.connecttrees(lentoroot, returnCopy=True)
        conrt.write_newick("%s/reftrees/connected.reftree_full.nwk" % (outdir))
        # prune dead lineages and connect all roots of the species lineage trees
        extconrt = moransim.get_extanttree(compute=True, lentoroot=lentoroot)
        extconrt.write_newick("%s/reftrees/connected.reftree_extant.nwk" %
                              (outdir))
        extantspe = extconrt.get_leaf_labels()
    else:
        # write lineage trees separately
        lextrt = moransim.get_extanttrees(compute=True)
        extantspe = []
        for k, extrt in enumerate(lextrt):
            # use the loop variable: 'extconrt' is not defined in this branch
            extrt.write_newick("%s/reftrees/reftree.%d_extant.nwk" %
                               (outdir, k))
            extantspe += extrt.get_leaf_labels()

    # select sampled species among the N extant
    if samplextant:
        sampledspe = random.sample(extantspe, samplextant)
        refnodeswithdescent = moransim.get_nodes_with_descendants(
            sample=sampledspe)
    else:
        refnodeswithdescent = moransim.get_nodes_with_descendants()

    # serial simulation of gene families, have to offer a parallel version
    for k in range(ngenes):
        print("### simulate gene tree %d" % k)
        # simulate gene tree under the same reference tree set (= species/organism population history)
        bddtlmodel = models.BirthDeathDTLModel()
        bddtlsim = simulators.DTLtreeSimulator(
            model=bddtlmodel,
            refsimul=moransim,
            refnodeswithdescent=refnodeswithdescent,
            profile=dtlprof.sampleprofile(verbose=True),
            noTrigger=True)
        bddtlsim.evolve(bddtlsim.ngen)

        # save ref and gene tree simulation object together to save space as they share references to same objects
        IOsimul.dumppickle({
            'refsim': moransim,
            'genesim': bddtlsim
        }, "%s/pickles/simul.%d.pickle" % (outdir, k))

        # write out the largest n gene trees (or all of them when n < 0)
        # NOTE(review): None entries are skipped; presumably genetrees can hold
        # null trees, as other code in this project filters them out - confirm
        genetrees = bddtlsim.genetrees
        if nlargegenetrees >= 0:
            genetreesizes = [(genetree.nb_leaves(), i)
                             for i, genetree in enumerate(genetrees)
                             if genetree is not None]
            genetreesizes.sort(reverse=True)
            # slicing caps the selection at the number of available trees
            isavetrees = [i for _, i in genetreesizes[:nlargegenetrees]]
        else:
            isavetrees = [i for i, genetree in enumerate(genetrees)
                          if genetree is not None]
        nfgenetrees = "%s/genetrees/simul.%d.all_gt.nwk" % (outdir, k)
        for nwritten, i in enumerate(isavetrees):
            # truncate the file on the first tree actually written, append
            # afterwards; keying the mode on the tree index could truncate the
            # file mid-way or never reset it
            genetrees[i].write_newick(nfgenetrees,
                                      mode=('w' if nwritten == 0 else 'a'))

        if lentoroot >= 0:
            # connect all the gene trees in each gene population
            congt = bddtlsim.connecttrees(lentoroot, returnCopy=True)
            # write out connected trees
            congt.write_newick("%s/genetrees/simul.%d.connected_gt_full.nwk" %
                               (outdir, k))
            # prune dead lineages and connect all roots of the gene lineage trees
            extcongt = bddtlsim.get_extanttree(compute=True,
                                               lentoroot=lentoroot)
            print(extcongt)
            extcongt.write_newick(
                "%s/genetrees/simul.%d.connected_gt_extant.nwk" % (outdir, k))
Ejemplo n.º 7
0
# Simulate `nsims` gene families under the same reference simulation
# `moransim`; for each one, pickle both simulators together and write the
# `nlargegenetrees` largest gene trees with their reference trees.
for k in range(nsims):
    print("### simulate gene tree %d" % k)
    # simulate gene tree under the same reference tree set (= species population history)
    bddtlmodel = models.BirthDeathDTLModel(rdup=dtlrate[0],
                                           rtrans=dtlrate[1],
                                           rloss=dtlrate[2])
    bddtlsim = simulators.DTLtreeSimulator(model=bddtlmodel,
                                           refsimul=moransim,
                                           noTrigger=True)
    bddtlsim.evolve(bddtlsim.ngen)

    # save ref and gene tree simulation object together to save space as they share references to same objects
    IOsimul.dumppickle({
        'refsim': moransim,
        'genesim': bddtlsim
    },
                       "%s/pickles/simul.%d.pickle" % (outdir, k),
                       prompt=True)

    # write out the largest n gene trees and corresponding species trees
    # NOTE(review): None entries are skipped; presumably genetrees can hold
    # null trees - confirm against the simulator
    genetreesizes = [(genetree.nb_leaves(), i)
                     for i, genetree in enumerate(bddtlsim.genetrees)
                     if genetree is not None]
    # (size, index) pairs, largest tree first
    largesizes = sorted(genetreesizes, reverse=True)
    # the slice caps the selection at the number of available trees, where
    # indexing up to nlargegenetrees raised IndexError; max() keeps a negative
    # count meaning "none", as range() did
    for l, il in enumerate(largesizes[:max(nlargegenetrees, 0)]):
        maxgenetree = bddtlsim.genetrees[il[1]]
        maxgenetree.write_newick("%s/genetrees/simul.%d.gt.%d.nwk" %
                                 (outdir, k, l))
        maxgenetree.ref.write_newick("%s/reftrees/simul.%d.rt.%d.nwk" %
                                     (outdir, k, l))
Ejemplo n.º 8
0
 def write_endlog(self, dirout, simultype):
     """data export of simulation (a posteriori)

     Parameters:
       dirout    -- output directory; used to build the logger's 'bnfout'
                    argument as "<dirout>/log_".
       simultype -- forwarded to IOsimul.SimulLogger as 'simultype'.
     """
     # "%s/log_": the previous "%/log_" had no conversion type and raised
     # "ValueError: unsupported format character" when formatted
     logger = IOsimul.SimulLogger(simultype=simultype,
                                  bnfout="%s/log_" % dirout)
     # NOTE(review): bare expression with no effect; the snippet looks
     # truncated here - confirm what should be called on the logger
     logger