Example #1
0
def test_populate_parameters():
    """populate_parameters should make General reproduce the GTR Psubs.

    Before the call the two inflated likelihood functions must differ on
    every tip edge; afterwards getPsubForEdge must agree on every tip edge.
    """
    gtr = nest.inflate_likelihood_function(_GTR)
    general = nest.inflate_likelihood_function(_General)

    def psub_pair(tip):
        # GTR first, General second: same evaluation order as the asserts.
        return gtr.getPsubForEdge(tip), general.getPsubForEdge(tip)

    for tip in gtr.tree.getTipNames():
        gtr_psub, general_psub = psub_pair(tip)
        assert not allclose(gtr_psub, general_psub), 'models started close'

    nest.populate_parameters(general, gtr)

    for tip in gtr.tree.getTipNames():
        gtr_psub, general_psub = psub_pair(tip)
        assert_array_almost_equal(gtr_psub, general_psub)
Example #2
0
def test_inflate_deflate_likelihood_function():
    """Deflate -> inflate -> deflate must reproduce the first deflated doc."""
    original_lf = nest.inflate_likelihood_function(_GTRplusGamma)
    alignment = get_aln('GTRplusGamma', _GTRplusGamma['aln_length'])
    original_lf.setAlignment(alignment)
    first_doc = nest.deflate_likelihood_function(original_lf)

    # Round-trip: rebuild a likelihood function from the deflated document,
    # attach the same alignment, and deflate it once more.
    rebuilt_lf = nest.inflate_likelihood_function(first_doc)
    rebuilt_lf.setAlignment(alignment)
    second_doc = nest.deflate_likelihood_function(rebuilt_lf)

    assert_equal(first_doc, second_doc)
Example #3
0
def ml_bootstraps(empirical, num_bootstraps=100):
    """Parametric bootstrap of the G statistic for a fitted model document.

    ``empirical`` is read for 'model' (one of the names whitelisted below),
    'gc' (passed to get_genetic_code) and 'lf' (a deflated likelihood
    function providing at least 'aln_length', 'tree' and 'gs').  The
    likelihood function is inflated, ``num_bootstraps`` alignments are
    simulated from it, and each one is refitted with ``ml``.

    Returns a dict with:
        'gstats': list of the bootstrap G statistics,
        'gstat':  the empirical G statistic,
        'pvalue': fraction (count of bootstrap G statistics greater than the
                  empirical one) / (num_bootstraps + 1), as a float.
    """
    assert empirical['model'] in \
        ('NG', 'NFG', 'MG94G', 'GNC', 'Y98GTR', 'CNFGTR', 'MG94GTR', 'Y98')
    gc = get_genetic_code(empirical['gc'].encode('utf-8'))
    # NOTE(review): eval() turns the model name into a callable.  Only the
    # assert above restricts what is evaluated, and asserts are stripped
    # under ``python -O`` -- confirm 'model' never comes from untrusted input.
    model = lambda **kw: eval(empirical['model'])(gc=gc, **kw)
    elf = nest.inflate_likelihood_function(empirical['lf'], model)

    aln_length = empirical['lf']['aln_length']
    if empirical['model'] != 'NG':  # for unexpected simulateAlignment behaviour
        aln_length = int(aln_length / 3)
        assert empirical['lf']['aln_length'] == 3 * aln_length

    def bootstrap(empdoc):
        aln = elf.simulateAlignment(aln_length)
        simdoc = {'aln': str(aln), 'tree': empdoc['lf']['tree']}
        result = ml(simdoc, **empdoc)
        return result['lf']['gs']

    def extract_result(bootstraps):
        egs = empirical['lf']['gs']
        exceeding = sum(g > egs for g in bootstraps)
        return {
            'gstats': bootstraps,
            'gstat': egs,
            # float() forces true division; int / int would truncate the
            # p-value to 0 under Python 2 integer division.
            'pvalue': float(exceeding) / (num_bootstraps + 1),
        }

    # Build a real list: a lazy map() would be consumed by the sum() above
    # and leave nothing usable in 'gstats' on Python 3.
    gstats = [bootstrap(empirical) for _ in range(num_bootstraps)]

    return extract_result(gstats)
Example #4
0
def test_deflate_likelihood_function():
    """The 'EN' entry of a deflated likelihood function must equal
    get_expected_no_subs of that same likelihood function."""
    general_lf = nest.inflate_likelihood_function(_General)
    alignment = get_aln('General', _General['aln_length'])
    general_lf.setAlignment(alignment)

    deflated = nest.deflate_likelihood_function(general_lf)
    expected = nest.get_expected_no_subs(general_lf)
    assert_equal(deflated['EN'], expected)
Example #5
0
def split_ens(doc):
    """Return ``{'ENS': ...}`` for the likelihood function stored in ``doc``.

    ``doc`` is read for 'model' (must be 'GNC', 'Y98GTR' or 'CNFGTR'), 'gc'
    (passed to get_genetic_code) and 'lf' (the deflated likelihood function).
    The value under 'ENS' is whatever get_expected_no_subs returns when given
    the inflated likelihood function and the genetic code.
    """
    assert doc['model'] in ('GNC', 'Y98GTR', 'CNFGTR')
    gc = get_genetic_code(doc['gc'].encode('utf-8'))
    # NOTE(review): eval() resolves the model name; only the assert above
    # (stripped under ``python -O``) restricts what gets evaluated.
    model = lambda **kw: eval(doc['model'])(gc=gc, **kw)
    lf = inflate_likelihood_function(doc['lf'], model)
    # Compute once; the value used to be calculated twice with the first
    # result thrown away.
    return {'ENS': get_expected_no_subs(lf, gc)}
def test_makeContinuousPsubDefn():
    """A GeneralBen likelihood function populated from the General one must
    give the same length-scaled rate matrices and expected substitution
    counts on every tip edge, with its lengths equal to those counts."""
    lf_gen = inflate_likelihood_function(_General)

    ben_model = GeneralBen(DNA.Alphabet,
                           recode_gaps=True,
                           model_gaps=False,
                           optimise_motif_probs=True,
                           name='GeneralBen')
    lf_ben = ben_model.makeLikelihoodFunction(lf_gen.tree)

    populate_parameters(lf_ben, lf_gen,
                        is_independent=True, is_constant=False)

    ben_ens = get_expected_no_subs(lf_ben)
    ben_lens = lf_ben.getParamValueDict(['edge'])['length']
    gen_ens = get_expected_no_subs(lf_gen)

    for tip in lf_gen.tree.getTipNames():
        # Rate matrix scaled by the edge length, for each model in turn.
        ben_scaled = (np.array(lf_ben.getRateMatrixForEdge(tip)) *
                      lf_ben.getParamValue('length', tip))
        gen_scaled = (np.array(lf_gen.getRateMatrixForEdge(tip)) *
                      lf_gen.getParamValue('length', tip))
        assert_array_almost_equal(ben_scaled, gen_scaled)
        assert_almost_equal(ben_ens[tip], gen_ens[tip])
        assert_almost_equal(ben_lens[tip], ben_ens[tip])
Example #7
0
def param_bootstrap(stats, num_reps=None, model_pos=None, fitter=None, **kw):
    """Collect parametric-bootstrap samples for one fitted model in ``stats``.

    ``stats`` is a ``(gene, f_stats, g_stats)`` triple.  ``f_stats[model_pos]``
    is the fitted-model row; it is read for 'name', 'tip_names',
    'aln_length', 'gs' and 'll', and is inflated into a likelihood function.
    ``g_stats[model_pos]`` accumulates 'gs_samples', 'll_samples',
    'en_samples' and the saved random-generator 'state'; it is created when
    missing.

    At most ``10 * num_reps`` simulate-and-refit attempts are made with
    ``fitter(aln, tree, return_lfs=model, **kw)``, stopping as soon as
    ``num_reps`` G statistics are held.  Failed attempts are logged as
    warnings and skipped.  Afterwards ``f_row['gs_p']`` and ``f_row['ll_p']``
    are set to ``(number of samples below the fitted value,
    number of samples + 1)``.

    Returns the ``(gene, f_stats, g_stats)`` triple, or None when
    ``model_pos`` is not a valid index into ``f_stats``.
    """
    gene, f_stats, g_stats = stats
    try:
        f_row = f_stats[model_pos]
    except IndexError:
        # f_stats may be empty, in which case there is no first row to name.
        tips = '/'.join(f_stats[0]['tip_names']) if f_stats else '<no models>'
        # str() is required: model_pos is an index, and str + int raises
        # TypeError, which used to mask this error report.
        logging.error(' Skipping ' + tips + ' in ' + gene + ': position ' +
                      str(model_pos) + ' invalid')
        return
    model = f_row['name']

    if model_pos in g_stats:
        g_row = g_stats[model_pos]
    else:
        g_row = {
            'name': model,
            'tip_names': f_row['tip_names'],
            'gs_samples': [],
            'll_samples': [],
            'en_samples': []
        }
        g_stats[model_pos] = g_row
    gs_samples = g_row['gs_samples']
    ll_samples = g_row['ll_samples']
    en_samples = g_row['en_samples']
    if 'state' in g_row:
        # NOTE(review): eval() of the stored repr restores the generator
        # state saved below; unsafe if g_stats can come from an untrusted
        # source -- confirm provenance.
        random.setstate(eval(g_row['state']))

    lf = nest.inflate_likelihood_function(f_row)
    aln_length = f_row['aln_length']
    start = time.time()
    # range(10 * n) gives the same number of attempts as the Python 2-only
    # ``10 * range(n)`` and also works on Python 3.
    for _ in range(10 * num_reps):
        if len(gs_samples) >= num_reps:
            break
        try:
            aln = lf.simulateAlignment(aln_length, random_series=random)
            lfs = fitter(aln, lf.tree, return_lfs=model, **kw)
            fitted_lf = lfs[model_pos]
            ll_samples.append(fitted_lf.getLogLikelihood())
            gs_samples.append(fitted_lf.getGStatistic())
            if 'Q' in fitted_lf.defn_for:
                en_samples.append(nest.get_expected_no_subs(fitted_lf))
        except Exception:
            # Best effort: log and try again, but let KeyboardInterrupt and
            # SystemExit propagate.
            logging.warning(' Missed a G stat for ' + model + ' and ' +
                            '/'.join(f_row['tip_names']) + ' in ' + gene +
                            ':\n' + format_exc())
    # Checked explicitly rather than with for/else, which also fired when the
    # final attempt was the one that reached num_reps.
    if len(gs_samples) < num_reps:
        logging.error(
            ' Failed to compile sufficient bootstrap repetitions for ' +
            model + ' and ' + '/'.join(f_row['tip_names']) + ' in ' + gene)
    g_row['state'] = repr(random.getstate())
    f_row['gs_p'] = (sum(1 for g in gs_samples
                         if g < f_row['gs']), len(gs_samples) + 1)
    f_row['ll_p'] = (sum(1 for l in ll_samples
                         if l < f_row['ll']), len(ll_samples) + 1)
    logging.info(' Done ' + model + ' and ' + '/'.join(f_row['tip_names']) +
                 ' in ' + gene + ' in ' + str(time.time() - start) + ' secs')

    return gene, f_stats, g_stats
Example #8
0
def test_hetero_fit():
    """hetero_fit should recover the GTR plus Gamma expected substitutions"""
    source_lf = nest.inflate_likelihood_function(_GTRplusGamma)
    expected = nest.get_expected_no_subs(source_lf)

    alignment = get_aln('GTRplusGamma', 100000)
    fitted_lfs = nest.hetero_fit(alignment, source_lf.tree,
                                 param_limit=20, return_lfs=True)

    # Only the last returned likelihood function is compared.
    observed = nest.get_expected_no_subs(fitted_lfs[-1])
    for taxon in expected:
        assert_almost_equal(observed[taxon], expected[taxon], decimal=2)
Example #9
0
 def check(self, row):
     """Return ``allRateMatricesUnique()`` for the likelihood function
     inflated from ``row``.

     ArithmeticError / NotImplementedError are logged at debug level and
     AssertionError at warning level; in both cases False is returned.
     """
     try:
         unique = inflate_likelihood_function(row).allRateMatricesUnique()
     except AssertionError:
         logging.warning(traceback.format_exc())
         return False
     except (ArithmeticError, NotImplementedError):
         logging.debug(traceback.format_exc())
         return False
     else:
         return unique
Example #10
0
def fit(aln, tree, result, model, omega_indep, genetic_code, format):
    """ Fit the selected model to the input fasta ALN with the selected TREE 
    and output the RESULT. """
    # Alignment is read (and decompressed if needed) before the tree.
    alignment_text = _decompress_if_zipped(aln.read())
    request = {'tree': tree.read().strip(), 'aln': alignment_text}
    fitted = ml.ml(request,
                   model=model,
                   omega_indep=omega_indep,
                   gc=genetic_code)

    if format == 'json':
        json.dump(fitted, result)
        return 0

    # Any other format: rebuild the likelihood function from the fitted
    # document and write its string form followed by a newline.
    build_model = lambda: getattr(ml, model)()
    lf = nest.inflate_likelihood_function(fitted['lf'], build_model)
    result.write(str(lf) + '\n')
    return 0
Example #11
0
def test_hetero_clock_fit():
    """hetero_clock_fit should fit a clock-constrained GTR plus Gamma model
    nested in a GTR plus Gamma model, both recovering the expected
    substitution counts of the generating model"""
    source_lf = nest.inflate_likelihood_function(_GTRplusGammaClockTest)
    expected = nest.get_expected_no_subs(source_lf)

    alignment = get_aln('GTRplusGammaClockTest', 100000)
    clock_lf, free_lf = nest.hetero_clock_fit(alignment, source_lf.tree,
                                              outgroup='Opossum',
                                              param_limit=20,
                                              return_lfs=True)

    # The constrained fit must have the lower log-likelihood.
    assert_less(clock_lf.getLogLikelihood(), free_lf.getLogLikelihood())

    free_ens = nest.get_expected_no_subs(free_lf)
    clock_ens = nest.get_expected_no_subs(clock_lf)
    for taxon in expected:
        assert_almost_equal(free_ens[taxon], expected[taxon], decimal=2)
        assert_almost_equal(clock_ens[taxon], expected[taxon], decimal=2)
Example #12
0
def test_seq_fit():
    """seq_fit should fit nested GTR and General models"""
    for model_name in ('GTR', 'General'):
        # The module-level fixture is looked up by name ('_GTR', '_General').
        fixture = eval('_' + model_name)
        source_lf = nest.inflate_likelihood_function(fixture)
        expected = nest.get_expected_no_subs(source_lf)

        alignment = get_aln(model_name, 100000)
        fitted_lfs = nest.seq_fit(alignment, source_lf.tree,
                                  param_limit=20, return_lfs=model_name)

        if model_name == 'General':
            # First returned fit must be worse than the second.
            first_ll = fitted_lfs[0].getLogLikelihood()
            second_ll = fitted_lfs[1].getLogLikelihood()
            assert_less(first_ll, second_ll)

        observed = nest.get_expected_no_subs(fitted_lfs[-1])
        for taxon in expected:
            assert_almost_equal(observed[taxon], expected[taxon], decimal=2)
Example #13
0
def generate_alignments():
    """Simulate alignments from the module-level model fixtures and write
    each one as gzipped FASTA into the data directory.

    Files are named ``<model>_<length>.fasta.gz``.  Returns 0.
    """
    from gzip import GzipFile
    from data import get_data_dir
    from os.path import join

    # NOTE(review): this full list is built but then superseded by the short
    # list below, so only the last two alignments are generated -- confirm
    # whether the override was meant to stay.
    every_target = [('GTRplusGamma', _GTRplusGamma['aln_length']),
                    ('General', _General['aln_length']),
                    ('GTR', 100000), ('General', 100000),
                    ('GTRplusGamma', 100000),
                    ('GTRplusGammaClockTest', 100000),
                    ('GTRClockTest', 100000), ('GeneralBen', 100000)]
    targets = [('GTRClockTest', 100000), ('GeneralBen', 100000)]

    for model_name, length in targets:
        # Fixtures are module globals named '_' + model name.
        source_lf = nest.inflate_likelihood_function(eval('_' + model_name))
        simulated = source_lf.simulateAlignment(length)
        filename = '_'.join((model_name, str(length))) + '.fasta.gz'
        path = join(get_data_dir(), filename)
        with GzipFile(path, 'w') as aln_file:
            aln_file.write(simulated.toFasta())
    return 0
Example #14
0
def omega(aln, tree, result, model, genetic_code, outgroup, neutral, format):
    """ Fit the selected model to the input fasta ALN with the selected TREE 
    and output the RESULT, with specific constraints on omega. """
    # Alignment is read (and decompressed if needed) before the tree.
    alignment_text = _decompress_if_zipped(aln.read())
    request = {'tree': tree.read().strip(), 'aln': alignment_text}
    fitted = omega_module.ml(request,
                             model=model,
                             gc=genetic_code,
                             outgroup=outgroup,
                             neutral=neutral)

    if format == 'json':
        json.dump(fitted, result)
        return 0

    # Any other format: rebuild the likelihood function from the fitted
    # document and write its string form followed by a newline.
    build_model = lambda: getattr(ml, model)()
    lf = nest.inflate_likelihood_function(fitted['lf'], build_model)
    result.write(str(lf) + '\n')
    return 0
Example #15
0
def rooted(aln, tree, result, genetic_code, format):
    """ Fit GNC to the input fasta ALN with the selected TREE and output the
    RESULT. Parameters other than the scale parameter are constrained to be
    equal on branches connected to the root."""
    # Alignment is read (and decompressed if needed) before the tree.
    alignment_text = _decompress_if_zipped(aln.read())
    newick = tree.read().strip()

    # The two children of the root name the edges passed as rooted_edges.
    parsed = LoadTree(treestring=newick)
    assert len(parsed.Children) == 2, 'Tree must be edge-rooted'
    root_edge_names = [node.Name for node in parsed.Children]

    request = {'tree': newick, 'aln': alignment_text}
    fitted = ml.rooted(request, rooted_edges=root_edge_names,
                       gc=genetic_code)

    if format == 'json':
        json.dump(fitted, result)
        return 0

    # Any other format: rebuild the likelihood function and write its
    # string form followed by a newline.
    lf = nest.inflate_likelihood_function(fitted['lf'], ml.GNC)
    result.write(str(lf) + '\n')
    return 0
Example #16
0
def clock(aln, tree, outgroup, result, model, omega_indep, genetic_code,
          format):
    """ Fit the selected model to the input fasta ALN with the input TREE with
    genetic distance constrained to be equal on all branches but the OUTGROUP
    and output the RESULT. """
    # Alignment is read (and decompressed if needed) before the tree.
    alignment_text = _decompress_if_zipped(aln.read())
    request = {'tree': tree.read().strip(), 'aln': alignment_text}
    fitted = clock_module.ml(request,
                             model=model,
                             gc=genetic_code,
                             outgroup=outgroup,
                             omega_indep=omega_indep)

    if format == 'json':
        json.dump(fitted, result)
        return 0

    # Any other format: rebuild the likelihood function from the fitted
    # document and write its string form followed by a newline.
    build_model = lambda: getattr(ml, model)()
    lf = nest.inflate_likelihood_function(fitted['lf'], build_model)
    result.write(str(lf) + '\n')
    return 0
Example #17
0
def test_clock_fit():
    """clock_fit should fit nested GTR, General, and GeneralBen models,
    some with equal branch lengths"""
    for modelname in ('GTRClockTest', 'GeneralBen'):
        is_gtr = modelname.startswith('GTR')

        fixture = eval('_' + modelname)
        source_lf = nest.inflate_likelihood_function(fixture)
        expected = nest.get_expected_no_subs(source_lf)

        alignment = get_aln(modelname, 100000)
        lfs = nest.clock_fit(alignment, source_lf.tree, outgroup='Opossum',
                             param_limit=20,
                             return_lfs='GTR' if is_gtr else 'General')

        # GTR runs use the first two fits; otherwise everything from the
        # third fit onwards (which must unpack into exactly two).
        if is_gtr:
            clock_lf, free_lf = lfs[:2]
        else:
            clock_lf, free_lf = lfs[2:]
        assert_less(clock_lf.getLogLikelihood(), free_lf.getLogLikelihood())

        if modelname == 'GeneralBen':
            assert_less(lfs[0].getLogLikelihood(),
                        clock_lf.getLogLikelihood())

        free_ens = nest.get_expected_no_subs(free_lf)
        clock_ens = nest.get_expected_no_subs(clock_lf)
        for taxon in expected:
            assert_almost_equal(free_ens[taxon], expected[taxon], decimal=2)
            assert_almost_equal(clock_ens[taxon], expected[taxon],
                                decimal=2)
def test_constrain_lengths():
    """With 'length' shared across edges except Opossum, an optimised
    GeneralBen fit should give equal Mouse and Human lengths, and each tip
    length should equal its expected number of substitutions."""
    lf_gen = inflate_likelihood_function(_General)
    alignment = get_aln('General', _General['aln_length'])

    ben_model = GeneralBen(DNA.Alphabet,
                           recode_gaps=True,
                           model_gaps=False,
                           optimise_motif_probs=True)
    lf_ben = ben_model.makeLikelihoodFunction(lf_gen.tree)

    # Parameters whose name contains '/' are freed per edge.
    for name in lf_ben.getParamNames():
        if '/' not in name:
            continue
        lf_ben.setParamRule(name, is_independent=True, is_constant=False)

    # One shared length everywhere, then an independent one for Opossum.
    lf_ben.setParamRule('length', is_independent=False)
    lf_ben.setParamRule('length', edge='Opossum', is_independent=True)
    lf_ben.setAlignment(alignment)
    lf_ben.optimise(local=True, show_progress=False)

    expected_subs = get_expected_no_subs(lf_ben)
    lengths = lf_ben.getParamValueDict(['edge'])['length']
    assert_almost_equal(lengths['Mouse'], lengths['Human'])
    for tip in lf_ben.tree.getTipNames():
        assert_almost_equal(lengths[tip], expected_subs[tip])
Example #19
0
 def check(self, row):
     """Return ``allPsubsDLC()`` for the likelihood function inflated from
     ``row``."""
     lf = inflate_likelihood_function(row)
     return lf.allPsubsDLC()
Example #20
0
def test_get_expected_no_subs():
    """For the GeneralStationary fixture, get_expected_no_subs should give
    each tip edge a value equal to that edge's 'length' parameter"""
    stationary_lf = nest.inflate_likelihood_function(_GeneralStationary)
    expected_subs = nest.get_expected_no_subs(stationary_lf)
    for tip in stationary_lf.tree.getTipNames():
        length = stationary_lf.getParamValue('length', tip)
        assert_almost_equal(expected_subs[tip], length)