def test_populate_parameters():
    """populate_parameters should set up a nested likelihood function"""
    # Read the flattened GTR fit inside a context manager so the file handle
    # is closed even when JSON parsing fails.
    json_path = os.path.join(get_data_dir(), 'brca1_murphy_gtr.json')
    with open(json_path) as lf_file:
        lf_json = json.load(lf_file)

    lf_GTR = nest.inflate_likelihood_function(lf_json)
    aln = LoadSeqs(os.path.join(get_data_dir(), 'brca1.fasta'))
    lf_GTR.setAlignment(aln)

    # Build a General likelihood function on the same tree and copy the GTR
    # parameters into it; both must then report the same G statistic.
    model = General(DNA.Alphabet, optimise_motif_probs=True,
                    recode_gaps=True, model_gaps=False)
    lf_General = model.makeLikelihoodFunction(lf_GTR.tree)
    nest.populate_parameters(lf_General, lf_GTR)
    lf_General.setAlignment(aln)
    assert_almost_equal(lf_GTR.getGStatistic(), lf_General.getGStatistic(), 6)

    # Second scenario: start from the module-level fixtures, check that the
    # two models differ on every tip edge before populating ...
    lf_GTR = nest.inflate_likelihood_function(_GTR)
    lf_General = nest.inflate_likelihood_function(_General)
    for edge in lf_GTR.tree.getTipNames():
        assert not allclose(
            lf_GTR.getPsubForEdge(edge),
            lf_General.getPsubForEdge(edge)), 'models started close'

    # ... and that they agree on every tip edge afterwards.
    nest.populate_parameters(lf_General, lf_GTR)
    for edge in lf_GTR.tree.getTipNames():
        assert_array_almost_equal(lf_GTR.getPsubForEdge(edge),
                                  lf_General.getPsubForEdge(edge))
def test_GNC():
    """A GNC likelihood function should round-trip through inflate/deflate.

    The flattened fit is loaded from ``GNC.json``, inflated, given the
    alignment, deflated again, and the ``'EN'`` values of the two flat
    representations are compared.
    """
    json_path = os.path.join(get_data_dir(), 'GNC.json')
    aln_path = os.path.join(get_data_dir(), 'ENSG00000100393.fasta.gz')

    with open(json_path) as infile:
        flat_lf = json.load(infile)

    lf = inflate_likelihood_function(flat_lf, _ml.GNC)
    lf.setAlignment(get_aln(aln_path, codon_position=-1))
    flat_again = deflate_likelihood_function(lf)

    expected_en = flat_lf['EN'].values()
    round_tripped_en = flat_again['EN'].values()
    assert_almost_equal(expected_en, round_tripped_en, 9)
def test_joint_reconstruction():
    """Perturbing the reconstructed twoBeesF2 state should lower probability.

    For each codon column of the joint reconstruction, the column probability
    is computed, the ``twoBeesF2`` state is altered, and the altered column is
    asserted to be less probable than the reconstructed one.
    """
    data_dir = get_data_dir()
    with open(join(data_dir, 'small_cnfgtr.json')) as lf_in:
        flat_cnfgtr = json.load(lf_in)

    model = lambda: gapped.CNFGTR(optimise_motif_probs=True, model_gaps=True)
    cnfgtr = gapped.inflate_likelihood_function(flat_cnfgtr, model)

    full_aln = get_aln(join(data_dir, 'small_aln.fasta'),
                       filter_gaps=False, codon_position=-1)
    cnfgtr.setAlignment(full_aln[:99])
    anc_aln = gapped.joint(cnfgtr)

    # (edge, parent) pairs, in the order their probabilities are multiplied.
    edge_parents = (
        ('BterF3', 'root'),
        ('OsmaF4', 'root'),
        ('twoBeesF2', 'root'),
        ('AmelF2', 'twoBeesF2'),
        ('AdorF1', 'twoBeesF2'),
    )

    def prob_for_col(col):
        # Root motif probability times the substitution probability along
        # every edge listed above.
        p = cnfgtr.getMotifProbs()[col['root']]
        for edge, parent in edge_parents:
            p *= cnfgtr.getPsubForEdge(edge)[col[parent]][col[edge]]
        return p

    flipper = dict(zip('ACGT', 'CGTA'))
    for start in range(0, 99, 3):
        col = anc_aln[start:start + 3].todict()
        p = prob_for_col(col)

        ancestor = col['twoBeesF2']
        if ancestor != '---':
            # Change only the third position of the codon.
            col['twoBeesF2'] = ancestor[:2] + flipper[ancestor[2]]
        else:
            col['twoBeesF2'] = 'CAC'

        pm = prob_for_col(col)
        assert_array_less(pm, p)
def test_command_line(self):
    """Run nonstationary_lengths.py and then g_stats.py via their main().

    Each script is driven by setting ``sys.argv`` before importing and
    calling its ``main``; ``main`` must return 0 and the script's log file
    must contain the expected 'Done ...' message.
    """
    datadir = get_data_dir()

    # argv is built as a list instead of concatenating a command string and
    # splitting it, so directories containing whitespace stay one argument.
    nsl_log = join(self._output, 'nsl.log')
    sys.argv = ['nonstationary_lengths.py', '-i', datadir,
                '-o', self._output, '-l', nsl_log,
                '-L', 'DEBUG', '-c', '3', '-u', '20', '-F', 'seq_fit']
    from nonstationary_lengths import main
    assert_equal(main(), 0)
    if USING_MPI:
        # Wait for every MPI process before reading the log.
        MPI.COMM_WORLD.Barrier()
    with open(nsl_log) as log_in:
        log = log_in.read()
    assert_in('Done Mouse/Opossum/Human in ENSG00000111145', log)

    gs_log = join(self._output, 'gs.log')
    sys.argv = ['g_stats.py', '-o', self._output, '-l', gs_log,
                '-N', '1', '-u', '20', '-P', '1', '-F', 'seq_fit',
                '-L', 'DEBUG']
    from g_stats import main
    assert_equal(main(), 0)
    if USING_MPI:
        MPI.COMM_WORLD.Barrier()
    with open(gs_log) as log_in:
        log = log_in.read()
    assert_in('Done General and Mouse/Opossum/Human in ENSG00000111145', log)
def test_gapped_CNFGTR():
    """Check properties of a gapped CNFGTR fit on a three-taxon alignment.

    Asserts that the root motif probabilities are unchanged by the 'Human'
    substitution matrix, that scaled rate-matrix cells equal the 'A/C' and
    'indel' parameter values, and that the rate matrix divided by the motif
    probabilities is symmetric.
    """
    aln_path = os.path.join(get_data_dir(), 'ENSG00000100393.fasta.gz')
    aln = get_aln(aln_path, codon_position=-1, filter_gaps=False)
    tree = LoadTree(treestring='(Human,Mouse,Opossum);')
    doc = dict(aln=str(aln), tree=str(tree))
    cnfgtr_result = gapped.ml(doc, model='CNFGTR', model_gaps=True,
                              omega_indep=False, indel_indep=False)

    model = lambda: gapped.CNFGTR(optimise_motif_probs=True, model_gaps=True)
    cnfgtr = gapped.inflate_likelihood_function(cnfgtr_result['lf'], model)

    root_pi = cnfgtr.getMotifProbsByNode()['root'].asarray()
    P = cnfgtr.getPsubForEdge('Human')
    assert_almost_equal(root_pi.dot(P), root_pi)

    # The omega value is looked up but not compared against anything.
    cnfgtr.getParamValue('omega')
    pi = cnfgtr.getMotifProbs()
    Q = cnfgtr.getRateMatrixForEdge('Human')

    # Probability of the third position given a 'CC' prefix.
    cc_total = sum(pi['CC' + c] for c in 'ACGT')
    cond_ccg = pi['CCG'] / cc_total
    cond_ccc = pi['CCC'] / cc_total

    ref_cell = Q['CCT']['CCG'] / cond_ccg
    assert_almost_equal(Q['CCA']['CCC'] / cond_ccc / ref_cell,
                        cnfgtr.getParamValue('A/C'))
    assert_almost_equal(Q['---']['CCC'] / pi['CCC'] / ref_cell,
                        cnfgtr.getParamValue('indel'))

    R = Q.asarray() / pi.asarray()
    assert_almost_equal(R.T, R)
def testClock(self):
    ''' clock should fit a clock type model '''
    datadir = data.get_data_dir()
    aln = os.path.join(datadir, 'aln.fasta')
    tree = os.path.join(datadir, 'tree.nwk')
    correct_result = os.path.join(datadir, 'MG94GTRClock.txt')
    test_result = os.path.join(self.tempdir, 'MG94GTRClock.txt')
    args = ['clock', '--model', 'MG94GTR', aln, tree, 'Mouse', test_result]
    runner = click.testing.CliRunner()
    result = runner.invoke(cli.main, args)
    # CliRunner captures exceptions raised by the command, so check the exit
    # code here; otherwise a failed run only shows up as a less informative
    # file comparison failure below.
    assert result.exit_code == 0, result.output
    compare_files(test_result, correct_result)
def testFit(self):
    ''' bootstrap should reproduce the stored result from a saved fit '''
    datadir = data.get_data_dir()
    fit_result = os.path.join(datadir, 'MG94GTR.json')
    correct_result = os.path.join(datadir, 'MG94GTR.bootstrap')
    test_result = os.path.join(self.tempdir, 'MG94GTR.bootstrap')
    args = ['bootstrap', '--num_bootstraps', '1', fit_result, test_result]
    runner = click.testing.CliRunner()
    result = runner.invoke(cli.main, args)
    # CliRunner captures exceptions raised by the command, so check the exit
    # code here; otherwise a failed run only shows up as a less informative
    # file comparison failure below.
    assert result.exit_code == 0, result.output
    compare_files(test_result, correct_result)
def test_distribution():
    """distribution should return empirical distribution for DNA sequence"""
    fasta_path = os.path.join(get_data_dir(), 'General_1031.fasta.gz')
    with GzipFile(fasta_path) as fasta_file:
        raw = fasta_file.read()

    mouse_aln = Alignment(data=raw).takeSeqs(('Mouse', ))
    observed = jsd.distribution(mouse_aln.getSeq('Mouse'))

    # Reference value: motif probabilities a GTR likelihood function takes
    # from the same single-sequence alignment.
    single_tip_tree = LoadTree(tip_names=('Mouse', ))
    lf = GTR().makeLikelihoodFunction(single_tip_tree)
    lf.setMotifProbsFromData(mouse_aln)
    expected = lf.getMotifProbs()

    assert_array_almost_equal(array(expected), array(observed))
def testFit(self):
    ''' omega should fit a model with omega constraints'''
    datadir = data.get_data_dir()
    aln = os.path.join(datadir, 'aln.fasta')
    tree = os.path.join(datadir, 'tree.nwk')
    correct_result = os.path.join(datadir, 'Y98.txt')
    test_result = os.path.join(self.tempdir, 'Y98.txt')
    args = [
        'omega', '--model', 'Y98', '--outgroup', 'Mouse', aln, tree,
        test_result
    ]
    runner = click.testing.CliRunner()
    result = runner.invoke(cli.main, args)
    # CliRunner captures exceptions raised by the command, so check the exit
    # code here; otherwise a failed run only shows up as a less informative
    # file comparison failure below.
    assert result.exit_code == 0, result.output
    compare_files(test_result, correct_result)
def generate_alignments(alns=None):
    """Simulate alignments from stored likelihood functions and save them.

    For every ``(model, aln_len)`` pair, the module-level flattened
    likelihood function named ``'_' + model`` is inflated,
    ``simulateAlignment(aln_len)`` is called on it, and the FASTA output is
    written to ``<model>_<aln_len>.fasta.gz`` in the data directory.

    alns: optional iterable of ``(model, aln_len)`` pairs. When omitted,
        ``GTRClockTest`` and ``GeneralBen`` are generated with length 100000.

    Returns 0.
    """
    from gzip import GzipFile
    from data import get_data_dir
    from os.path import join

    if alns is None:
        alns = [('GTRClockTest', 100000), ('GeneralBen', 100000)]

    for model, aln_len in alns:
        # Look the fixture up by name in the module namespace rather than
        # evaluating a constructed expression.
        flat_lf = globals()['_' + model]
        lf = nest.inflate_likelihood_function(flat_lf)
        aln = lf.simulateAlignment(aln_len)
        filename = '_'.join((model, str(aln_len))) + '.fasta.gz'
        with GzipFile(join(get_data_dir(), filename), 'w') as aln_file:
            aln_file.write(aln.toFasta())
    return 0
def get_aln(model, aln_len):
    """Load the stored alignment for ``model`` and ``aln_len``.

    Reads ``<model>_<aln_len>.fasta.gz`` from the data directory and returns
    its contents wrapped in an ``Alignment``.
    """
    fasta_name = model + '_' + str(aln_len) + '.fasta.gz'
    fasta_path = os.path.join(get_data_dir(), fasta_name)
    with GzipFile(fasta_path) as fastafile:
        contents = fastafile.read()
    return Alignment(data=contents)