Ejemplo n.º 1
0
 def test_tree_internal_file_to_file(self):
     # Write the tree loaded from self.tree_nodes to e.newick with default
     # options and compare the file content: the expected string carries
     # branch lengths but no internal node labels.
     path = os.path.join(DATA, 'e.newick')
     expected = ('(A:0.20000,(B:0.30000,(C:0.40000,D:0.50000)'
                 ':0.20000):0.10000):0.00000;')
     written = Tree(self.tree_nodes).file(path)
     self.assertEqual(expected, _read(written))
     # Remember the generated file; presumably the fixture teardown removes
     # it -- confirm against tearDown.
     self.rm = path
Ejemplo n.º 2
0
 def test_tree_confidence_file_to_file(self):
     # With ic2name=True and brlen=False the written tree is expected to
     # show the values 7, 6 and 5 in the internal node positions and no
     # branch lengths.
     target = os.path.join(DATA, 'g.newick')
     saved = Tree(self.tree_ic).file(target, ic2name=True, brlen=False)
     self.assertEqual('(A,(B,(C,D)7)6)5;', _read(saved))
     # Remember the generated file; presumably the fixture teardown removes
     # it -- confirm against tearDown.
     self.rm = target
Ejemplo n.º 3
0
 def test_tree_internal_brlen_file_to_file(self):
     # Writing with brlen=False is expected to produce a bare topology:
     # no branch lengths and no internal node labels.
     destination = os.path.join(DATA, 'f.newick')
     saved = Tree(self.tree_nodes).file(destination, brlen=False)
     content = _read(saved)
     self.assertEqual('(A,(B,(C,D)));', content)
     # Remember the generated file; presumably the fixture teardown removes
     # it -- confirm against tearDown.
     self.rm = destination
Ejemplo n.º 4
0
 def test_tree_brlen_file_to_file(self):
     # With brlen=False and nodes=True the written tree is expected to keep
     # the internal node labels (E, F, root) and drop branch lengths.
     out_path = os.path.join(DATA, 'd.newick')
     options = {'brlen': False, 'nodes': True}
     saved = Tree(self.tree_nodes).file(out_path, **options)
     self.assertEqual('(A,(B,(C,D)E)F)root;', _read(saved))
     # Remember the generated file; presumably the fixture teardown removes
     # it -- confirm against tearDown.
     self.rm = out_path
Ejemplo n.º 5
0
 def test_leave(self):
     # Appending 'abc' to self.s is expected to make the input unparsable;
     # with leave=True the constructor must then exit with status code 1.
     broken = self.s + 'abc'
     with self.assertRaises(SystemExit) as caught:
         Tree(broken, leave=True)
     exit_code = caught.exception.code
     self.assertEqual(exit_code, 1)
Ejemplo n.º 6
0
 def test_topology_internal_file_to_file(self):
     # Writing the tree loaded from self.top_nodes with default options is
     # expected to give a bare topology without internal node labels.
     path = os.path.join(DATA, 'b.newick')
     written = Tree(self.top_nodes).file(path)
     content = _read(written)
     self.assertEqual('(A,(B,(C,D)));', content)
     # Remember the generated file; presumably the fixture teardown removes
     # it -- confirm against tearDown.
     self.rm = path
Ejemplo n.º 7
0
 def test_tree_internal_string(self):
     # The default string form of the tree built from self.s_nodes is
     # expected to carry branch lengths but no internal node labels.
     expected = ('(A:0.20000,(B:0.30000,(C:0.40000,D:0.50000):'
                 '0.20000):0.10000):0.00000;')
     rendered = Tree(self.s_nodes).string()
     self.assertEqual(expected, rendered)
Ejemplo n.º 8
0
def sim(exe,
        tree='',
        sequence='',
        model='JTT',
        length=100,
        freq='empirical',
        n=100,
        seed=0,
        gamma=4,
        alpha=0.5,
        invp=0,
        outfile='',
        verbose=False):
    """
    Simulate amino acid sequences with an external simulation program.

    The backend function that actually runs the simulation is selected by
    ``_guess(exe)``. If the output file already exists, the simulation is
    skipped and the existing path is returned.

    :param exe: str, path to the executable of the simulation program.
    :param tree: str, the tree to simulate along (must have branch lengths,
        NEWICK format). If not provided, the sequence file needs to be a tsv
        file containing a tree with branch lengths and sequences. If both
        a tree and a tsv sequence file are provided, the tree in the tsv
        file is ignored.
    :param sequence: str, path to a multiple sequence alignment file in FASTA
        format or a tsv file generated by function `asr()` that has a line
        containing a tree with branch lengths. If provided, the length and
        amino acid frequencies are obtained from it via `_seq2info()`.
    :param model: str, name of a model or filename of a model file.
    :param length: int, the number of amino acid sites to simulate,
        default: 100. If set to 0 (or any false value), the length obtained
        from the sequence file is used instead.
    :param freq: str, "empirical" (use the frequencies of the model),
        "estimate" (use the frequencies obtained from the sequence file),
        "equal" (0.05 for every amino acid), or a comma separated string of
        frequencies of 20 amino acids in the order of
        "ARNDCQEGHILKMFPSTWYV" (the string has to start with "0").
    :param n: int, number of datasets (or duplicates) to simulate.
    :param seed: int, the seed used to initiate the random number generator.
        A false or invalid value is replaced with a random integer between
        0 and 10000.
    :param gamma: int, 0 means discrete Gamma model not in use, any
        positive integer larger than 1 will invoke discrete Gamma model and
        set the number of categories to gamma.
    :param alpha: float, the Gamma shape parameter alpha.
    :param invp: float, proportion of invariable sites.
    :param outfile: str, pathname of the simulation output file. If not set,
        `[name].simulations.tsv` in the current working directory is used,
        where name is the program name returned by `_guess(exe)`.
    :param verbose: bool, invoke verbose or silent process mode,
        default: False, silent mode.
    :return: str, path to the simulation output file.
    """

    logger.setLevel(logging.INFO if verbose else logging.ERROR)

    if tree:
        t = Tree(tree, leave=True)
        if not t.length:
            error('Unscaled tree: {}, cannot simulate sequences without branch '
                  'lengths.'.format(tree))
            sys.exit(1)
        if sequence:
            # Only length and frequencies are taken from the sequence file,
            # any tree stored in it is deliberately ignored.
            _, l, freqs = _seq2info(sequence)
        else:
            l, freqs = 0, None

    elif sequence:
        # NOTE(review): the tree extracted here (t) is never forwarded; the
        # backend below still receives the empty `tree` argument. Forwarding
        # t looks intended, but the type returned by _seq2info() is not
        # visible from here -- confirm before changing.
        t, l, freqs = _seq2info(sequence)
    else:
        error('Neither tree or a sequence file contains a tree was provided, '
              'simulation aborted.')
        sys.exit(1)

    if length:
        try:
            length = int(length)
        except (TypeError, ValueError):
            error('Invalid length, length should be a integer.')
            sys.exit(1)
    else:
        if l:
            length = l
        else:
            error('Neither valid sequence nor argument length was specified, '
                  'failed to obtain length, simulation aborted.')
            sys.exit(1)

    # fs stays None whenever the default frequencies of the model should be
    # used by the backend.
    fs = None
    if freq:
        if freq == 'equal':
            fs = ['0.05'] * 20
        elif freq == 'estimate':
            fs = freqs
            if not fs:
                warn('Failed to get observed amino acid frequency from '
                     'sequence, use the default frequency of model instead.')
        elif freq == 'empirical':
            fs = None
        elif freq.startswith('0') and freq.count(',') == 19:
            fs = [i.strip() for i in freq.split(',')]
            try:
                total = sum(float(s) for s in fs)
            except ValueError:
                error('Specified frequencies contain non-numeric value, '
                      'simulation aborted.')
                sys.exit(1)
            # Reject deviations in both directions; a sum larger than 1.0 is
            # just as invalid as a sum smaller than 1.0.
            if abs(1 - total) > 0.000001:
                error('Specified frequencies do not add up to 1.0, simulation '
                      'aborted.')
                sys.exit(1)
        else:
            warn('Unknown frequency encounter, use the default frequency of '
                 'model instead.')
            fs = None

    try:
        seed = int(seed) if seed else random.randint(0, 10000)
    except (TypeError, ValueError):
        warn('Invalid seed, use generated random number instead.')
        seed = random.randint(0, 10000)

    name, func = _guess(exe)
    if not outfile:
        outfile = os.path.join(os.getcwd(), '{}.simulations.tsv'.format(name))

    if os.path.isfile(outfile):
        info('Found pre-existing simulated sequences.')
    else:
        outfile = func(exe, tree, length, fs, model, n, seed, gamma, alpha,
                       invp, outfile)
    return outfile
Ejemplo n.º 9
0
 def test_tree_string(self):
     # The default string form of the tree built from self.s is expected to
     # be the NEWICK string with five-decimal branch lengths.
     expected = ('(A:0.20000,(B:0.30000,(C:0.40000,D:0.50000)'
                 ':0.20000):0.10000):0.00000;')
     rendered = Tree(self.s).string()
     self.assertEqual(expected, rendered)
Ejemplo n.º 10
0
 def test_tree_brlen_string(self):
     # brlen=False together with nodes=True is expected to keep the internal
     # node labels (E, F, root) while dropping branch lengths.
     rendered = Tree(self.s_nodes).string(brlen=False, nodes=True)
     self.assertEqual('(A,(B,(C,D)E)F)root;', rendered)
Ejemplo n.º 11
0
 def test_topology_internal_string(self):
     # A topology given with internal node labels is expected to lose those
     # labels in the default string form.
     labelled = '(A,(B,(C,D)E)F)root;'
     rendered = Tree(labelled).string()
     self.assertEqual('(A,(B,(C,D)));', rendered)
Ejemplo n.º 12
0
 def test_topology_string(self):
     # A bare topology string is expected to round-trip unchanged.
     newick = '(A,(B,(C,D)));'
     rendered = Tree(newick).string()
     self.assertEqual('(A,(B,(C,D)));', rendered)
Ejemplo n.º 13
0
 def test_topology_internal_file(self):
     # The tree loaded from self.top_nodes is expected to render as a bare
     # topology without internal node labels.
     rendered = Tree(self.top_nodes).string()
     self.assertEqual('(A,(B,(C,D)));', rendered)
Ejemplo n.º 14
0
def _evolver(exe, tree, length, freq, model, n, seed, gamma, alpha, invp,
             outfile):
    """
    Sequence simulation via EVOLVER.

    EVOLVER is run inside a temporary directory created next to outfile;
    the directory is always removed before this function returns or exits.

    :param exe: str, path to the executable of EVOLVER.
    :param tree: str, path to the tree (must have branch lengths and be in
        NEWICK format).
    :param length: int, the number of amino acid sites to simulate.
    :param freq: list or None, frequencies of 20 amino acids. None means the
        frequencies of the model are used.
    :param model: str, name of a model or filename of a model file.
    :param n: int, number of datasets (or duplicates) to simulate.
    :param seed: int, the seed used to initiate the random number generator.
    :param gamma: int, 0 means discrete Gamma model not in use, any
        positive integer larger than 1 will invoke discrete Gamma model and
        set the number of categories to gamma.
    :param alpha: float, the Gamma shape parameter alpha.
    :param invp: float, proportion of invariable sites. Accepted for
        signature compatibility, it is not used by this function.
    :param outfile: str, pathname of the simulation output file. The first
        line starts with '#TREE' followed by a TAB and a NEWICK tree; each
        simulated dataset follows as TAB separated ID and sequence lines
        terminated by a blank line.
    :return: str, path to the simulation output file, or an empty string if
        the results could not be saved.
    """

    wd = os.path.dirname(outfile)
    cwd = tempfile.mkdtemp(dir=wd)
    dat = 'MCaa.dat'

    # Everything after creating the temporary directory lives inside this
    # try block, so the directory is removed even when sys.exit() is called
    # (e.g. for an invalid tree or an unsupported model).
    try:
        tree = Tree(tree, leave=True)
        tn, ts = tree.leaves, tree.string()
        m = modeling(model)
        if m.type == 'custom':
            mf = m.name
        else:
            name = m.name
            if name.lower() not in models:
                error('PAML (evolver) does not support model {}.'.format(name))
                sys.exit(1)
            info('Using {} model for simulation.'.format(name))
            # Write the model content into the working directory so the
            # control file can refer to it by name.
            with open(os.path.join(cwd, name), 'w') as o:
                o.write(models[name.lower()])
            mf = name

        if freq is None:
            mn = 2
            f1, f2 = '', ''
        else:
            mn = 3
            # The 20 frequencies are split into two lines of 10 values.
            f1 = ' '.join([str(i) for i in freq[:10]])
            f2 = ' '.join([str(i) for i in freq[10:]])

        with open(os.path.join(cwd, dat), 'w') as o:
            o.write(
                MC_DAT.format(seed, tn, length, n, ts, alpha, gamma, mn, mf,
                              f1, f2))

        try:
            info('Simulating sequences using EVOLVER.')
            log = os.path.join(cwd, 'simulation.log')
            with open(log, 'w') as stdoe:
                process = Popen([exe, '7', dat],
                                cwd=cwd,
                                stdout=stdoe,
                                stderr=stdoe,
                                universal_newlines=True)
                code = process.wait()
            if code:
                with open(log) as handle:
                    error('Sequence simulation via EVOLVER failed for {} due '
                          'to:\n{}'.format(tree,
                                           indent(handle.read(), prefix='\t')))
                sys.exit(1)

            info('Parsing and saving simulation results.')
            simulations, tree = _evolver_parse(cwd)

            try:
                with open(outfile, 'w') as o:
                    o.write('#TREE\t{}\n'.format(tree.format('newick')))
                    for simulation in simulations:
                        o.writelines('{}\t{}\n'.format(s.id, s.seq)
                                     for s in simulation)
                        o.write('\n')
            except OSError:
                error('Failed to save simulation results to {} ('
                      'IOError, permission denied).'.format(outfile))
                outfile = ''
            else:
                # Only report success when the file was actually written.
                info('Successfully saved simulation results to '
                     '{}'.format(outfile))
        except OSError:
            error('Invalid PAML (EVOLVER) executable {}, running EVOLVER '
                  'failed for {}.'.format(exe, tree))
            sys.exit(1)
    finally:
        shutil.rmtree(cwd)
    return outfile
Ejemplo n.º 15
0
 def test_tree_internal_brlen_string(self):
     # brlen=False alone is expected to give a bare topology: no branch
     # lengths and no internal node labels.
     rendered = Tree(self.s_nodes).string(brlen=False)
     self.assertEqual('(A,(B,(C,D)));', rendered)
Ejemplo n.º 16
0
def _seqgen(exe, tree, length, freq, model, n, seed, gamma, alpha, invp,
            outfile):
    """
    Sequence simulation via Seq-Gen.

    Seq-Gen is run inside a temporary directory created next to outfile;
    the directory is always removed before this function returns or exits.

    :param exe: str, path to the executable of Seq-Gen.
    :param tree: str, path to the tree (must have branch lengths and be in
        NEWICK format).
    :param length: int, the number of amino acid sites to simulate.
    :param freq: list or None, frequencies of 20 amino acids. If None and a
        custom model file is in use, the frequencies are read from line 22
        of the model file.
    :param model: str, name of a model or filename of a model file.
    :param n: int, number of datasets (or duplicates) to simulate.
    :param seed: int, the seed used to initiate the random number generator.
    :param gamma: int, number of categories for the discrete Gamma model, a
        false value leaves the option off the command line.
    :param alpha: float, the Gamma shape parameter alpha, a false value
        leaves the option off the command line.
    :param invp: float, proportion of invariable sites. Accepted for
        signature compatibility, it is not forwarded to Seq-Gen here.
    :param outfile: str, pathname of the simulation output file. The first
        line starts with '#TREE' followed by a TAB and a NEWICK tree; each
        simulated dataset follows as TAB separated ID and sequence lines,
        datasets are separated by a blank line.
    :return: str, path to the simulation output file, or an empty string if
        the results could not be saved.
    """

    wd = os.path.dirname(outfile)
    cmd = [exe, '-l{}'.format(length), '-n{}'.format(n), '-z{}'.format(seed)]
    m = modeling(model)
    if m.type == 'custom':
        with open(m.name) as handle:
            lines = handle.readlines()
        # The first 19 lines of the model file are joined into one comma
        # separated list of rates.
        r = [line.strip() for line in lines[:19]]
        r = re.sub(r'\s+', ',', ','.join(r))
        cmd.append('-r{}'.format(r))
        if freq is None:
            try:
                line = lines[21]
            except IndexError:
                error('Invalid model file {}, Line 22 (amino acid '
                      'frequencies) does not exist in model file.'
                      .format(m.name))
                sys.exit(1)
            # Convert the whitespace separated frequencies into a comma
            # separated list (joining the raw string would put a comma
            # between every single character).
            cmd.append('-f{}'.format(re.sub(r'\s+', ',', line.strip())))
    else:
        cmd.append('-m{}'.format(m.name.upper()))
    if freq:
        cmd.append('-f{}'.format(','.join([str(i) for i in freq])))

    if gamma:
        cmd.append('-g{}'.format(gamma))
    if alpha:
        cmd.append('-a{}'.format(alpha))
    cmd.extend(['-wa', '-q'])

    cwd = tempfile.mkdtemp(dir=wd)
    # Everything after creating the temporary directory lives inside this
    # try block, so the directory is removed on every exit path.
    try:
        output = os.path.join(cwd, 'output.phylip')
        tree = Tree(tree).file(os.path.join(cwd, 'tree.newick'))

        try:
            info('Simulating sequences using Seq-Gen.')
            with open(output, 'w') as stdout, open(tree) as stdin:
                process = Popen(cmd,
                                cwd=cwd,
                                stdout=stdout,
                                stdin=stdin,
                                stderr=PIPE,
                                universal_newlines=True)
                # communicate() drains the stderr pipe while waiting, a bare
                # wait() could block forever once the pipe buffer is full.
                _, stderr = process.communicate()
            if process.returncode:
                msg = indent(stderr, prefix='\t')
                error('Sequence simulation via Seq-Gen failed due to:'
                      '\n{}'.format(msg))
                sys.exit(1)

            info('Parsing and saving simulation results.')
            tree = Phylo.read(tree, 'newick')
            # Internal nodes are numbered consecutively after the leaves and
            # named NODE<number>.
            number = tree.count_terminals()
            for clade in tree.find_clades():
                if not clade.is_terminal():
                    number += 1
                    clade.name = 'NODE{}'.format(number)
            try:
                with open(outfile, 'w') as o:
                    o.write('#TREE\t{}\n'.format(tree.format('newick')))
                    with open(output) as f:
                        for line in f:
                            if line.strip():
                                i, s = line.strip().split()
                                if i.isdigit() and s.isdigit():
                                    # Two numbers form the header line of a
                                    # dataset, replace it by a blank line.
                                    o.write('\n')
                                else:
                                    # A purely numeric ID is written with
                                    # the NODE prefix.
                                    if i.isdigit():
                                        i = 'NODE{}'.format(i)
                                    o.write('{}\t{}\n'.format(i, s))
            except OSError:
                error('Failed to save simulation results to {} ('
                      'IOError, permission denied).'.format(outfile))
                outfile = ''
            else:
                # Only report success when the file was actually written.
                info('Successfully saved simulation results to '
                     '{}'.format(outfile))
        except OSError:
            error('Invalid Seq-Gen executable {}, running Seq-Gen failed for '
                  '{}.'.format(exe, tree))
            sys.exit(1)
    finally:
        shutil.rmtree(cwd)
    return outfile
Ejemplo n.º 17
0
 def test_tree_confidence_string(self):
     # With ic2name=True and brlen=False the string form is expected to
     # show the values 7, 6 and 5 in the internal node positions and no
     # branch lengths.
     rendered = Tree(self.s_ic).string(ic2name=True, brlen=False)
     self.assertEqual('(A,(B,(C,D)7)6)5;', rendered)
Ejemplo n.º 18
0
def asr(exe, msa, tree, model, gamma=4, alpha=1.8, freq='',
        outfile='', verbose=False):
    """
    Entry point for (marginal) ancestral states reconstruction (ASR).

    Validates the inputs, selects the backend function via ``_guess(exe)``
    and delegates the reconstruction to it. If the output file already
    exists, the backend is not run and the existing path is returned.

    :param exe: str, path to the executable of an ASR program.
    :param msa: str, path to the MSA file (must be in FASTA format). The
        process exits with status 1 if it is not an existing file.
    :param tree: str, path to the tree file (must be in NEWICK format) or a
        NEWICK format tree string (must start with "(" and end with ";").
    :param model: str, substitution model for ASR. Either a path to a model
        file or a valid model string (name of an empirical model plus some
        other options like gamma category and equilibrium frequency option).
        The process exits with status 1 if model is not a string. The format
        of a model file depends on the ASR program, see its documentation.
    :param gamma: int, the number of categories for the discrete gamma rate
        heterogeneity model, handed to the backend unchanged.
    :param alpha: float, the shape (alpha) for the gamma rate heterogeneity,
        handed to the backend unchanged.
    :param freq: str, the base frequencies of the twenty amino acids
        (empirical or estimate), handed to the backend unchanged.
    :param outfile: str, path to the output file. Without setting, the name
        `[basename].[asrer].tsv` is used, where basename is derived from the
        MSA filename (a trailing `.trimmed.fasta` is dropped first) and
        asrer is the program name returned by `_guess(exe)`. The first line
        of the file starts with '#TREE' followed by a TAB and a NEWICK tree
        string with labeled internal nodes, the second line is blank, and
        the remaining lines are tab separated sequence IDs and sequences.
    :param verbose: bool, invoke verbose or silent (default) process mode.
    :return: str, path to the ancestral states file (the pre-existing file
        or whatever the backend function returns).

    .. note::

        If a tree with branch lengths and/or labeled internal nodes is
        provided, the branch lengths and internal node labels are ignored.

        For model names combined with Gamma category numbers (JTT+G4,
        WAG+G8, etc.) the handling of the category number depends on the
        ASR program in use.
    """

    logger.setLevel(logging.INFO if verbose else logging.ERROR)

    # Guard clause: nothing can be done without an existing MSA file.
    if not os.path.isfile(msa):
        error('Ancestral reconstruction aborted, msa {} is not a file or '
              'does not exist.'.format(msa))
        sys.exit(1)
    msa = os.path.abspath(msa)

    tree = Tree(tree, leave=True)

    if not isinstance(model, str):
        error('Ancestral reconstruction aborted, model {} is not a valid '
              'model name or model file.'.format(model))
        sys.exit(1)

    model = modeling(model)
    asrer, func = _guess(exe)

    if not outfile:
        trimmed = '.trimmed.fasta'
        stem = msa.replace(trimmed, '') if msa.endswith(trimmed) else msa
        outfile = '{}.{}.tsv'.format(basename(stem), asrer)

    if os.path.isfile(outfile):
        info('Found pre-existing ancestral state file.')
        return outfile
    return func(exe, msa, tree, model, gamma, alpha, freq, outfile)
Ejemplo n.º 19
0
 def test_log(self):
     # Without leave=True an unparsable input must not exit; instead the
     # first record captured on the '[iMC]' logger has to be the
     # "Invalid tree" error.
     broken = self.s + 'abc'
     with self.assertLogs('[iMC]', level='INFO') as captured:
         Tree(broken)
     first_record = captured.output[0]
     self.assertRegex(first_record, r'ERROR.*Invalid tree:.*a NEWICK.*.')