Example #1
0
 def test_seqgen_anc(self):
     """Run ``sim`` with SEQGEN, ``self.anc`` as the sequence and an explicit
     output path, then check the file exists and the path is returned."""
     target = os.path.join(
         PATH, 'tests', 'data', 'sim', 'seqgen.output.simulation.tsv')
     produced = sim(
         SEQGEN, self.tree, sequence=self.anc, outfile=target, n=10)

     self.assertTrue(os.path.isfile(produced))
     self.assertEqual(target, produced)
     # Remember the generated file on the instance (presumably consumed by
     # the fixture's cleanup -- confirm against tearDown).
     self.rm = target
Example #2
0
 def test_seqgen_msa(self):
     """Run ``sim`` with SEQGEN and ``self.msa`` as the sequence; the result
     must be an existing file located at ``self.out``."""
     produced = sim(
         SEQGEN, self.tree, sequence=self.msa, outfile=self.out, n=10)

     self.assertTrue(os.path.isfile(produced))
     self.assertEqual(produced, self.out)
     # Remember the generated file on the instance (presumably consumed by
     # the fixture's cleanup -- confirm against tearDown).
     self.rm = produced
Example #3
0
 def test_seqgen_default(self):
     """Run ``sim`` with SEQGEN given only a length (no input sequence); the
     result must be an existing file located at ``self.out``."""
     produced = sim(SEQGEN, self.tree, length=100, outfile=self.out)

     self.assertTrue(os.path.isfile(produced))
     self.assertEqual(produced, self.out)
     # Remember the generated file on the instance (presumably consumed by
     # the fixture's cleanup -- confirm against tearDown).
     self.rm = produced
Example #4
0
 def test_evolver_anc(self):
     """Run ``sim`` with EVOLVER and ``self.anc`` as the sequence; the result
     must be an existing file located at ``self.out``."""
     produced = sim(
         EVOLVER, self.tree, sequence=self.anc, outfile=self.out)

     self.assertTrue(os.path.isfile(produced))
     self.assertEqual(produced, self.out)
     # Remember the generated file on the instance (presumably consumed by
     # the fixture's cleanup -- confirm against tearDown).
     self.rm = produced
Example #5
0
 def test_evolver_out(self):
     """Run ``sim`` with EVOLVER given a length and an explicit ``outfile``;
     the result must be an existing file located at ``self.out``."""
     produced = sim(EVOLVER, self.tree, length=100, outfile=self.out)

     self.assertTrue(os.path.isfile(produced))
     self.assertEqual(produced, self.out)
     # Remember the generated file on the instance (presumably consumed by
     # the fixture's cleanup -- confirm against tearDown).
     self.rm = produced
Example #6
0
 def test_evolver_default(self):
     """Run ``sim`` with EVOLVER without passing ``outfile``; only the
     existence of the returned path is checked."""
     produced = sim(EVOLVER, self.tree, length=100)

     self.assertTrue(os.path.isfile(produced))
     # Remember the generated file on the instance (presumably consumed by
     # the fixture's cleanup -- confirm against tearDown).
     self.rm = produced
Example #7
0
def imc(sequence,
        tree='',
        aligner='',
        ancestor='',
        simulator='',
        asr_model='JTT',
        exp_model='JTT',
        n=100,
        divergent=True,
        indpairs=True,
        threshold=0.0,
        exp_prob=False,
        verbose=False):
    """
    Identify molecular parallel and convergent changes.

    The observed (or, for simulation input, expected) changes are counted,
    optionally complemented with expectations obtained from a simulator, and
    written to ``<basename>.counts.tsv`` and ``<basename>.details.tsv`` where
    ``<basename>`` is derived from the sequence file returned by the
    preprocessing step.

    :param sequence: str, path to the sequence data file. Sequence data file
        here covers a wide range of files and formats:

        * sequences: raw protein sequence file, needs to be in FASTA format
          and a NEWICK format tree is also required for argument tree.
        * msa: multiple sequence alignment file, needs to be in FASTA format
          and a NEWICK format tree is also required for argument tree.
        * ancestors: reconstructed ancestral states file, needs to be a tsv
          (tab separated) file, the first line needs to start with #TREE,
          the second line needs to be a blank line, and the remaining lines
          need to be tab separated sequence name (or ID) and amino acid
          sequences.
        * simulations: simulated sequences, need to be in a tsv file, the
          first line needs to start with #TREE, the second line needs to be
          a blank line, each dataset needs to be separated by a blank line
          and inside each dataset block, each line should consist of tab
          separated sequence name (or ID) and amino acid sequences.

    :param tree: str, NEWICK format tree string or tree file. This needs to
        be set according to argument sequence. If sequence is a raw sequence
        file or MSA file, tree is required for guiding ancestral states
        reconstruction. If sequence is ancestors or simulations, then tree is
        not necessary.
    :param aligner: str, executable of an alignment program.
    :param ancestor: str, executable of an ancestral states reconstruction
        program.
    :param simulator: str, executable of a sequence simulation program.
    :param asr_model: str, model name or model file for ancestral states
        reconstruction, default: JTT.
    :param exp_model: str, model name or model file for estimating expected
        changes based on simulation or replacement probability manipulation,
        default: JTT.
    :param n: int, number of datasets (or duplicates) that should be
        simulated.
    :param divergent: bool, identify divergent changes if True, or only
        identify parallel and convergent changes if False.
        NOTE(review): this argument is accepted but never referenced in the
        body of this function.
    :param indpairs: bool, only identify changes for independent branch pairs
        if True, or identify changes for all branch pairs if False.
    :param threshold: float, a probability threshold that ranges from 0.0 to
        1.0. If provided, only ancestral states with probability equal to or
        larger than the threshold will be used, default: 0.0.
    :param exp_prob: bool, calculate the probability of expected changes if
        set to True and the exp_model contains a probability matrix. Only
        consulted when no simulator is given. Time consuming process, be
        patient for the calculation.
    :param verbose: bool, invoke verbose or silent process mode,
        default: False, silent mode.
    :return: tuple ``(pars, cons, divs, details, length)``: three mappings
        from branch pair to a list of counts (parallel, convergent and
        divergent changes respectively), a list of per-change detail records
        (six fields each, matching the columns Category, Position,
        BranchPair, R1, R2, Dataset) and the length (int) of the first tip
        sequence.
    :raises SystemExit: if ``sequence`` is not an existing file.
    """

    logger.setLevel(logging.INFO if verbose else logging.ERROR)

    if not os.path.isfile(sequence):
        error('Invalid sequence {}, sequence is not a file or dose not '
              'exist, exited.'.format(sequence))
        sys.exit(1)

    sequence = os.path.abspath(sequence)
    wd = os.path.dirname(sequence)

    # ``basename`` names the simulator output; ``basename_more`` (taken from
    # the sequence path handed back by _sequencing) names the result files.
    basename = utilities.basename(sequence)
    rs = _sequencing(sequence, tree, aligner, ancestor, wd, asr_model, verbose)
    tree, rates, records, aps, size, sequence = rs
    basename_more = utilities.basename(sequence)

    h1 = ['Category', 'BranchPair']
    h2 = ['Category', 'Position', 'BranchPair', 'R1', 'R2', 'Dataset']

    probs, pi = None, None
    if size == 1:
        h1.append('OBS')
        if exp_model:
            if simulator:
                # NOTE(review): these columns are declared before the
                # simulation runs; if it yields no output file below, the
                # header still lists EXP/SIM columns without matching data.
                h1.append('EXP')
                h1.extend(['SIM-{}'.format(i + 1) for i in range(n)])
            elif exp_prob:
                probs, pi = _load_matrix(exp_model)
                if probs is not None:
                    h1.append('EXP')
    else:
        h1.append('EXP')
        h1.extend(['SIM-{}'.format(i + 1) for i in range(size)])

    # Every record whose key does not start with 'NODE' is treated as a tip.
    tips = [v[0] for k, v in records.items() if not k.startswith('NODE')]
    length = len(tips[0])
    if size > 1:
        info('Estimating expected changes ... ')
    else:
        info('Identifying observed changes ...')
    tree, pars, cons, divs, details = _pc(tree, rates, records, aps, size,
                                          length, probs, pi, indpairs,
                                          threshold)

    if size == 1 and simulator:
        freq = _frequencing(tips, site=False)
        ts = tree.format('newick').strip()
        out = '{}.{}.tsv'.format(basename, sim._guess(simulator)[0])

        simulated = sim.sim(simulator,
                            ts,
                            model=exp_model,
                            length=length,
                            freq=freq,
                            n=n,
                            outfile=out,
                            verbose=verbose)

        if simulated and os.path.isfile(simulated):
            sim_tree, sim_rates, sim_records, sim_aps, sim_size = _load(
                simulated)
            info('Estimating expected changes ... ')
            _, par, con, div, detail = _pc(sim_tree, sim_rates, sim_records,
                                           sim_aps, sim_size, length, None,
                                           None, indpairs, threshold)

            # For each branch pair: first the mean over all simulated
            # datasets (the EXP column), then the individual per-dataset
            # counts (the SIM-* columns).
            for k, v in par.items():
                pars[k].append(np.mean(v))
                cons[k].append(np.mean(con[k]))
                divs[k].append(np.mean(div[k]))
                pars[k].extend(v)
                cons[k].extend(con[k])
                divs[k].extend(div[k])
            details.extend(detail)

    if any([pars, cons, divs, details]):
        info('Writing identified parallel and convergent amino acid '
             'replacements to files.')
        counts = ''.join([basename_more, '.counts.tsv'])
        changes = ''.join([basename_more, '.details.tsv'])

        def _fmt(value):
            # Floats (np.float64 is a float subclass) are written with four
            # decimals; everything else is written via str().
            if isinstance(value, float):
                return '{:.4f}'.format(value)
            return str(value)

        with open(counts, 'w') as o, open(changes, 'w') as c:
            o.write('{}\n'.format('\t'.join(h1)))
            # Row order in the counts file: all P rows, then C, then D.
            for category, table in (('P', pars), ('C', cons), ('D', divs)):
                o.writelines(
                    '{}\t{}\t{}\n'.format(category, k,
                                          '\t'.join([_fmt(x) for x in v]))
                    for k, v in table.items())

            c.write('{}\n'.format('\t'.join(h2)))
            c.writelines('{}\t{}\t{}\t{}\t{}\t{}\n'.format(*detail)
                         for detail in details)

    return pars, cons, divs, details, length