Example #1
0
def etoki():
    """Dispatch the EToKi sub-command named on the command line.

    ``sys.argv[1]`` names both a module under the ``modules`` package and the
    callable of the same name inside that module. The callable is invoked with
    the remaining arguments (``sys.argv[2:]``), and ``sys.argv[0]`` is
    rewritten to ``'<program> <command>'`` before the call.

    When no command is given, or the command cannot be imported, the usage
    text (built from the file-level ``commands`` sequence) is written to
    standard output instead. Import failures are reported through ``logger``
    first.
    """
    # Local import so this function does not depend on a file-level import.
    import importlib

    try:
        if len(sys.argv) <= 1:
            raise ValueError
        name = sys.argv[1]
        try:
            # Resolve the command with importlib instead of exec()/eval() on
            # the raw argument: exec() inside a function is not guaranteed to
            # make the imported name visible to a later eval(), and a
            # non-identifier argument (e.g. "-h") made exec() raise an
            # uncaught SyntaxError instead of showing the usage text.
            module = importlib.import_module('modules.{0}'.format(name))
            command = getattr(module, name)
        except (ImportError, AttributeError) as e:
            # AttributeError covers a module that lacks the same-named
            # callable, which "from m import x" reported as ImportError.
            logger(str(e))
            raise ValueError
        else:
            sys.argv[0] = ' '.join(sys.argv[:2])

    except ValueError:
        sys.stdout.write('''
Program: EToKi (Enterobase Tool Kit)

Usage:   EToKi.py <command> [options]

Commands:
''' + '\n'.join([
            '    {0} {1}'.format(cmd[0].ljust(12), cmd[1]) for cmd in commands
        ]) + '''
Use EToKi.py <command> -h to get help for each command.
''')
    else:
        # Run outside the try block so a ValueError raised by the command
        # itself is not mistaken for a usage error.
        command(sys.argv[2:])
Example #2
0
def writeCGAV(prefix, ortho, pCore):
    """Write the core gene allelic variation (CGAV) profile, distances and tree.

    Three files are produced next to ``prefix``:

    * ``<prefix>_CGAV.profile`` - tab-separated table, one row per genome and
      one column per core gene.
    * ``<prefix>_CGAV.dist`` - genome count followed by one row of
      ``getDistance`` output per genome.
    * ``<prefix>_CGAV.nwk`` - standard output of ``rapidnj -i pd`` on the
      distance file, with single quotes removed.

    Args:
        prefix: Path prefix for all output files.
        ortho: Mapping genome -> {gene -> {key -> value}}. A gene counts as
            present in a genome only when its inner mapping has exactly one
            entry whose value is positive; that value is what gets written to
            the profile (presumably an allele id - confirm with the caller).
        pCore: Minimum percentage (0-100) of genomes in which a gene must be
            present to be treated as a core gene.
    """
    genomes = sorted(ortho.keys())
    minPresence = len(ortho) * pCore / 100.

    # Count, per gene, the genomes that carry it as a single positive entry.
    presence = defaultdict(int)
    for gs in ortho.values():
        for g, entries in gs.items():
            if len(entries) == 1 and list(entries.values())[0] > 0:
                presence[g] += 1
    genes = sorted(g for g, count in presence.items() if count >= minPresence)

    profiles = []
    with open('{0}_CGAV.profile'.format(prefix), 'w') as fout:
        fout.write('\t'.join(['#Genome'] + genes) + '\n')
        for genome in genomes:
            grp = ortho[genome]
            row = []
            for g in genes:
                entries = grp.get(g, {})
                # Missing, multi-copy or non-positive entries are all
                # written as 0.
                if len(entries) == 1:
                    row.append(max(list(entries.values())[0], 0))
                else:
                    row.append(0)
            profiles.append(row)
            fout.write('{0}\t{1}\n'.format(
                genome, '\t'.join(str(allele) for allele in row)))

    dist_file = '{0}_CGAV.dist'.format(prefix)
    distances = getDistance(np.array(profiles))
    with open(dist_file, 'w') as fout:
        fout.write('    {0}\n'.format(len(genomes)))
        for genome, dist in zip(genomes, distances):
            fout.write('{0} {1}\n'.format(genome, ' '.join(dist.astype(str))))

    # Run rapidnj without a shell so that a prefix containing spaces or shell
    # metacharacters is passed through intact. The configured executable
    # string is split on whitespace so it may carry its own options.
    command = externals['rapidnj'].split() + ['-i', 'pd', dist_file]
    # communicate() drains both pipes; wait() on a process with a piped
    # stderr can block forever once the pipe buffer fills up.
    newick = subprocess.Popen(command,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              universal_newlines=True).communicate()[0]
    with open('{0}_CGAV.nwk'.format(prefix), 'w') as fout:
        # Same effect as the former `sed "s/'//g"` filter.
        fout.write(newick.replace("'", ''))

    logger('Core gene allelic variation profile is saved in {0}_CGAV.profile'.
           format(prefix))
    logger('Core gene allelic variation tree is saved in {0}_CGAV.nwk'.format(
        prefix))
Example #3
0
def splitGFF(gff, folder, prefix):
    """Split a combined GFF file into one gzipped GFF file per header.

    Every non-comment line is routed by the text before its first ``:``.
    Consecutive lines sharing that text go to
    ``<folder>/<basename of prefix>.<header>.gff.gz``, which starts with a
    fixed two-line GFF3 preamble. A header that reappears later in the input
    reopens its file in ``'w'`` mode.

    Args:
        gff: Path of the combined GFF file, opened through ``uopen``.
        folder: Output directory; created when it does not exist.
        prefix: Output prefix; only the part after the last ``/`` is used in
            the file names.

    Raises:
        FileExistsError: If the output folder cannot be created because the
            path already exists as something other than a directory.
    """
    if not os.path.isdir(folder):
        try:
            os.makedirs(folder)
        except FileExistsError:
            raise FileExistsError(
                'Fail to create the output folder for GFF files')

    basename = prefix.rsplit('/', 1)[-1]
    prev, fout = None, None
    try:
        with uopen(gff) as fin:
            for line in fin:
                if line.startswith('#'):
                    continue
                header = line.split(':', 1)[0]
                if header != prev:
                    prev = header
                    if fout is not None:
                        fout.close()
                        # Reset so the finally clause does not touch a stale
                        # handle if opening the next file fails.
                        fout = None
                    fout = uopen(
                        os.path.join(
                            folder,
                            '{0}.{1}.gff.gz'.format(basename, header)), 'w')
                    fout.write(
                        '#!gff-version 3\n#!annotation-source PEPPA from enterobase.warwick.ac.uk\n'
                    )
                fout.write(line)
    finally:
        # fout stays None when the input holds no data lines; closing it
        # unconditionally raised AttributeError. The finally clause also
        # releases the handle when reading or writing fails part-way.
        if fout is not None:
            fout.close()
    logger('GFF files are saved under folder {0}'.format(folder))
Example #4
0
def writeTree(prefix, ortho):
    """Build a gene presence/absence tree and save it as Newick.

    Each genome is encoded as a pseudo-nucleotide sequence over the sorted
    union of all genes (``T`` = gene present, ``A`` = absent) and written to
    ``<prefix>_content.fas``. The configured ``fasttree`` executable is run
    on that file with ``-quiet -nt``. Every branch length of the resulting
    tree is multiplied by the number of genes, and the tree is written to
    ``<prefix>_content.nwk``.

    Args:
        prefix: Path prefix for the FASTA and Newick output files.
        ortho: Mapping genome -> container of the genes it holds (only
            iteration and membership are used here).
    """
    genes = sorted({g for gs in ortho.values() for g in gs})
    fasta_file = '{0}_content.fas'.format(prefix)
    with open(fasta_file, 'w') as fout:
        for genome, content in ortho.items():
            fout.write('>{0}\n{1}\n'.format(
                genome,
                ''.join('T' if g in content else 'A' for g in genes)))

    # Split only the configured executable string (it may carry its own
    # options). Splitting the fully formatted command line tore a prefix
    # containing whitespace into several arguments.
    command = externals['fasttree'].split() + ['-quiet', '-nt', fasta_file]
    tree = subprocess.Popen(command,
                            stdout=subprocess.PIPE,
                            universal_newlines=True).communicate()[0]

    tree = Tree(tree, format=1)
    for n in tree.traverse():
        # Rescale every branch length by the number of genes.
        n.dist *= len(genes)
    tree.write(outfile='{0}_content.nwk'.format(prefix), format=1)
    logger('Gene content tree is saved in {0}_content.nwk'.format(prefix))
Example #5
0
def writeMatrix(prefix, ortho):
    """Write pan-genome summary statistics and gene content matrices.

    Produces three files next to ``prefix``:

    * ``<prefix>_content.summary_statistics.txt`` - number of genes per
      prevalence class (strict core, core, soft core, shell, cloud, total).
    * ``<prefix>_content.Rtab`` - tab-separated matrix holding, per gene and
      genome, the number of entries recorded for that gene.
    * ``<prefix>_content.csv`` - comma-separated matrix holding the sorted
      entry keys joined with ``;``.

    Genes are listed by decreasing number of genomes, then by name.

    Args:
        prefix: Path prefix for the output files.
        ortho: Mapping genome -> {gene -> mapping of entries}.
    """
    genomes = sorted(ortho.keys())
    n_genomes = len(genomes)

    # Number of genomes in which each gene occurs.
    counts = defaultdict(int)
    for content in ortho.values():
        for gene in content:
            counts[gene] += 1
    presences = np.array(list(counts.values()))

    def band(lower, upper):
        # Genes found in at least `lower` and fewer than `upper` (both
        # fractions) of the genomes.
        return np.sum((upper * n_genomes > presences)
                      & (presences >= lower * n_genomes))

    with open('{0}_content.summary_statistics.txt'.format(prefix),
              'w') as fout:
        rows = (
            ('Strict core genes\t(strains = 100%)\t{0}\n',
             np.sum(presences >= n_genomes)),
            ('Core genes\t(99% <= strains < 100%)\t{0}\n', band(0.99, 1.)),
            ('Soft core genes\t(95% <= strains < 99%)\t{0}\n',
             band(0.95, 0.99)),
            ('Shell genes\t(15% <= strains < 95%)\t{0}\n', band(0.15, 0.95)),
            ('Cloud genes\t(0% <= strains < 15%)\t{0}\n', band(0.0, 0.15)),
            ('Total genes\t(0% <= strains <= 100%)\t{0}\n', presences.size),
        )
        for template, value in rows:
            fout.write(template.format(value))
    logger(
        'Summary of the pan-genome is saved in {0}_content.summary_statistics.txt'
        .format(prefix))

    # Most widespread genes first; ties are broken alphabetically.
    ordered = sorted(counts, key=lambda gene: (-counts[gene], gene))
    with open('{0}_content.Rtab'.format(prefix),
              'w') as fout, open('{0}_content.csv'.format(prefix),
                                 'w') as fout2:
        fout.write('\t'.join(['Gene'] + genomes) + '\n')
        fout2.write(','.join(['Gene'] + genomes) + '\n')
        for gene in ordered:
            copy_numbers = [gene]
            copy_names = [gene]
            for genome in genomes:
                copies = ortho[genome].get(gene, {})
                copy_numbers.append(str(len(copies)))
                copy_names.append(';'.join(sorted(copies.keys())))
            fout.write('\t'.join(copy_numbers) + '\n')
            fout2.write(','.join(copy_names) + '\n')
    logger('Gene content matrix is saved in {0}_content.csv'.format(prefix))
    logger('Gene presence matrix is saved in {0}_content.Rtab'.format(prefix))
Example #6
0
def writeCurve(prefix, groups, pseudogene=True, n_iter=1000):
    """Estimate pan- and core-genome accumulation curves and write them out.

    The genomes in ``groups`` are added one by one in ``n_iter`` random
    orders. After each addition the number of genes seen in at least one
    genome (pan) and in every genome so far (core) is recorded. The mean
    curves are fitted with ``func_powerlaw`` via ``curve_fit``. The fitted
    parameters, together with the median and 2.5% / 97.5% values over the
    permutations, are written to ``<prefix>_content.curve``.

    Args:
        prefix: Path prefix of the output file.
        groups: Mapping whose values are mappings keyed by gene; one value
            per genome.
        pseudogene: Only selects the wording used in the output ('genes' vs
            'CDSs') and in the log message; it does not change the
            computation.
        n_iter: Number of random genome orderings to evaluate.

    Returns:
        An integer array of zeros with shape ``(len(groups), 2, 5)``.
        NOTE(review): this array is allocated but never filled in this
        function - confirm whether callers expect real summary values.
    """
    gtype = ['CDSs', 'genes'][int(pseudogene)]
    # Give every distinct gene a dense integer index, in first-seen order.
    encode = {}
    for grp in groups.values():
        for g in grp.keys():
            if g not in encode:
                encode[g] = len(encode)
    # One array of gene indices per genome.
    mat = [set(encode[g] for g, c in grp.items()) for grp in groups.values()]
    mat = [np.array(list(m)) for m in mat]
    # 1-based number of genomes, used as the x axis of every fit.
    x = np.arange(len(mat)) + 1

    # curves[i, ite] = (pan size, core size) after the first i + 1 genomes of
    # random ordering number `ite`.
    curves = np.zeros([len(mat), n_iter, 2], dtype=int)
    for ite in np.arange(n_iter):
        # In-place shuffle using NumPy's global random state.
        np.random.shuffle(mat)
        # genes[k] = number of genomes added so far that contain gene k.
        genes = np.zeros(len(encode), dtype=int)
        for id, m in enumerate(mat):
            genes[m] += 1
            curves[id,
                   ite, :] = (np.sum(genes >= 1), np.sum(genes >= (id + 1)))
    # Mean pan/core sizes over all orderings; shape (n genomes, 2).
    y = np.mean(curves, 1)

    # Fit of the mean pan-genome size against the number of genomes. The
    # half-widths below are sqrt(diag(cov)) scaled by t.ppf(0.975, df)
    # (presumably scipy.stats.t - confirm against the file's imports).
    yopt1, ycov1 = curve_fit(func_powerlaw, x, y[:, 0], maxfev=3000)
    tval1 = t.ppf(1.0 - .05 / 2.0, max(0, y.shape[0] - 2))
    yci1 = np.diag(ycov1)**0.5 * tval1

    # Fit of the number of new genes contributed by each additional genome
    # (first difference of the mean pan-genome curve).
    yopt2, ycov2 = curve_fit(func_powerlaw,
                             x[1:],
                             y[1:, 0] - y[:-1, 0],
                             maxfev=3000)
    tval2 = t.ppf(1.0 - .05 / 2.0, max(0, y.shape[0] - 1 - 2))
    yci2 = np.diag(ycov2)**0.5 * tval2

    # Fit of the mean core-genome size against the number of genomes.
    yopt3, ycov3 = curve_fit(func_powerlaw, x, y[:, 1], maxfev=3000)
    tval3 = t.ppf(1.0 - .05 / 2.0, max(0, y.shape[0] - 2))
    yci3 = np.diag(ycov3)**0.5 * tval3

    with open('{0}_content.curve'.format(prefix), 'w') as fout:
        fout.write('#! No. genomes: {0}\n'.format(len(groups)))
        fout.write('#! Ave. {1} per genome: {0:.03f}\n'.format(
            np.mean([m.size for m in mat]), gtype))
        fout.write('#! No. pan {1}: {0}\n'.format(len(encode), gtype))
        # Core size once every genome is included; it does not depend on the
        # ordering, so the first iteration is used.
        fout.write('#! No. core {1}: {0}\n'.format(curves[-1, 0, 1], gtype))
        # The "per new genome" figures below extrapolate each fit to
        # len(groups) + 1 genomes, evaluating it as x ** opt[0] * opt[1].
        fout.write('#! Heaps\' law model in DOI: 10.1016/j.mib.2008.09.006:\tGamma={0:.03f} +/- {1:.03f}, Kappa={2:.03f} +/- {3:.03f}, ~{4:.03f} new genes per new genome.\n'.format(\
            yopt1[0], yci1[0], yopt1[1], yci1[1], (x[-1] + 1) ** yopt1[0] * yopt1[1] - x[-1] ** yopt1[0] * yopt1[1]))
        fout.write('#! Power law model in DOI: 10.1016/j.mib.2008.09.006:\tAlpha={0:.03f} +/- {1:.03f}, Kappa={2:.03f} +/- {3:.03f}, ~{4:.03f} new genes per new genome.\n'.format( \
            -yopt2[0], yci2[0], yopt2[1], yci2[1], (x[-1] + 1) ** yopt2[0] * yopt2[1]))
        fout.write('#! Power law model for the core genome:              \tAlpha={0:.03f} +/- {1:.03f}, Kappa={2:.03f} +/- {3:.03f}, ~{4:.03f} fewer core genes per new genome.\n'.format( \
            -yopt3[0], yci3[0], yopt3[1], yci3[1], -(x[-1] + 1) ** yopt3[0] * yopt3[1] + x[-1] ** yopt3[0] * yopt3[1]))
        fout.write(
            '#No. genome\t(Pan-genome)Median\t2.5%\t97.5%\t|\t(Core-genome)Median\t2.5%\t97.5%\n'
        )
        # NOTE(review): `summary` is returned but nothing below writes to it.
        summary = np.zeros([len(mat), 2, 5], dtype=int)
        for id, curve in enumerate(curves):
            # Sort the per-ordering values so the quantiles can be read off
            # by index.
            pan = np.sort(curve.T[0])
            core = np.sort(curve.T[1])
            # Stored as [2.5%, median, 97.5%].
            pan_s = [
                pan[int(pan.size * 0.025)], pan[int(pan.size * 0.5)],
                pan[int(pan.size * 0.975)]
            ]
            core_s = [
                core[int(core.size * 0.025)], core[int(core.size * 0.5)],
                core[int(core.size * 0.975)]
            ]
            # Written in the header's order: median, 2.5%, 97.5%.
            fout.write(
                '{0}\t{1[1]}\t{1[0]}\t{1[2]}\t|\t{2[1]}\t{2[0]}\t{2[2]}\n'.
                format(id + 1, pan_s, core_s))
    logger('Curves for {1} are saved in {0}_content.curve'.format(
        prefix, ['CDS', 'all genes'][int(pseudogene)]))
    return summary