Beispiel #1
0
def startjob(indir,resultdir,skip="",checkpoint=False,workflow=1,refdb="",cpu=1,concat=False,
             model="GTR",bs=0,kf=False,maxmlst=100,maxorg=50,filtMLST=True,refdir="",fast=False,minmlst=10):
    """Start or resume a job in resultdir and dispatch it to workflow 1 or 2.

    When checkpoint is falsy and resultdir already holds an "automlst.log",
    the last JOB_CHECKPOINT value found in that log becomes the checkpoint and
    the last WORKFLOW value replaces the workflow argument.

    bs is coerced to an integer and clamped to the range 0..1000; a value that
    cannot be converted is replaced by 0. The remaining arguments are passed
    through unchanged to startwf1 (workflow 1) or startwf2 (workflow 2).

    Returns whatever the selected workflow function returns, or False when
    workflow is neither 1 nor 2. Exceptions raised by the workflow are logged
    and re-raised.
    """
    #Setup working directory
    querydir = os.path.join(os.path.realpath(resultdir),"queryseqs") #query sequence folder
    if not os.path.exists(querydir):
        os.makedirs(querydir)

    #Read checkpoint from log if exists or use checkpoint parameter
    logpath = os.path.join(resultdir,"automlst.log")
    if not checkpoint and os.path.exists(logpath):
        with open(logpath,"r") as fil:
            for line in fil:
                x = line.strip().split("::")
                if len(x) < 2:
                    #No "::" separated value on this line, so there is nothing to restore from it
                    continue
                if "JOB_CHECKPOINT" in x[0]:
                    checkpoint = x[1]
                elif "WORKFLOW" in x[0]:
                    workflow = int(x[1])
    #Start log
    global log
    log = setlog.init(logpath,toconsole=True)

    #ensure bootstrap value is valid and no more than 1000
    try:
        bs = int(bs)
        if bs > 1000:
            bs = 1000
        elif bs < 0:
            bs = 0
    except (ValueError, TypeError):
        #TypeError covers values such as None that int() rejects outright
        log.warning("Invalid bootsrap value found, setting to 0")
        bs = 0

    log.debug("START / RESUME JOB last checkpoint: %s"%checkpoint)
    log.info('JOB_PARAMS::{"resultdir":"%s","skip":"%s","workflow":%s,"concat":"%s","model":"%s"}'%(resultdir,skip,workflow,concat,model))
    if workflow == 1:
        log.info("WORKFLOW::1")
        try:
            return startwf1(indir,resultdir,checkpoint=checkpoint,skip=skip,refdb=refdb,cpu=cpu,concat=concat,
                            model=model,bs=bs,kf=kf,maxmlst=maxmlst,maxorg=maxorg,filtMLST=filtMLST,fast=fast,minmlst=minmlst)
        except Exception as e:
            #Record the failure in the job log, then let the caller see the original exception
            log.error("Unexpected failure: %s"%e)
            raise
    elif workflow == 2:
        log.info("WORKFLOW::2")
        try:
            return startwf2(indir,resultdir,refdir=refdir,checkpoint=checkpoint,model=model,bs=bs,kf=kf,maxmlst=maxmlst,cpu=cpu,fast=fast)
        except Exception as e:
            log.error("Unexpected failure: %s"%e)
            raise
    else:
        log.error("Improper workflow specified: %s"%workflow)
        return False
Beispiel #2
0
#!/usr/bin/env python
import argparse, subprocess, glob, os, tempfile, json, setlog, pickle, base64
from numpy import median, mean, floor, round
from multiprocessing import cpu_count

# Module-level logger obtained from the project's setlog helper; the functions
# below report status through it. toconsole=True presumably echoes messages to
# the console -- confirm against setlog.init.
log = setlog.init(toconsole=True)


def mashdist(listfile, reffile, outputfile, cpu=1, maxdist=1.0):
    """Run ``mash dist`` and capture its standard output in a file.

    The command is ``mash dist [-p cpu] -d maxdist reffile -l listfile``; the
    ``-p`` option is only added when cpu is greater than 1. Output is written
    to ``outputfile + ".temp"``; this function does not rename that file.

    Returns True when mash exits with status 0 and False when it exits with a
    non-zero status. An OSError from launching the process (for example when
    the mash executable is missing) is not caught here.
    """
    cmd = ["mash", "dist"]
    if cpu > 1:
        cmd += ["-p", str(cpu)]
    cmd += ["-d", str(maxdist), reffile, "-l", listfile]
    with open(outputfile + ".temp", "w") as ofil:
        try:
            log.info(
                "MASH_STATUS:: Running MASH ANI estimation on all input genomes"
            )
            # check_call raises CalledProcessError on a non-zero exit status;
            # plain subprocess.call only returns the status, which left the
            # error branch below unreachable and reported failures as success.
            subprocess.check_call(cmd, stdout=ofil)
            log.debug("MASH_STATUS:: Finished MASH ANI estimation")
            return True
        except subprocess.CalledProcessError as e:
            log.error("MASH_ERROR:: Could not process %s - %s" % (listfile, e))
            return False


def writefilelist(indir, outdir):
    try:
        flist = glob.glob(os.path.join(os.path.realpath(indir), "*.fa"))
        if len(flist):
            tf = tempfile.NamedTemporaryFile(prefix="queryflist_",
                                             suffix=".txt",
Beispiel #3
0
# Lab of Nadine Ziemert, Div. of Microbiology/Biotechnology
# Funding by the German Centre for Infection Research (DZIF)
#
# This file is part of ARTS
# ARTS is free software. you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version
#
# License: You should have received a copy of the GNU General Public License v3 with ARTS
# A copy of the GPLv3 can also be found at: <http://www.gnu.org/licenses/>.
#

import argparse, os, setlog, numpy as np, json, sqlite3 as sql

# Module-level logger obtained from the project's setlog helper, requested
# with level "info". toconsole=True presumably echoes messages to the
# console -- confirm against setlog.init.
log = setlog.init(toconsole=True, level="info")


def getsingles(genemat, pct2=1.0):
    """Get rows with only single counts.

    genemat maps a gene name to a row of numeric counts; all rows must have
    the same length. A gene is reported when the number of columns whose
    count is greater than 0 and less than 2 is at least ``pct2`` times the
    row length.

    Returns a tuple ``(genes, counts)``: the list of reported gene names in
    the mapping's iteration order, and a numpy array holding the
    single-count total of every row. An empty mapping yields ``([], empty
    array)``.
    """
    if not genemat:
        # np.vstack cannot stack zero rows; report "nothing found" instead
        return [], np.zeros(0, dtype=int)
    # Materialise the dict views as lists: on Python 3 np.vstack rejects a
    # values() view and np.array(keys()) produces an unusable 0-d array.
    genenames = np.array(list(genemat.keys()))
    gmat = np.vstack(list(genemat.values()))
    #Store only values of 1 in matrix
    gmat = ((gmat < 2) * (gmat > 0)).astype(int)
    thresh = gmat.shape[1] * pct2
    counts = gmat.sum(axis=1)
    #get index of ubiquitous singles and send gene list
    inds = np.flatnonzero(counts >= thresh)
    return list(genenames[inds]), counts

Beispiel #4
0
def findsingles(infil,
                minnum=7,
                minorg=0.8,
                maxgenes=30,
                dnds="",
                outdir="",
                indir=".",
                log=None):
    """Pick single copy genes from a gene count table, dropping organisms if needed.

    infil is a whitespace-separated table. The line starting with "#Gene"
    names the organisms from its 6th field onward; every other line that does
    not start with "#" holds a gene name in its 1st field and one count per
    organism from its 6th field onward.

    Genes with a count of exactly 1 in at least 95% of all organisms are
    candidates. Organisms are removed step by step until at least minnum
    candidates have a count of 1 in every remaining organism, or until the
    share of remaining organisms would fall below minorg.

    dnds: optional path to a JSON object mapping gene name to a value; when
        given, selected genes are ordered by ascending value (genes without
        a value sort as 9).
    outdir: when set and the search succeeds, "<gene>.fna" is read from indir
        for each of the first maxgenes genes and rewritten to outdir as
        "<gene>.fna" and a translated "<gene>.faa", skipping records whose id
        contains the name of a removed organism.
    log: logger to report through; a console logger is created via setlog
        when omitted.

    Returns (removed organisms, first maxgenes genes, value of the last of
    those genes in the dnds table or "N/A") on success, and
    (False, False, False) on failure.
    """
    if log is None:
        log = setlog.init(toconsole=True)
    orgs = []
    hks = []
    gmat = []
    njvals = {}
    #Reported when no dN/dS table is supplied or no gene is selected
    mdnds = "N/A"
    if dnds and os.path.isfile(dnds):
        with open(dnds, "r") as fil:
            njvals = json.load(fil)
    if os.path.isfile(infil):
        with open(infil, "r") as fil:
            for line in fil:
                if line.startswith("#Gene"):
                    orgs.extend(line.strip().split()[5:])
                elif not line.startswith("#"):
                    x = line.strip().split()
                    if not x:
                        #Blank line, nothing to parse
                        continue
                    hks.append(x[0])
                    gmat.append([int(float(v)) for v in x[5:]])
    if not orgs or not gmat:
        #np.vstack and the ratio computations below need at least one gene and one organism
        log.error("Failed: No organisms or gene counts could be read from %s" %
                  infil)
        return False, False, False
    gmat = np.vstack(gmat)
    numorgs = len(orgs)
    filtinds = [
        i for i, x in enumerate(gmat)
        if float(list(x).count(1)) / numorgs >= 0.95
    ]
    hkinds = []
    orginds = list(range(numorgs))
    #Remove organisms with missing/duplicate genes until minimum single copy genes are found
    while len(hkinds) < minnum and float(len(orginds)) / numorgs >= minorg:
        hkinds = [
            i for i in filtinds
            if float(list(gmat[i][orginds]).count(1)) == len(orginds)
        ]
        if len(hkinds) >= minnum:
            break
        tophks = sorted([[x, i, list(x).count(1)]
                         for i, x in enumerate(gmat) if i not in hkinds],
                        key=lambda row: row[2],
                        reverse=True)
        if not tophks:
            #Every gene is already selected, removing organisms cannot add more
            break
        maxcount = tophks[0][2]
        tempmat = np.vstack([x[0] for x in tophks if x[2] == maxcount])
        orgcounts = sorted([[list(tempmat[:, j]).count(1), j]
                            for j in orginds],
                           key=lambda row: row[0])
        orginds = [x[1] for x in orgcounts[numorgs - maxcount:]]

    keptorgs = set(orginds)
    selected = set(hkinds)
    remorg = [x for i, x in enumerate(orgs) if i not in keptorgs]
    singhks = [x for i, x in enumerate(hks) if i in selected]
    if njvals:
        singhks = sorted(singhks, key=lambda k: njvals.get(k, 9))
        topgenes = singhks[:maxgenes]
        if topgenes:
            mdnds = njvals.get(topgenes[-1], "N/A")
        log.info("Lowest %s single copy genes found with dNdS < %s" %
                 (min(maxgenes, len(singhks)), mdnds))
    if len(hkinds) >= minnum:
        log.info("# of single copy genes found: %s\t# removed organisms: %s" %
                 (len(singhks), len(remorg)))
        log.info("Removed orgs:%s\tHKs:%s" % (remorg, singhks[:maxgenes]))
        if outdir:
            if not os.path.exists(outdir):
                os.makedirs(outdir)
            for hkgn in singhks[:maxgenes]:
                #Copy all single copy sequences to outdir after removing organisms
                seqrecs = SeqIO.parse(os.path.join(indir, hkgn + ".fna"),
                                      "fasta")
                with open(os.path.join(outdir, hkgn + ".fna"),
                          "w") as fnafil, open(
                              os.path.join(outdir, hkgn + ".faa"),
                              "w") as faafil:
                    for rec in seqrecs:
                        if not any(x in rec.id for x in remorg):
                            fnafil.write(">%s\n%s\n" % (rec.id, rec.seq))
                            faafil.write(
                                ">%s\n%s\n" %
                                (rec.id, rec.seq.translate(to_stop=True)))

        return remorg, singhks[:maxgenes], mdnds
    else:
        log.error(
            "Failed: Only %s single copy genes found after removing orgs: %s" %
            (len(singhks), remorg))
        log.info("Try different set or allow more deletions")
        return False, False, False
Beispiel #5
0
def findsingles(db,
                minnum=7,
                minorg=0.8,
                maxgenes=30,
                dnds="",
                outdir="",
                keepgenes="",
                lf=""):
    """Pick single copy core genes from db, dropping organisms if needed.

    The 16S sequences and the core gene matrix are loaded through get16S and
    getcoregenes. Genes with a count of exactly 1 in at least 95% of all
    organisms are candidates, together with every gene named in keepgenes.
    Starting from the organisms that have a count of 1 for all keepgenes,
    organisms are removed step by step until at least minnum candidates
    (including all keepgenes) have a count of 1 in every remaining organism,
    or until the share of remaining organisms falls below minorg.

    dnds: optional path to a JSON object mapping gene name to a value used to
        order the selected genes ascending (genes without a value sort as 9).
    outdir: when set and the search succeeds, one "<gene>.fna" and one
        "<gene>.faa" file is written per gene for the first maxgenes genes,
        using entries [0][1] and [0][2] of each organism's record in the
        sequence dictionary (presumably nucleotide and amino acid sequence --
        confirm against getcoregenes).
    keepgenes: comma-separated gene names that must be part of the result;
        they are moved to the front of the gene list.
    lf: log file path; when empty the module logger writes to the console.

    Returns (removed organisms, first maxgenes genes) on success and
    (False, False) on failure.
    """
    global log
    if lf:
        log = setlog.init(logfile=lf, level="info")
    else:
        log = setlog.init(toconsole=True, level="info")

    log.info("Getting 16S genes...")
    seqdict = get16S(db)
    log.info("Done. Getting Core genes...")
    orgs, hks, gmat, seqdict = getcoregenes(db, seqdict=seqdict)

    if len(keepgenes):
        keepgenes = keepgenes.split(",")
    else:
        keepgenes = []

    njvals = {}
    if dnds and os.path.isfile(dnds):
        with open(dnds, "r") as fil:
            njvals = json.load(fil)
    numorgs = len(orgs)
    filtinds = [
        i for i, x in enumerate(gmat)
        if float(list(x).count(1)) / numorgs >= 0.95
    ]
    filtinds.extend([i for i, x in enumerate(hks) if x in keepgenes])
    filtinds = list(set(filtinds))
    hkinds = []
    #Start with organisms that have all keepgenes
    orginds = [
        i for i in range(numorgs)
        if all(gmat[hks.index(x), i] == 1 for x in keepgenes)
    ]
    hksfound = []

    #Remove organisms with missing/duplicate genes until minimum single copy genes are found
    while float(len(orginds)) / numorgs >= minorg:
        hkinds = [
            i for i in filtinds
            if float(list(gmat[i][orginds]).count(1)) == len(orginds)
        ]
        hksfound = [hks[i] for i in hkinds]
        if len(hkinds) >= minnum and all(x in hksfound for x in keepgenes):
            log.info("Found minimum genes")
            break
        tophks = sorted([[x, i, list(x).count(1)]
                         for i, x in enumerate(gmat) if i not in hkinds],
                        key=lambda row: row[2],
                        reverse=True)
        if not tophks:
            #Every gene is already selected, removing organisms cannot add more
            break
        maxcount = tophks[0][2]
        tempmat = np.vstack([x[0] for x in tophks if x[2] == maxcount])
        orgcounts = sorted([[list(tempmat[:, j]).count(1), j]
                            for j in orginds],
                           key=lambda row: row[0])
        orginds = [x[1] for x in orgcounts[numorgs - maxcount:]]

    remorg = [x for i, x in enumerate(orgs) if i not in orginds]
    singhks = [x for i, x in enumerate(hks) if i in hkinds]
    #Prioritize by lowest dNdS values
    singhks = sorted(singhks, key=lambda k: njvals.get(k, 9))
    #Move keep genes at the top of list
    for x in keepgenes:
        if x in singhks:
            singhks.insert(0, singhks.pop(singhks.index(x)))

    if len(hkinds) >= minnum and all(x in hksfound for x in keepgenes):
        log.info("# of single copy genes found: %s\t# removed organisms: %s" %
                 (len(singhks), len(remorg)))
        log.info("Removed orgs:%s" % remorg)
        for i, x in enumerate(singhks):
            if i < maxgenes:
                log.info("Top Singles: %s\tdNdS=%s" % (x, njvals.get(x, "NA")))
            else:
                log.info("Other Singles: %s\tdNdS=%s" %
                         (x, njvals.get(x, "NA")))
        #WRITE GENES TO FASTA
        if outdir:
            if not os.path.exists(outdir):
                os.makedirs(outdir)
            for hkgn in singhks[:maxgenes]:
                allnucrecs = [
                    ">%s\n%s" % (org, x[hkgn][0][1])
                    for org, x in seqdict.items()
                    if hkgn in x.keys() and x[hkgn][0][1] and org not in remorg
                ]
                allaarecs = [
                    ">%s\n%s" % (org, x[hkgn][0][2])
                    for org, x in seqdict.items()
                    if hkgn in x.keys() and x[hkgn][0][2] and org not in remorg
                ]
                with open(os.path.join(outdir, hkgn + ".fna"),
                          "w") as fnafil, open(
                              os.path.join(outdir, hkgn + ".faa"),
                              "w") as faafil:
                    if len(allnucrecs):
                        fnafil.write("\n".join(allnucrecs) + "\n")
                    #Guard the protein file on its own record list, not the nucleotide one
                    if len(allaarecs):
                        faafil.write("\n".join(allaarecs) + "\n")
        return remorg, singhks[:maxgenes]
    else:
        log.error(
            "Failed: Only %s single copy genes found after removing orgs: %s\nKeepgenes found: %s"
            % (len(singhks), remorg, set(singhks) & set(keepgenes)))
        log.info("Try different set or allow more deletions")
        return False, False