Exemple #1
0
    def runPlacedHMM(self, hmmfile, proteinfaa, bedfile):
        """
        Search the proteins against the given HMMs and post-process the hits.

        All files are written below ``<outdir>/workfiles/hmmer/estimations``.

        Args:
            hmmfile: path to the HMM file handed to hmmsearch.
            proteinfaa: path to the protein fasta that is searched.
            bedfile: bed file with protein locations, passed on to the
                cleaning step.

        Returns:
            The hits table: the value returned by the cleaning step if the
            search was (re)run, otherwise the expected path of ``hits.tsv``.
        """
        cfg = self.cfg

        # output locations
        workdir = os.path.join(cfg["outdir"], "workfiles", "hmmer",
                               "estimations")
        file.isdir(workdir)
        table_path = os.path.join(workdir, "placement.tsv")
        stdout_path = os.path.join(workdir, "placement.out")
        hits_path = os.path.join(workdir, "hits.tsv")

        searcher = hmmer(
            "hmmsearch",
            proteinfaa,
            table_path,
            cfg["debug"],
            touch=cfg["touch"],
        )

        # rerun if the rule says so, if placement is forced, or if the
        # HMM file is newer than the existing search result
        rerun = (searcher.doIneedTorun(cfg["force"])
                 or cfg["fplace"]
                 or file.isnewer(hmmfile, table_path))
        if not rerun:
            return hits_path

        logging.info("Running hmmer for chosen locations")
        searcher.run(
            stdout_path,
            hmmfiles=hmmfile,
            modus=cfg["dbinfo"]["modus"],
            evalue=cfg["evalue"],
            cores=cfg["ncores"],
            training=cfg["training"],
        )

        # clean hmmer output
        logging.info("Processing Hmmer results")
        return searcher.clean(table_path, bedfile, hits_path, cfg["mindist"])
Exemple #2
0
    def gmes(self, fasta):
        """
        Predict proteins using GeneMark-ES.

        Works below ``<outdir>/workfiles/gmes``. If the expected GTF file is
        missing after the prediction step, the failure is logged, the output
        file is written and the process exits with status 1.

        Args:
            fasta: path to the nucleotide fasta of the bin.

        Returns:
            Tuple ``(gmesOut, bedf)``: path of the predicted protein fasta
            and the bed file with the protein locations.
        """
        logging.debug("Starting gmes function")

        gmesDir = os.path.join(self.cfg["outdir"], "workfiles", "gmes")
        file.isdir(gmesDir)
        gmesOut = os.path.join(gmesDir, "prot_seq.faa")
        gtffile = os.path.join(gmesDir, "genemark.gtf")
        inputfasta = os.path.abspath(os.path.join(gmesDir, "input.fna"))

        # GeneMark-ES
        g = gmes("runGMES", fasta, [gtffile, gmesOut], touch=self.cfg["touch"])
        logging.debug("Defined gmes run")
        if g.doIneedTorun(self.cfg["force"]):
            # rename fasta entries, so we don't have white spaces in them;
            # in touch mode only the expected path is set, nothing is written
            if not self.cfg["touch"]:
                g.input = base.clearFastaNames(fasta, inputfasta)
            else:
                g.input = inputfasta

            logging.info("Running GeneMark-ES")
            g.run(cores=self.cfg["ncores"])
        else:
            logging.debug("I do not need to run gmes, output exists:")
            logging.debug(gtffile)

        # always check if gtffile exists, if not GeneMark-ES failed and
        # we can stop here
        if not file.exists(gtffile):
            # log and document failing, then stop the pipeline
            logging.error("GeneMark-ES failed on this bin")
            self.write_outfile()
            exit(1)
        elif self.cfg["clean"]:
            # clean temp dirs
            _tmpdirs = ["data", "run", "info", "output/data", "output/gmhmm"]
            tempdirs = [os.path.join(gmesDir, x) for x in _tmpdirs]
            g.cleanup(tempdirs)

        # make a bed file from GTF
        bedf = os.path.join(gmesDir, "proteins.bed")
        # Explicit parentheses: `and` binds tighter than `or`, so without
        # them a forced run would extract locations even in touch mode,
        # where the GTF is only a touched placeholder.
        if (self.cfg["force"]
                or file.isnewer(gtffile, bedf)) and not self.cfg["touch"]:
            logging.info("Extracting protein locations")
            bedf = base.gmesBED(gtffile, bedf)

        # touch files expected for next step
        if self.cfg["touch"]:
            g.touch([bedf, gmesOut])
        # remember where the fasta with cleaned names lives
        self._clean_fasta = inputfasta
        return (gmesOut, bedf)
Exemple #3
0
    def concatHMM(self):
        """
        Build (or reuse) one concatenated, pressed HMM file for all profiles
        of the chosen placements.

        The profile names are read from the ``<db>/sets/<node>.set`` files of
        every placement of the configured placement method. A sha256 hash of
        the sorted profile names is stored next to the concatenated file, so
        a later run with the same profiles can reuse the result.

        In touch mode the output file is only touched.

        Returns:
            Path of the concatenated HMM file.

        Exits with status 1 if no profile was found.
        """
        # create a dir for this
        hmmdir = os.path.join(self.cfg["outdir"], "workfiles", "hmmer", "estimations")
        file.isdir(hmmdir)
        hmmconcat = os.path.join(hmmdir, "all.hmm")

        if self.cfg["touch"]:
            file.touch(hmmconcat)
            return hmmconcat

        # collect the unique profile names of all placements
        profiles = set()
        for p in self.placements[self.cfg["placementMethod"]]:
            localpath = os.path.join(self.cfg["db"], "sets", "{}.set".format(p["node"]))
            with open(localpath) as f:
                # skip blank lines, they would turn into a bogus ".hmm" path
                profiles.update(name for name in map(str.strip, f) if name)
        # sorted list, so the hash below is reproducible
        profiles = sorted(profiles)

        if not profiles:
            logging.error("We have no profiles to evaluate")
            exit(1)

        # create all paths for all hmms
        hmmerpaths = [os.path.join(self.cfg["db"], "hmms", "panther", "{}.hmm".format(profile)) for profile in profiles]

        # check if we already have the hmm for exactly this set of profiles
        profilehash = hashlib.sha256("_".join(profiles).encode()).hexdigest()
        hashpath = os.path.join(hmmdir, "all.hash")
        prevhash = None
        if file.exists(hashpath):
            with open(hashpath) as f:
                # an empty hash file yields "", which never matches
                prevhash = f.readline().strip()

        # only reuse the previous result if the concatenated file is still there
        if prevhash == profilehash and file.exists(hmmconcat):
            log("Using pressed hmms from last run")
            return hmmconcat

        # concatenate
        log("{} hmm profiles need to be used for estimations".format(len(profiles)))
        log("Concatenating hmms, this might take a while (IO limited)")
        hmmconcat = base.concatenate(hmmconcat, hmmerpaths)
        # press
        log("Pressing hmms")
        hp = hmmpress("hmmpress", hmmconcat, None, touch=self.cfg["touch"])
        hp.run()

        # save profile hash
        with open(hashpath, "w") as f:
            f.write(profilehash)

        return hmmconcat
Exemple #4
0
    def __init__(self, program, inf, outf, debug=False, touch=False):
        """
        Set up a run of an external program.

        Args:
            program: name of the executable; a message is printed if it
                cannot be found via ``run.which``.
            inf: input of the run.
            outf: a single output, a list of outputs, or None. For a list
                the first entry becomes ``self.output`` while the whole
                list is kept in ``self.output_test``.
            debug: stored as ``self.debug``.
            touch: stored as ``self.touchonly``.
        """
        # warn if the software is not in the path
        if run.which(program) is None:
            print("{} is not installed".format(program))

        self.debug = debug
        self.touchonly = touch
        self.program = program
        self.input = inf

        # several outputs may be defined: the first one is the main output,
        # all of them are kept for testing whether the rule has to be run
        several = isinstance(outf, list)
        self.output = outf[0] if several else outf
        self.output_test = outf if several else [outf]

        if outf is None:
            return
        # create output dir
        file.isdir(os.path.dirname(self.output))
Exemple #5
0
 def checkIO(self, fastapath, outdir):
     """
     Placeholder for the input/output accessibility check.

     Only the configured output directory (``self.cfg["outdir"]``) is
     handled; both parameters are currently ignored and the method
     always returns False.
     """
     target = self.cfg["outdir"]
     # create outdir if it does not exist
     file.isdir(target)
     # the actual access checks are still missing
     logging.debug("Warning: IO check not yet implemented")
     return False
Exemple #6
0
    def place(self, fasta, bedfile):
        """
        Main function to place a bin in the tree.

        Steps, each skipped if its output is up to date (unless forced):
          1. hmmsearch of the proteins against the placement HMMs,
          2. alignment preparation and pplacer,
          3. reduction of the jplace file to sufficiently likely placements,
          4. tree extraction via guppy,
          5. derivation of the LCA and HPA placements.

        The result is stored in ``self.placements`` (one entry per method);
        paths of the tree and the reduced jplace file are stored in
        ``self.cfg["togtreepath"]`` and ``self.cfg["togjson"]``.

        Args:
            fasta: path to the protein fasta.
            bedfile: path to the bed file with the protein locations.

        Exits with status 1 if an input is missing, no marker gene was
        found or pplacer could not finish.
        """
        # placement strategies; shared by the real and the touch branch so
        # both always provide the same keys
        methods = ("LCA", "HPA")

        # test if we can open the input files first
        if not base.exists(fasta):
            logging.error("Could not open fasta file")
            self.write_outfile()
            exit(1)
        if not base.exists(bedfile):
            logging.error("Could not open bed file")
            self.write_outfile()
            exit(1)

        # define output files
        hmmDir = os.path.join(self.cfg["outdir"], "workfiles", "hmmer")
        file.isdir(hmmDir)
        hmmOut = os.path.join(hmmDir, "placement.tsv")
        hmmOus = os.path.join(hmmDir, "placement.out")
        hitOut = os.path.join(hmmDir, "hits.tsv")

        # run hmmer if forced or input newer than output
        h = hmmer("hmmsearch", fasta, hmmOut, touch=self.cfg["touch"])
        if h.doIneedTorun(self.cfg["force"]) or self.cfg["fplace"]:
            logging.info("Searching for proteins to place in the tree")
            h.run(
                hmmOus,
                hmmfiles=self.config.placementHMMs,
                modus=self.cfg["dbinfo"]["modus"],
                evalue=self.cfg["evalue"],
                cores=self.cfg["ncores"],
            )
            # clean hmmer output
            logging.info("Processing Hmmer results")
            hitOut = h.clean(hmmOut, bedfile, hitOut, self.cfg["mindist"])
            self.updateStep("findprots", "looked for proteins")

        # pplacer paths
        placerDir = os.path.join(self.cfg["outdir"], "workfiles", "pplacer")
        placerDirTmp = os.path.join(placerDir, "tmp")
        pplaceAlinment = os.path.join(placerDir, "horizontalAlignment.fasta")
        pplaceOut = os.path.join(placerDir, "placement.jplace")
        pplaceLog = os.path.join(placerDir, "placement.log")
        pplaceOutReduced = os.path.join(placerDir, "placementReduced.jplace")
        file.isdir(placerDirTmp)

        # pplacer
        logging.debug("Preparing pplacer")
        pp = pplacer("pplacer", fasta, pplaceOut, touch=self.cfg["touch"])
        if pp.doIneedTorun(self.cfg["force"]) or self.cfg["fplace"]:
            logging.debug("Preparing alignments")
            pp.prepareAlignment(
                pplaceAlinment,
                hitOut,
                os.path.join(self.cfg["db"], "profile.list"),
                fasta,
                self.config,
                self.cfg,
                placerDirTmp,
            )
            # without marker genes there is nothing to place
            # (not an error in touch mode, where nothing was searched)
            if pp.lenscmgs == 0 and not self.cfg["touch"]:
                logging.error("Could not find any marker genes")
                self.write_outfile()
                exit(1)
            else:
                logging.info("Placing proteins in tree")
                self.updateStep("pplacer", "starting")
                pplacer_success = pp.run(
                    os.path.join(self.cfg["db"], "refpkg", "concat.refpkg"),
                    logfile=pplaceLog,
                    cores=self.cfg["ncorespplacer"],
                )
                if pplacer_success is False:
                    logging.warning("Pplacer could not finish. Exiting now")
                    self.write_outfile()
                    exit(1)

        # reduce placements to the placements with at least posterior of p
        logging.debug("Reducing placements")
        if not self.cfg["touch"]:
            pplaceOutReduced = pp.reduceJplace(
                pplaceOut, pplaceOutReduced,
                self.cfg["minPlacementLikelyhood"])
        else:
            pp.touch([pplaceOutReduced])
        logging.debug("Reducing placements done")

        # run TOG to get a tree
        togTree = os.path.join(placerDir, "placement.tree")
        tg = tog("guppy", pplaceOutReduced, togTree, touch=self.cfg["touch"])
        if tg.doIneedTorun(self.cfg["force"]):
            logging.debug("Fetching tree")
            r = tg.run()
            if r is False:
                # NOTE(review): execution continues after a failed tree
                # extraction; confirm whether the pipeline should stop here
                logging.debug("No placement found")
                self.write_outfile()

        logging.debug("Getting best placements")
        # save path to togtree for plotting later
        self.cfg["togtreepath"] = togTree
        self.cfg["togjson"] = pplaceOutReduced

        # now we can place the bin using the tree
        if not self.cfg["touch"]:
            t = treelineage.treeHandler(togTree, annotate=False)
            t2 = treelineage.treeHandler(self.config.tree, annotate=False)
            sets = self.getSets()
            # get HPA and LCA placements
            self.placements = {}
            for method in methods:
                self.placements[method] = t.getPlacement(
                    method,
                    sets,
                    t2,
                    self.cfg["nPlacements"],
                    self.cfg["minSupport"],
                    maximum=self.cfg["nEvals"],
                    debug=self.cfg["debug"],
                )
        else:
            # placeholders under the same keys as a real run, so a lookup
            # by placement method also works in touch mode
            self.placements = {method: "touch" for method in methods}

        logging.info("MAG succesfully placed in tree")
Exemple #7
0
                    default=False,
                    help='silcence most output')
parser.add_argument('--debug',
                    '-d',
                    action='store_true',
                    default=False,
                    help='debug and thus ignore safety')
args = parser.parse_args()

###############################################
# starting the analysis
log("Running eukcc for {} bin{}".format(len(args.fasta),
                                        "s" if len(args.fasta) > 1 else ""))

# create output if not exists; a falsy result is treated as failure,
# so leave with a non-zero status instead of signalling success
if not file.isdir(args.outdir):
    exit(1)

# a bed file implies that a protein fasta was passed;
# in that case the path must not be expanded via glob
args.isprotein = args.bed is not None
if args.isprotein:
    args.noglob = True

# check if we can expand glob:
if len(args.fasta) == 1 and not args.noglob:
    log("Expanding paths using glob", not args.quiet)
    args.fasta = glob.glob(args.fasta[0])
Exemple #8
0
def main():
    """
    Command line entry point of EukCC.

    Parses the arguments, configures logging and drives the workflow:
    gene prediction (GeneMark-ES or pygmes, skipped for protein input),
    placement in the reference tree, HMM search for the placed sets,
    lineage inference, the completeness/contamination estimate and,
    if requested, plotting.

    The process exits early with status 0 after the gene prediction step
    (``--gmes``) or after the custom HMM search (``--training``/``--hmm``),
    and with status 1 if ``--pygmes`` is used without ``--diamond``.
    """
    # set arguments
    # arguments are passed to classes
    parser = configargparse.ArgumentParser(
        description="Evaluate completeness and contamination of a MAG.")
    parser.add_argument("fasta",
                        type=str,
                        help="Run script on this bin (fasta file)")
    parser.add_argument("--db",
                        type=str,
                        required=True,
                        help="Path to EukCC DB")
    parser.add_argument(
        "--outdir",
        "-o",
        type=str,
        default="./",
        help="Location for the output. "
        "Names will be prefixed using the bin filenames",
    )
    parser.add_argument(
        "--config",
        "-c",
        type=str,
        required=False,
        is_config_file=True,
        help="Config file to define parameters, YAML",
    )
    parser.add_argument(
        "--ncores",
        "-n",
        metavar="int",
        type=int,
        default=1,
        help="set number of cores for GeneMark-ES, pplacer and Hmmer",
    )
    parser.add_argument(
        "--ncorespplacer",
        metavar="int",
        type=int,
        default=0,
        help="Pplacer requires a lot of memory. If you want "
        "you can set less cores for pplacer, "
        "which improves memory consumption significantly",
    )
    parser.add_argument(
        "--hmm",
        dest="hmm",
        type=str,
        default=None,
        help="run hmmer on all these HMMs instead",
    )
    parser.add_argument(
        "--training",
        dest="training",
        action="store_true",
        default=False,
        help="Run EukCC in training mode "
        "(needed to create a new release of the DB)",
    )
    parser.add_argument("--proteins",
                        default=False,
                        action="store_true",
                        dest="proteins",
                        help="Input fasta is proteins")
    parser.add_argument(
        "--bed",
        "-b",
        metavar="file.bed",
        type=str,
        default=None,
        help="You can pass a bedfile of the protein location to omit "
        "fragmented proteins being detected twice",
    )
    parser.add_argument(
        "--force",
        "-f",
        dest="force",
        action="store_true",
        default=False,
        help="Force rerun of computation even if "
        "output is newer than input. Don't resume previous run.",
    )
    parser.add_argument(
        "--keeptemp",
        dest="clean",
        action="store_false",
        default=True,
        help="Keep all temporary files, "
        "by default EukCC will remove some temp files",
    )
    parser.add_argument(
        "--fplace",
        "-p",
        dest="fplace",
        action="store_true",
        default=False,
        help="Force rerun of placement and subsequent steps",
    )
    parser.add_argument(
        "--noglob",
        "-g",
        dest="noglob",
        action="store_true",
        default=False,
        help="Do not expand paths using glob",
    )
    parser.add_argument(
        "--quiet",
        "-q",
        dest="quiet",
        action="store_true",
        default=False,
        help="Silence most output",
    )
    parser.add_argument(
        "--debug",
        "-d",
        action="store_true",
        default=False,
        help="Debug and thus ignore safety",
    )
    parser.add_argument(
        "--HPA",
        default=False,
        action="store_true",
        help="Set placement method to HPA",
    )
    parser.add_argument(
        "--nPlacements",
        type=int,
        default=2,
        metavar="n",
        help="Set number of proteins to support location "
        "in tree (default: 2)",
    )
    parser.add_argument(
        "--minGenomes",
        type=int,
        default=3,
        metavar="n",
        help="Minimal number of genomes to support a set (default: 3)",
    )
    parser.add_argument(
        "--fullineage",
        default=False,
        action="store_true",
        help="Output full lineage for MAGs",
    )
    # the option name keeps its historical spelling for compatibility
    parser.add_argument(
        "--minPlacementLikelyhood",
        default=0.4,
        type=float,
        metavar="float",
        help="minimal pplacer likelihood (default: 0.4)",
    )
    parser.add_argument(
        "--mindist",
        type=int,
        default=2000,
        metavar="n",
        help="Distance to collapse hits (default: 2000)",
    )
    parser.add_argument(
        "--touch",
        default=False,
        action="store_true",
        help="Do not run, but touch all output files",
    )
    parser.add_argument(
        "--gmes",
        default=False,
        action="store_true",
        help="only run GeneMark-ES",
    )
    parser.add_argument(
        "--pygmes",
        default=False,
        action="store_true",
        help="Use pygmes, will improve eukccs capability of running on "
        "highly fragmented bins but will take longer",
    )
    parser.add_argument("--diamond",
                        default=None,
                        type=str,
                        help="required to use pygmes option")
    parser.add_argument("--plot",
                        default=False,
                        action="store_true",
                        help="produce plots")
    parser.add_argument("-v",
                        "--version",
                        action="version",
                        version=f"EukCC version {version.__version__}")
    options = parser.parse_args()

    # define logging; --quiet wins over --debug
    logLevel = logging.INFO
    if options.quiet:
        logLevel = logging.WARNING
    elif options.debug:
        logLevel = logging.DEBUG
    logging.basicConfig(
        format="%(asctime)s %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S: ",
        level=logLevel,
    )

    # for pygmes we need a diamond DB
    if options.pygmes and options.diamond is None:
        logging.error(
            "For pygmes you need to provide a diamond database with taxonomic information"
        )
        exit(1)

    logging.debug("Launching EukCC in debug mode")
    logging.info("Starting EukCC")

    # Now we start the run with EukCC
    # All magic numbers should be defined in info.py if they are not
    # part of the configuration options
    m = workflow.eukcc(options)

    # skip gene prediction if the input already is protein sequences
    # (either flagged via --proteins or implied by a passed bed file)
    predict_genes = options.bed is None and not options.proteins
    if predict_genes and not options.pygmes:
        # run gmes
        proteinfaa, bedfile = m.gmes(options.fasta)
    elif predict_genes:
        proteinfaa, bedfile = m.pygmes(options.fasta, options.diamond)
    else:
        proteinfaa = options.fasta
        if options.bed is None:
            # create bed file
            bedpath = os.path.join(options.outdir, "workfiles",
                                   "proteins_tmp.bed")
            file.isdir(os.path.join(options.outdir, "workfiles"))

            bedfile = faabed(proteinfaa, bedpath)
        else:
            bedfile = options.bed

    # terminate if only gmes step was to be run
    if m.cfg["gmes"]:
        logging.info("Finished running GeneMark-ES")
        logging.info("Terminating as requested")
        exit(0)

    # run hmm file if we are asked to
    # this is needed for training
    if m.cfg["training"] or m.cfg["hmm"]:
        logging.info("Running on custom hmm for training mode")
        m.runPlacedHMM(m.cfg["hmm"], proteinfaa, bedfile)
        logging.info("Stopping now as we are only doing training")
        exit(0)

    # place using pplacer and hmmer
    m.place(proteinfaa, bedfile)

    # concat hmms for hmmer
    hmmfile = m.concatHMM()
    # run Hmmer for sets of placement
    hits = m.runPlacedHMM(hmmfile, proteinfaa, bedfile)
    # infer lineage
    placements = m.placements[m.cfg["placementMethod"]]
    _ = m.inferLineage(placements)

    # estimate completeness and contamination
    outputfile = os.path.join(m.cfg["outdir"], "eukcc.tsv")
    m.estimate(hits, outputfile, placements)

    if m.cfg["plot"]:
        _ = m.plot()