Example #1
def eukcc_folder(args):
    state = eukcc_state(workdir=os.path.join(args.out, "refine_workdir"),
                        options=vars(args))
    file.isdir(state["workdir"])

    if state["db"] is None:
        if os.environ.get("EUKCC2_DB") is not None:
            state["db"] = os.environ.get("EUKCC2_DB")
            logging.debug(
                "Defined db via env variable EUKCC2_DB as '{}'".format(
                    state["db"]))
        else:
            logging.error(
                "No database was provided via --db or EUKCC2_DB env variable")
            exit(202)

    logging.info("EukCC version {}".format(version.__version__))

    # discover all bins
    logging.debug("Loading all bins")
    state["input_bins"] = glob.glob(
        os.path.join(state["binfolder"], "*{}".format(state["suffix"])))
    logging.info("Found {} bins".format(len(state["input_bins"])))
    if len(state["input_bins"]) == 0:
        logging.error("Stopping as no bins in folder")
        exit(222)
    refine(state)
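
A minimal, self-contained sketch of the two checks above: the EUKCC2_DB environment-variable fallback and the bin discovery glob. The paths and the suffix are made up for illustration.

import glob
import os

def resolve_db(db_arg):
    # an explicit --db value wins; otherwise fall back to the EUKCC2_DB env variable
    return db_arg if db_arg is not None else os.environ.get("EUKCC2_DB")

def discover_bins(binfolder, suffix=".fa"):
    # same pattern as above: every file in the bin folder ending in the suffix
    return glob.glob(os.path.join(binfolder, "*{}".format(suffix)))

os.environ["EUKCC2_DB"] = "/data/eukcc_db"       # hypothetical path
print(resolve_db(None))                          # -> /data/eukcc_db
print(resolve_db("/other/db"))                   # -> /other/db
print(discover_bins("bins", suffix=".fa"))       # -> [] if the folder has no matching bins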
Example #2
    def __init__(
        self, program, workdir, infiles, outfiles, cores=1, touch=False, **kwargs
    ):
        # check software is in path:
        if which(program) is None:
            raise EnvironmentError("Could not find executable: {}".format(program))
        self.success = None

        if not isinstance(infiles, list):
            infiles = [infiles]
        if not isinstance(outfiles, list):
            outfiles = [outfiles]

        # create workdir
        file.isdir(workdir)

        # check for all infiles
        for infile in infiles:
            if infile is None or (
                not os.path.exists(infile)
                and not os.path.exists("{}.h3f".format(infile))
            ):
                raise OSError("File not found: {}".format(infile))
        # make sure all infiles are abspath
        infiles = [os.path.abspath(x) for x in infiles]

        # check if running is required
        if self.run_needed(workdir, infiles, outfiles):
            logging.debug("Running {}".format(program))
            self.run(program, workdir, infiles, outfiles, cores, kwargs)
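
This constructor is the pattern behind the tool wrappers used in the later examples: hmmalign, epa_split and epa_ng are all called as Wrapper(program, workdir, infiles, outfiles, ...). Below is a minimal self-contained sketch of that pattern with a hypothetical class name and a no-op run(); the real base class and its file checks are not reproduced here.

import tempfile
from shutil import which

class ToolRunner:  # hypothetical name; the real base class name is not shown above
    def __init__(self, program, workdir, infiles, outfiles, cores=1, **kwargs):
        # as above: fail early if the executable is not on PATH
        if which(program) is None:
            raise EnvironmentError("Could not find executable: {}".format(program))
        self.success = None
        self.run(program, workdir, infiles, outfiles, cores, kwargs)

    def run(self, program, workdir, infiles, outfiles, cores, kwargs):
        # a real subclass would build and execute the command line here
        self.success = True

# usage mirrors hmmalign("hmmalign", wd, [aln, hmm, faa], [out]) from Example #3;
# "ls" only stands in for an executable that is certain to exist on a POSIX system
job = ToolRunner("ls", tempfile.mkdtemp(), ["in.faa"], ["out.aln"])
print(job.success)  # True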
Example #3
    def align_marker_genes(self, fasta, markers, workdir, output):
        """
        Align marker genes identified using hmmsearch to be passed on to EPA

        :param fasta: path to a protein fasta file
        :param markers: list of marker gene hit rows obtained from read_hmmer
        :param workdir: folder to write tmp files to
        :param output: path to the final expected output file
        """
        wd = workdir
        file.isdir(wd)
        md = defaultdict(list)
        for row in markers:
            md[row["query"]].append(row["target"])
        # prots file
        prots = {}

        all_profiles = set(
            self.state["dbinfo"]["files"]["hmm_placement_place"].keys())
        for profile, ids in md.items():
            if profile not in self.state["dbinfo"]["files"][
                    "hmm_placement_place"].keys():
                logging.error(
                    "Profile {} is missing the hmm and alignment files".format(
                        profile))
                exit(200)
            hmmfile = self.state["dbinfo"]["files"]["hmm_placement_place"][
                profile]["hmm"]
            alnfile = self.state["dbinfo"]["files"]["hmm_placement_place"][
                profile]["aln"]
            prot_file = os.path.join(wd, "{}.faa".format(profile))
            prot_file = Fasta.reduce_fasta(fasta, prot_file, ids)

            aligned = os.path.join(wd, "{}.fasta".format(profile))
            hmmalign(
                "hmmalign",
                wd,
                [alnfile, hmmfile, prot_file],
                [aligned],
            )
            # check that alignment has also non gap sites, else epa-ng will fail
            if check_alignment(aligned, prot_file):
                prots[profile] = aligned
                all_profiles.remove(profile)
            else:
                logging.debug("Skipped alignment, as it is only gaps")

        if len(prots.keys()) == 0:
            logging.debug("No alignments could be done")
            return None

        for profile in all_profiles:
            prots[profile] = self.state["dbinfo"]["files"][
                "hmm_placement_place"][profile]["aln"]

        logging.debug("Concatenating alignments")
        return horizontal_concatenate(output, [v for k, v in prots.items()],
                                      list(prots.keys()))
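
Judging only from the keys accessed above, each row in markers needs at least a "query" (the profile name) and a "target" (the protein id), as produced by read_hmmer. A tiny standalone sketch of how those rows are grouped into per-profile id lists; the profile and protein names are made up.

from collections import defaultdict

# hypothetical hmmsearch rows; only the keys used above are shown
markers = [
    {"query": "PTHR10000", "target": "contig_1_orf3"},
    {"query": "PTHR10000", "target": "contig_7_orf1"},
    {"query": "PTHR22222", "target": "contig_2_orf9"},
]

md = defaultdict(list)
for row in markers:
    md[row["query"]].append(row["target"])

print(dict(md))
# {'PTHR10000': ['contig_1_orf3', 'contig_7_orf1'], 'PTHR22222': ['contig_2_orf9']}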
Example #4
    def __init__(self, state):
        # state is kept internally
        self.state = state
        self.state.checkpoint("launch")
        for key in ["db", "fasta", "out"]:
            if key in state.keys():
                self.state[key] = make_absolute(state[key])

        logging.debug("Using database located at: {}".format(self.state["db"]))
        if self.state["workdir"] is None or self.state["workdir"] == "":
            self.state["workdir"] = os.path.join(state["out"], "workdir")
        file.isdir(self.state["workdir"])

        # check if the database is accessible
        self.state["loaded_dbs"] = {}
        self.state["dbinfo"] = self.load_db(state["db"], "base")
        if state["clade"] != "base":
            self.state["dbinfo"] = self.load_db(state["db"], state["clade"])
        logging.debug("Using database '{}' with PANTHER version {}".format(
            self.state["dbinfo"]["name"], state["dbinfo"]["panther_version"]))

        # validate AA type
        self.state["seqtype"] = state["seqtype"].upper()
        if self.state["seqtype"] not in ["AA", "DNA"]:
            raise ValueError("seqtype should be either DNA or AA")

        if Fasta.validate_fasta(self.state["fasta"]) is False:
            raise ValueError("The provided fasta is corrupt or maybe gzipped?")

        if self.state["set_number_species"] < 3:
            logging.warning(
                "You included less than 3 species in the set creation, this might lead to unstable results"
            )
        if self.state["marker_prevalence"] < 95:
            logging.warning(
                "You selected a marker prevalence below 95%, this can lead to wrong results"
            )

        if self.state["set_size"] > state["max_set_size"]:
            self.state["max_set_size"] = state["set_minimal_size"]
            logging.warning(
                "Your minimal set size is bigger than the maximial set size. Thus we increased the maximal set size to fit the minimal set size. Both are now {}"
                .format(self.state["max_set_size"]))

        logging.debug(
            "Using {threads} threads".format(threads=self.state["threads"]))

        if self.state["rerun"]:
            logging.debug("deleting workdir to rerun all of the analysis")
            self.remove_workdir(self.state["workdir"])

        # check dependencies
        self.check_dependencies()
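
Read off the accesses in this constructor, the state it receives needs at least the keys below; the values are illustrative only and not the package defaults.

# illustrative values only; keys inferred from the constructor above
state = {
    "db": "/data/eukcc_db",        # or resolved from EUKCC2_DB beforehand
    "fasta": "genome.fa",
    "out": "out_dir",
    "workdir": "",                 # empty -> defaults to <out>/workdir
    "clade": "base",               # base, fungi, protozoa or plants
    "seqtype": "dna",              # upper-cased and checked against AA/DNA
    "set_number_species": 3,
    "marker_prevalence": 95,
    "set_size": 10,
    "max_set_size": 20,
    "set_minimal_size": 10,
    "threads": 1,
    "rerun": False,
}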
Example #5
def merge_bins(parent, children, workdir):
    """
    Function to merge a single parent with n children of class bin
    Returns an eukcc_state object
    """
    # create an all_bins container
    all_bins = [parent]
    for c in children:
        all_bins.append(c)

    # define a name for this merged bin, this is used in
    # file paths and later for the output
    names = [x.name for x in all_bins]
    name = "merged" + "_".join(names)
    wd = os.path.join(workdir, name)

    # We use the children's faa files to search for missing parent markers
    faa_path = os.path.abspath(os.path.join(wd, "{}.faa".format(name)))
    if not os.path.exists(faa_path):
        faas = [x.state["faa"] for x in children]
        file.isdir(wd)
        merge_fasta(faas, faa_path, seperator=None)

    # initialize a new eukcc state using the parent as a starting point
    state = eukcc_state(workdir, parent.state)
    state["faa"] = faa_path
    state["name"] = name
    state["workdir"] = os.path.abspath(wd)
    # Caching is done for relaunches
    svp = os.path.join(state["workdir"], "save.json.gz")
    if os.path.exists(svp):
        state.load_state(svp)
        return state
    else:
        # Initialize a eukcc class
        E = eukcc(state)
        # hmmsearch is done using the parent's pressed hmms
        E.state["scmg_data"] = E.hmmsearch_scmg(
            E.state["workdir"],
            E.state["faa"],
            E.state["marker_set"]["profiles"],
            use_hmm=parent.state["estimate_hmm_path"],
        )
        # merge parent results with kids
        E.state["scmg_data"].extend(parent.state["scmg_data"])
        E.compute_quality(E.state["scmg_data"],
                          E.state["marker_set"]["profiles"])
        # saving this computation for a quick reload
        E.state.save_state(svp)
        return E.state
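
The merged bin is identified purely by the concatenated member names, which also determine its working directory and the save.json.gz cache location. A standalone sketch of that naming, with made-up bin names:

import os

# made-up bin names; the parent comes first, followed by the children
names = ["bin1.fa", "bin7.fa", "bin9.fa"]
name = "merged" + "_".join(names)
wd = os.path.join("workdir", "mergers", name)

print(name)                                  # mergedbin1.fa_bin7.fa_bin9.fa
print(os.path.join(wd, "save.json.gz"))      # cache checked before recomputing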
Example #6
def split_contig_faa(path, workdir, delim="_binsep_"):
    faa_dir = os.path.join(workdir, "faa")
    if os.path.exists(faa_dir):
        logging.info(
            "Faa folder exists, remove if you want to rerun this analsys, will reuse it for now"
        )
        return [
            os.path.abspath(os.path.join(faa_dir, x))
            for x in os.listdir(faa_dir)
        ]
    elif file.isdir(faa_dir):
        fls = {}
        for seq in Fasta(path):
            b = seq.name.split(delim, 1)[0]
            contig = seq.name.split(delim, 1)[1]
            if b not in fls.keys():
                fls[b] = open(os.path.join(faa_dir, b), "w")
            fls[b].write(">{name}\n{seq}\n".format(name=contig, seq=seq.seq))
        # close all files
        for key, fl in fls.items():
            fl.close()
        return [
            os.path.abspath(os.path.join(faa_dir, x))
            for x in os.listdir(faa_dir)
        ]
    else:
        logging.warning("Could not create fasta split dir: {}".format(faa_dir))
Example #7
def refine(state):
    """
    main refinement pipeline
    It takes a state argument and does the rest. It is basically a per-bin EukCC run
    with added merging features
    """
    state["contigs"] = os.path.join(state["workdir"], "contigs.fasta")
    if not os.path.exists(state["contigs"]):
        logging.debug("Merging bins into temp contig file")
        state["contigs"] = merge_fasta(state["input_bins"],
                                       state["contigs"],
                                       seperator="_binsplit_")
    else:
        logging.debug(
            "Using existing merged contigs, delete output folder if thats not what you want."
        )

    # initialize output tables
    result_table = os.path.join(state["out"], "eukcc.csv")
    write_table(None, result_table, header=True)

    merged_table = os.path.join(state["out"], "merged_bins.csv")
    note_merges(None, None, merged_table, header=True)

    # predict proteins using metaeuk
    state["fasta"] = state["contigs"]
    logging.debug("Initialize EukCC")
    E = eukcc(state)
    E.predict_protein()
    # split proteins into Bin files
    # this also gets rid of bins with zero proteins
    state["faas"] = split_contig_faa(state["faa"],
                                     state["workdir"],
                                     delim="_binsplit_")

    if state["links"] is not None:
        link_table = read_link_table(state["links"])

    # for each bin create a EukCC run and try to place and estimate its completeness
    bins = []
    for i, path in enumerate(state["faas"]):
        logging.debug("Running EukCC on: {} ({}/{})".format(
            os.path.basename(path), i + 1, len(state["faas"])))
        wd = os.path.join(state["workdir"], "refine",
                          "bin_{}".format(os.path.basename(path)))
        bins.append(bin(state, wd, path, protein=True))

    smallbins = []
    for i, b in enumerate(bins):
        if b.state["quality"] is None:
            smallbins.append(i)
            continue
        elif b.state["quality"]["completeness"] < 50:
            smallbins.append(i)
            write_table(b.state, result_table)
            continue
        else:
            write_table(b.state, result_table)

    # get all combinations of small bins
    # s_cmb = n_combi(smallbins, state["n_combine"])
    # logging.info("Created {} possible combinations of small bins".format(len(s_cmb)))

    n_large_bins = len(bins) - len(smallbins)
    logging.info("Found {} large bins to merge with".format(n_large_bins))

    refined = []

    def get_name(i):
        return bins[i].name.split("_", 1)[1]

    def combine_bins(idx, children, smallbins, stop=0):
        all_comb = []

        possible_kids = []
        if state["links"] is not None:
            possible_kids = []
            connected_kids = connected_bins(get_name(idx), link_table,
                                            state["min_links"])
            for k in children:
                connected_kids = connected_kids.union(
                    connected_bins(get_name(k), link_table,
                                   state["min_links"]))
            # convert names back to indices
            for i, b in enumerate(bins):
                if get_name(i) in connected_kids:
                    possible_kids.append(i)
        else:
            possible_kids = smallbins

        possible_kids = set(possible_kids) - set(children)

        # add a single kid and evaluate again
        if stop < 1:
            s = list(set(children) | set([idx]))
            s.sort()
            return [tuple(s)]
        else:
            for kid in possible_kids:
                k = children.copy()
                k.append(kid)
                all_comb.extend(combine_bins(idx, k, smallbins, stop=stop - 1))
            return all_comb

    # mergers = defaultdict(lambda: {'parent': None, "children": [], "gain": []})
    parent_bins = [i for i, b in enumerate(bins) if i not in smallbins]
    for parent_bin in parent_bins:
        parent_name = bins[parent_bin].name.split("_", 1)[1]

        # we find all combinations that are valid to merge with
        combies = []
        for i in range(state["n_combine"]):
            ks = combine_bins(parent_bin, [], smallbins, stop=i)
            combies.extend(ks)
        combies = list(set(combies))
        # sort combinations by length
        combies = sorted(combies, key=len, reverse=False)

        logging.info("For bin {} we found {} merging combinations".format(
            parent_name, len(combies)))
        for kid_ids in combies:
            # turn kid indices into a sorted list, so they are reproducible
            kid_ids = list(kid_ids)
            kid_ids.sort()
            kid_ids = [i for i in kid_ids if i != parent_bin]
            children = [bins[i] for i in kid_ids]
            if len(children) == 0:
                continue
            logging.debug(
                "Testing bin combination for bin {}".format(parent_name))
            logging.debug("Adding in bins {}".format(",".join(
                [get_name(i) for i in kid_ids])))
            # make all merges and see if they are great or not
            merged = merge_bins(bins[parent_bin], children,
                                os.path.join(state["workdir"], "mergers"))

            # identify the parent bin: for [A, B, C] the parent can be [A, B] or [A, C],
            # or, if neither of those exists, it should be [A]
            compare_state = bins[parent_bin].state
            if len(children) > 1:
                for c in combinations(kid_ids, len(kid_ids) - 1):
                    for potential_parent in refined:
                        if set(c) == set(potential_parent["children_idx"]):
                            compare_state = potential_parent
                            break

            gain_cp = round(
                merged["quality"]["completeness"] -
                compare_state["quality"]["completeness"],
                2,
            )
            gain_ct = round(
                merged["quality"]["contamination"] -
                compare_state["quality"]["contamination"],
                2,
            )
            if gain_cp >= state["improve_percent"] and gain_cp > (
                    state["improve_ratio"] * gain_ct):
                logging.warning(
                    "Successfull merge: +{}% Compl. +{}% Cont.".format(
                        gain_cp, gain_ct))
                merged["children_idx"] = kid_ids
                merged["parent_idx"] = parent_bin
                refined.append(merged)

    # logging.info("Iterated all possible combinations")
    # refined = remove_double_kids(refined, bins)

    # Here we just have to export the bins at this point. That's easy and quick.
    # For each new bin we rerun eukcc with a new metaeuk run, so we get a final verdict
    ref_dir = os.path.join(state["out"], "merged_bins")
    if os.path.exists(ref_dir):
        logging.warning(
            "Folder merged_bins alerady exist. Will not overwrite. Please remove the folder."
        )
        merged_fnas = [os.path.join(ref_dir, x) for x in os.listdir(ref_dir)]
    else:
        file.isdir(ref_dir)
        merged_fnas = []
        for merged_i, merged in enumerate(refined):
            new_name = "{}{}{}".format(state["prefix"], merged_i,
                                       state["suffix"])
            merged_fna = os.path.join(ref_dir, new_name)
            # combined index list
            idxs = [merged["parent_idx"]]
            for i in merged["children_idx"]:
                idxs.append(i)
            names = [os.path.basename(bins[i].state["faa"]) for i in idxs]
            names = [n.split("_", 1)[1] for n in names]
            # write that to file
            note_merges(new_name, names, merged_table)

            fnas = [os.path.join(state["binfolder"], name) for name in names]
            merged_fna = merge_fasta(fnas, merged_fna, seperator=None)
            merged_fnas.append(merged_fna)
            logging.info("Created combined fasta {}".format(merged_fna))

    evaluate_multiples(merged_fnas, result_table, state)
    logging.info("Created {} merged bins".format(len(refined)))
Example #8
    def run_EPA(self):
        wd = os.path.join(self.state["workdir"], "epa-ng", self.state["clade"])
        file.isdir(wd)
        aligned_file = os.path.join(wd, "alignment.fasta")
        if file.isnewer(self.state["faa"], aligned_file):
            align_wd = os.path.join(wd, "align")
            aligned_file = self.align_marker_genes(self.state["faa"],
                                                   self.state["marker_genes"],
                                                   align_wd, aligned_file)
            if aligned_file is None:
                logging.debug("No marker genes aligned")
                return None
            # once the horizontal alignment is created we can delete
            # all small alignments
            file.remove(align_wd)

        # then split into query
        RS = epa_split(
            "epa-ng",
            wd,
            [
                self.state["dbinfo"]["files"]["backbone_alignment"],
                aligned_file
            ],
            ["query.fasta", "reference.fasta"],
        )
        # Return None if splitting failed
        if RS.success is False:
            return None

        # then epa place them
        R = epa_ng(
            "epa-ng",
            wd,
            [
                os.path.join(wd, "reference.fasta"),
                self.state["dbinfo"]["files"]["backbone_tree"],
                os.path.join(wd, "query.fasta"),
            ],
            ["epa_result.jplace"],
            model=self.state["dbinfo"]["iqtree_model"],
            cores=self.state["threads_epa"],
        )
        # Return None if placement failed
        if R.success is False:
            return None
        # load the results
        try:
            with open(os.path.join(wd, "epa_result.jplace"), "r") as info_file:
                data = info_file.read()
                info = json.loads(data)
        except json.decoder.JSONDecodeError:
            jfile = os.path.join(wd, "epa_result.jplace")
            logging.error(
                "Malformed epa-ng result file. Did you ungracefully terminate? Remove the file {} and try again"
                .format(jfile))
            exit(203)
        info["tog"] = self.guppy_tree(self.state["workdir"],
                                      os.path.join(wd, "epa_result.jplace"))
        logging.debug("Placed {} markers in the reference tree".format(
            len(info["placements"])))
        return info
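
The placement result is a standard jplace file; the only fields used above are the top-level "placements" list (for the count) and the "tog" tree added afterwards from guppy. A hedged sketch of the JSON parse with a minimal made-up jplace-like payload:

import json

# minimal made-up jplace-like payload; only the field used above is shown
data = '{"placements": [{"p": [[0, -1234.5, 1.0, 0.01, 0.02]], "n": ["contig_1_orf3"]}]}'
info = json.loads(data)
print(len(info["placements"]))   # number of markers placed in the reference tree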
Example #9
    def hmmsearch_scmg(self,
                       workdir,
                       fasta_faa,
                       profiles,
                       keep_hmm=False,
                       use_hmm=None,
                       cut_ga=True):
        """
        Fetch, press and search the markers specified by the profiles list.
        Will call hmmfetch, hmmpress and hmmsearch internally

        :param workdir: folder to create tmp files under
        :param fasta_faa: predicted proteome
        :param profiles: list or set of profiles, must be in reference database
        :param keep_hmm: keep the fetched hmm file and store its path in the state
        :param use_hmm: path to an already pressed hmm file to use instead of fetching
        :param cut_ga: apply the GA bit score cutoffs during hmmsearch
        """

        # creating a hash from the profiles
        # so set adjustments will lead to new results
        profiles = list(profiles)
        profiles.sort()
        m = hashlib.sha256()
        # concatenate all parameters to a single hash
        hash_str = "-".join(profiles)
        hash_str = hash_str + str(cut_ga)

        m.update(hash_str.encode())
        key = m.hexdigest()[0:12]

        logging.debug("Searching for selected markers")
        wd = os.path.join(workdir, "hmm", "singlecopy", self.state["clade"])
        file.isdir(wd)
        outfile = "found_markers_{}.tbl".format(key)
        cutoff_file = "found_markers_{}_GA.csv".format(key)

        # only run if outfile is older than fasta_faa
        if file.isnewer(fasta_faa, os.path.join(wd, outfile)) or file.isnewer(
                fasta_faa, os.path.join(wd, cutoff_file)):
            if use_hmm is None:
                # fetch and press hmms from database
                hmmfile = self.hmmfetch(wd, profiles)
                logging.debug("Pressing hmms")
                hmmpress("hmmpress", wd, hmmfile, "selected_hmms.hmm.h3m")
            else:
                hmmfile = use_hmm

            logging.info("Searching fasta for selected markers")
            hmmsearch(
                "hmmsearch",
                workdir=wd,
                infiles=[hmmfile, fasta_faa],
                outfiles=[outfile],
                cores=self.state["threads"],
                cut_ga=cut_ga,
            )

            # read GA values from fetched hmm file, needed for some stats in the end
            cutoff_file = hmmer_cutoffs(hmmfile, os.path.join(wd, cutoff_file))
            if keep_hmm:
                self.state["estimate_hmm_path"] = hmmfile
            else:
                # we can remove hmms and pressed hmms
                file.delete_but(
                    wd,
                    keep=[
                        os.path.basename(x)
                        for x in glob(os.path.join(wd, "found_markers_*"))
                    ],
                )

        return read_hmmer(os.path.join(wd, outfile),
                          os.path.join(wd, cutoff_file))
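
The cache key above is the first 12 hex characters of a SHA-256 over the sorted profile names plus the cut_ga flag, so any change to the marker set or the cutoff mode produces new output files. A standalone sketch of the same derivation with dummy profile names:

import hashlib

def cache_key(profiles, cut_ga=True):
    # same recipe as above: sorted profiles joined by "-", cut_ga appended, sha256, first 12 hex chars
    profiles = sorted(profiles)
    hash_str = "-".join(profiles) + str(cut_ga)
    return hashlib.sha256(hash_str.encode()).hexdigest()[0:12]

# dummy profile names; a real run would use the marker set profiles
print(cache_key({"PTHR10000", "PTHR22222"}))   # stable 12-character key
print(cache_key({"PTHR10000"}))                # different set -> different key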
Example #10
def main():
    # set arguments
    # arguments are passed to classes
    parser = argparse.ArgumentParser(
        description="Evaluate completeness and contamination of a MAG."
    )
    parser.add_argument(
        "genomes", type=str, help="Find marker for these genomes", nargs="+"
    )
    parser.add_argument(
        "--out",
        "-o",
        type=str,
        required=False,
        help="Path to output folder (Default: .)",
        default=".",
    )
    parser.add_argument("--db", type=str, default=None, help="Path to EukCC DB")
    parser.add_argument(
        "--threads", type=int, help="Number of threads to use (Default: 1)", default=1
    )
    parser.add_argument(
        "--tree",
        type=int,
        help="Number of profiles to use at target for tree profiles (default: 30)",
        default=30,
    )
    parser.add_argument(
        "--clade",
        default="base",
        type=str,
        help="Define clade as base, fungi, protozoa or plants (Defaut: base)",
    )
    parser.add_argument(
        "--quiet",
        "-q",
        dest="quiet",
        action="store_true",
        default=False,
        help="Silcence most output",
    )
    parser.add_argument(
        "--debug",
        "-d",
        action="store_true",
        default=False,
        help="Debug and thus ignore safety",
    )
    parser.add_argument(
        "-v",
        "--version",
        action="version",
        version="EukCC version {}".format(version.__version__),
    )
    args = parser.parse_args()
    state = eukcc_state(
        workdir=os.path.join(args.out, "refine_workdir"), options=vars(args)
    )
    file.isdir(state["workdir"])

    # define logging
    logLevel = logging.INFO
    if state["quiet"]:
        logLevel = logging.WARNING
    elif state["debug"]:
        logLevel = logging.DEBUG
    logging.basicConfig(
        format="%(asctime)s %(message)s",
        datefmt="%d-%m-%Y %H:%M:%S: ",
        level=logLevel,
    )
    # if db is not set, we check for env variable
    if state["db"] is None:
        if os.environ.get("EUKCC2_DB") is not None:
            state["db"] = os.environ.get("EUKCC2_DB")
            logging.debug(
                "Defined db via env variable EUKCC2_DB as '{}'".format(state["db"])
            )
        else:
            logging.error("No database was provided via --db or EUKCC2_DB env variable")
            exit(202)

    logging.info("EukCC version {}".format(version.__version__))

    logging.info(
        "Looking for shared markers across {} genomes".format(len(state["genomes"]))
    )
    n_per_worker = 4  # using more threads for hmmer makes no sense, so we parallelize across genomes
    if state["threads"] > (2 * n_per_worker):
        # multithreading pool
        n_processes = math.floor(state["threads"] / n_per_worker)
        logging.info(
            "Launching {} threads with {} threads each".format(
                n_processes, n_per_worker
            )
        )
        pool = Pool(processes=n_processes)
        # copy the options and lower the thread count handed to each worker
        opt = {k: v for k, v in state.opt.items()}
        opt["threads"] = n_per_worker
        search_genome_p = partial(search_genome, state=opt)
        data = pool.map(search_genome_p, state["genomes"])
        pool.close()
        pool.join()
    else:
        data = []
        for genome in state["genomes"]:
            data.append(search_genome(genome, state))

    tree_profiles = define_tree_set(data, n_target=args.tree)
    result = find_intersection(data, missing=3)
    outfile = os.path.join(state["out"], "profiles.txt")
    with open(outfile, "w") as fout:
        for key, profiles in result.items():
            for profile in profiles:
                fout.write("{}\t{}\n".format(key, profile))
        for profile in tree_profiles:
            fout.write("{}\t{}\n".format("tree", profile))
    logging.info("wrote profiles to {}".format(outfile))
Example #11
    def test_file_isdir(self):
        self.assertTrue(file.isdir(TESTDATA_dir))