Exemple #1
0
def test_qiime_db(tmp_path):
    """Check the artifact metadata and the manifest loaded from the Qiime DB.

    The metadata must carry a `uuid` and the `MetabolicModels[JSON]` type,
    and the manifest loaded into `tmp_path` must list three existing files.
    """
    info = qf.metadata(db)
    assert "uuid" in info
    assert info["type"] == "MetabolicModels[JSON]"

    unpacked = qf.load_qiime_model_db(db, str(tmp_path))
    n_models = unpacked.shape[0]
    assert n_models == 3
    # Every file referenced by the manifest has to be present on disk.
    for model_file in unpacked.file:
        assert path.exists(model_file)
Exemple #2
0
def check_db_medium(model_db, medium, threads=1):
    """Check whether all models in a database can grow on a medium.

    Arguments
    ---------
    model_db : str
        A pre-built model database. If ending in `.qza` must be a Qiime 2
        artifact of type `MetabolicModels[JSON]`. Can also be a folder or a
        zip (must end in `.zip`) file.
    medium : pd.DataFrame
        A growth medium. Must have columns "reaction" and "flux" denoting
        exchange reactions and their respective maximum flux. Can not be sample
        specific.
    threads : int >=1
        The number of parallel workers to use when building models. As a
        rule of thumb you will need around 1GB of RAM for each thread.

    Returns
    -------
    pd.DataFrame
        Returns an annotated manifest file with a column `can_grow` that tells you
        whether the model can grow on the (fixed) medium, and a column `growth_rate`
        that gives the growth rate.
    """
    medium = process_medium(medium, ["dummy"])
    medium.index = medium.global_id
    compressed = model_db.endswith((".qza", ".zip"))
    # Compressed databases are extracted into a scratch directory that has to
    # be removed again, even if loading or one of the workers fails.
    tdir = TemporaryDirectory(prefix="micom_") if compressed else None
    try:
        if model_db.endswith(".qza"):
            manifest = load_qiime_model_db(model_db, tdir.name)
        elif model_db.endswith(".zip"):
            manifest = load_zip_model_db(model_db, tdir.name)
        else:
            manifest = load_manifest(model_db)
        rank = manifest["summary_rank"][0]
        logger.info(
            "Checking %d %s-level models on a medium with %d components." %
            (manifest.shape[0], rank, len(medium)))

        args = [(f, medium.flux) for f in manifest.file]
        results = workflow(_grow, args, threads)
    finally:
        if tdir is not None:
            tdir.cleanup()

    manifest["growth_rate"] = results
    # A missing growth rate counts as "can not grow", as does anything that
    # is not above the 1e-6 tolerance.
    manifest["can_grow"] = (manifest.growth_rate.notna()
                            & (manifest.growth_rate > 1e-6))

    return manifest
Exemple #3
0
def db_annotations(
    model_db,
    threads=1,
):
    """Get metabolite annotations from a model DB.

    Arguments
    ---------
    model_db : str
        A pre-built model database. If ending in `.qza` must be a Qiime 2
        artifact of type `MetabolicModels[JSON]`. Can also be a folder or a
        zip (must end in `.zip`) file.
    threads : int >=1
        The number of parallel workers to use when building models. As a
        rule of thumb you will need around 1GB of RAM for each thread.

    Returns
    -------
    pd.DataFrame
        Annotations for all exchanged metabolites.
    """
    compressed = model_db.endswith((".qza", ".zip"))
    # Compressed databases are extracted into a scratch directory that has to
    # be removed again, even if loading or one of the workers fails.
    tdir = TemporaryDirectory(prefix="micom_") if compressed else None
    try:
        if model_db.endswith(".qza"):
            manifest = load_qiime_model_db(model_db, tdir.name)
        elif model_db.endswith(".zip"):
            manifest = load_zip_model_db(model_db, tdir.name)
        else:
            manifest = load_manifest(model_db)
        rank = manifest["summary_rank"][0]
        logger.info("Getting annotations from %d %s-level models ." %
                    (manifest.shape[0], rank))

        args = manifest.file.tolist()
        results = workflow(_annotate, args, threads)
    finally:
        if tdir is not None:
            tdir.cleanup()

    # Combine the per-model results and drop duplicated rows.
    anns = pd.concat(results).drop_duplicates()

    return anns
Exemple #4
0
    def __init__(
        self,
        taxonomy,
        model_db=None,
        id=None,
        name=None,
        rel_threshold=1e-6,
        solver=None,
        progress=True,
        max_exchange=100,
        mass=1,
    ):
        """Create a new community object.

        `micom` builds a community from a taxonomy which may simply be a list
        of model files in its simplest form. Usually, the taxonomy will contain
        additional information such as annotations for the individuals (for
        instance phylum, organisms or species) and abundances.

        The recommended way to build a micom model is to supply a
        quantification of taxa (called "taxonomy" here) which specifies the
        taxonomic ranks for a taxon and its abundance, and a model database
        for a specific rank (for instance "genus"). MICOM will match the
        ranks from your taxonomy to the model database and assemble the
        community models from that. You will also get information about the
        construction process by calling `Community.build_metrics`.

        The most customizable way only takes a single table where summarization
        and matching to the reference database has already occurred. In this
        case you will also provide paths to model files for each taxon. This is
        the "old" way but may still be applicable if you want to use a custom
        database or want full control of matching your data to reference
        models.

        Notes
        -----
        `micom` will automatically add exchange fluxes and a community
        objective maximizing the overall growth rate of the community.

        Parameters
        ----------
        taxonomy : pandas.DataFrame
            The taxonomy used for building the model. Must have at least the
            column "id". If no model database is specified in the next argument
            it furthermore requires a column "file" which specifies a filepath
            for each model. Valid file extensions are ".pickle", ".xml",
            ".xml.gz" and ".json". If a model database is specified this must
            contain at least a column with the same name as the rank used in
            the model database. Thus, for a genus-level database you will need
            a column `genus`. Additional taxa ranks can also be specified and
            will be used to be more stringent in taxa matching.
            Finally, the taxonomy should contain a column `abundance`. It will
            be used to quantify each individual in the community. If absent,
            MICOM will assume all individuals are present in the same amount.
        model_db : str
            A pre-built model database. If ending in `.qza` must be a Qiime 2
            artifact of type `MetabolicModels[JSON]`. Can also be a folder,
            zip (must end in `.zip`) file or None if the taxonomy contains a
            column `file`.
        id : str, optional
            The ID for the community. Should only contain letters and numbers,
            otherwise it will be formatted as such.
        name : str, optional
            The name for the community.
        rel_threshold : float < 1, optional
            The relative abundance threshold that will be used. Describes the
            smallest relative amount of an individual that will be considered
            non-zero. All individuals with a smaller relative amount will be
            omitted.
        solver : str, optional
            Which solver to use. If not given the first available one of
            cplex, osqp, gurobi and glpk (in that order) is used.
        progress : bool, optional
            Show a progress bar.
        max_exchange : positive float, optional
            During model constructions exchange reactions are duplicated into
            internal and external exchange reactions. This specifies the new
            import flux bound for the *internal* exchange reaction. Import
            rates for the exchanges between the medium and outside are still
            maintained.
        mass : positive float, optional
            The total mass of the community in gDW. Used to adjust import
            fluxes which are assumed to be given as mmol/gDW*h for the
            entire community. As a consequence all import fluxes will be
            divided by that number.

        Attributes
        ----------
        taxa : list
            A list of taxa IDs in the community.

        Raises
        ------
        ValueError
            If `taxonomy` is not a DataFrame with an `id` column, if neither
            a model database nor a `file` column is given, or if the rank of
            the model database is missing from the taxonomy.

        """
        super(Community, self).__init__(id, name)

        logger.info("building new micom model {}.".format(id))
        if not solver:
            solver = [
                s for s in ["cplex", "osqp", "gurobi", "glpk"]
                if s in cobra.util.solver.solvers
            ][0]
        logger.info("using the %s solver." % solver)
        if solver == "glpk":
            logger.warning(
                "No QP solver found, will use GLPK. A lot of functionality "
                "in MICOM will require a QP solver :/")
        # NOTE(review): these settings are applied to the solver present
        # before the switch below - confirm they carry over to the new one.
        self.solver.configuration.lp_method = "auto"
        self.solver.configuration.qp_method = "auto"
        self.solver.configuration.presolve = False
        self.solver = solver
        self._rtol = rel_threshold
        self._modification = None
        self.mass = mass
        self.__db_metrics = None
        adjust_solver_config(self.solver)

        # Validate the input before touching it so that a wrong type yields
        # the documented ValueError instead of an AttributeError.
        if not (isinstance(taxonomy, pd.DataFrame)
                and "id" in taxonomy.columns):
            raise ValueError("`taxonomy` must be a pandas DataFrame with at "
                             "least a column `id` :(")
        if model_db is None and "file" not in taxonomy.columns:
            raise ValueError(
                "If no model database is specified you need to pass "
                "file names for models in a `file` column as well.")

        # Work on a copy so the caller's table is not modified.
        taxonomy = taxonomy.copy()
        if "abundance" not in taxonomy.columns:
            taxonomy["abundance"] = 1
        taxonomy.abundance /= taxonomy.abundance.sum()
        logger.info("{} individuals with abundances below threshold".format(
            (taxonomy.abundance <= self._rtol).sum()))
        taxonomy = taxonomy[taxonomy.abundance > self._rtol]

        # Scratch directory for compressed model databases. It has to stay
        # alive until all models are loaded and is removed in the `finally`
        # clause, also when the construction fails.
        tdir = None
        try:
            if model_db is not None:
                if model_db.endswith((".qza", ".zip")):
                    tdir = TemporaryDirectory(prefix="micom_")
                # Files come from the database, so drop user-supplied ones.
                if "file" in taxonomy.columns:
                    del taxonomy["file"]
                if model_db.endswith(".qza"):
                    manifest = load_qiime_model_db(model_db, tdir.name)
                elif model_db.endswith(".zip"):
                    manifest = load_zip_model_db(model_db, tdir.name)
                else:
                    manifest = load_manifest(model_db)
                rank = manifest["summary_rank"][0]
                if rank not in taxonomy.columns:
                    raise ValueError(
                        "Missing the column `%s` from the taxonomy." % rank)
                # Match on all ranks up to the database rank that are
                # present in both tables.
                keep_cols = [
                    r for r in _ranks[0:(_ranks.index(rank) + 1)]
                    if r in taxonomy.columns and r in manifest.columns
                ]
                manifest = manifest[keep_cols + ["file"]]
                merged = pd.merge(taxonomy, manifest, on=keep_cols)

                self.__db_metrics = pd.Series({
                    "found_taxa":
                    merged.shape[0],
                    "total_taxa":
                    taxonomy.shape[0],
                    "found_fraction":
                    merged.shape[0] / taxonomy.shape[0],
                    "found_abundance_fraction":
                    merged.abundance.sum(),
                })
                # Look the metric up by label; integer keys on a
                # string-indexed Series are deprecated in pandas.
                found_abundance = self.__db_metrics[
                    "found_abundance_fraction"]
                logger.info("Matched %g%% of total abundance in model DB." %
                            (100.0 * found_abundance))
                if found_abundance < 0.5:
                    logger.warning(
                        "Less than 50%% of the abundance could be matched to the "
                        "model database. Model `%s` may not be representative "
                        "of the sample" % self.id)
                taxonomy = merged
                # Renormalize so the matched taxa sum up to one again.
                taxonomy["abundance"] /= taxonomy["abundance"].sum()

            if taxonomy.id.str.contains(r"[^A-Za-z0-9_]", regex=True).any():
                logger.warning("Taxa IDs contain prohibited characters and"
                               " will be reformatted.")
                taxonomy.id = taxonomy.id.replace(r"[^A-Za-z0-9_\s]+",
                                                  "_",
                                                  regex=True)

            self.__taxonomy = taxonomy
            self.__taxonomy.index = self.__taxonomy.id

            obj = Zero
            self.taxa = []
            index = self.__taxonomy.index
            index = tqdm(index, unit="models") if progress else index
            for idx in index:
                row = self.__taxonomy.loc[idx]
                # A taxon may map to several model files which are joined
                # into a single model.
                if isinstance(row.file, list):
                    if len(row.file) > 1:
                        model = join_models(row.file)
                        logger.info("joined {} models".format(len(row.file)))
                    else:
                        model = load_model(row.file[0])
                else:
                    model = load_model(row.file)
                suffix = "__" + idx.replace(" ", "_").strip()
                logger.info("converting IDs for {}".format(idx))
                external = cobra.medium.find_external_compartment(model)
                logger.info(
                    "Identified %s as the external compartment for %s. "
                    "If that is wrong you may be in trouble..." %
                    (external, idx))
                # Make all IDs taxon-specific by appending the suffix and
                # remember the original (cleaned) ID as `global_id`.
                for r in model.reactions:
                    r.global_id = clean_ids(r.id)
                    r.id = r.global_id + suffix
                    r.community_id = idx
                    # avoids https://github.com/opencobra/cobrapy/issues/926
                    r._compartments = None
                    # SBO terms may not be maintained
                    if "sbo" in r.annotation:
                        del r.annotation["sbo"]
                for m in model.metabolites:
                    m.global_id = clean_ids(m.id)
                    m.id = m.global_id + suffix
                    m.compartment += suffix
                    m.community_id = idx
                logger.info(
                    "adding reactions for {} to community".format(idx))
                self.add_reactions(model.reactions)
                o = self.solver.interface.Objective.clone(model.objective,
                                                          model=self.solver)
                # The community objective is the abundance-weighted sum of
                # the individual objectives.
                obj += o.expression * row.abundance
                self.taxa.append(idx)
                taxa_obj = self.problem.Constraint(o.expression,
                                                   name="objective_" + idx,
                                                   lb=0.0)
                self.add_cons_vars([taxa_obj])
                self.__add_exchanges(
                    model.reactions,
                    row,
                    external_compartment=external,
                    internal_exchange=max_exchange,
                )
                self.solver.update()  # to avoid dangling refs due to lazy add
        finally:
            if tdir is not None:
                tdir.cleanup()

        com_obj = add_var_from_expression(self,
                                          "community_objective",
                                          obj,
                                          lb=0)
        self.objective = self.problem.Objective(com_obj, direction="max")
Exemple #5
0
def complete_db_medium(
    model_db,
    medium,
    growth=0.001,
    max_added_import=1,
    minimize_components=False,
    weights=None,
    threads=1,
):
    """Complete a growth medium for all models in a database.

    Arguments
    ---------
    model_db : str
        A pre-built model database. If ending in `.qza` must be a Qiime 2
        artifact of type `MetabolicModels[JSON]`. Can also be a folder or a
        zip (must end in `.zip`) file.
    medium : pd.DataFrame
        A growth medium. Must have columns "reaction" and "flux" denoting
        exchange reactions and their respective maximum flux. Can not be sample
        specific.
    growth : positive float or pandas.Series
        The minimum growth rate the model has to achieve with the (fixed) medium. If
        a Series will have a minimum growth rate for each id/taxon in the model db.
    max_added_import : positive float
        Maximum import flux for each added additional import not included in the growth
        medium. If positive will expand the medium with additional imports in order to
        fulfill the growth objective.
    minimize_components : boolean
        Whether to minimize the number of components instead of the total
        import flux. Might be more intuitive if set to True but may also be
        slow to calculate.
    weights : str
        Will scale the fluxes by a weight factor. Can either be "mass" which will
        scale by molecular mass, a single element which will scale by
        the elemental content (for instance "C" to scale by carbon content).
        If None every metabolite will receive the same weight.
        Will be ignored if `minimize_components` is True.
    threads : int >=1
        The number of parallel workers to use when building models. As a
        rule of thumb you will need around 1GB of RAM for each thread.

    Returns
    -------
    tuple of (manifest, import fluxes)
        Returns an annotated manifest file with a column `can_grow` that tells you
        whether the model can grow on the (fixed) medium, and a column `added` that
        gives the number of added imports apart from the ones in the medium.
        The import fluxes are a DataFrame indexed by the manifest ids where
        missing entries are filled with zero.
    """
    medium = process_medium(medium, ["dummy"])
    medium.index = medium.global_id
    compressed = model_db.endswith((".qza", ".zip"))
    # Compressed databases are extracted into a scratch directory that has to
    # be removed again, even if loading or one of the workers fails.
    tdir = TemporaryDirectory(prefix="micom_") if compressed else None
    try:
        if model_db.endswith(".qza"):
            manifest = load_qiime_model_db(model_db, tdir.name)
        elif model_db.endswith(".zip"):
            manifest = load_zip_model_db(model_db, tdir.name)
        else:
            manifest = load_manifest(model_db)
        rank = manifest["summary_rank"][0]
        logger.info(
            "Checking %d %s-level models on a medium with %d components." %
            (manifest.shape[0], rank, len(medium)))
        # A scalar growth rate is applied to every model in the database.
        if not isinstance(growth, pd.Series):
            growth = pd.Series(growth, index=manifest.id)

        manifest.index = manifest.id
        args = [(
            manifest.loc[i, "file"],
            medium.flux,
            growth[i],
            max_added_import,
            minimize_components,
            weights,
        ) for i in manifest.index]
        results = workflow(_try_complete, args, threads)
    finally:
        if tdir is not None:
            tdir.cleanup()

    # Each result is consumed as (can_grow, number of added imports, imports).
    manifest["can_grow"] = [r[0] for r in results]
    manifest["added"] = [r[1] for r in results]
    imports = pd.DataFrame.from_records([r[2] for r in results]).fillna(0.0)
    imports.index = manifest.id

    return (manifest, imports)