Search and Newick imports, that don't belong anywhere else. """ from typing import List, Optional, Set from intermake import pr from mhelper import Logger, bio_helper, isFilename import warnings import re from groot import constants from groot.constants import STAGES, EChanges from groot.application import app from groot.data import IHasFasta, global_view, Gene, Model from groot.utilities import cli_view_utils LOG = Logger("import") @app.command(folder=constants.F_IMPORT) def import_genes(file_name: str) -> EChanges: """ Imports a FASTA file into your model. If data already exists in the model, only sequence data matching sequences already in the model is loaded. :param file_name: File to import """ model = global_view.current_model() model.get_status(STAGES.SEQUENCES_2).assert_import() model.user_comments.append("IMPORT_FASTA \"{}\"".format(file_name))
from mgraph import MGraph, MSplit, exporting from mhelper import Logger from typing import Dict, Optional from groot import Component, constants from groot.application import app from groot.constants import STAGES, EChanges from groot.data import INode, Model, Split, global_view from groot.utilities import lego_graph __LOG_SPLITS = Logger("nrfg.splits", False) @app.command(folder=constants.F_CREATE) def create_splits(): """ Creates the candidate splits. NRFG Stage I. Collects the splits present in the component trees. :remarks: -------------------------------------------------------------------------------------------------------------- | Some of our graphs may have contradictory information. | | To resolve this we perform a consensus. | | We define all the graphs by their splits, then see whether the splits are supported by the majority. | | | | A couple of implementation notes: | | 1. I've not used the most efficient algorithm, however this is fast enough for the purpose and it is much | | easier to explain what we're doing. For a fast algorithm see Jansson 2013, which runs in O(nk) time. |
from intermake import pr from mgraph import analysing from mhelper import ComponentFinder, Logger, LogicError, string_helper from typing import List from groot import constants from groot.application import app from groot.constants import STAGES, EChanges from groot.data import INode, Pregraph, Subset, global_view from groot.utilities import lego_graph LOG = Logger("pregraphs", False) @app.command(folder=constants.F_CREATE) def create_pregraphs(): """ Creates the pregraphs. Requisites: `create_subsets` """ model = global_view.current_model() # Special case - if no subsets just stop now if model.get_status(STAGES.PREGRAPHS_13).is_complete and len( model.subsets) == 0: pr.printx("<verbose>No subsets - nothing to do.</verbose>") return model.get_status(STAGES.PREGRAPHS_13).assert_create()
from mhelper import Logger, LogicError, ansi_helper, string_helper from typing import Set from groot import constants from groot.application import app from groot.constants import STAGES, EChanges from groot.data import Split, global_view __LOG_EVIDENCE = Logger("nrfg.evidence", False) @app.command(folder=constants.F_CREATE) def create_consensus(cutoff: float = 0.5) -> EChanges: """ Filters the candidate splits. NRFG PHASE II. Collect consensus evidence. :remarks: ---------------------------------------------------------------------------------------------------- | The second stage of the consensus. | | We collect evidence from the graphs to support or reject our splits. | | Unlike a normal majority rule consensus, there's no guarantee that our splits are in the graphs, | | so, in addition to support/reject evidence, we have a third category, whereby the graph neither | | supports nor rejects a split. | ---------------------------------------------------------------------------------------------------- :param cutoff: Cutoff to be used in the consensus """
from mgraph import MGraph from mhelper import Logger, array_helper, string_helper from groot import constants from groot.application import app from groot.constants import STAGES, EChanges from groot.data import FusionGraph, Formation, global_view from groot.utilities import lego_graph __LOG = Logger("nrfg.sew", False) @app.command(folder=constants.F_CREATE) def create_fused(): """ Creates the NRFG (uncleaned). Sews the subgraphs back together at the fusion points. Requisites: `create_subgraphs` """ __LOG.pause("▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒ SEW ▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒") model = global_view.current_model() model.get_status(STAGES.FUSE_15).assert_create() # There is a special case where there is no fusions if len(model.fusions) == 0 and len(model.components) == 1: model.fusion_graph_unclean = FusionGraph( model.components[0].tree.copy(), False) return
from mgraph import MGraph, MNode, analysing
from mhelper import Logger, LoopDetector, SwitchError
from groot.constants import STAGES, EChanges
from groot.data import EPosition, FusionGraph, Formation, Point, Gene, global_view
from groot.utilities import lego_graph
from groot import constants
from groot.application import app

LOG = Logger("clean", False)


@app.command(folder=constants.F_CREATE)
def create_cleaned():
    """
    Cleans the NRFG.

    Takes a copy of the graph held by the model's uncleaned fusion graph,
    runs the cleaning passes over that copy in a fixed order, and stores the
    result on the model as `fusion_graph_clean`. The uncleaned graph itself
    is not passed to any of the cleaning passes.

    Requisites: `create_fused`

    :return: `EChanges.MODEL_DATA`
    """
    model = global_view.current_model()

    # Asks the stage's status object to assert that creation is permitted
    # before anything is read from the model.
    stage_status = model.get_status(STAGES.CLEAN_16)
    stage_status.assert_create()

    working_graph = model.fusion_graph_unclean.graph.copy()

    # Order matters: every pass receives the same graph object, one after the other.
    cleaning_passes = (
        __remove_redundant_fusions,
        __remove_redundant_clades,
        __make_fusions_rootlets,
        __make_outgroup_parents_roots,
    )

    for cleaning_pass in cleaning_passes:
        cleaning_pass(working_graph)

    # NOTE(review): the second argument is presumably the "is clean" flag - confirm against `FusionGraph`.
    model.fusion_graph_clean = FusionGraph(working_graph, True)

    return EChanges.MODEL_DATA
from intermake import pr from mgraph import MEdge, MGraph, MNode from mhelper import Logger, array_helper, string_helper import itertools from groot import constants from groot.constants import EChanges from groot.data.model_collections import FusionCollection from groot.application import app from groot.data import INode, Component, Fusion, Model, Point, Gene, global_view from groot.data.model_core import Formation from groot.utilities import lego_graph from groot.commands.workflow import s080_tree __LOG = Logger("fusion", False) __LOG_ISOLATION = Logger("isolation", False) @app.command(folder=constants.F_CREATE) def create_fusions() -> EChanges: """ Finds the fusion points in the model. i.e. Given the events (see `find_events`), find the exact points at which the fusion(s) occur. Requisites: `create_trees` """ model = global_view.current_model() model.get_status(constants.STAGES.FUSIONS_9).assert_create() r: List[Fusion] = []
from collections import defaultdict from typing import Dict, FrozenSet, List, Set, cast, Any from mhelper import Logger, string_helper from groot import constants from groot.application import app from groot.constants import STAGES, EChanges from groot.data import INode, Fusion, Point, Gene, Subset, global_view __LOG = Logger( "nrfg.find", False ) @app.command(folder = constants.F_DROP) def drop_subsets(): """ Removes data from the model. """ model = global_view.current_model() model.get_status( STAGES.SUBSETS_12 ).assert_drop() model.subsets = frozenset() return EChanges.COMP_DATA @app.command(folder = constants.F_CREATE) def create_subsets( no_super: bool = True ): """ Creates leaf subsets. Requisites: `create_consensus`
Components algorithms. The only one publicly exposed is `detect`, so start there. """ from typing import List, Optional from intermake import pr from mhelper import ComponentFinder, Logger, string_helper import warnings from groot.application import app from groot import constants from groot.constants import EChanges, STAGES from groot.data import Component, Edge, Gene, global_view LOG_MAJOR = Logger( "comp.major", False ) LOG_MAJOR_V = Logger( "comp.major.v", False ) LOG_GRAPH = Logger( "comp.graph", False ) @app.command( folder = constants.F_CREATE ) def create_major( tol: int = 0, debug: bool = False ) -> EChanges: """ Detects model components. First step of finding the components. We classify each component as a set of "major" genes. Components are defined as sets of genes that share a similarity path between them, where each edge between element 𝓧 and 𝓨 in that path: * Is sourced from no less than 𝓧's length, less the tolerance
from mgraph import MGraph, MNode
from mhelper import Logger, SwitchError, string_helper, FunctionInspector
from typing import Callable, Iterable, Sequence, Union
from groot import constants
from groot.application import app
from groot.constants import STAGES, EChanges
from groot.data import INamedGraph, Formation, Point, Pregraph, Subset, Subgraph, global_view, Gene
from groot.utilities import AlgorithmCollection, lego_graph, external_runner

# Module-level logger named "possible_graphs".
# NOTE(review): the `False` argument presumably leaves the logger disabled by default - confirm against `mhelper.Logger`.
LOG = Logger("possible_graphs", False)

# Signature every supertree algorithm must satisfy: a callable taking ONE argument
# (either a `str` or a `Subset`) and returning either a `str` or an `MGraph`.
# The string literal that follows is the attribute docstring for this alias; it is
# left exactly as found.
# NOTE(review): that text calls the non-string input `LegoSubset`, whereas the alias
# is declared with `Subset` - presumably an older name for the same type; confirm.
DAlgorithm = Callable[[Union[str, Subset]], Union[str, MGraph]]
""" Task: A supertree consensus is required whereby the set of taxa on the inputs may not be the same. Input (ONE OF): str (default): newick trees LegoSubset: the gene subset in question Output (ONE OF): str: A newick tree MGraph: The tree Uses Pep-484 to indicate which input is required, otherwise the default will be assumed. """

# Collection of `DAlgorithm` implementations, labelled "Supertree".
supertree_algorithms = AlgorithmCollection(DAlgorithm, "Supertree")
BLAST is the default algorithm and this invocation can be found in the `groot_ex` project. """ from intermake import pr from typing import Callable, List, Optional from mhelper import EFileMode, isFilename, Logger import re from groot.commands.workflow.s020_sequences import _make_gene from groot.application import app from groot import Edge, constants from groot.constants import EXT_BLAST, STAGES, EChanges from groot.data import Model, Domain, global_view from groot.utilities import AlgorithmCollection, external_runner LOG = Logger("import/blast") DAlgorithm = Callable[[str], str] """ Task: A similarity of FASTA sequences. Input: str (default): FASTA sequences for two or more genes Output: str: A similarity matrix in BLAST format 6 TSV. """ similarity_algorithms = AlgorithmCollection(DAlgorithm, "Similarity")
from groot import supertree_algorithms, Subset, Gene
from mgraph import importing, MGraph
from mhelper import file_helper, Logger, LogicError, exception_helper
from intermake import subprocess_helper

__LOG_CREATE = Logger("supertree")


@supertree_algorithms.register("clann")
def supertree_clann(inputs: str) -> str:
    """
    Uses CLANN to generate a supertree.

    The input trees are written to `in_file.nwk`, the `clann` executable is
    started with a short command script supplied as its standard input, and
    the contents of `out_file.nwk` are read back afterwards.

    :param inputs: Input trees in Newick format.
    :return:       The consensus supertree in Newick format: the text of
                   `out_file.nwk` up to, but not including, the first `;`
                   (the whole text if it contains no `;`).
    """
    # NOTE(review): this literal is reproduced exactly as found; if line breaks
    # between the commands were lost when this file was flattened, restore them here.
    clann_commands = """ execute in_file.nwk; hs savetrees=out_file.nwk; quit """

    file_helper.write_all_text("in_file.nwk", inputs)
    subprocess_helper.run_subprocess(["clann"], stdin=clann_commands)

    saved_trees = file_helper.read_all_text("out_file.nwk")

    # Keep only what precedes the first tree terminator.
    first_tree, _, _ = saved_trees.partition(";")
    return first_tree
The only one publicly exposed is `detect`, so start there. """ from collections import defaultdict from typing import Dict, Optional, Set, Tuple, List from intermake import pr from mhelper import Logger, array_helper, string_helper import warnings from groot import constants from groot.application import app from groot.constants import STAGES, EChanges from groot.data import Component, Edge, Model, Gene, Domain, global_view LOG_MINOR = Logger("comp.minor", False) @app.command(folder=constants.F_CREATE) def create_minor(tol: int) -> EChanges: """ Finds the subsequence components, here termed the "minor" elements. Clause 1: Subsequences belong to the component of the sequence in which they reside. Clause 2: When one sequence of a component possesses an edge to a sequence of another component (an "entry"). Subsequences of all sequences in that second component receive the first component, at the position of the entry. Requisites: `create_major`