Exemple #1
0
    def describe_filter(self):
        """
        Builds a human-readable description of the active filter.

        Every populated field of `self.filter` contributes one clause and the
        clauses are joined with " ∧ ".

        :return: The joined clauses, or "No filter" when no field of the filter is set.
        """
        active = self.filter

        # (template, attribute name) pairs, in the order the clauses are emitted.
        array_clauses = (
            ("(ᴄᴏᴍᴩ, ꜰᴏʀ) ⊇ {{{0}}}", "components"),
            ("ᴀʟʟ ⊇ {{{0}}}", "genes"),
            ("ᴀʟʟ ⊇ {{{0}}}", "fusions"),
            ("ꜱᴩʟɪᴛ ∈ {{{0}}}", "splits"),
        )

        clauses = [
            template.format(
                string_helper.format_array(getattr(active, attribute)))
            for template, attribute in array_clauses
            if getattr(active, attribute)
        ]

        # The free-text filter is not an array, so it is formatted directly.
        if active.text:
            clauses.append('"{0}" ∈ ꜱᴩʟɪᴛ'.format(active.text))

        return " ∧ ".join(clauses) if clauses else "No filter"
Exemple #2
0
 def to_details(self):
     """
     Produces a multi-line, human-readable summary of this object.

     One line each is emitted for the major genes, minor genes, minor
     domains, incoming components and outgoing components.

     :return: The lines joined with newlines.
     """
     fmt = string_helper.format_array
     lines = [
         "MAJOR-SE: {}".format(fmt(self.major_genes, sort=True)),
         "MINOR-SE: {}".format(fmt(self.minor_genes, sort=True)),
         "MINOR-SS: {}".format(fmt(self.minor_domains)),
         "INCOMING: {}".format(fmt(self.incoming_components(), sort=True)),
         "OUTGOING: {}".format(fmt(self.outgoing_components(), sort=True)),
     ]
     return "\n".join(lines)
Exemple #3
0
def create_supertrees(algorithm: supertree_algorithms.Algorithm) -> EChanges:
    """
    Creates the supertrees/subgraphs for the model.

    Requisites: `create_pregraphs`

    :param algorithm:   Algorithm to use, see `algorithm_help`.
    :return:            `EChanges.MODEL_DATA`. The subgraphs, and the UIDs of their source and
                        destination nodes, are saved into the model.
    :raises ValueError: If a generated subgraph contains no fusion formation nodes.
    """

    # Check we're ready to go
    model = global_view.current_model()
    model.get_status(STAGES.SUPERTREES_14).assert_create()

    # Create one subgraph per subset
    subgraphs = [(subset, __create_supertree(algorithm, subset))
                 for subset in model.subsets]

    # Collect the sources and destinations
    destinations = set()
    sources = set()

    for subset, subgraph in subgraphs:
        sequences = lego_graph.get_sequence_data(subgraph)
        ffn = lego_graph.get_fusion_formation_nodes(subgraph)

        if not ffn:
            raise ValueError(
                "The subgraph («{}») of the subset «{}» («{}») doesn't appear to have any fusion point nodes. Refusing to continue because this means the subgraph's position in the NRFG is unavailable."
                .format(string_helper.format_array(subgraph.nodes), subset,
                        string_helper.format_array(subset.contents)))

        for node in ffn:  # type:MNode
            formation: Formation = node.data

            # A formation whose pertinent inner set overlaps this subgraph's sequence data
            # is recorded as a destination, any other formation as a source.
            if any(x in sequences for x in formation.pertinent_inner):
                destinations.add(node.uid)
            else:
                sources.add(node.uid)

    model.subgraphs_destinations = tuple(destinations)
    model.subgraphs_sources = tuple(sources)
    model.subgraphs = tuple(
        Subgraph(subgraph, subset, repr(algorithm))
        for subset, subgraph in subgraphs)

    return EChanges.MODEL_DATA
Exemple #4
0
def __subset_to_possible_graphs(subset: Subset):
    """
    Builds the candidate graphs for a subset, one per component whose extracted graph is non-empty.

    :remarks:
    Simply lifting the genes out of the component trees is not enough, because:

    1.  Distinct fusion points in the resulting graphs may really be the same fusion, which
        causes problems in the supertree stage. Fusion points that belong to the subset
        therefore have their data replaced by their formation.
    2.  The "intermediaries" (the clades that hold the subset together as a coherent graph)
        occasionally include a fusion node, which also causes problems in the supertree stage.
        Such nodes have their data cleared, substituting them for clades.

    The result is stored in `subset.pregraphs`.

    :param subset:          Subset to convert.
    :raises ValueError:     If a graph holds a node that is not a clade, not a fusion point, and
                            not a sequence node belonging to the subset.
    :raises LogicError:     If a graph has more than one root.
    """
    pregraphs: List[Pregraph] = []

    LOG("{} :::: {}", subset, subset.contents)

    def in_subset(candidate):
        # Predicate handed to `get_intermediaries`: does this node carry subset data?
        return candidate.data in subset.contents

    for component in subset.model.components:
        kept_nodes = analysing.get_intermediaries(component.tree, in_subset)

        LOG("{} :::: {}", subset, component)
        graph = component.tree.copy(nodes=kept_nodes)

        # Normalise or validate every node of the copy
        for node in graph:
            LOG("{} :::: {}", subset, node)

            if lego_graph.is_clade(node):
                continue

            if lego_graph.is_fusion_point(node):
                if node.data in subset.contents:
                    node.data = node.data.formation
                else:
                    # Substitute for clade
                    print("SUB {}".format(node))
                    node.data = None
                continue

            if lego_graph.is_sequence_node(node) and node.data in subset.contents:
                continue

            raise ValueError(
                "Subset graph contains the node «{}», which does not appear in the actual subset «{}»."
                .format(node, subset))

        root_count = sum(1 for _ in graph.nodes.roots)

        if root_count > 1:
            raise LogicError("Graph of subset has multiple roots: {}".format(
                string_helper.format_array(graph.nodes.roots)))

        if graph.nodes:
            pregraphs.append(Pregraph(graph, subset, component))

    subset.pregraphs = tuple(pregraphs)
Exemple #5
0
def print_subsets() -> EChanges:
    """
    Prints the NRFG subsets of the current model, ordered by their string form.

    :return: `EChanges.INFORMATION`
    """
    ordered = sorted(global_view.current_model().subsets, key=cast(Any, str))

    for subset in ordered:
        assert isinstance(subset, Subset)
        members = string_helper.format_array(subset.contents, sort=True, autorange=True)
        print("{} - {} elements: {}".format(subset, len(subset), members))

    return EChanges.INFORMATION
Exemple #6
0
def __list_comp(comparison, name, html, calculated, original, ini: TIniData):
    """
    Appends an HTML table listing every quartet of `comparison` and records the same values
    in a new section of `ini`.

    :param comparison:  Iterable of quartets to list.
    :param name:        Table title. Its lower-case form is the key of the new `ini` section.
    :param html:        List of HTML fragments, appended to in place.
    :param calculated:  Lookup from unsorted quartet key to the calculated value.
    :param original:    Lookup from unsorted quartet key to the original value.
    :param ini:         INI data which receives the new section.
    """
    html.append('<table border=1 style="border-collapse: collapse;">')
    html.append("<tr><td colspan=6><b>{} QUARTETS</b></td></tr>".format(name))

    section: TIniSection = {}
    ini[name.lower()] = section

    row_template = "<tr><td>{}</td><td>{}</td><td>{}</td><td>{}</td><td>{}</td><td>{}</td></tr>"

    for quartet in comparison:
        key = quartet.get_unsorted_key()
        calc_value = calculated[key]
        orig_value = original[key]

        html.append(row_template.format(*key, calc_value, orig_value))

        # Both INI entries share the same formatted key as their prefix
        label = string_helper.format_array(key)
        section["{}_calculated".format(label)] = str(calc_value)
        section["{}_original".format(label)] = str(orig_value)

    html.append("</table><br/>")
Exemple #7
0
def print_fusions() -> EChanges:
    """
    Prints the fusion events of the current model, each followed by the details of its points.

    :return: `EChanges.INFORMATION`
    """
    fmt = string_helper.format_array
    lines: List[str] = []

    for event in global_view.current_model().fusions:
        # Event header
        lines.extend((
            "- name               {}".format(event),
            "  components in      {}".format(event.components_in),
            "  component out      {}".format(event.component_out),
            "  index              {}".format(event.index),
            "  points             {}".format(fmt(event.points)),
        ))

        # One indented record per point, each terminated by a blank line
        for point in event.points:
            lines.extend((
                "     -   name               {}".format(point),
                "         point_component    {}".format(point.point_component),
                "         count              {}".format(point.count),
                "         outer sequences    {}".format(fmt(point.outer_sequences)),
                "         pertinent inner    {}".format(fmt(point.pertinent_inner)),
                "         pertinent outer    {}".format(fmt(point.pertinent_outer)),
                "         sequences          {}".format(fmt(point.genes)),
                "",
            ))

        lines.append("")

    print("\n".join(lines))

    return EChanges.INFORMATION
Exemple #8
0
    def on_refresh_data(self):
        """
        Repopulates the main tree widget from the model's splits, then updates the title and
        filter labels.

        Splits rejected by `check_filter` are counted but not displayed.
        """
        tree = self.ui.LST_MAIN
        tree.clear()

        model = self.get_model()
        shown = 0
        hidden = 0

        # Column title paired with an accessor for the array shown in that column.
        # Accessors are evaluated lazily, per item, in column order.
        columns = (
            ("Inside", lambda s: s.split.inside),
            ("Outside", lambda s: s.split.outside),
            ("Components", lambda s: s.components),
            ("For", lambda s: s.evidence_for),
            ("Against", lambda s: s.evidence_against),
            ("Unused", lambda s: s.evidence_unused),
        )

        for split in model.splits or ():
            if not self.check_filter(split):
                hidden += 1
                continue

            shown += 1

            assert isinstance(split, Split)
            item = QTreeWidgetItem()

            for title, values_of in columns:
                column = qt.tree_helper.get_or_create_column(tree, title)
                item.setText(column,
                             string_helper.format_array(values_of(split)))

            qt.tree_helper.set_data(item, split)
            tree.addTopLevelItem(item)

        if hidden:
            title_text = "{} splits, {} rejected due to filter".format(
                shown, hidden)
        else:
            title_text = "{} splits".format(shown)

        self.ui.LBL_TITLE.setText(title_text)
        self.ui.LBL_FILTER.setText(self.describe_filter())
Exemple #9
0
def create_consensus(cutoff: float = 0.5) -> EChanges:
    """
    Filters the candidate splits (NRFG phase II), collecting consensus evidence.

    :remarks:
    This is the second stage of the consensus: evidence to support or reject each split is
    collected. Unlike a normal majority rule consensus, there's no guarantee that a split is
    present everywhere, so besides "for" and "against" there is a third category for
    components that neither support nor reject the split.

    For every non-empty split, each component of the model is classified as:

    * for      - one of its splits evidences the split (`is_evidenced_by` gives `True`)
    * against  - none does, but at least one gives `False`
    * unused   - otherwise

    The three sets are stored on the split. A split is accepted when `for / (for + against)`
    is strictly greater than `cutoff`, and the accepted splits are written to `model.consensus`.

    :param cutoff:          Cutoff to be used in the consensus
    :return:                `EChanges.MODEL_DATA`
    :raises LogicError:     If a non-empty split is not evidenced by any component.
    """
    model = global_view.current_model()
    __LOG_EVIDENCE.pause("▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒ EVIDENCE ▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒")

    model.get_status(STAGES.CONSENSUS_11).assert_create()

    __LOG_EVIDENCE("BEGIN EVIDENCE ({} splits)".format(len(model.splits)))
    accepted: Set[Split] = set()

    for split in model.splits:
        assert isinstance(split, Split), split

        if split.split.is_empty:
            __LOG_EVIDENCE("SPLIT IS EMPTY: {}".format(split))
            continue

        supporting = set()
        opposing = set()
        neutral = set()

        for component in model.components:
            # Tri-state verdict: True (for), False (against), None (no opinion)
            verdict = None

            for candidate in component.splits:
                outcome = split.is_evidenced_by(candidate)

                if outcome is True:
                    # One supporting split settles it for this component
                    verdict = True
                    break

                if outcome is False:
                    verdict = False

            if verdict is True:
                bucket = supporting
            elif verdict is False:
                bucket = opposing
            else:
                bucket = neutral

            bucket.add(component)

        n_for = len(supporting)
        n_against = len(opposing)
        n_unused = len(neutral)

        if not supporting:
            raise LogicError(
                "There is no evidence for (F{} A{} U{}) this split «{}», but the split must have come from somewhere."
                .format(n_for, n_against, n_unused, split))

        frequency: float = n_for / (n_for + n_against)
        is_accepted: bool = frequency > cutoff

        split.evidence_for = frozenset(supporting)
        split.evidence_against = frozenset(opposing)
        split.evidence_unused = frozenset(neutral)

        __LOG_EVIDENCE(
            "{} {} = {}% -- FOR: ({}) {}, AGAINST: ({}) {}, UNUSED: ({}) {}",
            "✔" if is_accepted else "✘",
            ansi_helper.ljust(str(split), 80),
            int(frequency * 100),
            n_for,
            string_helper.format_array(supporting, sort=True),
            n_against,
            string_helper.format_array(opposing, sort=True),
            n_unused,
            string_helper.format_array(neutral, sort=True))

        if is_accepted:
            accepted.add(split)

    model.consensus = frozenset(accepted)

    return EChanges.MODEL_DATA
Exemple #10
0
def __format_elements(y):
    """
    Formats `y` via `string_helper.format_array` using `join=","`, `sort=True` and `autorange=True`.

    :param y:   Elements to format.
    :return:    The formatted result.
    """
    options = dict(join=",", sort=True, autorange=True)
    return string_helper.format_array(y, **options)
Exemple #11
0
 def get_details(self):
     """
     Describes this object by listing its contents.

     :return: `self.contents` as formatted by `string_helper.format_array`.
     """
     details = string_helper.format_array(self.contents)
     return details
Exemple #12
0
def compare_graphs(calc_graph_: INamedGraph,
                   orig_graph_: INamedGraph) -> Report:
    """
    Compares graphs using quartets.

    :param calc_graph_: The model graph. Data is `ILeaf` or `None`.
    :param orig_graph_: The source graph. Data is `str`.
    :return:  A `Report` object with an `TIniData` as its `raw_data`.
              The report body is the newline-joined sequence of HTML tables built here.
    :raises ValueError: If the calculated graph does not have exactly one connected component,
                        if either graph contains no genes, or if the two gene sets differ.
    """
    calc_graph = calc_graph_.graph
    orig_graph = orig_graph_.graph

    ccs = analysing.find_connected_components(calc_graph)
    if len(ccs) != 1:
        raise ValueError(
            "The graph has more than 1 connected component ({}).".format(
                len(ccs)))

    calc_genes: Set[object] = set(
        x.data for x in analysing.realise_node_predicate_as_set(
            calc_graph, lego_graph.is_sequence_node))
    orig_genes: Set[object] = set(
        x.data for x in analysing.realise_node_predicate_as_set(
            orig_graph, lego_graph.is_sequence_node))

    if not calc_genes:
        raise ValueError("The calculated graph contains no genes.")

    if not orig_genes:
        raise ValueError("The original graph contains no genes.")

    if calc_genes != orig_genes:

        def typed_name(x: object) -> str:
            # Include the type name so that values which print alike but differ in type
            # can be told apart in the message.
            return "{}:{}".format(type(x).__name__, x)

        raise ValueError(
            "The calculated graph has a different gene set to the original. Missing: {}; additional: {}."
            .format(
                string_helper.format_array(orig_genes - calc_genes,
                                           sort=True,
                                           format=typed_name),
                string_helper.format_array(calc_genes - orig_genes,
                                           sort=True,
                                           format=typed_name)))

    calc_quartets = __get_quartets_with_progress(calc_graph, "calculated")
    orig_quartets = __get_quartets_with_progress(orig_graph, "original")
    comparison: QuartetComparison = calc_quartets.compare(orig_quartets)

    html = []
    ini_data: TIniData = {}

    # QUARTETS
    html.append('<table border=1 style="border-collapse: collapse;">')
    html.append("<tr><td colspan=2><b>QUARTETS</b></td></tr>")
    ini_data["quartets"] = q = {}
    __add_row(html, q, "total_quartets", len(comparison))
    __add_row(
        html, q, "match_quartets",
        string_helper.percent(len(comparison.match), len(comparison.all)))
    __add_row(
        html, q, "mismatch_quartets",
        string_helper.percent(len(comparison.mismatch), len(comparison.all)))
    __add_row(
        html, q, "new_quartets",
        string_helper.percent(len(comparison.missing_in_left),
                              len(comparison.all)))
    __add_row(
        html, q, "missing_quartets",
        string_helper.percent(len(comparison.missing_in_right),
                              len(comparison.all)))

    # GENE COMBINATIONS
    __enumerate_2genes(calc_genes, comparison, html, 1, ini_data)
    __enumerate_2genes(calc_genes, comparison, html, 2, ini_data)
    __enumerate_2genes(calc_genes, comparison, html, 3, ini_data)

    # FULL LISTINGS (the "missing" tables are only emitted when non-empty)
    c = calc_quartets.get_unsorted_lookup()
    o = orig_quartets.get_unsorted_lookup()
    __list_comp(comparison.match, "MATCHING", html, c, o, ini_data)
    __list_comp(comparison.mismatch, "MISMATCH", html, c, o, ini_data)
    if comparison.missing_in_left:
        __list_comp(comparison.missing_in_left, "MISSING IN LEFT", html, c, o,
                    ini_data)
    if comparison.missing_in_right:
        __list_comp(comparison.missing_in_right, "MISSING IN RIGHT", html, c,
                    o, ini_data)

    report = Report("{} -vs- {}".format(orig_graph_, calc_graph_),
                    "\n".join(html))
    report.raw_data = ini_data
    return report
Exemple #13
0
def __enumerate_2genes(calc_seq: Set[object], comparison: QuartetComparison,
                       html: List[str], n: int, ini_data: TIniData) -> None:
    """
    Appends a per-combination breakdown of the quartet comparison to `html` and `ini_data`.

    For every combination of `n` elements of `calc_seq`, the quartets whose key contains all
    elements of the combination are tallied by outcome (hit, miss, missing in left, missing in
    right). Combinations with no miss and nothing missing are omitted.

    Nothing is emitted when there are more than 100 combinations.

    :param calc_seq:    Elements to take combinations of.
    :param comparison:  Quartet comparison to break down.
    :param html:        List of HTML fragments, appended to in place.
    :param n:           Size of the combinations.
    :param ini_data:    INI data, receives a new section named `n_quartets_<n>`.
    :raises SwitchError: If a quartet of `comparison.all` is in none of the four outcome sets.
    """
    if array_helper.get_num_combinations(calc_seq, n) > 100:
        return

    html.append('<table border=1 style="border-collapse: collapse;">')
    html.append(
        "<tr><td colspan=5><b>BREAKDOWN FOR COMBINATIONS OF {}</b></td></tr>".
        format(n))
    html.append(
        "<tr><td>total</td><td>hit</td><td>miss</td><td>missing in left</td><td>missing in right</td></tr>"
    )
    ini_sect: TIniSection = {}
    ini_data["n_quartets_{}".format(n)] = ini_sect

    for comb in sorted(itertools.combinations(calc_seq, n),
                       key=cast(Callable, str)):  # type: Iterable[object]
        n_tot = []
        n_hit = []
        n_mis = []
        n_mil = []
        n_mir = []

        for quartet in comparison.all:
            assert isinstance(quartet, AbstractQuartet)

            key = quartet.get_unsorted_key()

            if all(x in key for x in comb):
                n_tot.append(quartet)

                if quartet in comparison.match:
                    n_hit.append(quartet)
                elif quartet in comparison.mismatch:
                    n_mis.append(quartet)
                elif quartet in comparison.missing_in_left:
                    n_mil.append(quartet)
                elif quartet in comparison.missing_in_right:
                    n_mir.append(quartet)
                else:
                    raise SwitchError("quartet(in)", quartet)

        # Only combinations with at least one problem are reported
        if not n_mis and not n_mil and not n_mir:
            continue

        html.append("<tr>")
        i = []

        # COMBINATION NAME
        name = string_helper.format_array(comb)
        html.append("<td>{}</td>".format(name))
        # HIT
        txt = string_helper.percent(len(n_hit), len(n_tot)) if n_hit else ""
        html.append("<td>{}</td>".format(txt))
        i.append(txt)
        # MISS
        txt = string_helper.percent(len(n_mis), len(n_tot)) if n_mis else ""
        html.append("<td>{}</td>".format(txt))
        i.append(txt)
        # MISSING IN LEFT
        txt = string_helper.percent(len(n_mil), len(n_tot)) if n_mil else ""
        html.append("<td>{}</td>".format(txt))
        i.append(txt)
        # MISSING IN RIGHT
        # (guarded by its own list: the cell is blank only if nothing is missing in right)
        txt = string_helper.percent(len(n_mir), len(n_tot)) if n_mir else ""
        html.append("<td>{}</td>".format(txt))
        i.append(txt)

        html.append("</tr>")
        ini_sect[name] = "; ".join(str(x) for x in i)

        # Write out the mismatching quartets in full, but only when the misses outnumber the
        # hits and there are fewer than 10 of them
        i = []

        if len(n_hit) < len(n_mis) < 10:
            for quartet in n_mis:
                html.append("<tr>")
                html.append("<td></td>")
                html.append("<td colspan=4>{}</td>".format(quartet))
                html.append("</tr>")
                i.append(quartet)

        ini_sect[name + "_list"] = "; ".join(str(x) for x in i)

    html.append("</table><br/>")
Exemple #14
0
def create_major( tol: int = 0, debug: bool = False ) -> EChanges:
    """
    Detects model components.
    
    First step of finding the components.
    
    We classify each component as a set of "major" genes.
    
    Components are defined as sets of genes that share a similarity path between them, where each edge between element 𝓧 and 𝓨 in that path satisfies all of:
        * The length of the edge's left side differs from its gene's length by no more than the tolerance
        * The length of the edge's right side differs from its gene's length by no more than the tolerance
        * The lengths of the two genes differ by no more than the tolerance
        
    Genes that end up in no component each receive a component of their own.
        
    We'll grab the minor domains that this component extends into in the next step.
    
    Requisites: Sequence similarity (BLAST data) must have been loaded 
    
    :param debug:       Enables extra assertions: a `ValueError` is raised if an accepted edge joins
                        genes whose accessions start with different characters, or if a gene has
                        no accepted edges.
    :param tol:         Tolerance value
    :returns:           `EChanges.COMPONENTS`. The components are written to :ivar:`model.components`.
    :raises ValueError: If any gene has no edges at all, or if a `debug` assertion fails.
    """
    model = global_view.current_model()
    model.get_status( STAGES.MAJOR_4 ).assert_create()
    
    # Start from a clean slate
    model.components.clear()
    
    # Find connected components
    components = ComponentFinder()
    
    # Basic assertions
    LOG_MAJOR( "There are {} sequences.", len( model.genes ) )
    missing_edges = []
    
    for sequence in model.genes:
        edges = model.edges.find_gene( sequence )
        
        if not edges:
            missing_edges.append( sequence )
    
    # All offending genes are gathered first so that one error reports every one of them
    if missing_edges:
        raise ValueError( "Refusing to detect components because some sequences have no edges: «{}»".format( string_helper.format_array( missing_edges ) ) )
    
    # Iterate sequences
    for sequence_alpha in model.genes:
        assert isinstance( sequence_alpha, Gene )
        
        alpha_edges = model.edges.find_gene( sequence_alpha )
        any_accept = False
        
        LOG_MAJOR( "Sequence {} contains {} edges.", sequence_alpha, len( alpha_edges ) )
        
        for edge in alpha_edges:
            assert isinstance( edge, Edge )
            # How far each side of the edge falls short of covering its whole gene,
            # and how much the two genes differ in length
            source_difference = abs( edge.left.length - edge.left.gene.length )
            destination_difference = abs( edge.right.length - edge.right.gene.length )
            total_difference = abs( edge.left.gene.length - edge.right.gene.length )
            
            LOG_MAJOR_V( "{}", edge )
            LOG_MAJOR_V( "-- Source difference ({})", source_difference )
            LOG_MAJOR_V( "-- Destination difference ({})", destination_difference )
            LOG_MAJOR_V( "-- Total difference ({})", total_difference )
            
            # An edge is rejected as soon as any one of the three differences exceeds the tolerance
            if source_difference > tol:
                LOG_MAJOR_V( "-- ==> REJECTED (SOURCE)" )
                continue
            elif destination_difference > tol:
                LOG_MAJOR_V( "-- ==> REJECTED (DEST)" )
                continue
            elif total_difference > tol:
                LOG_MAJOR_V( "-- ==> REJECTED (TOTAL)" )
                continue
            else:
                LOG_MAJOR_V( "-- ==> ACCEPTED" )
            
            # NOTE(review): presumably the debug data encodes the expected component in the first
            # character of the accession - confirm against the test data
            if debug and edge.left.gene.accession[0] != edge.right.gene.accession[0]:
                raise ValueError( "Debug assertion failed. This edge not rejected: {}".format( edge ) )
            
            any_accept = True
            beta = edge.opposite( sequence_alpha ).gene
            LOG_MAJOR( "-- {:<40} LINKS {:<5} AND {:<5}", edge, sequence_alpha, beta )
            components.join( sequence_alpha, beta )
        
        if debug and not any_accept:
            raise ValueError( "Debug assertion failed. This sequence has no good edges: {}".format( sequence_alpha ) )
    
    # Create the components!
    sequences_in_components = set()
    
    for index, sequence_list in enumerate( components.tabulate() ):
        model.components.add( Component( model, index, sequence_list ) )
        LOG_MAJOR( "COMPONENT MAJOR: {}", sequence_list )
        sequences_in_components.update( sequence_list )
    
    # Create components for orphans
    # (the next free index is the current number of components)
    for sequence in model.genes:
        if sequence not in sequences_in_components:
            LOG_MAJOR( "ORPHAN: {}", sequence )
            model.components.add( Component( model, len( model.components ), (sequence,) ) )
    
    # An assertion
    # (warn once, at the first single-gene component found)
    for component in model.components:
        assert isinstance( component, Component )
        if len( component.major_genes ) == 1:
            warnings.warn( "There are components with just one sequence in them. Maybe you meant to use a tolerance higher than {}?".format( tol ), UserWarning )
            break
    
    pr.printx( "<verbose>{} components detected.</verbose>".format( len( model.components ) ) )
    
    return EChanges.COMPONENTS
Exemple #15
0
def __create_supertree(algorithm: supertree_algorithms.Algorithm,
                       subset: Subset) -> MGraph:
    """
    Generates a supertree for a subset by running the given algorithm.

    :param algorithm:   Algorithm to use. See `algorithm_help`.
    :param subset:      Subset of genes from which the supertree is generated.
    :return:            The resulting graph. When the pregraphs are redundant this is the graph
                        of the first pregraph, returned as-is without running the algorithm
                        and without the checks below.
    :raises SwitchError: If the algorithm yields neither an `MGraph` nor a `str`.
    :raises ValueError:  If an element of the subset is missing from the result, or the result
                         holds a non-clade node that is not part of the subset.
    """

    def typed_name(x):
        # Used in the error messages to show each item together with its type name
        return "{}:{}".format(type(x).__name__, x)

    inspector = FunctionInspector(algorithm.function)

    # Two kinds of algorithm are allowed:
    # - Python algorithms, whose first argument is annotated `Subset` and which get the subset itself
    # - External algorithms, which get one newick-formatted line per pregraph of the subset
    if inspector.args[0].annotation == Subset:
        payload = subset
    else:
        newick_lines = __graphs_to_newick(subset.pregraphs)

        if __is_redundant(subset.pregraphs, newick_lines):
            return subset.pregraphs[0].graph

        payload = "\n".join(newick_lines) + "\n"

    # Run the algorithm!
    output = external_runner.run_in_temporary(algorithm, payload)

    # Two kinds of result are allowed: a ready-made `MGraph`, or a newick-formatted `str`
    if isinstance(output, MGraph):
        result = output
    elif isinstance(output, str):
        # The newick is not recladed: it's pointless at this stage and redundancies are removed
        # during the NRFG_CLEAN stage anyway
        result = lego_graph.import_newick(output, subset.model, reclade=False)
    else:
        raise SwitchError("create_supertree::output", output, instance=True)

    # Check 1: every element of the subset is in the supertree
    for element in subset.contents:
        if isinstance(element, Gene):
            present = element in result.nodes.data
        elif isinstance(element, Point):
            # Points are represented in the supertree by their formation
            present = element.formation in result.nodes.data
        else:
            present = False

        if present:
            continue

        raise ValueError(
            _MSG1.format(
                element,
                string_helper.format_array(result.nodes.data,
                                           format=typed_name,
                                           sort=True),
                type(element).__name__))

    # Check 2: every (non-clade) element of the supertree is in the subset
    for node in result.nodes:
        if lego_graph.is_clade(node):
            continue

        if lego_graph.is_formation(node) and any(
                x.formation is node.data
                for x in subset.contents
                if isinstance(x, Point)):
            continue

        if lego_graph.is_sequence_node(node) and node.data in subset.contents:
            continue

        raise ValueError(
            _MSG2.format(
                node.data,
                string_helper.format_array(subset.contents,
                                           format=typed_name,
                                           sort=True),
                type(node.data).__name__))

    return result
Exemple #16
0
def print_minor(component: Optional[Component] = None,
                verbose: bool = False) -> EChanges:
    """
    Prints the edges between the component subsequences.
    
    The output is a table with the columns:
    
        `source`      the component whose minor domains are being reported
        `destination` the component whose major genes those domains lie on
        `sequence`    accession of the major gene, or `AVG*<n>` for the row averaged over `n` major genes
        `seq-length`  length of the gene, or the average component length for the averaged row
        `start`       start of the first matching minor domain (averaged for the averaged row)
        `end`         end of the last matching minor domain (averaged for the averaged row)
        `edge-length` `end` minus `start`
        
    The per-gene rows are only produced when `component` is specified.
    An averaged row is only produced for a source/destination pair when every major gene of the
    destination has at least one matching minor domain in the source.

    :param component: Component to print.
                      If not specified prints a summary of all components.
    :param verbose:   Print all the things! (additionally prints a table of each component's
                      major genes and minor domains)
    :return:          `EChanges.INFORMATION`
    :raises ValueError: If the model has no components.
    """
    model = global_view.current_model()

    if not model.components:
        raise ValueError(
            "Cannot print components because components have not been calculated."
        )

    if verbose:
        rows = []

        rows.append(["component", "origins", "destinations"])

        for comp in model.components:
            assert isinstance(comp, Component)

            # When a component was requested, skip all the others
            if component is not None and component is not comp:
                continue

            major_genes = string_helper.format_array(comp.major_genes,
                                                     join="\n")
            minor_domains = string_helper.format_array(comp.minor_domains,
                                                       join="\n")

            rows.append([comp, major_genes, minor_domains])

        with pr.pr_section("all components"):
            pr.pr_table(rows)

    # NOTE(review): this tests truthiness, whereas the filters use `is not None` - the two
    # differ if a `Component` can be falsy; confirm
    if component:
        title = str(component)
    else:
        title = "all components"

    average_lengths = __get_average_component_lengths(model)

    rows = []
    rows.append([
        "source", "destination", "sequence", "seq-length", "start", "end",
        "edge-length"
    ])

    for comp in model.components:
        if component is not None and component is not comp:
            continue

        major_genes = list(comp.major_genes)

        for minor in model.components:
            if comp is minor:
                continue

            # Running sums over the major genes, converted into averages below
            start = 0
            end = 0
            failed = False

            for sequence in major_genes:
                # subsequences that are in major sequence is a major sequence of major and are a minor subsequence of minor
                subsequences = [
                    x for x in minor.minor_domains if x.sequence is sequence
                ]

                if subsequences:
                    # Span from the first matching domain's start to the last one's end
                    # (presumably `minor_domains` is ordered by position - TODO confirm)
                    start += subsequences[0].start
                    end += subsequences[-1].end

                    if component is not None:
                        rows.append([
                            minor, comp, sequence.accession, sequence.length,
                            subsequences[0].start, subsequences[-1].end,
                            subsequences[-1].end - subsequences[0].start
                        ])
                else:
                    # The loop is not exited early, so that the per-gene rows of the
                    # remaining genes are still emitted
                    failed = True

            # No averaged row unless every major gene had a matching domain
            if failed:
                continue

            # NOTE(review): raises ZeroDivisionError if `major_genes` is empty, since `failed`
            # is never set in that case - confirm that components always have a major gene
            start /= len(major_genes)
            end /= len(major_genes)

            rows.append([
                minor, comp, "AVG*{}".format(len(major_genes)),
                round(average_lengths[comp]),
                round(start),
                round(end),
                round(end - start)
            ])

    with pr.pr_section(title):
        pr.pr_table(rows)
    return EChanges.INFORMATION