def describe_filter(self):
    """
    Builds a human-readable summary of the active filter (`self.filter`).

    Every populated criterion contributes one clause and the clauses are
    joined with " ∧ ".

    :return: The summary text, or "No filter" when no criterion is set.
    """
    active = self.filter

    # (values, template) pairs, listed in display order.
    array_clauses = (
        (active.components, "(ᴄᴏᴍᴩ, ꜰᴏʀ) ⊇ {{{0}}}"),
        (active.genes, "ᴀʟʟ ⊇ {{{0}}}"),
        (active.fusions, "ᴀʟʟ ⊇ {{{0}}}"),
        (active.splits, "ꜱᴩʟɪᴛ ∈ {{{0}}}"),
    )

    clauses = [
        template.format(string_helper.format_array(values))
        for values, template in array_clauses
        if values
    ]

    if active.text:
        clauses.append('"{0}" ∈ ꜱᴩʟɪᴛ'.format(active.text))

    return " ∧ ".join(clauses) if clauses else "No filter"
def to_details(self):
    """
    Describes this object as a block of labelled lines.

    One line each is produced for the major genes, minor genes, minor
    domains, incoming components and outgoing components. All lists except
    the minor domains are formatted with `sort=True`.

    :return: The lines, joined with newlines.
    """
    fmt = string_helper.format_array
    lines = [
        "MAJOR-SE: {}".format(fmt(self.major_genes, sort=True)),
        "MINOR-SE: {}".format(fmt(self.minor_genes, sort=True)),
        "MINOR-SS: {}".format(fmt(self.minor_domains)),
        "INCOMING: {}".format(fmt(self.incoming_components(), sort=True)),
        "OUTGOING: {}".format(fmt(self.outgoing_components(), sort=True)),
    ]
    return "\n".join(lines)
def create_supertrees(algorithm: supertree_algorithms.Algorithm) -> EChanges:
    """
    Creates the supertrees/subgraphs for the model.

    Requisites: `create_pregraphs`

    :param algorithm: Algorithm to use, see `algorithm_help`.
    :return: `EChanges.MODEL_DATA`. The results themselves are saved into
             the model (`subgraphs`, `subgraphs_sources` and
             `subgraphs_destinations`).
    :raises ValueError: If a generated subgraph contains no fusion formation
                        nodes.
    """
    # Check we're ready to go
    model = global_view.current_model()
    model.get_status(STAGES.SUPERTREES_14).assert_create()

    # Create the subgraphs
    subgraphs = []

    for subset in model.subsets:
        subgraph = __create_supertree(algorithm, subset)
        subgraphs.append((subset, subgraph))

    # Collect the sources and destinations
    destinations = set()
    sources = set()

    for subset, subgraph in subgraphs:
        sequences = lego_graph.get_sequence_data(subgraph)
        ffn = lego_graph.get_fusion_formation_nodes(subgraph)

        if not ffn:
            raise ValueError(
                "The subgraph («{}») of the subset «{}» («{}») doesn't appear to have any fusion point nodes. Refusing to continue because this means the subgraph's position in the NRFG is unavailable."
                .format(string_helper.format_array(subgraph.nodes),
                        subset,
                        string_helper.format_array(subset.contents)))

        for node in ffn:  # type:MNode
            formation: Formation = node.data

            # A formation node is a destination when any of its pertinent
            # inner elements is present in this subgraph's sequence data;
            # otherwise it is a source.
            if any(x in sequences for x in formation.pertinent_inner):
                destinations.add(node.uid)
            else:
                sources.add(node.uid)

    model.subgraphs_destinations = tuple(destinations)
    model.subgraphs_sources = tuple(sources)
    model.subgraphs = tuple(Subgraph(subgraph, subset, repr(algorithm))
                            for subset, subgraph in subgraphs)

    return EChanges.MODEL_DATA
def __subset_to_possible_graphs(subset: Subset):
    """
    Converts a subset of genes into the possible graphs representing these
    genes (one graph per component). The result is stored, as a tuple of
    `Pregraph`, in `subset.pregraphs`.

    :remarks:
    Pulling the genes out of the component trees is not enough on its own:

    1. The resulting graphs may contain distinct fusion points which are, in
       fact, the same. This causes problems in the supertree stage, so
       fusion points belonging to the subset are replaced by their
       formation.

    2. The "intermediaries" (the clades holding the gene subset together as
       a coherent graph) are normally plain clades, but occasionally a
       fusion node is pulled in. This also causes problems at the supertree
       stage, so such nodes are swapped out for clades (their data is set
       to `None`).

    :param subset: Subset to process.
    :raises ValueError: If a graph contains a node that is neither a clade
                        nor a member of the subset.
    :raises LogicError: If a graph has more than one root.
    """

    def in_subset(candidate):
        return candidate.data in subset.contents

    pregraphs: List[Pregraph] = []

    LOG("{} :::: {}", subset, subset.contents)

    for component in subset.model.components:
        intermediaries = analysing.get_intermediaries(component.tree, in_subset)

        LOG("{} :::: {}", subset, component)
        graph = component.tree.copy(nodes=intermediaries)

        # Normalise (or reject) every node of the copied graph.
        for node in graph:
            LOG("{} :::: {}", subset, node)

            if lego_graph.is_clade(node):
                continue

            if lego_graph.is_fusion_point(node):
                if node.data not in subset.contents:
                    # Substitute for clade
                    print("SUB {}".format(node))
                    node.data = None
                else:
                    node.data = node.data.formation
                continue

            if lego_graph.is_sequence_node(node) and node.data in subset.contents:
                continue

            raise ValueError(
                "Subset graph contains the node «{}», which does not appear in the actual subset «{}»."
                .format(node, subset))

        root_count = sum(1 for _ in graph.nodes.roots)

        if root_count > 1:
            raise LogicError("Graph of subset has multiple roots: {}".format(
                string_helper.format_array(graph.nodes.roots)))

        if graph.nodes:
            pregraphs.append(Pregraph(graph, subset, component))

    subset.pregraphs = tuple(pregraphs)
def print_subsets() -> EChanges:
    """
    Prints NRFG subsets.

    The subsets of the current model are listed in order of their string
    representation, one per line, with their size and contents.

    :return: `EChanges.INFORMATION`.
    """
    model = global_view.current_model()
    ordered = sorted(model.subsets, key=cast(Any, str))

    for subset in ordered:
        assert isinstance(subset, Subset)
        contents = string_helper.format_array(subset.contents, sort=True, autorange=True)
        print("{} - {} elements: {}".format(subset, len(subset), contents))

    return EChanges.INFORMATION
def __list_comp(comparison, name, html, calculated, original, ini: TIniData):
    """
    Appends a table listing every quartet of `comparison` to `html`, and
    records the same values in a new section of `ini`.

    :param comparison: Iterable of quartets to list.
    :param name:       Table title. Its lower-case form names the INI section.
    :param html:       List of HTML fragments, appended to in place.
    :param calculated: Lookup from unsorted quartet key to the calculated value.
    :param original:   Lookup from unsorted quartet key to the original value.
    :param ini:        INI data, receives the new section.
    """
    section: TIniSection = {}
    ini[name.lower()] = section

    html.append('<table border=1 style="border-collapse: collapse;">')
    html.append("<tr><td colspan=6><b>{} QUARTETS</b></td></tr>".format(name))

    for quartet in comparison:
        key = quartet.get_unsorted_key()
        calc = calculated[key]
        orig = original[key]
        label = string_helper.format_array(key)

        html.append(
            "<tr><td>{}</td><td>{}</td><td>{}</td><td>{}</td><td>{}</td><td>{}</td></tr>"
            .format(*key, calc, orig))

        section["{}_calculated".format(label)] = str(calc)
        section["{}_original".format(label)] = str(orig)

    html.append("</table><br/>")
def print_fusions() -> EChanges:
    """
    Prints model fusions.

    For every fusion of the current model its attributes are listed,
    followed by the attributes of each of its points. The whole report is
    printed in a single call.

    :return: `EChanges.INFORMATION`.
    """
    model = global_view.current_model()
    lines: List[str] = []
    emit = lines.append
    as_text = string_helper.format_array

    for fusion in model.fusions:
        emit("- name {}".format(fusion))
        emit(" components in {}".format(fusion.components_in))
        emit(" component out {}".format(fusion.component_out))
        emit(" index {}".format(fusion.index))
        emit(" points {}".format(as_text(fusion.points)))

        for point in fusion.points:
            emit(" - name {}".format(point))
            emit(" point_component {}".format(point.point_component))
            emit(" count {}".format(point.count))
            emit(" outer sequences {}".format(as_text(point.outer_sequences)))
            emit(" pertinent inner {}".format(as_text(point.pertinent_inner)))
            emit(" pertinent outer {}".format(as_text(point.pertinent_outer)))
            emit(" sequences {}".format(as_text(point.genes)))
            emit("")

        emit("")

    print("\n".join(lines))

    return EChanges.INFORMATION
def on_refresh_data(self):
    """
    Repopulates the main tree widget with the splits of the model.

    Splits rejected by `check_filter` are counted but not shown. The title
    label reports the counts and the filter label shows the filter summary.
    """
    tree = self.ui.LST_MAIN
    tree.clear()
    model = self.get_model()

    accepted = 0
    rejected = 0

    if model.splits:
        for split in model.splits:
            if not self.check_filter(split):
                rejected += 1
                continue

            accepted += 1
            assert isinstance(split, Split)

            item = QTreeWidgetItem()

            # (column heading, values shown in that column)
            columns = (
                ("Inside", split.split.inside),
                ("Outside", split.split.outside),
                ("Components", split.components),
                ("For", split.evidence_for),
                ("Against", split.evidence_against),
                ("Unused", split.evidence_unused),
            )

            for heading, values in columns:
                column = qt.tree_helper.get_or_create_column(tree, heading)
                item.setText(column, string_helper.format_array(values))

            qt.tree_helper.set_data(item, split)
            tree.addTopLevelItem(item)

    if rejected:
        title = "{} splits, {} rejected due to filter".format(accepted, rejected)
    else:
        title = "{} splits".format(accepted)

    self.ui.LBL_TITLE.setText(title)
    self.ui.LBL_FILTER.setText(self.describe_filter())
def create_consensus(cutoff: float = 0.5) -> EChanges:
    """
    Filters the candidate splits.

    NRFG PHASE II. Collect consensus evidence.

    :remarks:
    Every non-empty split of the model is compared against the splits of
    every component. Unlike a normal majority rule consensus there is no
    guarantee that a split is present in a given graph, so each component
    falls into one of three categories:

    * for:     one of its splits evidences the split
    * against: none evidences it, but at least one contradicts it
    * unused:  it neither supports nor rejects the split

    A split is accepted when `for / (for + against)` exceeds the cutoff.
    The three categories are stored on the split and the accepted splits
    are stored in `model.consensus`.

    :param cutoff: Cutoff to be used in the consensus.
    :return: `EChanges.MODEL_DATA`.
    :raises LogicError: If no component provides evidence for a split.
    """

    def component_verdict(split, component):
        # True = for, False = against, None = unused.
        verdict = None

        for component_split in component.splits:
            evidence = split.is_evidenced_by(component_split)

            if evidence is True:
                return True

            if evidence is False:
                verdict = False

        return verdict

    model = global_view.current_model()
    __LOG_EVIDENCE.pause("▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒ EVIDENCE ▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒")
    model.get_status(STAGES.CONSENSUS_11).assert_create()
    __LOG_EVIDENCE("BEGIN EVIDENCE ({} splits)".format(len(model.splits)))

    viable_splits: Set[Split] = set()

    for split in model.splits:
        assert isinstance(split, Split), split

        if split.split.is_empty:
            __LOG_EVIDENCE("SPLIT IS EMPTY: {}".format(split))
            continue

        buckets = {True: set(), False: set(), None: set()}

        for component in model.components:
            buckets[component_verdict(split, component)].add(component)

        evidence_for = buckets[True]
        evidence_against = buckets[False]
        evidence_unused = buckets[None]

        if not evidence_for:
            raise LogicError(
                "There is no evidence for (F{} A{} U{}) this split «{}», but the split must have come from somewhere."
                .format(len(evidence_for), len(evidence_against), len(evidence_unused), split))

        total_evidence: int = len(evidence_for) + len(evidence_against)
        frequency: float = len(evidence_for) / total_evidence
        accept: bool = frequency > cutoff

        split.evidence_for = frozenset(evidence_for)
        split.evidence_against = frozenset(evidence_against)
        split.evidence_unused = frozenset(evidence_unused)

        __LOG_EVIDENCE(
            "{} {} = {}% -- FOR: ({}) {}, AGAINST: ({}) {}, UNUSED: ({}) {}",
            "✔" if accept else "✘",
            ansi_helper.ljust(str(split), 80),
            int(frequency * 100),
            len(evidence_for),
            string_helper.format_array(evidence_for, sort=True),
            len(evidence_against),
            string_helper.format_array(evidence_against, sort=True),
            len(evidence_unused),
            string_helper.format_array(evidence_unused, sort=True))

        if accept:
            viable_splits.add(split)

    model.consensus = frozenset(viable_splits)

    return EChanges.MODEL_DATA
def __format_elements(y):
    """
    Formats `y` via `string_helper.format_array`, comma-joined, with the
    `sort` and `autorange` options enabled.
    """
    options = dict(join=",", sort=True, autorange=True)
    return string_helper.format_array(y, **options)
def get_details(self):
    """
    Returns `self.contents`, formatted as text by
    `string_helper.format_array`.
    """
    contents = self.contents
    return string_helper.format_array(contents)
def compare_graphs(calc_graph_: INamedGraph, orig_graph_: INamedGraph) -> Report:
    """
    Compares graphs using quartets.

    :param calc_graph_: The model graph. Data is `ILeaf` or `None`.
    :param orig_graph_: The source graph. Data is `str`.
    :return: A `Report` object with an `TIniData` as its `raw_data`.
    :raises ValueError: If the calculated graph does not have exactly one
                        connected component, if either graph contains no
                        genes, or if the two gene sets differ.
    """

    def describe(x):
        return "{}:{}".format(type(x).__name__, x)

    calc_graph = calc_graph_.graph
    orig_graph = orig_graph_.graph

    ccs = analysing.find_connected_components(calc_graph)

    if len(ccs) != 1:
        raise ValueError(
            "The graph has more than 1 connected component ({}).".format(len(ccs)))

    calc_genes: Set[object] = {
        x.data
        for x in analysing.realise_node_predicate_as_set(calc_graph, lego_graph.is_sequence_node)
    }
    orig_genes: Set[object] = {
        x.data
        for x in analysing.realise_node_predicate_as_set(orig_graph, lego_graph.is_sequence_node)
    }

    if not calc_genes:
        raise ValueError("The calculated graph contains no genes.")

    if not orig_genes:
        raise ValueError("The original graph contains no genes.")

    if calc_genes != orig_genes:
        raise ValueError(
            "The calculated graph has a different gene set to the original. Missing: {}; additional: {}."
            .format(
                string_helper.format_array(orig_genes - calc_genes, sort=True, format=describe),
                string_helper.format_array(calc_genes - orig_genes, sort=True, format=describe)))

    calc_quartets = __get_quartets_with_progress(calc_graph, "calculated")
    orig_quartets = __get_quartets_with_progress(orig_graph, "original")
    comparison: QuartetComparison = calc_quartets.compare(orig_quartets)

    html = []
    ini_data: TIniData = {}

    # QUARTETS
    html.append('<table border=1 style="border-collapse: collapse;">')
    html.append("<tr><td colspan=2><b>QUARTETS</b></td></tr>")
    ini_data["quartets"] = q = {}
    __add_row(html, q, "total_quartets", len(comparison))

    percentage_rows = (
        ("match_quartets", comparison.match),
        ("mismatch_quartets", comparison.mismatch),
        ("new_quartets", comparison.missing_in_left),
        ("missing_quartets", comparison.missing_in_right),
    )

    for key, quartets in percentage_rows:
        __add_row(html, q, key,
                  string_helper.percent(len(quartets), len(comparison.all)))

    # Close the summary table so that the tables appended below are not
    # nested inside it.
    html.append("</table><br/>")

    # GENE COMBINATIONS
    __enumerate_2genes(calc_genes, comparison, html, 1, ini_data)
    __enumerate_2genes(calc_genes, comparison, html, 2, ini_data)
    __enumerate_2genes(calc_genes, comparison, html, 3, ini_data)

    c = calc_quartets.get_unsorted_lookup()
    o = orig_quartets.get_unsorted_lookup()

    __list_comp(comparison.match, "MATCHING", html, c, o, ini_data)
    __list_comp(comparison.mismatch, "MISMATCH", html, c, o, ini_data)

    if comparison.missing_in_left:
        __list_comp(comparison.missing_in_left, "MISSING IN LEFT", html, c, o, ini_data)

    if comparison.missing_in_right:
        __list_comp(comparison.missing_in_right, "MISSING IN RIGHT", html, c, o, ini_data)

    report = Report("{} -vs- {}".format(orig_graph_, calc_graph_), "\n".join(html))
    report.raw_data = ini_data
    return report
def __enumerate_2genes(calc_seq: Set[object],
                       comparison: QuartetComparison,
                       html: List[str],
                       n: int,
                       ini_data: TIniData) -> None:
    """
    Appends a table to `html` that breaks the quartet comparison down by
    combinations of `n` genes, and records the same figures in a new
    "n_quartets_<n>" section of `ini_data`.

    Nothing is written when there are more than 100 combinations.
    Combinations for which no quartet mismatches or is missing are left
    out of the table.

    :param calc_seq:   Genes to draw the combinations from.
    :param comparison: Quartet comparison to break down.
    :param html:       List of HTML fragments, appended to in place.
    :param n:          Number of genes per combination.
    :param ini_data:   INI data, receives the new section.
    :raises SwitchError: If a quartet of `comparison.all` is in none of the
                         comparison's four categories.
    """
    if array_helper.get_num_combinations(calc_seq, n) > 100:
        return

    html.append('<table border=1 style="border-collapse: collapse;">')
    html.append(
        "<tr><td colspan=5><b>BREAKDOWN FOR COMBINATIONS OF {}</b></td></tr>".format(n))
    html.append(
        "<tr><td>total</td><td>hit</td><td>miss</td><td>missing in left</td><td>missing in right</td></tr>")

    ini_sect: TIniSection = {}
    ini_data["n_quartets_{}".format(n)] = ini_sect

    for comb in sorted(itertools.combinations(calc_seq, n), key=cast(Callable, str)):  # type: Iterable[object]
        n_tot = []
        n_hit = []
        n_mis = []
        n_mil = []
        n_mir = []

        # Classify every quartet that involves all genes of the combination.
        for quartet in comparison.all:
            assert isinstance(quartet, AbstractQuartet)
            key = quartet.get_unsorted_key()

            if not all(x in key for x in comb):
                continue

            n_tot.append(quartet)

            if quartet in comparison.match:
                n_hit.append(quartet)
            elif quartet in comparison.mismatch:
                n_mis.append(quartet)
            elif quartet in comparison.missing_in_left:
                n_mil.append(quartet)
            elif quartet in comparison.missing_in_right:
                n_mir.append(quartet)
            else:
                raise SwitchError("quartet(in)", quartet)

        if not n_mis and not n_mil and not n_mir:
            continue

        html.append("<tr>")

        # COMBINATION NAME
        name = string_helper.format_array(comb)
        html.append("<td>{}</td>".format(name))

        # HIT, MISS, MISSING IN LEFT, MISSING IN RIGHT
        # Each cell is gated on its own bucket; an empty bucket gives an
        # empty cell.
        i = []

        for bucket in (n_hit, n_mis, n_mil, n_mir):
            txt = string_helper.percent(len(bucket), len(n_tot)) if bucket else ""
            html.append("<td>{}</td>".format(txt))
            i.append(txt)

        html.append("</tr>")

        ini_sect[name] = "; ".join(str(x) for x in i)

        # Write out full quartets (if < 10)
        # NOTE(review): the condition also requires more mismatches than
        # hits - confirm that this is intended.
        i = []

        if len(n_hit) < len(n_mis) < 10:
            for quartet in n_mis:
                html.append("<tr>")
                html.append("<td></td>")
                html.append("<td colspan=4>{}</td>".format(quartet))
                html.append("</tr>")
                i.append(quartet)

        ini_sect[name + "_list"] = "; ".join(str(x) for x in i)

    html.append("</table><br/>")
def create_major(tol: int = 0, debug: bool = False) -> EChanges:
    """
    Detects model components.

    First step of finding the components.

    We classify each component as a set of "major" genes.

    Components are defined as sets of genes that share a similarity path
    between them, where each edge between element 𝓧 and 𝓨 in that path:
        * Is sourced from no less than 𝓧's length, less the tolerance
        * Is targeted to no less than 𝓨's length, less the tolerance
        * The difference between 𝓧 and 𝓨's length does not exceed the tolerance

    Genes that end up in no component receive a component of their own.
    We'll grab the minor domains that this component extends into in the
    next step.

    Requisites: Sequence similarity (BLAST data) must have been loaded

    :param tol:   Tolerance value
    :param debug: Assert the creation.
    :return:      `EChanges.COMPONENTS`. The components are written to
                  :ivar:`model.components`.
    :raises ValueError: If any gene has no edges, or if a debug assertion
                        fails.
    """
    model = global_view.current_model()
    model.get_status(STAGES.MAJOR_4).assert_create()

    model.components.clear()

    # Find connected components
    finder = ComponentFinder()

    # Basic assertions
    LOG_MAJOR("There are {} sequences.", len(model.genes))

    without_edges = [gene for gene in model.genes if not model.edges.find_gene(gene)]

    if without_edges:
        raise ValueError(
            "Refusing to detect components because some sequences have no edges: «{}»".format(
                string_helper.format_array(without_edges)))

    # Iterate sequences
    for alpha in model.genes:
        assert isinstance(alpha, Gene)

        alpha_edges = model.edges.find_gene(alpha)
        accepted_any = False

        LOG_MAJOR("Sequence {} contains {} edges.", alpha, len(alpha_edges))

        for edge in alpha_edges:
            assert isinstance(edge, Edge)

            source_difference = abs(edge.left.length - edge.left.gene.length)
            destination_difference = abs(edge.right.length - edge.right.gene.length)
            total_difference = abs(edge.left.gene.length - edge.right.gene.length)

            LOG_MAJOR_V("{}", edge)
            LOG_MAJOR_V("-- Source difference ({})", source_difference)
            LOG_MAJOR_V("-- Destination difference ({})", destination_difference)
            LOG_MAJOR_V("-- Total difference ({})", total_difference)

            # The first difference exceeding the tolerance rejects the edge.
            rejections = (
                (source_difference, "-- ==> REJECTED (SOURCE)"),
                (destination_difference, "-- ==> REJECTED (DEST)"),
                (total_difference, "-- ==> REJECTED (TOTAL)"),
            )

            rejected = False

            for difference, message in rejections:
                if difference > tol:
                    LOG_MAJOR_V(message)
                    rejected = True
                    break

            if rejected:
                continue

            LOG_MAJOR_V("-- ==> ACCEPTED")

            if debug and edge.left.gene.accession[0] != edge.right.gene.accession[0]:
                raise ValueError(
                    "Debug assertion failed. This edge not rejected: {}".format(edge))

            accepted_any = True
            beta = edge.opposite(alpha).gene
            LOG_MAJOR("-- {:<40} LINKS {:<5} AND {:<5}", edge, alpha, beta)
            finder.join(alpha, beta)

        if debug and not accepted_any:
            raise ValueError(
                "Debug assertion failed. This sequence has no good edges: {}".format(alpha))

    # Create the components!
    placed = set()

    for index, members in enumerate(finder.tabulate()):
        model.components.add(Component(model, index, members))
        LOG_MAJOR("COMPONENT MAJOR: {}", members)
        placed.update(members)

    # Create components for orphans
    for gene in model.genes:
        if gene not in placed:
            LOG_MAJOR("ORPHAN: {}", gene)
            model.components.add(Component(model, len(model.components), (gene,)))

    # An assertion
    for component in model.components:
        assert isinstance(component, Component)

        if len(component.major_genes) == 1:
            warnings.warn(
                "There are components with just one sequence in them. Maybe you meant to use a tolerance higher than {}?".format(tol),
                UserWarning)
            break

    pr.printx("<verbose>{} components detected.</verbose>".format(len(model.components)))

    return EChanges.COMPONENTS
def __create_supertree(algorithm: supertree_algorithms.Algorithm, subset: Subset) -> MGraph:
    """
    Generates a supertree from the pregraphs of a subset.

    :param algorithm: Algorithm to use. See `algorithm_help`.
    :param subset:    Subset of genes to generate the supertree for.
    :return:          The supertree. When the pregraphs are redundant this is
                      the graph of the first pregraph, returned as-is.
    :raises ValueError:  If the result lacks an element of the subset, or
                         contains a non-clade node that is not in the subset.
    :raises SwitchError: If the algorithm returns neither an `MGraph` nor a `str`.
    """

    def describe(x):
        return "{}:{}".format(type(x).__name__, x)

    # Get our algorithm
    inspector = FunctionInspector(algorithm.function)

    # We allow two kinds of algorithm
    # - Python algorithms, which take a `Subset` instance
    # - External algorithms, which take a newick-formatted string
    if inspector.args[0].annotation == Subset:
        algorithm_input = subset
    else:
        # One newick string for each possible tree in the subset
        newick_lines = __graphs_to_newick(subset.pregraphs)

        if __is_redundant(subset.pregraphs, newick_lines):
            return subset.pregraphs[0].graph

        algorithm_input = "\n".join(newick_lines) + "\n"

    # Run the algorithm!
    output = external_runner.run_in_temporary(algorithm, algorithm_input)

    # We allow two types of result
    # - `MGraph` objects
    # - `str` objects, which denote a newick-formatted string
    if isinstance(output, MGraph):
        result = output
    elif isinstance(output, str):
        # We don't reclade the newick, it's pointless at this stage and we
        # remove redundancies during the NRFG_CLEAN stage anyway
        result = lego_graph.import_newick(output, subset.model, reclade=False)
    else:
        raise SwitchError("create_supertree::output", output, instance=True)

    # Assert the result
    # - All elements of the subset are in the supertree
    for element in subset.contents:
        if isinstance(element, Gene):
            present = element in result.nodes.data
        elif isinstance(element, Point):
            present = element.formation in result.nodes.data
        else:
            present = False

        if not present:
            raise ValueError(
                _MSG1.format(
                    element,
                    string_helper.format_array(result.nodes.data, format=describe, sort=True),
                    type(element).__name__))

    # - All (non-clade) elements of the supertree are in the subset
    for node in result.nodes:
        if lego_graph.is_clade(node):
            continue

        if lego_graph.is_formation(node) and any(
                x.formation is node.data for x in subset.contents if isinstance(x, Point)):
            continue

        if lego_graph.is_sequence_node(node) and node.data in subset.contents:
            continue

        raise ValueError(
            _MSG2.format(
                node.data,
                string_helper.format_array(subset.contents, format=describe, sort=True),
                type(node.data).__name__))

    return result
def print_minor(component: Optional[Component] = None, verbose: bool = False) -> EChanges:
    """
    Prints the edges between the component subsequences.

    A table is printed with the columns `source`, `destination`, `sequence`,
    `seq-length`, `start`, `end` and `edge-length`. Every ordered pair of
    distinct components is considered: the `destination` is the component
    whose major genes are inspected and the `source` is the component whose
    minor domains lie on those genes.

    For each pair an averaged row is added (marked `AVG*<number of genes>`),
    but only if every major gene of the destination carries at least one
    minor domain of the source. When `component` is specified, one row per
    gene is printed as well.

    :param component: Component to print.
                      If not specified prints a summary of all components.
    :param verbose:   Also print a table of the major genes and minor domains
                      of the component(s) beforehand.
    :return: `EChanges.INFORMATION`.
    :raises ValueError: If the model has no components.
    """
    model = global_view.current_model()

    if not model.components:
        raise ValueError(
            "Cannot print components because components have not been calculated."
        )

    if verbose:
        # Overview table: one row per (selected) component.
        rows = []
        rows.append(["component", "origins", "destinations"])

        for comp in model.components:
            assert isinstance(comp, Component)

            # Skip everything but the requested component, if one was given.
            if component is not None and component is not comp:
                continue

            major_genes = string_helper.format_array(comp.major_genes, join="\n")
            minor_domains = string_helper.format_array(comp.minor_domains, join="\n")
            rows.append([comp, major_genes, minor_domains])

        # The section title is "all components" even when a single component
        # was requested.
        with pr.pr_section("all components"):
            pr.pr_table(rows)

    if component:
        title = str(component)
    else:
        title = "all components"

    average_lengths = __get_average_component_lengths(model)

    rows = []
    rows.append([
        "source", "destination", "sequence", "seq-length", "start", "end",
        "edge-length"
    ])

    for comp in model.components:
        if component is not None and component is not comp:
            continue

        major_genes = list(comp.major_genes)

        for minor in model.components:
            if comp is minor:
                continue

            # Running totals of the start/end positions over the major genes.
            start = 0
            end = 0
            failed = False

            for sequence in major_genes:
                # The minor domains of `minor` that lie on this major gene of `comp`.
                subsequences = [
                    x for x in minor.minor_domains if x.sequence is sequence
                ]

                if subsequences:
                    # The span runs from the start of the first domain to the
                    # end of the last one.
                    # NOTE(review): presumably `minor_domains` is ordered by
                    # position - confirm.
                    start += subsequences[0].start
                    end += subsequences[-1].end

                    # Per-gene rows are only shown for a specific component.
                    if component is not None:
                        rows.append([
                            minor, comp, sequence.accession, sequence.length,
                            subsequences[0].start, subsequences[-1].end,
                            subsequences[-1].end - subsequences[0].start
                        ])
                else:
                    failed = True

            # No average is reported unless every major gene was covered.
            # Per-gene rows added above are kept regardless.
            if failed:
                continue

            # NOTE(review): this divides by zero if a component has no major
            # genes - confirm that cannot happen.
            start /= len(major_genes)
            end /= len(major_genes)

            rows.append([
                minor, comp, "AVG*{}".format(len(major_genes)),
                round(average_lengths[comp]), round(start), round(end),
                round(end - start)
            ])

    with pr.pr_section(title):
        pr.pr_table(rows)

    return EChanges.INFORMATION