def _get_root_node(poagraph: graph.Poagraph,
                   blosum_path: Path,
                   output_dir: Path,
                   p: parameters.P) -> tree.AffinityNode:
    """Creates root node of the Affinity Tree.

    The node has assigned a consensus path and all sequences present in
    poagraph.

    Args:
        poagraph: Poagraph whose sequences are assigned to the root node.
        blosum_path: Path to the BLOSUM file, forwarded to
            poa.get_consensuses.
        output_dir: Directory forwarded to poa.get_consensuses.
        p: Parameter forwarded to poagraph.get_compatibilities.

    Returns:
        Affinity node with ID 0 holding all poagraph sequences, their
        compatibilities to the root consensus and the resulting mincomp.

    Raises:
        AffinityTreeBuildException: if poa.get_consensuses raises
            poa.NoConsensusError.
    """
    detailed_logger.info("Getting the root affinity node...")
    all_poagraph_sequences_ids = poagraph.get_sequences_ids()
    try:
        consensus_paths = poa.get_consensuses(poagraph,
                                              all_poagraph_sequences_ids,
                                              output_dir,
                                              "root",
                                              blosum_path,
                                              hbmin=parameters.Hbmin(0),
                                              specific_consensuses_id=[0])
    except poa.NoConsensusError as err:
        # Chain explicitly so the original poa failure stays attached as the
        # direct cause of the domain-level exception.
        raise AffinityTreeBuildException("Cannot find root consensus.") from err

    root_consensus = consensus_paths[0].path
    compatibilities = poagraph.get_compatibilities(all_poagraph_sequences_ids,
                                                   root_consensus,
                                                   p=p)
    affinity_node = tree.AffinityNode(
        id_=tree.AffinityNodeID(0),
        sequences=[*poagraph.sequences.keys()],
        mincomp=_get_min_comp(all_poagraph_sequences_ids, compatibilities),
        compatibilities=compatibilities,
        consensus=root_consensus)
    detailed_logger.info(f"New affinity node created: {affinity_node!s}")
    return affinity_node
def _get_children_nodes_looping(node: tree.AffinityNode,
                                poagraph: graph.Poagraph,
                                output_dir: Path,
                                blosum_path: Path,
                                p: parameters.P,
                                current_max_affinity_node_id: int) -> List[tree.AffinityNode]:
    """Generates children of given Affinity Tree node.

    The sequences of ``node`` are split off into children one child at a
    time until no unassigned sequence is left. For every child a consensus
    is computed for the current candidates and the qualifying sequences are
    selected; this repeats with the narrowed candidate list until the
    selection equals the candidates or the attempt limit is reached.

    Args:
        node: Affinity node to be split.
        poagraph: Poagraph providing consensuses and compatibilities.
        output_dir: Directory forwarded to poa.get_consensuses.
        blosum_path: Path to the BLOSUM file, forwarded to
            poa.get_consensuses.
        p: Parameter forwarded to poagraph.get_compatibilities.
        current_max_affinity_node_id: Offset for new IDs; the children get
            IDs current_max_affinity_node_id + 1, + 2, ...

    Returns:
        Children nodes in the order they were created. Their
        ``compatibilities`` are not set here.
    """
    # Number of narrowing attempts after which the current selection is
    # accepted even though it has not stabilised.
    max_attempts = 10

    children_nodes: List[tree.AffinityNode] = []
    not_assigned_sequences_ids: List[msa.SequenceID] = node.sequences
    detailed_logger.info(f"Getting children nodes for affinity node {node.id_}...")

    affinity_node_id = 0
    so_far_cutoffs: List[graph.Compatibility] = []
    while not_assigned_sequences_ids:
        detailed_logger.info(f"### Getting child {len(so_far_cutoffs)}...")
        child_ready = False
        attempt = 0
        current_candidates = not_assigned_sequences_ids
        while not child_ready:
            consensus_candidate = poa.get_consensuses(
                poagraph,
                current_candidates,
                output_dir,
                f"parent_{node.id_}_child_{len(so_far_cutoffs)}_attempt_{attempt}",
                blosum_path,
                parameters.Hbmin(0),
                specific_consensuses_id=[0])[0].path
            # Compatibilities are computed for every still-unassigned
            # sequence, not only for the current candidates.
            compatibilities_to_consensus_candidate = poagraph.get_compatibilities(
                sequences_ids=not_assigned_sequences_ids,
                consensus_path=consensus_candidate,
                p=p)
            # The parent's mincomp takes part in the cutoff search under an
            # artificial "parent" key.
            compatibilities_to_consensus_candidate[msa.SequenceID("parent")] = node.mincomp
            # Only the qualified IDs are needed; the returned cutoff is ignored.
            qualified_sequences_ids_candidates, _ = _get_qualified_sequences_ids_and_cutoff(
                compatibilities_to_max_c=compatibilities_to_consensus_candidate,
                so_far_cutoffs=so_far_cutoffs,
                splitted_node_id=node.id_)

            if qualified_sequences_ids_candidates == current_candidates or attempt == max_attempts:
                if attempt == max_attempts:
                    detailed_logger.info("Attempt treshold 10 exceeded!")

                affinity_node_id += 1
                affinity_node = tree.AffinityNode(
                    id_=tree.AffinityNodeID(current_max_affinity_node_id + affinity_node_id),
                    parent=node.id_,
                    sequences=qualified_sequences_ids_candidates,
                    mincomp=_get_min_comp(
                        node_sequences_ids=qualified_sequences_ids_candidates,
                        comps_to_consensus=compatibilities_to_consensus_candidate),
                    consensus=graph.SeqPath(consensus_candidate))
                children_nodes.append(affinity_node)
                not_assigned_sequences_ids = list(
                    set(not_assigned_sequences_ids) - set(qualified_sequences_ids_candidates))
                child_ready = True
                so_far_cutoffs.append(affinity_node.mincomp)
            else:
                # Selection not stable yet: retry with the narrowed candidates.
                current_candidates = qualified_sequences_ids_candidates
                attempt += 1

    detailed_logger.info("Children nodes generated.")
    return children_nodes
def get_ebola_consensus_tree(p: float,
                             stop: float,
                             output_dir_name: str) -> Tuple[Poagraph, AffinityTree]:
    """Builds the Ebola whole-genome poagraph and its affinity tree.

    Input files (MAF multialignment, CSV metadata, BLOSUM matrix) are looked
    up relative to the directory of this module.

    Args:
        p: Value wrapped in ``P`` and passed to the affinity tree builder.
        stop: Value wrapped in ``Stop`` and passed to the affinity tree
            builder.
        output_dir_name: Name of the child directory (obtained with
            pathtools.get_child_dir under this module's directory) whose
            "consensus" subdirectory is handed to the tree builder.

    Returns:
        Tuple of the built poagraph and the affinity tree.
    """
    module_path = Path(os.path.abspath(__file__)).resolve()
    base_dir = module_path.parent

    results_dir = pathtools.get_child_dir(base_dir, output_dir_name)
    consensus_dir = pathtools.get_child_dir(results_dir, "consensus")

    maf_path = base_dir.joinpath("../data/Ebola/genome_whole/input/multialignment.maf")
    csv_path = base_dir.joinpath("../data/Ebola/genome_whole/input/metadata.csv")
    blosum_path = base_dir.joinpath("../bin/blosum80.mat")

    fasta_provider = fp_ncbi.FromNCBI(use_cache=True)

    multialignment = Maf(
        file_content=pathtools.get_file_content_stringio(maf_path),
        filename=maf_path)
    metadata = MetadataCSV(
        filecontent=pathtools.get_file_content_stringio(csv_path),
        filename=csv_path)

    # build_from_dagmaf also returns the dagmaf, which is not needed here.
    poagraph, _ = Poagraph.build_from_dagmaf(multialignment,
                                             fasta_provider,
                                             metadata)

    blosum = Blosum(pathtools.get_file_content_stringio(path=blosum_path),
                    blosum_path)
    affinity_tree = atree_builders.get_affinity_tree(poagraph,
                                                     blosum,
                                                     consensus_dir,
                                                     Stop(stop),
                                                     P(p),
                                                     False)
    return poagraph, affinity_tree
def get_ebola_affinity_tree(p: float,
                            stop: float,
                            output_dir_name: str) -> Tuple[Poagraph, AffinityTree]:
    """Builds the Ebola poagraph and its affinity tree.

    Input files (MAF multialignment, CSV metadata, BLOSUM matrix) are looked
    up relative to the directory of this module.

    Args:
        p: Value wrapped in ``P`` and passed to the affinity tree builder.
        stop: Value wrapped in ``Stop`` and passed to the affinity tree
            builder.
        output_dir_name: Name of the child directory (obtained with
            pathtools.get_child_dir under this module's directory) whose
            "consensus" subdirectory is handed to the tree builder.

    Returns:
        Tuple of the built poagraph and the affinity tree.
    """
    module_path = Path(os.path.abspath(__file__)).resolve()
    base_dir = module_path.parent

    results_dir = pathtools.get_child_dir(base_dir, output_dir_name)
    consensus_dir = pathtools.get_child_dir(results_dir, "consensus")

    maf_path = base_dir.joinpath("../data/Ebola/multialignment.maf")
    csv_path = base_dir.joinpath("../data/Ebola/metadata.csv")
    blosum_path = base_dir.joinpath("../bin/blosum80.mat")

    # NOTE(review): `tp` is never read below. It is kept because constructing
    # TaskParameters may validate its arguments - confirm before removing.
    tp = TaskParameters(running_time="",
                        multialignment_file_path=maf_path,
                        multialignment_format="MAF",
                        datatype="N",
                        metadata_file_path=csv_path,
                        blosum_file_path=blosum_path,
                        output_path=results_dir,
                        output_po=False,
                        output_fasta=False,
                        output_with_nodes=False,
                        verbose=False,
                        raw_maf=False,
                        fasta_provider='FromNCBI',
                        cache=True,
                        missing_base_symbol="",
                        fasta_source_file=None,
                        consensus_type="",
                        hbmin=0.8,
                        stop=stop,
                        p=p)

    fasta_provider = fp_ncbi.FromNCBI(use_cache=True)

    multialignment = Maf(
        file_content=pathtools.get_file_content_stringio(maf_path),
        filename=maf_path)
    metadata = MetadataCSV(
        filecontent=pathtools.get_file_content_stringio(csv_path),
        filename=csv_path)

    # build_from_dagmaf also returns the dagmaf, which is not needed here.
    poagraph, _ = Poagraph.build_from_dagmaf(multialignment,
                                             fasta_provider,
                                             metadata)

    blosum = Blosum(pathtools.get_file_content_stringio(path=blosum_path),
                    blosum_path)
    affinity_tree = atree_builders.get_affinity_tree(poagraph,
                                                     blosum,
                                                     consensus_dir,
                                                     Stop(stop),
                                                     P(p),
                                                     False)
    return poagraph, affinity_tree
def _convert_to_po_input_data(p: graph.Poagraph) -> \
        Tuple[List[NodePO], List[SequencePO]]:
    """Converts a poagraph into node and sequence records for PO output.

    Sequences without any node are skipped; the remaining ones are numbered
    consecutively from 0 in iteration order, and these numbers are what is
    stored in ``NodePO.sequences_ids``.

    Args:
        p: Poagraph to be converted.

    Returns:
        Tuple of (PO nodes, PO sequences). ``in_nodes`` of every returned
        node is a list.
    """
    weights = p.get_sequences_weights(p.get_sequences_ids())

    # in_nodes starts as a set so predecessors shared by several sequences
    # are recorded once; it is turned into a list at the end.
    po_nodes = [
        NodePO(base=graph_node._base.value,
               aligned_to=graph_node.aligned_to,
               in_nodes=set(),
               sequences_ids=[])
        for graph_node in p.nodes
    ]

    po_sequences = []
    for seq_id, sequence in p.sequences.items():
        total_nodes = sum(len(path) for path in sequence.paths)
        if total_nodes == 0:
            continue

        # Index this sequence will occupy in po_sequences.
        po_seq_index = len(po_sequences)
        # NOTE(review): assumes the first path of a non-empty sequence is
        # itself non-empty - confirm against Poagraph construction.
        po_sequences.append(
            SequencePO(name=str(seq_id),
                       nodes_count=total_nodes,
                       weight=weights[seq_id],
                       consensus_id=-1,
                       start_node_id=sequence.paths[0][0]))

        for path in sequence.paths:
            predecessor = None
            for node_id in path:
                po_node = po_nodes[node_id]
                po_node.sequences_ids.append(po_seq_index)
                if predecessor is not None:
                    po_node.in_nodes.add(predecessor)
                predecessor = node_id

    for po_node in po_nodes:
        po_node.in_nodes = list(po_node.in_nodes)

    return po_nodes, po_sequences
def poagraph_to_fasta(poagraph: graph.Poagraph) -> str:
    """Converts poagraph to FASTA format.

    Every sequence with at least one node becomes a two-line record: a
    ``>`` header with the sequence ID followed by the concatenated bases of
    all its paths. Sequences without nodes are omitted.

    Args:
        poagraph: Poagraph to be converted.

    Returns:
        Fasta formatted string ready to be saved to file.
    """
    records = []
    for seq_id, sequence in poagraph.sequences.items():
        if poagraph.get_sequence_nodes_count(seq_id) != 0:
            bases = "".join(
                poagraph.nodes[node_id].get_base()
                for path in sequence.paths
                for node_id in path
            )
            records.extend((f">{seq_id}", bases))
    return "\n".join(records)
def build_poa_affinity_tree(p: graph.Poagraph,
                            blosum: Optional[parameters.Blosum],
                            output_dir: Path,
                            hbmin: parameters.Hbmin,
                            verbose: bool) -> tree.AffinityTree:
    """Builds Affinity Tree coherent with poa software.

    This method builds a simple version of Affinity Tree as it uses a single
    call to poa software. Poa provides division of sequences in Poagraph
    into consistent groups with a consensus path assigned to each group.
    These groups are converted in this method to Affinity Tree nodes and
    connected with a dummy root node so the result is coherent with pangtree
    definition of Affinity Tree.

    Args:
        p: Poagraph containing sequences to be divided into groups
            (Affinity Tree nodes).
        blosum: Optional BLOSUM matrix. If not provided, the one returned by
            get_default_blosum() is used.
        output_dir: Path to a directory that can be used by poa software.
        hbmin: Parameter required by poa software. The minimum value of
            sequence compatibility to generated consensus.
        verbose: Switch to control logging intensity. Currently not used by
            this function.

    Returns:
        Affinity Tree made of a dummy root node (ID 0), one node per
        consensus returned by poa and one additional node holding the
        sequences that were not assigned to any consensus.

    Raises:
        AffinityTreeBuildException: if consensuses cannot be found.
    """

    def _convert_consensus_paths_to_affinity_tree_nodes():
        """Turns every poa consensus into a child node of the dummy root."""
        at_nodes = []
        # Sets give O(1) membership tests in the comprehensions below.
        assigned_sequences = set()
        all_seq = p.get_sequences_ids()
        for c_id, c_info in consensus_paths.items():
            group_ids = set(c_info.assigned_sequences_ids)
            assigned_sequences |= group_ids
            compatibilities = p.get_compatibilities(all_seq, c_info.path)
            if group_ids:
                mincomp = min(c for seq_id, c in compatibilities.items()
                              if seq_id in group_ids)
            else:
                # NOTE(review): a plain 0 is used here while the node for
                # unassigned sequences uses graph.Compatibility(0) - confirm
                # which one is intended.
                mincomp = 0
            new_node = tree.AffinityNode(
                id_=tree.AffinityNodeID(c_id + 1),
                parent=tree.AffinityNodeID(0),
                sequences=c_info.assigned_sequences_ids,
                mincomp=mincomp,
                compatibilities=compatibilities,
                consensus=c_info.path,
                children=[])
            at_nodes.append(new_node)

        # This node is always appended, even when every sequence was assigned.
        node_for_unassigned_sequences = tree.AffinityNode(
            parent=tree.AffinityNodeID(0),
            sequences=[seq_id for seq_id in all_seq
                       if seq_id not in assigned_sequences],
            id_=tree.AffinityNodeID(len(at_nodes) + 1),
            mincomp=graph.Compatibility(0),
            children=[])
        at_nodes.append(node_for_unassigned_sequences)
        return at_nodes

    global_logger.info("POA defined affinity tree generation started.")
    if blosum is None:
        blosum = get_default_blosum()
    _raise_error_if_invalid_poagraph(p)
    try:
        consensus_paths = poa.get_consensuses(p,
                                              p.get_sequences_ids(),
                                              output_dir,
                                              "poa_tree",
                                              blosum.filepath,
                                              hbmin)
    except poa.NoConsensusError as err:
        raise AffinityTreeBuildException("No consensus in the Affinity Tree.") from err

    consensus_nodes = _convert_consensus_paths_to_affinity_tree_nodes()
    root_node = tree.AffinityNode(
        id_=tree.AffinityNodeID(0),
        children=[c_node.id_ for c_node in consensus_nodes])
    affinity_tree = tree.AffinityTree([root_node] + consensus_nodes)
    global_logger.info("POA defined affinity tree generation finished.")
    return affinity_tree
def build_affinity_tree(poagraph: graph.Poagraph,
                        blosum: Optional[parameters.Blosum],
                        output_dir: Path,
                        stop: parameters.Stop,
                        p: parameters.P,
                        verbose: bool) -> tree.AffinityTree:
    """Builds Affinity Tree.

    Affinity Tree is defined in paper 'Getting insight into the pan-genome
    structure with Pangtree'. This method builds an Affinity Tree by
    iterative calls to poa software. Full algorithm and idea are described
    in the above-mentioned paper.

    Args:
        poagraph: Poagraph containing sequences to be divided into groups
            (Affinity Tree nodes).
        blosum: Optional BLOSUM matrix. If not provided, the one returned by
            get_default_blosum() is used.
        output_dir: Path to a directory that can be used by poa software.
        stop: Value of mincomp above which an affinity tree node is no more
            split.
        p: Value changing the linear meaning of compatibility when searching
            for cutoff.
        verbose: Switch to control logging intensity. When true, an extra
            file handler ("tresholds.csv") is registered through logprocess.

    Raises:
        AffinityTreeBuildException: if the root consensus cannot be found.

    Returns:
        Affinity Tree generated with Pangtree algorithm.
    """
    global_logger.info("Affinity Tree generation started.")
    if blosum is None:
        blosum = get_default_blosum()

    if verbose:
        logprocess.add_file_handler_to_logger(output_dir,
                                              "tresholdsCSV",
                                              "tresholds.csv",
                                              "%(message)s",
                                              False)
    _raise_error_if_invalid_poagraph(poagraph)

    root_node = _get_root_node(poagraph, blosum.filepath, output_dir, p)
    affinity_tree = tree.AffinityTree([root_node])

    # The set of poagraph sequences does not change in the loop below, so
    # the ID list is built once and not once per child.
    all_sequences = [*poagraph.sequences.keys()]

    # Used as a stack: the most recently added node is split first.
    nodes_to_process = deque([affinity_tree.get_node(tree.AffinityNodeID(0))])
    while nodes_to_process:
        node = nodes_to_process.pop()
        children_nodes = _get_children_nodes_looping(node,
                                                     poagraph,
                                                     output_dir,
                                                     blosum.filepath,
                                                     p,
                                                     affinity_tree.get_max_node_id())
        # A single child is not attached to the tree; the node stays a leaf.
        if len(children_nodes) == 1:
            continue

        for child in children_nodes:
            child.compatibilities = poagraph.get_compatibilities(
                sequences_ids=all_sequences,
                consensus_path=child.consensus,
                p=p)
            node.children.append(child.id_)
            affinity_tree.nodes.append(child)
            if not _node_is_ready(child, stop):
                nodes_to_process.append(child)

    global_logger.info("Affinity Tree generation finished.\n")
    return affinity_tree