Beispiel #1
0
def _get_root_node(poagraph: graph.Poagraph,
                   blosum_path: Path,
                   output_dir: Path,
                   p: parameters.P) -> tree.AffinityNode:
    """Creates root node of the Affinity Tree.

    The node has assigned a consensus path (the consensus with id 0
    returned by poa for all sequences) and all sequences present in
    poagraph.

    Args:
        poagraph: Poagraph whose sequences form the root node.
        blosum_path: Path to the BLOSUM file handed over to poa.
        output_dir: Directory handed over to poa.
        p: Parameter forwarded to the compatibility calculation.

    Returns:
        Affinity node with id 0, holding every sequence of the poagraph,
        the compatibilities of all sequences to the root consensus and
        the minimum compatibility as computed by ``_get_min_comp``.

    Raises:
        AffinityTreeBuildException: if poa raises ``poa.NoConsensusError``.
    """

    detailed_logger.info("Getting the root affinity node...")
    all_poagraph_sequences_ids = poagraph.get_sequences_ids()
    try:
        consensus_paths = poa.get_consensuses(poagraph,
                                              all_poagraph_sequences_ids,
                                              output_dir,
                                              "root",
                                              blosum_path,
                                              hbmin=parameters.Hbmin(0),
                                              specific_consensuses_id=[0])
    except poa.NoConsensusError as err:
        # Chain explicitly so the traceback reports the poa failure as the
        # direct cause of the build error.
        raise AffinityTreeBuildException("Cannot find root consensus.") from err

    # Only consensus 0 was requested above, so it is the root consensus.
    root_consensus_path = consensus_paths[0].path
    compatibilities = poagraph.get_compatibilities(all_poagraph_sequences_ids,
                                                   root_consensus_path,
                                                   p=p)
    affinity_node = tree.AffinityNode(id_=tree.AffinityNodeID(0),
                                      sequences=[*poagraph.sequences.keys()],
                                      mincomp=_get_min_comp(all_poagraph_sequences_ids, compatibilities),
                                      compatibilities=compatibilities,
                                      consensus=root_consensus_path)
    detailed_logger.info(f"New affinity node created: {str(affinity_node)}")
    return affinity_node
Beispiel #2
0
def _get_children_nodes_looping(node: tree.AffinityNode,
                                poagraph: graph.Poagraph,
                                output_dir: Path,
                                blosum_path: Path,
                                p: parameters.P,
                                current_max_affinity_node_id: int) -> List[tree.AffinityNode]:
    """Generates children of given Affinity Tree node.

    Children are produced one at a time until every sequence of ``node``
    is assigned to some child. For each child a consensus is computed for
    the current candidate sequences, compatibilities of all still
    unassigned sequences to that consensus are calculated and the
    qualified sequences are selected. The step is repeated on the
    qualified sequences until the selection stops changing or the attempt
    limit is reached.

    Args:
        node: Affinity Tree node to be split.
        poagraph: Poagraph the node sequences come from.
        output_dir: Directory handed over to poa.
        blosum_path: Path to the BLOSUM file handed over to poa.
        p: Parameter forwarded to the compatibility calculation.
        current_max_affinity_node_id: Largest node id used so far; the
            k-th child gets id ``current_max_affinity_node_id + k``
            (k starts at 1).

    Returns:
        List of children nodes, in the order they were created.
    """

    # Number of refinement rounds after which the current selection is
    # accepted even though it has not stabilised.
    max_attempts = 10

    children_nodes: List[tree.AffinityNode] = []
    not_assigned_sequences_ids: List[msa.SequenceID] = node.sequences
    detailed_logger.info(f"""Getting children nodes for
                             affinity node {node.id_}...""")

    affinity_node_id = 0
    so_far_cutoffs: List[graph.Compatibility] = []
    # NOTE(review): if a child ends up with no qualified sequences the set of
    # unassigned sequences does not shrink - confirm that
    # _get_qualified_sequences_ids_and_cutoff always qualifies something.
    while not_assigned_sequences_ids:
        # One cutoff is stored per finished child, so this is the child index.
        child_index = len(so_far_cutoffs)
        detailed_logger.info(f"### Getting child {child_index}...")
        attempt = 0
        current_candidates = not_assigned_sequences_ids
        while True:
            consensus_candidate = poa.get_consensuses(
                poagraph,
                current_candidates,
                output_dir,
                f"parent_{node.id_}_child_{child_index}_attempt_{attempt}",
                blosum_path,
                parameters.Hbmin(0),
                specific_consensuses_id=[0])[0].path
            # Compatibilities are taken for ALL unassigned sequences, not only
            # for the candidates the consensus was built from.
            compatibilities_to_consensus_candidate = poagraph.get_compatibilities(
                sequences_ids=not_assigned_sequences_ids,
                consensus_path=consensus_candidate,
                p=p)
            # The parent's mincomp takes part in the cutoff search under an
            # artificial "parent" sequence id.
            compatibilities_to_consensus_candidate[msa.SequenceID("parent")] = node.mincomp
            # The cutoff value itself is not needed here.
            qualified_sequences_ids_candidates, _ = _get_qualified_sequences_ids_and_cutoff(
                compatibilities_to_max_c=compatibilities_to_consensus_candidate,
                so_far_cutoffs=so_far_cutoffs,
                splitted_node_id=node.id_)

            if qualified_sequences_ids_candidates == current_candidates or attempt == max_attempts:
                if attempt == max_attempts:
                    detailed_logger.info("Attempt treshold 10 exceeded!")
                affinity_node_id += 1

                affinity_node = tree.AffinityNode(
                    id_=tree.AffinityNodeID(current_max_affinity_node_id + affinity_node_id),
                    parent=node.id_,
                    sequences=qualified_sequences_ids_candidates,
                    mincomp=_get_min_comp(node_sequences_ids=qualified_sequences_ids_candidates,
                                          comps_to_consensus=compatibilities_to_consensus_candidate),
                    consensus=graph.SeqPath(consensus_candidate))
                children_nodes.append(affinity_node)
                not_assigned_sequences_ids = list(set(not_assigned_sequences_ids) - set(qualified_sequences_ids_candidates))
                so_far_cutoffs.append(affinity_node.mincomp)
                break

            # Selection changed: rebuild the consensus from the narrowed set.
            current_candidates = qualified_sequences_ids_candidates
            attempt += 1

    detailed_logger.info("Children nodes generated.")

    return children_nodes
def get_ebola_consensus_tree(p: float, stop: float, output_dir_name: str) -> Tuple[Poagraph, AffinityTree]:
    """Build the poagraph and affinity tree for the whole-genome Ebola data.

    Input files (MAF multialignment, CSV metadata, BLOSUM matrix) are
    located relative to the directory of this file. Output directories are
    obtained through ``pathtools.get_child_dir``.

    Args:
        p: Value wrapped in ``P`` and passed to the tree builder.
        stop: Value wrapped in ``Stop`` and passed to the tree builder.
        output_dir_name: Name of the output directory requested as a child
            of this file's directory.

    Returns:
        Tuple of the built poagraph and the result of
        ``atree_builders.get_affinity_tree``.
    """
    module_dir = Path(os.path.abspath(__file__)).resolve().parent

    output_dir_path = pathtools.get_child_dir(module_dir, output_dir_name)
    consensus_output_dir = pathtools.get_child_dir(output_dir_path, "consensus")

    multialignment_path = module_dir / "../data/Ebola/genome_whole/input/multialignment.maf"
    metadata_path = module_dir / "../data/Ebola/genome_whole/input/metadata.csv"
    blosum_path = module_dir / "../bin/blosum80.mat"

    fasta_provider = fp_ncbi.FromNCBI(use_cache=True)

    multialignment = Maf(
        file_content=pathtools.get_file_content_stringio(multialignment_path),
        filename=multialignment_path)
    metadata = MetadataCSV(
        filecontent=pathtools.get_file_content_stringio(metadata_path),
        filename=metadata_path)

    # The second element returned by build_from_dagmaf is not needed here.
    poagraph, _ = Poagraph.build_from_dagmaf(multialignment, fasta_provider, metadata)

    blosum = Blosum(pathtools.get_file_content_stringio(path=blosum_path), blosum_path)

    # NOTE(review): the trailing False is presumably the verbose switch -
    # confirm against atree_builders.get_affinity_tree.
    affinity_tree = atree_builders.get_affinity_tree(poagraph,
                                                     blosum,
                                                     consensus_output_dir,
                                                     Stop(stop),
                                                     P(p),
                                                     False)
    return poagraph, affinity_tree
def get_ebola_affinity_tree(
        p: float, stop: float,
        output_dir_name: str) -> Tuple[Poagraph, AffinityTree]:
    """Build the poagraph and affinity tree for the Ebola data set.

    Input files (MAF multialignment, CSV metadata, BLOSUM matrix) are
    located relative to the directory of this file. Output directories are
    obtained through ``pathtools.get_child_dir``.

    Args:
        p: Value wrapped in ``P`` and passed to the tree builder.
        stop: Value wrapped in ``Stop`` and passed to the tree builder.
        output_dir_name: Name of the output directory requested as a child
            of this file's directory.

    Returns:
        Tuple of the built poagraph and the result of
        ``atree_builders.get_affinity_tree``.
    """
    module_dir = Path(os.path.abspath(__file__)).resolve().parent

    output_dir_path = pathtools.get_child_dir(module_dir, output_dir_name)
    consensus_output_dir = pathtools.get_child_dir(output_dir_path, "consensus")

    multialignment_path = module_dir / "../data/Ebola/multialignment.maf"
    metadata_path = module_dir / "../data/Ebola/metadata.csv"
    blosum_path = module_dir / "../bin/blosum80.mat"

    # NOTE(review): this object is never read afterwards; it is still
    # constructed because the constructor may validate its arguments -
    # confirm before removing.
    _task_parameters = TaskParameters(
        running_time="",
        multialignment_file_path=multialignment_path,
        multialignment_format="MAF",
        datatype="N",
        metadata_file_path=metadata_path,
        blosum_file_path=blosum_path,
        output_path=output_dir_path,
        output_po=False,
        output_fasta=False,
        output_with_nodes=False,
        verbose=False,
        raw_maf=False,
        fasta_provider='FromNCBI',
        cache=True,
        missing_base_symbol="",
        fasta_source_file=None,
        consensus_type="",
        hbmin=0.8,
        stop=stop,
        p=p,
    )

    fasta_provider = fp_ncbi.FromNCBI(use_cache=True)

    multialignment = Maf(
        file_content=pathtools.get_file_content_stringio(multialignment_path),
        filename=multialignment_path)
    metadata = MetadataCSV(
        filecontent=pathtools.get_file_content_stringio(metadata_path),
        filename=metadata_path)

    # The second element returned by build_from_dagmaf is not needed here.
    poagraph, _ = Poagraph.build_from_dagmaf(multialignment,
                                             fasta_provider,
                                             metadata)

    blosum = Blosum(pathtools.get_file_content_stringio(path=blosum_path),
                    blosum_path)

    # NOTE(review): the trailing False is presumably the verbose switch -
    # confirm against atree_builders.get_affinity_tree.
    affinity_tree = atree_builders.get_affinity_tree(poagraph, blosum,
                                                     consensus_output_dir,
                                                     Stop(stop), P(p), False)
    return poagraph, affinity_tree
Beispiel #5
0
def _convert_to_po_input_data(p: graph.Poagraph) -> \
        Tuple[List[NodePO], List[SequencePO]]:
    """Convert a poagraph into the node and sequence records of PO data.

    One ``NodePO`` is created per poagraph node. One ``SequencePO`` is
    created per sequence that has at least one node; sequences without
    nodes are skipped and do not consume an integer id. Every node records
    the integer ids of the sequences passing through it and the ids of the
    nodes that directly precede it on any path.

    Args:
        p: Poagraph to convert.

    Returns:
        Tuple ``(po_nodes, po_sequences)``.
    """
    weights_by_sequence = p.get_sequences_weights(p.get_sequences_ids())

    po_nodes = [
        NodePO(base=graph_node._base.value,
               aligned_to=graph_node.aligned_to,
               in_nodes=set(),
               sequences_ids=[])
        for graph_node in p.nodes
    ]
    po_sequences = []

    for seq_id, sequence in p.sequences.items():
        nodes_count = sum(len(path) for path in sequence.paths)
        if nodes_count == 0:
            continue

        # Integer id of a sequence is its position in po_sequences.
        seq_int_id = len(po_sequences)
        po_sequences.append(
            SequencePO(name=str(seq_id),
                       nodes_count=nodes_count,
                       weight=weights_by_sequence[seq_id],
                       consensus_id=-1,
                       start_node_id=sequence.paths[0][0]))

        for path in sequence.paths:
            # Predecessors are tracked per path; the first node of a path
            # gets no incoming node from this path.
            predecessor_id = None
            for node_id in path:
                po_node = po_nodes[node_id]
                po_node.sequences_ids.append(seq_int_id)
                if predecessor_id is not None:
                    po_node.in_nodes.add(predecessor_id)
                predecessor_id = node_id

    # Sets were used only to deduplicate; the result carries lists.
    for po_node in po_nodes:
        po_node.in_nodes = list(po_node.in_nodes)

    return po_nodes, po_sequences
Beispiel #6
0
def poagraph_to_fasta(poagraph: graph.Poagraph) -> str:
    """Converts poagraph to FASTA format.

    Every sequence with at least one node contributes a header line
    (``>`` followed by the sequence id) and one line with the bases of
    all its paths concatenated in order. Sequences without nodes are
    omitted.

    Args:
        poagraph: Poagraph to be converted.

    Returns:
        Fasta formatted string ready to be saved to file. Lines are
        joined with a newline; there is no trailing newline.
    """

    fasta_lines = []
    for seq_id, sequence in poagraph.sequences.items():
        if poagraph.get_sequence_nodes_count(seq_id) == 0:
            continue

        bases = []
        for path in sequence.paths:
            for node_id in path:
                bases.append(poagraph.nodes[node_id].get_base())

        fasta_lines.extend((f">{seq_id}", "".join(bases)))

    return "\n".join(fasta_lines)
Beispiel #7
0
def build_poa_affinity_tree(p: graph.Poagraph,
                            blosum: Optional[parameters.Blosum],
                            output_dir: Path,
                            hbmin: parameters.Hbmin,
                            verbose: bool) -> tree.AffinityTree:
    """Builds Affinity Tree coherent with poa software.

    This method builds a simple version of Affinity Tree
    as it uses a single call to poa software. Poa provides
    division of sequences in Poagraph into consistent groups
    with a consensus path assigned to each group. These groups
    are converted in this method to Affinity Tree nodes and
    connected with a dummy root node so the result is coherent
    with pangtree definition of Affinity Tree.

    Args:
        p: Poagraph containing sequences to be divided into
            groups (Affinity Tree nodes).
        optional blosum: BLOSUM matrix. If not provided, default Blosum80.mat is used.
        output_dir: Path to a directory that can be used by poa software.
        hbmin: Parameter required by poa software. The minimum value of
            sequence compatibility to generated consensus.
        verbose: Switch to control logging intensity. Not read by this
            function.

    Raises:
        AffinityTreeBuildException: if consensuses cannot be found.

    Returns:
        Affinity Tree made of a root node (id 0), one node per consensus
        returned by poa and one extra node collecting the sequences that
        were not assigned to any consensus.
    """
    def _convert_consensus_paths_to_affinity_tree_nodes(consensus_paths):
        """Turns poa consensuses into children nodes of the dummy root."""
        at_nodes = []
        # Sets give constant-time membership tests; sequence ids are already
        # used as dictionary keys, so they are hashable.
        assigned_sequences = set()
        # Loop-invariant: the same sequence ids are used for every consensus.
        all_seq = p.get_sequences_ids()
        for c_id, c_info in consensus_paths.items():
            assigned_to_consensus = set(c_info.assigned_sequences_ids)
            assigned_sequences |= assigned_to_consensus
            compatibilities = p.get_compatibilities(all_seq, c_info.path)
            if assigned_to_consensus:
                mincomp = min(c
                              for seq_id, c in compatibilities.items()
                              if seq_id in assigned_to_consensus)
            else:
                # Same zero value as used for the node of unassigned
                # sequences below, instead of a bare int.
                mincomp = graph.Compatibility(0)
            # Node id 0 is reserved for the root, hence the shift by one.
            new_node = tree.AffinityNode(id_=tree.AffinityNodeID(c_id + 1),
                                         parent=tree.AffinityNodeID(0),
                                         sequences=c_info.assigned_sequences_ids,
                                         mincomp=mincomp,
                                         compatibilities=compatibilities,
                                         consensus=c_info.path,
                                         children=[])
            at_nodes.append(new_node)

        # This node is always added, even when every sequence was assigned.
        node_for_unassigned_sequences = tree.AffinityNode(parent=tree.AffinityNodeID(0),
                                                          sequences=[seq_id
                                                                     for seq_id in all_seq
                                                                     if seq_id not in assigned_sequences],
                                                          id_=tree.AffinityNodeID(len(at_nodes) + 1),
                                                          mincomp=graph.Compatibility(0),
                                                          children=[])
        at_nodes.append(node_for_unassigned_sequences)
        return at_nodes

    global_logger.info("POA defined affinity tree generation started.")
    if blosum is None:
        blosum = get_default_blosum()
    _raise_error_if_invalid_poagraph(p)
    try:
        consensus_paths = poa.get_consensuses(p,
                                              p.get_sequences_ids(),
                                              output_dir,
                                              "poa_tree",
                                              blosum.filepath,
                                              hbmin)
    except poa.NoConsensusError as err:
        # Chain explicitly so the traceback reports the poa failure as the
        # direct cause of the build error.
        raise AffinityTreeBuildException("No consensus in the Affinity Tree.") from err

    consensus_nodes = _convert_consensus_paths_to_affinity_tree_nodes(consensus_paths)
    root_node = tree.AffinityNode(id_=tree.AffinityNodeID(0),
                                  children=[c_node.id_
                                            for c_node in consensus_nodes])
    affinity_tree = tree.AffinityTree([root_node] + consensus_nodes)
    global_logger.info("POA defined affinity tree generation finished.")
    return affinity_tree
Beispiel #8
0
def build_affinity_tree(poagraph: graph.Poagraph,
                        blosum: Optional[parameters.Blosum],
                        output_dir: Path,
                        stop: parameters.Stop,
                        p: parameters.P,
                        verbose: bool) -> tree.AffinityTree:
    """Builds Affinity Tree.

    Affinity Tree is defined in paper 'Getting insight into the
    pan-genome structure with Pangtree'. This method builds
    an Affinity Tree by iterative calls to poa software.
    Full algorithm and idea are described in the above-mentioned paper.

    Args:
        poagraph: Poagraph containing _sequences to be divided into groups
            (Affinity Tree nodes).
        optional blosum: BLOSUM matrix. If not provided, default Blosum80.mat is used.
        output_dir: Path to a directory that can be used by poa software.
        stop: Value of mincomp above which an affinity tree node is no more
            split.
        p: Value changing the linear meaning of compatibility when searching
            for cutoff.
        verbose: Switch to control logging intensity. When set, a file
            handler writing "tresholds.csv" in output_dir is registered.

    Raises:
        AffinityTreeBuildException: if the root consensus cannot be found.

    Returns:
        Affinity Tree generated with Pangtree algorithm.
    """

    global_logger.info("Affinity Tree generation started.")
    if blosum is None:
        blosum = get_default_blosum()
    if verbose:
        logprocess.add_file_handler_to_logger(output_dir,
                                              "tresholdsCSV",
                                              "tresholds.csv",
                                              "%(message)s", False)
    _raise_error_if_invalid_poagraph(poagraph)

    root_node = _get_root_node(poagraph, blosum.filepath, output_dir, p)
    affinity_tree = tree.AffinityTree([root_node])

    # Loop-invariant: every child is compared against all poagraph sequences.
    all_sequences = [*poagraph.sequences.keys()]

    # pop() takes from the right end, so nodes are processed depth-first.
    nodes_to_process = deque([affinity_tree.get_node(tree.AffinityNodeID(0))])
    while nodes_to_process:
        node = nodes_to_process.pop()

        children_nodes = _get_children_nodes_looping(node,
                                                     poagraph,
                                                     output_dir,
                                                     blosum.filepath,
                                                     p,
                                                     affinity_tree.get_max_node_id())
        if len(children_nodes) == 1:
            # A single child means the node was not divided; the child is
            # discarded and the node stays a leaf.
            continue

        for child in children_nodes:
            child.compatibilities = poagraph.get_compatibilities(sequences_ids=all_sequences,
                                                                 consensus_path=child.consensus,
                                                                 p=p)
            node.children.append(child.id_)
            affinity_tree.nodes.append(child)
            if not _node_is_ready(child, stop):
                nodes_to_process.append(child)
    global_logger.info("Affinity Tree generation finished.\n")
    return affinity_tree