Ejemplo n.º 1
0
    def test_1_download_sequence_and_save_to_cache(self):
        """Requesting a base with caching enabled stores the FASTA in cache.

        Verifies three things after ``get_base`` is called on a provider
        created with ``use_cache=True``: the ``.fastacache`` directory exists
        under the current working directory, it contains
        ``<sequence_id>.fasta``, and that file's content equals the control
        file kept next to this test module.
        """
        cache_dir = pathtools.get_child_path(Path.cwd(), ".fastacache")
        # Start without any cache so its creation can be observed below.
        if cache_dir.exists():
            shutil.rmtree(cache_dir)

        provider = missings.FromNCBI(use_cache=True)
        sequence_id = msa.SequenceID("AB050936v1")

        # Only the side effect on the cache matters; the base is discarded.
        provider.get_base(sequence_id, 0)

        # cache directory creation
        self.assertTrue(cache_dir.exists())

        # file creation
        cache_entries = list(cache_dir.glob("*"))
        cached_fasta_path = pathtools.get_child_path(cache_dir,
                                                     f"{sequence_id}.fasta")
        self.assertTrue(cached_fasta_path in cache_entries)

        # file content
        control_fasta_path = Path(__file__).parent.joinpath(
            'fasta_ncbi/AB050936.1.fasta').resolve()

        with open(control_fasta_path) as control_handle:
            expected_content = control_handle.read()
        with open(cached_fasta_path) as cached_handle:
            actual_content = cached_handle.read()
        self.assertEqual(expected_content, actual_content)
Ejemplo n.º 2
0
def get_consensuses(
    poagraph: Poagraph,
    sequences_ids: List[SequenceID],
    output_dir: Path,
    job_name: str,
    blosum_path: Path,
    hbmin: Hbmin,
    specific_consensuses_id: Optional[List[int]] = None
) -> Dict[int, ConsInfo]:
    """Runs poa on the given poagraph and returns the consensus paths.

    The poagraph is written to ``<job_name>_in_pangenome.po`` inside
    ``output_dir``, poa is asked to write ``<job_name>_out_pangenome.po``
    there, and both files are removed again before returning, also when
    running poa or reading its output fails.

    Args:
        poagraph: Poagraph passed to the PO translator.
        sequences_ids: IDs of the sequences passed to the PO translator.
        output_dir: Directory holding the temporary poa input/output files.
        job_name: Prefix of the temporary file names.
        blosum_path: Path to the Blosum file; resolved before use.
        hbmin: Its ``value`` attribute is passed to poa as hbmin.
        specific_consensuses_id: Forwarded to ``read_consensus_paths``.

    Returns:
        Whatever ``read_consensus_paths`` produces for the poa output lines.
    """
    poa_input_path = pathtools.get_child_path(output_dir,
                                              f"{job_name}_in_pangenome.po")
    poa_output_path = pathtools.get_child_path(output_dir,
                                               f"{job_name}_out_pangenome.po")

    s = PoagraphPOTranslator(poagraph, sequences_ids)
    poa_input_content = s.get_input_po_content()
    try:
        with open(poa_input_path, 'w') as poa_input:
            poa_input.write(poa_input_content)
        call(po_file_path=poa_input_path,
             hb_file_path=poa_output_path,
             blosum_path=blosum_path.resolve(),
             hbmin=hbmin.value)
        with open(poa_output_path) as poa_output:
            poa_output_lines = poa_output.readlines()
    finally:
        # Never leave temporary files behind; the output file may be missing
        # when poa failed, so only remove what actually exists.
        for temporary_path in (poa_input_path, poa_output_path):
            if os.path.exists(temporary_path):
                os.remove(temporary_path)
    return s.read_consensus_paths(poa_output_lines, specific_consensuses_id)
Ejemplo n.º 3
0
def get_default_output_dir():
    """Creates and returns ``<cwd>/output/output_<current time>``.

    Both the ``output`` directory and the timestamped directory inside it
    are created through ``pathtools.create_dir``.

    Returns:
        Path of the timestamped directory.
    """

    base_dir = pathtools.get_child_path(pathtools.get_cwd(), "output")
    pathtools.create_dir(base_dir)

    timestamp = pathtools.get_current_time()
    run_dir = pathtools.get_child_path(base_dir, "output_" + timestamp)
    pathtools.create_dir(run_dir)
    return run_dir
Ejemplo n.º 4
0
def run_pangtree(maf_path: Path, fasta_path: Path, output_dir: Path,
                 po_output: bool) -> None:
    """Builds an affinity tree for every value in ``p_values`` and saves it.

    A child directory named after the current time is created in
    ``output_dir``. The poagraph is built once from the MAF file; then, for
    each ``p`` in ``p_values``, a sub-directory (``p`` with dots replaced by
    underscores) receives ``affinity_tree.newick``, ``pangenome.json`` and,
    when ``po_output`` is true, ``poagraph.po``.

    NOTE(review): ``p_values`` is not defined in this function; it is
    expected to exist at module level.

    Args:
        maf_path: Path to the MAF multialignment file.
        fasta_path: Path to the FASTA file given to ``missings.FromFile``.
        output_dir: Parent directory for the timestamped result directory.
        po_output: Whether to also save the poagraph in PO format.
    """
    output_dir = pathtools.get_child_dir(output_dir,
                                         pathtools.get_current_time())
    print(f"Runing pangtree for maf: {maf_path} and fasta: {fasta_path} "
          f"Output in: {output_dir}, include po file: {po_output}.")

    fasta_provider = missings.FromFile(fasta_path)
    maf_content = pathtools.get_file_content_stringio(maf_path)
    maf = msa.Maf(maf_content, maf_path)
    poagraph, dagmaf = builder.build_from_dagmaf(maf, fasta_provider)

    for p in p_values:
        # One result directory per p value, e.g. 0.25 -> "0_25".
        p_dir_name = str(p).replace(".", "_")
        current_output_dir = pathtools.get_child_dir(output_dir, p_dir_name)

        stop = at_params.Stop(0.99)
        at = at_builders.build_affinity_tree(poagraph, None,
                                             current_output_dir, stop,
                                             at_params.P(p), True)

        newick_path = pathtools.get_child_path(current_output_dir,
                                               "affinity_tree.newick")
        pathtools.save_to_file(at.as_newick(None, separate_leaves=True),
                               newick_path)

        if po_output:
            po_path = pathtools.get_child_path(current_output_dir,
                                               "poagraph.po")
            pathtools.save_to_file(po.poagraph_to_PangenomePO(poagraph),
                                   po_path)

        task_params = json.TaskParameters(
            multialignment_file_path=str(maf_path),
            multialignment_format="maf",
            datatype="nucleotides",
            blosum_file_path="",
            output_path=current_output_dir,
            fasta_provider=fasta_provider,
            fasta_source_file=fasta_path,
            consensus_type="tree",
            stop=str(stop),
            p=str(p),
            output_with_nodes=False)
        pangenomejson = json.to_PangenomeJSON(task_parameters=task_params,
                                              poagraph=poagraph,
                                              dagmaf=dagmaf,
                                              affinity_tree=at)

        json_path = pathtools.get_child_path(current_output_dir,
                                             "pangenome.json")
        pathtools.save_to_file(json.to_json(pangenomejson), json_path)
Ejemplo n.º 5
0
def get_consensuses(poagraph: graph.Poagraph,
                    sequences_ids: List[msa.SequenceID],
                    output_dir: Path,
                    job_name: str,
                    blosum_path: Path,
                    hbmin: parameters.Hbmin,
                    specific_consensuses_id: Optional[List[int]] = None) -> \
                        Dict[int, ConsInfo]:
    """Calls poa software on given Poagraph to get consensus paths.

    The temporary poa input and output files created in output_dir are
    removed before returning, also when poa or reading its output fails.

    Args:
        poagraph: Poagraph used as input to poa software. It may be cropped by
            using sequences_ids argument.
        sequences_ids: IDs of the _sequences that should be kept in poagraph
            being input to poa.
        output_dir: Full path to the directory used by poa software as
            temporary storage place.
        job_name: Name of the task used to label produced file names.
        blosum_path: Full path to the Blosum file used as poa's input.
        hbmin: Hbmin value used as poa's input.
        specific_consensuses_id: Poa returns consensuses numbered by: 0, 1...
            By this argument it can be specified which should be returned.

    Returns:
        Dictionary of consensus numbers and corresponding information as
            ConsInfo object.

    Raises:
        NoConsensusError: If no consensus was found for given Poagraph and
            set of selected _sequences.
    """
    poa_input_path = pathtools.get_child_path(output_dir,
                                              f"{job_name}_in_pangenome.po")
    poa_output_path = pathtools.get_child_path(output_dir,
                                               f"{job_name}_out_pangenome.po")

    s = _PoagraphPOTranslator(poagraph, sequences_ids)
    poa_input_content = s.get_input_po_content()
    try:
        with open(poa_input_path, 'w') as poa_input:
            poa_input.write(poa_input_content)
        _call_poa(po_file_path=poa_input_path,
                  hb_file_path=poa_output_path,
                  blosum_path=blosum_path.resolve(),
                  hbmin=hbmin.value)
        with open(poa_output_path) as poa_output:
            poa_output_lines = poa_output.readlines()
    finally:
        # Never leave temporary files behind; the output file may be missing
        # when poa failed, so only remove what actually exists.
        for temporary_path in (poa_input_path, poa_output_path):
            if os.path.exists(temporary_path):
                os.remove(temporary_path)
    consensus_paths = s.read_consensus_paths(poa_output_lines,
                                             specific_consensuses_id)
    return consensus_paths
Ejemplo n.º 6
0
def get_default_blosum():
    """Builds the default Blosum object from the bundled blosum80.mat.

    The file is looked up as ``affinity_tree/bin/blosum80.mat`` relative to
    the directory two levels above this module file.
    """

    package_root = Path(__file__).parents[1]
    blosum_file = pathtools.get_child_path(package_root,
                                           "affinity_tree/bin/blosum80.mat")
    content = pathtools.get_file_content_stringio(blosum_file)
    return at_params.Blosum(content, blosum_file)
Ejemplo n.º 7
0
    def test_2_read_seqeunce_from_cache_instead_downloading(self):
        """A sequence already present in the cache is served from the cache.

        A fake ``seq1.fasta`` holding the sequence ``foo`` is placed in a
        freshly created ``.fastacache`` directory; the base at position 2
        returned by the provider must then be ``o``.
        """
        # The provider is created before the cache directory is reset,
        # exactly as the scenario under test requires.
        provider = missings.FromNCBI(use_cache=True)
        cache_dir = pathtools.get_child_path(Path.cwd(), ".fastacache")
        if cache_dir.exists():
            shutil.rmtree(cache_dir)
        cache_dir.mkdir()

        sequence_id = msa.SequenceID("seq1")
        fake_sequence = "foo"
        expected_base = graph.Base("o")

        cached_fasta_path = pathtools.get_child_path(cache_dir,
                                                     f"{sequence_id}.fasta")
        with open(cached_fasta_path, 'w') as cached_fasta:
            cached_fasta.write(f">{sequence_id} cached\n{fake_sequence}")

        self.assertEqual(expected_base, provider.get_base(sequence_id, 2))
Ejemplo n.º 8
0
def call(po_file_path: Path, hb_file_path: Path, blosum_path: Path,
         hbmin: float) -> None:
    """Runs the poa executable and logs what it wrote to stderr.

    The command is passed to ``subprocess.run`` as an argument list without
    a shell, so paths containing spaces or shell metacharacters are handed
    to poa unchanged.

    Args:
        po_file_path: Path to the PO file given to poa via ``-read_msa``.
        hb_file_path: Path given to poa via ``-po`` as its output file.
        blosum_path: Path to the Blosum file.
        hbmin: Value given to poa via ``-hbmin``.

    Raises:
        OSError: If the poa executable cannot be started.
    """
    poa_path = pathtools.get_child_path(Path(os.path.abspath(__file__)),
                                        '../../../bin/poa').resolve()
    detailed_logger.info(
        f"Run poa! Input: {po_file_path} Output: {hb_file_path}...")
    command = [
        str(poa_path),
        "-read_msa", str(po_file_path),
        "-hb",
        "-po", str(hb_file_path),
        str(blosum_path),
        "-hbmin", str(hbmin),
    ]
    poa_result = subprocess.run(command, stderr=subprocess.PIPE)
    # Logging must not fail because poa printed a non-ASCII byte.
    poa_str_output = poa_result.stderr.decode("ASCII", errors="replace")
    detailed_logger.info(f"Poa output: {poa_str_output}")
Ejemplo n.º 9
0
def _call_poa(po_file_path: Path, hb_file_path: Path, blosum_path: Path,
              hbmin: float) -> None:
    """Calls poa software.

    The command is passed to ``subprocess.run`` as an argument list without
    a shell, so paths containing spaces or shell metacharacters are handed
    to poa unchanged. Whatever poa writes to stderr is logged.

    Args:
        po_file_path: Path to the PO file containing Poagraph.
        hb_file_path: Path to the PO file where the poa result will be saved.
        blosum_path: Path to the Blosum file.
        hbmin: Hbmin value used as poa input.

    Returns:
        Nothing, as the result is stored in hb_file_path.

    Raises:
        OSError: If the poa executable cannot be started.
    """
    affinity_tree_dir = Path(__file__).parent
    poa_path = pathtools.get_child_path(affinity_tree_dir, "bin/poa")
    detailed_logger.info(
        f"Run poa! Input: {po_file_path} Output: {hb_file_path}...")
    command = [
        str(poa_path),
        "-read_msa", str(po_file_path),
        "-hb",
        "-po", str(hb_file_path),
        str(blosum_path),
        "-hbmin", str(hbmin),
    ]
    poa_result = subprocess.run(command, stderr=subprocess.PIPE)
    # Logging must not fail because poa printed a non-ASCII byte.
    poa_str_output = poa_result.stderr.decode("ASCII", errors="replace")
    detailed_logger.info(f"Poa output: {poa_str_output}")
Ejemplo n.º 10
0
def add_file_handler_to_logger(
        outputdir: Path,
        logger_name: str,
        filename: str,
        handlerformat: str = "%(levelname)s - %(message)s",
        propagate: bool = True) -> None:
    """Attaches a file handler to the named logger.

    Args:
        outputdir: Directory in which the log file is placed.
        logger_name: Name of the logger that receives the handler.
        filename: Name of the log file inside outputdir.
        handlerformat: Format string for the log records.
        propagate: Value assigned to the logger's ``propagate`` attribute.
    """

    target_logger = logging.getLogger(logger_name)

    log_file_path = pathtools.get_child_path(outputdir, filename)
    file_handler = logging.FileHandler(log_file_path)
    file_handler.setFormatter(
        logging.Formatter(handlerformat, datefmt="%x-%X"))

    target_logger.propagate = propagate
    target_logger.addHandler(file_handler)
Ejemplo n.º 11
0
def get_default_blosum():
    """Builds the default Blosum object from the bundled blosum80.mat.

    The file is looked up as ``../../bin/blosum80.mat`` relative to the
    directory containing this module file.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    parent_dir = Path(module_dir + '/')
    blosum_file = pathtools.get_child_path(parent_dir,
                                           "../../bin/blosum80.mat")
    content = pathtools.get_file_content_stringio(blosum_file)
    return Blosum(content, blosum_file)
Ejemplo n.º 12
0
 def get_cached_filepath(self, seq_id: SequenceID) -> Path:
     """Returns the path ``<cache_dir>/<seq_id>.fasta`` for the sequence."""
     cache_dir = self.cache_dir
     cached_file_name = f"{seq_id}.fasta"
     return pathtools.get_child_path(cache_dir, cached_file_name)
Ejemplo n.º 13
0
 def __init__(self, parent_dir: Path):
     """Stores parent_dir and derives the ``.fastacache`` path inside it.

     Args:
         parent_dir: Directory under which the cache directory is located.
     """
     self.parent_dir = parent_dir
     cache_location = pathtools.get_child_path(parent_dir, ".fastacache")
     self.cache_dir = cache_location
Ejemplo n.º 14
0
                                                      P(p),
                                                      False)


def add_leaves(consensus_tree: AffinityTree):
    """Gives every multi-sequence leaf one child node per sequence.

    For each node that has no children but more than one sequence, a new
    ``AffinityNode`` holding a single sequence is created for every
    sequence of that node. New IDs continue after the current number of
    nodes. The tree is modified in place.

    Args:
        consensus_tree: Tree whose ``nodes`` list is extended.

    Returns:
        The same ``consensus_tree`` object that was passed in.
    """
    leaves = []
    first_free_id = len(consensus_tree.nodes)

    for node in consensus_tree.nodes:
        # Only childless nodes that group several sequences are expanded.
        if len(node.children) != 0 or len(node.sequences) <= 1:
            continue
        for seq_id in node.sequences:
            leaf_id = first_free_id + len(leaves)
            node.children.append(leaf_id)
            leaf = AffinityNode(id=leaf_id,
                                parent=node.id,
                                children=[],
                                sequences=[seq_id],
                                mincomp=Compatibility(1.0))
            leaves.append(leaf)

    # Appended only after the loop so the iteration above stays stable.
    consensus_tree.nodes.extend(leaves)
    return consensus_tree

# Obtain the poagraph and consensus tree from get_ebola_poa_tree.
ebola_poagraph, ebola_consensus_tree = get_ebola_poa_tree(
    hbmin=0.8,
    output_dir_name="output_ebola",
)

# Sequence ID -> metadata mapping handed to as_newick below.
seq_id_to_metadata = {
    sequence_id: sequence.seqmetadata
    for sequence_id, sequence in ebola_poagraph.sequences.items()
}
# Expand multi-sequence leaves before exporting the tree.
ebola_consensus_tree = add_leaves(ebola_consensus_tree)
newick_consensus_tree = ebola_consensus_tree.as_newick(seq_id_to_metadata)

pathtools.save_to_file(
    newick_consensus_tree,
    pathtools.get_child_path(Path("ebola_poa_tree_of_life"),
                             "consensus_tree_poa.newick"),
)


Ejemplo n.º 15
0
def main():
    """Command-line entry point: builds a poagraph and saves the results.

    Steps, all driven by the parsed command-line arguments:

    1. Configure logging (file handlers when verbose, everything disabled
       when quiet).
    2. Build the poagraph from a MAF (raw or DAG-based) or PO input.
    3. If a consensus algorithm was requested, build the consensus tree and
       save it as ``consensus_tree.newick``.
    4. Optionally save ``poagraph.po``, ``sequences.fasta`` and
       ``consensuses.fasta``.
    5. Always save ``pangenome.json``, including the running time.
    """
    parser = cli.get_parser()
    args = parser.parse_args()
    start = datetime.datetime.now()
    if not args.quiet and args.verbose:
        logprocess.add_file_handler_to_logger(args.output_dir,
                                              "details",
                                              "details.log",
                                              propagate=False)
        logprocess.add_file_handler_to_logger(args.output_dir,
                                              "",
                                              "details.log",
                                              propagate=False)
    if args.quiet:
        logprocess.disable_all_loggers()

    poagraph, dagmaf, fasta_provider = None, None, None
    if isinstance(args.multialignment, Maf) and args.raw_maf:
        poagraph = Poagraph.build_from_maf(args.multialignment, args.metadata)
    elif isinstance(args.multialignment, Maf) and not args.raw_maf:
        fasta_provider = cli.resolve_fasta_provider(args)
        poagraph, dagmaf = Poagraph.build_from_dagmaf(args.multialignment,
                                                      fasta_provider,
                                                      args.metadata)
    elif isinstance(args.multialignment, Po):
        poagraph = Poagraph.build_from_po(args.multialignment, args.metadata)

    consensus_tree = None
    if args.consensus is not None:
        blosum = args.blosum if args.blosum else cli.get_default_blosum()
        if fasta_provider is not None and isinstance(fasta_provider,
                                                     ConstSymbolProvider):
            blosum.check_if_symbol_is_present(
                fasta_provider.missing_symbol.as_str())

        consensus_output_dir = pathtools.get_child_dir(args.output_dir,
                                                       "consensus")

        if args.consensus == 'poa':
            consensus_tree = simple_tree_generator.get_simple_consensus_tree(
                poagraph, blosum, consensus_output_dir, args.hbmin,
                args.verbose)
        elif args.consensus == 'tree':
            max_strategy = cli.resolve_max_strategy(args)
            node_strategy = cli.resolve_node_strategy(args)
            consensus_tree = tree_generator.get_consensus_tree(
                poagraph, blosum, consensus_output_dir, args.stop, args.p,
                max_strategy, node_strategy, args.verbose)
        try:
            seq_id_to_name = {
                seq_id: seq.seqmetadata["name"]
                for seq_id, seq in poagraph.sequences.items()
            }
        except (AttributeError, KeyError, TypeError):
            # Best effort: sequence names are optional. Only failures of the
            # metadata lookup itself are tolerated, so that interrupts and
            # unrelated errors are no longer silently swallowed.
            seq_id_to_name = None

        newick_consensus_tree = consensus_tree.as_newick(seq_id_to_name)

        pathtools.save_to_file(
            newick_consensus_tree,
            pathtools.get_child_path(args.output_dir, "consensus_tree.newick"))

    if args.output_po:
        pangenome_po = poagraph_to_PangenomePO(poagraph)
        pathtools.save_to_file(
            pangenome_po,
            pathtools.get_child_path(args.output_dir, "poagraph.po"))

    if args.output_fasta:
        sequences_fasta = poagraph_to_fasta(poagraph)
        pathtools.save_to_file(
            sequences_fasta,
            pathtools.get_child_path(args.output_dir, "sequences.fasta"))
        if consensus_tree:
            consensuses_fasta = consensuses_tree_to_fasta(
                poagraph, consensus_tree)
            pathtools.save_to_file(
                consensuses_fasta,
                pathtools.get_child_path(args.output_dir, "consensuses.fasta"))

    end = datetime.datetime.now()
    pangenomejson = to_PangenomeJSON(task_parameters=cli.get_task_parameters(
        args, running_time=f"{end-start}s"),
                                     poagraph=poagraph,
                                     dagmaf=dagmaf,
                                     consensuses_tree=consensus_tree)

    pangenome_json_str = to_json(pangenomejson)
    pathtools.save_to_file(
        pangenome_json_str,
        pathtools.get_child_path(args.output_dir, "pangenome.json"))
Ejemplo n.º 16
0
def main():
    """Command-line entry point: builds a poagraph and saves the results.

    Steps, all driven by the parsed command-line arguments:

    1. Configure logging (file handlers when verbose, everything disabled
       when quiet).
    2. Build the poagraph from a MAF (raw or DAG-based) or PO input.
    3. If an affinity tree algorithm was requested, build the tree and save
       it as ``affinity_tree.newick`` in the ``affinitytree`` directory.
    4. Optionally save ``poagraph.po``, ``_sequences.fasta`` and
       ``affinitytree.fasta``.
    5. Always save ``pangenome.json``, including the running time.
    """
    args = cli.get_parser().parse_args()
    started_at = datetime.datetime.now()

    if args.verbose and not args.quiet:
        # Both the "details" logger and the root logger write to details.log.
        for logger_name in ("details", ""):
            logprocess.add_file_handler_to_logger(args.output_dir,
                                                  logger_name,
                                                  "details.log",
                                                  propagate=False)
    if args.quiet:
        logprocess.disable_all_loggers()

    poagraph = None
    dagmaf = None
    fasta_provider = None
    multialignment = args.multialignment
    if isinstance(multialignment, msa.Maf):
        if args.raw_maf:
            poagraph = builder.build_from_maf(multialignment, args.metadata)
        else:
            fasta_provider = cli.resolve_fasta_provider(args)
            poagraph, dagmaf = builder.build_from_dagmaf(
                multialignment, fasta_provider, args.metadata)
    elif isinstance(multialignment, msa.Po):
        poagraph = builder.build_from_po(multialignment, args.metadata)

    affinity_tree = None
    if args.affinity is not None:
        blosum = args.blosum or cli.get_default_blosum()
        if fasta_provider is not None:
            if isinstance(fasta_provider, missings.ConstBaseProvider):
                # The constant filler base must be known to the Blosum matrix.
                blosum.check_if_symbol_is_present(
                    fasta_provider.missing_base.as_str())

        consensus_output_dir = pathtools.get_child_dir(args.output_dir,
                                                       "affinitytree")

        if args.affinity == 'poa':
            affinity_tree = at_builders.build_poa_affinity_tree(
                poagraph, blosum, consensus_output_dir, args.hbmin,
                args.verbose)
        elif args.affinity == 'tree':
            affinity_tree = at_builders.build_affinity_tree(
                poagraph, blosum, consensus_output_dir, args.stop, args.p,
                args.verbose)

        seq_id_to_metadata = None
        if args.metadata is not None:
            seq_id_to_metadata = {
                sequence_id: sequence.seqmetadata
                for sequence_id, sequence in poagraph.sequences.items()
            }

        affinity_tree_newick = affinity_tree.as_newick(seq_id_to_metadata,
                                                       separate_leaves=True)
        newick_path = pathtools.get_child_path(consensus_output_dir,
                                               "affinity_tree.newick")
        pathtools.save_to_file(affinity_tree_newick, newick_path)

    if args.output_po:
        po_path = pathtools.get_child_path(args.output_dir, "poagraph.po")
        pathtools.save_to_file(po.poagraph_to_PangenomePO(poagraph), po_path)

    if args.output_fasta:
        sequences_path = pathtools.get_child_path(args.output_dir,
                                                  "_sequences.fasta")
        pathtools.save_to_file(fasta.poagraph_to_fasta(poagraph),
                               sequences_path)
        if affinity_tree:
            consensuses_fasta = fasta.affinity_tree_to_fasta(
                poagraph, affinity_tree)
            consensuses_path = pathtools.get_child_path(
                args.output_dir, "affinitytree.fasta")
            pathtools.save_to_file(consensuses_fasta, consensuses_path)

    elapsed = datetime.datetime.now() - started_at
    task_parameters = cli.get_task_parameters(args,
                                              running_time=f"{elapsed}s")
    pangenomejson = json.to_PangenomeJSON(task_parameters=task_parameters,
                                          poagraph=poagraph,
                                          dagmaf=dagmaf,
                                          affinity_tree=affinity_tree)

    json_path = pathtools.get_child_path(args.output_dir, "pangenome.json")
    pathtools.save_to_file(json.to_json(pangenomejson), json_path)