def test_1_download_sequence_and_save_to_cache(self):
    cache_dir_path = pathtools.get_child_path(Path.cwd(), ".fastacache")
    if cache_dir_path.exists():
        shutil.rmtree(cache_dir_path)

    ncbi_fasta_provider = missings.FromNCBI(use_cache=True)
    sequence_id = msa.SequenceID("AB050936v1")
    _ = ncbi_fasta_provider.get_base(sequence_id, 0)

    # cache directory creation
    cache_directory_created = cache_dir_path.exists()
    self.assertTrue(cache_directory_created)

    # file creation
    files_in_cache_directory = [*cache_dir_path.glob("*")]
    expected_filepath = pathtools.get_child_path(cache_dir_path,
                                                 f"{sequence_id}.fasta")
    file_created_in_cache = expected_filepath in files_in_cache_directory
    self.assertTrue(file_created_in_cache)

    # file content
    control_fasta_path = Path(__file__).parent.joinpath(
        'fasta_ncbi/AB050936.1.fasta').resolve()
    with open(control_fasta_path) as fasta_file_handler:
        expected_content = fasta_file_handler.read()
    with open(expected_filepath) as fasta_file_handler:
        actual_content = fasta_file_handler.read()
    self.assertEqual(expected_content, actual_content)
def get_consensuses(poagraph: Poagraph,
                    sequences_ids: List[SequenceID],
                    output_dir: Path,
                    job_name: str,
                    blosum_path: Path,
                    hbmin: Hbmin,
                    specific_consensuses_id: Optional[List[int]] = None) \
        -> Dict[int, ConsInfo]:
    poa_input_path = pathtools.get_child_path(output_dir,
                                              f"{job_name}_in_pangenome.po")
    poa_output_path = pathtools.get_child_path(output_dir,
                                               f"{job_name}_out_pangenome.po")

    s = PoagraphPOTranslator(poagraph, sequences_ids)
    poa_input_content = s.get_input_po_content()
    with open(poa_input_path, 'w') as poa_input:
        poa_input.write(poa_input_content)
    b_resolved = blosum_path.resolve()
    call(po_file_path=poa_input_path,
         hb_file_path=poa_output_path,
         blosum_path=b_resolved,
         hbmin=hbmin.value)
    with open(poa_output_path) as poa_output:
        poa_output_lines = poa_output.readlines()
    os.remove(poa_input_path)
    os.remove(poa_output_path)
    consensus_paths = s.read_consensus_paths(poa_output_lines,
                                             specific_consensuses_id)
    return consensus_paths
def get_default_output_dir():
    """Creates timestamped child dir under current working directory."""
    current_dir = pathtools.get_cwd()
    output_dir = pathtools.get_child_path(current_dir, "output")
    pathtools.create_dir(output_dir)
    current_time = pathtools.get_current_time()
    output_dir_name = "_".join(["output", current_time])
    output_dir_path = pathtools.get_child_path(output_dir, output_dir_name)
    pathtools.create_dir(output_dir_path)
    return output_dir_path
def run_pangtree(maf_path: Path,
                 fasta_path: Path,
                 output_dir: Path,
                 po_output: bool) -> None:
    output_dir = pathtools.get_child_dir(output_dir, pathtools.get_current_time())
    print(f"Running pangtree for maf: {maf_path} and fasta: {fasta_path}. "
          f"Output in: {output_dir}, include po file: {po_output}.")

    fasta_provider = missings.FromFile(fasta_path)
    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    poagraph, dagmaf = builder.build_from_dagmaf(maf, fasta_provider)

    for p in p_values:
        current_output_dir = pathtools.get_child_dir(output_dir,
                                                     str(p).replace(".", "_"))
        stop = at_params.Stop(0.99)
        at = at_builders.build_affinity_tree(poagraph,
                                             None,
                                             current_output_dir,
                                             stop,
                                             at_params.P(p),
                                             True)
        at_newick = at.as_newick(None, separate_leaves=True)
        pathtools.save_to_file(
            at_newick,
            pathtools.get_child_path(current_output_dir, "affinity_tree.newick"))

        if po_output:
            pangenome_po = po.poagraph_to_PangenomePO(poagraph)
            pathtools.save_to_file(
                pangenome_po,
                pathtools.get_child_path(current_output_dir, "poagraph.po"))

        task_params = json.TaskParameters(multialignment_file_path=str(maf_path),
                                          multialignment_format="maf",
                                          datatype="nucleotides",
                                          blosum_file_path="",
                                          output_path=current_output_dir,
                                          fasta_provider=fasta_provider,
                                          fasta_source_file=fasta_path,
                                          consensus_type="tree",
                                          stop=str(stop),
                                          p=str(p),
                                          output_with_nodes=False)
        pangenomejson = json.to_PangenomeJSON(task_parameters=task_params,
                                              poagraph=poagraph,
                                              dagmaf=dagmaf,
                                              affinity_tree=at)
        pangenome_json_str = json.to_json(pangenomejson)
        pathtools.save_to_file(
            pangenome_json_str,
            pathtools.get_child_path(current_output_dir, "pangenome.json"))
def get_consensuses(poagraph: graph.Poagraph,
                    sequences_ids: List[msa.SequenceID],
                    output_dir: Path,
                    job_name: str,
                    blosum_path: Path,
                    hbmin: parameters.Hbmin,
                    specific_consensuses_id: Optional[List[int]] = None) -> \
        Dict[int, ConsInfo]:
    """Calls poa software on given Poagraph to get consensus paths.

    Args:
        poagraph: Poagraph used as input to poa software. It may be cropped
            by using the sequences_ids argument.
        sequences_ids: IDs of the sequences that should be kept in the
            poagraph passed to poa.
        output_dir: Full path to the directory used by poa software as
            temporary storage place.
        job_name: Name of the task used to label produced file names.
        blosum_path: Full path to the Blosum file used as poa's input.
        hbmin: Hbmin value used as poa's input.
        specific_consensuses_id: Poa numbers the returned consensuses 0, 1, ...
            This argument specifies which of them should be returned.

    Returns:
        Dictionary of consensus numbers and corresponding information as
        ConsInfo objects.

    Raises:
        NoConsensusError: If no consensus was found for given Poagraph and
            set of selected sequences.
    """
    poa_input_path = pathtools.get_child_path(output_dir,
                                              f"{job_name}_in_pangenome.po")
    poa_output_path = pathtools.get_child_path(output_dir,
                                               f"{job_name}_out_pangenome.po")

    s = _PoagraphPOTranslator(poagraph, sequences_ids)
    poa_input_content = s.get_input_po_content()
    with open(poa_input_path, 'w') as poa_input:
        poa_input.write(poa_input_content)
    _call_poa(po_file_path=poa_input_path,
              hb_file_path=poa_output_path,
              blosum_path=blosum_path.resolve(),
              hbmin=hbmin.value)
    with open(poa_output_path) as poa_output:
        poa_output_lines = poa_output.readlines()
    os.remove(poa_input_path)
    os.remove(poa_output_path)
    consensus_paths = s.read_consensus_paths(poa_output_lines,
                                             specific_consensuses_id)
    return consensus_paths
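A minimal usage sketch of get_consensuses, assuming a Poagraph has already been built elsewhere and that the function and its parameter modules (msa, parameters) are importable from the surrounding package; the workdir, blosum path, and the 0.8 threshold below are placeholders, and consensuses_for_all_sequences is a hypothetical helper, not part of the library:

from pathlib import Path


def consensuses_for_all_sequences(poagraph, workdir: Path):
    # Keep every sequence of the (assumed, previously built) poagraph.
    sequences_ids = list(poagraph.sequences.keys())
    consensuses = get_consensuses(poagraph,
                                  sequences_ids,
                                  output_dir=workdir,
                                  job_name="example",
                                  blosum_path=Path("bin/blosum80.mat"),  # placeholder path
                                  hbmin=parameters.Hbmin(0.8))
    # Each entry maps a consensus number to its ConsInfo description.
    for consensus_id, cons_info in consensuses.items():
        print(consensus_id, cons_info)
    return consensuses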
def get_default_blosum():
    """Returns default blosum file: Blosum80.mat"""
    pangtreebuild_dir = Path(__file__).parent.parent
    default_blosum_path = pathtools.get_child_path(
        pangtreebuild_dir, "affinity_tree/bin/blosum80.mat")
    blosum_content = pathtools.get_file_content_stringio(default_blosum_path)
    return at_params.Blosum(blosum_content, default_blosum_path)
def test_2_read_sequence_from_cache_instead_of_downloading(self):
    fasta_provider = missings.FromNCBI(use_cache=True)

    cache_dir_path = pathtools.get_child_path(Path.cwd(), ".fastacache")
    if cache_dir_path.exists():
        shutil.rmtree(cache_dir_path)
    cache_dir_path.mkdir()

    sequence_id = msa.SequenceID("seq1")
    fake_sequence = "foo"
    expected_base = graph.Base("o")

    fake_fasta_path = pathtools.get_child_path(cache_dir_path,
                                               f"{sequence_id}.fasta")
    with open(fake_fasta_path, 'w') as fake_fasta_handler:
        fake_fasta_handler.write(f">{sequence_id} cached\n{fake_sequence}")

    actual_base = fasta_provider.get_base(sequence_id, 2)
    self.assertEqual(expected_base, actual_base)
def call(po_file_path: Path,
         hb_file_path: Path,
         blosum_path: Path,
         hbmin: float) -> None:
    poa_path = pathtools.get_child_path(Path(os.path.abspath(__file__)),
                                        '../../../bin/poa').resolve()
    detailed_logger.info(
        f"Run poa! Input: {po_file_path} Output: {hb_file_path}...")
    command = (f"{poa_path} -read_msa {po_file_path} -hb "
               f"-po {hb_file_path} {blosum_path} -hbmin {hbmin}")
    poa_result = subprocess.run(command, stderr=subprocess.PIPE, shell=True)
    poa_str_output = poa_result.stderr.decode("ASCII")
    detailed_logger.info(f"Poa output: {poa_str_output}")
def _call_poa(po_file_path: Path,
              hb_file_path: Path,
              blosum_path: Path,
              hbmin: float) -> None:
    """Calls poa software.

    Args:
        po_file_path: Path to the PO file containing Poagraph.
        hb_file_path: Path to the PO file where the poa result will be saved.
        blosum_path: Path to the Blosum file.
        hbmin: Hbmin value used as poa input.

    Returns:
        Nothing, as the result is stored in hb_file_path.
    """
    affinity_tree_dir = Path(__file__).parent
    poa_path = pathtools.get_child_path(affinity_tree_dir, "bin/poa")
    detailed_logger.info(
        f"Run poa! Input: {po_file_path} Output: {hb_file_path}...")
    command = (f"{poa_path} -read_msa {po_file_path} -hb "
               f"-po {hb_file_path} {blosum_path} -hbmin {hbmin}")
    poa_result = subprocess.run(command, stderr=subprocess.PIPE, shell=True)
    poa_str_output = poa_result.stderr.decode("ASCII")
    detailed_logger.info(f"Poa output: {poa_str_output}")
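A hypothetical invocation sketch of _call_poa, assuming the poa binary ships under the module's bin/ directory and that a PO input file already exists at the placeholder paths below; nothing here is part of the library itself:

from pathlib import Path

# Placeholder paths; the poa binary and the input PO file must exist.
_call_poa(po_file_path=Path("/tmp/example_in_pangenome.po"),
          hb_file_path=Path("/tmp/example_out_pangenome.po"),
          blosum_path=Path("bin/blosum80.mat").resolve(),
          hbmin=0.9)

# The consensus result is then read back from the output PO file.
with open("/tmp/example_out_pangenome.po") as hb_file:
    poa_output_lines = hb_file.readlines()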
def add_file_handler_to_logger(outputdir: Path,
                               logger_name: str,
                               filename: str,
                               handlerformat: str = "%(levelname)s - %(message)s",
                               propagate: bool = True) -> None:
    """Adds a new file handler to the given logger.

    Args:
        outputdir: Directory where the log file will be stored.
        logger_name: Name of the logger to modify.
        filename: Name of the log file.
        handlerformat: Log message format.
        propagate: Whether the logger should propagate records to its ancestors.
    """
    logger = logging.getLogger(logger_name)
    fh = logging.FileHandler(pathtools.get_child_path(outputdir, filename))
    ft = logging.Formatter(handlerformat, datefmt="%x-%X")
    fh.setFormatter(ft)
    logger.propagate = propagate
    logger.addHandler(fh)
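A minimal usage sketch, assuming add_file_handler_to_logger and pathtools are importable from the surrounding package; the output directory and the "details" logger name are placeholders (the latter mirrors how the CLI entry points call this helper):

import logging
from pathlib import Path

# Hypothetical output directory; it must exist before the FileHandler is created.
output_dir = Path("/tmp/pangtree_logs")
output_dir.mkdir(parents=True, exist_ok=True)

# Route the "details" logger to /tmp/pangtree_logs/details.log without
# propagating its records to the root logger.
add_file_handler_to_logger(output_dir, "details", "details.log", propagate=False)

details_logger = logging.getLogger("details")
details_logger.setLevel(logging.INFO)
details_logger.info("pipeline started")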
def get_default_blosum():
    """Returns default blosum file: Blosum80.mat"""
    parent_dir = Path(os.path.dirname(os.path.abspath(__file__)) + '/')
    default_blosum_path = pathtools.get_child_path(parent_dir,
                                                   "../../bin/blosum80.mat")
    blosum_content = pathtools.get_file_content_stringio(default_blosum_path)
    return Blosum(blosum_content, default_blosum_path)
def get_cached_filepath(self, seq_id: SequenceID) -> Path:
    return pathtools.get_child_path(self.cache_dir, f"{seq_id}.fasta")
def __init__(self, parent_dir: Path):
    self.parent_dir = parent_dir
    self.cache_dir = pathtools.get_child_path(parent_dir, ".fastacache")
def add_leaves(consensus_tree: AffinityTree):
    new_nodes = []
    for node in consensus_tree.nodes:
        if len(node.children) == 0 and len(node.sequences) > 1:
            for seq_id in node.sequences:
                consensus_node_id = len(consensus_tree.nodes) + len(new_nodes)
                node.children.append(consensus_node_id)
                new_nodes.append(AffinityNode(id=consensus_node_id,
                                              parent=node.id,
                                              children=[],
                                              sequences=[seq_id],
                                              mincomp=Compatibility(1.0)))
    consensus_tree.nodes.extend(new_nodes)
    return consensus_tree


ebola_poagraph, ebola_consensus_tree = get_ebola_poa_tree(hbmin=0.8,
                                                          output_dir_name="output_ebola")
seq_id_to_metadata = {seq_id: seq.seqmetadata
                      for seq_id, seq in ebola_poagraph.sequences.items()}
ebola_consensus_tree = add_leaves(ebola_consensus_tree)
newick_consensus_tree = ebola_consensus_tree.as_newick(seq_id_to_metadata)
pathtools.save_to_file(newick_consensus_tree,
                       pathtools.get_child_path(Path("ebola_poa_tree_of_life"),
                                                "consensus_tree_poa.newick"))
def main():
    parser = cli.get_parser()
    args = parser.parse_args()
    start = datetime.datetime.now()

    if not args.quiet and args.verbose:
        logprocess.add_file_handler_to_logger(args.output_dir,
                                              "details",
                                              "details.log",
                                              propagate=False)
        logprocess.add_file_handler_to_logger(args.output_dir,
                                              "",
                                              "details.log",
                                              propagate=False)
    if args.quiet:
        logprocess.disable_all_loggers()

    poagraph, dagmaf, fasta_provider = None, None, None
    if isinstance(args.multialignment, Maf) and args.raw_maf:
        poagraph = Poagraph.build_from_maf(args.multialignment, args.metadata)
    elif isinstance(args.multialignment, Maf) and not args.raw_maf:
        fasta_provider = cli.resolve_fasta_provider(args)
        poagraph, dagmaf = Poagraph.build_from_dagmaf(args.multialignment,
                                                      fasta_provider,
                                                      args.metadata)
    elif isinstance(args.multialignment, Po):
        poagraph = Poagraph.build_from_po(args.multialignment, args.metadata)

    consensus_tree = None
    if args.consensus is not None:
        blosum = args.blosum if args.blosum else cli.get_default_blosum()
        if fasta_provider is not None and isinstance(fasta_provider,
                                                     ConstSymbolProvider):
            blosum.check_if_symbol_is_present(
                fasta_provider.missing_symbol.as_str())

        consensus_output_dir = pathtools.get_child_dir(args.output_dir,
                                                       "consensus")
        if args.consensus == 'poa':
            consensus_tree = simple_tree_generator.get_simple_consensus_tree(
                poagraph,
                blosum,
                consensus_output_dir,
                args.hbmin,
                args.verbose)
        elif args.consensus == 'tree':
            max_strategy = cli.resolve_max_strategy(args)
            node_strategy = cli.resolve_node_strategy(args)
            consensus_tree = tree_generator.get_consensus_tree(
                poagraph,
                blosum,
                consensus_output_dir,
                args.stop,
                args.p,
                max_strategy,
                node_strategy,
                args.verbose)

        try:
            seq_id_to_name = {seq_id: seq.seqmetadata["name"]
                              for seq_id, seq in poagraph.sequences.items()}
        except:  # fall back when sequence metadata does not provide a name
            seq_id_to_name = None
        newick_consensus_tree = consensus_tree.as_newick(seq_id_to_name)

        pathtools.save_to_file(
            newick_consensus_tree,
            pathtools.get_child_path(args.output_dir, "consensus_tree.newick"))

    if args.output_po:
        pangenome_po = poagraph_to_PangenomePO(poagraph)
        pathtools.save_to_file(
            pangenome_po,
            pathtools.get_child_path(args.output_dir, "poagraph.po"))

    if args.output_fasta:
        sequences_fasta = poagraph_to_fasta(poagraph)
        pathtools.save_to_file(
            sequences_fasta,
            pathtools.get_child_path(args.output_dir, "sequences.fasta"))
        if consensus_tree:
            consensuses_fasta = consensuses_tree_to_fasta(poagraph,
                                                          consensus_tree)
            pathtools.save_to_file(
                consensuses_fasta,
                pathtools.get_child_path(args.output_dir, "consensuses.fasta"))

    end = datetime.datetime.now()
    pangenomejson = to_PangenomeJSON(
        task_parameters=cli.get_task_parameters(args,
                                                running_time=f"{end-start}s"),
        poagraph=poagraph,
        dagmaf=dagmaf,
        consensuses_tree=consensus_tree)
    pangenome_json_str = to_json(pangenomejson)
    pathtools.save_to_file(
        pangenome_json_str,
        pathtools.get_child_path(args.output_dir, "pangenome.json"))
def main():
    parser = cli.get_parser()
    args = parser.parse_args()
    start = datetime.datetime.now()

    if not args.quiet and args.verbose:
        logprocess.add_file_handler_to_logger(args.output_dir,
                                              "details",
                                              "details.log",
                                              propagate=False)
        logprocess.add_file_handler_to_logger(args.output_dir,
                                              "",
                                              "details.log",
                                              propagate=False)
    if args.quiet:
        logprocess.disable_all_loggers()

    poagraph, dagmaf, fasta_provider = None, None, None
    if isinstance(args.multialignment, msa.Maf) and args.raw_maf:
        poagraph = builder.build_from_maf(args.multialignment, args.metadata)
    elif isinstance(args.multialignment, msa.Maf) and not args.raw_maf:
        fasta_provider = cli.resolve_fasta_provider(args)
        poagraph, dagmaf = builder.build_from_dagmaf(args.multialignment,
                                                     fasta_provider,
                                                     args.metadata)
    elif isinstance(args.multialignment, msa.Po):
        poagraph = builder.build_from_po(args.multialignment, args.metadata)

    affinity_tree = None
    if args.affinity is not None:
        blosum = args.blosum if args.blosum else cli.get_default_blosum()
        if fasta_provider is not None and isinstance(
                fasta_provider, missings.ConstBaseProvider):
            blosum.check_if_symbol_is_present(
                fasta_provider.missing_base.as_str())

        consensus_output_dir = pathtools.get_child_dir(args.output_dir,
                                                       "affinitytree")
        if args.affinity == 'poa':
            affinity_tree = at_builders.build_poa_affinity_tree(
                poagraph,
                blosum,
                consensus_output_dir,
                args.hbmin,
                args.verbose)
        elif args.affinity == 'tree':
            affinity_tree = at_builders.build_affinity_tree(
                poagraph,
                blosum,
                consensus_output_dir,
                args.stop,
                args.p,
                args.verbose)

        if args.metadata is not None:
            seq_id_to_metadata = {seq_id: seq.seqmetadata
                                  for seq_id, seq in poagraph.sequences.items()}
        else:
            seq_id_to_metadata = None
        affinity_tree_newick = affinity_tree.as_newick(seq_id_to_metadata,
                                                       separate_leaves=True)

        pathtools.save_to_file(
            affinity_tree_newick,
            pathtools.get_child_path(consensus_output_dir,
                                     "affinity_tree.newick"))

    if args.output_po:
        pangenome_po = po.poagraph_to_PangenomePO(poagraph)
        pathtools.save_to_file(
            pangenome_po,
            pathtools.get_child_path(args.output_dir, "poagraph.po"))

    if args.output_fasta:
        sequences_fasta = fasta.poagraph_to_fasta(poagraph)
        pathtools.save_to_file(
            sequences_fasta,
            pathtools.get_child_path(args.output_dir, "_sequences.fasta"))
        if affinity_tree:
            consensuses_fasta = fasta.affinity_tree_to_fasta(poagraph,
                                                             affinity_tree)
            pathtools.save_to_file(
                consensuses_fasta,
                pathtools.get_child_path(args.output_dir, "affinitytree.fasta"))

    end = datetime.datetime.now()
    pangenomejson = json.to_PangenomeJSON(
        task_parameters=cli.get_task_parameters(args,
                                                running_time=f"{end-start}s"),
        poagraph=poagraph,
        dagmaf=dagmaf,
        affinity_tree=affinity_tree)
    pangenome_json_str = json.to_json(pangenomejson)
    pathtools.save_to_file(
        pangenome_json_str,
        pathtools.get_child_path(args.output_dir, "pangenome.json"))