def test_subpoagraph_should_omit_edges_2(self):
    # Three unaligned nodes; seq1 visits nodes 0 and 2, seq2 visits all three.
    nodes = [
        graph.Node(node_id=nid(index), base=b(base), aligned_to=None)
        for index, base in enumerate(['A', 'C', 'C'])
    ]
    node_ids_by_sequence = {'seq1': [0, 2], 'seq2': [0, 1, 2]}
    sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node) for node in node_ids])],
            graph.SequenceMetadata({'group': '1'}))
        for name, node_ids in node_ids_by_sequence.items()
    }

    # Only seq1 is handed to the translator.
    translator = poa._PoagraphPOTranslator(graph.Poagraph(nodes, sequences),
                                           [msa.SequenceID('seq1')])
    actual_po_content = translator.get_input_po_content()

    expected_po_content = ("VERSION=pangenome\n"
                           "NAME=pangenome\n"
                           "TITLE=pangenome\n"
                           "LENGTH=2\n"
                           "SOURCECOUNT=1\n"
                           "SOURCENAME=seq1\n"
                           "SOURCEINFO=2 0 100 -1 seq1\n"
                           "a:S0\n"
                           "c:L0S0")
    self.assertEqual(expected_po_content, actual_po_content)
def test_subpoagraph_construction_full_graph(self):
    # A single sequence (seq0) running through all five unaligned nodes.
    nodes = [
        graph.Node(node_id=nid(index), base=b(base), aligned_to=None)
        for index, base in enumerate(['A', 'A', 'C', 'A', 'T'])
    ]
    seq0_id = msa.SequenceID('seq0')
    seq0_path = graph.SeqPath([nid(node) for node in [0, 1, 2, 3, 4]])
    sequences = {
        seq0_id: graph.Sequence(msa.SequenceID('seq0'),
                                [seq0_path],
                                graph.SequenceMetadata({'group': '1'}))
    }

    translator = poa._PoagraphPOTranslator(graph.Poagraph(nodes, sequences),
                                           [msa.SequenceID('seq0')])
    actual_po_content = translator.get_input_po_content()

    expected_po_content = ("VERSION=pangenome\n"
                           "NAME=pangenome\n"
                           "TITLE=pangenome\n"
                           "LENGTH=5\n"
                           "SOURCECOUNT=1\n"
                           "SOURCENAME=seq0\n"
                           "SOURCEINFO=5 0 100 -1 seq0\n"
                           "a:S0\n"
                           "a:L0S0\n"
                           "c:L1S0\n"
                           "a:L2S0\n"
                           "t:L3S0")
    self.assertEqual(expected_po_content, actual_po_content)
def __init__(self):
    """Populate ``self.sources`` with fixed sequence strings keyed by sequence ID.

    seq1 and seq2 carry nucleotide strings; seq0, seq3 and seq4 map to
    empty strings.
    """
    raw_sources = [
        ("seq0", ""),
        ("seq1", "ACTAGGT"),
        ("seq2", "GGTCAGT"),
        ("seq3", ""),
        ("seq4", ""),
    ]
    self.sources = {
        msa.SequenceID(name): sequence for name, sequence in raw_sources
    }
def test_1_p_parameter_influence(self, p: at_params.P, expected_cutoff: graph.Compatibility):
    # Ten unaligned nodes with ids 0..9. The sequence paths and the
    # consensus path also reference ids 10..19; the number of ids a
    # sequence shares with the consensus shrinks from seq0 to seq4.
    nodes = [
        graph.Node(node_id=nid(index), base=b(base), aligned_to=None)
        for index, base in enumerate('TAGACACGTA')
    ]
    node_ids_by_sequence = {
        'seq0': [10, 11, 12, 13, 14, 15, 16, 17, 18, 9],
        'seq1': [10, 11, 12, 13, 14, 15, 16, 17, 8, 9],
        'seq2': [10, 11, 12, 13, 14, 15, 16, 7, 8, 9],
        'seq3': [10, 11, 12, 3, 4, 5, 6, 7, 8, 9],
        'seq4': [10, 11, 2, 3, 4, 5, 6, 7, 8, 9],
    }
    # The Sequence ID is derived from the dict key, so every entry is
    # self-consistent (the seq4 entry used to be built with ID 'seq3').
    sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([*map(nid, node_ids)])],
            graph.SequenceMetadata({}))
        for name, node_ids in node_ids_by_sequence.items()
    }
    poagraph = graph.Poagraph(nodes, sequences)

    consensus_path = graph.SeqPath(
        [*map(nid, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19])])
    compatibilities = poagraph.get_compatibilities(
        poagraph.get_sequences_ids(), consensus_path, p)

    actual_cutoff = at_builders._find_node_cutoff(
        list(compatibilities.values()), []).cutoff
    self.assertAlmostEqual(expected_cutoff.value, actual_cutoff.value)
def test_04_single_block_no_nucleotides(self):
    # Expected result: no nodes, and four sequences that have no paths.
    maf_path = self.maf_files_dir.joinpath(
        "test_4_single_block_no_nucleotides.maf")

    group_by_sequence = {'seq0': '1', 'seq1': '1', 'seq2': '2', 'seq3': '2'}
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [],
            graph.SequenceMetadata({'group': group}))
        for name, group in group_by_sequence.items()
    }
    expected_poagraph = graph.Poagraph([], expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)
    self.assertEqual(expected_poagraph, actual_poagraph)
def test_7_missing_one_reverted_sequence_middle_minus1_minus1(self):
    maf_path = self.maf_files_dir.joinpath(
        "test_7_missing_one_reverted_sequence_middle_minus1_minus1.maf")

    missing_symbol = self.missing_n.value
    # (base, id of the node it is aligned to or None); the position in the
    # list is the node id.
    node_specs = [
        # block 0
        ('A', None),
        ('C', None),
        ('T', None),
        ('A', None),
        # missing seq2
        (missing_symbol, None),
        (missing_symbol, None),
        # block 1
        ('A', 7),
        ('G', 6),
        ('C', 9),
        ('G', 8),
        ('C', 11),
        ('T', 10),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(index),
                   base=graph.Base(base),
                   aligned_to=None if aligned is None else nid(aligned))
        for index, (base, aligned) in enumerate(node_specs)
    ]

    # sequence name -> (group, list of paths given as node ids)
    sequence_specs = {
        'seq0': ('1', []),
        'seq1': ('1', [[0, 1, 2, 3, 7, 9, 11]]),
        'seq2': ('2', [[0, 1, 4, 5, 6, 8, 10]]),
        'seq3': ('2', []),
    }
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node) for node in path]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, (group, paths) in sequence_specs.items()
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(
        maf, missings.ConstBaseProvider(self.missing_n), self.metadatacsv)
    self.assertEqual(expected_poagraph, actual_poagraph)
def test_9_get_seqids(self):
    # The CSV fixture is expected to list the IDs s1, s2, s3 in this order.
    metadata_path = self.csv_files_dir.joinpath("test_1_correct.csv")
    csv_content = pathtools.get_file_content_stringio(metadata_path)
    metadata = msa.MetadataCSV(csv_content, metadata_path)

    expected_seqids = [msa.SequenceID(name) for name in ('s1', 's2', 's3')]
    self.assertEqual(expected_seqids, metadata.get_all_sequences_ids())
def test_05_single_block_single_nucletodide(self):
    # One node ('A', block 0) that all four sequences pass through.
    maf_path = self.maf_files_dir.joinpath(
        "test_5_single_block_single_nucletodide.maf")

    only_node = graph.Node(node_id=nid(0),
                           base=graph.Base('A'),
                           aligned_to=None,
                           block_id=bid(0))
    group_by_sequence = {'seq0': '1', 'seq1': '1', 'seq2': '2', 'seq3': '2'}
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node) for node in [0]])],
            graph.SequenceMetadata({'group': group}))
        for name, group in group_by_sequence.items()
    }
    expected_poagraph = graph.Poagraph([only_node], expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)
    self.assertEqual(expected_poagraph, actual_poagraph)
def test_2_three_sequences_in_two_files_in_zip(self):
    # Each listed sequence is checked against the provider built from the
    # zip fixture, in the order seq1, seq2, seq3.
    fasta_path = self.fasta_dir.joinpath(
        "test_2_three_sequences_in_two_files_in_zip.zip")
    fasta_provider = missings.FromFile(Path(fasta_path))

    expected_sequences = [
        ("seq1", "ACTGGGTGGGA"),
        ("seq2", "AA"),
        ("seq3", "GT"),
    ]
    for name, expected_sequence in expected_sequences:
        self.raise_error_if_unequal(msa.SequenceID(name),
                                    expected_sequence,
                                    fasta_provider)
def _get_children_nodes_looping(node: tree.AffinityNode,
                                poagraph: graph.Poagraph,
                                output_dir: Path,
                                blosum_path: Path,
                                p: parameters.P,
                                current_max_affinity_node_id: int) -> List[tree.AffinityNode]:
    """Generates children of given Affinity Tree node.

    Children are produced until every sequence of ``node`` is assigned to
    one. For each child a consensus candidate is computed for the current
    candidates and the sequences qualified for it are selected; the
    attempt is repeated on the qualified subset until the qualified
    sequences equal the candidates or the attempt limit is reached.

    Args:
        node: Affinity Tree node to split.
        poagraph: Poagraph used to compute consensuses and compatibilities.
        output_dir: Directory passed to the consensus generation.
        blosum_path: Path to the BLOSUM file passed to the consensus
            generation.
        p: Parameter passed to the compatibility calculation.
        current_max_affinity_node_id: IDs of the created children are
            ``current_max_affinity_node_id + 1``, ``+ 2``, ...

    Returns:
        The created children, in creation order.
    """
    max_attempts = 10  # attempt number at which the current result is accepted

    children_nodes: List[tree.AffinityNode] = []
    not_assigned_sequences_ids: List[msa.SequenceID] = node.sequences
    detailed_logger.info(f"""Getting children nodes for affinity node {node.id_}...""")

    affinity_node_id = 0
    so_far_cutoffs: List[graph.Compatibility] = []

    while not_assigned_sequences_ids:
        detailed_logger.info(f"### Getting child {len(so_far_cutoffs)}...")
        child_ready = False
        attempt = 0
        current_candidates = not_assigned_sequences_ids

        while not child_ready:
            consensus_candidate = poa.get_consensuses(
                poagraph,
                current_candidates,
                output_dir,
                f"parent_{node.id_}_child_{len(so_far_cutoffs)}_attempt_{attempt}",
                blosum_path,
                parameters.Hbmin(0),
                specific_consensuses_id=[0])[0].path

            # Compatibilities are computed for all not yet assigned
            # sequences, not only for the current candidates.
            compatibilities_to_consensus_candidate = poagraph.get_compatibilities(
                sequences_ids=not_assigned_sequences_ids,
                consensus_path=consensus_candidate,
                p=p)
            compatibilities_to_consensus_candidate[msa.SequenceID("parent")] = node.mincomp

            qualified_sequences_ids_candidates, _ = _get_qualified_sequences_ids_and_cutoff(
                compatibilities_to_max_c=compatibilities_to_consensus_candidate,
                so_far_cutoffs=so_far_cutoffs,
                splitted_node_id=node.id_)

            attempts_exhausted = attempt == max_attempts
            if qualified_sequences_ids_candidates != current_candidates and not attempts_exhausted:
                # Retry with the narrowed set of candidates.
                current_candidates = qualified_sequences_ids_candidates
                attempt += 1
                continue

            if attempts_exhausted:
                detailed_logger.info(f"Attempt treshold {max_attempts} exceeded!")

            affinity_node_id += 1
            affinity_node = tree.AffinityNode(
                id_=tree.AffinityNodeID(current_max_affinity_node_id + affinity_node_id),
                parent=node.id_,
                sequences=qualified_sequences_ids_candidates,
                mincomp=_get_min_comp(
                    node_sequences_ids=qualified_sequences_ids_candidates,
                    comps_to_consensus=compatibilities_to_consensus_candidate),
                consensus=graph.SeqPath(consensus_candidate))
            children_nodes.append(affinity_node)

            # Keep the remaining IDs in their existing order; a plain set
            # difference would leave the order of the next candidates
            # arbitrary.
            assigned_ids = set(qualified_sequences_ids_candidates)
            not_assigned_sequences_ids = [
                seq_id for seq_id in not_assigned_sequences_ids
                if seq_id not in assigned_ids
            ]
            child_ready = True
            so_far_cutoffs.append(affinity_node.mincomp)

    detailed_logger.info("Children nodes generated.")
    return children_nodes
def test_1_download_sequence_and_save_to_cache(self):
    # Start without a cache directory so its creation can be observed.
    cache_dir_path = pathtools.get_child_path(Path.cwd(), ".fastacache")
    if cache_dir_path.exists():
        shutil.rmtree(cache_dir_path)

    provider = missings.FromNCBI(use_cache=True)
    sequence_id = msa.SequenceID("AB050936v1")
    _ = provider.get_base(sequence_id, 0)

    # cache directory creation
    self.assertTrue(cache_dir_path.exists())

    # file creation
    cached_files = [*cache_dir_path.glob("*")]
    expected_filepath = pathtools.get_child_path(cache_dir_path,
                                                 f"{sequence_id}.fasta")
    self.assertTrue(expected_filepath in cached_files)

    # file content
    control_fasta_path = Path(__file__).parent.joinpath(
        'fasta_ncbi/AB050936.1.fasta').resolve()
    with open(control_fasta_path) as control_file:
        expected_content = control_file.read()
    with open(expected_filepath) as cached_file:
        actual_content = cached_file.read()
    self.assertEqual(expected_content, actual_content)
def test_1_no_symbol_provided(self):
    # A MissingBase created without arguments is expected to yield '?'.
    provider = missings.ConstBaseProvider(missings.MissingBase())
    expected_symbol = graph.Base('?')

    actual_symbol = provider.get_base(msa.SequenceID('s'), 0)
    self.assertEqual(expected_symbol, actual_symbol)
def test_10_parallel_blocks_1st_and_2nd_merge_into_3rd(self):
    maf_path = self.maf_files_dir.joinpath(
        "test_10_parallel_blocks_1st_and_2nd_merge_into_3rd.maf")

    # (base, id of the node it is aligned to or None); the position in the
    # list is the node id.
    node_specs = [
        ('G', 1), ('T', 0),
        ('T', None),
        ('A', None),
        ('C', 5), ('G', 4),
        ('C', None),
        ('A', None),
        ('C', None),
        ('T', None),
        ('G', None),
        ('G', None),
        ('C', 13), ('G', 12),
        ('C', 15), ('G', 16), ('T', 14),
        ('A', 18), ('T', 17),
        ('A', 20), ('C', 19),
        ('C', 22), ('G', 21),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(index),
                   base=graph.Base(base),
                   aligned_to=None if aligned is None else nid(aligned))
        for index, (base, aligned) in enumerate(node_specs)
    ]

    # sequence name -> (group, node ids of its single path)
    sequence_specs = {
        'seq0': ('1', [7, 8, 9, 10, 11, 12, 15, 18, 19, 21]),
        'seq1': ('1', [7, 8, 9, 10, 11, 12, 15, 18, 19, 21]),
        'seq2': ('2', [0, 2, 3, 4, 6, 13, 16, 17, 20, 21]),
        'seq3': ('2', [1, 2, 3, 5, 6, 13, 14, 17, 20, 22]),
    }
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node) for node in node_ids])],
            graph.SequenceMetadata({'group': group}))
        for name, (group, node_ids) in sequence_specs.items()
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)
    self.assertEqual(expected_poagraph, actual_poagraph)
def test_09_inactive_edges_but_all_strands_plus(self):
    maf_path = self.maf_files_dir.joinpath(
        "test_9_inactive_edges_but_all_strands_plus.maf")

    # Twenty unaligned nodes: the bases A, C, T, G, G repeated four times.
    expected_nodes = [
        graph.Node(node_id=nid(index),
                   base=graph.Base(base),
                   aligned_to=None)
        for index, base in enumerate('ACTGG' * 4)
    ]

    # sequence name -> (group, list of paths given as node ids)
    sequence_specs = {
        'seq0': ('1', []),
        'seq1': ('1', [[0, 1, 2, 3, 4, 10, 11, 12, 13, 14],
                       [5, 6, 7, 8, 9, 15, 16, 17, 18, 19]]),
        'seq2': ('2', [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                        10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]),
        'seq3': ('2', []),
    }
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node) for node in path]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, (group, paths) in sequence_specs.items()
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)
    self.assertEqual(expected_poagraph, actual_poagraph)
def test_6_missing_one_reverted_sequence_middle_minus1_1(self):
    maf_path = self.maf_files_dir.joinpath(
        "test_6_missing_one_reverted_sequence_middle_minus1_1.maf")

    # (base, id of the node it is aligned to or None); the position in the
    # list is the node id.
    node_specs = [
        # block 1 because it is first in DAG and reverted
        ('A', None),
        ('C', None),
        ('C', 3),
        ('T', 2),
        # missing seq2, on edge (-1,1)
        ('T', None),
        ('C', None),
        ('A', 7),
        ('C', 6),
        ('C', None),
        ('T', None),
        ('A', 11),
        ('C', 10),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(index),
                   base=graph.Base(base),
                   aligned_to=None if aligned is None else nid(aligned))
        for index, (base, aligned) in enumerate(node_specs)
    ]

    # sequence name -> (group, list of paths given as node ids)
    sequence_specs = {
        'seq0': ('1', []),
        'seq1': ('1', [[0, 1, 2], [6, 8, 9, 10]]),
        'seq2': ('2', [[0, 1, 3, 4, 5, 7, 11]]),
        'seq3': ('2', []),
    }
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node) for node in path]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, (group, paths) in sequence_specs.items()
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)
    self.assertEqual(expected_poagraph, actual_poagraph)
def _add_record_to_dict(self, record: SeqRecord, fastas_dict: Dict[msa.SequenceID, str]) -> None:
    """Adds the record's sequence to the dict if the sequence is not empty.

    Args:
        record: The sequence record to add.
        fastas_dict: Mapping from sequence ID to sequence. It is updated
            in place with ``record.seq`` stored under the ID built from
            ``record.id``.

    Raises:
        FastaProviderException: If record contains empty sequence or
            its ID is already present in the dict.
    """
    if len(record.seq) == 0:
        raise FastaProviderException(
            "Empty sequence in FASTA. Provide the sequence or remove its header."
        )

    # Build the key once and reuse it for the lookup and the insertion.
    sequence_id = msa.SequenceID(str(record.id))
    if sequence_id in fastas_dict:
        raise FastaProviderException(
            "Incorrect fasta provided: sequences IDs are not unique.")
    fastas_dict[sequence_id] = record.seq
def test_1_one_sequence(self):
    # The provider built from the FASTA fixture must serve, for seq1, the
    # sequence that self.read_sequence reads from the same file.
    fasta_path = self.fasta_dir.joinpath("test_1_one_sequence.fasta")
    provider = missings.FromFile(Path(fasta_path))
    self.raise_error_if_unequal(msa.SequenceID("seq1"),
                                self.read_sequence(fasta_path),
                                provider)
def get_poagraph(maf: msa.Maf, metadata: Optional[msa.MetadataCSV]) -> \
        Tuple[List[graph.Node], Dict[msa.SequenceID, graph.Sequence]]:
    """Get poagraph elements from MAF.

    Every alignment column yields one node per distinct non-gap symbol
    found in it (symbols taken in sorted order). Each new node is
    appended to the sequences that carry that symbol in the column.

    Args:
        maf: Multialignment file in MAF format.
        metadata: MetadataCSV.

    Returns:
        Tuple of poagraph elements: the list of nodes and the mapping
        from sequence ID to sequence.
    """
    alignment = [*AlignIO.parse(maf.filecontent, "maf")]
    nodes, sequences = _init_poagraph(alignment, metadata)

    # Both counters start at -1 so the first node/column gets id 0.
    current_node_id = graph.NodeID(-1)
    column_id = graph.ColumnID(-1)

    for block_id, block in enumerate(alignment):
        global_logger.info(f"Processing block {block_id}...")
        block_width = len(block[0].seq)

        for col in range(block_width):
            column_id += 1
            sequence_id_to_nucleotide = {
                msa.SequenceID(seq.id): seq[col] for seq in block
            }
            # Distinct symbols of this column without the gap symbol.
            nodes_codes = sorted(
                set(sequence_id_to_nucleotide.values()) - {'-'})
            # IDs the nodes of this column are about to receive.
            column_nodes_ids = [
                graph.NodeID(current_node_id + offset)
                for offset in range(1, len(nodes_codes) + 1)
            ]

            for i, nucl in enumerate(nodes_codes):
                current_node_id += 1
                nodes.append(
                    graph.Node(node_id=current_node_id,
                               base=graph.Base(nucl),
                               aligned_to=_get_next_aligned_node_id(
                                   graph.NodeID(i), column_nodes_ids),
                               column_id=graph.ColumnID(column_id),
                               block_id=graph.BlockID(block_id)))

                for seq_id, nucleotide in sequence_id_to_nucleotide.items():
                    if nucleotide == nucl:
                        sequences[seq_id] = _add_node_do_sequence(
                            sequence=sequences[seq_id],
                            node_id=current_node_id)

    return nodes, sequences
def _get_paths_join_info(block: Block, free_edges: Dict[msa.SequenceID, List[Edge]]) -> \
        Dict[msa.SequenceID, Optional[graph.NodeID]]:
    """Finds, for every sequence in the block, the node its path joins with.

    Args:
        block: Block whose ``alignment`` sequences are examined.
        free_edges: Edges per sequence ID; it must contain an entry for
            every sequence of the block.

    Returns:
        Mapping from sequence ID to ``last_node_id`` of the edge that
        leads to this block, or None when the sequence has no such edge.
        If several edges lead to the block, the last one in the list wins.
    """
    paths_join_info: Dict[msa.SequenceID, Optional[graph.NodeID]] = {}
    for seq in block.alignment:
        seq_id = msa.SequenceID(seq.id)
        paths_join_info[seq_id] = None
        # No early exit: a later matching edge overrides an earlier one.
        for edge in free_edges[seq_id]:
            if edge.to_block_id == block.id:
                paths_join_info[seq_id] = edge.last_node_id
    return paths_join_info
def test_1_one_sequence_one_file_in_zip(self):
    # The zip fixture is expected to provide seq1 with the sequence below.
    zip_path = self.fasta_dir.joinpath(
        "test_1_one_sequence_one_file_in_zip.zip")
    provider = missings.FromFile(Path(zip_path))
    self.raise_error_if_unequal(msa.SequenceID("seq1"),
                                "ACTGGGTGGGA",
                                provider)
def test_1_typical_poagraph(self):
    po_path = self.po_files_dir.joinpath("test_1.po")

    # (base, id of the node it is aligned to or None); the position in the
    # list is the node id.
    node_specs = [
        ('A', 1), ('G', 0),
        ('C', 3), ('G', 2),
        ('A', 5), ('T', 4),
        ('G', None),
        ('G', None),
        ('A', 9), ('C', 10), ('G', 11), ('T', 8),
        ('A', 13), ('C', 12),
        ('T', None),
        ('A', 16), ('C', 17), ('G', 15),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(index),
                   base=bid(base),
                   aligned_to=None if aligned is None else nid(aligned))
        for index, (base, aligned) in enumerate(node_specs)
    ]

    # sequence name -> (group, node ids of its single path)
    sequence_specs = {
        'seq0': ('1', [0, 2, 4, 6, 7, 8, 12, 14, 16]),
        'seq1': ('1', [1, 2, 5, 6, 7, 9]),
        'seq2': ('2', [3, 4, 6, 7, 10, 12, 14, 17]),
        'seq3': ('2', [11, 13, 14, 15]),
    }
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node) for node in node_ids])],
            graph.SequenceMetadata({'group': group}))
        for name, (group, node_ids) in sequence_specs.items()
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    po_file = msa.Po(pathtools.get_file_content_stringio(po_path), po_path)
    nodes, sequences = po2poagraph.get_poagraph(po_file, self.metadatacsv)
    actual_poagraph = graph.Poagraph(nodes, sequences)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_2_missing_sequence_end(self):
    maf_path = self.maf_files_dir.joinpath(
        "test_2_missing_sequence_end.maf")

    # (base, id of the node it is aligned to or None); the position in the
    # list is the node id.
    node_specs = [
        ('A', 1), ('G', 0),
        ('C', 3), ('G', 2),
        ('T', None),
        ('A', 6), ('C', 5),
        ('A', None),
        ('G', None),
        ('G', None),
        ('T', None),
        ('G', None),
        ('T', None),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(index),
                   base=graph.Base(base),
                   aligned_to=None if aligned is None else nid(aligned))
        for index, (base, aligned) in enumerate(node_specs)
    ]

    # sequence name -> (group, list of paths given as node ids)
    sequence_specs = {
        'seq0': ('1', []),
        'seq1': ('1', [[0, 2, 4, 5, 8, 9, 10]]),
        'seq2': ('2', [[1, 3, 4, 6, 7, 11, 12]]),
        'seq3': ('2', []),
    }
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node) for node in path]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, (group, paths) in sequence_specs.items()
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)
    self.assertEqual(expected_poagraph, actual_poagraph)
def test_4_seqid_is_last(self):
    # Expected metadata: a 'name' and a 'group' entry for each of s1..s3.
    metadata_path = self.csv_files_dir.joinpath("test_4_seqid_is_last.csv")
    csv_content = pathtools.get_file_content_stringio(metadata_path)

    expected_rows = [
        ('s1', 'sequence1', 'A'),
        ('s2', 'sequence2', 'B'),
        ('s3', 'sequence3', 'B'),
    ]
    expected_metadata = {
        msa.SequenceID(seq_id): {'name': name, 'group': group}
        for seq_id, name, group in expected_rows
    }

    parsed = msa.MetadataCSV(csv_content, metadata_path)
    self.assertEqual(expected_metadata, parsed.metadata)
def test_1_typical_poagraph(self):
    expected_po_content_path = self.po_files_dir.joinpath("test_1.po")

    # (base, id of the node it is aligned to or None); the position in the
    # list is the node id.
    node_specs = [
        ('A', 1), ('G', 0),
        ('C', 3), ('G', 2),
        ('A', 5), ('T', 4),
        ('G', None),
        ('G', None),
        ('A', 9), ('C', 10), ('G', 11), ('T', 8),
        ('A', 13), ('C', 12),
        ('T', None),
        ('A', 16), ('C', 17), ('G', 15),
    ]
    poagraph_nodes = [
        graph.Node(node_id=nid(index),
                   base=bid(base),
                   aligned_to=None if aligned is None else nid(aligned))
        for index, (base, aligned) in enumerate(node_specs)
    ]

    # Every sequence has a single path and belongs to group '1'.
    node_ids_by_sequence = {
        'seq0': [0, 2, 4, 6, 7, 8, 12, 14, 16],
        'seq1': [1, 2, 5, 6, 7, 9],
        'seq2': [3, 4, 6, 7, 10, 12, 14, 17],
        'seq3': [11, 13, 14, 15],
    }
    poagraph_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node) for node in node_ids])],
            graph.SequenceMetadata({'group': '1'}))
        for name, node_ids in node_ids_by_sequence.items()
    }
    poagraph = graph.Poagraph(poagraph_nodes, poagraph_sequences)

    actual_po_content = po.poagraph_to_PangenomePO(poagraph)
    expected_po_content = pathtools.get_file_content(expected_po_content_path)
    self.assertEqual(expected_po_content, actual_po_content)
def test_subpoagraph_construction_from_poagraph_keep_seq_0_1(self):
    # Only seq0 and seq1 of self.poagraph are handed to the translator.
    kept_sequences = [msa.SequenceID('seq0'), msa.SequenceID('seq1')]
    translator = poa._PoagraphPOTranslator(self.poagraph, kept_sequences)
    actual_po_content = translator.get_input_po_content()

    expected_po_content = ("VERSION=pangenome\n"
                           "NAME=pangenome\n"
                           "TITLE=pangenome\n"
                           "LENGTH=9\n"
                           "SOURCECOUNT=2\n"
                           "SOURCENAME=seq0\n"
                           "SOURCEINFO=7 0 0 -1 seq0\n"
                           "SOURCENAME=seq1\n"
                           "SOURCEINFO=5 1 100 -1 seq1\n"
                           "t:S0\n"
                           "a:L0S0S1\n"
                           "a:L1S0S1\n"
                           "a:L2S0A4\n"
                           "c:L2S1A3\n"
                           "a:L3L4S0S1\n"
                           "c:L5S0A7\n"
                           "t:L5S1A6\n"
                           "a:L6S0")
    self.assertEqual(expected_po_content, actual_po_content)
def _complement_sequence_middles_if_needed(build_state: _BuildState, block: Block, edge: Arc, seq, last_node_id: graph.NodeID):
    """Adds nodes for sequence positions lying between two blocks.

    When ``_complementation_not_needed`` holds for the two block infos of
    the edge, nothing is added and ``last_node_id`` is returned for edge
    type ``(1, -1)``, None otherwise.

    Otherwise one node is appended to ``build_state.nodes`` for every
    position strictly between the end of the block info that starts
    earlier and the start of the other one. Each node is also added to
    the sequence, chained to the previously added node. The column id
    incremented here is a local copy; ``build_state.column_id`` is not
    written back.

    Returns:
        The current node id (the last created node, or the maximum
        existing node id when no node was created) if
        ``_should_join_with_next_node`` holds for the edge type, else None.
    """
    seq_id = msa.SequenceID(seq[0].seq_id)
    left_sinfo, right_sinfo = _get_edge_sinfos(
        seqs_info=build_state.seqs_info,
        from_block_id=block.id,
        edge=edge,
        seq_id=seq_id)

    if _complementation_not_needed(left_sinfo, right_sinfo):
        return last_node_id if edge.edge_type == (1, -1) else None

    node_id = _get_max_node_id(build_state.nodes)
    column_id = build_state.column_id

    # Order the two block infos by their start position.
    if left_sinfo.start < right_sinfo.start:
        earlier, later = left_sinfo, right_sinfo
    else:
        earlier, later = right_sinfo, left_sinfo
    last_pos = earlier.start + earlier.size - 1
    next_pos = later.start

    previous_node_id = None
    if _should_join_with_last_node(edge.edge_type):
        previous_node_id = last_node_id

    for position in range(last_pos + 1, next_pos):
        column_id += 1
        node_id += 1
        base = _get_missing_nucleotide(build_state.fasta_provider,
                                       seq_id,
                                       position)
        new_node = graph.Node(node_id=node_id,
                              base=base,
                              aligned_to=None,
                              column_id=column_id,
                              block_id=None)
        build_state.nodes += [new_node]
        _add_node_to_sequence(build_state,
                              seq_id=seq_id,
                              join_with=previous_node_id,
                              node_id=node_id)
        previous_node_id = node_id

    return node_id if _should_join_with_next_node(edge.edge_type) else None
def setUp(self):
    # Builds self.poagraph: 15 nodes and four single-path sequences, all
    # in group '1'.
    # (base, id of the node it is aligned to or None); the position in the
    # list is the node id.
    node_specs = [
        ('T', None),
        ('A', 2), ('G', 1),
        ('A', 4), ('C', 3),
        ('A', 6), ('C', 7), ('G', 8), ('T', 5),
        ('A', None),
        ('C', 11), ('T', 10),
        ('G', None),
        ('A', 14), ('C', 13),
    ]
    nodes = [
        graph.Node(node_id=nid(index),
                   base=b(base),
                   aligned_to=None if aligned is None else nid(aligned))
        for index, (base, aligned) in enumerate(node_specs)
    ]

    node_ids_by_sequence = {
        'seq0': [0, 1, 3, 5, 9, 10, 13],
        'seq1': [1, 3, 6, 9, 11],
        'seq2': [2, 4, 7, 9, 11, 12],
        'seq3': [2, 4, 8, 9, 11, 12, 14],
    }
    sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node) for node in node_ids])],
            graph.SequenceMetadata({'group': '1'}))
        for name, node_ids in node_ids_by_sequence.items()
    }

    self.poagraph = graph.Poagraph(nodes, sequences)
def test_read_consensus_path_seq1_only_in_input(self):
    """Read a consensus path from PO lines when only seq1 was in the input.

    The translator is built for ``seq1`` alone, and the first path returned
    by ``read_consensus_paths`` must be ``[1, 3, 6, 9, 11]``.
    """
    translator = poa._PoagraphPOTranslator(self.poagraph,
                                           [msa.SequenceID('seq1')])
    # The returned PO content is not inspected here; the call is kept because
    # the test makes it before reading consensus paths.
    translator.get_input_po_content()

    po_header = [
        "VERSION=pangenome\n",
        "NAME=pangenome\n",
        "TITLE=pangenome\n",
        "LENGTH=5\n",
        "SOURCECOUNT=2\n",
        "SOURCENAME=seq1\n",
        "SOURCEINFO=5 0 100 0 seq1\n",
        "SOURCENAME=CONSENS0\n",
        "SOURCEINFO=5 0 100 0 CONSENS0\n",
    ]
    po_nodes = [
        "a:S0S1\n",
        "a:L0S0S1\n",
        "c:L1S0S1\n",
        "a:L2S0S1\n",
        "t:L2S0S1",
    ]

    consensus_paths = translator.read_consensus_paths(po_header + po_nodes,
                                                      [0])

    expected_path = [1, 3, 6, 9, 11]
    self.assertEqual(expected_path, consensus_paths[0].path)
def test_06_1st_block_separates_into_2_branches_which_connect_in_3rd_block(self):
    """Build a poagraph from the test-6 MAF file and compare with the expected one."""
    maf_path = self.maf_files_dir.joinpath(
        "test_6_1st_block_separates_into_2_branches_which_connect_in_3rd_block.maf")

    # One (base, aligned_to) pair per node; the list index is the node ID.
    node_specs = [
        ('A', 1),
        ('C', 2),
        ('G', 0),
        ('C', None),
        ('A', 5),
        ('T', 4),
        ('G', None),
        ('G', None),
        ('C', 9),
        ('G', 10),
        ('T', 8),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(index),
                   base=graph.Base(letter),
                   aligned_to=None if partner is None else nid(partner))
        for index, (letter, partner) in enumerate(node_specs)
    ]

    # (sequence name, list of paths as node IDs, metadata group);
    # seq3 has no paths at all.
    sequence_specs = [
        ('seq0', [[0, 3, 4, 8]], '1'),
        ('seq1', [[1, 3, 5, 6, 7, 9]], '1'),
        ('seq2', [[2, 3, 5, 10]], '2'),
        ('seq3', [], '2'),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node) for node in path]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in sequence_specs
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_02_seq_starts_in_second_block(self):
    """Build a poagraph from the test-2 MAF file and compare with the expected one."""
    maf_path = self.maf_files_dir.joinpath(
        "test_2_seq_starts_in_second_block.maf")

    # One (base, aligned_to, block_id) triple per node; the list index is
    # the node ID.
    node_specs = [
        ('C', None, 0),
        ('T', None, 0),
        ('G', None, 0),
        ('T', None, 1),
        ('G', 5, 2),
        ('T', 4, 2),
        ('A', None, 2),
        ('A', None, 2),
        ('C', None, 2),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(index),
                   base=graph.Base(letter),
                   aligned_to=None if partner is None else nid(partner),
                   block_id=bid(block))
        for index, (letter, partner, block) in enumerate(node_specs)
    ]

    # (sequence name, list of paths as node IDs, metadata group);
    # seq3 has no paths at all.
    sequence_specs = [
        ('seq0', [[1, 2, 3]], '1'),
        ('seq1', [[0, 5, 7]], '1'),
        ('seq2', [[3, 4, 6, 8]], '2'),
        ('seq3', [], '2'),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node) for node in path]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in sequence_specs
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)