def test_subpoagraph_should_omit_edges_2(self):
        """Check the PO input produced when only ``seq1`` is translated.

        The poagraph has three nodes; ``seq1`` visits nodes 0 and 2 while
        ``seq2`` visits nodes 0, 1 and 2. The translator is restricted to
        ``seq1`` and the expected content lists two nodes and one source.
        """
        nodes = [
            graph.Node(node_id=nid(index), base=b(letter), aligned_to=None)
            for index, letter in enumerate('ACC')
        ]

        visited_nodes = {'seq1': [0, 2], 'seq2': [0, 1, 2]}
        sequences = {}
        for name, node_numbers in visited_nodes.items():
            seq_id = msa.SequenceID(name)
            path = graph.SeqPath([nid(number) for number in node_numbers])
            sequences[seq_id] = graph.Sequence(
                seq_id, [path], graph.SequenceMetadata({'group': '1'}))

        translator = poa._PoagraphPOTranslator(
            graph.Poagraph(nodes, sequences), [msa.SequenceID('seq1')])

        expected_po_content = ("VERSION=pangenome\n"
                               "NAME=pangenome\n"
                               "TITLE=pangenome\n"
                               "LENGTH=2\n"
                               "SOURCECOUNT=1\n"
                               "SOURCENAME=seq1\n"
                               "SOURCEINFO=2 0 100 -1 seq1\n"
                               "a:S0\n"
                               "c:L0S0")
        actual_po_content = translator.get_input_po_content()

        self.assertEqual(expected_po_content, actual_po_content)
    def test_subpoagraph_construction_full_graph(self):
        """Check the PO input when the only sequence covers every node.

        ``seq0`` walks all five nodes in order, so the expected content lists
        five nodes chained one after another and a single source.
        """
        nodes = [
            graph.Node(node_id=nid(index), base=b(letter), aligned_to=None)
            for index, letter in enumerate('AACAT')
        ]

        seq0_id = msa.SequenceID('seq0')
        full_path = graph.SeqPath([nid(index) for index in range(5)])
        sequences = {
            seq0_id: graph.Sequence(seq0_id,
                                    [full_path],
                                    graph.SequenceMetadata({'group': '1'}))
        }

        translator = poa._PoagraphPOTranslator(
            graph.Poagraph(nodes, sequences), [msa.SequenceID('seq0')])

        expected_po_content = ("VERSION=pangenome\n"
                               "NAME=pangenome\n"
                               "TITLE=pangenome\n"
                               "LENGTH=5\n"
                               "SOURCECOUNT=1\n"
                               "SOURCENAME=seq0\n"
                               "SOURCEINFO=5 0 100 -1 seq0\n"
                               "a:S0\n"
                               "a:L0S0\n"
                               "c:L1S0\n"
                               "a:L2S0\n"
                               "t:L3S0")
        actual_po_content = translator.get_input_po_content()
        self.assertEqual(expected_po_content, actual_po_content)
Esempio n. 3
0
 def __init__(self):
     """Populate ``self.sources`` with fixed sequences for seq0..seq4.

     Only ``seq1`` and ``seq2`` carry nucleotides; the others map to the
     empty string.
     """
     raw_sources = (
         ("seq0", ""),
         ("seq1", "ACTAGGT"),
         ("seq2", "GGTCAGT"),
         ("seq3", ""),
         ("seq4", ""),
     )
     self.sources = {
         msa.SequenceID(name): bases for name, bases in raw_sources
     }
    def test_1_p_parameter_influence(self, p: at_params.P,
                                     expected_cutoff: graph.Compatibility):
        """Check the node cutoff found for compatibilities computed with ``p``.

        Five sequences share progressively fewer node IDs with the consensus
        path (IDs 10..19). Their compatibilities to that path are computed
        with the given ``p`` and passed to ``_find_node_cutoff``; the
        resulting cutoff must be almost equal to ``expected_cutoff``.

        Note that the paths refer to node IDs 10..18, which have no
        counterpart in the ``nodes`` list (IDs 0..9).

        Args:
            p: Parameter forwarded to ``Poagraph.get_compatibilities``.
            expected_cutoff: Cutoff expected for this value of ``p``.
        """
        nodes = [
            graph.Node(node_id=nid(index), base=b(letter), aligned_to=None)
            for index, letter in enumerate('TAGACACGTA')
        ]

        paths_by_name = {
            'seq0': [10, 11, 12, 13, 14, 15, 16, 17, 18, 9],
            'seq1': [10, 11, 12, 13, 14, 15, 16, 17, 8, 9],
            'seq2': [10, 11, 12, 13, 14, 15, 16, 7, 8, 9],
            'seq3': [10, 11, 12, 3, 4, 5, 6, 7, 8, 9],
            'seq4': [10, 11, 2, 3, 4, 5, 6, 7, 8, 9],
        }
        sequences = {}
        for name, node_numbers in paths_by_name.items():
            # The dictionary key and the Sequence are built from the same ID
            # so the two can never disagree (e.g. key 'seq4' holding a
            # Sequence labelled 'seq3').
            seq_id = msa.SequenceID(name)
            sequences[seq_id] = graph.Sequence(
                seq_id,
                [graph.SeqPath([*map(nid, node_numbers)])],
                graph.SequenceMetadata({}))

        poagraph = graph.Poagraph(nodes, sequences)

        consensus_path = graph.SeqPath([*map(nid, range(10, 20))])
        compatibilities = poagraph.get_compatibilities(
            poagraph.get_sequences_ids(), consensus_path, p)

        actual_cutoff = at_builders._find_node_cutoff(
            list(compatibilities.values()), []).cutoff
        self.assertAlmostEqual(expected_cutoff.value, actual_cutoff.value)
    def test_04_single_block_no_nucleotides(self):
        """Build a poagraph from a MAF whose single block has no nucleotides.

        The expected poagraph has no nodes and four sequences with empty
        paths (seq0/seq1 in group '1', seq2/seq3 in group '2').
        """
        maf_path = self.maf_files_dir.joinpath(
            "test_4_single_block_no_nucleotides.maf")

        group_by_name = {'seq0': '1', 'seq1': '1', 'seq2': '2', 'seq3': '2'}
        expected_sequences = {}
        for name, group in group_by_name.items():
            seq_id = msa.SequenceID(name)
            expected_sequences[seq_id] = graph.Sequence(
                seq_id, [], graph.SequenceMetadata({'group': group}))

        expected_poagraph = graph.Poagraph([], expected_sequences)

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                       self.fasta_provider,
                                                       self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
    def test_7_missing_one_reverted_sequence_middle_minus1_minus1(self):
        """Build a poagraph where a gap in seq2 is filled with a constant base.

        Missing nucleotides are supplied by a ``ConstBaseProvider`` wrapping
        ``self.missing_n``; the expected graph has two such nodes (4 and 5)
        between the nodes of block 0 and block 1.
        """
        maf_path = self.maf_files_dir.joinpath(
            "test_7_missing_one_reverted_sequence_middle_minus1_minus1.maf")

        missing_symbol = self.missing_n.value
        # (base symbol, ID of the node it is aligned to or None), in node order
        node_specs = [
            # block 0
            ('A', None),
            ('C', None),
            ('T', None),
            ('A', None),

            # missing seq2
            (missing_symbol, None),
            (missing_symbol, None),

            # block 1
            ('A', 7),
            ('G', 6),
            ('C', 9),
            ('G', 8),
            ('C', 11),
            ('T', 10),
        ]
        expected_nodes = []
        for index, (symbol, partner) in enumerate(node_specs):
            expected_nodes.append(
                graph.Node(node_id=nid(index),
                           base=graph.Base(symbol),
                           aligned_to=None if partner is None
                           else nid(partner)))

        # (sequence name, node IDs of each path, metadata group)
        sequence_specs = [
            ('seq0', [], '1'),
            ('seq1', [[0, 1, 2, 3, 7, 9, 11]], '1'),
            ('seq2', [[0, 1, 4, 5, 6, 8, 10]], '2'),
            ('seq3', [], '2'),
        ]
        expected_sequences = {}
        for name, paths, group in sequence_specs:
            seq_id = msa.SequenceID(name)
            expected_sequences[seq_id] = graph.Sequence(
                seq_id,
                [graph.SeqPath([nid(number) for number in path])
                 for path in paths],
                graph.SequenceMetadata({'group': group}))

        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(
            maf, missings.ConstBaseProvider(self.missing_n), self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
    def test_9_get_seqids(self):
        """Check that ``MetadataCSV`` lists the IDs s1, s2, s3 in that order."""
        metadata_path = self.csv_files_dir.joinpath("test_1_correct.csv")
        metadata = msa.MetadataCSV(
            pathtools.get_file_content_stringio(metadata_path), metadata_path)

        expected_seqids = [msa.SequenceID(name) for name in ('s1', 's2', 's3')]
        actual_seqids = metadata.get_all_sequences_ids()

        self.assertEqual(expected_seqids, actual_seqids)
    def test_05_single_block_single_nucletodide(self):
        """Build a poagraph from a MAF with one block holding one nucleotide.

        The expected poagraph has a single 'A' node in block 0 and four
        sequences that each consist of one path through that node.
        """
        maf_path = self.maf_files_dir.joinpath(
            "test_5_single_block_single_nucletodide.maf")

        only_node = graph.Node(node_id=nid(0),
                               base=graph.Base('A'),
                               aligned_to=None,
                               block_id=bid(0))
        expected_nodes = [only_node]

        group_by_name = {'seq0': '1', 'seq1': '1', 'seq2': '2', 'seq3': '2'}
        expected_sequences = {}
        for name, group in group_by_name.items():
            seq_id = msa.SequenceID(name)
            expected_sequences[seq_id] = graph.Sequence(
                seq_id,
                [graph.SeqPath([nid(0)])],
                graph.SequenceMetadata({'group': group}))

        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                       self.fasta_provider,
                                                       self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
    def test_2_three_sequences_in_two_files_in_zip(self):
        """Check three sequences served by a ``FromFile`` provider over a zip.

        Each expected sequence is verified with
        ``self.raise_error_if_unequal`` in the order seq1, seq2, seq3.
        """
        zip_path = self.fasta_dir.joinpath(
            "test_2_three_sequences_in_two_files_in_zip.zip")
        fasta_provider = missings.FromFile(Path(zip_path))

        expected_by_name = (
            ("seq1", "ACTGGGTGGGA"),
            ("seq2", "AA"),
            ("seq3", "GT"),
        )
        for name, expected_sequence in expected_by_name:
            self.raise_error_if_unequal(msa.SequenceID(name),
                                        expected_sequence,
                                        fasta_provider)
Esempio n. 10
0
def _get_children_nodes_looping(node: tree.AffinityNode,
                                poagraph: graph.Poagraph,
                                output_dir: Path,
                                blosum_path: Path,
                                p: parameters.P,
                                current_max_affinity_node_id: int) -> List[tree.AffinityNode]:
    """Generate the children of the given Affinity Tree node.

    Until every sequence of ``node`` is assigned to a child, a consensus is
    requested from poa for the current candidates, all not-yet-assigned
    sequences are scored against it, and the qualified ones are selected.
    When the qualified sequences equal the current candidates (or the retry
    limit is reached) they form a new child; otherwise the qualified
    sequences become the next candidates and the consensus is recomputed.

    Args:
        node: Affinity Tree node whose sequences are split among children.
        poagraph: Poagraph used for consensus generation and compatibilities.
        output_dir: Directory passed to ``poa.get_consensuses``.
        blosum_path: Path to the BLOSUM file passed to ``poa.get_consensuses``.
        p: Parameter forwarded to ``Poagraph.get_compatibilities``.
        current_max_affinity_node_id: Offset added to the running child
            counter to form the IDs of the new nodes.

    Returns:
        The generated children, in creation order.
    """
    # After this many retries the current qualified set is accepted as-is.
    max_attempts = 10

    children_nodes: List[tree.AffinityNode] = []
    not_assigned_sequences_ids: List[msa.SequenceID] = node.sequences
    detailed_logger.info(f"""Getting children nodes for
                             affinity node {node.id_}...""")

    affinity_node_id = 0
    so_far_cutoffs: List[graph.Compatibility] = []
    # NOTE(review): if a round ends with no qualified sequences, nothing is
    # removed from not_assigned_sequences_ids and this loop would not
    # terminate -- confirm the cutoff helper always qualifies at least one.
    while not_assigned_sequences_ids:
        detailed_logger.info(f"### Getting child {len(so_far_cutoffs)}...")
        child_ready = False
        attempt = 0
        current_candidates = not_assigned_sequences_ids
        while not child_ready:
            consensus_candidate = poa.get_consensuses(
                poagraph,
                current_candidates,
                output_dir,
                f"parent_{node.id_}_child_{len(so_far_cutoffs)}_attempt_{attempt}",
                blosum_path,
                parameters.Hbmin(0),
                specific_consensuses_id=[0])[0].path
            # Compatibilities are computed for every unassigned sequence,
            # not only for the current candidates.
            compatibilities_to_consensus_candidate = poagraph.get_compatibilities(
                sequences_ids=not_assigned_sequences_ids,
                consensus_path=consensus_candidate,
                p=p)
            compatibilities_to_consensus_candidate[msa.SequenceID("parent")] = node.mincomp
            qualified_sequences_ids_candidates, cutoff = _get_qualified_sequences_ids_and_cutoff(
                compatibilities_to_max_c=compatibilities_to_consensus_candidate,
                so_far_cutoffs=so_far_cutoffs,
                splitted_node_id=node.id_)

            if qualified_sequences_ids_candidates == current_candidates or attempt == max_attempts:
                if attempt == max_attempts:
                    detailed_logger.info("Attempt treshold 10 exceeded!")
                affinity_node_id += 1

                affinity_node = tree.AffinityNode(
                    id_=tree.AffinityNodeID(current_max_affinity_node_id + affinity_node_id),
                    parent=node.id_,
                    sequences=qualified_sequences_ids_candidates,
                    mincomp=_get_min_comp(node_sequences_ids=qualified_sequences_ids_candidates,
                                          comps_to_consensus=compatibilities_to_consensus_candidate),
                    consensus=graph.SeqPath(consensus_candidate))
                children_nodes.append(affinity_node)

                # Drop the assigned sequences while keeping the remaining
                # ones unique and in their original order, so the result
                # does not depend on set iteration order.
                assigned = set(qualified_sequences_ids_candidates)
                not_assigned_sequences_ids = [
                    seq_id
                    for seq_id in dict.fromkeys(not_assigned_sequences_ids)
                    if seq_id not in assigned
                ]
                child_ready = True
                so_far_cutoffs.append(affinity_node.mincomp)
            else:
                current_candidates = qualified_sequences_ids_candidates
                attempt += 1

    detailed_logger.info("Children nodes generated.")

    return children_nodes
Esempio n. 11
0
    def test_1_download_sequence_and_save_to_cache(self):
        """Check that ``FromNCBI`` with caching writes the FASTA to the cache.

        The ``.fastacache`` directory under the current working directory is
        removed first; after one ``get_base`` call the directory must exist,
        contain ``<sequence_id>.fasta`` and that file must match the control
        FASTA stored next to this test file.
        """
        cache_dir = pathtools.get_child_path(Path.cwd(), ".fastacache")
        if cache_dir.exists():
            shutil.rmtree(cache_dir)

        provider = missings.FromNCBI(use_cache=True)
        sequence_id = msa.SequenceID("AB050936v1")

        # Only the caching side effect matters; the returned base is unused.
        provider.get_base(sequence_id, 0)

        # cache directory creation
        self.assertTrue(cache_dir.exists())

        # file creation
        cached_files = list(cache_dir.glob("*"))
        cached_fasta_path = pathtools.get_child_path(cache_dir,
                                                     f"{sequence_id}.fasta")
        self.assertTrue(cached_fasta_path in cached_files)

        # file content
        tests_dir = Path(__file__).parent
        control_fasta_path = tests_dir.joinpath(
            'fasta_ncbi/AB050936.1.fasta').resolve()

        with open(control_fasta_path) as control_file:
            expected_content = control_file.read()
        with open(cached_fasta_path) as cached_file:
            actual_content = cached_file.read()
        self.assertEqual(expected_content, actual_content)
    def test_1_no_symbol_provided(self):
        """Check that a default ``MissingBase`` makes the provider yield '?'."""
        provider = missings.ConstBaseProvider(missings.MissingBase())

        actual_symbol = provider.get_base(msa.SequenceID('s'), 0)

        self.assertEqual(graph.Base('?'), actual_symbol)
    def test_10_parallel_blocks_1st_and_2nd_merge_into_3rd(self):
        """Build a poagraph from a MAF with two blocks merging into a third.

        The expected graph has 23 nodes in three groups (0-6, 7-11, 12-22)
        and four sequences with one path each.
        """
        maf_path = self.maf_files_dir.joinpath(
            "test_10_parallel_blocks_1st_and_2nd_merge_into_3rd.maf")

        # (base symbol, ID of the node it is aligned to or None), in node order
        node_specs = [
            ('G', 1),
            ('T', 0),
            ('T', None),
            ('A', None),
            ('C', 5),
            ('G', 4),
            ('C', None),

            ('A', None),
            ('C', None),
            ('T', None),
            ('G', None),
            ('G', None),

            ('C', 13),
            ('G', 12),
            ('C', 15),
            ('G', 16),
            ('T', 14),
            ('A', 18),
            ('T', 17),
            ('A', 20),
            ('C', 19),
            ('C', 22),
            ('G', 21),
        ]
        expected_nodes = []
        for index, (symbol, partner) in enumerate(node_specs):
            expected_nodes.append(
                graph.Node(node_id=nid(index),
                           base=graph.Base(symbol),
                           aligned_to=None if partner is None
                           else nid(partner)))

        # (sequence name, node IDs of its single path, metadata group)
        sequence_specs = [
            ('seq0', [7, 8, 9, 10, 11, 12, 15, 18, 19, 21], '1'),
            ('seq1', [7, 8, 9, 10, 11, 12, 15, 18, 19, 21], '1'),
            ('seq2', [0, 2, 3, 4, 6, 13, 16, 17, 20, 21], '2'),
            ('seq3', [1, 2, 3, 5, 6, 13, 14, 17, 20, 22], '2'),
        ]
        expected_sequences = {}
        for name, node_numbers, group in sequence_specs:
            seq_id = msa.SequenceID(name)
            expected_sequences[seq_id] = graph.Sequence(
                seq_id,
                [graph.SeqPath([nid(number) for number in node_numbers])],
                graph.SequenceMetadata({'group': group}))

        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                       self.fasta_provider,
                                                       self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
    def test_09_inactive_edges_but_all_strands_plus(self):
        """Build a poagraph from the "inactive edges, all strands plus" MAF.

        The expected graph has 20 unaligned nodes spelling 'ACTGG' four
        times; seq1 is split into two paths, seq2 covers every node in one
        path, and seq0/seq3 have no paths.
        """
        maf_path = self.maf_files_dir.joinpath(
            "test_9_inactive_edges_but_all_strands_plus.maf")

        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=graph.Base(symbol),
                       aligned_to=None)
            for index, symbol in enumerate('ACTGG' * 4)
        ]

        # (sequence name, node IDs of each path, metadata group)
        sequence_specs = [
            ('seq0', [], '1'),
            ('seq1',
             [[0, 1, 2, 3, 4, 10, 11, 12, 13, 14],
              [5, 6, 7, 8, 9, 15, 16, 17, 18, 19]],
             '1'),
            ('seq2', [list(range(20))], '2'),
            ('seq3', [], '2'),
        ]
        expected_sequences = {}
        for name, paths, group in sequence_specs:
            seq_id = msa.SequenceID(name)
            expected_sequences[seq_id] = graph.Sequence(
                seq_id,
                [graph.SeqPath([nid(number) for number in path])
                 for path in paths],
                graph.SequenceMetadata({'group': group}))

        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                       self.fasta_provider,
                                                       self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
Esempio n. 15
0
    def test_6_missing_one_reverted_sequence_middle_minus1_1(self):
        """Build a poagraph where a gap in seq2 is filled from the FASTA source.

        Missing nucleotides come from ``self.fasta_provider``; the expected
        graph has 12 nodes, seq1 split into two paths and seq2 in one path.
        """
        maf_path = self.maf_files_dir.joinpath(
            "test_6_missing_one_reverted_sequence_middle_minus1_1.maf")

        # (base symbol, ID of the node it is aligned to or None), in node order
        node_specs = [
            # block 1 because it is first in DAG and reverted
            ('A', None),
            ('C', None),
            ('C', 3),
            ('T', 2),

            # missing seq2, on edge (-1,1)
            ('T', None),
            ('C', None),
            ('A', 7),
            ('C', 6),
            ('C', None),
            ('T', None),
            ('A', 11),
            ('C', 10),
        ]
        expected_nodes = []
        for index, (symbol, partner) in enumerate(node_specs):
            expected_nodes.append(
                graph.Node(node_id=nid(index),
                           base=graph.Base(symbol),
                           aligned_to=None if partner is None
                           else nid(partner)))

        # (sequence name, node IDs of each path, metadata group)
        sequence_specs = [
            ('seq0', [], '1'),
            ('seq1', [[0, 1, 2], [6, 8, 9, 10]], '1'),
            ('seq2', [[0, 1, 3, 4, 5, 7, 11]], '2'),
            ('seq3', [], '2'),
        ]
        expected_sequences = {}
        for name, paths, group in sequence_specs:
            seq_id = msa.SequenceID(name)
            expected_sequences[seq_id] = graph.Sequence(
                seq_id,
                [graph.SeqPath([nid(number) for number in path])
                 for path in paths],
                graph.SequenceMetadata({'group': group}))

        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                       self.fasta_provider,
                                                       self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
Esempio n. 16
0
    def _add_record_to_dict(self, record: SeqRecord,
                            fastas_dict: Dict[msa.SequenceID, str]) -> None:
        """Add the record's sequence to the dict, keyed by the record's ID.

        Args:
            record: The FASTA record to add.
            fastas_dict: Mapping from sequence ID to sequence; it is updated
                in place.

        Raises:
            FastaProviderException: If the record contains an empty sequence
                or its ID is already present in the dict.
        """
        if len(record.seq) == 0:
            raise FastaProviderException(
                "Empty sequence in FASTA. Provide the sequence or remove its header."
            )

        sequence_id = msa.SequenceID(str(record.id))
        if sequence_id in fastas_dict:
            raise FastaProviderException(
                "Incorrect fasta provided: sequences IDs are not unique.")

        # NOTE(review): record.seq is stored as-is, without str() conversion,
        # although the dict is annotated with str values -- confirm intended.
        fastas_dict[sequence_id] = record.seq
    def test_1_one_sequence(self):
        """Check that ``FromFile`` serves the single sequence of a FASTA file.

        The expected sequence is read from the same file with
        ``self.read_sequence``.
        """
        fasta_path = self.fasta_dir.joinpath("test_1_one_sequence.fasta")
        provider = missings.FromFile(Path(fasta_path))

        seq1_id = msa.SequenceID("seq1")
        reference_sequence = self.read_sequence(fasta_path)

        self.raise_error_if_unequal(seq1_id, reference_sequence, provider)
Esempio n. 18
0
def get_poagraph(maf: msa.Maf, metadata: Optional[msa.MetadataCSV]) -> \
        Tuple[List[graph.Node], Dict[msa.SequenceID, graph.Sequence]]:
    """Build poagraph nodes and sequences from a MAF multialignment.

    Every alignment column yields one node per distinct symbol other than
    the gap '-', created in sorted symbol order with consecutive node IDs.
    Each sequence showing a given symbol in the column gets that node added
    through ``_add_node_do_sequence``.

    Args:
        maf: Multialignment file in MAF format.
        metadata: MetadataCSV passed on to ``_init_poagraph``.

    Returns:
        Tuple of poagraph elements: the list of nodes and the mapping from
        sequence ID to sequence.
    """
    blocks = list(AlignIO.parse(maf.filecontent, "maf"))
    nodes, sequences = _init_poagraph(blocks, metadata)

    last_node_id = graph.NodeID(-1)
    column_id = graph.ColumnID(-1)
    for block_id, block in enumerate(blocks):
        global_logger.info(f"Processing block {block_id}...")

        # The first row's length is taken as the width of the whole block.
        for col in range(len(block[0].seq)):
            column_id += 1
            symbol_by_seq_id = {
                msa.SequenceID(row.id): row[col] for row in block
            }
            column_symbols = sorted(set(symbol_by_seq_id.values()) - {'-'})

            # IDs that the nodes of this column are about to receive.
            column_nodes_ids = [
                graph.NodeID(last_node_id + offset)
                for offset in range(1, len(column_symbols) + 1)
            ]

            for position, symbol in enumerate(column_symbols):
                last_node_id += 1
                aligned_to = _get_next_aligned_node_id(
                    graph.NodeID(position), column_nodes_ids)
                nodes.append(
                    graph.Node(node_id=last_node_id,
                               base=graph.Base(symbol),
                               aligned_to=aligned_to,
                               column_id=graph.ColumnID(column_id),
                               block_id=graph.BlockID(block_id)))

                for seq_id, seq_symbol in symbol_by_seq_id.items():
                    if seq_symbol != symbol:
                        continue
                    sequences[seq_id] = _add_node_do_sequence(
                        sequence=sequences[seq_id],
                        node_id=last_node_id)

    return nodes, sequences
Esempio n. 19
0
def _get_paths_join_info(block: Block,
                         free_edges: Dict[msa.SequenceID, List[Edge]]) -> \
        Dict[msa.SequenceID, Optional[graph.NodeID]]:
    """Find, per sequence of the block, the node a free edge joins it with.

    Args:
        block: Block whose alignment rows are inspected; its ``id`` is
            compared with the target block of every free edge.
        free_edges: Free edges per sequence ID. It must hold an entry for
            every sequence present in the block's alignment.

    Returns:
        Mapping from sequence ID to the ``last_node_id`` of the free edge
        that points at this block, or None when no such edge exists.
    """
    join_nodes: Dict[msa.SequenceID, Optional[graph.NodeID]] = {}
    for record in block.alignment:
        sid = msa.SequenceID(record.id)
        incoming = [
            free_edge.last_node_id
            for free_edge in free_edges[sid]
            if free_edge.to_block_id == block.id
        ]
        # When several free edges target this block, the last one wins.
        join_nodes[sid] = incoming[-1] if incoming else None
    return join_nodes
    def test_1_one_sequence_one_file_in_zip(self):
        # Build a FromFile provider on the zip fixture and compare what it
        # serves for "seq1" with the expected nucleotide string.
        zip_path = Path(
            self.fasta_dir.joinpath("test_1_one_sequence_one_file_in_zip.zip"))
        provider = missings.FromFile(zip_path)

        self.raise_error_if_unequal(msa.SequenceID("seq1"), "ACTGGGTGGGA",
                                    provider)
    def test_1_typical_poagraph(self):
        # Read test_1.po through po2poagraph and compare the resulting
        # poagraph with a hand-written expectation.
        po_path = self.po_files_dir.joinpath("test_1.po")

        # (base, aligned_to) per node; the position in the list is the node id.
        node_table = [
            ('A', 1), ('G', 0),
            ('C', 3), ('G', 2),
            ('A', 5), ('T', 4),
            ('G', None),
            ('G', None),
            ('A', 9), ('C', 10), ('G', 11), ('T', 8),
            ('A', 13), ('C', 12),
            ('T', None),
            ('A', 16), ('C', 17), ('G', 15),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=bid(letter),
                       aligned_to=None if aligned is None else nid(aligned))
            for index, (letter, aligned) in enumerate(node_table)
        ]

        # (sequence name, visited node ids, metadata group)
        path_table = [
            ('seq0', [0, 2, 4, 6, 7, 8, 12, 14, 16], '1'),
            ('seq1', [1, 2, 5, 6, 7, 9], '1'),
            ('seq2', [3, 4, 6, 7, 10, 12, 14, 17], '2'),
            ('seq3', [11, 13, 14, 15], '2'),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([nid(node) for node in visited])],
                graph.SequenceMetadata({'group': group}))
            for name, visited, group in path_table
        }

        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        po_file = msa.Po(pathtools.get_file_content_stringio(po_path), po_path)
        nodes, sequences = po2poagraph.get_poagraph(po_file, self.metadatacsv)
        actual_poagraph = graph.Poagraph(nodes, sequences)

        self.assertEqual(expected_poagraph, actual_poagraph)
Esempio n. 22
0
    def test_2_missing_sequence_end(self):
        # Build a poagraph from the DAG-MAF fixture and compare it with a
        # hand-written expectation; seq0 and seq3 are expected to have no
        # paths at all.
        maf_path = self.maf_files_dir.joinpath(
            "test_2_missing_sequence_end.maf")

        # (base, aligned_to) per node; the position in the list is the node id.
        node_table = [
            ('A', 1), ('G', 0),
            ('C', 3), ('G', 2),
            ('T', None),
            ('A', 6), ('C', 5),
            ('A', None),
            ('G', None),
            ('G', None),
            ('T', None),
            ('G', None),
            ('T', None),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=graph.Base(letter),
                       aligned_to=None if aligned is None else nid(aligned))
            for index, (letter, aligned) in enumerate(node_table)
        ]

        # (sequence name, list of paths as node ids, metadata group)
        path_table = [
            ('seq0', [], '1'),
            ('seq1', [[0, 2, 4, 5, 8, 9, 10]], '1'),
            ('seq2', [[1, 3, 4, 6, 7, 11, 12]], '2'),
            ('seq3', [], '2'),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([nid(node) for node in path])
                 for path in paths],
                graph.SequenceMetadata({'group': group}))
            for name, paths, group in path_table
        }
        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                       self.fasta_provider,
                                                       self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
    def test_4_seqid_is_last(self):
        # Parse the CSV fixture with MetadataCSV and compare its `metadata`
        # attribute with the expected per-sequence dictionaries.
        metadata_path = self.csv_files_dir.joinpath("test_4_seqid_is_last.csv")
        csv_content = pathtools.get_file_content_stringio(metadata_path)

        # (sequence id, name, group)
        rows = [
            ('s1', 'sequence1', 'A'),
            ('s2', 'sequence2', 'B'),
            ('s3', 'sequence3', 'B'),
        ]
        expected_metadata = {
            msa.SequenceID(seq_id): {'name': name, 'group': group}
            for seq_id, name, group in rows
        }

        parsed = msa.MetadataCSV(csv_content, metadata_path)

        self.assertEqual(expected_metadata, parsed.metadata)
    def test_1_typical_poagraph(self):
        # Serialise a hand-built poagraph with po.poagraph_to_PangenomePO and
        # compare the text with the content of the test_1.po fixture.
        expected_po_content_path = self.po_files_dir.joinpath("test_1.po")

        # (base, aligned_to) per node; the position in the list is the node id.
        node_table = [
            ('A', 1), ('G', 0),
            ('C', 3), ('G', 2),
            ('A', 5), ('T', 4),
            ('G', None),
            ('G', None),
            ('A', 9), ('C', 10), ('G', 11), ('T', 8),
            ('A', 13), ('C', 12),
            ('T', None),
            ('A', 16), ('C', 17), ('G', 15),
        ]
        poagraph_nodes = [
            graph.Node(node_id=nid(index),
                       base=bid(letter),
                       aligned_to=None if aligned is None else nid(aligned))
            for index, (letter, aligned) in enumerate(node_table)
        ]

        # (sequence name, visited node ids); every sequence is in group '1'.
        path_table = [
            ('seq0', [0, 2, 4, 6, 7, 8, 12, 14, 16]),
            ('seq1', [1, 2, 5, 6, 7, 9]),
            ('seq2', [3, 4, 6, 7, 10, 12, 14, 17]),
            ('seq3', [11, 13, 14, 15]),
        ]
        poagraph_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([nid(node) for node in visited])],
                graph.SequenceMetadata({'group': '1'}))
            for name, visited in path_table
        }

        poagraph = graph.Poagraph(poagraph_nodes, poagraph_sequences)

        produced = po.poagraph_to_PangenomePO(poagraph)
        wanted = pathtools.get_file_content(expected_po_content_path)
        self.assertEqual(wanted, produced)
 def test_subpoagraph_construction_from_poagraph_keep_seq_0_1(self):
     # Restrict the fixture poagraph to seq0 and seq1 and compare the PO
     # input text produced by the translator with the expected content.
     expected_po_content = ("VERSION=pangenome\n"
                            "NAME=pangenome\n"
                            "TITLE=pangenome\n"
                            "LENGTH=9\n"
                            "SOURCECOUNT=2\n"
                            "SOURCENAME=seq0\n"
                            "SOURCEINFO=7 0 0 -1 seq0\n"
                            "SOURCENAME=seq1\n"
                            "SOURCEINFO=5 1 100 -1 seq1\n"
                            "t:S0\n"
                            "a:L0S0S1\n"
                            "a:L1S0S1\n"
                            "a:L2S0A4\n"
                            "c:L2S1A3\n"
                            "a:L3L4S0S1\n"
                            "c:L5S0A7\n"
                            "t:L5S1A6\n"
                            "a:L6S0")

     kept_sequences = [msa.SequenceID('seq0'), msa.SequenceID('seq1')]
     translator = poa._PoagraphPOTranslator(self.poagraph, kept_sequences)

     self.assertEqual(expected_po_content,
                      translator.get_input_po_content())
Esempio n. 26
0
def _complement_sequence_middles_if_needed(build_state: _BuildState,
                                           block: Block, edge: Arc, seq,
                                           last_node_id: graph.NodeID):
    """Add nodes for sequence positions lying between two connected blocks.

    The sequence infos of the blocks on both sides of ``edge`` are looked up
    for the sequence identified by ``seq[0].seq_id``. Unless
    ``_complementation_not_needed`` reports that nothing is missing, every
    position strictly between the end of the earlier-starting info and the
    start of the other one gets a new node whose base comes from
    ``build_state.fasta_provider``. New nodes are appended to
    ``build_state.nodes`` with no alignment and no block, and are chained
    onto the sequence one after another.

    Args:
        build_state: Shared build state; its ``nodes`` list is extended.
        block: Block the edge starts from.
        edge: Edge leading to the next block; its ``edge_type`` decides what
            the new nodes are joined with and what is returned.
        seq: Sequence entries; only ``seq[0].seq_id`` is read here.
        last_node_id: Last node of this sequence in the current block.

    Returns:
        The node ID the next block should be joined with, or None when no
        join is requested by the edge type.
    """
    seq_id = msa.SequenceID(seq[0].seq_id)
    left_block_sinfo, right_block_sinfo = _get_edge_sinfos(
        seqs_info=build_state.seqs_info,
        from_block_id=block.id,
        edge=edge,
        seq_id=seq_id)

    if _complementation_not_needed(left_block_sinfo, right_block_sinfo):
        return last_node_id if edge.edge_type == (1, -1) else None

    newest_node_id = _get_max_node_id(build_state.nodes)
    # NOTE(review): the column counter is advanced on a local copy only; this
    # function never stores it back into build_state - confirm that is
    # intended.
    column_id = build_state.column_id

    # The info that starts earlier is the one the gap begins after.
    if left_block_sinfo.start < right_block_sinfo.start:
        earlier, later = left_block_sinfo, right_block_sinfo
    else:
        earlier, later = right_block_sinfo, left_block_sinfo
    last_pos = earlier.start + earlier.size - 1
    next_pos = later.start

    join_with = last_node_id if _should_join_with_last_node(
        edge.edge_type) else None
    for position in range(last_pos + 1, next_pos):
        column_id += 1
        newest_node_id += 1
        nucleotide = _get_missing_nucleotide(build_state.fasta_provider,
                                             seq_id, position)
        build_state.nodes += [
            graph.Node(node_id=newest_node_id,
                       base=nucleotide,
                       aligned_to=None,
                       column_id=column_id,
                       block_id=None)
        ]
        _add_node_to_sequence(build_state,
                              seq_id=seq_id,
                              join_with=join_with,
                              node_id=newest_node_id)
        # Every following node is chained onto the one just added.
        join_with = newest_node_id

    return newest_node_id if _should_join_with_next_node(
        edge.edge_type) else None
    def setUp(self):
        # Build the 15-node, 4-sequence poagraph stored as self.poagraph.

        # (base, aligned_to) per node; the position in the list is the node id.
        node_table = [
            ('T', None),
            ('A', 2), ('G', 1),
            ('A', 4), ('C', 3),
            ('A', 6), ('C', 7), ('G', 8), ('T', 5),
            ('A', None),
            ('C', 11), ('T', 10),
            ('G', None),
            ('A', 14), ('C', 13),
        ]
        nodes = [
            graph.Node(node_id=nid(index),
                       base=b(letter),
                       aligned_to=None if aligned is None else nid(aligned))
            for index, (letter, aligned) in enumerate(node_table)
        ]

        # (sequence name, visited node ids); every sequence is in group '1'.
        path_table = [
            ('seq0', [0, 1, 3, 5, 9, 10, 13]),
            ('seq1', [1, 3, 6, 9, 11]),
            ('seq2', [2, 4, 7, 9, 11, 12]),
            ('seq3', [2, 4, 8, 9, 11, 12, 14]),
        ]
        sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([nid(node) for node in visited])],
                graph.SequenceMetadata({'group': '1'}))
            for name, visited in path_table
        }

        self.poagraph = graph.Poagraph(nodes, sequences)
    def test_read_consensus_path_seq1_only_in_input(self):
        # Feed hand-written poa output lines to the translator and check the
        # consensus path it reads back for consensus 0.
        translator = poa._PoagraphPOTranslator(self.poagraph,
                                               [msa.SequenceID('seq1')])
        # The generated input text is not needed; presumably the call
        # prepares translator state used by read_consensus_paths - TODO
        # confirm.
        translator.get_input_po_content()

        poa_lines = [
            "VERSION=pangenome\n",
            "NAME=pangenome\n",
            "TITLE=pangenome\n",
            "LENGTH=5\n",
            "SOURCECOUNT=2\n",
            "SOURCENAME=seq1\n",
            "SOURCEINFO=5 0 100 0 seq1\n",
            "SOURCENAME=CONSENS0\n",
            "SOURCEINFO=5 0 100 0 CONSENS0\n",
            "a:S0S1\n",
            "a:L0S0S1\n",
            "c:L1S0S1\n",
            "a:L2S0S1\n",
            "t:L2S0S1",
        ]
        consensus_paths = translator.read_consensus_paths(poa_lines, [0])

        self.assertEqual([1, 3, 6, 9, 11], consensus_paths[0].path)
    def test_06_1st_block_separates_into_2_branches_which_connect_in_3rd_block(self):
        # Build a poagraph from the DAG-MAF fixture and compare it with a
        # hand-written expectation; seq3 is expected to have no paths.
        maf_path = self.maf_files_dir.joinpath(
            "test_6_1st_block_separates_into_2_branches_which_connect_in_3rd_block.maf")

        # (base, aligned_to) per node; the position in the list is the node id.
        node_table = [
            ('A', 1), ('C', 2), ('G', 0),
            ('C', None),
            ('A', 5), ('T', 4),
            ('G', None),
            ('G', None),
            ('C', 9), ('G', 10), ('T', 8),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=graph.Base(letter),
                       aligned_to=None if aligned is None else nid(aligned))
            for index, (letter, aligned) in enumerate(node_table)
        ]

        # (sequence name, list of paths as node ids, metadata group)
        path_table = [
            ('seq0', [[0, 3, 4, 8]], '1'),
            ('seq1', [[1, 3, 5, 6, 7, 9]], '1'),
            ('seq2', [[2, 3, 5, 10]], '2'),
            ('seq3', [], '2'),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([nid(node) for node in path])
                 for path in paths],
                graph.SequenceMetadata({'group': group}))
            for name, paths, group in path_table
        }
        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                       self.fasta_provider,
                                                       self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
    def test_02_seq_starts_in_second_block(self):
        # Build a poagraph from the DAG-MAF fixture and compare it with a
        # hand-written expectation that also pins each node's block id;
        # seq3 is expected to have no paths.
        maf_path = self.maf_files_dir.joinpath(
            "test_2_seq_starts_in_second_block.maf")

        # (base, aligned_to, block) per node; the list position is the node id.
        node_table = [
            ('C', None, 0),
            ('T', None, 0),
            ('G', None, 0),
            ('T', None, 1),
            ('G', 5, 2), ('T', 4, 2),
            ('A', None, 2),
            ('A', None, 2),
            ('C', None, 2),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=graph.Base(letter),
                       aligned_to=None if aligned is None else nid(aligned),
                       block_id=bid(block))
            for index, (letter, aligned, block) in enumerate(node_table)
        ]

        # (sequence name, list of paths as node ids, metadata group)
        path_table = [
            ('seq0', [[1, 2, 3]], '1'),
            ('seq1', [[0, 5, 7]], '1'),
            ('seq2', [[3, 4, 6, 8]], '2'),
            ('seq3', [], '2'),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([nid(node) for node in path])
                 for path in paths],
                graph.SequenceMetadata({'group': group}))
            for name, paths, group in path_table
        }
        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                       self.fasta_provider,
                                                       self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)