def test_subpoagraph_construction_full_graph(self):
        """Check the PO input built when one sequence visits every node.

        A five-node poagraph with a single sequence ('seq0') is handed to the
        translator and the text returned by ``get_input_po_content()`` is
        compared with the expected PO content.
        """
        # Position in the string is the node id; no node is aligned.
        nodes = [
            graph.Node(node_id=nid(index), base=b(symbol), aligned_to=None)
            for index, symbol in enumerate('AACAT')
        ]

        seq0_path = graph.SeqPath([nid(index) for index in range(5)])
        sequences = {
            msa.SequenceID('seq0'):
            graph.Sequence(msa.SequenceID('seq0'), [seq0_path],
                           graph.SequenceMetadata({'group': '1'}))
        }
        translator = poa._PoagraphPOTranslator(
            graph.Poagraph(nodes, sequences), [msa.SequenceID('seq0')])
        actual_po_content = translator.get_input_po_content()

        # The expected text has no trailing newline.
        expected_po_content = "\n".join([
            "VERSION=pangenome",
            "NAME=pangenome",
            "TITLE=pangenome",
            "LENGTH=5",
            "SOURCECOUNT=1",
            "SOURCENAME=seq0",
            "SOURCEINFO=5 0 100 -1 seq0",
            "a:S0",
            "a:L0S0",
            "c:L1S0",
            "a:L2S0",
            "t:L3S0",
        ])
        self.assertEqual(expected_po_content, actual_po_content)
    def test_subpoagraph_should_omit_edges_2(self):
        """Check the PO input when only 'seq1' of two sequences is translated.

        'seq1' visits nodes 0 and 2 while 'seq2' visits 0, 1 and 2; the
        translator is given only 'seq1' and the expected PO content lists
        two nodes.
        """
        nodes = [
            graph.Node(node_id=nid(index), base=b(symbol), aligned_to=None)
            for index, symbol in enumerate('ACC')
        ]

        # (sequence name, visited node ids); both sequences are in group '1'.
        path_by_name = [('seq1', [0, 2]), ('seq2', [0, 1, 2])]
        sequences = {
            msa.SequenceID(name):
            graph.Sequence(msa.SequenceID(name),
                           [graph.SeqPath([nid(i) for i in node_ids])],
                           graph.SequenceMetadata({'group': '1'}))
            for name, node_ids in path_by_name
        }
        poagraph = graph.Poagraph(nodes, sequences)

        translator = poa._PoagraphPOTranslator(poagraph,
                                               [msa.SequenceID('seq1')])
        actual_po_content = translator.get_input_po_content()

        # The expected text has no trailing newline.
        expected_po_content = "\n".join([
            "VERSION=pangenome",
            "NAME=pangenome",
            "TITLE=pangenome",
            "LENGTH=2",
            "SOURCECOUNT=1",
            "SOURCENAME=seq1",
            "SOURCEINFO=2 0 100 -1 seq1",
            "a:S0",
            "c:L0S0",
        ])

        self.assertEqual(expected_po_content, actual_po_content)
    def test_1_p_parameter_influence(self, p: at_params.P,
                                     expected_cutoff: graph.Compatibility):
        """Check the node cutoff found for compatibilities computed with ``p``.

        Args:
            p: Parameter forwarded to ``poagraph.get_compatibilities``.
            expected_cutoff: Compatibility whose ``value`` must almost equal
                the ``value`` of the cutoff that is found.

        NOTE(review): both arguments presumably come from a parametrizing
        decorator that is outside this block - confirm.
        """
        # Position in the string is the node id; no node is aligned.
        nodes = [
            graph.Node(node_id=nid(index), base=b(symbol), aligned_to=None)
            for index, symbol in enumerate('TAGACACGTA')
        ]

        # Each sequence shares a shorter prefix of ids with the consensus
        # path below than the previous one (9, 8, 7, 3 and 2 ids).
        sequences = {
            msa.SequenceID('seq0'):
            graph.Sequence(msa.SequenceID('seq0'), [
                graph.SeqPath(
                    [*map(nid, [10, 11, 12, 13, 14, 15, 16, 17, 18, 9])])
            ], graph.SequenceMetadata({})),
            msa.SequenceID('seq1'):
            graph.Sequence(msa.SequenceID('seq1'), [
                graph.SeqPath(
                    [*map(nid, [10, 11, 12, 13, 14, 15, 16, 17, 8, 9])])
            ], graph.SequenceMetadata({})),
            msa.SequenceID('seq2'):
            graph.Sequence(msa.SequenceID('seq2'), [
                graph.SeqPath(
                    [*map(nid, [10, 11, 12, 13, 14, 15, 16, 7, 8, 9])])
            ], graph.SequenceMetadata({})),
            msa.SequenceID('seq3'):
            graph.Sequence(msa.SequenceID('seq3'), [
                graph.SeqPath([*map(nid, [10, 11, 12, 3, 4, 5, 6, 7, 8, 9])])
            ], graph.SequenceMetadata({})),
            # The sequence id now matches its dictionary key ('seq4'); it
            # used to be a copy-pasted 'seq3'.
            msa.SequenceID('seq4'):
            graph.Sequence(
                msa.SequenceID('seq4'),
                [graph.SeqPath([*map(nid, [10, 11, 2, 3, 4, 5, 6, 7, 8, 9])])],
                graph.SequenceMetadata({}))
        }

        poagraph = graph.Poagraph(nodes, sequences)

        consensus_path = graph.SeqPath(
            [*map(nid, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19])])
        compatibilities = poagraph.get_compatibilities(
            poagraph.get_sequences_ids(), consensus_path, p)

        actual_cutoff = at_builders._find_node_cutoff(
            list(compatibilities.values()), []).cutoff
        self.assertAlmostEqual(expected_cutoff.value, actual_cutoff.value)
    def test_subpoagraph_unfilled_nodes(self):
        """Check the PO input for a poagraph containing '?' (unknown) bases.

        Two sequences are translated together; 'seq1' passes through two
        nodes whose base is the unknown symbol, and the expected PO content
        reproduces that symbol verbatim.
        """
        symbol_for_unknown = '?'
        nodes = [
            graph.Node(node_id=nid(0), base=b('A'), aligned_to=nid(1)),
            graph.Node(node_id=nid(1), base=b('C'), aligned_to=nid(0)),
            graph.Node(node_id=nid(2), base=b('G'), aligned_to=None),
            graph.Node(node_id=nid(3),
                       base=b(symbol_for_unknown),
                       aligned_to=None),
            graph.Node(node_id=nid(4),
                       base=b(symbol_for_unknown),
                       aligned_to=None),
            graph.Node(node_id=nid(5), base=b('G'), aligned_to=None),
            graph.Node(node_id=nid(6), base=b('C'), aligned_to=None),
            graph.Node(node_id=nid(7), base=b('A'), aligned_to=None),
            # Ninth node: its id is 8 (it was declared with a duplicated
            # id 5), matching the paths below, which both end at node 8.
            graph.Node(node_id=nid(8), base=b('T'), aligned_to=None)
        ]

        sequences = {
            msa.SequenceID('seq1'):
            graph.Sequence(msa.SequenceID('seq1'),
                           [graph.SeqPath([*map(nid, [0, 2, 3, 4, 7, 8])])],
                           graph.SequenceMetadata({'group': '1'})),
            msa.SequenceID('seq2'):
            graph.Sequence(msa.SequenceID('seq2'),
                           [graph.SeqPath([*map(nid, [1, 2, 5, 6, 7, 8])])],
                           graph.SequenceMetadata({'group': '1'}))
        }
        poagraph = graph.Poagraph(nodes, sequences)

        translator = poa._PoagraphPOTranslator(
            poagraph, [msa.SequenceID('seq1'),
                       msa.SequenceID('seq2')])
        actual_po_content = translator.get_input_po_content()
        # The expected text has no trailing newline.
        expected_po_content = (
            "VERSION=pangenome\n"
            "NAME=pangenome\n"
            "TITLE=pangenome\n"
            "LENGTH=9\n"
            "SOURCECOUNT=2\n"
            "SOURCENAME=seq1\n"
            "SOURCEINFO=6 0 100 -1 seq1\n"
            "SOURCENAME=seq2\n"
            "SOURCEINFO=6 1 100 -1 seq2\n"
            "a:S0A1\n"
            "c:S1A0\n"
            "g:L0L1S0S1\n"
            f"{symbol_for_unknown}:L2S0\n"
            f"{symbol_for_unknown}:L3S0\n"
            "g:L2S1\n"
            "c:L5S1\n"
            "a:L4L6S0S1\n"
            "t:L7S0S1"
        )
        self.assertEqual(expected_po_content, actual_po_content)
    def test_05_single_block_single_nucletodide(self):
        """Build a poagraph from a DAG MAF and compare it with a one-node graph.

        The expected poagraph has a single 'A' node in block 0 that is the
        whole path of all four sequences.
        """
        maf_path = self.maf_files_dir.joinpath(
            "test_5_single_block_single_nucletodide.maf")

        only_node = graph.Node(node_id=nid(0),
                               base=graph.Base('A'),
                               aligned_to=None,
                               block_id=bid(0))
        expected_nodes = [only_node]

        # (sequence name, metadata group); every sequence visits node 0 only.
        group_by_name = [('seq0', '1'), ('seq1', '1'),
                         ('seq2', '2'), ('seq3', '2')]
        expected_sequences = {
            msa.SequenceID(name):
                graph.Sequence(msa.SequenceID(name),
                               [graph.SeqPath([nid(0)])],
                               graph.SequenceMetadata({'group': group}))
            for name, group in group_by_name
        }
        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        # Only the first element of the returned pair is checked.
        actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                       self.fasta_provider,
                                                       self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
def _complement_sequence_starting_nodes(build_state: _BuildState,
                                        seq_id: msa.SequenceID,
                                        first_block_sinfo: SequenceInfo) -> \
        None:
    """Add nodes for positions ``0 .. first_block_sinfo.start - 1`` of a sequence.

    For each such position a nucleotide is fetched with
    ``_get_missing_nucleotide``, a new node is appended to
    ``build_state.nodes`` and the node is added to ``seq_id``, joined with
    the node created for the previous position (``None`` for the first one).
    Finally a free edge leading to the first block is recorded for ``seq_id``.

    Args:
        build_state: State that is mutated (``nodes``, ``free_edges`` and
            whatever ``_add_node_to_sequence`` updates).
        seq_id: Sequence being complemented.
        first_block_sinfo: Provides ``start`` and ``block_id``.
    """
    leading_count = first_block_sinfo.start
    node_id: graph.NodeID = _get_max_node_id(build_state.nodes)
    previous_node_id = None
    for position in range(leading_count):
        node_id += 1
        base = _get_missing_nucleotide(build_state.fasta_provider,
                                       seq_id,
                                       position)
        # Column ids count up from -leading_count to -1.
        new_node = graph.Node(node_id=node_id,
                              base=base,
                              column_id=position - leading_count)
        build_state.nodes += [new_node]
        _add_node_to_sequence(build_state,
                              seq_id=seq_id,
                              join_with=previous_node_id,
                              node_id=node_id)
        previous_node_id = node_id

    # With no leading positions, node_id is still the previous maximum id.
    edge_to_first_block = Edge(seq_id=seq_id,
                               from_block_id=None,
                               to_block_id=first_block_sinfo.block_id,
                               last_node_id=node_id)
    build_state.free_edges[seq_id] += [edge_to_first_block]
def _complement_sequence_middle_nodes(build_state: _BuildState,
                                      seq_id: msa.SequenceID,
                                      last_pos,
                                      next_pos,
                                      last_node_id: graph.NodeID) -> \
        graph.NodeID:
    """Add nodes for the positions strictly between ``last_pos`` and ``next_pos``.

    Each new node gets the next free node id and the next column id after
    ``build_state.column_id``, and is added to ``seq_id`` joined with the
    previously handled node (``last_node_id`` for the first position).

    Args:
        build_state: State whose ``nodes`` list is extended.
        seq_id: Sequence being complemented.
        last_pos: Position preceding the gap (exclusive bound).
        next_pos: Position following the gap (exclusive bound).
        last_node_id: Node the first new node is joined with.

    Returns:
        Id of the last node created, or the maximum node id found at entry
        when the gap is empty.
    """
    node_id = _get_max_node_id(build_state.nodes)
    first_column_id = build_state.column_id + 1
    previous_node_id = last_node_id
    gap_positions = range(last_pos + 1, next_pos)
    for column_id, position in enumerate(gap_positions, first_column_id):
        node_id += 1
        base = _get_missing_nucleotide(build_state.fasta_provider,
                                       seq_id,
                                       position)
        new_node = graph.Node(node_id=node_id,
                              base=base,
                              aligned_to=None,
                              column_id=column_id,
                              block_id=None)
        build_state.nodes += [new_node]
        _add_node_to_sequence(build_state,
                              seq_id=seq_id,
                              join_with=previous_node_id,
                              node_id=node_id)
        previous_node_id = node_id
    return node_id
# Example no. 8
# 0
def get_poagraph(maf: msa.Maf, metadata: Optional[msa.MetadataCSV]) -> \
        Tuple[List[graph.Node], Dict[msa.SequenceID, graph.Sequence]]:
    """Read a MAF multialignment and return the poagraph building blocks.

    Every alignment column yields one node per distinct symbol other than
    '-', created in sorted symbol order; each node is then added to every
    sequence carrying that symbol in the column.

    Args:
        maf: Multialignment file in MAF format.
        metadata: MetadataCSV.

    Returns:
        Tuple of poagraph elements: the node list and the sequences
        dictionary.
    """

    blocks = list(AlignIO.parse(maf.filecontent, "maf"))
    nodes, sequences = _init_poagraph(blocks, metadata)

    node_id = graph.NodeID(-1)
    column_id = graph.ColumnID(-1)
    for block_id, block in enumerate(blocks):
        global_logger.info(f"Processing block {block_id}...")

        # The first record's length is taken as the block width.
        for col in range(len(block[0].seq)):
            column_id += 1
            nucleotide_by_seq = {}
            for record in block:
                nucleotide_by_seq[msa.SequenceID(record.id)] = record[col]

            column_bases = sorted(set(nucleotide_by_seq.values()) - {'-'})
            # Ids that the nodes of this column are about to receive.
            column_nodes_ids = [
                graph.NodeID(node_id + offset)
                for offset in range(1, len(column_bases) + 1)
            ]

            for position, base in enumerate(column_bases):
                node_id += 1
                aligned_to = _get_next_aligned_node_id(
                    graph.NodeID(position), column_nodes_ids)
                nodes.append(
                    graph.Node(node_id=node_id,
                               base=graph.Base(base),
                               aligned_to=aligned_to,
                               column_id=graph.ColumnID(column_id),
                               block_id=graph.BlockID(block_id)))

                for seq_id, nucleotide in nucleotide_by_seq.items():
                    if nucleotide != base:
                        continue
                    sequences[seq_id] = _add_node_do_sequence(
                        sequence=sequences[seq_id],
                        node_id=node_id)

    return nodes, sequences
    def test_08_reversed_block(self):
        """Build a poagraph from 'test_8_reversed_block.maf' and compare it.

        The expected graph has ten nodes (5 and 6 aligned to each other);
        'seq0' has no path at all.
        """
        maf_path = self.maf_files_dir.joinpath("test_8_reversed_block.maf")

        # (base, id of the aligned node or None); position == node id.
        node_specs = [
            ('C', None),
            ('A', None),
            ('T', None),
            # next block is reversed because it was converted to dag
            ('G', None),
            ('G', None),
            ('A', 6),
            ('G', 5),
            ('A', None),
            ('G', None),
            ('T', None),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=graph.Base(symbol),
                       aligned_to=None if aligned is None else nid(aligned))
            for index, (symbol, aligned) in enumerate(node_specs)
        ]

        def make_sequence(name, group, node_ids=None):
            """Return a Sequence with one path, or none if node_ids is None."""
            paths = []
            if node_ids is not None:
                paths.append(graph.SeqPath([nid(i) for i in node_ids]))
            return graph.Sequence(msa.SequenceID(name),
                                  paths,
                                  graph.SequenceMetadata({'group': group}))

        expected_sequences = {
            msa.SequenceID('seq0'):
                make_sequence('seq0', '1'),
            msa.SequenceID('seq1'):
                make_sequence('seq1', '1', [0, 1, 3, 4, 5, 7, 8, 9]),
            msa.SequenceID('seq2'):
                make_sequence('seq2', '2', [0, 1, 2, 3, 4, 6, 7, 8, 9]),
            msa.SequenceID('seq3'):
                make_sequence('seq3', '2', [0, 1, 2, 3, 4, 6, 7, 9]),
        }
        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        # Only the first element of the returned pair is checked.
        actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                       self.fasta_provider,
                                                       self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
    def test_2_consensuses_and_empty_sequences(self):
        """Serialize a poagraph to PO text and compare it with 'test_2.po'.

        The graph holds two sequences with a path, two sequences without
        any path and two consensus sequences whose metadata is ``None``.
        """
        expected_po_content_path = self.po_files_dir.joinpath("test_2.po")

        # (base, id of the aligned node or None); position == node id.
        node_specs = [('C', 1), ('T', 0), ('A', 3), ('G', 2), ('C', None),
                      ('T', None), ('A', 7), ('T', 6), ('G', None)]
        poagraph_nodes = [
            graph.Node(node_id=nid(index),
                       base=bid(symbol),
                       aligned_to=None if aligned is None else nid(aligned))
            for index, (symbol, aligned) in enumerate(node_specs)
        ]

        def single_path(node_ids):
            """Return a one-element path list visiting node_ids in order."""
            return [graph.SeqPath([nid(i) for i in node_ids])]

        # (sequence name, paths, metadata), in insertion order.
        sequence_specs = [
            ('seq0', single_path([0, 3, 4, 5, 6, 8]),
             graph.SequenceMetadata({'group': '1'})),
            ('seq1', single_path([1, 2, 4, 5, 7, 8]),
             graph.SequenceMetadata({'group': '1'})),
            ('seq2', [], graph.SequenceMetadata({'group': '1'})),
            ('seq3', [], graph.SequenceMetadata({'group': '1'})),
            ('CONSENS0', single_path([0, 3, 4, 5, 7, 8]), None),
            ('CONSENS1', single_path([1, 2, 4, 5, 6, 8]), None),
        ]
        poagraph_sequences = {
            msa.SequenceID(name):
                graph.Sequence(msa.SequenceID(name), paths, metadata)
            for name, paths, metadata in sequence_specs
        }

        poagraph = graph.Poagraph(poagraph_nodes, poagraph_sequences)

        actual_po_content = po.poagraph_to_PangenomePO(poagraph)
        expected_po_content = pathtools.get_file_content(
            expected_po_content_path)
        self.assertEqual(expected_po_content, actual_po_content)
    def test_2_consensuses_and_empty_sequences(self):
        """Parse 'test_2.po' and compare the result with the expected poagraph.

        The expected graph holds two sequences with a path, two sequences
        without any path and two consensus sequences with empty metadata.
        """
        po_path = self.po_files_dir.joinpath("test_2.po")

        # (base, id of the aligned node or None); position == node id.
        node_specs = [('C', 1), ('T', 0), ('A', 3), ('G', 2), ('C', None),
                      ('T', None), ('A', 7), ('T', 6), ('G', None)]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=bid(symbol),
                       aligned_to=None if aligned is None else nid(aligned))
            for index, (symbol, aligned) in enumerate(node_specs)
        ]

        def single_path(node_ids):
            """Return a one-element path list visiting node_ids in order."""
            return [graph.SeqPath([nid(i) for i in node_ids])]

        # (sequence name, paths, metadata dictionary), in insertion order.
        sequence_specs = [
            ('seq0', single_path([0, 3, 4, 5, 6, 8]), {'group': '1'}),
            ('seq1', single_path([1, 2, 4, 5, 7, 8]), {'group': '1'}),
            ('seq2', [], {'group': '2'}),
            ('seq3', [], {'group': '2'}),
            ('CONSENS0', single_path([0, 3, 4, 5, 7, 8]), {}),
            ('CONSENS1', single_path([1, 2, 4, 5, 6, 8]), {}),
        ]
        expected_sequences = {
            msa.SequenceID(name):
            graph.Sequence(msa.SequenceID(name), paths,
                           graph.SequenceMetadata(metadata))
            for name, paths, metadata in sequence_specs
        }

        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        po_file = msa.Po(pathtools.get_file_content_stringio(po_path),
                         po_path)
        nodes, sequences = po2poagraph.get_poagraph(po_file,
                                                    self.metadatacsv)
        actual_poagraph = graph.Poagraph(nodes, sequences)
        self.assertEqual(expected_poagraph, actual_poagraph)
    def test_02_seq_starts_in_second_block(self):
        """Build a poagraph from a three-block DAG MAF and compare it.

        The expected graph has nine nodes spread over blocks 0, 1 and 2
        (nodes 4 and 5 aligned to each other); 'seq3' has no path.
        """
        maf_path = self.maf_files_dir.joinpath(
            "test_2_seq_starts_in_second_block.maf")

        # (base, id of the aligned node or None, block id);
        # position == node id.
        node_specs = [
            ('C', None, 0),
            ('T', None, 0),
            ('G', None, 0),

            ('T', None, 1),

            ('G', 5, 2),
            ('T', 4, 2),
            ('A', None, 2),
            ('A', None, 2),
            ('C', None, 2),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=graph.Base(symbol),
                       aligned_to=None if aligned is None else nid(aligned),
                       block_id=bid(block))
            for index, (symbol, aligned, block) in enumerate(node_specs)
        ]

        def make_sequence(name, group, node_ids=None):
            """Return a Sequence with one path, or none if node_ids is None."""
            paths = []
            if node_ids is not None:
                paths.append(graph.SeqPath([nid(i) for i in node_ids]))
            return graph.Sequence(msa.SequenceID(name),
                                  paths,
                                  graph.SequenceMetadata({'group': group}))

        expected_sequences = {
            msa.SequenceID('seq0'): make_sequence('seq0', '1', [1, 2, 3]),
            msa.SequenceID('seq1'): make_sequence('seq1', '1', [0, 5, 7]),
            msa.SequenceID('seq2'): make_sequence('seq2', '2', [3, 4, 6, 8]),
            msa.SequenceID('seq3'): make_sequence('seq3', '2'),
        }
        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        # Only the first element of the returned pair is checked.
        actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                       self.fasta_provider,
                                                       self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
# Example no. 13
# 0
def _complement_sequence_middles_if_needed(build_state: _BuildState,
                                           block: Block, edge: Arc, seq,
                                           last_node_id: graph.NodeID):
    """Fill the gap of a sequence between the two blocks joined by ``edge``.

    When ``_complementation_not_needed`` holds for the two block infos,
    nothing is added. Otherwise a node is created for every position
    between the end of the block that starts earlier and the start of the
    other one, and each node is added to the sequence.

    Args:
        build_state: State whose ``nodes`` list may be extended.
        block: Block the edge leaves from (its ``id`` is used).
        edge: Edge being processed; ``edge_type`` drives the joining.
        seq: Indexable whose first element provides ``seq_id``.
        last_node_id: Node the first new node may be joined with.

    Returns:
        Without complementation: ``last_node_id`` if the edge type is
        ``(1, -1)``, else ``None``. With complementation: the id of the last
        node created (or the maximum node id found at entry if none was
        created) when ``_should_join_with_next_node`` holds, else ``None``.
    """
    seq_id = msa.SequenceID(seq[0].seq_id)
    left_block_sinfo, right_block_sinfo = _get_edge_sinfos(
        seqs_info=build_state.seqs_info,
        from_block_id=block.id,
        edge=edge,
        seq_id=seq_id)

    if _complementation_not_needed(left_block_sinfo, right_block_sinfo):
        return last_node_id if edge.edge_type == (1, -1) else None

    node_id = _get_max_node_id(build_state.nodes)
    column_id = build_state.column_id

    # The gap spans from the end of the earlier-starting block to the start
    # of the other one.
    if left_block_sinfo.start < right_block_sinfo.start:
        earlier, later = left_block_sinfo, right_block_sinfo
    else:
        earlier, later = right_block_sinfo, left_block_sinfo
    last_pos = earlier.start + earlier.size - 1
    next_pos = later.start

    previous_node_id = None
    if _should_join_with_last_node(edge.edge_type):
        previous_node_id = last_node_id

    for position in range(last_pos + 1, next_pos):
        column_id += 1
        node_id += 1
        base = _get_missing_nucleotide(build_state.fasta_provider,
                                       seq_id,
                                       position)
        new_node = graph.Node(node_id=node_id,
                              base=base,
                              aligned_to=None,
                              column_id=column_id,
                              block_id=None)
        build_state.nodes += [new_node]
        _add_node_to_sequence(build_state,
                              seq_id=seq_id,
                              join_with=previous_node_id,
                              node_id=node_id)
        previous_node_id = node_id

    return node_id if _should_join_with_next_node(edge.edge_type) else None
# Example no. 14
# 0
def _process_block(build_state: _BuildState, block: DAGMaf.DAGMafNode):
    """Convert one alignment block into poagraph nodes and sequence steps.

    For every column of ``block.alignment`` one node is created per code
    returned by ``_get_column_nucleotides_sorted_codes``, and the node is
    added to each sequence carrying that code in the column. Afterwards
    the block's outgoing edges and endings are handled by
    ``_add_block_out_edges_to_free_edges`` and ``_manage_endings``.

    Args:
        build_state: State that is mutated (``nodes``, ``column_id`` and
            whatever the called helpers update).
        block: Block providing ``alignment`` and ``id``.
    """
    node_id = _get_max_node_id(build_state.nodes)
    # The first record's length is taken as the block width.
    columns_count = len(block.alignment[0].seq)
    paths_join_info = _get_paths_join_info(block, build_state.free_edges)

    build_state.column_id = _get_max_column_id(build_state.nodes)
    for col in range(columns_count):
        build_state.column_id += 1
        nucleotide_by_maf_seq = {}
        for record in block.alignment:
            nucleotide_by_maf_seq[MafSequenceID(record.id)] = record[col]

        column_codes = _get_column_nucleotides_sorted_codes(
            nucleotide_by_maf_seq)
        # Ids that the nodes of this column are about to receive.
        column_nodes_ids = [
            node_id + offset + 1 for offset, _ in enumerate(column_codes)
        ]
        for position, code in enumerate(column_codes):
            node_id += 1
            carriers = [
                maf_seq_id
                for maf_seq_id, nucleotide in nucleotide_by_maf_seq.items()
                if nucleotide == code
            ]
            aligned_to = _get_next_aligned_node_id(position, column_nodes_ids)
            new_node = graph.Node(node_id=node_id,
                                  base=graph.Base(code),
                                  aligned_to=aligned_to,
                                  column_id=build_state.column_id,
                                  block_id=block.id)
            build_state.nodes += [new_node]

            for maf_seq_id in carriers:
                seq_id = msa.SequenceID(maf_seq_id)
                _add_node_to_sequence(build_state, seq_id,
                                      paths_join_info[seq_id], node_id)
                # The next node of this sequence joins with this one.
                paths_join_info[seq_id] = node_id

    _add_block_out_edges_to_free_edges(build_state, block, paths_join_info)
    _manage_endings(build_state, block, paths_join_info)
# Example no. 15
# 0
def _get_poagraph_paths_and_nodes(po_lines: List[str],
                                  sequences_info: Dict[int, POSequenceInfo],
                                  sequences: Dict[msa.SequenceID, graph.Sequence]) -> \
        Tuple[List[graph.Node], Dict[msa.SequenceID, graph.Sequence]]:
    """Build the node list and fill sequence paths from PO file lines.

    The node count is read from ``po_lines[3]`` and the path count from
    ``po_lines[4]``; node lines start at index ``5 + 2 * paths_count``.
    Nodes receive consecutive ids starting at 0, in file order.

    Args:
        po_lines: lines of the PO file.
        sequences_info: maps the sequence ids used in node lines to objects
            whose ``name`` is a key of ``sequences``.
        sequences: sequences to extend; modified in place.

    Returns:
        The list of created nodes and the same ``sequences`` mapping.
    """
    nodes_count = int(_extract_line_value(po_lines[3]))
    paths_count = int(_extract_line_value(po_lines[4]))
    first_node_line = 5 + paths_count * 2
    nodes: List[graph.Node] = [None] * nodes_count
    for index in range(nodes_count):
        node_line = po_lines[first_node_line + index]
        node_id = graph.NodeID(index)
        base = graph.Base(node_line[0].upper())
        _in_nodes, po_sequences_ids, aligned_to = _extract_node_parameters(node_line)
        owner_names = [sequences_info[po_id].name for po_id in po_sequences_ids]
        nodes[index] = graph.Node(node_id, base, graph.NodeID(aligned_to))
        for name in owner_names:
            paths = sequences[name].paths
            if len(paths) == 1:
                # Exactly one path so far: extend it.
                paths[0].append(node_id)
            else:
                # Otherwise start a new single-node path.
                paths.append(graph.SeqPath([node_id]))
    return nodes, sequences
    def test_1_typical_poagraph(self):
        """Read test_1.po and compare the result with a hand-built poagraph."""
        po_path = self.po_files_dir.joinpath("test_1.po")

        # (base, aligned_to) for nodes 0..17; None means no aligned node.
        nodes_spec = [
            ('A', 1), ('G', 0),
            ('C', 3), ('G', 2),
            ('A', 5), ('T', 4),
            ('G', None),
            ('G', None),
            ('A', 9), ('C', 10), ('G', 11), ('T', 8),
            ('A', 13), ('C', 12),
            ('T', None),
            ('A', 16), ('C', 17), ('G', 15),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=bid(base),
                       aligned_to=None if aligned is None else nid(aligned))
            for index, (base, aligned) in enumerate(nodes_spec)
        ]

        # (sequence name, metadata group, node ids of its single path)
        sequences_spec = [
            ('seq0', '1', [0, 2, 4, 6, 7, 8, 12, 14, 16]),
            ('seq1', '1', [1, 2, 5, 6, 7, 9]),
            ('seq2', '2', [3, 4, 6, 7, 10, 12, 14, 17]),
            ('seq3', '2', [11, 13, 14, 15]),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([*map(nid, path)])],
                graph.SequenceMetadata({'group': group}))
            for name, group, path in sequences_spec
        }

        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)
        po_input = msa.Po(pathtools.get_file_content_stringio(po_path),
                          po_path)
        nodes, sequences = po2poagraph.get_poagraph(po_input,
                                                    self.metadatacsv)
        actual_poagraph = graph.Poagraph(nodes, sequences)
        self.assertEqual(expected_poagraph, actual_poagraph)
    def test_1_typical_poagraph(self):
        """Convert a hand-built poagraph to PO text and compare with test_1.po."""
        expected_po_content_path = self.po_files_dir.joinpath("test_1.po")

        # (base, aligned_to) for nodes 0..17; None means no aligned node.
        nodes_spec = [
            ('A', 1), ('G', 0),
            ('C', 3), ('G', 2),
            ('A', 5), ('T', 4),
            ('G', None),
            ('G', None),
            ('A', 9), ('C', 10), ('G', 11), ('T', 8),
            ('A', 13), ('C', 12),
            ('T', None),
            ('A', 16), ('C', 17), ('G', 15),
        ]
        poagraph_nodes = [
            graph.Node(node_id=nid(index),
                       base=bid(base),
                       aligned_to=None if aligned is None else nid(aligned))
            for index, (base, aligned) in enumerate(nodes_spec)
        ]

        # Sequence name -> node ids of its single path; every sequence is
        # in group '1'.
        paths_spec = [
            ('seq0', [0, 2, 4, 6, 7, 8, 12, 14, 16]),
            ('seq1', [1, 2, 5, 6, 7, 9]),
            ('seq2', [3, 4, 6, 7, 10, 12, 14, 17]),
            ('seq3', [11, 13, 14, 15]),
        ]
        poagraph_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([*map(nid, path)])],
                graph.SequenceMetadata({'group': '1'}))
            for name, path in paths_spec
        }

        poagraph = graph.Poagraph(poagraph_nodes, poagraph_sequences)

        actual_po_content = po.poagraph_to_PangenomePO(poagraph)
        expected_po_content = pathtools.get_file_content(
            expected_po_content_path)
        self.assertEqual(expected_po_content, actual_po_content)
    def setUp(self):
        """Build the 15-node, 4-sequence poagraph stored as ``self.poagraph``."""
        # (base, aligned_to) for nodes 0..14; None means no aligned node.
        nodes_spec = [
            ('T', None),
            ('A', 2), ('G', 1),
            ('A', 4), ('C', 3),
            ('A', 6), ('C', 7), ('G', 8), ('T', 5),
            ('A', None),
            ('C', 11), ('T', 10),
            ('G', None),
            ('A', 14), ('C', 13),
        ]
        nodes = [
            graph.Node(node_id=nid(index),
                       base=b(base),
                       aligned_to=None if aligned is None else nid(aligned))
            for index, (base, aligned) in enumerate(nodes_spec)
        ]

        # Sequence name -> node ids of its single path; every sequence is
        # in group '1'.
        paths_spec = [
            ('seq0', [0, 1, 3, 5, 9, 10, 13]),
            ('seq1', [1, 3, 6, 9, 11]),
            ('seq2', [2, 4, 7, 9, 11, 12]),
            ('seq3', [2, 4, 8, 9, 11, 12, 14]),
        ]
        sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([*map(nid, path)])],
                graph.SequenceMetadata({'group': '1'}))
            for name, path in paths_spec
        }

        self.poagraph = graph.Poagraph(nodes, sequences)
    def test_09_inactive_edges_but_all_strands_plus(self):
        """Build a poagraph from the test_9 MAF file and compare it with the expected one."""
        maf_path = self.maf_files_dir.joinpath("test_9_inactive_edges_but_all_strands_plus.maf")

        # Twenty unaligned nodes: the bases A, C, T, G, G repeated four times.
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=graph.Base(base),
                       aligned_to=None)
            for index, base in enumerate("ACTGG" * 4)
        ]

        # (sequence name, metadata group, list of paths as node ids)
        sequences_spec = [
            ('seq0', '1', []),
            ('seq1', '1', [[0, 1, 2, 3, 4, 10, 11, 12, 13, 14],
                           [5, 6, 7, 8, 9, 15, 16, 17, 18, 19]]),
            ('seq2', '2', [list(range(20))]),
            ('seq3', '2', []),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([*map(nid, path)]) for path in paths],
                graph.SequenceMetadata({'group': group}))
            for name, group, paths in sequences_spec
        }

        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)
        maf_input = msa.Maf(pathtools.get_file_content_stringio(maf_path),
                            maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(maf_input,
                                                       self.fasta_provider,
                                                       self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
    def test_10_parallel_blocks_1st_and_2nd_merge_into_3rd(self):
        """Build a poagraph from the test_10 MAF file and compare it with the expected one."""
        maf_path = self.maf_files_dir.joinpath("test_10_parallel_blocks_1st_and_2nd_merge_into_3rd.maf")

        # (base, aligned_to) for nodes 0..22; None means no aligned node.
        nodes_spec = [
            ('G', 1), ('T', 0),
            ('T', None),
            ('A', None),
            ('C', 5), ('G', 4),
            ('C', None),

            ('A', None), ('C', None), ('T', None), ('G', None), ('G', None),

            ('C', 13), ('G', 12),
            ('C', 15), ('G', 16), ('T', 14),
            ('A', 18), ('T', 17),
            ('A', 20), ('C', 19),
            ('C', 22), ('G', 21),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=graph.Base(base),
                       aligned_to=None if aligned is None else nid(aligned))
            for index, (base, aligned) in enumerate(nodes_spec)
        ]

        # (sequence name, metadata group, node ids of its single path)
        sequences_spec = [
            ('seq0', '1', [7, 8, 9, 10, 11, 12, 15, 18, 19, 21]),
            ('seq1', '1', [7, 8, 9, 10, 11, 12, 15, 18, 19, 21]),
            ('seq2', '2', [0, 2, 3, 4, 6, 13, 16, 17, 20, 21]),
            ('seq3', '2', [1, 2, 3, 5, 6, 13, 14, 17, 20, 22]),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([*map(nid, path)])],
                graph.SequenceMetadata({'group': group}))
            for name, group, path in sequences_spec
        }

        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)
        maf_input = msa.Maf(pathtools.get_file_content_stringio(maf_path),
                            maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(maf_input,
                                                       self.fasta_provider,
                                                       self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
    def test_1_messy_sequences(self):
        """Convert test_1_messy_sequences.maf and check the nodes and sequences produced."""
        maf_path = self.maf_files_dir.joinpath("test_1_messy_sequences.maf")

        # (base, aligned_to, block id) for nodes 0..14; None means no
        # aligned node.
        nodes_spec = [
            ('A', None, 0),
            ('A', 2, 0), ('C', 1, 0),
            ('T', None, 0),
            ('C', 5, 0), ('G', 4, 0),

            ('A', None, 1), ('C', None, 1), ('G', None, 1),

            ('C', 10, 2), ('G', 9, 2),
            ('T', None, 2), ('C', None, 2), ('C', None, 2), ('A', None, 2),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=graph.Base(base),
                       aligned_to=None if aligned is None else nid(aligned),
                       block_id=bid(block))
            for index, (base, aligned, block) in enumerate(nodes_spec)
        ]

        # (sequence name, metadata group, list of paths as node ids)
        sequences_spec = [
            ('seq0', '1', [[1, 3, 4, 6, 8, 9, 11, 12]]),
            ('seq1', '1', [[2, 3, 4, 10, 11, 12, 13, 14]]),
            ('seq2', '2', [[0, 2, 5, 6, 7, 10, 11, 12, 14]]),
            ('seq3', '2', []),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([*map(nid, path)]) for path in paths],
                graph.SequenceMetadata({'group': group}))
            for name, group, paths in sequences_spec
        }

        maf_input = msa.Maf(pathtools.get_file_content_stringio(maf_path),
                            maf_path)
        actual_nodes, actual_sequences = maf2poagraph.get_poagraph(
            maf_input, self.metadatacsv)

        self.assertEqual(expected_nodes, actual_nodes)
        self.assertEqual(expected_sequences, actual_sequences)
    def test_06_1st_block_separates_into_2_branches_which_connect_in_3rd_block(self):
        """Build a poagraph from the test_6 branching MAF file and compare it with the expected one."""
        maf_path = self.maf_files_dir.joinpath(
                        "test_6_1st_block_separates_into_2_branches_which_connect_in_3rd_block.maf")

        # (base, aligned_to) for nodes 0..10; None means no aligned node.
        nodes_spec = [
            ('A', 1), ('C', 2), ('G', 0),
            ('C', None),
            ('A', 5), ('T', 4),

            ('G', None), ('G', None),

            ('C', 9), ('G', 10), ('T', 8),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=graph.Base(base),
                       aligned_to=None if aligned is None else nid(aligned))
            for index, (base, aligned) in enumerate(nodes_spec)
        ]

        # (sequence name, metadata group, list of paths as node ids)
        sequences_spec = [
            ('seq0', '1', [[0, 3, 4, 8]]),
            ('seq1', '1', [[1, 3, 5, 6, 7, 9]]),
            ('seq2', '2', [[2, 3, 5, 10]]),
            ('seq3', '2', []),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([*map(nid, path)]) for path in paths],
                graph.SequenceMetadata({'group': group}))
            for name, group, paths in sequences_spec
        }

        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)
        maf_input = msa.Maf(pathtools.get_file_content_stringio(maf_path),
                            maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(maf_input,
                                                       self.fasta_provider,
                                                       self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
Esempio n. 23
0
    def test_7_missing_one_reverted_sequence_middle_minus1_minus1(self):
        """Build a poagraph from the test_7 MAF file with ``self.fasta_provider`` and compare it with the expected one."""
        maf_path = self.maf_files_dir.joinpath(
            "test_7_missing_one_reverted_sequence_middle_minus1_minus1.maf")

        # (base, aligned_to) for nodes 0..11; None means no aligned node.
        nodes_spec = [
            # block 0
            ('A', None), ('C', None), ('T', None), ('A', None),

            # missing seq2
            ('C', None), ('A', None),

            # block 1
            ('A', 7), ('G', 6),
            ('C', 9), ('G', 8),
            ('C', 11), ('T', 10),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=graph.Base(base),
                       aligned_to=None if aligned is None else nid(aligned))
            for index, (base, aligned) in enumerate(nodes_spec)
        ]

        # (sequence name, metadata group, list of paths as node ids)
        sequences_spec = [
            ('seq0', '1', []),
            ('seq1', '1', [[0, 1, 2, 3, 7, 9, 11]]),
            ('seq2', '2', [[0, 1, 4, 5, 6, 8, 10]]),
            ('seq3', '2', []),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([*map(nid, path)]) for path in paths],
                graph.SequenceMetadata({'group': group}))
            for name, group, paths in sequences_spec
        }

        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)
        maf_input = msa.Maf(pathtools.get_file_content_stringio(maf_path),
                            maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(maf_input,
                                                       self.fasta_provider,
                                                       self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
    def test_6_missing_one_reverted_sequence_middle_minus1_1(self):
        """Build a poagraph from the test_6 MAF file with a constant missing base and compare it with the expected one."""
        maf_path = self.maf_files_dir.joinpath(
            "test_6_missing_one_reverted_sequence_middle_minus1_1.maf")

        missing = self.missing_n.value
        # (base, aligned_to) for nodes 0..11; None means no aligned node.
        nodes_spec = [
            # block 1 because it is first in DAG and reverted
            ('A', None), ('C', None),
            ('C', 3), ('T', 2),

            # missing seq2, on edge (-1,1)
            (missing, None), (missing, None),
            ('A', 7), ('C', 6),
            ('C', None), ('T', None),
            ('A', 11), ('C', 10),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=graph.Base(base),
                       aligned_to=None if aligned is None else nid(aligned))
            for index, (base, aligned) in enumerate(nodes_spec)
        ]

        # (sequence name, metadata group, list of paths as node ids)
        sequences_spec = [
            ('seq0', '1', []),
            ('seq1', '1', [[0, 1, 2], [6, 8, 9, 10]]),
            ('seq2', '2', [[0, 1, 3, 4, 5, 7, 11]]),
            ('seq3', '2', []),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([*map(nid, path)]) for path in paths],
                graph.SequenceMetadata({'group': group}))
            for name, group, paths in sequences_spec
        }

        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)
        maf_input = msa.Maf(pathtools.get_file_content_stringio(maf_path),
                            maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(
            maf_input,
            missings.ConstBaseProvider(self.missing_n),
            self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
    def test_2_missing_sequence_end(self):
        """Build a poagraph from test_2_missing_sequence_end.maf with a constant missing base and compare it with the expected one."""
        maf_path = self.maf_files_dir.joinpath(
            "test_2_missing_sequence_end.maf")

        missing = self.missing_n.value
        # (base, aligned_to) for nodes 0..12; None means no aligned node.
        nodes_spec = [
            ('A', 1), ('G', 0),
            ('C', 3), ('G', 2),
            ('T', None),
            ('A', 6), ('C', 5),
            ('A', None), ('G', None), ('G', None), ('T', None),
            (missing, None), (missing, None),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=graph.Base(base),
                       aligned_to=None if aligned is None else nid(aligned))
            for index, (base, aligned) in enumerate(nodes_spec)
        ]

        # (sequence name, metadata group, list of paths as node ids)
        sequences_spec = [
            ('seq0', '1', []),
            ('seq1', '1', [[0, 2, 4, 5, 8, 9, 10]]),
            ('seq2', '2', [[1, 3, 4, 6, 7, 11, 12]]),
            ('seq3', '2', []),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([*map(nid, path)]) for path in paths],
                graph.SequenceMetadata({'group': group}))
            for name, group, paths in sequences_spec
        }

        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)
        maf_input = msa.Maf(pathtools.get_file_content_stringio(maf_path),
                            maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(
            maf_input,
            missings.ConstBaseProvider(self.missing_n),
            self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)