def get_ebola_consensus_tree(p: float, stop: float, output_dir_name: str) -> Tuple[Poagraph, AffinityTree]:
    """Build the Ebola poagraph and compute an affinity tree for it.

    Input files (MAF multialignment, metadata CSV, blosum matrix) are located
    relative to the directory of this module. The tree builder receives the
    "consensus" child directory of ``output_dir_name``.

    :param p: value wrapped in ``P`` before being handed to the tree builder
    :param stop: value wrapped in ``Stop`` before being handed to the tree builder
    :param output_dir_name: name of the output directory, taken as a child of
        this module's directory
    :return: tuple of the poagraph and the result of
        ``atree_builders.get_affinity_tree``
    """
    module_dir = Path(os.path.abspath(__file__)).resolve().parent
    results_dir = pathtools.get_child_dir(module_dir, output_dir_name)
    consensus_dir = pathtools.get_child_dir(results_dir, "consensus")
    maf_file = module_dir.joinpath("../data/Ebola/genome_whole/input/multialignment.maf")
    csv_file = module_dir.joinpath("../data/Ebola/genome_whole/input/metadata.csv")
    blosum_file = module_dir.joinpath("../bin/blosum80.mat")

    ncbi_provider = fp_ncbi.FromNCBI(use_cache=True)

    maf = Maf(file_content=pathtools.get_file_content_stringio(maf_file),
              filename=maf_file)
    seq_metadata = MetadataCSV(filecontent=pathtools.get_file_content_stringio(csv_file),
                               filename=csv_file)

    # Only the poagraph is needed here; the second returned value is ignored.
    poagraph, _ = Poagraph.build_from_dagmaf(maf, ncbi_provider, seq_metadata)

    blosum = Blosum(pathtools.get_file_content_stringio(path=blosum_file),
                    blosum_file)

    tree = atree_builders.get_affinity_tree(poagraph, blosum, consensus_dir,
                                            Stop(stop), P(p), False)
    return poagraph, tree
def get_ebola_affinity_tree(
        p: float, stop: float,
        output_dir_name: str) -> Tuple[Poagraph, AffinityTree]:
    """Build the Ebola poagraph and compute an affinity tree for it.

    Input files are located relative to the directory of this module. The
    tree builder receives the "consensus" child directory of
    ``output_dir_name``.

    :param p: value wrapped in ``P`` before being handed to the tree builder
    :param stop: value wrapped in ``Stop`` before being handed to the tree
        builder
    :param output_dir_name: name of the output directory, taken as a child
        of this module's directory
    :return: tuple of the poagraph and the result of
        ``atree_builders.get_affinity_tree``
    """
    module_dir = Path(os.path.abspath(__file__)).resolve().parent
    results_dir = pathtools.get_child_dir(module_dir, output_dir_name)
    consensus_dir = pathtools.get_child_dir(results_dir, "consensus")
    maf_file = module_dir.joinpath("../data/Ebola/multialignment.maf")
    csv_file = module_dir.joinpath("../data/Ebola/metadata.csv")
    blosum_file = module_dir.joinpath("../bin/blosum80.mat")

    # The parameters object is constructed but not read again in this function.
    task_parameters = TaskParameters(
        running_time="",
        multialignment_file_path=maf_file,
        multialignment_format="MAF",
        datatype="N",
        metadata_file_path=csv_file,
        blosum_file_path=blosum_file,
        output_path=results_dir,
        output_po=False,
        output_fasta=False,
        output_with_nodes=False,
        verbose=False,
        raw_maf=False,
        fasta_provider='FromNCBI',
        cache=True,
        missing_base_symbol="",
        fasta_source_file=None,
        consensus_type="",
        hbmin=0.8,
        stop=stop,
        p=p,
    )

    ncbi_provider = fp_ncbi.FromNCBI(use_cache=True)

    maf = Maf(file_content=pathtools.get_file_content_stringio(maf_file),
              filename=maf_file)
    seq_metadata = MetadataCSV(
        filecontent=pathtools.get_file_content_stringio(csv_file),
        filename=csv_file)

    # Only the poagraph is needed here; the second returned value is ignored.
    poagraph, _ = Poagraph.build_from_dagmaf(maf, ncbi_provider, seq_metadata)

    blosum = Blosum(pathtools.get_file_content_stringio(path=blosum_file),
                    blosum_file)

    tree = atree_builders.get_affinity_tree(poagraph, blosum, consensus_dir,
                                            Stop(stop), P(p), False)
    return poagraph, tree
Example #3
0
def get_ebola_consensus_tree(p: float, stop: float, output_dir_name: str) -> Tuple[Poagraph, ConsensusTree]:
    """Build the Ebola poagraph and compute a consensus tree for it.

    Input files are located relative to the directory of this module. The
    tree generator receives the "consensus" child directory of
    ``output_dir_name`` and is called with ``MAX2()`` and ``NODE3()``.

    :param p: value wrapped in ``cinp.P`` before being handed to the tree generator
    :param stop: value wrapped in ``cinp.Stop`` before being handed to the tree generator
    :param output_dir_name: name of the output directory, taken as a child of
        this module's directory
    :return: tuple of the poagraph and the result of
        ``tree_generator.get_consensus_tree``
    """
    module_dir = Path(os.path.abspath(__file__)).resolve().parent
    results_dir = pathtools.get_child_dir(module_dir, output_dir_name)
    consensus_dir = pathtools.get_child_dir(results_dir, "consensus")
    maf_file = module_dir.joinpath("../data/Ebola/genome_whole/input/multialignment.maf")
    csv_file = module_dir.joinpath("../data/Ebola/genome_whole/input/metadata.csv")
    blosum_file = module_dir.joinpath("../bin/blosum80.mat")

    # The parameters object is constructed but not read again in this function.
    task_parameters = TaskParameters(
        running_time="",
        multialignment_file_path=maf_file,
        multialignment_format="MAF",
        datatype="N",
        metadata_file_path=csv_file,
        blosum_file_path=blosum_file,
        output_path=results_dir,
        output_po=False,
        output_fasta=False,
        output_with_nodes=False,
        verbose=False,
        raw_maf=False,
        fasta_provider='FromNCBI',
        cache=True,
        missing_base_symbol="",
        fasta_source_file=None,
        consensus_type="",
        hbmin=0.8,
        max_cutoff_option="MAX2",
        search_range=None,
        node_cutoff_option="NODE3",
        multiplier=None,
        stop=stop,
        p=p,
    )

    ncbi_provider = fp_ncbi.FromNCBI(use_cache=True)

    maf = inp.Maf(file_content=pathtools.get_file_content_stringio(maf_file),
                  filename=maf_file)
    seq_metadata = inp.MetadataCSV(filecontent=pathtools.get_file_content_stringio(csv_file),
                                   filename=csv_file)

    # Only the poagraph is needed here; the second returned value is ignored.
    poagraph, _ = Poagraph.build_from_dagmaf(maf, ncbi_provider, seq_metadata)

    blosum = cinp.Blosum(pathtools.get_file_content_stringio(path=blosum_file),
                         blosum_file)

    tree = tree_generator.get_consensus_tree(poagraph, blosum, consensus_dir,
                                             cinp.Stop(stop), cinp.P(p),
                                             MAX2(), NODE3(), False)
    return poagraph, tree
    def test_7_missing_one_reverted_sequence_middle_minus1_minus1(self):
        """Compare the poagraph built from the test_7 MAF with the expectation.

        The graph is built with a ``ConstBaseProvider`` wrapping
        ``self.missing_n``; nodes 4 and 5 carry ``self.missing_n.value``.
        """
        maf_name = "test_7_missing_one_reverted_sequence_middle_minus1_minus1.maf"
        maf_path = self.maf_files_dir.joinpath(maf_name)

        missing_symbol = self.missing_n.value
        # (node id, base, id of the aligned node or None)
        node_specs = [
            # block 0
            (0, 'A', None), (1, 'C', None), (2, 'T', None), (3, 'A', None),
            # missing seq2
            (4, missing_symbol, None), (5, missing_symbol, None),
            # block 1
            (6, 'A', 7), (7, 'G', 6),
            (8, 'C', 9), (9, 'G', 8),
            (10, 'C', 11), (11, 'T', 10),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=graph.Base(base),
                       aligned_to=None if partner is None else nid(partner))
            for index, base, partner in node_specs
        ]

        # (sequence name, list of paths as node ids, group)
        sequence_specs = [
            ('seq0', [], '1'),
            ('seq1', [[0, 1, 2, 3, 7, 9, 11]], '1'),
            ('seq2', [[0, 1, 4, 5, 6, 8, 10]], '2'),
            ('seq3', [], '2'),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([*map(nid, ids)]) for ids in paths],
                graph.SequenceMetadata({'group': group}))
            for name, paths, group in sequence_specs
        }
        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        base_provider = missings.ConstBaseProvider(self.missing_n)
        actual_poagraph, _ = builder.build_from_dagmaf(maf, base_provider,
                                                       self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
    def test_05_single_block_single_nucletodide(self):
        """Expect one node ('A', block 0) shared by the paths of all four sequences."""
        maf_name = "test_5_single_block_single_nucletodide.maf"
        maf_path = self.maf_files_dir.joinpath(maf_name)

        only_node = graph.Node(node_id=nid(0), base=graph.Base('A'),
                               aligned_to=None, block_id=bid(0))
        expected_nodes = [only_node]

        # (sequence name, group); every sequence has the single path [0].
        sequence_groups = [('seq0', '1'), ('seq1', '1'),
                           ('seq2', '2'), ('seq3', '2')]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([*map(nid, [0])])],
                graph.SequenceMetadata({'group': group}))
            for name, group in sequence_groups
        }
        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                       self.fasta_provider,
                                                       self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
 def test_2_no_seqid(self):
     """Constructing MetadataCSV from test_2_no_seqid.csv must raise.

     The exception message is expected to report the missing 'seqid' column.
     """
     csv_path = self.csv_files_dir.joinpath("test_2_no_seqid.csv")
     csv_content = pathtools.get_file_content_stringio(csv_path)
     with self.assertRaises(Exception) as err:
         _ = msa.MetadataCSV(csv_content, csv_path)
     # Plain literal: the message has no placeholders, so no f-prefix or
     # quote escaping is needed. The string value is unchanged.
     self.assertEqual("No 'seqid' column in metadata csv.",
                      str(err.exception))
    def test_3_empty_file(self):
        """Constructing MetadataCSV from test_3_empty_file.csv must raise.

        The exception message is expected to be "Empty csv file.".
        """
        csv_path = self.csv_files_dir.joinpath("test_3_empty_file.csv")

        csv_content = pathtools.get_file_content_stringio(csv_path)
        with self.assertRaises(Exception) as err:
            _ = msa.MetadataCSV(csv_content, csv_path)
        # Plain literal: the message has no placeholders, so no f-prefix.
        self.assertEqual("Empty csv file.", str(err.exception))
 def setUp(self):
     """Load the shared metadata CSV and resolve the "po_files" directory."""
     tests_dir = Path(__file__).parent
     metadata_path = tests_dir.joinpath("../seq_metadata.csv").resolve()
     metadata_content = pathtools.get_file_content_stringio(metadata_path)
     self.metadatacsv = msa.MetadataCSV(metadata_content, metadata_path)
     self.po_files_dir = tests_dir.joinpath("po_files").resolve()
    def test_04_single_block_no_nucleotides(self):
        """Expect no nodes and four sequences whose path lists are empty."""
        maf_name = "test_4_single_block_no_nucleotides.maf"
        maf_path = self.maf_files_dir.joinpath(maf_name)

        expected_nodes = []

        # (sequence name, group); no sequence has any path.
        sequence_groups = [('seq0', '1'), ('seq1', '1'),
                           ('seq2', '2'), ('seq3', '2')]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [],
                graph.SequenceMetadata({'group': group}))
            for name, group in sequence_groups
        }
        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                       self.fasta_provider,
                                                       self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
 def setUp(self):
     """Load the shared metadata CSV, resolve the MAF fixture directory and
     create the missing-base object used by the tests."""
     tests_dir = Path(__file__).parent
     metadata_path = tests_dir.joinpath("../seq_metadata.csv").resolve()
     metadata_content = pathtools.get_file_content_stringio(metadata_path)
     self.metadatacsv = msa.MetadataCSV(metadata_content, metadata_path)
     self.maf_files_dir = tests_dir.joinpath("maf_files_with_gaps").resolve()
     self.missing_n = missings.MissingBase()
Example #11
0
def get_default_blosum():
    """Return the default Blosum object, loaded from blosum80.mat.

    The file is looked up as "affinity_tree/bin/blosum80.mat" under the
    parent of the directory that contains this module.
    """
    package_root = Path(__file__).parent.parent
    blosum_file = pathtools.get_child_path(package_root,
                                           "affinity_tree/bin/blosum80.mat")
    return at_params.Blosum(pathtools.get_file_content_stringio(blosum_file),
                            blosum_file)
Example #12
0
 def setUp(self):
     """Load the shared metadata CSV, resolve the MAF fixture directory and
     create the fake fasta provider used by the tests."""
     tests_dir = Path(__file__).parent
     metadata_path = tests_dir.joinpath("../seq_metadata.csv").resolve()
     metadata_content = pathtools.get_file_content_stringio(metadata_path)
     self.metadatacsv = msa.MetadataCSV(metadata_content, metadata_path)
     self.maf_files_dir = tests_dir.joinpath("maf_files_with_gaps").resolve()
     provider_cls = DAGMaf2PoagraphFakeFastaProviderTests.FakeFastaProvider
     self.fasta_provider = provider_cls()
    def test_7_not_unique_seqids(self):
        """Constructing MetadataCSV from test_7_not_unique_seqids.csv must
        raise with the repeated-seqid message."""
        expected_message = "Repeated values in seqid column in metadata file. Make them unique."
        csv_path = self.csv_files_dir.joinpath("test_7_not_unique_seqids.csv")
        csv_content = pathtools.get_file_content_stringio(csv_path)

        with self.assertRaises(Exception) as err:
            _ = msa.MetadataCSV(csv_content, csv_path)

        self.assertEqual(expected_message, str(err.exception))
    def test_10_parallel_blocks_1st_and_2nd_merge_into_3rd(self):
        """Compare the poagraph built from the test_10 MAF with the expectation."""
        maf_path = self.maf_files_dir.joinpath("test_10_parallel_blocks_1st_and_2nd_merge_into_3rd.maf")

        # (node id, base, id of the aligned node or None)
        node_specs = [
            (0, 'G', 1), (1, 'T', 0),
            (2, 'T', None), (3, 'A', None),
            (4, 'C', 5), (5, 'G', 4),
            (6, 'C', None),

            (7, 'A', None), (8, 'C', None), (9, 'T', None),
            (10, 'G', None), (11, 'G', None),

            (12, 'C', 13), (13, 'G', 12),
            (14, 'C', 15), (15, 'G', 16), (16, 'T', 14),
            (17, 'A', 18), (18, 'T', 17),
            (19, 'A', 20), (20, 'C', 19),
            (21, 'C', 22), (22, 'G', 21),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=graph.Base(base),
                       aligned_to=None if partner is None else nid(partner))
            for index, base, partner in node_specs
        ]

        # (sequence name, list of paths as node ids, group)
        sequence_specs = [
            ('seq0', [[7, 8, 9, 10, 11, 12, 15, 18, 19, 21]], '1'),
            ('seq1', [[7, 8, 9, 10, 11, 12, 15, 18, 19, 21]], '1'),
            ('seq2', [[0, 2, 3, 4, 6, 13, 16, 17, 20, 21]], '2'),
            ('seq3', [[1, 2, 3, 5, 6, 13, 14, 17, 20, 22]], '2'),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([*map(nid, ids)]) for ids in paths],
                graph.SequenceMetadata({'group': group}))
            for name, paths, group in sequence_specs
        }
        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                       self.fasta_provider,
                                                       self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
    def test_09_inactive_edges_but_all_strands_plus(self):
        """Compare the poagraph built from the test_9 MAF with the expectation.

        The expected graph has 20 unaligned nodes: the bases A, C, T, G, G
        repeated four times.
        """
        maf_path = self.maf_files_dir.joinpath("test_9_inactive_edges_but_all_strands_plus.maf")

        expected_nodes = [
            graph.Node(node_id=nid(index), base=graph.Base(base), aligned_to=None)
            for index, base in enumerate('ACTGG' * 4)
        ]

        # (sequence name, list of paths as node ids, group)
        sequence_specs = [
            ('seq0', [], '1'),
            ('seq1',
             [[0, 1, 2, 3, 4, 10, 11, 12, 13, 14],
              [5, 6, 7, 8, 9, 15, 16, 17, 18, 19]],
             '1'),
            ('seq2', [list(range(20))], '2'),
            ('seq3', [], '2'),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([*map(nid, ids)]) for ids in paths],
                graph.SequenceMetadata({'group': group}))
            for name, paths, group in sequence_specs
        }
        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                       self.fasta_provider,
                                                       self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
    def test_6_incorrect_commas_number(self):
        """Constructing MetadataCSV from test_6_incorrect_commas_number.csv
        must raise with the column-count message."""
        expected_message = "CSV metadata error. Different number of columns in line 0 than in header line."
        csv_name = "test_6_incorrect_commas_number.csv"
        csv_path = self.csv_files_dir.joinpath(csv_name)
        csv_content = pathtools.get_file_content_stringio(csv_path)

        with self.assertRaises(Exception) as err:
            _ = msa.MetadataCSV(csv_content, csv_path)

        self.assertEqual(expected_message, str(err.exception))
Example #17
0
    def test_6_missing_one_reverted_sequence_middle_minus1_1(self):
        """Compare the poagraph built from the test_6 MAF with the expectation."""
        maf_name = "test_6_missing_one_reverted_sequence_middle_minus1_1.maf"
        maf_path = self.maf_files_dir.joinpath(maf_name)

        # (node id, base, id of the aligned node or None)
        node_specs = [
            # block 1 because it is first in DAG and reverted
            (0, 'A', None), (1, 'C', None),
            (2, 'C', 3), (3, 'T', 2),

            # missing seq2, on edge (-1,1)
            (4, 'T', None), (5, 'C', None),
            (6, 'A', 7), (7, 'C', 6),
            (8, 'C', None), (9, 'T', None),
            (10, 'A', 11), (11, 'C', 10),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=graph.Base(base),
                       aligned_to=None if partner is None else nid(partner))
            for index, base, partner in node_specs
        ]

        # (sequence name, list of paths as node ids, group)
        sequence_specs = [
            ('seq0', [], '1'),
            ('seq1', [[0, 1, 2], [6, 8, 9, 10]], '1'),
            ('seq2', [[0, 1, 3, 4, 5, 7, 11]], '2'),
            ('seq3', [], '2'),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([*map(nid, ids)]) for ids in paths],
                graph.SequenceMetadata({'group': group}))
            for name, paths, group in sequence_specs
        }
        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                       self.fasta_provider,
                                                       self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
Example #18
0
def run_pangtree(maf_path: Path, fasta_path: Path, output_dir: Path,
                 po_output: bool) -> None:
    """Build a poagraph from a MAF file and save an affinity tree per P value.

    Results go to a child of ``output_dir`` named by
    ``pathtools.get_current_time()``. For every value in ``p_values`` (a name
    not defined in this function; it must exist in the enclosing scope) a
    further child directory, named after the value with "." replaced by "_",
    receives "affinity_tree.newick", "pangenome.json" and, when ``po_output``
    is true, "poagraph.po".

    :param maf_path: path of the MAF multialignment file
    :param fasta_path: path handed to ``missings.FromFile``
    :param output_dir: parent directory for the run's output directory
    :param po_output: whether to also save the poagraph as a .po file
    """
    run_dir = pathtools.get_child_dir(output_dir, pathtools.get_current_time())
    print(f"Runing pangtree for maf: {maf_path} and fasta: {fasta_path} "
          f"Output in: {run_dir}, include po file: {po_output}.")

    fasta_provider = missings.FromFile(fasta_path)
    maf_content = pathtools.get_file_content_stringio(maf_path)
    maf = msa.Maf(maf_content, maf_path)
    poagraph, dagmaf = builder.build_from_dagmaf(maf, fasta_provider)

    for p in p_values:
        p_dir_name = str(p).replace(".", "_")
        current_output_dir = pathtools.get_child_dir(run_dir, p_dir_name)
        stop = at_params.Stop(0.99)
        at = at_builders.build_affinity_tree(poagraph, None,
                                             current_output_dir, stop,
                                             at_params.P(p), True)

        newick_text = at.as_newick(None, separate_leaves=True)
        newick_path = pathtools.get_child_path(current_output_dir,
                                               "affinity_tree.newick")
        pathtools.save_to_file(newick_text, newick_path)

        if po_output:
            po_text = po.poagraph_to_PangenomePO(poagraph)
            po_path = pathtools.get_child_path(current_output_dir,
                                               "poagraph.po")
            pathtools.save_to_file(po_text, po_path)

        # NOTE: `json` here must provide TaskParameters / to_PangenomeJSON /
        # to_json, so it is not the standard-library json module.
        task_params = json.TaskParameters(
            multialignment_file_path=str(maf_path),
            multialignment_format="maf",
            datatype="nucleotides",
            blosum_file_path="",
            output_path=current_output_dir,
            fasta_provider=fasta_provider,
            fasta_source_file=fasta_path,
            consensus_type="tree",
            stop=str(stop),
            p=str(p),
            output_with_nodes=False)
        pangenome = json.to_PangenomeJSON(task_parameters=task_params,
                                          poagraph=poagraph,
                                          dagmaf=dagmaf,
                                          affinity_tree=at)

        json_text = json.to_json(pangenome)
        json_path = pathtools.get_child_path(current_output_dir,
                                             "pangenome.json")
        pathtools.save_to_file(json_text, json_path)
    def test_1_typical_poagraph(self):
        """Compare the poagraph read from test_1.po with the expectation."""
        po_path = self.po_files_dir.joinpath("test_1.po")

        # (node id, base, id of the aligned node or None)
        node_specs = [
            (0, 'A', 1), (1, 'G', 0),
            (2, 'C', 3), (3, 'G', 2),
            (4, 'A', 5), (5, 'T', 4),
            (6, 'G', None), (7, 'G', None),
            (8, 'A', 9), (9, 'C', 10), (10, 'G', 11), (11, 'T', 8),
            (12, 'A', 13), (13, 'C', 12),
            (14, 'T', None),
            (15, 'A', 16), (16, 'C', 17), (17, 'G', 15),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=bid(base),
                       aligned_to=None if partner is None else nid(partner))
            for index, base, partner in node_specs
        ]

        # (sequence name, list of paths as node ids, group)
        sequence_specs = [
            ('seq0', [[0, 2, 4, 6, 7, 8, 12, 14, 16]], '1'),
            ('seq1', [[1, 2, 5, 6, 7, 9]], '1'),
            ('seq2', [[3, 4, 6, 7, 10, 12, 14, 17]], '2'),
            ('seq3', [[11, 13, 14, 15]], '2'),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([*map(nid, ids)]) for ids in paths],
                graph.SequenceMetadata({'group': group}))
            for name, paths, group in sequence_specs
        }
        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        po_file = msa.Po(pathtools.get_file_content_stringio(po_path), po_path)
        nodes, sequences = po2poagraph.get_poagraph(po_file, self.metadatacsv)
        actual_poagraph = graph.Poagraph(nodes, sequences)

        self.assertEqual(expected_poagraph, actual_poagraph)
Example #20
0
    def test_2_missing_sequence_end(self):
        """Compare the poagraph built from the test_2 MAF with the expectation."""
        maf_name = "test_2_missing_sequence_end.maf"
        maf_path = self.maf_files_dir.joinpath(maf_name)

        # (node id, base, id of the aligned node or None)
        node_specs = [
            (0, 'A', 1), (1, 'G', 0),
            (2, 'C', 3), (3, 'G', 2),
            (4, 'T', None),
            (5, 'A', 6), (6, 'C', 5),
            (7, 'A', None), (8, 'G', None), (9, 'G', None),
            (10, 'T', None), (11, 'G', None), (12, 'T', None),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=graph.Base(base),
                       aligned_to=None if partner is None else nid(partner))
            for index, base, partner in node_specs
        ]

        # (sequence name, list of paths as node ids, group)
        sequence_specs = [
            ('seq0', [], '1'),
            ('seq1', [[0, 2, 4, 5, 8, 9, 10]], '1'),
            ('seq2', [[1, 3, 4, 6, 7, 11, 12]], '2'),
            ('seq3', [], '2'),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([*map(nid, ids)]) for ids in paths],
                graph.SequenceMetadata({'group': group}))
            for name, paths, group in sequence_specs
        }
        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                       self.fasta_provider,
                                                       self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
    def test_9_get_seqids(self):
        """Check the ids returned by ``MetadataCSV.get_all_sequences_ids``
        for test_1_correct.csv."""
        metadata_path = self.csv_files_dir.joinpath("test_1_correct.csv")
        expected_seqids = [msa.SequenceID(name) for name in ('s1', 's2', 's3')]

        metadata = msa.MetadataCSV(
            pathtools.get_file_content_stringio(metadata_path), metadata_path)

        self.assertEqual(expected_seqids, metadata.get_all_sequences_ids())
    def test_2_consensuses_and_empty_sequences(self):
        """Compare the poagraph read from test_2.po by
        ``po2poagraph.get_poagraph`` with a hand-written expectation.

        The expectation holds two sequences with paths, two sequences
        without any path and two CONSENS* entries with empty metadata.
        """
        po_path = self.po_files_dir.joinpath("test_2.po")

        # One (base, aligned node id or None) entry per node;
        # the position in the list is the node id.
        node_specs = [
            ('C', 1), ('T', 0),
            ('A', 3), ('G', 2),
            ('C', None), ('T', None),
            ('A', 7), ('T', 6),
            ('G', None),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=bid(symbol),
                       aligned_to=None if partner is None else nid(partner))
            for index, (symbol, partner) in enumerate(node_specs)
        ]

        # (sequence name, list of paths given as node ids, metadata dict)
        sequence_specs = [
            ('seq0', [[0, 3, 4, 5, 6, 8]], {'group': '1'}),
            ('seq1', [[1, 2, 4, 5, 7, 8]], {'group': '1'}),
            ('seq2', [], {'group': '2'}),
            ('seq3', [], {'group': '2'}),
            ('CONSENS0', [[0, 3, 4, 5, 7, 8]], {}),
            ('CONSENS1', [[1, 2, 4, 5, 6, 8]], {}),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([nid(i) for i in path]) for path in paths],
                graph.SequenceMetadata(metadata))
            for name, paths, metadata in sequence_specs
        }
        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        po = msa.Po(pathtools.get_file_content_stringio(po_path), po_path)
        actual_nodes, actual_sequences = po2poagraph.get_poagraph(
            po, self.metadatacsv)

        self.assertEqual(expected_poagraph,
                         graph.Poagraph(actual_nodes, actual_sequences))
    def test_06_1st_block_separates_into_2_branches_which_connect_in_3rd_block(self):
        """Compare the poagraph built by ``builder.build_from_dagmaf`` from
        test_6_1st_block_separates_into_2_branches_which_connect_in_3rd_block.maf
        with a hand-written expectation."""
        maf_path = self.maf_files_dir.joinpath(
                        "test_6_1st_block_separates_into_2_branches_which_connect_in_3rd_block.maf")

        # One (base, aligned node id or None) entry per node;
        # the position in the list is the node id.
        node_specs = [
            ('A', 1), ('C', 2), ('G', 0),
            ('C', None),
            ('A', 5), ('T', 4),

            ('G', None), ('G', None),

            ('C', 9), ('G', 10), ('T', 8),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=graph.Base(symbol),
                       aligned_to=None if partner is None else nid(partner))
            for index, (symbol, partner) in enumerate(node_specs)
        ]

        # (sequence name, list of paths given as node ids, metadata group)
        sequence_specs = [
            ('seq0', [[0, 3, 4, 8]], '1'),
            ('seq1', [[1, 3, 5, 6, 7, 9]], '1'),
            ('seq2', [[2, 3, 5, 10]], '2'),
            ('seq3', [], '2'),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([nid(i) for i in path]) for path in paths],
                graph.SequenceMetadata({'group': group}))
            for name, paths, group in sequence_specs
        }
        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(
            maf, self.fasta_provider, self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
    def test_02_seq_starts_in_second_block(self):
        """Compare the poagraph built by ``builder.build_from_dagmaf`` from
        test_2_seq_starts_in_second_block.maf with a hand-written
        expectation that also pins each node's block id."""
        maf_path = self.maf_files_dir.joinpath(
                        "test_2_seq_starts_in_second_block.maf")

        # One (base, aligned node id or None, block id) entry per node;
        # the position in the list is the node id.
        node_specs = [
            ('C', None, 0), ('T', None, 0), ('G', None, 0),

            ('T', None, 1),

            ('G', 5, 2), ('T', 4, 2),
            ('A', None, 2), ('A', None, 2), ('C', None, 2),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=graph.Base(symbol),
                       aligned_to=None if partner is None else nid(partner),
                       block_id=bid(block))
            for index, (symbol, partner, block) in enumerate(node_specs)
        ]

        # (sequence name, list of paths given as node ids, metadata group)
        sequence_specs = [
            ('seq0', [[1, 2, 3]], '1'),
            ('seq1', [[0, 5, 7]], '1'),
            ('seq2', [[3, 4, 6, 8]], '2'),
            ('seq3', [], '2'),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([nid(i) for i in path]) for path in paths],
                graph.SequenceMetadata({'group': group}))
            for name, paths, group in sequence_specs
        }
        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(
            maf, self.fasta_provider, self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
    def test_08_reversed_block(self):
        """Compare the poagraph built by ``builder.build_from_dagmaf`` from
        test_8_reversed_block.maf with a hand-written expectation."""
        maf_path = self.maf_files_dir.joinpath("test_8_reversed_block.maf")

        # One (base, aligned node id or None) entry per node;
        # the position in the list is the node id.
        node_specs = [
            ('C', None), ('A', None), ('T', None),
            # next block is reversed because it was converted to dag
            ('G', None), ('G', None),
            ('A', 6), ('G', 5),
            ('A', None), ('G', None), ('T', None),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=graph.Base(symbol),
                       aligned_to=None if partner is None else nid(partner))
            for index, (symbol, partner) in enumerate(node_specs)
        ]

        # (sequence name, list of paths given as node ids, metadata group)
        sequence_specs = [
            ('seq0', [], '1'),
            ('seq1', [[0, 1, 3, 4, 5, 7, 8, 9]], '1'),
            ('seq2', [[0, 1, 2, 3, 4, 6, 7, 8, 9]], '2'),
            ('seq3', [[0, 1, 2, 3, 4, 6, 7, 9]], '2'),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([nid(i) for i in path]) for path in paths],
                graph.SequenceMetadata({'group': group}))
            for name, paths, group in sequence_specs
        }
        expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        actual_poagraph, _ = builder.build_from_dagmaf(
            maf, self.fasta_provider, self.metadatacsv)

        self.assertEqual(expected_poagraph, actual_poagraph)
    def test_4_seqid_is_last(self):
        """Check the ``metadata`` mapping that MetadataCSV exposes for
        test_4_seqid_is_last.csv."""
        metadata_path = self.csv_files_dir.joinpath("test_4_seqid_is_last.csv")

        # (sequence id, expected 'name' value, expected 'group' value)
        rows = [
            ('s1', 'sequence1', 'A'),
            ('s2', 'sequence2', 'B'),
            ('s3', 'sequence3', 'B'),
        ]
        expected_metadata = {
            msa.SequenceID(seqid): {'name': name, 'group': group}
            for seqid, name, group in rows
        }

        parsed = msa.MetadataCSV(
            pathtools.get_file_content_stringio(metadata_path), metadata_path)

        self.assertEqual(expected_metadata, parsed.metadata)
    def test_10_metadata_feed_to_alignment_from_csv(self, test_name, maf_name,
                                                    csv_name, po_name,
                                                    expected_metadata):
        """Check that every builder attaches the CSV metadata to sequences.

        The same metadata file is fed to ``build_from_dagmaf``,
        ``build_from_maf`` and ``build_from_po``; for each resulting poagraph
        the mapping ``sequence id -> seqmetadata`` must equal
        ``expected_metadata``.

        Args:
            test_name: Not used in the body; presumably a label supplied by a
                parametrizing decorator that is not visible here.
            maf_name: MAF file name inside ``self.alignment_files_dir``.
            csv_name: Metadata CSV file name inside ``self.csv_files_dir``.
            po_name: PO file name inside ``self.alignment_files_dir``.
            expected_metadata: Expected ``sequence id -> seqmetadata`` mapping.
        """
        maf_path = self.alignment_files_dir.joinpath(maf_name)
        csv_path = self.csv_files_dir.joinpath(csv_name)
        po_path = self.alignment_files_dir.joinpath(po_name)

        # Every builder call gets freshly read inputs, exactly as the
        # original code did, so no file-content object is shared between calls.
        def read_maf():
            return msa.Maf(pathtools.get_file_content_stringio(maf_path),
                           maf_path)

        def read_metadata():
            return msa.MetadataCSV(
                pathtools.get_file_content_stringio(csv_path), csv_path)

        def assert_expected_metadata(poagraph):
            actual_metadata = {
                seq_id: seq.seqmetadata
                for seq_id, seq in poagraph.sequences.items()
            }
            self.assertEqual(expected_metadata, actual_metadata)

        # build_from_dagmaf returns a pair; only the poagraph is checked.
        poagraph, _ = builder.build_from_dagmaf(read_maf(),
                                                self.fasta_provider,
                                                read_metadata())
        assert_expected_metadata(poagraph)

        poagraph = builder.build_from_maf(read_maf(), read_metadata())
        assert_expected_metadata(poagraph)

        # The Po object is labelled with the po file's own path; the original
        # code passed maf_path here, which looks like a copy-paste slip.
        poagraph = builder.build_from_po(
            msa.Po(pathtools.get_file_content_stringio(po_path), po_path),
            read_metadata())
        assert_expected_metadata(poagraph)
 def setUp(self):
     """Prepare the shared fixtures: the metadata CSV, the directory with
     the MAF files used by these tests, and the fasta provider."""
     tests_dir = Path(__file__).parent

     csv_path = (tests_dir / "../seq_metadata.csv").resolve()
     self.metadatacsv = msa.MetadataCSV(
         pathtools.get_file_content_stringio(csv_path), csv_path)

     self.maf_files_dir = (
         tests_dir / "maf_files_with_cycles_or_reversion").resolve()

     # Provider built around a MissingBase instance.
     self.fasta_provider = missings.ConstBaseProvider(missings.MissingBase())
    def test_1_messy_sequences(self):
        """Compare the nodes and sequences returned by
        ``maf2poagraph.get_poagraph`` for test_1_messy_sequences.maf with
        hand-written expectations (including each node's block id)."""
        maf_path = self.maf_files_dir.joinpath("test_1_messy_sequences.maf")

        # One (base, aligned node id or None, block id) entry per node;
        # the position in the list is the node id.
        node_specs = [
            ('A', None, 0),
            ('A', 2, 0), ('C', 1, 0),
            ('T', None, 0),
            ('C', 5, 0), ('G', 4, 0),
            ('A', None, 1), ('C', None, 1), ('G', None, 1),
            ('C', 10, 2), ('G', 9, 2),
            ('T', None, 2), ('C', None, 2), ('C', None, 2), ('A', None, 2),
        ]
        expected_nodes = [
            graph.Node(node_id=nid(index),
                       base=graph.Base(symbol),
                       aligned_to=None if partner is None else nid(partner),
                       block_id=bid(block))
            for index, (symbol, partner, block) in enumerate(node_specs)
        ]

        # (sequence name, list of paths given as node ids, metadata group)
        sequence_specs = [
            ('seq0', [[1, 3, 4, 6, 8, 9, 11, 12]], '1'),
            ('seq1', [[2, 3, 4, 10, 11, 12, 13, 14]], '1'),
            ('seq2', [[0, 2, 5, 6, 7, 10, 11, 12, 14]], '2'),
            ('seq3', [], '2'),
        ]
        expected_sequences = {
            msa.SequenceID(name): graph.Sequence(
                msa.SequenceID(name),
                [graph.SeqPath([nid(i) for i in path]) for path in paths],
                graph.SequenceMetadata({'group': group}))
            for name, paths, group in sequence_specs
        }

        maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
        actual_nodes, actual_sequences = maf2poagraph.get_poagraph(
            maf, self.metadatacsv)

        self.assertEqual(expected_nodes, actual_nodes)
        self.assertEqual(expected_sequences, actual_sequences)
Example #30
0
def get_default_blosum():
    """Return a ``Blosum`` built from the default matrix file, blosum80.mat.

    The file is located with ``pathtools.get_child_path`` using this
    module's directory and the relative part "../../bin/blosum80.mat".
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    blosum_path = pathtools.get_child_path(Path(module_dir + '/'),
                                           "../../bin/blosum80.mat")
    return Blosum(pathtools.get_file_content_stringio(blosum_path),
                  blosum_path)