Example #1
def plot_tree(newick_in_fn, out_plot_fn, attribute_name):
    tree = pro.load_nhx_tree(newick_in_fn)

    ts = ete3.TreeStyle()
    ts.show_leaf_name = False

    def my_layout(node):
        name = getattr(node, attribute_name)

        try:
            kmer_full = locale.format_string("%d", int(node.kmers_full), grouping=True)
        except AttributeError:
            kmer_full = None

        try:
            kmer_reduced = locale.format_string("%d", int(node.kmers_reduced), grouping=True)
        except AttributeError:
            kmer_reduced = None

        if kmer_full is None:
            if kmer_reduced is None:
                t = name
            else:
                t = "{} [red. {}]".format(name, kmer_reduced)
        else:
            if kmer_reduced is None:
                t = "{} [full {}]".format(name, kmer_full)
            else:
                t = "{} [full {} & red. {}]".format(name, kmer_full, kmer_reduced)

        f = ete3.TextFace(t, tight_text=True)
        ete3.add_face_to_node(f, node, column=0, position="branch-right")

    ts.layout_fn = my_layout
    tree.render(out_plot_fn, tree_style=ts)
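The helper pro.load_nhx_tree (and the rest of the pro module) is not reproduced on this page; judging by the calls in these examples it is presumably a thin wrapper around ete3's Newick/NHX parser. A minimal stand-in under that assumption (format 1 allows internal node names; the validate flag is accepted only to mirror the calls above):

import ete3

def load_nhx_tree(newick_fn, validate=True):
    # Assumption: ete3 parses [&&NHX:...] blocks into node features automatically,
    # so attributes such as node.kmers_full become available after loading.
    return ete3.Tree(newick_fn, format=1)

With such a wrapper, the plotting function above could be called as plot_tree("tree.nw", "tree.pdf", "name"); ete3's render picks the output format (PDF, SVG, PNG) from the file extension.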
Example #2
def main():
    parser = argparse.ArgumentParser(description='Verify a Newick/NHX tree')

    parser.add_argument(
        'tree',
        metavar='<tree.nw>',
        type=str,
        nargs='+',
        help='phylogenetic tree (in Newick/NHX)',
    )

    args = parser.parse_args()
    tree_fns = args.tree

    ok = True

    for tree_fn in tree_fns:
        print("Validating '{}'".format(tree_fn))
        tree = pro.load_nhx_tree(tree_fn, validate=False)
        r = pro.validate_prophyle_nhx_tree(tree,
                                           verbose=True,
                                           throw_exceptions=False,
                                           output_fo=sys.stdout)
        if r:
            print("   ...OK")
        else:
            ok = False
        print()

    sys.exit(0 if ok else 1)
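pro.validate_prophyle_nhx_tree is part of ProPhyle's own library and is not shown here. Purely as an illustration of the kind of structural check such a validator could run (an assumption, not ProPhyle's actual rules), one might verify that every node carries a unique, non-empty name:

import sys

def check_unique_node_names(tree, output_fo=sys.stderr):
    # Hypothetical check, for illustration only: names must be unique and non-empty.
    seen = set()
    ok = True
    for node in tree.traverse():
        if not node.name:
            print("Unnamed node found", file=output_fo)
            ok = False
        elif node.name in seen:
            print("Duplicate node name: '{}'".format(node.name), file=output_fo)
            ok = False
        seen.add(node.name)
    return ok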
Example #3
def _test_tree(fn):
    """Test if given tree is valid for ProPhyle.

    Args:
        fn (str): Newick/NHX tree.
    """
    tree = pro.load_nhx_tree(fn, validate=False)
    if not pro.validate_prophyle_nhx_tree(
            tree, verbose=True, throw_exceptions=False, output_fo=sys.stderr):
        error("The tree '{}' could not be properly parsed.".format(fn))
Example #4
def _test_tree(fn):
    """Test if given tree is valid for ProPhyle.

    Args:
        fn (str): Newick/NHX tree.

    Raises:
        AssertionError: The tree is not valid.
    """
    tree = pro.load_nhx_tree(fn, validate=False)
    assert pro.validate_prophyle_nhx_tree(tree, verbose=True, throw_exceptions=False, output_fo=sys.stderr)
Example #5
    def __init__(self, tree_newick_fn, index_dir, library_dir, makefile_fn):
        """Init the class.

        Args:
            tree_newick_fn (str): Tree file name.
            index_dir (str): Directory of the index.
            library_dir (str): Directory with FASTA files.
            makefile_fn (str): Output Makefile.
        """

        self.tree_newick_fn = tree_newick_fn
        tree = pro.load_nhx_tree(tree_newick_fn)
        self.tree = pro.minimal_subtree(tree)
        self.newick_dir = os.path.dirname(tree_newick_fn)
        self.index_dir = index_dir
        self.library_dir = library_dir
        self.makefile_fn = makefile_fn
        pro.makedirs(self.index_dir)
Example #6
    def __init__(self, tree_newick_fn, k):
        tree = pro.load_nhx_tree(tree_newick_fn)
        self.tree = pro.minimal_subtree(tree)

        self.k = k

        self.nodename_to_node = {}
        self.nodename_to_kmercount = {}

        self.nodename_to_samannot = {}

        self.nodename_to_upnodenames = collections.defaultdict(lambda: set())

        for node in self.tree.traverse("postorder"):
            nodename = node.name
            self.nodename_to_node[nodename] = node
            self.nodename_to_kmercount[nodename] = int(node.kmers_full)

            # annotations
            tags_parts = []
            try:
                tags_parts.append("gi:Z:{}".format(node.gi))
            except AttributeError:
                pass

            try:
                tags_parts.append("sn:Z:{}".format(node.sci_name))
            except AttributeError:
                pass

            try:
                tags_parts.append("ra:Z:{}".format(node.rank))
            except AttributeError:
                pass

            self.nodename_to_samannot[nodename] = "\t".join(tags_parts)

            # set of upper nodes
            while node.up:
                node = node.up
                self.nodename_to_upnodenames[nodename].add(node.name)
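The postorder loop above also records, for every node, the names of all of its ancestors by repeatedly following node.up. The same pattern on a tiny self-contained ete3 tree (the node names here are made up for the example):

import collections
import ete3

# Two leaves under a named internal node, plus a third leaf; format=1 keeps internal names.
t = ete3.Tree("((A,B)AB,C);", format=1)
t.name = "root"

upnodes = collections.defaultdict(set)
for node in t.traverse("postorder"):
    name = node.name
    n = node
    while n.up:
        n = n.up
        upnodes[name].add(n.name)

print(sorted(upnodes["A"]))  # ancestors of leaf A: ['AB', 'root']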
Example #7
    def __init__(self, tree_newick_fn):
        self.tree_newick_fn = tree_newick_fn
        self.tree = pro.load_nhx_tree(tree_newick_fn)
Example #8
def prophyle_index(
    index_dir, threads, k, trees_fn, library_dir, construct_klcp, force, no_prefixes, mask_repeats, keep_tmp_files,
    sampling_rate, autocomplete
):
    """Build a ProPhyle index.

    Args:
        index_dir (str): Index directory.
        threads (int): Number of threads in k-mer propagation.
        k (int): K-mer size.
        trees_fn (list of str): Newick/NHX trees, possibly with a root spec (@root).
        library_dir (str): Library directory.
        construct_klcp (bool): Generate the k-LCP array.
        force (bool): Rewrite files if they already exist.
        no_prefixes (bool): Don't prepend prefixes to node names during tree merging.
        mask_repeats (bool): Mask repeats using DustMasker.
        keep_tmp_files (bool): Keep temporary files from k-mer propagation.
        sampling_rate (float): Sampling rate for subsampling the tree, or None for no subsampling.
        autocomplete (bool): Autocomplete names of internal nodes and fasta paths.
    """

    assert isinstance(k, int)
    assert isinstance(threads, int)
    assert k > 1
    assert threads > 0
    assert sampling_rate is None or 0.0 <= float(sampling_rate) <= 1.0

    _compile_prophyle_bin(parallel=True)

    index_fa = os.path.join(index_dir, 'index.fa')
    index_tree_1 = os.path.join(index_dir, 'tree.preliminary.nw')
    index_tree_2 = os.path.join(index_dir, 'tree.nw')

    # recompute = recompute everything from now on
    # force==True => start to recompute everything from beginning
    recompute = force

    # make index dir
    pro.makedirs(index_dir)

    #
    # 1) Newick
    #

    #if not _is_complete(index_dir, 1) or not pro.existing_and_newer_list(trees_fn, index_tree_1):
    if not _is_complete(index_dir, 1):
        recompute = True

    if recompute:
        pro.message('[1/6] Copying/merging trees', upper=True)
        for tree_fn in trees_fn:
            tree_fn, _, root = tree_fn.partition("@")
            tree = pro.load_nhx_tree(tree_fn, validate=False)
            # postpone for autocomplete
            if not autocomplete:
                pro.validate_prophyle_nhx_tree(tree)
            if root != "":
                assert len(tree.search_nodes(name=root)) != 0, "Node '{}' does not exist in '{}'.".format(root, tree_fn)
        if len(trees_fn) != 1:
            pro.message('Merging {} trees'.format(len(trees_fn)))
        _propagation_preprocessing(
            trees_fn, index_tree_1, no_prefixes=no_prefixes, sampling_rate=sampling_rate, autocomplete=autocomplete
        )
        _test_tree(index_tree_1)
        _mark_complete(index_dir, 1)
    else:
        pro.message('[1/6] Tree already exists, skipping its creation', upper=True)

    #
    # 2) Create and run Makefile for propagation, and merge FASTA files
    #

    if not _is_complete(index_dir, 2):
        recompute = True

    if recompute:
        pro.message('[2/6] Running k-mer propagation', upper=True)
        _create_makefile(index_dir, k, library_dir, mask_repeats=mask_repeats)
        _propagate(index_dir, threads=threads)
        _propagation_postprocessing(index_dir, index_tree_1, index_tree_2)
        _test_tree(index_tree_2)
        _kmer_stats(index_dir)
        if not keep_tmp_files:
            _remove_tmp_propagation_files(index_dir)
        else:
            pro.message('Keeping temporary files')
        _mark_complete(index_dir, 2)
    else:
        pro.message('[2/6] K-mers have already been propagated, skipping propagation', upper=True)

    #
    # 3) BWT
    #

    if not _is_complete(index_dir, 3) and not _is_complete(index_dir, 4, dont_check_previous=True):
        recompute = True

    if recompute:
        pro.message('[3/6] Constructing BWT', upper=True)
        pro.rm(index_fa + '.bwt', index_fa + '.bwt.complete')
        _fa2pac(index_fa)
        _pac2bwt(index_fa)
        _mark_complete(index_dir, 3)
    else:
        pro.message('[3/6] BWT already exists, skipping its construction', upper=True)

    #
    # 4) OCC
    #

    if not _is_complete(index_dir, 4):
        recompute = True

    if recompute:
        pro.message('[4/6] Constructing OCC', upper=True)
        _bwt2bwtocc(index_fa)
        _mark_complete(index_dir, 4)
    else:
        pro.message('[4/6] OCC already exists, skipping their construction', upper=True)

    #
    # 5) SA + 6) KLCP (compute SA + KLCP in parallel)
    #

    klcp_fn = "{}.{}.klcp".format(index_fa, k)

    if construct_klcp:

        if not _is_complete(index_dir, 5):
            # SA not computed yet => compute it in parallel with KLCP
            recompute = True

        if recompute:
            pro.message('[5/6],[6/6] Constructing SA + KLCP in parallel ', upper=True)
            _bwtocc2sa_klcp(index_fa, k)
            _mark_complete(index_dir, 5)
            _mark_complete(index_dir, 6)
            return

    #
    # 5) SA (compute only SA)
    #

    if not _is_complete(index_dir, 5):
        recompute = True

    if recompute:
        pro.message('[5/6] Constructing SA', upper=True)
        _bwtocc2sa(index_fa)
    else:
        pro.message('[5/6] SA already exists, skipping its construction', upper=True)

    #
    # 6) KLCP (compute only KLCP)
    #

    if construct_klcp:
        if not _is_complete(index_dir, 6):
            recompute = True

        if recompute:
            pro.message('[6/6] Constructing k-LCP', upper=True)
            _bwtocc2klcp(index_fa, k)
            _mark_complete(index_dir, 6)
        else:
            pro.message('[6/6] k-LCP already exists, skipping its construction', upper=True)
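_is_complete and _mark_complete are internal helpers that are not shown; from the way they gate each step above, they most likely implement checkpointing via marker files inside the index directory. A plausible sketch (the marker file naming is an assumption, not ProPhyle's actual layout):

import os

def _mark_complete(index_dir, step):
    # Hypothetical: drop an empty marker file for the finished step.
    open(os.path.join(index_dir, ".complete.{}".format(step)), "w").close()

def _is_complete(index_dir, step, dont_check_previous=False):
    # Hypothetical: a step counts as complete only if its marker exists and,
    # unless dont_check_previous is set, all earlier steps are complete too.
    first = step if dont_check_previous else 1
    return all(
        os.path.isfile(os.path.join(index_dir, ".complete.{}".format(s)))
        for s in range(first, step + 1)
    )

This kind of design lets a failed or interrupted build resume at the first incomplete step, while force=True simply ignores the markers and recomputes everything.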
Example #9
def main():

    parser = argparse.ArgumentParser(
        description='K-mer propagation postprocessing: merging FASTA files and k-mer annotation.'
    )

    parser.add_argument('dir',
                        metavar='<propagation.dir>',
                        type=str,
                        help='directory with FASTA files')

    parser.add_argument(
        'index_fasta_fn',
        type=str,
        metavar='<index.fa>',
        help='output FASTA file',
    )

    parser.add_argument(
        'in_tree_fn',
        type=str,
        metavar='<in.tree.nw>',
        help='input phylogenetic tree',
    )

    parser.add_argument(
        'counts_fn',
        type=str,
        metavar='<counts.tsv>',
        help='input k-mer counts (TSV)',
    )

    parser.add_argument(
        'out_tree_fn',
        type=str,
        metavar='<out.tree.nw>',
        help='output phylogenetic tree',
    )

    #parser.add_argument (
    #   '-D',
    #   dest='nondel',
    #   action='store_true',
    #   help='Non-deleting propagation',
    #)

    args = parser.parse_args()

    dir_fn = args.dir
    index_fasta_fn = args.index_fasta_fn
    in_tree_fn = args.in_tree_fn
    out_tree_fn = args.out_tree_fn
    tsv_fn = args.counts_fn

    suffix = "reduced.fa"

    #if args.nondel:
    #   suffix = "full.fa"
    #else:
    #   suffix = "reduced.fa"

    create_fasta(dir_fn, index_fasta_fn, suffix)

    tree = pro.load_nhx_tree(in_tree_fn)
    stats = load_kmer_stats(tsv_fn)
    enrich_tree(tree, stats)
    pro.save_nhx_tree(tree, out_tree_fn)
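load_kmer_stats and enrich_tree belong to the surrounding module and are not shown. A hedged sketch of what enrich_tree might do, assuming stats maps node names to (kmers_full, kmers_reduced) pairs and using ete3's add_feature, so that downstream code (e.g. plot_tree in Example #1) can read node.kmers_full:

def enrich_tree(tree, stats):
    # Hypothetical: attach the k-mer counts as node features; once written with
    # pro.save_nhx_tree, they travel along in the NHX annotation.
    for node in tree.traverse():
        if node.name in stats:
            kmers_full, kmers_reduced = stats[node.name]
            node.add_feature("kmers_full", kmers_full)
            node.add_feature("kmers_reduced", kmers_reduced)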
Example #10
def merge_trees(input_trees_fn, output_tree_fn, verbose, add_prefixes,
                sampling_rate, autocomplete):
    assert sampling_rate is None or 0.0 <= float(sampling_rate) <= 1.0

    t = ete3.Tree(name="merge_root")

    if len(input_trees_fn) == 1:
        if verbose:
            print("Only one tree, don't add any prefix", file=sys.stderr)
        add_prefixes = False

    for i, x in enumerate(input_trees_fn, 1):
        if verbose:
            print("Loading '{}'".format(x), file=sys.stderr)

        tree_fn, _, root_name = x.partition("@")
        tree_to_add = pro.load_nhx_tree(tree_fn, validate=False)

        # subtree extraction required
        if root_name != '':
            tree_to_add = tree_to_add & root_name

        # prepend prefixes to node names
        if add_prefixes:
            tree_to_add = add_prefix(tree_to_add, i)

        t.add_child(tree_to_add)

    if autocomplete:
        if not pro.has_attribute(t, "fastapath"):
            t = autocomplete_fastapath(t)
        t = autocomplete_internal_node_names(t)

    if sampling_rate is not None:
        sampling_rate = float(sampling_rate)

        leaves_1 = []

        for node in t.traverse("postorder"):
            if len(node.children) == 0:
                leaves_1.append(node)

        leaves_1.sort(key=lambda x: x.name)

        leaves_2 = random.sample(leaves_1,
                                 max(round(sampling_rate * len(leaves_1)), 1))
        leaves_2.sort(key=lambda x: x.name)

        leaves_to_remove = list(set(leaves_1) - set(leaves_2))
        leaves_to_remove.sort(key=lambda x: x.name)

        if verbose:
            print("Removing the following leaves: {}".format(", ".join(
                map(apply(lambda x: x.name, leaves_to_remove)))),
                  file=sys.stderr)

        for node in leaves_to_remove:
            while len(node.up.children) == 1:
                node = node.up
            node.detach()

        print(
            "Subsampling the tree with rate {:.4f}, {} leaves were kept (out of {})"
            .format(sampling_rate, len(leaves_2), len(leaves_1)),
            file=sys.stderr)

    if verbose:
        print("Writing to '{}'".format(output_tree_fn), file=sys.stderr)

    pro.save_nhx_tree(t, output_tree_fn)
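add_prefix and the autocomplete_* helpers are defined elsewhere in the module; from its use above, add_prefix presumably renames every node so that names stay unique after several trees are grafted under the common merge_root. A sketch under that assumption (the separator is made up):

def add_prefix(tree, i):
    # Hypothetical: prefix each node name with the index of its source tree,
    # e.g. "leafX" in the second input tree becomes "2-leafX".
    for node in tree.traverse():
        node.name = "{}-{}".format(i, node.name)
    return tree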