Beispiel #1
0
def treeshrink(tree_file, output_dir, output_ext, quantiles):
    """Run run_treeshrink.py on a tree and save a quote-free copy of the result.

    The command is executed with ``output_dir`` as the working directory. Both
    its ``--outdir`` and ``--tempdir`` point at a scratch directory named after
    the tree file, which is deleted once the result has been copied.

    Args:
        tree_file: Tree file passed to ``--tree``.
        output_dir: Directory the command runs in and the result is written to.
        output_ext: Extension prefix; ``EXT_OUT`` is appended to it.
        quantiles: Value passed to ``--quantiles``.

    Returns:
        The destination file name as produced by ``util.file_name`` (relative
        to ``output_dir`` if that helper returns a bare name - verify).

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
        IndexError: If no file in the scratch directory matches the mask.
    """
    scratch = util.file_name(tree_file)

    options = (
        'run_treeshrink.py',
        '--tree {}'.format(tree_file),
        '--centroid',
        '--mode per-gene',
        '--quantiles {}'.format(quantiles),
        '--outdir {}'.format(scratch),
        '--tempdir {}'.format(scratch),
    )
    command = ' '.join(options)

    with util.cd(output_dir):
        subprocess.check_call(command, shell=True)

        # Take the first file in the scratch directory that matches the mask.
        pattern = util.file_name(scratch + '_*', ext=EXT_IN, dir_=scratch)
        matches = glob(pattern)
        tree_src = matches[0]
        tree_dst = util.file_name(tree_file, output_ext + EXT_OUT)

        with open(tree_src) as in_file, open(tree_dst, 'w') as out_file:
            # Strip every single quote from the tree text before saving it.
            unquoted = in_file.read().replace("'", '')
            out_file.write(unquoted)

        rmtree(scratch)

    return tree_dst
Beispiel #2
0
def prune_mo(tree_file, output_dir, min_taxa, out_groups):
    """Prune a homolog tree to one ortholog using monophyletic out-groups.

    Only the first line of ``tree_file`` is parsed as a Newick tree.

    Args:
        tree_file: Path of the Newick tree file.
        output_dir: Directory in which every output file is created.
        min_taxa: Minimum number of distinct taxa a tree must contain.
        out_groups: Out-group taxa handed to the rerooting/pruning helpers.

    Returns:
        A list of the files written (possibly empty).
    """
    output_files = []

    # read in the tree and check number of taxa
    with open(tree_file) as infile:
        intree = newick3.parse(infile.readline())
    curroot = intree
    names = get_front_names(curroot)
    num_tips, num_taxa = len(names), len(set(names))
    if num_taxa < min_taxa:
        return output_files  # not enough taxa

    # If the homolog has no taxon duplication, no cutting is needed
    if num_tips == num_taxa:
        if OUTPUT_1TO1_ORTHOLOGS:
            output_file = util.file_name(tree_file, '_1to1ortho.tre',
                                         output_dir)
            copyfile(tree_file, output_file)
            output_files.append(output_file)
        return output_files

    # Taxon duplications are present: the out-groups decide what happens.
    outgroup_names = get_front_outgroup_names(curroot, out_groups)

    # if no out-group at all, do not resolve gene duplication
    if len(outgroup_names) == 0:
        print("duplicated taxa in unrooted tree")
        return output_files

    # skip the homolog if there are duplicated out-group taxa
    if len(outgroup_names) > len(set(outgroup_names)):
        print("outgroup contains taxon repeats")
        return output_files

    # at least one out-group present and there's no out-group duplication
    if curroot.nchildren == 2:  # need to reroot
        _, curroot = remove_kink(curroot, curroot)
    curroot = reroot_with_monophyletic_outgroups(curroot, out_groups)
    if curroot is None:
        print("out-group non-monophyletic")
        return output_files

    # Keep the rerooted tree next to the other outputs instead of in the
    # current working directory.
    output_file = util.file_name(tree_file, '.reroot', output_dir)
    output_files.append(output_file)
    with open(output_file, "w") as outfile:
        outfile.write(newick3.tostring(curroot) + ";\n")

    # only return one tree after pruning
    ortho = prune_paralogs_from_rerooted_homotree(curroot, out_groups)
    # NOTE(review): the taxon count is taken from curroot, not from ortho;
    # presumably the pruning helper edits the tree in place - confirm.
    if len(set(get_front_names(curroot))) >= min_taxa:
        # util.file_name already applies the extension; appending it again
        # produced "<name>.ortho.tre.ortho.tre".
        output_file = util.file_name(tree_file, '.ortho.tre', output_dir)
        output_files.append(output_file)
        with open(output_file, "w") as outfile:
            outfile.write(newick3.tostring(ortho) + ";\n")
    else:
        print("not enough taxa after pruning")

    return output_files
def prune_mi(tree_file, output_dir, min_taxa, relative_tip_cutoff,
             absolute_tip_cutoff):
    """Repeatedly prune the best-scoring subtree from a homolog tree.

    Only the first line of ``tree_file`` is parsed. When the root's front
    score already reaches ``min_taxa`` the tree is (optionally) copied as is.
    Otherwise the node with the highest front/back score is pruned until the
    score drops below ``min_taxa``, the helper reports completion, or too few
    leaves remain. Each pruned tree is tip-trimmed and written out if it still
    has at least ``min_taxa`` leaves.

    Args:
        tree_file: Path of the Newick tree file.
        output_dir: Directory in which output files are created.
        min_taxa: Score / leaf-count threshold.
        relative_tip_cutoff: Passed through to ``trim_tips.trim``.
        absolute_tip_cutoff: Passed through to ``trim_tips.trim``.

    Returns:
        A list of the files written (possibly empty).
    """
    written = []

    with open(tree_file) as handle:  # only 1 tree in each file
        root = newick3.parse(handle.readline())

    if get_front_score(root) >= min_taxa:  # No need to prune
        print("No pruning needed")
        # NOTE(review): prune_mo spells this flag OUTPUT_1TO1_ORTHOLOGS;
        # confirm which module-level name actually exists.
        if OUTPUT_1to1_ORTHOLOGS:
            target = util.file_name(tree_file, '_1to1ortho.tre', output_dir)
            copyfile(tree_file, target)
            written.append(target)
        return written

    pruned_trees = []

    while True:
        best_score = 0
        best_node = None
        scores = {}  # node -> (front_score, back_score)
        for node in root.iternodes():
            front = get_front_score(node)
            back = get_back_score(node, root)
            scores[node] = (front, back)
            if front > best_score or back > best_score:
                best_node = node
                best_score = max(front, back)

        if best_score < min_taxa:
            break  # nothing left that is worth pruning

        root, done = prune(scores[best_node], best_node, root, pruned_trees)
        if done or len(root.leaves()) < min_taxa:
            break

    serial = 1  # only advanced when a tree is actually written
    for tree in pruned_trees:
        if tree.nchildren == 2:
            _, tree = tree_utils.remove_kink(tree, tree)
        tree = trim_tips.trim(tree, relative_tip_cutoff, absolute_tip_cutoff)
        if tree is None or len(tree.leaves()) < min_taxa:
            continue
        target = util.file_name(
            tree_file, '_MIortho{}.tre'.format(serial), output_dir)
        written.append(target)
        with open(target, "w") as handle:
            handle.write(newick3.tostring(tree) + ";\n")
        serial += 1

    return written
def mafft(fasta_file, output_dir, output_ext, seq_type, cpus, anysymbol):
    """Align sequences with mafft and save its standard output.

    For amino-acid input the sequences are first passed through
    ``bio.adjust_aa_seqs`` and the adjusted file is aligned instead. Large
    inputs (record count or longest sequence at/above the ``bio`` cutoffs)
    use ``--auto``; everything else uses ``--genafpair`` with
    ``--maxiterate MAX_ITERATE``.

    Args:
        fasta_file: Input fasta file.
        output_dir: Directory the command runs in and the alignment is
            written to.
        output_ext: Extension used for the alignment file name.
        seq_type: ``'aa'`` selects ``--amino``; anything else ``--nuc``.
        cpus: Value passed to ``--thread``.
        anysymbol: When truthy, ``--anysymbol`` is added to the command.

    Returns:
        The alignment file name as produced by ``util.file_name``.

    Raises:
        subprocess.CalledProcessError: If mafft exits non-zero.
    """
    in_path = fasta_file
    if seq_type == 'aa':
        in_path = bio.adjust_aa_seqs(fasta_file, output_dir)

    cmd = [
        'mafft',
        '--amino' if seq_type == 'aa' else '--nuc',
        '--thread {}'.format(cpus),
    ]
    # Add the flag exactly once; it used to be repeated on the --genafpair
    # path and left empty tokens in the command when switched off.
    if anysymbol:
        cmd.append('--anysymbol')

    if (bio.fasta_record_count(in_path) >= bio.SEQ_COUNT_CUTOFF
            or bio.longest_fasta_seq(in_path) >= bio.SEQ_LEN_CUTOFF):
        cmd.append('--auto')
    else:
        cmd += ['--genafpair', '--maxiterate {}'.format(MAX_ITERATE)]

    cmd.append(in_path)
    cmd = ' '.join(cmd)

    aligned = util.file_name(fasta_file, output_ext)

    with util.cd(output_dir):
        # mafft prints the alignment on stdout, captured here as bytes.
        result = subprocess.check_output(cmd, shell=True)
        with open(aligned, 'wb') as out_file:
            out_file.write(result)

    return aligned
Beispiel #5
0
def raxml_ng_bs(fasta_file,
                output_dir,
                temp_dir,
                seq_type,
                cpus,
                seed,
                output_ext,
                replicates=100):
    """Build a bootstrapped tree with raxml-ng and move it to output_dir.

    The command runs with ``temp_dir`` as the working directory; afterwards
    ``RAxML_bipartitions.<name>`` is moved to ``output_dir``.

    Args:
        fasta_file: Alignment passed to ``-s``.
        output_dir: Directory the finished tree is moved into.
        temp_dir: Working directory for the run.
        seq_type: ``"aa"`` selects the ``Blosum62`` model, otherwise ``GTR``.
        cpus: Value passed to ``-T``.
        seed: Value passed to both ``-x`` and ``-p``.
        output_ext: Extension used to build the run/tree name.
        replicates: Value passed to ``-#``.

    Returns:
        Path of the moved tree (``join(output_dir, <name>)``).

    Raises:
        subprocess.CalledProcessError: If raxml-ng exits non-zero.
    """
    model = "Blosum62" if seq_type == "aa" else "GTR"
    # The run name must be a bare file name, as in raxml_ng: it is used for
    # "-n", as a suffix of the result file and joined onto output_dir below.
    # Building it with output_dir made all three uses point at a bogus path.
    tree = util.file_name(fasta_file, output_ext)
    # NOTE(review): these look like classic RAxML options; confirm that the
    # raxml-ng executable accepts them.
    cmd = ' '.join([
        'raxml-ng', '-T {}'.format(cpus), '-f a', '-x {}'.format(seed),
        '-p {}'.format(seed), '-m {}'.format(model),
        '-# {}'.format(replicates), '-s {}'.format(fasta_file),
        '-n {}'.format(tree)
    ])

    with util.cd(temp_dir):
        subprocess.check_call(cmd, shell=True)

        tree_src = join('RAxML_bipartitions.' + tree)
        tree_dst = join(output_dir, tree)
        move(tree_src, tree_dst)

    return tree_dst
Beispiel #6
0
def mask_tips(tree_file, output_dir, output_ext):
    """Read a Newick tree, mask its monophyletic tips and write it back out.

    Args:
        tree_file: Newick file to read.
        output_dir: Directory the masked tree is written in.
        output_ext: Extension used for the output file name.

    Returns:
        The output file name as produced by ``util.file_name``.
    """
    phylo_tree = Phylo.read(tree_file, 'newick')

    # The helper's return value is ignored, so it presumably edits the tree
    # in place - verify.
    mask_monophyletic_tips(phylo_tree)

    masked_name = util.file_name(tree_file, output_ext)
    with util.cd(output_dir):
        Phylo.write(phylo_tree, masked_name, 'newick')

    return masked_name
Beispiel #7
0
def tree_to_fasta(old_fasta, tree_file, output_dir, output_ext):
    """Write a fasta file holding the sequence of every tip in a tree.

    Args:
        old_fasta: Fasta file the sequences are looked up in.
        tree_file: Newick tree whose terminal names select the records.
        output_dir: Directory the new fasta file is written in.
        output_ext: Extension used for the new fasta file name.

    Returns:
        The fasta file name as produced by ``util.file_name``.

    Raises:
        KeyError: If a tip name has no record in ``old_fasta``.
    """
    tree = Phylo.read(tree_file, 'newick')
    fasta = bio.read_fasta(old_fasta)

    fasta_path = util.file_name(tree_file, output_ext)

    # Write inside output_dir like the sibling steps (mask_tips,
    # cut_branches); previously output_dir was ignored and the file landed
    # in the current working directory.
    with util.cd(output_dir):
        with open(fasta_path, 'w') as out_file:
            for node in tree.get_terminals():
                bio.write_fasta_record(out_file, node.name, fasta[node.name])

    return fasta_path
Beispiel #8
0
def cut_branches(tree_file, output_dir, output_ext, branch_cutoff, min_taxa):
    """Cut long internal branches and write each resulting subtree.

    Subtrees come from ``cut_deep`` and are written inside ``output_dir`` as
    ``<tree_file>_<index>`` with ``output_ext`` applied by ``util.file_name``.

    Args:
        tree_file: Newick tree to read.
        output_dir: Directory the subtrees are written in.
        output_ext: Extension used for every subtree file.
        branch_cutoff: Passed through to ``cut_deep``.
        min_taxa: Passed through to ``cut_deep``.

    Returns:
        The name of the LAST subtree file written, or ``None`` when
        ``cut_deep`` produced no subtrees (this used to raise
        ``UnboundLocalError``).
    """
    tree = Phylo.read(tree_file, 'newick')

    subtrees = cut_deep(tree, branch_cutoff, min_taxa)

    # Stays None when there is nothing to write.
    output = None

    with util.cd(output_dir):
        for i, subtree in enumerate(subtrees):
            output = '{}_{}'.format(tree_file, i)
            output = util.file_name(output, output_ext)
            Phylo.write(subtree, output, 'newick')

    # NOTE(review): only the last file name is returned although several
    # files may be written; kept for backward compatibility with callers.
    return output
Beispiel #9
0
def pxrr(tree_file, output_dir):
    """Run ``pxrr --unroot`` on a tree file inside output_dir.

    Args:
        tree_file: Tree passed to ``--treef``.
        output_dir: Working directory for the command; ``phyx.logfile`` is
            removed from it afterwards.

    Returns:
        The ``--outf`` name, i.e. ``util.file_name(tree_file)``.

    Raises:
        subprocess.CalledProcessError: If pxrr exits non-zero.
    """
    unrooted = util.file_name(tree_file)
    parts = (
        'pxrr',
        '--unroot',
        '--treef {}'.format(tree_file),
        '--outf {}'.format(unrooted),
    )
    command = ' '.join(parts)

    with util.cd(output_dir):
        subprocess.check_call(command, shell=True)
        # Clean up the log file the tool leaves behind.
        util.remove_files('phyx.logfile')

    return unrooted
Beispiel #10
0
def prune_1to1(tree_file, output_dir, min_taxa, min_bootstrap=0.0):
    """Copy a tree as a 1-to-1 ortholog when it has no duplicated taxa.

    Only the first line of ``tree_file`` is parsed. The tree is accepted when
    every tip belongs to a different taxon, at least ``min_taxa`` taxa are
    present and, if ``min_bootstrap`` is positive, ``pass_boot_filter``
    approves it.

    Args:
        tree_file: Path of the Newick tree file.
        output_dir: Directory the copy is created in.
        min_taxa: Minimum number of distinct taxa required.
        min_bootstrap: Bootstrap threshold; ``0.0`` disables the filter.

    Returns:
        A list with the copied file, or an empty list if the tree was
        rejected.
    """
    output_files = []
    with open(tree_file) as infile:
        intree = newick3.parse(infile.readline())
    names = get_front_names(intree)
    num_tips, num_taxa = len(names), len(set(names))
    print("number of tips:", num_tips, "number of taxa:", num_taxa)

    if num_tips != num_taxa or num_taxa < min_taxa:
        return output_files
    if min_bootstrap > 0.0 and not pass_boot_filter(intree, min_bootstrap):
        return output_files

    # Create the copy in output_dir, matching how prune_mo/prune_mi name
    # their "_1to1ortho.tre" files; output_dir used to be ignored here.
    output_file = util.file_name(tree_file, '_1to1ortho.tre', output_dir)
    copyfile(tree_file, output_file)
    output_files.append(output_file)
    return output_files
Beispiel #11
0
def fasttree(fasta_file, output_dir, output_ext, seq_type):
    """Run fasttree on an alignment and save its standard output as the tree.

    Args:
        fasta_file: Alignment given to fasttree as its last argument.
        output_dir: Directory the command runs in and the tree is written to.
        output_ext: Extension used for the tree file name.
        seq_type: ``'aa'`` selects ``-wag``; anything else ``-nt -gtr``.

    Returns:
        The tree file name as produced by ``util.file_name``.

    Raises:
        subprocess.CalledProcessError: If fasttree exits non-zero.
    """
    if seq_type == 'aa':
        model_flags = ['-wag']
    else:
        model_flags = ['-nt', '-gtr']
    command = ' '.join(['fasttree', '-quiet'] + model_flags + [fasta_file])

    tree_file = util.file_name(fasta_file, output_ext)

    with util.cd(output_dir):
        # The tree arrives on stdout as bytes, hence the binary write.
        newick_bytes = subprocess.check_output(command, shell=True)
        with open(tree_file, 'wb') as out_file:
            out_file.write(newick_bytes)

    return tree_file
Beispiel #12
0
def pxclsq(fasta_file, output_dir, output_ext, seq_type, min_occupancy,
           min_len):
    """Filter an alignment with pxclsq, then drop sequences that are too short.

    pxclsq writes an intermediate file (extension ``output_ext + EXT_PXCLSQ``)
    inside ``output_dir``. Records from it whose sequence, with ``-`` removed,
    has at least ``min_len`` characters are copied to the final file.

    Args:
        fasta_file: Alignment passed to ``--seqf``.
        output_dir: Working directory for the command and the outputs.
        output_ext: Extension used for the final file name.
        seq_type: ``'aa'`` adds ``--aminoacid`` to the command.
        min_occupancy: Value passed to ``--prop``.
        min_len: Minimum ungapped sequence length to keep a record.

    Returns:
        The final file name as produced by ``util.file_name``.

    Raises:
        subprocess.CalledProcessError: If pxclsq exits non-zero.
    """
    temp_ext = output_ext + EXT_PXCLSQ
    temp_cleaned = util.file_name(fasta_file, temp_ext)

    if seq_type == 'aa':
        type_flag = '--aminoacid'
    else:
        type_flag = ''
    command = ' '.join([
        'pxclsq',
        type_flag,
        '--prop {}'.format(min_occupancy),
        '--seqf {}'.format(fasta_file),
        '--outf {}'.format(basename(temp_cleaned)),
    ])

    cleaned = util.file_name(fasta_file, output_ext)

    with util.cd(output_dir):
        subprocess.check_call(command, shell=True)
        with open(temp_cleaned) as in_file, open(cleaned, 'w') as out_file:
            for header, seq in SimpleFastaParser(in_file):
                ungapped_len = len(seq) - seq.count('-')
                if ungapped_len < min_len:
                    continue
                bio.write_fasta_record(out_file, header, seq)

        util.remove_files('phyx.logfile')

    return cleaned
def raxml(fasta_file, output_dir, output_ext, seq_type, cpus, seed):
    """Run raxml on an alignment and keep only the best tree.

    The command runs inside ``output_dir``. ``RAxML_bestTree.<name>`` is then
    renamed to ``<name>`` and the remaining ``RAxML_*`` files are removed.

    Args:
        fasta_file: Alignment passed to ``-s``.
        output_dir: Working directory for the run.
        output_ext: Extension used to build the run/tree name.
        seq_type: ``"aa"`` selects ``PROTCATWAG``, otherwise ``GTRCAT``.
        cpus: Value passed to ``-T``.
        seed: Value passed to ``-p``.

    Returns:
        The tree file name as produced by ``util.file_name``.

    Raises:
        subprocess.CalledProcessError: If raxml exits non-zero.
    """
    if seq_type == "aa":
        model = "PROTCATWAG"
    else:
        model = "GTRCAT"
    tree = util.file_name(fasta_file, output_ext)

    options = (
        'raxml',
        '-T {}'.format(cpus),
        '-p {}'.format(seed),
        '-m {}'.format(model),
        '-s {}'.format(fasta_file),
        '-n {}'.format(tree),
    )
    command = ' '.join(options)

    with util.cd(output_dir):
        subprocess.check_call(command, shell=True)
        best_tree = 'RAxML_bestTree.' + tree
        move(best_tree, tree)
        # Everything else the run produced is disposable.
        util.remove_files('RAxML_*')

    return tree
Beispiel #14
0
def ortholog_to_fasta(old_fasta, tree_file, output_dir, min_taxa, output_ext):
    """Write a fasta file for a tree's tips if it covers enough taxa.

    The taxon of a tip is the part of its name before the first ``@``; tips
    without an ``@`` are not counted towards ``min_taxa`` but are still
    written.

    Args:
        old_fasta: Fasta file the sequences are looked up in.
        tree_file: Newick tree whose terminal names select the records.
        output_dir: Directory the new fasta file is written in.
        min_taxa: Minimum number of distinct taxa required.
        output_ext: Extension used for the new fasta file name.

    Returns:
        The fasta file name as produced by ``util.file_name``, or ``None``
        when the tree has fewer than ``min_taxa`` taxa (nothing is written).

    Raises:
        KeyError: If a tip name has no record in ``old_fasta``.
    """
    tree = Phylo.read(tree_file, 'newick')
    fasta = bio.read_fasta(old_fasta)

    fasta_path = util.file_name(tree_file, output_ext)

    terminals = tree.get_terminals()
    taxa = {n.name.split('@')[0] for n in terminals if '@' in n.name}
    if len(taxa) < min_taxa:
        return None

    # Write inside output_dir like the sibling steps (mask_tips,
    # cut_branches); previously output_dir was ignored and the file landed
    # in the current working directory.
    with util.cd(output_dir):
        with open(fasta_path, 'w') as out_file:
            for node in terminals:
                bio.write_fasta_record(out_file, node.name, fasta[node.name])

    return fasta_path
Beispiel #15
0
def raxml_ng(fasta_file, output_dir, temp_dir, seq_type, cpus, seed,
             output_ext):
    """Run raxml-ng in temp_dir and move the best tree into output_dir.

    Args:
        fasta_file: Alignment passed to ``-s``.
        output_dir: Directory the finished tree is moved into.
        temp_dir: Working directory for the run.
        seq_type: ``"aa"`` selects the ``Blosum62`` model, otherwise ``GTR``.
        cpus: Value passed to ``-T``.
        seed: Value passed to ``-p``.
        output_ext: Extension used to build the run/tree name.

    Returns:
        Path of the moved tree (``join(output_dir, <name>)``).

    Raises:
        subprocess.CalledProcessError: If raxml-ng exits non-zero.
    """
    if seq_type == "aa":
        model = "Blosum62"
    else:
        model = "GTR"
    tree = util.file_name(fasta_file, output_ext)

    options = (
        'raxml-ng',
        '-T {}'.format(cpus),
        '-p {}'.format(seed),
        '-m {}'.format(model),
        '-s {}'.format(fasta_file),
        '-n {}'.format(tree),
    )
    command = ' '.join(options)

    with util.cd(temp_dir):
        subprocess.check_call(command, shell=True)

        best_tree = join('RAxML_bestTree.' + tree)
        tree_dst = join(output_dir, tree)
        move(best_tree, tree_dst)

    return tree_dst
Beispiel #16
0
def prank(fasta_file, output_dir, temp_dir, seq_type):
    """Align sequences with prank inside temp_dir.

    For amino-acid input the sequences are first passed through
    ``bio.adjust_aa_seqs`` and the adjusted file is aligned instead. The
    command's standard output is saved under the alignment name.

    Args:
        fasta_file: Input fasta file.
        output_dir: Unused by this function.
        temp_dir: Working directory for the run and the saved output.
        seq_type: ``'aa'`` selects ``-protein``; anything else ``-DNA``.

    Returns:
        The alignment file name as produced by ``util.file_name``.

    Raises:
        subprocess.CalledProcessError: If prank exits non-zero.
    """
    in_path = fasta_file
    if seq_type == 'aa':
        in_path = bio.adjust_aa_seqs(fasta_file, temp_dir)

    aligned = util.file_name(fasta_file, 'ortho.aln')

    cmd = [
        'prank',
        '-d {}'.format(in_path),
        '-o {}'.format(aligned),
        '-protein' if seq_type == 'aa' else '-DNA',
    ]

    cmd = ' '.join(cmd)

    with util.cd(temp_dir):
        # cmd is one command-line string, so it has to go through the shell
        # like every other wrapper here; without shell=True subprocess looks
        # for an executable named after the whole string.
        result = subprocess.check_output(cmd, shell=True)
        # NOTE(review): stdout is saved under the same name given to "-o";
        # confirm which file is meant to hold the alignment.
        with open(aligned, 'wb') as out_file:
            out_file.write(result)

    return aligned
def raxml_bs(fasta_file,
             output_dir,
             output_ext,
             seq_type,
             cpus,
             seed,
             replicates=100):
    """Run a bootstrapped raxml analysis and keep the bipartitions tree.

    The command runs inside ``output_dir``. ``RAxML_bipartitions.<name>`` is
    then renamed to ``<name>`` and the remaining ``RAxML_*`` files are
    removed.

    Args:
        fasta_file: Alignment passed to ``-s``.
        output_dir: Working directory for the run.
        output_ext: Extension used to build the run/tree name.
        seq_type: ``"aa"`` selects ``PROTCATWAG``, otherwise ``GTRCAT``.
        cpus: Value passed to ``-T``.
        seed: Value passed to both ``-x`` and ``-p``.
        replicates: Value passed to ``-#``.

    Returns:
        The tree file name as produced by ``util.file_name``.

    Raises:
        subprocess.CalledProcessError: If raxml exits non-zero.
    """
    if seq_type == "aa":
        model = "PROTCATWAG"
    else:
        model = "GTRCAT"
    tree = util.file_name(fasta_file, output_ext)

    options = (
        'raxml',
        '-T {}'.format(cpus),
        '-f a',
        '-x {}'.format(seed),
        '-p {}'.format(seed),
        '-m {}'.format(model),
        '-# {}'.format(replicates),
        '-s {}'.format(fasta_file),
        '-n {}'.format(tree),
    )
    command = ' '.join(options)

    with util.cd(output_dir):
        subprocess.check_call(command, shell=True)
        bipartitions = 'RAxML_bipartitions.' + tree
        move(bipartitions, tree)
        # Everything else the run produced is disposable.
        util.remove_files('RAxML_*')

    return tree
def prune_rt(tree_file, output_dir, min_taxa, taxon_code_file):
    """Extract rooted in-group clades and their orthologs from a homolog tree.

    ``taxon_code_file`` holds tab-separated lines ``IN<TAB>taxon`` or
    ``OUT<TAB>taxon``; lines shorter than three characters are skipped. Only
    the first line of ``tree_file`` is parsed as a Newick tree.

    Args:
        tree_file: Path of the Newick tree file.
        output_dir: Directory in which every output file is created.
        min_taxa: Minimum number of in-group taxa / ortholog labels required.
        taxon_code_file: Path of the in-group/out-group table.

    Returns:
        A list of the files written (possibly empty).

    Raises:
        SystemExit: With status 1 when the taxon table is malformed, a taxon
            is listed as both in-group and out-group, or the tree contains a
            name that is in neither group.
    """
    output_files = []
    in_groups = []
    out_groups = []
    with open(taxon_code_file, "r") as infile:
        for line in infile:
            if len(line) < 3:
                continue
            spls = line.strip().split("\t")
            # A line without a tab has no taxon column; treat it as a format
            # error instead of failing with IndexError.
            if len(spls) >= 2 and spls[0] == "IN":
                in_groups.append(spls[1])
            elif len(spls) >= 2 and spls[0] == "OUT":
                out_groups.append(spls[1])
            else:
                print("Check taxon_code_file file format")
                sys.exit(1)

    # Sets give constant-time membership tests; the lists are still what
    # gets handed to the tree helpers.
    in_set, out_set = set(in_groups), set(out_groups)
    shared = in_set & out_set
    if shared:
        print("Taxon ID", shared, "in both ingroups and outgroups")
        sys.exit(1)  # an input error must not report success
    print(len(in_groups), "ingroup taxa and", len(out_groups),
          "outgroup taxa read")
    print("Ingroups:", in_groups)
    print("Outgroups:", out_groups)

    with open(tree_file) as infile:
        intree = newick3.parse(infile.readline())
    curroot = intree
    all_names = tree_utils.get_front_names(curroot)
    num_taxa = len(set(all_names))

    # check taxonIDs
    ingroup_names = []
    outgroup_names = []
    for name in all_names:
        if name in in_set:
            ingroup_names.append(name)
        elif name in out_set:
            outgroup_names.append(name)
        else:
            print(name, "not in ingroups or outgroups")
            sys.exit(1)
    if len(set(ingroup_names)) < min_taxa:
        print("not enough ingroup taxa in tree")
        return output_files

    if outgroup_names:  # >= one outgroup, root & cut inclades
        inclades = tree_utils.extract_rooted_ingroup_clades(
            curroot, in_groups, out_groups, min_taxa)
        for inclade_count, inclade in enumerate(inclades, 1):
            output_file = util.file_name(tree_file,
                                         '.inclade{}'.format(inclade_count),
                                         output_dir)
            output_files.append(output_file)
            with open(output_file, "w") as outfile:
                outfile.write(newick3.tostring(inclade) + ";\n")

            orthologs = tree_utils.get_ortho_from_rooted_inclade(inclade)
            ortho_count = 0
            for ortho in orthologs:
                if len(tree_utils.get_front_labels(ortho)) < min_taxa:
                    continue
                ortho_count += 1
                # ortho_count restarts for every inclade, so the inclade
                # number must be part of the name or later inclades would
                # overwrite the orthologs of earlier ones. The file also
                # belongs in output_dir with the rest.
                output_file = util.file_name(
                    tree_file,
                    '.inclade{}.ortho{}.tre'.format(inclade_count,
                                                    ortho_count),
                    output_dir)
                output_files.append(output_file)
                with open(output_file, "w") as outfile:
                    outfile.write(newick3.tostring(ortho) + ";\n")

    elif len(all_names) == num_taxa:
        # only output ortho tree when there is no taxon repeats
        output_file = util.file_name(tree_file, '.unrooted-ortho.tre',
                                     output_dir)
        output_files.append(output_file)
        with open(output_file, "w") as outfile:
            outfile.write(newick3.tostring(curroot) + ";\n")

    else:  # do not attempt to infer direction of gene duplication
        # without out-group info
        print("duplicated taxa in unrooted tree")

    return output_files