def treeshrink(tree_file, output_dir, output_ext, quantiles):
    """Remove long branches from a tree with run_treeshrink.py.

    The tool is run from inside ``output_dir`` and writes into a scratch
    sub-directory named after the tree.  The first file matching the
    expected output pattern is copied to the destination with every single
    quote removed, and the scratch sub-directory is then deleted.

    Returns the destination name built by ``util.file_name`` (presumably
    relative to ``output_dir`` -- confirm against ``util``).

    Raises ``subprocess.CalledProcessError`` if the tool fails and
    ``IndexError`` if no output file matches the pattern.
    """
    work_dir = util.file_name(tree_file)
    args = [
        'run_treeshrink.py',
        '--tree {}'.format(tree_file),
        '--centroid',
        '--mode per-gene',
        '--quantiles {}'.format(quantiles),
        '--outdir {}'.format(work_dir),
        '--tempdir {}'.format(work_dir),
    ]
    command = ' '.join(args)

    with util.cd(output_dir):
        subprocess.check_call(command, shell=True)

        # Locate the shrunk tree inside the scratch directory.
        pattern = util.file_name(work_dir + '_*', ext=EXT_IN, dir_=work_dir)
        shrunk = glob(pattern)[0]
        result = util.file_name(tree_file, output_ext + EXT_OUT)

        with open(shrunk) as in_file, open(result, 'w') as out_file:
            # Drop the single quotes found in the tool's output.
            out_file.write(in_file.read().replace("'", ''))

        rmtree(work_dir)

    return result
def prune_mo(tree_file, output_dir, min_taxa, out_groups):
    """Prune paralogs from a homolog tree using monophyletic out-groups.

    Reads the first line of ``tree_file`` as a Newick tree and then:

    * returns nothing when the tree has fewer than ``min_taxa`` taxa;
    * copies the tree as a 1-to-1 ortholog when no taxon is duplicated
      (only if ``OUTPUT_1TO1_ORTHOLOGS`` is set);
    * otherwise reroots on the out-groups, writes the rerooted tree
      (``.reroot``) and, if enough taxa remain, the pruned ortholog
      (``.ortho.tre``).

    All files are written to ``output_dir``.

    Returns the list of files written (possibly empty).
    """
    output_files = []

    # Read in the tree and check the number of taxa
    with open(tree_file) as infile:
        intree = newick3.parse(infile.readline())
    curroot = intree
    names = get_front_names(curroot)
    num_tips, num_taxa = len(names), len(set(names))

    if num_taxa < min_taxa:
        return output_files  # not enough taxa

    # If the homolog has no taxon duplication, no cutting is needed
    if num_tips == num_taxa:
        # NOTE(review): prune_mi spells this flag OUTPUT_1to1_ORTHOLOGS --
        # confirm which module-level constant actually exists.
        if OUTPUT_1TO1_ORTHOLOGS:
            output_file = util.file_name(tree_file, '_1to1ortho.tre',
                                         output_dir)
            copyfile(tree_file, output_file)
            output_files.append(output_file)
        return output_files

    # Now we need to deal with taxon duplications
    outgroup_names = get_front_outgroup_names(curroot, out_groups)

    # If there is no out-group at all, do not resolve gene duplication
    if not outgroup_names:
        print("duplicated taxa in unrooted tree")
        return output_files

    # Skip the homolog if there are duplicated out-group taxa
    if len(outgroup_names) > len(set(outgroup_names)):
        print("outgroup contains taxon repeats")
        return output_files

    # At least one out-group is present and no out-group is duplicated
    if curroot.nchildren == 2:  # need to reroot
        _, curroot = remove_kink(curroot, curroot)
    curroot = reroot_with_monophyletic_outgroups(curroot, out_groups)

    if curroot is None:
        print("out-group non-monophyletic")
        return output_files

    # Only one tree is returned after pruning.  The rerooted tree goes to
    # output_dir like every other file this function writes.
    output_file = util.file_name(tree_file, '.reroot', output_dir)
    output_files.append(output_file)
    with open(output_file, "w") as outfile:
        outfile.write(newick3.tostring(curroot) + ";\n")

    ortho = prune_paralogs_from_rerooted_homotree(curroot, out_groups)
    if len(set(get_front_names(curroot))) >= min_taxa:
        # util.file_name already appends the extension; appending it a
        # second time produced "*.ortho.tre.ortho.tre".
        output_file = util.file_name(tree_file, '.ortho.tre', output_dir)
        output_files.append(output_file)
        with open(output_file, "w") as outfile:
            outfile.write(newick3.tostring(ortho) + ";\n")
    else:
        print("not enough taxa after pruning")

    return output_files
def prune_mi(tree_file, output_dir, min_taxa, relative_tip_cutoff,
             absolute_tip_cutoff):
    """Prune paralogs by repeatedly cutting out the highest-scoring subtree.

    Reads the first line of ``tree_file`` as a Newick tree.  If the tree's
    front score already reaches ``min_taxa`` no pruning is done, and the tree
    is copied as a 1-to-1 ortholog when ``OUTPUT_1to1_ORTHOLOGS`` is set.
    Otherwise subtrees are pruned until nothing scores ``min_taxa`` or more.
    Each pruned tree is tip-trimmed and written to ``output_dir`` as
    ``_MIortho<N>.tre`` when it still has at least ``min_taxa`` leaves.

    Returns the list of files written (possibly empty).
    """
    output_files = []

    # Bound up front so the write-out stage never depends on which branch
    # below was taken.
    pp_trees = []

    with open(tree_file) as infile:  # only 1 tree in each file
        intree = newick3.parse(infile.readline())
    curroot = intree

    if get_front_score(curroot) >= min_taxa:  # No need to prune
        print("No pruning needed")
        # NOTE(review): prune_mo spells this flag OUTPUT_1TO1_ORTHOLOGS --
        # confirm which module-level constant actually exists.
        if OUTPUT_1to1_ORTHOLOGS:
            output_file = util.file_name(tree_file, '_1to1ortho.tre',
                                         output_dir)
            copyfile(tree_file, output_file)
            output_files.append(output_file)
    else:  # scoring the tree
        while True:  # python version of do..while loop
            highest = 0
            highest_node = None
            score_hashes = {}  # key: node, value: (front_score, back_score)
            for node in curroot.iternodes():
                front_score = get_front_score(node)
                back_score = get_back_score(node, curroot)
                score_hashes[node] = (front_score, back_score)
                if front_score > highest or back_score > highest:
                    highest_node = node
                    highest = max(front_score, back_score)

            if highest < min_taxa:
                break

            # prune
            curroot, done = prune(score_hashes[highest_node], highest_node,
                                  curroot, pp_trees)
            if done or len(curroot.leaves()) < min_taxa:
                break

    count = 1
    for tree in pp_trees:
        if tree.nchildren == 2:
            _, tree = tree_utils.remove_kink(tree, tree)
        tree = trim_tips.trim(tree, relative_tip_cutoff, absolute_tip_cutoff)
        if tree is not None and len(tree.leaves()) >= min_taxa:
            output_file = util.file_name(
                tree_file, '_MIortho{}.tre'.format(count), output_dir)
            output_files.append(output_file)
            with open(output_file, "w") as outfile:
                outfile.write(newick3.tostring(tree) + ";\n")
            count += 1

    return output_files
def mafft(fasta_file, output_dir, output_ext, seq_type, cpus, anysymbol):
    """Align sequences with MAFFT.

    Amino-acid input (``seq_type == 'aa'``) is first passed through
    ``bio.adjust_aa_seqs``.  Inputs at or above the record-count or
    sequence-length cutoffs are aligned with ``--auto``; smaller ones use
    ``--genafpair`` with ``--maxiterate``.  MAFFT's standard output is saved
    as the alignment, from inside ``output_dir``.

    Returns the alignment file name built by ``util.file_name``.

    Raises ``subprocess.CalledProcessError`` if MAFFT exits non-zero.
    """
    in_path = fasta_file
    if seq_type == 'aa':
        in_path = bio.adjust_aa_seqs(fasta_file, output_dir)

    cmd = [
        'mafft',
        '--amino' if seq_type == 'aa' else '--nuc',
        '--thread {}'.format(cpus),
    ]
    # Added exactly once; it used to be repeated on the --genafpair path,
    # and an empty string was joined in when the flag was off.
    if anysymbol:
        cmd.append('--anysymbol')

    if (bio.fasta_record_count(in_path) >= bio.SEQ_COUNT_CUTOFF
            or bio.longest_fasta_seq(in_path) >= bio.SEQ_LEN_CUTOFF):
        cmd.append('--auto')
    else:
        cmd += ['--genafpair', '--maxiterate {}'.format(MAX_ITERATE)]

    cmd.append(in_path)
    cmd = ' '.join(cmd)

    aligned = util.file_name(fasta_file, output_ext)

    with util.cd(output_dir):
        result = subprocess.check_output(cmd, shell=True)
        with open(aligned, 'wb') as out_file:
            out_file.write(result)

    return aligned
def raxml_ng_bs(fasta_file, output_dir, temp_dir, seq_type, cpus, seed,
                output_ext, replicates=100):
    """Build a bootstrapped tree with raxml-ng.

    The program is run inside ``temp_dir`` and the resulting
    ``RAxML_bipartitions.<name>`` file is moved to ``output_dir``.

    Returns the path of the tree inside ``output_dir``.

    Raises ``subprocess.CalledProcessError`` if the program exits non-zero.
    """
    # NOTE(review): these are classic RAxML flags (-T, -f a, -x, -#, -n) and
    # a classic output name; raxml-ng appears to use long options such as
    # --msa/--model/--prefix -- confirm against the installed binary.
    model = "Blosum62" if seq_type == "aa" else "GTR"

    # The run name must be a bare name, as in raxml_ng.  Building it with
    # output_dir presumably put a directory into "-n", into the
    # "RAxML_bipartitions." source name and into the destination join.
    tree = util.file_name(fasta_file, output_ext)

    cmd = ' '.join([
        'raxml-ng',
        '-T {}'.format(cpus),
        '-f a',
        '-x {}'.format(seed),
        '-p {}'.format(seed),
        '-m {}'.format(model),
        '-# {}'.format(replicates),
        '-s {}'.format(fasta_file),
        '-n {}'.format(tree),
    ])

    with util.cd(temp_dir):
        subprocess.check_call(cmd, shell=True)

        tree_src = 'RAxML_bipartitions.' + tree
        tree_dst = join(output_dir, tree)
        move(tree_src, tree_dst)

    return tree_dst
def mask_tips(tree_file, output_dir, output_ext):
    """Mask monophyletic tips in a Newick tree and save the result.

    The tree is read from ``tree_file``, modified in place by
    ``mask_monophyletic_tips`` and written from inside ``output_dir``.

    Returns the output file name built by ``util.file_name``.
    """
    phylo_tree = Phylo.read(tree_file, 'newick')
    mask_monophyletic_tips(phylo_tree)

    masked_name = util.file_name(tree_file, output_ext)
    with util.cd(output_dir):
        Phylo.write(phylo_tree, masked_name, 'newick')

    return masked_name
def tree_to_fasta(old_fasta, tree_file, output_dir, output_ext):
    """Write a fasta file holding the sequence of every tip in a tree.

    Sequences are looked up by tip name in ``old_fasta``; a tip missing from
    that file raises ``KeyError`` (assuming ``bio.read_fasta`` returns a
    dict -- confirm).

    Returns the fasta file name built by ``util.file_name``.
    """
    # NOTE(review): output_dir is accepted but never used here; the file is
    # opened at whatever path util.file_name returns.
    phylo_tree = Phylo.read(tree_file, 'newick')
    sequences = bio.read_fasta(old_fasta)

    fasta_path = util.file_name(tree_file, output_ext)

    with open(fasta_path, 'w') as out_file:
        for tip in phylo_tree.get_terminals():
            tip_name = tip.name
            bio.write_fasta_record(out_file, tip_name, sequences[tip_name])

    return fasta_path
def cut_branches(tree_file, output_dir, output_ext, branch_cutoff, min_taxa):
    """Cut long internal branches, writing one Newick file per subtree.

    The tree is split by ``cut_deep`` and every subtree is written from
    inside ``output_dir`` under the name ``<tree_file>_<index>`` run through
    ``util.file_name`` with ``output_ext``.

    Returns the name of the last subtree file written, or ``None`` when
    ``cut_deep`` produced no subtrees.
    """
    tree = Phylo.read(tree_file, 'newick')
    subtrees = cut_deep(tree, branch_cutoff, min_taxa)

    # Bound before the loop so an empty result returns None instead of
    # raising UnboundLocalError at the return statement.
    output = None

    with util.cd(output_dir):
        for i, subtree in enumerate(subtrees):
            output = util.file_name('{}_{}'.format(tree_file, i), output_ext)
            Phylo.write(subtree, output, 'newick')

    return output
def pxrr(tree_file, output_dir):
    """Unroot the tree returned by treeshrink using ``pxrr --unroot``.

    The command is run inside ``output_dir`` and ``phyx.logfile`` is removed
    afterwards.

    Returns the unrooted tree's file name built by ``util.file_name``.

    Raises ``subprocess.CalledProcessError`` if ``pxrr`` exits non-zero.
    """
    unrooted = util.file_name(tree_file)

    args = [
        'pxrr',
        '--unroot',
        '--treef {}'.format(tree_file),
        '--outf {}'.format(unrooted),
    ]
    command = ' '.join(args)

    with util.cd(output_dir):
        subprocess.check_call(command, shell=True)
        util.remove_files('phyx.logfile')

    return unrooted
def prune_1to1(tree_file, output_dir, min_taxa, min_bootstrap=0.0):
    """Keep a homolog tree only if it is a strict 1-to-1 ortholog.

    The tree qualifies when it has no duplicated taxa, has at least
    ``min_taxa`` taxa and, when ``min_bootstrap`` is positive, passes
    ``pass_boot_filter``.  A qualifying tree is copied to ``output_dir`` as
    ``_1to1ortho.tre``.

    Returns a list holding the copied file, or an empty list.
    """
    output_files = []

    with open(tree_file) as infile:
        intree = newick3.parse(infile.readline())

    names = get_front_names(intree)
    num_tips, num_taxa = len(names), len(set(names))
    print("number of tips:", num_tips, "number of taxa:", num_taxa)

    if num_tips != num_taxa or num_taxa < min_taxa:
        return output_files

    if min_bootstrap > 0.0 and not pass_boot_filter(intree, min_bootstrap):
        return output_files

    # Placed in output_dir, matching how prune_mo and prune_mi build the
    # same "_1to1ortho.tre" name; output_dir was previously ignored.
    output_file = util.file_name(tree_file, '_1to1ortho.tre', output_dir)
    copyfile(tree_file, output_file)
    output_files.append(output_file)

    return output_files
def fasttree(fasta_file, output_dir, output_ext, seq_type):
    """Build a tree with fasttree and save its standard output.

    Amino-acid input uses ``-wag``; anything else uses ``-nt -gtr``.  The
    command is run inside ``output_dir``.

    Returns the tree file name built by ``util.file_name``.

    Raises ``subprocess.CalledProcessError`` if fasttree exits non-zero.
    """
    if seq_type == 'aa':
        model_args = ['-wag']
    else:
        model_args = ['-nt', '-gtr']

    command = ' '.join(['fasttree', '-quiet'] + model_args + [fasta_file])

    tree_file = util.file_name(fasta_file, output_ext)

    with util.cd(output_dir):
        newick = subprocess.check_output(command, shell=True)
        with open(tree_file, 'wb') as out_file:
            out_file.write(newick)

    return tree_file
def pxclsq(fasta_file, output_dir, output_ext, seq_type, min_occupancy,
           min_len):
    """Filter aligned sequences for occupancy and length.

    ``pxclsq`` is run inside ``output_dir`` with ``--prop min_occupancy``
    and writes an intermediate file.  That file is then re-read, and only
    records with at least ``min_len`` non-gap (non ``-``) characters are
    written to the final file.  ``phyx.logfile`` is removed afterwards.

    Returns the cleaned file name built by ``util.file_name``.

    Raises ``subprocess.CalledProcessError`` if ``pxclsq`` exits non-zero.
    """
    temp_cleaned = util.file_name(fasta_file, output_ext + EXT_PXCLSQ)
    cleaned = util.file_name(fasta_file, output_ext)

    args = [
        'pxclsq',
        '--aminoacid' if seq_type == 'aa' else '',
        '--prop {}'.format(min_occupancy),
        '--seqf {}'.format(fasta_file),
        '--outf {}'.format(basename(temp_cleaned)),
    ]
    command = ' '.join(args)

    with util.cd(output_dir):
        subprocess.check_call(command, shell=True)

        with open(temp_cleaned) as in_file, open(cleaned, 'w') as out_file:
            for header, seq in SimpleFastaParser(in_file):
                ungapped_len = len(seq.replace('-', ''))
                if ungapped_len < min_len:
                    continue
                bio.write_fasta_record(out_file, header, seq)

        util.remove_files('phyx.logfile')

    return cleaned
def raxml(fasta_file, output_dir, output_ext, seq_type, cpus, seed):
    """Build a tree with raxml.

    The program is run inside ``output_dir``.  Its ``RAxML_bestTree.<name>``
    file is renamed to the final tree name and the remaining ``RAxML_*``
    files are removed.

    Returns the tree file name built by ``util.file_name``.

    Raises ``subprocess.CalledProcessError`` if raxml exits non-zero.
    """
    if seq_type == "aa":
        model = "PROTCATWAG"
    else:
        model = "GTRCAT"

    tree = util.file_name(fasta_file, output_ext)

    args = [
        'raxml',
        '-T {}'.format(cpus),
        '-p {}'.format(seed),
        '-m {}'.format(model),
        '-s {}'.format(fasta_file),
        '-n {}'.format(tree),
    ]
    command = ' '.join(args)

    with util.cd(output_dir):
        subprocess.check_call(command, shell=True)

        # Keep the best tree before the other RAxML_* files are deleted.
        move('RAxML_bestTree.' + tree, tree)
        util.remove_files('RAxML_*')

    return tree
def ortholog_to_fasta(old_fasta, tree_file, output_dir, min_taxa, output_ext):
    """Write a fasta file for a tree's tips if it covers enough taxa.

    A tip's taxon is the part of its name before the first ``@``; tips
    without an ``@`` are not counted.  When fewer than ``min_taxa`` distinct
    taxa are present nothing is written.

    Returns the fasta file name built by ``util.file_name``, or ``None``
    when there are too few taxa.
    """
    # NOTE(review): output_dir is accepted but never used here; the file is
    # opened at whatever path util.file_name returns.
    phylo_tree = Phylo.read(tree_file, 'newick')
    sequences = bio.read_fasta(old_fasta)

    fasta_path = util.file_name(tree_file, output_ext)

    taxa = {tip.name.split('@')[0]
            for tip in phylo_tree.get_terminals()
            if '@' in tip.name}

    if len(taxa) < min_taxa:
        return None

    with open(fasta_path, 'w') as out_file:
        for tip in phylo_tree.get_terminals():
            tip_name = tip.name
            bio.write_fasta_record(out_file, tip_name, sequences[tip_name])

    return fasta_path
def raxml_ng(fasta_file, output_dir, temp_dir, seq_type, cpus, seed,
             output_ext):
    """Build a tree with raxml-ng.

    The program is run inside ``temp_dir`` and the resulting
    ``RAxML_bestTree.<name>`` file is moved to ``output_dir``.

    Returns the path of the tree inside ``output_dir``.

    Raises ``subprocess.CalledProcessError`` if the program exits non-zero.
    """
    # NOTE(review): these are classic RAxML flags (-T, -p, -s, -n) and a
    # classic output name; raxml-ng appears to use long options such as
    # --msa/--model/--prefix -- confirm against the installed binary.
    if seq_type == "aa":
        model = "Blosum62"
    else:
        model = "GTR"

    tree = util.file_name(fasta_file, output_ext)

    args = [
        'raxml-ng',
        '-T {}'.format(cpus),
        '-p {}'.format(seed),
        '-m {}'.format(model),
        '-s {}'.format(fasta_file),
        '-n {}'.format(tree),
    ]
    command = ' '.join(args)

    with util.cd(temp_dir):
        subprocess.check_call(command, shell=True)

        best_tree = join('RAxML_bestTree.' + tree)
        tree_dst = join(output_dir, tree)
        move(best_tree, tree_dst)

    return tree_dst
def prank(fasta_file, output_dir, temp_dir, seq_type):
    """Align sequences with PRANK.

    Amino-acid input (``seq_type == 'aa'``) is first passed through
    ``bio.adjust_aa_seqs``.  The command is run inside ``temp_dir`` and its
    standard output is saved under the alignment name.

    Returns the alignment file name built by ``util.file_name``.

    Raises ``subprocess.CalledProcessError`` if PRANK exits non-zero.
    """
    # NOTE(review): output_dir is accepted but never used here.
    in_path = fasta_file
    if seq_type == 'aa':
        in_path = bio.adjust_aa_seqs(fasta_file, temp_dir)

    aligned = util.file_name(fasta_file, 'ortho.aln')

    # NOTE(review): PRANK's documentation shows "-d=<file>" / "-o=<prefix>"
    # and writes results to "<prefix>.best.fas" rather than stdout --
    # confirm the option syntax and which file should be returned.
    cmd = ' '.join([
        'prank',
        '-d {}'.format(in_path),
        '-o {}'.format(aligned),
        '-protein' if seq_type == 'aa' else '-DNA',
    ])

    with util.cd(temp_dir):
        # The command is a single string, so it needs shell=True like every
        # other wrapper here; without it the whole string is taken as the
        # program name and the call fails with FileNotFoundError.
        result = subprocess.check_output(cmd, shell=True)
        with open(aligned, 'wb') as out_file:
            out_file.write(result)

    return aligned
def raxml_bs(fasta_file, output_dir, output_ext, seq_type, cpus, seed,
             replicates=100):
    """Build a bootstrapped tree with raxml.

    The program is run inside ``output_dir`` with ``-f a`` and ``replicates``
    bootstrap runs.  Its ``RAxML_bipartitions.<name>`` file is renamed to
    the final tree name and the remaining ``RAxML_*`` files are removed.

    Returns the tree file name built by ``util.file_name``.

    Raises ``subprocess.CalledProcessError`` if raxml exits non-zero.
    """
    if seq_type == "aa":
        model = "PROTCATWAG"
    else:
        model = "GTRCAT"

    tree = util.file_name(fasta_file, output_ext)

    args = [
        'raxml',
        '-T {}'.format(cpus),
        '-f a',
        '-x {}'.format(seed),
        '-p {}'.format(seed),
        '-m {}'.format(model),
        '-# {}'.format(replicates),
        '-s {}'.format(fasta_file),
        '-n {}'.format(tree),
    ]
    command = ' '.join(args)

    with util.cd(output_dir):
        subprocess.check_call(command, shell=True)

        # Keep the annotated tree before the other RAxML_* files are deleted.
        move('RAxML_bipartitions.' + tree, tree)
        util.remove_files('RAxML_*')

    return tree
def prune_rt(tree_file, output_dir, min_taxa, taxon_code_file):
    """Extract rooted in-group clades and their orthologs from a tree.

    ``taxon_code_file`` is tab separated with ``IN`` or ``OUT`` in the first
    column and a taxon ID in the second; lines shorter than 3 characters are
    skipped.  The first line of ``tree_file`` is parsed as a Newick tree.

    * With at least one out-group tip, every rooted in-group clade is
      written as ``.inclade<N>`` and each of its orthologs with at least
      ``min_taxa`` labels as ``.inclade<N>.ortho<M>.tre``.
    * With no out-group tip and no repeated taxon, the whole tree is written
      as ``.unrooted-ortho.tre``.

    All files are written to ``output_dir``.

    Returns the list of files written (possibly empty).  Calls ``sys.exit``
    on a malformed taxon file, on a taxon listed as both in- and out-group,
    or on a tip that is in neither group.
    """
    output_files = []

    in_groups = []
    out_groups = []
    with open(taxon_code_file, "r") as infile:
        for line in infile:
            if len(line) < 3:
                continue
            spls = line.strip().split("\t")
            if spls[0] == "IN":
                in_groups.append(spls[1])
            elif spls[0] == "OUT":
                out_groups.append(spls[1])
            else:
                print("Check taxon_code_file file format")
                sys.exit()

    # Sets give constant-time membership tests for the per-tip checks below.
    in_set, out_set = set(in_groups), set(out_groups)

    overlap = in_set & out_set
    if overlap:
        print("Taxon ID", overlap, "in both ingroups and outgroups")
        sys.exit(0)

    print(len(in_groups), "ingroup taxa and", len(out_groups),
          "outgroup taxa read")
    print("Ingroups:", in_groups)
    print("Outgroups:", out_groups)

    with open(tree_file) as infile:
        intree = newick3.parse(infile.readline())
    curroot = intree
    all_names = tree_utils.get_front_names(curroot)
    num_taxa = len(set(all_names))

    # Check taxon IDs
    ingroup_names = []
    outgroup_names = []
    for name in all_names:
        if name in in_set:
            ingroup_names.append(name)
        elif name in out_set:
            outgroup_names.append(name)
        else:
            print(name, "not in ingroups or outgroups")
            sys.exit()

    if len(set(ingroup_names)) < min_taxa:
        print("not enough ingroup taxa in tree")
        return output_files

    if outgroup_names:  # >= one outgroup, root & cut inclades
        inclades = tree_utils.extract_rooted_ingroup_clades(
            curroot, in_groups, out_groups, min_taxa)
        for inclade_count, inclade in enumerate(inclades, 1):
            output_file = util.file_name(
                tree_file, '.inclade{}'.format(inclade_count), output_dir)
            output_files.append(output_file)
            with open(output_file, "w") as outfile:
                outfile.write(newick3.tostring(inclade) + ";\n")

            orthologs = tree_utils.get_ortho_from_rooted_inclade(inclade)
            ortho_count = 0
            for ortho in orthologs:
                if len(tree_utils.get_front_labels(ortho)) < min_taxa:
                    continue
                ortho_count += 1
                # ortho_count restarts for every inclade, so the inclade
                # index is part of the name; otherwise orthologs from
                # different inclades overwrite each other.
                output_file = util.file_name(
                    tree_file,
                    '.inclade{}.ortho{}.tre'.format(inclade_count,
                                                    ortho_count),
                    output_dir)
                output_files.append(output_file)
                with open(output_file, "w") as outfile:
                    outfile.write(newick3.tostring(ortho) + ";\n")

    elif len(all_names) == num_taxa:
        # Only output the ortho tree when there are no taxon repeats
        output_file = util.file_name(tree_file, '.unrooted-ortho.tre',
                                     output_dir)
        output_files.append(output_file)
        with open(output_file, "w") as outfile:
            outfile.write(newick3.tostring(curroot) + ";\n")

    else:
        # Do not attempt to infer the direction of gene duplication
        # without out-group info
        print("duplicated taxa in unrooted tree")

    return output_files