def evaluate_directory(data_dir, eval_limit=50000, lim=5, limited_expand=False):
    '''
    Processes a directory of FASTA files, generating and evaluating trees
    using branch and bound with early stopping and limited expansion (optional)

    Args:
        data_dir: str, the path to the data directory which stores fasta files
        eval_limit: int, max number of trees to evaluate for each file. Good
            setting depends on seq length and time you're willing to wait.
        lim: int, number of files in the data directory to process
        limited_expand: bool, whether or not to use limited expansion

    Returns:
        List[Tree], a list of BioPython Phylo trees with tied best scores,
        concatenated over all processed files. Only the first element of each
        entry returned by get_best_trees is kept (presumably the tree of a
        (tree, score) pair -- confirm against get_best_trees).
    '''
    scorer = TreeConstruction.ParsimonyScorer()
    all_best = []

    # NOTE: os.listdir returns entries in arbitrary order, so which `lim`
    # files get processed is platform/filesystem dependent.
    selected = os.listdir(data_dir)[:lim]
    total = len(selected)

    for i, file in enumerate(selected):
        print(f"Processing {file} ({i+1}/{total})")

        # Load the alignment; the context manager closes the handle as soon
        # as parsing is done instead of leaking one descriptor per file.
        with open(os.path.join(data_dir, file)) as handle:
            aln = AlignIO.read(handle, 'fasta')

        # Sort records by id so every file presents them in the same order.
        aln.sort(key=lambda a: a.id)

        result_trees = get_best_trees(aln, scorer, eval_limit, limited_expand)
        print(f"Found {len(result_trees)} trees.")
        all_best.extend(result_trees)

    return [tr[0] for tr in all_best]
def find_good_tree(trees, data_dir, lim=25):
    '''
    Evaluates a list of trees against multiple alignments, returning the best
    scoring tree across all alignments. Requires tree terminal names and
    alignment names in data_dir are the same.

    Each alignment record has its name and id replaced by the first five
    characters of its name before scoring, so the tree terminal names
    presumably have to match those truncated names.

    Args:
        trees: List[Tree], a list of BioPython Phylo trees to evaluate
        data_dir: str, the path to the data directory
        lim: int, number of alignments to evaluate against (more is slower)

    Returns:
        The tree from `trees` whose summed per-alignment normalized score is
        lowest (the earliest such tree on ties), or None if scoring raised
        for any tree/alignment pair or no alignment produced a positive score.
    '''
    scorer = TreeConstruction.ParsimonyScorer()

    # Load at most `lim` alignments. NOTE: os.listdir order is arbitrary.
    alns = []
    for file in os.listdir(data_dir)[:lim]:
        # Close each handle right after parsing instead of leaking it.
        with open(os.path.join(data_dir, file)) as handle:
            aln = AlignIO.read(handle, 'fasta')
        for rec in aln:
            rec.name = rec.id = rec.name[:5]
        alns.append(aln)

    scores = {}
    max_aln = {}

    # Score trees and track highest score for each alignment to normalize later
    print(f"Processing {len(trees)} trees...")
    for i, tree in enumerate(trees):
        print(f"\t{i+1}/{len(trees)}")
        for j, aln in enumerate(alns):
            try:
                # Score a deep copy so the caller's tree object is untouched
                # (presumably the scorer may modify the tree -- confirm).
                sco = scorer.get_score(copy.deepcopy(tree), aln)
            except Exception as e:
                # Best-effort contract: report the failure and give up.
                print(e)
                print(
                    "Scoring failed. Did you ensure that terminal names and alignment names match?"
                )
                return None
            scores[(i, j)] = sco
            if sco > max_aln.get(j, 0):
                max_aln[j] = sco

    # Computes normalized scores for each tree: each raw score is divided by
    # the highest score seen for that alignment, then summed per tree.
    fin_scores = {}
    for (i, j), sco in scores.items():
        m_aln = max_aln.get(j, 0)
        if m_aln <= 0:
            # No positive score for this alignment; it cannot be normalized.
            continue
        normalized_score = sco / m_aln
        if not normalized_score:
            print(f"Error for tree {i} and alignment {j}.")
        fin_scores[i] = fin_scores.get(i, 0) + normalized_score

    # Finds final best tree (lowest total; min keeps the first on ties)
    if not fin_scores:
        print("No best tree found.")
        return None
    return trees[min(fin_scores, key=fin_scores.get)]