Example #1
0
def evaluate_directory(data_dir,
                       eval_limit=50000,
                       lim=5,
                       limited_expand=False):
    '''
    Processes a directory of FASTA files, generating and evaluating trees
    for each alignment via get_best_trees (branch and bound with early
    stopping and, optionally, limited expansion).
    Args:
        data_dir: str, the path to the data directory which stores fasta files
        eval_limit: int, max number of trees to evaluate for each file. Good
                    setting depends on seq length and time you're willing to wait.
        lim: int, number of files in the data directory to process. The first
             `lim` entries returned by os.listdir are used; that order is not
             guaranteed to be sorted.
        limited_expand: bool, whether or not to use limited expansion
    Returns:
        List[Tree], the first element of every result returned by
        get_best_trees, collected over all processed files (presumably the
        BioPython Phylo trees with tied best scores -- confirm against
        get_best_trees).
    '''
    scorer = TreeConstruction.ParsimonyScorer()
    all_best = []

    # Slice once so the progress total and the loop agree and are not recomputed.
    selected = os.listdir(data_dir)[:lim]
    total = len(selected)

    for i, file in enumerate(selected):
        print(f"Processing {file} ({i+1}/{total})")

        # Load the alignment; the context manager closes the file handle even
        # if parsing fails, instead of leaking one handle per file.
        with open(os.path.join(data_dir, file)) as handle:
            aln = AlignIO.read(handle, 'fasta')
        # Sort records by id so every file is processed in a consistent order.
        aln.sort(key=lambda a: a.id)

        result_trees = get_best_trees(aln, scorer, eval_limit, limited_expand)
        print(f"Found {len(result_trees)} trees.")
        all_best.extend(result_trees)

    # Each result is indexable; only its first element is returned to callers.
    return [tr[0] for tr in all_best]
Example #2
0
def find_good_tree(trees, data_dir, lim=25):
    '''
    Evaluates a list of trees against multiple alignments, returning the best
    scoring tree across all alignments. Requires tree terminal names and
    alignment names in data_dir are the same.

    Every tree is scored against every loaded alignment. Each score is divided
    by the highest score any tree obtained on that alignment, the normalized
    scores are summed per tree, and the tree with the LOWEST total is returned
    (the earliest tree in `trees` wins ties).
    Args:
        trees: List[Tree], a list of BioPython Phylo trees to evaluate
        data_dir: str, the path to the data directory
        lim: int, number of alignments to evaluate against (more is slower).
             The first `lim` entries returned by os.listdir are used.
    Returns:
        The selected element of `trees`, or None if scoring raised for any
        tree/alignment pair or if no alignment produced a positive score
        (including when `trees` or the directory is empty).
    '''
    scorer = TreeConstruction.ParsimonyScorer()

    alns = []
    for file in os.listdir(data_dir)[:lim]:
        # The context manager closes the handle; previously one leaked per file.
        with open(os.path.join(data_dir, file)) as handle:
            aln = AlignIO.read(handle, 'fasta')
        # Record names/ids are truncated to their first 5 characters, so tree
        # terminal names must match these shortened names.
        for rec in aln:
            rec.name = rec.id = rec.name[:5]
        alns.append(aln)

    scores = {}   # (tree index, alignment index) -> raw score
    max_aln = {}  # alignment index -> highest positive raw score seen

    # Score trees and track highest score for each alignment to normalize later
    print(f"Processing {len(trees)} trees...")
    for i, tree in enumerate(trees):
        print(f"\t{i+1}/{len(trees)}")
        for j, aln in enumerate(alns):
            try:
                # Score a deep copy so the caller's tree object is left untouched.
                sco = scorer.get_score(copy.deepcopy(tree), aln)
            except Exception as e:
                # Deliberate best-effort: report the problem and give up.
                print(e)
                print(
                    "Scoring failed. Did you ensure that terminal names and alignment names match?"
                )
                return None

            scores[(i, j)] = sco
            if sco > max_aln.get(j, 0):
                max_aln[j] = sco

    # Computes normalized scores for each tree. `scores` was filled in
    # (tree, alignment) order, so iterating it visits every pair in that order.
    fin_scores = {}
    for (i, j), sco in scores.items():
        m_aln = max_aln.get(j, 0)
        if m_aln <= 0:
            # No tree scored above zero here; the alignment cannot be normalized.
            continue
        normalized_score = sco / m_aln
        if not normalized_score:
            print(f"Error for tree {i} and alignment {j}.")
        fin_scores[i] = fin_scores.get(i, 0) + normalized_score

    # Finds final best tree
    if not fin_scores:
        print("No best tree found.")
        return None

    # Lowest total wins; min() keeps the first minimal key, so the earliest
    # tree is chosen on ties.
    best_tree = min(fin_scores, key=fin_scores.get)
    return trees[best_tree]