Example #1
    def _score(self, alignment):
        if self.use_seq_weights:
            seq_weights = alignment.get_seq_weights()
        else:
            seq_weights = [1.0] * len(alignment.msa)

        if self.bg_distribution is None:
            # Estimate bg distribution from this alignment
            q = weighted_freq_count_pseudocount((aa for seq in alignment.msa for aa in seq), seq_weights, PSEUDOCOUNT)
        else:
            q = self.bg_distribution

        scores = []
        for i in xrange(len(alignment.msa[0])):
            col = get_column(i, alignment.msa)
            n_gaps = col.count("-")
            assert n_gaps < len(col)
            if self.gap_cutoff != 1 and n_gaps / len(col) > self.gap_cutoff:
                score = self.SCORE_OVER_GAP_CUTOFF
            else:
                score = self._score_col(col, seq_weights, q)
                if self.use_gap_penalty:
                    # vn_entropy has this commented out for some reason
                    score *= weighted_gap_penalty(col, seq_weights)
            scores.append(score)
        return scores
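
For context, `weighted_gap_penalty` is referenced above but not shown on this page. Below is a minimal sketch of one plausible implementation, assuming it returns the weighted fraction of non-gap residues in the column (so heavily gapped columns are scaled toward 0); treat the body as an assumption, not the project's actual code:

def weighted_gap_penalty(col, seq_weights):
    # Assumed behavior: 1 minus the weighted fraction of gaps in the column.
    if len(seq_weights) != len(col):
        seq_weights = [1.0] * len(col)
    gap_weight = sum(w for aa, w in zip(col, seq_weights) if aa == "-")
    return 1.0 - gap_weight / sum(seq_weights)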
Example #2
    def _score(self, alignment):
        if self.use_seq_weights:
            seq_weights = alignment.get_seq_weights()
        else:
            seq_weights = [1.] * len(alignment.msa)

        # Estimate bg distribution from this alignment
        if hasattr(self, 'bg_distribution'):
            if not self.bg_distribution:
                q = dict((aa, 0) for aa in amino_acids)
                for seq in alignment.msa:
                    for aa in seq:
                        if aa in q:  # skip gaps and non-standard characters
                            q[aa] += 1
                self.bg_distribution = q

        scores = []
        for i in xrange(len(alignment.msa[0])):
            col = get_column(i, alignment.msa)
            n_gaps = col.count('-')
            assert n_gaps < len(col)
            if self.gap_cutoff != 1 and n_gaps/len(col) > self.gap_cutoff:
                score = self.SCORE_OVER_GAP_CUTOFF
            else:
                score = self._score_col(col, seq_weights)
                if self.use_gap_penalty:
                    # vn_entropy has this commented out for some reason
                    score *= weighted_gap_penalty(col, seq_weights)
            scores.append(score)
        return scores
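
Both examples index into the alignment with `get_column`. Here is a minimal sketch of what it presumably does, assuming `alignment.msa` is a list of equal-length aligned sequences; the body is an assumption:

def get_column(i, msa):
    # Assumed behavior: collect the i-th character of every aligned sequence.
    return [seq[i] for seq in msa]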
Example #3
def write_scores(alignment, scores_cols, scorer_names, f=sys.stdout, header=None):
    """
    Write scores to `f` in human-readable format.  Columns written are:
        (position index, aa's in column, score by scorer 1, score by scorer 2, ...)
    Note that score.py has been changed to only support scoring using 1 scorer, so only
    3 columns are printed.
    """
    # sanity check
    if len(scores_cols) != len(scorer_names):
        raise ValueError("Mismatch between inputs 'scores_cols' and 'scorer_names'")

    f.write("# Alignment: %s\n" % alignment.align_file)
    f.write("# Num sites: %d\n" % len(alignment.msa[0]))
    f.write("# Num sequences: %d\n" % len(alignment.names))
    if alignment.filtered:
        f.write("# Num sequences before filtering: %d\n" % alignment.orig_num_sequences)
    f.write("\n")

    if header:
        # Check if header lines start with '#'?
        f.write(header)
        f.write("\n")

    # print scores
    f.write("# i\tcolumn\t%s\n" % "\t".join(scorer_names))
    score_tups = zip(*scores_cols)
    for i, score_tup in enumerate(score_tups):
        site = "".join(get_column(i, alignment.msa))
        f.write("%d\t%s\t%s\n" % (i+1, site, "\t".join(map(write_score_helper, score_tup))))
Example #4
    def _estimate_r(self, alignment, names_map, prior_distr):
        tree = alignment.get_phylotree()

        # Pre-compute the probabilities for every branch and rate.
        # This can be done because the discrete gamma distribution tells us
        # which rates P(rt) will be computed for when scoring columns.
        P_cached = precompute_tree_probs(tree, prior_distr.get_rates(), self.sub_model)

        # E-step
        rates = []
        rates_for_est = []
        log_marginal = 0
        for i in xrange(len(alignment.msa[0])):
            col = get_column(i, alignment.msa)
            n_gaps = col.count('-')
            assert n_gaps < len(col)
            if n_gaps == len(col) - 1:
                # Only one ungapped residue, so fall back to the mean rate.
                rate = 1
            else:
                rate, marginal = self._estimate_r_col(col, names_map, prior_distr, P_cached, tree)
                rates_for_est.append(rate)
                log_marginal += np.log(marginal)
            rates.append(rate)

        return rates, rates_for_est, log_marginal
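
A hypothetical outer loop showing how this E-step output might be consumed; the `refit` M-step helper, the convergence tolerance, and the `scorer` object are assumptions, not part of the source:

prev_log_marginal = float("-inf")
while True:
    rates, rates_for_est, log_marginal = scorer._estimate_r(alignment, names_map, prior_distr)
    if log_marginal - prev_log_marginal < 1e-4:
        # Stop once the per-column marginal likelihood stops improving.
        break
    prev_log_marginal = log_marginal
    prior_distr = prior_distr.refit(rates_for_est)   # hypothetical M-step: update the rate prior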
Example #5
def get_batchscores(dataset_name, batchscore_ids=[], align_files_only=False):
    """
    Useful for evaluators.
    Get an iterator on (alignment, scores_col) where scores_col consists
    of lists of scores for each id in `batchscore_ids`.  This iterator is over
    all alignments in `dataset_name`.
    """
    # Sanity check.
    ds_dir = get_batchscore_dir(dataset_name)
    if not os.path.exists(ds_dir):
        raise IOError("%s for dataset %r does not exist"
                % (ds_dir, dataset_name))
    for batchscore_id in batchscore_ids:
        sc_dir = os.path.join(ds_dir, batchscore_id)
        if not os.path.exists(sc_dir):
            raise IOError("%s for dataset %r, scorer %r does not exist"
                    % (sc_dir, dataset_name, batchscore_id))

    dataset_config = DATASET_CONFIGS[dataset_name]
    align_files = dataset_config.get_align_files()

    # Be particular about which alignments we can evaluate.
    afs = []
    for align_file in align_files:
        alignment = Alignment(align_file)
        n_gapped_cols = 0
        for i in xrange(len(alignment.msa[0])):
            col = get_column(i, alignment.msa)
            if col.count('-') > len(col) / 2:
                n_gapped_cols += 1
        if n_gapped_cols > len(alignment.msa[0]) / 2:
            continue
        include = True
        for batchscore_id in batchscore_ids:
            sc_dir = os.path.join(ds_dir, batchscore_id)
            out_file = dataset_config.get_out_file(align_file, sc_dir)
            if not os.path.exists(out_file):
                include = False
                break
        if include:
            afs.append(align_file)

    print "Evaluating dataset %r: %d/%d scored alignments after minor filtering" \
            % (dataset_name, len(afs), len(align_files))
    if align_files_only:
        for af in afs:
            yield af
        return

    # Iterate through score files in dataset, per alignment.
    for align_file in afs:
        scores_cols = []
        for batchscore_id in batchscore_ids:
            sc_dir = os.path.join(ds_dir, batchscore_id)
            out_file = dataset_config.get_out_file(align_file, sc_dir)
            scores = read_batchscores(out_file)
            scores_cols.append(scores)
        alignment = Alignment(align_file,
                test_file=dataset_config.get_test_file(align_file),
                parse_testset_fn=dataset_config.parse_testset_fn)
        yield alignment, scores_cols
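
A hypothetical consumer loop (the dataset name and scorer ids are illustrative):

ids = ["js_divergence", "rate4site"]
for alignment, scores_cols in get_batchscores("csa", ids):
    for scorer_id, scores in zip(ids, scores_cols):
        # One score per alignment column, read back from the batch score files.
        assert len(scores) == len(alignment.msa[0])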