def _score(self, alignment):
    if self.use_seq_weights:
        seq_weights = alignment.get_seq_weights()
    else:
        seq_weights = [1.0] * len(alignment.msa)

    if self.bg_distribution is None:
        # Estimate bg distribution from this alignment.
        q = weighted_freq_count_pseudocount(
                (aa for seq in alignment.msa for aa in seq),
                seq_weights, PSEUDOCOUNT)
    else:
        q = self.bg_distribution

    scores = []
    for i in xrange(len(alignment.msa[0])):
        col = get_column(i, alignment.msa)
        n_gaps = col.count("-")
        assert n_gaps < len(col)
        # Use float division so the gap fraction is not truncated to 0 by
        # Python 2 integer division.
        if self.gap_cutoff != 1 and float(n_gaps) / len(col) > self.gap_cutoff:
            score = self.SCORE_OVER_GAP_CUTOFF
        else:
            score = self._score_col(col, seq_weights, q)
            if self.use_gap_penalty:
                # vn_entropy has this commented out for some reason
                score *= weighted_gap_penalty(col, seq_weights)
        scores.append(score)
    return scores
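# `get_column` and `weighted_gap_penalty` are helpers defined elsewhere in
# the codebase. As a rough sketch of the behavior `_score` relies on (one
# plausible implementation, not necessarily the exact one used here):

def get_column(i, msa):
    # Column i of the alignment, one character per sequence.
    return [seq[i] for seq in msa]

def weighted_gap_penalty(col, seq_weights):
    # Weighted fraction of non-gap residues in the column, so scores of
    # heavily gapped columns are scaled toward 0.
    if len(seq_weights) != len(col):
        seq_weights = [1.0] * len(col)
    gap_sum = sum(w for aa, w in zip(col, seq_weights) if aa == '-')
    return 1.0 - gap_sum / sum(seq_weights)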
def _score(self, alignment):
    if self.use_seq_weights:
        seq_weights = alignment.get_seq_weights()
    else:
        seq_weights = [1.0] * len(alignment.msa)

    if hasattr(self, 'bg_distribution') and not self.bg_distribution:
        # Estimate bg distribution from this alignment and cache it.
        # Normalize counts to frequencies and skip gap characters, which
        # are not amino acids.
        counts = dict((aa, 0) for aa in amino_acids)
        for seq in alignment.msa:
            for aa in seq:
                if aa in counts:
                    counts[aa] += 1
        total = float(sum(counts.itervalues()))
        self.bg_distribution = dict(
                (aa, n / total) for aa, n in counts.iteritems())

    scores = []
    for i in xrange(len(alignment.msa[0])):
        col = get_column(i, alignment.msa)
        n_gaps = col.count('-')
        assert n_gaps < len(col)
        if self.gap_cutoff != 1 and float(n_gaps) / len(col) > self.gap_cutoff:
            score = self.SCORE_OVER_GAP_CUTOFF
        else:
            score = self._score_col(col, seq_weights)
            if self.use_gap_penalty:
                # vn_entropy has this commented out for some reason
                score *= weighted_gap_penalty(col, seq_weights)
        scores.append(score)
    return scores
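# In this variant `_score_col` is called without an explicit background
# distribution, so subclasses presumably read `self.bg_distribution` set
# above. A minimal, hypothetical sketch of such a method (a relative-entropy
# score; the real subclasses define their own):

import math

def _score_col(self, col, seq_weights):
    # Weighted residue frequencies in the column, ignoring gaps.
    total = sum(w for aa, w in zip(col, seq_weights) if aa != '-')
    p = {}
    for aa, w in zip(col, seq_weights):
        if aa != '-':
            p[aa] = p.get(aa, 0.0) + w / total
    # Relative entropy of the column frequencies vs. the background.
    return sum(p_aa * math.log(p_aa / self.bg_distribution[aa])
               for aa, p_aa in p.iteritems())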
def write_scores(alignment, scores_cols, scorer_names, f=sys.stdout, header=None):
    """
    Write scores to `f` in human-readable format. Columns written are:
    (position index, aa's in column, score by scorer 1, score by scorer 2, ...)
    Note that score.py has been changed to only support scoring with a single
    scorer, so only 3 columns are printed.
    """
    # Sanity check.
    if len(scores_cols) != len(scorer_names):
        raise ValueError("Mismatch between inputs 'scores_cols' and 'scorer_names'")

    f.write("# Alignment: %s\n" % alignment.align_file)
    f.write("# Num sites: %d\n" % len(alignment.msa[0]))
    f.write("# Num sequences: %d\n" % len(alignment.names))
    if alignment.filtered:
        f.write("# Num sequences before filtering: %d\n"
                % alignment.orig_num_sequences)
    f.write("\n")

    if header:
        # Check if header lines start with '#'?
        f.write(header)
        f.write("\n")

    # Print scores.
    f.write("# i\tcolumn\t%s\n" % "\t".join(scorer_names))
    score_tups = zip(*scores_cols)
    for i, score_tup in enumerate(score_tups):
        site = "".join(get_column(i, alignment.msa))
        f.write("%d\t%s\t%s\n"
                % (i+1, site, "\t".join(map(write_score_helper, score_tup))))
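# For reference, the output of write_scores looks like this (columns are
# tab-separated; scorer name and score values purely illustrative):
#
#   # Alignment: example.aln
#   # Num sites: 3
#   # Num sequences: 4
#
#   # i	column	js_divergence
#   1	MMML	0.8732
#   2	AAST	0.4126
#   3	CCCC	0.9511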
def _estimate_r(self, alignment, names_map, prior_distr):
    tree = alignment.get_phylotree()
    # Pre-compute the probabilities for every branch and rate. This can be
    # done because the discrete gamma distribution tells us which rates
    # P(rt) will be computed for when scoring columns.
    P_cached = precompute_tree_probs(tree, prior_distr.get_rates(), self.sub_model)

    # E-step
    rates = []
    rates_for_est = []
    log_marginal = 0
    for i in xrange(len(alignment.msa[0])):
        col = get_column(i, alignment.msa)
        n_gaps = col.count('-')
        assert n_gaps < len(col)
        if n_gaps == len(col) - 1:
            # Return mean rate.
            rate = 1
        else:
            rate, marginal = self._estimate_r_col(
                    col, names_map, prior_distr, P_cached, tree)
            rates_for_est.append(rate)
            log_marginal += np.log(marginal)
        rates.append(rate)
    return rates, rates_for_est, log_marginal
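# The per-column E-step assumed above computes a posterior mean over the
# discrete gamma rate categories: with rates r_k and priors P(r_k),
#   marginal = sum_k P(col | r_k) * P(r_k)
#   rate     = sum_k r_k * P(col | r_k) * P(r_k) / marginal
# A hypothetical sketch (`get_probs` and `_col_likelihood` are assumed
# names, not necessarily the real API):

def _estimate_r_col(self, col, names_map, prior_distr, P_cached, tree):
    rates = prior_distr.get_rates()
    priors = prior_distr.get_probs()  # assumed accessor for P(r_k)
    joint = [self._col_likelihood(col, names_map, P_cached, tree, k) * priors[k]
             for k in xrange(len(rates))]  # P(col | r_k) * P(r_k)
    marginal = sum(joint)
    rate = sum(r * j for r, j in zip(rates, joint)) / marginal
    return rate, marginal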
def get_batchscores(dataset_name, batchscore_ids=[], align_files_only=False):
    """
    Useful for evaluators. Get an iterator on (alignment, scores_col) where
    scores_col consists of lists of scores for each id in `batchscore_ids`.
    This iterator is over all alignments in `dataset_name`.
    """
    # Sanity check.
    ds_dir = get_batchscore_dir(dataset_name)
    if not os.path.exists(ds_dir):
        raise IOError("%s for dataset %r does not exist" % (ds_dir, dataset_name))
    for batchscore_id in batchscore_ids:
        sc_dir = os.path.join(ds_dir, batchscore_id)
        if not os.path.exists(sc_dir):
            raise IOError("%s for dataset %r, scorer %r does not exist"
                    % (sc_dir, dataset_name, batchscore_id))

    dataset_config = DATASET_CONFIGS[dataset_name]
    align_files = dataset_config.get_align_files()

    # Be particular about which alignments we can evaluate.
    afs = []
    for align_file in align_files:
        alignment = Alignment(align_file)
        # Skip alignments in which more than half of the columns are more
        # than half gaps.
        n_gapped_cols = 0
        for i in xrange(len(alignment.msa[0])):
            col = get_column(i, alignment.msa)
            if col.count('-') > len(col) / 2:
                n_gapped_cols += 1
        if n_gapped_cols > len(alignment.msa[0]) / 2:
            continue
        # Skip alignments that are missing scores for any requested scorer.
        include = True
        for batchscore_id in batchscore_ids:
            sc_dir = os.path.join(ds_dir, batchscore_id)
            out_file = dataset_config.get_out_file(align_file, sc_dir)
            if not os.path.exists(out_file):
                include = False
                break
        if include:
            afs.append(align_file)
    print "Evaluating dataset %r: %d/%d scored alignments after minor filtering" \
            % (dataset_name, len(afs), len(align_files))

    if align_files_only:
        for af in afs:
            yield af
        return

    # Iterate through score files in dataset, per alignment.
    for align_file in afs:
        scores_cols = []
        for batchscore_id in batchscore_ids:
            sc_dir = os.path.join(ds_dir, batchscore_id)
            out_file = dataset_config.get_out_file(align_file, sc_dir)
            scores_cols.append(read_batchscores(out_file))
        alignment = Alignment(align_file,
                test_file=dataset_config.get_test_file(align_file),
                parse_testset_fn=dataset_config.parse_testset_fn)
        yield alignment, scores_cols
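# Example usage (dataset name and batchscore ids are hypothetical):
#
# for alignment, scores_cols in get_batchscores('csa',
#         ['js_divergence', 'rate4site']):
#     # One list of per-column scores per requested batchscore id.
#     for scores in scores_cols:
#         assert len(scores) == len(alignment.msa[0])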