import os
import random

import numpy as np
import matplotlib.pyplot as plt

# Project-local helpers (DATASET_CONFIGS, get_batchscore_dir, Alignment,
# get_column, read_batchscores, write_batchscores, norm_scores,
# window_scores, Rate4siteEb, JsDivergence, MockAlignment,
# precompute_tree_probs, weighted_choice, amino_acids, N_RUNS) are assumed
# to be imported or defined elsewhere in this module.


def adjust_scores(dataset_name, *batchscore_ids, **kwargs):
    """
    Create adjusted (normalized, windowed, or negated) versions of the
    batchscore runs in `batchscore_ids`. Note that for 'norm', z-scores
    with an absolute value higher than 5 are clamped (set to 5).
    """
    overwrite = kwargs.get('overwrite', False)
    # Determine the adjustment type; exactly one of 'norm', 'window', or
    # 'neg' must be given.
    adj_type = None
    for k, v in kwargs.iteritems():
        if k in ('norm', 'window', 'neg'):
            if adj_type:
                raise ValueError("Cannot adjust in multiple ways: %s, %s"
                                 % (adj_type, k))
            if k == 'window':
                window_size = int(v)
                adj_type = '%s_%d' % (k, window_size)
            else:
                adj_type = k
    if adj_type is None:
        raise ValueError("No adjustment given; expected one of "
                         "'norm', 'window', or 'neg'")
    # Perform the adjustment on every score file of every requested run.
    for batchscore_id in batchscore_ids:
        dc = DATASET_CONFIGS[dataset_name]
        ds_dir = get_batchscore_dir(dataset_name)
        sc_dir = os.path.join(ds_dir, batchscore_id)
        sc_dir_adj = "%s-%s" % (sc_dir, adj_type)
        if not overwrite:
            resp = raw_input("Creating %s'd version of\n %s\nat\n %s\n"
                             "Continue? y/[n]: "
                             % (adj_type, sc_dir, sc_dir_adj))
            if resp != 'y':
                continue
        if not os.path.exists(sc_dir_adj):
            os.mkdir(sc_dir_adj)
        else:
            # Clear out any previously adjusted scores.
            for fname in os.listdir(sc_dir_adj):
                os.remove(os.path.join(sc_dir_adj, fname))
        align_files = dc.get_align_files()
        for align_file in align_files:
            out_file = dc.get_out_file(align_file, sc_dir)
            if not os.path.exists(out_file):
                continue
            scores = read_batchscores(out_file)
            # Compare strings with '==', not 'is'.
            if adj_type == 'norm':
                scores_adj = norm_scores(scores, filter=5)
            elif adj_type == 'neg':
                scores_adj = [-s for s in scores]
            elif adj_type.startswith('window'):
                scores_adj = window_scores(scores, window_size)
            out_file_adj = os.path.join(sc_dir_adj,
                                        os.path.split(out_file)[-1])
            write_batchscores(out_file_adj, scores_adj)
        print "Created %s" % sc_dir_adj
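# `adjust_scores` relies on two helpers defined elsewhere in this module.
# The sketches below are assumptions about their behavior, not this
# module's actual implementations: `norm_scores` z-normalizes a score list
# and clamps magnitudes above `filter` (per the docstring above), and
# `window_scores` smooths each score over a centered window, treating
# `window_size` as the half-width.
def _norm_scores_sketch(scores, filter=5):
    # Hypothetical: z-normalize, then clamp to [-filter, filter].
    mean = sum(scores) / float(len(scores))
    std = (sum((s - mean) ** 2 for s in scores) / float(len(scores))) ** 0.5
    zs = [(s - mean) / std for s in scores]
    return [max(-filter, min(filter, z)) for z in zs]


def _window_scores_sketch(scores, window_size):
    # Hypothetical: replace each score by the mean over a centered window.
    out = []
    for i in xrange(len(scores)):
        lo = max(0, i - window_size)
        hi = min(len(scores), i + window_size + 1)
        out.append(sum(scores[lo:hi]) / float(hi - lo))
    return out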
def get_batchscores(dataset_name, batchscore_ids=(), align_files_only=False):
    """
    Useful for evaluators. Get an iterator over (alignment, scores_cols)
    pairs, where scores_cols is a list of score lists, one per id in
    `batchscore_ids`. The iterator covers all alignments in `dataset_name`.
    """
    # Sanity checks.
    ds_dir = get_batchscore_dir(dataset_name)
    if not os.path.exists(ds_dir):
        raise IOError("%s for dataset %r does not exist"
                      % (ds_dir, dataset_name))
    for batchscore_id in batchscore_ids:
        sc_dir = os.path.join(ds_dir, batchscore_id)
        if not os.path.exists(sc_dir):
            raise IOError("%s for dataset %r, scorer %r does not exist"
                          % (sc_dir, dataset_name, batchscore_id))
    dataset_config = DATASET_CONFIGS[dataset_name]
    align_files = dataset_config.get_align_files()
    # Be particular about which alignments we can evaluate: skip
    # alignments in which more than half the columns are more than half
    # gaps, and alignments missing scores for any requested run.
    afs = []
    for align_file in align_files:
        alignment = Alignment(align_file)
        n_gapped_cols = 0
        for i in xrange(len(alignment.msa[0])):
            col = get_column(i, alignment.msa)
            if col.count('-') > len(col) / 2:
                n_gapped_cols += 1
        if n_gapped_cols > len(alignment.msa[0]) / 2:
            continue
        include = True
        for batchscore_id in batchscore_ids:
            sc_dir = os.path.join(ds_dir, batchscore_id)
            out_file = dataset_config.get_out_file(align_file, sc_dir)
            if not os.path.exists(out_file):
                include = False
                break
        if include:
            afs.append(align_file)
    print "Evaluating dataset %r: %d/%d scored alignments after minor filtering" \
            % (dataset_name, len(afs), len(align_files))
    if align_files_only:
        for af in afs:
            yield af
        return
    # Iterate through the score files in the dataset, per alignment.
    for align_file in afs:
        scores_cols = []
        for batchscore_id in batchscore_ids:
            sc_dir = os.path.join(ds_dir, batchscore_id)
            out_file = dataset_config.get_out_file(align_file, sc_dir)
            scores = read_batchscores(out_file)
            scores_cols.append(scores)
        alignment = Alignment(align_file,
                              test_file=dataset_config.get_test_file(align_file),
                              parse_testset_fn=dataset_config.parse_testset_fn)
        yield alignment, scores_cols
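# A minimal usage sketch for an evaluator built on `get_batchscores`. The
# dataset name and batchscore id here are hypothetical examples; they are
# not necessarily runs that exist on disk.
def _mean_scores_sketch():
    for alignment, scores_cols in get_batchscores('some_dataset',
                                                  ['JSD-vanilla']):
        scores = scores_cols[0]
        print "mean score over %d columns: %.3f" \
                % (len(scores), sum(scores) / float(len(scores)))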
def r4s_ppc(dataset_name, **jsd_params):
    afs = list(get_batchscores(dataset_name, align_files_only=True))
    dc = DATASET_CONFIGS[dataset_name]
    r4s_name = 'R4S_EB-vanilla'
    r4s_dir = os.path.join(get_batchscore_dir(dataset_name), r4s_name)
    r4s = Rate4siteEb()
    jsd = JsDivergence(**jsd_params)
    # Choose a random alignment/scores pair.
    align_file = random.choice(afs)
    test_file = dc.get_test_file(align_file)
    r4s_file = dc.get_out_file(align_file, r4s_dir)
    alignment = Alignment(align_file, test_file=test_file,
                          parse_testset_fn=dc.parse_testset_fn)
    n_seqs = len(alignment.msa)
    n_sites = len(alignment.msa[0])
    fig = plt.figure()
    ax = plt.gca()
    inds = range(n_sites)
    rates = read_batchscores(r4s_file)
    tree = alignment.get_phylotree()
    root = tree.root
    names_map = dict((name, i) for i, name in enumerate(alignment.names))
    # Pre-compute substitution probabilities per branch, for every site
    # (i.e., rate), converting each row to a cumulative distribution for
    # fast sampling.
    P_cached = precompute_tree_probs(tree, rates, r4s.sub_model)
    for r in P_cached:
        for node in P_cached[r]:
            # The root is treated separately, via the equilibrium
            # frequencies below.
            if node != root:
                cum_probs = np.cumsum(P_cached[r][node], axis=1)
                if not np.all(np.abs(cum_probs[:, -1] - 1) < 1e-4):
                    raise ValueError("Bad probability matrix")
                cum_probs[:, -1] = 1
                P_cached[r][node] = np.array(cum_probs)
    root_freqs = np.cumsum(r4s.sub_model.freqs)
    if not abs(root_freqs[-1] - 1) < 1e-4:
        raise ValueError("Bad probability matrix")
    root_freqs[-1] = 1
    # Replicate the alignment N_RUNS times.
    jsd_rep_scores_all = []
    for n in xrange(N_RUNS):
        # For each site, generate amino acids for each sequence using that
        # site's rate: draw a root residue from the equilibrium
        # frequencies, then traverse the tree breadth-first, mutating the
        # residue along each branch.
        msa = [[] for i in xrange(n_seqs)]
        for i in xrange(n_sites):
            r = rates[i]
            aa_ind = weighted_choice(root_freqs)
            bfs = [(aa_ind, node) for node in root.clades]
            for aa_ind, node in bfs:
                aa_ind = weighted_choice(P_cached[r][node][aa_ind])
                if node.is_terminal():
                    msa[names_map[node.name]].append(amino_acids[aa_ind])
                else:
                    # Extending the list we are iterating over is how the
                    # breadth-first traversal proceeds.
                    bfs += [(aa_ind, child) for child in node.clades]
        aln_rep = MockAlignment(alignment.names, msa, tree,
                                alignment.get_seq_weights)
        jsd_rep_scores = jsd.score(aln_rep)
        jsd_rep_scores_all.append(jsd_rep_scores)
        ax.scatter(inds, jsd_rep_scores, color='k', alpha=0.2)
    jsd_orig_scores = jsd.score(alignment)
    ts = alignment.testset
    pos_inds = [i for i in inds if ts[i]]
    pos_orig_scores = [jsd_orig_scores[i] for i in pos_inds]
    neg_inds = [i for i in inds if not ts[i]]
    neg_orig_scores = [jsd_orig_scores[i] for i in neg_inds]
    ax.scatter(pos_inds, pos_orig_scores, color='g')
    ax.scatter(neg_inds, neg_orig_scores, color='r')
    ax.set_ylabel('JSD score')
    # Z-score of each observed JSD against its replicate distribution.
    rep_scores_per_col = np.array(jsd_rep_scores_all).T
    means = np.mean(rep_scores_per_col, axis=1)
    stds = np.std(rep_scores_per_col, axis=1)
    zscores = (np.array(jsd_orig_scores) - means) / stds
    ax2 = ax.twinx()
    ax2.plot(inds, zscores, '--')
    ax2.set_ylabel('Deviations')
    plt.xlim(inds[0], inds[-1])
    plt.xlabel('Sites')
    plt.figure()
    plt.scatter(rates, zscores)
    plt.xlabel('Rates')
    plt.ylabel('Deviations')
    plt.show(block=False)
    # Drop into the debugger so the plots can be inspected interactively.
    import ipdb
    ipdb.set_trace()
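# `r4s_ppc` samples residues via `weighted_choice`, defined elsewhere in
# this module. Since the probability rows above are converted to
# cumulative sums (with the last entry forced to 1), a plausible sketch
# (an assumption, not the module's actual helper) is a scan over the
# cumulative distribution:
def _weighted_choice_sketch(cum_probs):
    # Hypothetical: draw u ~ U[0, 1) and return the first index whose
    # cumulative probability exceeds it.
    u = random.random()
    for i in xrange(len(cum_probs)):
        if u < cum_probs[i]:
            return i
    return len(cum_probs) - 1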