def _convert_clustering_to_assignment(clusters):
  # Map each variant ID to the index of the cluster containing it.
  mapping = {vid: cidx for cidx, cluster in enumerate(clusters) for vid in cluster}
  vids = common.sort_vids(mapping.keys())
  assign = np.array([mapping[vid] for vid in vids], dtype=np.int32)
  return (vids, assign)
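# Example (sketch, not part of the module): with hypothetical variant IDs
# 's0'..'s2', and assuming `common.sort_vids` orders them numerically:
#
#   vids, assign = _convert_clustering_to_assignment([['s2', 's0'], ['s1']])
#   # vids == ['s0', 's1', 's2']
#   # assign == np.array([0, 1, 0], dtype=np.int32)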
def sort_mutphi(mphi):
  sorted_vids = common.sort_vids(mphi.vids)
  # `mapping[i]` gives the old index of the variant now at sorted position `i`.
  mapping = [mphi.vids.index(V) for V in sorted_vids]
  assert sorted_vids == [mphi.vids[idx] for idx in mapping]
  sorted_logprobs = np.array([mphi.logprobs[idx] for idx in mapping])
  return mutphi.Mutphi(
    vids = sorted_vids,
    assays = mphi.assays,
    logprobs = sorted_logprobs,
  )
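# Example (sketch): sorting a hypothetical Mutphi whose vids are out of order.
# This assumes mutphi.Mutphi is a namedtuple-like container with `vids`,
# `assays`, and one row of `logprobs` per variant.
#
#   mphi = mutphi.Mutphi(vids=['s1', 's0'], assays=['S1'], logprobs=np.array([[-1.], [-2.]]))
#   sorted_mphi = sort_mutphi(mphi)
#   # sorted_mphi.vids == ['s0', 's1']
#   # sorted_mphi.logprobs == np.array([[-2.], [-1.]])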
def make_membership_mat(clusters):
  vids = common.sort_vids([vid for C in clusters for vid in C])
  vidmap = {vid: vidx for vidx, vid in enumerate(vids)}
  N = len(vids)
  K = len(clusters)

  # membership[i,j] = 1 iff mutation `i` is in cluster `j`
  membership = np.zeros((N, K))
  for cidx, C in enumerate(clusters):
    members = [vidmap[vid] for vid in C]
    membership[members,cidx] = 1
  return (vids, membership)
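# Example (sketch): the membership matrix for two hypothetical clusters.
#
#   vids, membership = make_membership_mat([['s0', 's2'], ['s1']])
#   # vids == ['s0', 's1', 's2']
#   # membership == [[1., 0.],
#   #                [0., 1.],
#   #                [1., 0.]]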
def sort_mutrel_by_vids(mrel):
  sorted_vids = common.sort_vids(mrel.vids)
  if mrel.vids == sorted_vids:
    return mrel

  # `order[i]` gives the old index of the variant now at sorted position `i`.
  sort_map = {vid: vidx for vidx, vid in enumerate(mrel.vids)}
  order = [sort_map[vid] for vid in sorted_vids]
  assert sorted_vids == [mrel.vids[idx] for idx in order]
  return Mutrel(
    vids=sorted_vids,
    rels=reorder_array(mrel.rels, order),
  )
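# Example (sketch): `reorder_array` (defined elsewhere in this module) is
# assumed to apply `order` along each variant axis of `rels`, so rows and
# columns of the pairwise-relations tensor stay consistent with the sorted
# vids. E.g., for hypothetical vids ['s1', 's0'], order == [1, 0]:
#
#   sorted_mrel = sort_mutrel_by_vids(mrel)
#   assert sorted_mrel.vids == common.sort_vids(mrel.vids)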
def main():
  parser = argparse.ArgumentParser(
    description='Flag mutations whose implied phi_hat is implausibly high and add them to the garbage list in the params file',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
  )
  parser.add_argument('--phi-hat-threshold', type=float, default=1 - 1e-2,
    help='phi_hat value above which a mutation is considered suspect in a sample')
  parser.add_argument('--quantile', type=float, default=0.5,
    help='Quantile of per-sample phi_hat values compared against the threshold when deciding whether to flag a mutation')
  parser.add_argument('--print-bad-data', action='store_true')
  parser.add_argument('in_ssm_fn')
  parser.add_argument('in_params_fn')
  parser.add_argument('out_params_fn')
  args = parser.parse_args()

  np.set_printoptions(linewidth=400, precision=3, threshold=sys.maxsize, suppress=True)
  np.seterr(divide='raise', invalid='raise', over='raise')

  ssms = inputparser.load_ssms(args.in_ssm_fn)
  params = inputparser.load_params(args.in_params_fn)
  ssms = inputparser.remove_garbage(ssms, params['garbage'])

  bad_vids, bad_samp_prop = _remove_bad(ssms, args.phi_hat_threshold, args.quantile, args.print_bad_data)
  bad_ssm_prop = len(bad_vids) / len(ssms)

  if len(bad_vids) > 0:
    params['garbage'] = common.sort_vids(params['garbage'] + bad_vids)
  with open(args.out_params_fn, 'w') as F:
    json.dump(params, F)

  stats = {
    'bad_ssms': common.sort_vids(bad_vids),
    'bad_samp_prop': '%.3f' % bad_samp_prop,
    'bad_ssm_prop': '%.3f' % bad_ssm_prop,
  }
  for K, V in stats.items():
    print('%s=%s' % (K, V))
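# Example invocation (the script name and file names are hypothetical):
#
#   python3 remove_high_phi.py --phi-hat-threshold 0.99 --quantile 0.5 \
#     in.ssm in_params.json out_params.json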
def convert(params_fn, calder_mats_fn, calder_trees_fn, neutree_fn):
  params = inputparser.load_params(params_fn)
  mats, row_labels, col_labels = _load_mats(calder_mats_fn)

  assert row_labels['Fhat'][0] == 'samples'
  svids = row_labels['Fhat'][1:]
  assert svids == common.sort_vids(svids)
  struct = _load_struct(svids, calder_trees_fn)

  ntree = neutree.Neutree(
    structs = [struct],
    phis = [mats['Fhat']],
    counts = np.array([1]),
    logscores = np.array([0.]),
    clusterings = [params['clusters']],
    garbage = params['garbage'],
  )
  neutree.save(ntree, neutree_fn)
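# Sketch of the assumed CALDER matrix layout: the first row label of 'Fhat' is
# the literal string 'samples', and the remaining row labels are variant IDs
# already in sorted order, e.g. ['samples', 's0', 's1', ...]. The single CALDER
# tree is wrapped in a one-element Neutree with a count of 1 and log-score 0.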
def make_varids_contiguous(variants, garbage, clusters):
  # Note: `garbage` is accepted for signature compatibility but is not used
  # here; callers are expected to have already removed garbage variants.
  mapping = {}
  for new_idx, old_varid in enumerate(common.extract_vids(variants)):
    mapping[old_varid] = 's%s' % new_idx

  new_variants = {mapping[V]: variants[V] for V in variants.keys()}
  for V in new_variants.keys():
    new_variants[V]['id'] = V

  new_clusters = [common.sort_vids([mapping[V] for V in C]) for C in clusters]

  assert set(new_variants.keys()) == \
    set([V for C in new_clusters for V in C]) == \
    set([V['id'] for V in new_variants.values()])
  assert not np.any(np.array([len(C) for C in new_clusters]) == 0)
  return (new_variants, new_clusters)
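# Example (sketch): renumbering hypothetical variant IDs. If `variants` retains
# only 's0' and 's3' (with 's1' and 's2' previously removed as garbage), the
# survivors are renamed to 's0' and 's1', and `clusters` is rewritten in terms
# of the new contiguous IDs.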
def write_snvs(variants, sampnames, garbage, snv_fn, normal_vaf=0.0):
  # Prepend a synthetic normal sample with a fixed VAF.
  sampnames = ['Normal'] + sampnames
  snv_indices = {}

  with open(snv_fn, 'w') as F:
    print('#chr', 'position', 'description', *sampnames, sep='\t', file=F)
    vids = common.sort_vids(variants.keys())
    idx = 1
    for vid in vids:
      if vid in garbage:
        continue
      vaf = (variants[vid]['var_reads'] / variants[vid]['total_reads']).tolist()
      vaf = [normal_vaf] + vaf
      print('1', idx, vid, *vaf, sep='\t', file=F)
      snv_indices[vid] = idx
      idx += 1

  return snv_indices
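# Example (sketch): for two hypothetical variants 's0' and 's1' in one sample
# named 'Sample1', with VAFs 0.4 and 0.25, the tab-separated output is:
#
#   #chr  position  description  Normal  Sample1
#   1     1         s0           0.0     0.4
#   1     2         s1           0.0     0.25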
def write_clusters(variants, clusters, snv_indices, cluster_fn, normal_vaf=0.0):
  rows = []
  for cluster in clusters:
    cvars = [variants[V] for V in cluster]
    # Pool reads across the cluster's variants to get a per-sample cluster VAF.
    var_reads = np.sum(extract_mat(cvars, 'var_reads'), axis=0)
    total_reads = np.sum(extract_mat(cvars, 'total_reads'), axis=0)
    cvaf = (var_reads / total_reads).tolist()
    cvaf = [normal_vaf] + cvaf

    sampmask = '0' + (len(cvaf) - 1) * '1'
    snv_idxs = [str(snv_indices[V]) for V in common.sort_vids(cluster)]
    rows.append([sampmask] + cvaf + [','.join(snv_idxs)])

  with open(cluster_fn, 'w') as F:
    for row in rows:
      print(*row, sep='\t', file=F)
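# Example (sketch): each output row describes one cluster. `sampmask` marks the
# synthetic normal sample as absent ('0') and every tumour sample as present
# ('1'); the final column lists the 1-based SNV indices assigned by
# write_snvs(). With two tumour samples, a row might look like:
#
#   011   0.0   0.32   0.28   1,3,7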
def impute_garbage(mstats, garbage, _impute):
  if len(garbage) == 0:
    return mstats

  garbage = set(garbage)
  old_vids = set(mstats.vids)
  assert len(old_vids & garbage) == 0
  new_vids = common.sort_vids(old_vids | garbage)

  M_old, S = mstats.stats.shape
  M_new = len(new_vids)
  new_stats = np.full((M_new, S), np.nan)
  new_assays = mstats.assays

  idxmap = {vid: idx for idx, vid in enumerate(mstats.vids)}
  for idx, vid in enumerate(new_vids):
    if vid in old_vids:
      new_stats[idx] = mstats.stats[idxmap[vid]]
    else:
      new_stats[idx] = _impute(vid)
  assert not np.any(np.isnan(new_stats))

  return Mutstat(vids=new_vids, stats=new_stats, assays=new_assays)
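# Example (sketch): imputing stats for a hypothetical garbage variant. The
# `_impute` callback receives the variant ID and must return a length-S row.
#
#   mstats = Mutstat(vids=['s0', 's2'], stats=np.zeros((2, 3)), assays=['S1', 'S2', 'S3'])
#   full = impute_garbage(mstats, ['s1'], lambda vid: np.full(3, -1.))
#   # full.vids == ['s0', 's1', 's2']
#   # full.stats[1] == [-1., -1., -1.]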
def load_nongarb_vids(variants, garbage):
  vids = set(variants.keys())
  nongarb_vids = common.sort_vids(vids - set(garbage))
  return nongarb_vids
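# Example (sketch):
#
#   load_nongarb_vids({'s0': {}, 's1': {}, 's2': {}}, ['s1'])
#   # == ['s0', 's2']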
def main():
  parser = argparse.ArgumentParser(
    description='Find variants with likely incorrect var_read_prob by comparing model with provided var_read_prob to haploid (LOH) model using Bayes factors',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
  )
  parser.add_argument('--logbf-threshold', type=float, default=10.,
    help='Logarithm of Bayes factor threshold at which the haploid model is accepted as more likely model than the model using the provided var_read_prob')
  parser.add_argument('--verbose', action='store_true',
    help='Print debugging messages')
  parser.add_argument('--ignore-existing-garbage', action='store_true',
    help='Ignore any existing garbage variants listed in in_params_fn and test all variants. If not specified, any existing garbage variants will be kept as garbage and not tested again.')
  parser.add_argument('--action', choices=('add_to_garbage', 'modify_var_read_prob'), default='add_to_garbage')
  parser.add_argument('--var-read-prob-alt', type=float, default=1.)
  parser.add_argument('in_ssm_fn',
    help='Input SSM file with mutations')
  parser.add_argument('in_params_fn',
    help='Input params file listing sample names and any existing garbage mutations')
  parser.add_argument('out_ssm_fn',
    help='Output SSM file with modified list of garbage mutations')
  parser.add_argument('out_params_fn',
    help='Output params file with modified list of garbage mutations')
  args = parser.parse_args()

  np.set_printoptions(linewidth=400, precision=3, threshold=sys.maxsize, suppress=True)
  np.seterr(divide='raise', invalid='raise', over='raise')

  if args.ignore_existing_garbage:
    variants, params = inputparser.load_ssms_and_params(args.in_ssm_fn, args.in_params_fn, remove_garb=False)
    params['garbage'] = []
  else:
    variants, params = inputparser.load_ssms_and_params(args.in_ssm_fn, args.in_params_fn)

  bad_vids, bad_samp_prop = _remove_bad(variants, args.logbf_threshold, args.var_read_prob_alt, args.verbose)
  bad_ssm_prop = len(bad_vids) / len(variants)

  if args.action == 'add_to_garbage':
    params['garbage'] = common.sort_vids(set(bad_vids) | set(params['garbage']))
  elif args.action == 'modify_var_read_prob':
    for vid in bad_vids:
      variants[vid]['omega_v'][:] = args.var_read_prob_alt
  else:
    raise Exception('Unknown action: %s' % args.action)

  inputparser.write_ssms(variants, args.out_ssm_fn)
  with open(args.out_params_fn, 'w') as F:
    json.dump(params, F)

  stats = {
    'num_bad_ssms': len(bad_vids),
    'bad_ssms': common.sort_vids(bad_vids),
    'bad_samp_prop': '%.3f' % bad_samp_prop,
    'bad_ssm_prop': '%.3f' % bad_ssm_prop,
  }
  for K, V in stats.items():
    print('%s=%s' % (K, V))
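# Example invocation (the script name and file names are hypothetical):
#
#   python3 fix_bad_var_read_prob.py --logbf-threshold 10 --action add_to_garbage \
#     in.ssm in_params.json out.ssm out_params.json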
def main():
  all_plot_choices = set((
    'tree',
    'pairwise_separate',
    'pairwise_mle',
    'vaf_matrix',
    'phi',
    'phi_hat',
    'phi_interleaved',
    'cluster_stats',
    'eta',
    'diversity_indices',
  ))

  parser = argparse.ArgumentParser(
    description='Generate an HTML report of results, including tree, VAF, phi, and pairwise-relation plots',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
  )
  parser.add_argument('--seed', type=int)
  parser.add_argument('--tree-index', type=int, default=0)
  parser.add_argument('--plot', dest='plot_choices', type=lambda s: set(s.split(',')),
    help='Things to plot; by default, plot everything')
  parser.add_argument('--omit-plots', dest='omit_plots', type=lambda s: set(s.split(',')),
    help='Things to omit from plotting; overrides --plot')
  parser.add_argument('--runid')
  parser.add_argument('--reorder-subclones', action='store_true',
    help='Reorder subclones according to depth-first search through tree structure')
  parser.add_argument('--tree-json', dest='tree_json_fn',
    help='Additional external file in which to store JSON, which is already stored statically in the HTML file')
  parser.add_argument('--phi-orientation', choices=('samples_as_rows', 'populations_as_rows'), default='populations_as_rows')
  parser.add_argument('--remove-normal', action='store_true',
    help='Remove normal (non-cancerous) population 0 from tree, phi, and eta plots.')
  parser.add_argument('ssm_fn')
  parser.add_argument('params_fn')
  parser.add_argument('results_fn')
  parser.add_argument('discord_fn')
  parser.add_argument('html_out_fn')
  args = parser.parse_args()

  np.seterr(divide='raise', invalid='raise', over='raise')

  if args.seed is not None:
    random.seed(args.seed)
    np.random.seed(args.seed)

  plot_choices = _choose_plots(args.plot_choices, args.omit_plots, all_plot_choices)

  results = resultserializer.Results(args.results_fn)
  variants = inputparser.load_ssms(args.ssm_fn)
  params = inputparser.load_params(args.params_fn)
  discord = _parse_discord(args.discord_fn)

  # Extract the requested tree's data from the per-tree result arrays.
  data = {K: results.get(K)[args.tree_index] for K in (
    'struct',
    'count',
    'llh',
    'prob',
    'phi',
  )}
  data['garbage'] = results.get('garbage')
  data['clusters'] = results.get('clusters')
  data['samples'] = params['samples']
  data['clustrel_posterior'] = results.get_mutrel('clustrel_posterior')

  if args.reorder_subclones:
    data, params = _reorder_subclones(data, params)

  if 'hidden_samples' in params:
    hidden = set(params['hidden_samples'])
    assert hidden.issubset(set(data['samples'])) and len(hidden) < len(data['samples'])
    visible_sampidxs = [idx for idx, samp in enumerate(data['samples']) if samp not in hidden]
  else:
    visible_sampidxs = None

  samp_colours = params.get('samp_colours', None)
  pop_colours = params.get('pop_colours', None)
  if samp_colours is not None:
    assert set([S[0] for S in samp_colours]).issubset(data['samples'])
  if pop_colours is not None:
    assert len(pop_colours) == len(data['struct']) + 1

  supervars = clustermaker.make_cluster_supervars(data['clusters'], variants)
  supervars = [supervars[vid] for vid in common.sort_vids(supervars.keys())]

  with open(args.html_out_fn, 'w') as outf:
    write_header(args.runid, args.tree_index, outf)

    if 'tree' in plot_choices:
      tree_struct = util.make_tree_struct(
        data['struct'],
        data['count'],
        data['llh'],
        data['prob'],
        data['phi'],
        supervars,
        data['clusters'],
        data['samples'],
      )
      tree_struct['discord'] = discord
      _write_tree_html(
        tree_struct,
        args.tree_index,
        visible_sampidxs,
        samp_colours,
        pop_colours,
        'eta' in plot_choices,
        'diversity_indices' in plot_choices,
        'phi' in plot_choices,
        'phi_hat' in plot_choices,
        'phi_interleaved' in plot_choices,
        args.phi_orientation,
        args.remove_normal,
        outf,
      )
      if args.tree_json_fn is not None:
        _write_tree_json(tree_struct, args.tree_json_fn)

    if 'vaf_matrix' in plot_choices:
      vaf_plotter.plot_vaf_matrix(
        data['clusters'],
        variants,
        supervars,
        data['garbage'],
        data['phi'],
        data['samples'],
        should_correct_vaf=True,
        outf=outf,
      )

    if 'pairwise_mle' in plot_choices:
      relation_plotter.plot_ml_relations(data['clustrel_posterior'], outf)
    if 'pairwise_separate' in plot_choices:
      relation_plotter.plot_separate_relations(data['clustrel_posterior'], outf)

    if 'cluster_stats' in plot_choices:
      write_cluster_stats(data['clusters'], data['garbage'], supervars, variants, outf)

    write_footer(outf)
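# Example invocation (the script name and file names are hypothetical):
#
#   python3 plottree.py --runid example --tree-index 0 --plot tree,vaf_matrix \
#     example.ssm example.params.json example.results.npz example.discord.txt example.html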
def write(mstat, mutstatfn):
  # calc_mutstat should have created `mutstat` with sorted vids, but double-check
  # this is true.
  assert list(mstat.vids) == common.sort_vids(mstat.vids)
  np.savez_compressed(mutstatfn, stats=mstat.stats, vids=mstat.vids, assays=mstat.assays)
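# Example (sketch): np.savez_compressed appends '.npz' if `mutstatfn` lacks the
# extension, and the arrays can be recovered with np.load:
#
#   data = np.load('mutstat.npz')
#   stats, vids, assays = data['stats'], data['vids'], data['assays']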