Example #1
def _convert_clustering_to_assignment(clusters):
    mapping = {
        vid: cidx
        for cidx, cluster in enumerate(clusters) for vid in cluster
    }
    vids = common.sort_vids(mapping.keys())
    assign = np.array([mapping[vid] for vid in vids], dtype=np.int32)
    return (vids, assign)
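
A minimal usage sketch (illustrative, assuming common.sort_vids orders Pairtree-style IDs such as 's0', 's1', ... by their numeric suffix):

clusters = [['s1', 's0'], ['s2']]
vids, assign = _convert_clustering_to_assignment(clusters)
# vids   == ['s0', 's1', 's2']
# assign == array([0, 0, 1], dtype=int32), the cluster index of each sorted variant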
Example #2
def sort_mutphi(mphi):
  sorted_vids = common.sort_vids(mphi.vids)
  mapping = [mphi.vids.index(V) for V in sorted_vids]
  assert sorted_vids == [mphi.vids[idx] for idx in mapping]
  sorted_logprobs = np.array([mphi.logprobs[idx] for idx in mapping])
  return mutphi.Mutphi(
    vids = sorted_vids,
    assays = mphi.assays,
    logprobs = sorted_logprobs,
  )
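
Note that mphi.vids.index(V) rescans the list for every variant, so building `mapping` is quadratic in the number of variants. A dict-based lookup, the same idiom used in Example #4 below, is linear and equivalent provided the vids are unique:

vid_to_idx = {vid: idx for idx, vid in enumerate(mphi.vids)}
mapping = [vid_to_idx[V] for V in sorted_vids]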
Example #3
def make_membership_mat(clusters):
  vids = common.sort_vids([vid for C in clusters for vid in C])
  vidmap = {vid: vidx for vidx, vid in enumerate(vids)}
  N = len(vids)
  K = len(clusters)

  # membership[i,j] = 1 iff mutation `i` is in cluster `j`
  membership = np.zeros((N, K))
  for cidx, C in enumerate(clusters):
    members = [vidmap[vid] for vid in C]
    membership[members,cidx] = 1
  return (vids, membership)
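
A toy run for illustration (again assuming numeric-suffix ordering from common.sort_vids):

vids, membership = make_membership_mat([['s0', 's2'], ['s1']])
# vids == ['s0', 's1', 's2']
# membership == [[1., 0.],
#                [0., 1.],
#                [1., 0.]]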
Example #4
def sort_mutrel_by_vids(mrel):
    sorted_vids = common.sort_vids(mrel.vids)
    if mrel.vids == sorted_vids:
        return mrel

    sort_map = {vid: vidx for vidx, vid in enumerate(mrel.vids)}
    order = [sort_map[vid] for vid in sorted_vids]
    assert sorted_vids == [mrel.vids[idx] for idx in order]
    return Mutrel(
        vids=sorted_vids,
        rels=reorder_array(mrel.rels, order),
    )
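
reorder_array is defined elsewhere in the same module; for a pairwise-relations tensor of shape (N, N, ...), it presumably applies the permutation along both variant axes. A hedged sketch of that behaviour:

def reorder_array(arr, order):
    # Apply the same permutation to the first two (variant) axes.
    order = np.asarray(order)
    return arr[order][:, order]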
Example #5
def main():
    parser = argparse.ArgumentParser(
        description='LOL HI THERE',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--phi-hat-threshold',
                        type=float,
                        default=1 - 1e-2,
                        help='Blah')
    parser.add_argument('--quantile', type=float, default=0.5, help='Blah')
    parser.add_argument('--print-bad-data', action='store_true')
    parser.add_argument('in_ssm_fn')
    parser.add_argument('in_params_fn')
    parser.add_argument('out_params_fn')
    args = parser.parse_args()

    np.set_printoptions(linewidth=400,
                        precision=3,
                        threshold=sys.maxsize,
                        suppress=True)
    np.seterr(divide='raise', invalid='raise', over='raise')

    ssms = inputparser.load_ssms(args.in_ssm_fn)
    params = inputparser.load_params(args.in_params_fn)
    ssms = inputparser.remove_garbage(ssms, params['garbage'])

    bad_vids, bad_samp_prop = _remove_bad(ssms, args.phi_hat_threshold,
                                          args.quantile, args.print_bad_data)
    bad_ssm_prop = len(bad_vids) / len(ssms)
    if len(bad_vids) > 0:
        params['garbage'] = common.sort_vids(params['garbage'] + bad_vids)
        with open(args.out_params_fn, 'w') as F:
            json.dump(params, F)

    stats = {
        'bad_ssms': common.sort_vids(bad_vids),
        'bad_samp_prop': '%.3f' % bad_samp_prop,
        'bad_ssm_prop': '%.3f' % bad_ssm_prop,
    }
    for K, V in stats.items():
        print('%s=%s' % (K, V))
Example #6
def convert(params_fn, calder_mats_fn, calder_trees_fn, neutree_fn):
    params = inputparser.load_params(params_fn)

    mats, row_labels, col_labels = _load_mats(calder_mats_fn)
    assert row_labels['Fhat'][0] == 'samples'
    svids = row_labels['Fhat'][1:]
    assert svids == common.sort_vids(svids)

    struct = _load_struct(svids, calder_trees_fn)
    ntree = neutree.Neutree(
        structs=[struct],
        phis=[mats['Fhat']],
        counts=np.array([1]),
        logscores=np.array([0.]),
        clusterings=[params['clusters']],
        garbage=params['garbage'],
    )
    neutree.save(ntree, neutree_fn)
Example #7
def make_varids_contiguous(variants, garbage, clusters):
    mapping = {}
    for new_idx, old_varid in enumerate(common.extract_vids(variants)):
        mapping[old_varid] = 's%s' % new_idx

    new_variants = {mapping[V]: variants[V] for V in variants.keys()}
    for V in new_variants.keys():
        new_variants[V]['id'] = V

    new_clusters = [
        common.sort_vids([mapping[V] for V in C]) for C in clusters
    ]

    assert set(new_variants.keys()) == \
      set([V for C in new_clusters for V in C]) == \
      set([V['id'] for V in new_variants.values()])
    assert not np.any(np.array([len(C) for C in new_clusters]) == 0)

    return (new_variants, new_clusters)
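
A hedged sketch of the renumbering, assuming common.extract_vids yields the variant IDs in sorted order:

variants = {'s4': {'id': 's4'}, 's7': {'id': 's7'}}
new_variants, new_clusters = make_varids_contiguous(variants, [], [['s7'], ['s4']])
# new_variants.keys() == {'s0', 's1'}   (s4 -> s0, s7 -> s1)
# new_clusters == [['s1'], ['s0']]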
Example #8
def write_snvs(variants, sampnames, garbage, snv_fn, normal_vaf=0.0):
    sampnames = ['Normal'] + sampnames
    snv_indices = {}

    with open(snv_fn, 'w') as F:
        print('#chr', 'position', 'description', *sampnames, sep='\t', file=F)
        vids = common.sort_vids(variants.keys())

        idx = 1
        for vid in vids:
            if vid in garbage:
                continue
            vaf = (variants[vid]['var_reads'] /
                   variants[vid]['total_reads']).tolist()
            vaf = [normal_vaf] + vaf
            print('1', idx, vid, *vaf, sep='\t', file=F)
            snv_indices[vid] = idx
            idx += 1

    return snv_indices
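
For reference, the file this writes is tab-separated: a header line, then one row per non-garbage variant, with the chromosome fixed to '1', the position set to the running index, and per-sample VAFs. Illustrative output (sample names and VAF values invented):

#chr  position  description  Normal  S1   S2
1     1         s0           0.0     0.5  0.25
1     2         s1           0.0     0.1  0.4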
Example #9
def write_clusters(variants,
                   clusters,
                   snv_indices,
                   cluster_fn,
                   normal_vaf=0.0):
    rows = []
    for cluster in clusters:
        cvars = [variants[V] for V in cluster]
        var_reads = np.sum(extract_mat(cvars, 'var_reads'), axis=0)
        total_reads = np.sum(extract_mat(cvars, 'total_reads'), axis=0)
        cvaf = (var_reads / total_reads).tolist()
        cvaf = [normal_vaf] + cvaf

        sampmask = '0' + (len(cvaf) - 1) * '1'
        snv_idxs = [str(snv_indices[V]) for V in common.sort_vids(cluster)]

        rows.append([sampmask] + cvaf + [','.join(snv_idxs)])

    with open(cluster_fn, 'w') as F:
        for row in rows:
            print(*row, sep='\t', file=F)
Example #10
def impute_garbage(mstats, garbage, _impute):
  if len(garbage) == 0:
    return mstats
  garbage = set(garbage)
  old_vids = set(mstats.vids)
  assert len(old_vids & garbage) == 0

  new_vids = common.sort_vids(old_vids | garbage)
  M_old, S = mstats.stats.shape
  M_new = len(new_vids)
  new_stats = np.full((M_new, S), np.nan)
  new_assays = mstats.assays
  idxmap = {vid: idx for idx, vid in enumerate(mstats.vids)}

  for idx, vid in enumerate(new_vids):
    if vid in old_vids:
      new_stats[idx] = mstats.stats[idxmap[vid]]
    else:
      new_stats[idx] = _impute(vid)

  assert not np.any(np.isnan(new_stats))
  return Mutstat(vids=new_vids, stats=new_stats, assays=new_assays)
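
A usage sketch: restore garbage variants as zero-filled rows (assumes Mutstat is the container shown above and that zeros are a sensible imputed value for the statistic in question):

S = mstats.stats.shape[1]
filled = impute_garbage(mstats, ['s5'], lambda vid: np.zeros(S))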
Example #11
def load_nongarb_vids(variants, garbage):
  vids = set(variants.keys())
  nongarb_vids = common.sort_vids(vids - set(garbage))
  return nongarb_vids
Example #12
def main():
    parser = argparse.ArgumentParser(
        description=
        'Find variants with likely incorrect var_read_prob by comparing model with provided var_read_prob to haploid (LOH) model using Bayes factors',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--logbf-threshold',
        type=float,
        default=10.,
        help=
        'Logarithm of Bayes factor threshold at which the haploid model is accepted as more likely model than the model using the provided var_read_prob'
    )
    parser.add_argument('--verbose',
                        action='store_true',
                        help='Print debugging messages')
    parser.add_argument(
        '--ignore-existing-garbage',
        action='store_true',
        help=
        'Ignore any existing garbage variants listed in in_params_fn and test all variants. If not specified, any existing garbage variants will be kept as garbage and not tested again.'
    )
    parser.add_argument('--action',
                        choices=('add_to_garbage', 'modify_var_read_prob'),
                        default='add_to_garbage')
    parser.add_argument('--var-read-prob-alt', type=float, default=1.)
    parser.add_argument('in_ssm_fn', help='Input SSM file with mutations')
    parser.add_argument(
        'in_params_fn',
        help=
        'Input params file listing sample names and any existing garbage mutations'
    )
    parser.add_argument(
        'out_ssm_fn',
        help='Output SSM file with modified list of garbage mutations')
    parser.add_argument(
        'out_params_fn',
        help='Output params file with modified list of garbage mutations')
    args = parser.parse_args()

    np.set_printoptions(linewidth=400,
                        precision=3,
                        threshold=sys.maxsize,
                        suppress=True)
    np.seterr(divide='raise', invalid='raise', over='raise')

    if args.ignore_existing_garbage:
        variants, params = inputparser.load_ssms_and_params(args.in_ssm_fn,
                                                            args.in_params_fn,
                                                            remove_garb=False)
        params['garbage'] = []
    else:
        variants, params = inputparser.load_ssms_and_params(
            args.in_ssm_fn, args.in_params_fn)

    bad_vids, bad_samp_prop = _remove_bad(variants, args.logbf_threshold,
                                          args.var_read_prob_alt, args.verbose)
    bad_ssm_prop = len(bad_vids) / len(variants)

    if args.action == 'add_to_garbage':
        params['garbage'] = common.sort_vids(
            set(bad_vids) | set(params['garbage']))
    elif args.action == 'modify_var_read_prob':
        for vid in bad_vids:
            variants[vid]['omega_v'][:] = args.var_read_prob_alt
    else:
        raise Exception('Unknown action: %s' % args.action)

    inputparser.write_ssms(variants, args.out_ssm_fn)
    with open(args.out_params_fn, 'w') as F:
        json.dump(params, F)

    stats = {
        'num_bad_ssms': len(bad_vids),
        'bad_ssms': common.sort_vids(bad_vids),
        'bad_samp_prop': '%.3f' % bad_samp_prop,
        'bad_ssm_prop': '%.3f' % bad_ssm_prop,
    }
    for K, V in stats.items():
        print('%s=%s' % (K, V))
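
For scale: with the default --logbf-threshold of 10, and assuming natural logarithms, the haploid (LOH) model must be roughly e^10 (about 22,000) times as likely as the model using the provided var_read_prob before a variant is flagged.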
Example #13
def main():
    all_plot_choices = set((
        'tree',
        'pairwise_separate',
        'pairwise_mle',
        'vaf_matrix',
        'phi',
        'phi_hat',
        'phi_interleaved',
        'cluster_stats',
        'eta',
        'diversity_indices',
    ))
    parser = argparse.ArgumentParser(
        description='LOL HI THERE',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--seed', type=int)
    parser.add_argument('--tree-index', type=int, default=0)
    parser.add_argument('--plot',
                        dest='plot_choices',
                        type=lambda s: set(s.split(',')),
                        help='Things to plot; by default, plot everything')
    parser.add_argument('--omit-plots',
                        dest='omit_plots',
                        type=lambda s: set(s.split(',')),
                        help='Things to omit from plotting; overrides --plot')
    parser.add_argument('--runid')
    parser.add_argument(
        '--reorder-subclones',
        action='store_true',
        help=
        'Reorder subclones according to depth-first search through tree structure'
    )
    parser.add_argument(
        '--tree-json',
        dest='tree_json_fn',
        help=
        'Additional external file in which to store JSON, which is already stored statically in the HTML file'
    )
    parser.add_argument('--phi-orientation',
                        choices=('samples_as_rows', 'populations_as_rows'),
                        default='populations_as_rows')
    parser.add_argument(
        '--remove-normal',
        action='store_true',
        help=
        'Remove normal (non-cancerous) population 0 from tree, phi, and eta plots.'
    )
    parser.add_argument('ssm_fn')
    parser.add_argument('params_fn')
    parser.add_argument('results_fn')
    parser.add_argument('discord_fn')
    parser.add_argument('html_out_fn')
    args = parser.parse_args()

    np.seterr(divide='raise', invalid='raise', over='raise')

    if args.seed is not None:
        random.seed(args.seed)
        np.random.seed(args.seed)

    plot_choices = _choose_plots(args.plot_choices, args.omit_plots,
                                 all_plot_choices)

    results = resultserializer.Results(args.results_fn)
    variants = inputparser.load_ssms(args.ssm_fn)
    params = inputparser.load_params(args.params_fn)
    discord = _parse_discord(args.discord_fn)

    data = {
        K: results.get(K)[args.tree_index]
        for K in (
            'struct',
            'count',
            'llh',
            'prob',
            'phi',
        )
    }
    data['garbage'] = results.get('garbage')
    data['clusters'] = results.get('clusters')
    data['samples'] = params['samples']
    data['clustrel_posterior'] = results.get_mutrel('clustrel_posterior')
    if args.reorder_subclones:
        data, params = _reorder_subclones(data, params)

    if 'hidden_samples' in params:
        hidden = set(params['hidden_samples'])
        assert hidden.issubset(set(
            data['samples'])) and len(hidden) < len(data['samples'])
        visible_sampidxs = [
            idx for idx, samp in enumerate(data['samples'])
            if samp not in hidden
        ]
    else:
        visible_sampidxs = None

    samp_colours = params.get('samp_colours', None)
    pop_colours = params.get('pop_colours', None)
    if samp_colours is not None:
        assert set([S[0] for S in samp_colours]).issubset(data['samples'])
    if pop_colours is not None:
        assert len(pop_colours) == len(data['struct']) + 1

    supervars = clustermaker.make_cluster_supervars(data['clusters'], variants)
    supervars = [supervars[vid] for vid in common.sort_vids(supervars.keys())]

    with open(args.html_out_fn, 'w') as outf:
        write_header(args.runid, args.tree_index, outf)

        if 'tree' in plot_choices:
            tree_struct = util.make_tree_struct(
                data['struct'],
                data['count'],
                data['llh'],
                data['prob'],
                data['phi'],
                supervars,
                data['clusters'],
                data['samples'],
            )
            tree_struct['discord'] = discord

            _write_tree_html(
                tree_struct,
                args.tree_index,
                visible_sampidxs,
                samp_colours,
                pop_colours,
                'eta' in plot_choices,
                'diversity_indices' in plot_choices,
                'phi' in plot_choices,
                'phi_hat' in plot_choices,
                'phi_interleaved' in plot_choices,
                args.phi_orientation,
                args.remove_normal,
                outf,
            )
            if args.tree_json_fn is not None:
                _write_tree_json(tree_struct, args.tree_json_fn)

        if 'vaf_matrix' in plot_choices:
            vaf_plotter.plot_vaf_matrix(
                data['clusters'],
                variants,
                supervars,
                data['garbage'],
                data['phi'],
                data['samples'],
                should_correct_vaf=True,
                outf=outf,
            )

        if 'pairwise_mle' in plot_choices:
            relation_plotter.plot_ml_relations(data['clustrel_posterior'],
                                               outf)
        if 'pairwise_separate' in plot_choices:
            relation_plotter.plot_separate_relations(
                data['clustrel_posterior'], outf)
        if 'cluster_stats' in plot_choices:
            write_cluster_stats(data['clusters'], data['garbage'], supervars,
                                variants, outf)

        write_footer(outf)
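
_choose_plots is defined elsewhere; a hedged sketch of its apparent contract, inferred from the help texts (plot everything by default, let --omit-plots override --plot):

def _choose_plots(to_plot, to_omit, all_choices):
    plot_choices = set(to_plot) if to_plot is not None else set(all_choices)
    if to_omit is not None:
        plot_choices -= to_omit
    return plot_choices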
Example #14
def write(mstat, mutstatfn):
  # calc_mutstat should have created `mstat` with sorted vids, but double-check
  # that this is true.
  assert list(mstat.vids) == common.sort_vids(mstat.vids)
  np.savez_compressed(mutstatfn, stats=mstat.stats, vids=mstat.vids, assays=mstat.assays)
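
A hedged counterpart for loading the archive back (assumes a Mutstat container like the one in Example #10; note that np.savez_compressed appends '.npz' when the filename lacks that suffix):

def read(mutstatfn):
  F = np.load(mutstatfn)
  return Mutstat(vids=F['vids'].tolist(), stats=F['stats'], assays=F['assays'].tolist())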