Ejemplo n.º 1
0
    def kmeans_cluster_v_seqs(self, qr_seqs, swfo, plotdir=None, debug=False):
        if plotdir is not None:
            utils.prep_dir(plotdir, wildlings=['*.svg'], subdirs=[d for d in os.listdir(plotdir) if os.path.isdir(plotdir + '/' + d)], rm_subdirs=True)

        clusterfos = []
        if debug:
            print 'kmeans clustering'
            print '  seqs    family'
        for family, seqfos in self.get_family_groups(qr_seqs, swfo).items():
            if debug:
                print '  %5d     %s' % (len(seqfos), family)
            partition = mds.run_bios2mds(self.n_mds_components, self.XXX_n_kmeans_clusters, seqfos, self.args.workdir + '/mds', self.args.seed, reco_info=self.reco_info, region=self.region, plotdir=plotdir + '/' + family if plotdir is not None else None)
            # partition = mds.run_sklearn_mds(self.n_mds_components, self.XXX_n_kmeans_clusters, seqfos, self.args.seed, reco_info=self.reco_info, region=self.region, plotdir=plotdir + '/' + family if plotdir is not None else None)
            clusterfos += self.get_clusterfos_from_partition(partition, qr_seqs)

        clusterfos = sorted(clusterfos, key=lambda c: len(c['seqfos']), reverse=True)
        return clusterfos
Ejemplo n.º 2
0
    def make_mds_plots(self,
                       sorted_clusters,
                       annotations,
                       base_plotdir,
                       max_cluster_size=10000,
                       reco_info=None,
                       color_rule=None,
                       run_in_parallel=False,
                       debug=False):
        debug = True

        # ----------------------------------------------------------------------------------------
        def get_fname(ic):
            return 'icluster-%d' % ic

        # ----------------------------------------------------------------------------------------
        def get_cluster_info(full_cluster):
            full_info = annotations[':'.join(full_cluster)]
            title = '%s   (size: %d)' % (self.get_cdr3_title(full_info),
                                         len(full_cluster))

            all_seqs = set()
            kept_indices = []
            for iseq in range(len(full_cluster)):
                if full_info['seqs'][
                        iseq] in all_seqs:  # duplicates are from shm indels (I think I did this on purpose in sw)
                    continue
                if full_info['n_mutations'][
                        iseq] == 0:  # remove unmutated sequences since a) they'll crash mds after we add the naive seq below and b) they'd show up in the same spot anyway (note that the only way there can be more than one is if there's Ns either within the sequences or on either end)
                    continue
                kept_indices.append(iseq)
                all_seqs.add(full_info['seqs'][iseq])

            if len(kept_indices) > max_cluster_size:
                uids_to_choose_from = set([
                    full_cluster[i] for i in kept_indices
                ])  # note similarity to code in seqfileopener.post_process()
                if self.args.queries_to_include is not None:
                    uids_to_choose_from -= set(self.args.queries_to_include)
                n_to_remove = len(kept_indices) - max_cluster_size
                if n_to_remove >= len(
                        uids_to_choose_from
                ):  # i.e. if we'd have to start removing queries that are in <queries_to_include>
                    removed_uids = uids_to_choose_from
                else:
                    removed_uids = numpy.random.choice(
                        list(uids_to_choose_from), n_to_remove, replace=False
                    )  # i think this'll still crash if len(uids_to_choose_from) is zero, but, meh
                kept_indices = sorted(
                    set(kept_indices) -
                    set([full_cluster.index(uid) for uid in removed_uids]))
                title += ' (subset: %d / %d)' % (len(kept_indices),
                                                 len(full_cluster))

            seqfos = [{
                'name': full_info['unique_ids'][iseq],
                'seq': full_info['seqs'][iseq]
            } for iseq in kept_indices]
            color_scale_vals = {
                full_cluster[iseq]: full_info['n_mutations'][iseq]
                for iseq in kept_indices
            }

            seqfos.append(
                {
                    'name': '_naive',
                    'seq': full_info['naive_seq']
                }
            )  # note that if any naive sequences that were removed above are in self.args.queries_to_include, they won't be labeled in the plot (but, screw it, who's going to ask to specifically label a sequence that's already specifically labeled?)
            color_scale_vals[
                '_naive'] = 0  # leading underscore is 'cause the mds will crash if there's another sequence with the same name, and e.g. christian's simulation spits out the naive sequence with name 'naive'. No, this is not a good long term fix
            queries_to_include = ['_naive']
            if self.args.queries_to_include is not None:
                queries_to_include += self.args.queries_to_include

            return seqfos, color_scale_vals, queries_to_include, title

        # ----------------------------------------------------------------------------------------
        def get_labels_for_coloring(full_cluster, color_rule):
            full_info = annotations[':'.join(full_cluster)]
            if color_rule == 'nearest-target':  # color by the index of the nearest cluster index (bcr-phylo simulation only)
                if 'target_seqs' not in reco_info[full_cluster[0]]:
                    return
                labels = {
                    uid: str(reco_info[uid]['nearest_target_indices'][0])
                    for uid in full_cluster
                }
                labels['_naive'] = 'foop'
            elif color_rule == 'wtf':
                labels = {uid: uid.split('@')[1] for uid in full_cluster}
                labels['_naive'] = 'foop'
            else:
                assert False

            return labels

        # ----------------------------------------------------------------------------------------
        def prep_cmdfo(iclust, seqfos, queries_to_include, color_scale_vals,
                       title):
            subworkdir = '%s/mds-%d' % (self.args.workdir, iclust)
            utils.prep_dir(subworkdir)
            tmpfname = '%s/seqs.fa' % subworkdir
            with open(tmpfname, 'w') as tmpfile:
                for sfo in seqfos:
                    csval = None
                    if sfo['name'] in color_scale_vals:
                        csval = color_scale_vals[sfo['name']]
                    tmpfile.write(
                        '>%s%s\n%s\n' %
                        (sfo['name'],
                         (' %d' % csval) if csval is not None else '',
                         sfo['seq']))
            cmdstr = './bin/mds-run.py %s --aligned --plotdir %s --plotname %s --workdir %s --seed %d' % (
                tmpfname, plotdir, get_fname(iclust), subworkdir,
                self.args.seed)
            if queries_to_include is not None:
                cmdstr += ' --queries-to-include %s' % ':'.join(
                    queries_to_include)
            if title is not None:
                cmdstr += ' --title=%s' % title.replace(' ', '@')
            return {
                'cmd_str': cmdstr,
                'workdir': subworkdir,
                'outfname': '%s/%s.svg' % (plotdir, get_fname(iclust)),
                'workfnames': [tmpfname]
            }

        # ----------------------------------------------------------------------------------------
        subd, plotdir = self.init_subd('mds', base_plotdir)

        start = time.time()
        if debug:
            if not run_in_parallel:
                print '    making mds plots starting with %d clusters' % len(
                    sorted_clusters)
                print '       size (+naive)   mds    plot   total'
        skipped_cluster_lengths = []
        fnames = [[]]
        cmdfos = []
        for iclust in range(len(sorted_clusters)):
            if not self.plot_this_cluster(sorted_clusters, iclust):
                skipped_cluster_lengths.append(len(sorted_clusters[iclust]))
                continue

            seqfos, color_scale_vals, queries_to_include, title = get_cluster_info(
                sorted_clusters[iclust])

            labels = None
            if color_rule is not None:
                labels = get_labels_for_coloring(sorted_clusters[iclust],
                                                 color_rule)
                # print '   %s setting color_scale_vals to None so we can use colors for nearest target seq index' % utils.color('red', 'note')
                color_scale_vals = None  # not sure this is really the best way to do this

            if debug and not run_in_parallel:
                substart = time.time()
                subset_str = '' if len(
                    sorted_clusters[iclust]
                ) <= max_cluster_size else utils.color(
                    'red',
                    '/%d' % len(sorted_clusters[iclust]),
                    width=6,
                    padside='right')  # -1 is for the added naive seq
                tmpfo = annotations[':'.join(sorted_clusters[iclust])]
                # n_naive_in_cluster = len([iseq for iseq in range(len(sorted_clusters[iclust])) if tmpfo['n_mutations'][iseq] == 0])  # work out if there was a sequence already in the cluster that was the same as the naive sequence
                # print '      %4d%6s' % (len(seqfos) - 1 + n_naive_in_cluster, subset_str),
                print '      %4d%6s' % (len(seqfos), subset_str),

            if run_in_parallel:
                assert labels is None  # would need to implement this (or just switch to non-parallel version if you need to run with labels set)
                cmdfos.append(
                    prep_cmdfo(iclust, seqfos, queries_to_include,
                               color_scale_vals, title))
            else:
                mds.run_bios2mds(self.n_mds_components,
                                 None,
                                 seqfos,
                                 self.args.workdir,
                                 self.args.seed,
                                 aligned=True,
                                 plotdir=plotdir,
                                 plotname=get_fname(iclust),
                                 queries_to_include=queries_to_include,
                                 color_scale_vals=color_scale_vals,
                                 labels=labels,
                                 title=title)
                if debug:
                    print '  %5.1f' % (time.time() - substart)
            self.addfname(fnames, '%s' % get_fname(iclust))

        if run_in_parallel:
            utils.run_cmds(cmdfos, clean_on_success=True)  #, debug='print')

        if debug and len(skipped_cluster_lengths) > 0:
            print '    skipped %d clusters with lengths: %s (+%d singletons)' % (
                len(skipped_cluster_lengths), ' '.join([
                    '%d' % l for l in skipped_cluster_lengths if l > 1
                ]), skipped_cluster_lengths.count(1))

        if not self.args.only_csv_plots:
            self.plotting.make_html(plotdir, fnames=fnames)

        print '    made %d mds plots (%.1fs)' % (sum(
            len(x) for x in fnames), time.time() - start)

        return [[subd + '/' + fn for fn in fnames[0]]]
Ejemplo n.º 3
0
# default work dir gets a random numeric suffix, chosen once when the arguments are set up
parser.add_argument('--workdir', default='/tmp/dralph/mds/%d' % random.randint(0, 999999))
parser.add_argument('--seed', default=1, type=int)
parser.add_argument('--aligned', action='store_true')
args = parser.parse_args()
args.queries_to_include = utils.get_arg_list(args.queries_to_include)
if args.title is not None:
    # every '@' in the title is turned back into a space (this is kind of hackey)
    args.title = ' '.join(args.title.split('@'))

seqfos = utils.read_fastx(args.infname)

# a sequence with exactly two info strings has its color scale value (an integer) in the second one
color_scale_vals = {}
for sfo in seqfos:
    if len(sfo['infostrs']) != 2:
        continue
    color_scale_vals[sfo['name']] = int(sfo['infostrs'][1])
if not color_scale_vals:  # no sequence had a value, so don't pass any
    color_scale_vals = None

# mds.run_sklearn_mds(args.n_components, args.n_clusters, seqfos, args.seed, plotdir=args.plotdir)
mds.run_bios2mds(
    args.n_components,
    args.n_clusters,
    seqfos,
    args.workdir,
    args.seed,
    aligned=args.aligned,
    plotdir=args.plotdir,
    plotname=args.plotname,
    queries_to_include=args.queries_to_include,
    color_scale_vals=color_scale_vals,
    title=args.title,
)