Example #1
0
    def make_cluster_size_distribution(self,
                                       base_plotdir,
                                       partition=None,
                                       infiles=None):
        subd, plotdir = self.init_subd('sizes', base_plotdir)

        if partition is not None:  # one partition
            csize_hists = {
                'best': self.plotting.get_cluster_size_hist(partition)
            }
        elif infiles is not None:  # plot the mean of a partition from each file
            subset_hists = []
            for fname in infiles:
                cp = ClusterPath()
                cp.readfile(fname)
                subset_hists.append(
                    self.plotting.get_cluster_size_hist(
                        cp.partitions[cp.i_best]))
            csize_hists = {'best': self.plotting.make_mean_hist(subset_hists)}
            for ih in range(len(subset_hists)):
                subset_hists[ih].write(plotdir +
                                       ('/subset-%d-cluster-sizes.csv' % ih))
        else:
            assert False

        self.plotting.plot_cluster_size_hists(plotdir + '/cluster-sizes.svg',
                                              csize_hists,
                                              title='',
                                              log='x')
        return [[subd + '/cluster-sizes.svg']]
Example #2
0
    def read_partition_performance(self, version_stype, input_stype, debug=False):
        """ Read new partitions from self.dirs['new'], and put the comparison numbers in self.perf_info (compare either to true, for simulation, or to the partition in reference dir, for data). """
        def do_this_test(pt):
            if 'partition' not in pt:
                return False
            if input_stype not in pt:
                return False
            if args.quick and pt not in self.quick_tests:
                return False
            return True

        ptest_list = [k for k in self.tests.keys() if do_this_test(k)]
        if len(ptest_list) == 0:
            return
        if debug:
            print '  version %s input %s partitioning' % (version_stype, input_stype)
            print '  precision      sensitivity        test                    description'
        for ptest in ptest_list:
            cp = ClusterPath(-1)
            cp.readfile(self.dirs[version_stype] + '/' + ptest + '.csv')
            ccfs = cp.ccfs[cp.i_best]
            if None in ccfs:
                raise Exception('none type ccf read from %s' % (self.dirs[version_stype] + '/' + ptest + '.csv'))
            self.perf_info[version_stype][ptest + '-precision'], self.perf_info[version_stype][ptest + '-sensitivity'] = ccfs
            if debug:
                print '    %5.2f          %5.2f      %-28s   to true partition' % (self.perf_info[version_stype][ptest + '-precision'], self.perf_info[version_stype][ptest + '-sensitivity'], ptest)
Example #3
0
    def read_partition_performance(self, version_stype, input_stype, debug=False):
        """ Read new partitions from self.dirs['new'], and put the comparison numbers in self.perf_info (compare either to true, for simulation, or to the partition in reference dir, for data). """
        def do_this_test(pt):
            if 'partition' not in pt:
                return False
            if input_stype not in pt:
                return False
            if args.quick and pt not in self.quick_tests:
                return False
            return True

        ptest_list = [k for k in self.tests.keys() if do_this_test(k)]
        if len(ptest_list) == 0:
            return
        if debug:
            print '  version %s input %s partitioning' % (version_stype, input_stype)
            print '  purity         completeness        test                    description'
        for ptest in ptest_list:
            cp = ClusterPath(-1)
            cp.readfile(self.dirs[version_stype] + '/' + ptest + '.csv')
            ccfs = cp.ccfs[cp.i_best]
            if None in ccfs:
                raise Exception('none type ccf read from %s' % (self.dirs[version_stype] + '/' + ptest + '.csv'))
            self.perf_info[version_stype][ptest + '-purity'], self.perf_info[version_stype][ptest + '-completeness'] = ccfs
            if debug:
                print '    %5.2f          %5.2f      %-28s   to true partition' % (self.perf_info[version_stype][ptest + '-purity'], self.perf_info[version_stype][ptest + '-completeness'], ptest)
Example #4
0
    def plot(self, plotdir, partition=None, infiles=None, annotations=None, only_csv=None):
        print '  plotting partitions'
        sys.stdout.flush()
        start = time.time()
        for subdir in self.subplotdirs:
            utils.prep_dir(plotdir + '/' + subdir, wildlings=['*.csv', '*.svg'])

        if partition is not None:  # one partition
            assert infiles is None
            assert annotations is not None
            csize_hists = {'best' : plotting.get_cluster_size_hist(partition)}
            self.plot_within_vs_between_hists(partition, annotations, plotdir)
        elif infiles is not None:  # plot the mean of a partition from each file
            subset_hists = []
            for fname in infiles:
                cp = ClusterPath()
                cp.readfile(fname)
                subset_hists.append(plotting.get_cluster_size_hist(cp.partitions[cp.i_best]))
            csize_hists = {'best' : plotting.make_mean_hist(subset_hists)}
            for ih in range(len(subset_hists)):
                subset_hists[ih].write(plotdir + ('/subset-%d-cluster-sizes.csv' % ih))
        else:
            assert False

        plotting.plot_cluster_size_hists(plotdir + '/overall/cluster-sizes.svg', csize_hists, title='', log='x')

        if not only_csv:
            for subdir in self.subplotdirs:
                plotting.make_html(plotdir + '/' + subdir)

        print '(%.1f sec)' % (time.time()-start)
Example #5
0
    def plot(self,
             plotdir,
             partition=None,
             infiles=None,
             annotations=None,
             only_csv=None):
        import plotting
        print '  plotting partitions'
        sys.stdout.flush()
        start = time.time()
        for subdir in self.subplotdirs:
            utils.prep_dir(plotdir + '/' + subdir,
                           wildlings=['*.csv', '*.svg'])

        fnames = []

        if partition is not None:  # one partition
            assert infiles is None
            assert annotations is not None
            csize_hists = {'best': plotting.get_cluster_size_hist(partition)}
            # self.plot_within_vs_between_hists(partition, annotations, plotdir)
            fnames += self.plot_size_vs_shm(partition, annotations, plotdir)
        elif infiles is not None:  # plot the mean of a partition from each file
            subset_hists = []
            for fname in infiles:
                cp = ClusterPath()
                cp.readfile(fname)
                subset_hists.append(
                    plotting.get_cluster_size_hist(cp.partitions[cp.i_best]))
            csize_hists = {'best': plotting.make_mean_hist(subset_hists)}
            for ih in range(len(subset_hists)):
                subset_hists[ih].write(plotdir +
                                       ('/subset-%d-cluster-sizes.csv' % ih))
        else:
            assert False

        plotting.plot_cluster_size_hists(plotdir +
                                         '/overall/cluster-sizes.svg',
                                         csize_hists,
                                         title='',
                                         log='x')
        fnames.append(['cluster-sizes.svg'])

        if not only_csv:
            for subdir in self.subplotdirs:
                plotting.make_html(plotdir + '/' + subdir,
                                   fnames=fnames,
                                   new_table_each_row=True)

        print '(%.1f sec)' % (time.time() - start)
Example #6
0
    def read_file_info(self, infname, n_paths, calc_adj_mi):
        paths = [None for _ in range(n_paths)]
        with opener('r')(infname) as csvfile:
            reader = csv.DictReader(csvfile)
            for line in reader:
                if line['partition'] == '':
                    raise Exception('ERROR null partition (one of the processes probably got passed zero sequences)')  # shouldn't happen any more FLW
                uids = []
                for cluster in line['partition'].split(';'):
                    uids.append([unique_id for unique_id in cluster.split(':')])
                path_index = int(line['path_index'])
                if paths[path_index] is None:
                    paths[path_index] = ClusterPath(int(line['initial_path_index']))
                else:
                    assert paths[path_index].initial_path_index == int(line['initial_path_index'])
                n_procs = int(line['n_procs']) if 'n_procs' in line else 1
                logweight = float(line['logweight']) if 'logweight' in line else None
                adj_mi = -1
                if calc_adj_mi:
                    adj_mi = utils.mutual_information(uids, self.reco_info, debug=False) if self.reco_info is not None else -1
                paths[path_index].add_partition(uids, float(line['logprob']), n_procs=n_procs, logweight=logweight, adj_mi=adj_mi)

        for cp in paths:
            if cp is None:
                raise Exception('None type path read from %s' % infname)
            for ptn in cp.partitions:
                if len(ptn) == 0:
                    raise Exception('zero length partition read from %s' % infname)

        return paths
Example #7
0
    def make_cluster_size_distribution(self,
                                       base_plotdir,
                                       partition=None,
                                       infiles=None):
        subd, plotdir = self.init_subd('sizes', base_plotdir)

        if partition is not None:  # one partition
            csize_hists = {
                'best': self.plotting.get_cluster_size_hist(partition)
            }
        elif infiles is not None:  # plot the mean of a partition from each file
            subset_hists = []
            for fname in infiles:
                cp = ClusterPath(fname=fname)
                subset_hists.append(
                    self.plotting.get_cluster_size_hist(
                        cp.partitions[cp.i_best]))
            csize_hists = {'best': self.plotting.make_mean_hist(subset_hists)}
            for ih in range(len(subset_hists)):
                subset_hists[ih].write(plotdir +
                                       ('/subset-%d-cluster-sizes.csv' % ih))
        else:
            assert False

        fname = 'cluster-sizes'
        if infiles is not None:
            print '%s should probably rewrite this to integrate with the below' % utils.color(
                'red', 'note')
            self.plotting.plot_cluster_size_hists(plotdir + '/' + fname +
                                                  '.svg',
                                                  csize_hists,
                                                  title='',
                                                  log='x')
        else:
            fig, ax = self.plotting.mpl_init()
            for label, hist in csize_hists.items():
                hist.mpl_plot(ax,
                              remove_empty_bins=True,
                              label=label if len(csize_hists) > 1 else None)
            csizes = sorted([len(c) for c in partition])
            xticks = [
                x for x in numpy.logspace(
                    math.log(csizes[0], 10), math.log(csizes[-1], 10), num=5)
            ]

            def tstr(xt):
                return ('%.0f' % xt) if xt < 500 else '%.0e' % xt

            self.plotting.mpl_finish(ax,
                                     plotdir,
                                     fname,
                                     xlabel='cluster size',
                                     ylabel='number of clusters',
                                     log='xy',
                                     xticks=xticks,
                                     xticklabels=[tstr(x) for x in xticks])

        return [[subd + '/cluster-sizes.svg']]
Example #8
0
 def read_partition_performance(self, version_stype, input_stype, debug=False):
     """ Read new partitions from self.dirs['new'], and put the comparison numbers in self.perf_info (compare either to true, for simulation, or to the partition in reference dir, for data). """
     ptest = "partition-" + input_stype + "-simu"
     if args.quick and ptest not in self.quick_tests:
         return
     if debug:
         print "  version %s input %s partitioning" % (version_stype, input_stype)
         print "    adj mi   ccf under/over        test                    description"
     for ptest in [k for k in self.tests.keys() if "partition" in k and input_stype in k]:
         if args.quick and ptest not in self.quick_tests:
             continue
         cp = ClusterPath(-1)
         cp.readfile(self.dirs[version_stype] + "/" + ptest + ".csv")
         if "data" in ptest:
             raise Exception("needs fixing")
             ref_cp = ClusterPath(-1)
             ref_cp.readfile(self.dirs["xxxref"] + "/" + ptest + ".csv")
             self.perf_info["xxx"][ptest] = utils.adjusted_mutual_information(
                 cp.partitions[cp.i_best], ref_cp.partitions[ref_cp.i_best]
             )  # adj mi between the reference and the new data partitions
             if debug:
                 print "    %5.2f   %-28s   to reference partition" % (self.perf_info["xxx"][ptest], ptest)
         else:
             self.perf_info[version_stype][ptest + "-adj_mi"] = cp.adj_mis[cp.i_best]  # adj mi to true partition
             self.perf_info[version_stype][ptest + "-ccf_under"], self.perf_info[version_stype][
                 ptest + "-ccf_over"
             ] = cp.ccfs[cp.i_best]
             if debug:
                 print "    %5.2f    %5.2f %5.2f      %-28s   to true partition" % (
                     self.perf_info[version_stype][ptest + "-adj_mi"],
                     self.perf_info[version_stype][ptest + "-ccf_under"],
                     self.perf_info[version_stype][ptest + "-ccf_over"],
                     ptest,
                 )
Example #9
0
 def read_partition_performance(self, version_stype, input_stype, debug=False):
     """ Read new partitions from self.dirs['new'], and put the comparison numbers in self.perf_info (compare either to true, for simulation, or to the partition in reference dir, for data). """
     ptest = 'partition-' + input_stype + '-simu'
     if args.quick and ptest not in self.quick_tests:
         return
     if debug:
         print '  version %s input %s partitioning' % (version_stype, input_stype)
         print '  precision      sensitivity        test                    description'
     for ptest in [k for k in self.tests.keys() if 'partition' in k and input_stype in k]:
         if args.quick and ptest not in self.quick_tests:
             continue
         cp = ClusterPath(-1)
         cp.readfile(self.dirs[version_stype] + '/' + ptest + '.csv')
         if 'data' in ptest:
             raise Exception('needs fixing')
             # ref_cp = ClusterPath(-1)
             # ref_cp.readfile(self.dirs['xxxref'] + '/' + ptest + '.csv')
             # self.perf_info['xxx'][ptest] = utils.adjusted_mutual_information(cp.partitions[cp.i_best], ref_cp.partitions[ref_cp.i_best])  # adj mi between the reference and the new data partitions
             # if debug:
             #     print '    %5.2f   %-28s   to reference partition' % (self.perf_info['xxx'][ptest], ptest)
         else:
             self.perf_info[version_stype][ptest + '-precision'], self.perf_info[version_stype][ptest + '-sensitivity'] = cp.ccfs[cp.i_best]
             if debug:
                 print '    %5.2f          %5.2f      %-28s   to true partition' % (self.perf_info[version_stype][ptest + '-precision'], self.perf_info[version_stype][ptest + '-sensitivity'], ptest)
Example #10
0
    def read_file_info(self, infname, n_paths):
        paths = [None for _ in range(n_paths)]
        lines_list = [[] for _ in range(n_paths)]
        with open(infname, 'r') as csvfile:
            reader = csv.DictReader(csvfile)
            for line in reader:
                if line['partition'] == '':
                    print '    %s null partition (one of the processes probably got passed zero sequences)' % utils.color(
                        'red', 'warning')
                    return paths
                path_index = int(
                    line['path_index']) if 'path_index' in line else 0
                initial_path_index = int(
                    line['initial_path_index']
                ) if 'initial_path_index' in line else 0
                if paths[path_index] is None:  # is this the first line for this path?
                    paths[path_index] = ClusterPath(
                        initial_path_index, seed_unique_id=self.seed_unique_id
                    )  # NOTE I may have screwed up the initial_path_index/path_index distinction here... it's been too long since I wrote the smc stuff and I'm not sure
                else:
                    assert paths[
                        path_index].initial_path_index == initial_path_index
                lines_list[path_index].append(line)

        if paths.count(None) > 0:
            raise Exception(
                'couldn\'t find the required number of paths in file %s' %
                infname)

        for path_index in range(n_paths):
            paths[path_index].readlines(lines_list[path_index],
                                        process_csv=True)

        for cp in paths:
            if cp is None:
                raise Exception('None type path read from %s' % infname)
            for ptn in cp.partitions:
                if len(ptn) == 0:
                    raise Exception('zero length partition read from %s' %
                                    infname)

        return paths
Example #11
0
#!/usr/bin/env python
import csv
import sys

partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')
import utils
import glutils
from clusterpath import ClusterPath

# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus='igh')

print 'first parse an annotation csv file:'
with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        if line['v_gene'] == '':  # failed (i.e. couldn't find an annotation)
            continue
        utils.process_input_line(line)
        utils.add_implicit_info(glfo, line)
        utils.print_reco_event(line)
        break

print 'then parse a partition csv file:'
cp = ClusterPath()
cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv')
cp.print_partitions(abbreviate=True)
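
# a minimal follow-on sketch (not part of the original example): the best partition is just
# a list of clusters, and each cluster is a list of sequence ids, so summary numbers fall
# straight out of it
best_partition = cp.partitions[cp.i_best]
print 'best partition: %d clusters, %d total sequences' % (len(best_partition), sum(len(c) for c in best_partition))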
Example #12
0
    def merge_fileinfos(self,
                        fileinfos,
                        smc_particles,
                        previous_info=None,
                        debug=False):
        self.paths = [
            ClusterPath(None, seed_unique_id=self.seed_unique_id)
            for _ in range(smc_particles)
        ]  # each path's initial_path_index is None since we're merging paths that, in general, have different initial path indices

        # DEAR FUTURE SELF this won't make any sense until you find that picture you took of the white board
        if previous_info is not None and smc_particles > 1:  # if we're doing smc, this has to happen *beforehand*, since the previous paths are separate for each process (cont'd at XX)
            assert len(previous_info) == len(
                fileinfos
            )  # both are the number of processes we're merging into one
            # it would be nice to prevent this from adding duplicate adjacent partitions (well... not that important)
            if debug:
                print 'prepend previous history'
            for ifile in range(len(fileinfos)):
                if debug:
                    print 'ifile', ifile
                for ipath in range(smc_particles):
                    if debug:
                        print '  ipath', ipath
                        print '    before'
                        fileinfos[ifile][ipath].print_partitions(
                            self.reco_info)
                    initial_path_index = fileinfos[ifile][
                        ipath].initial_path_index  # which previous path are we hooking up to?
                    previous_path = previous_info[ifile][initial_path_index]
                    current_path = fileinfos[ifile][ipath]
                    # first_new_logprob = current_path.logprobs[0]
                    extended_path = ClusterPath(
                        None, seed_unique_id=self.seed_unique_id)
                    for ip in range(len(previous_path.partitions)):
                        # if previous_path.logprobs[ip] >= first_new_logprob:  # skip the merges past which we rewound
                        #     continue
                        extended_path.add_partition(
                            list(previous_path.partitions[ip]),
                            previous_path.logprobs[ip],
                            previous_path.n_procs[ip],
                            logweight=previous_path.logweights[ip])
                    for ip in range(len(current_path.partitions)):
                        extended_path.add_partition(
                            list(current_path.partitions[ip]),
                            current_path.logprobs[ip],
                            current_path.n_procs[ip],
                            logweight=current_path.logweights[ip])
                    fileinfos[ifile][ipath] = extended_path
                    fileinfos[ifile][ipath].set_synthetic_logweight_history(
                        self.reco_info
                    )  # need to multiply the combinatorial factors in the later partitions by the factors from the earlier partitions
                    if debug:
                        print '    after'
                        fileinfos[ifile][ipath].print_partitions(
                            self.reco_info)

        # do the actual process-merging
        for ipath in range(smc_particles):

            if debug and len(fileinfos) > 1:
                print 'merge path %d from %d processes:' % (ipath,
                                                            len(fileinfos))
                for ifile in range(len(fileinfos)):
                    fileinfos[ifile][ipath].print_partitions(
                        self.reco_info, extrastr=('%d' % (ifile)))
                    print ''

            # merge all the steps in each path
            def last_one():
                last = True
                for ifile in range(
                        len(fileinfos)
                ):  # we're finished when all the files are out of glomeration steps (i.e. they all only have one [the last] line left)
                    last &= len(fileinfos[ifile][ipath].partitions) == 1
                return last

            def remove_one_of_the_first_partitions():
                maxdelta, ibestfile = None, None
                for ifile in range(len(fileinfos)):
                    if len(
                            fileinfos[ifile][ipath].partitions
                    ) == 1:  # if this is the last line (i.e. there aren't any more glomeration steps in this file), leave it alone
                        continue
                    thisdelta = fileinfos[ifile][ipath].logprobs[
                        1] - fileinfos[ifile][ipath].logprobs[
                            0]  # logprob difference between the next partition and this one
                    if maxdelta is None or thisdelta > maxdelta:
                        maxdelta = thisdelta
                        ibestfile = ifile
                # print '    ibest %d with %f - %f = %f' % (ibestfile, fileinfos[ibestfile][ipath].logprobs[1], fileinfos[ibestfile][ipath].logprobs[0], fileinfos[ibestfile][ipath].logprobs[1] - fileinfos[ibestfile][ipath].logprobs[0])
                fileinfos[ibestfile][ipath].remove_partition(0)

            def add_next_global_partition():
                global_partition = []
                global_logprob = 0.
                for ifile in range(
                        len(fileinfos)
                ):  # combine the first line in each file to make a global partition
                    for cluster in fileinfos[ifile][ipath].partitions[0]:
                        global_partition.append(list(cluster))
                    global_logprob += fileinfos[ifile][ipath].logprobs[0]
                self.paths[ipath].add_partition(
                    global_partition,
                    global_logprob,
                    n_procs=len(fileinfos),
                    logweight=0.
                )  # don't know the logweight yet (or maybe at all!)

            while not last_one():
                add_next_global_partition()
                remove_one_of_the_first_partitions()
            add_next_global_partition()

            if smc_particles > 1:
                self.paths[ipath].set_synthetic_logweight_history(
                    self.reco_info)
            if debug:
                print '  merged path %d with %d glomeration steps and %d final clusters' % (
                    ipath, len(self.paths[ipath].partitions),
                    len(self.paths[ipath].partitions[-1]))
                self.paths[ipath].print_partitions(self.reco_info)

        if smc_particles == 1:  # XX: ...whereas if we're *not* doing smc, we have to add the previous histories *afterward*, since the previous histories are all in one piece
            if previous_info is None:
                if debug:
                    print '  no previous history'
            else:
                # it would be nice to prevent this from adding duplicate adjacent partitions
                if debug:
                    print 'prepend previous history'
                if debug:
                    print '    before'
                    assert len(
                        self.paths
                    ) == 1  # in case gremlins sneak in and add some between lines of code
                    self.paths[0].print_partitions(self.reco_info)
                # initial_path_index = fileinfos[ifile][ipath].initial_path_index  # which previous path are we hooking up to?
                previous_path = previous_info
                current_path = self.paths[0]
                # first_new_logprob = UPDATEME current_path.logprobs[0]
                extended_path = ClusterPath(None,
                                            seed_unique_id=self.seed_unique_id)
                for ip in range(len(previous_path.partitions)):
                    # if previous_path.logprobs[ip] >= first_new_logprob:  # skip the merges past which we rewound
                    #     continue
                    extended_path.add_partition(
                        list(previous_path.partitions[ip]),
                        previous_path.logprobs[ip],
                        previous_path.n_procs[ip],
                        logweight=previous_path.logweights[ip])
                for ip in range(len(current_path.partitions)):
                    extended_path.add_partition(
                        list(current_path.partitions[ip]),
                        current_path.logprobs[ip],
                        current_path.n_procs[ip],
                        logweight=current_path.logweights[ip])
                self.paths[0] = extended_path
                # self.paths[0].set_synthetic_logweight_history(self.reco_info)  # need to multiply the combinatorial factors in the later partitions by the factors from the earlier partitions
                if debug:
                    print '    after'
                    self.paths[0].print_partitions(self.reco_info)
Example #13
0
#!/usr/bin/env python
import sys
sys.path.insert(1, './python')
import csv
csv.field_size_limit(sys.maxsize)  # make sure we can read very large csv fields
import argparse

from clusterpath import ClusterPath
from seqfileopener import get_seqfile_info
import utils

parser = argparse.ArgumentParser()
parser.add_argument('--infname', required=True)
parser.add_argument('--dont-abbreviate', action='store_true', help='Print full seq IDs (otherwise just prints an \'o\')')
parser.add_argument('--n-to-print', type=int, help='How many partitions to print (centered on the best partition)')
parser.add_argument('--datadir', default='data/imgt')
parser.add_argument('--simfname')
parser.add_argument('--is-data', action='store_true')
args = parser.parse_args()

glfo = utils.read_germline_set(args.datadir)

reco_info = None
if args.simfname is not None:
    input_info, reco_info = get_seqfile_info(args.simfname, args.is_data, glfo=glfo)

cp = ClusterPath()
cp.readfile(args.infname)
cp.print_partitions(abbreviate=(not args.dont_abbreviate), n_to_print=args.n_to_print, reco_info=reco_info)
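
# hypothetical invocation of the script above (the script name and file paths are
# illustrative only, not taken from the partis repo):
#   ./print-partitions.py --infname _output/partition.csv --n-to-print 5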
Example #14
0
import sys
import argparse

partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')
import glutils
from clusterpath import ClusterPath

parser = argparse.ArgumentParser()
parser.add_argument('--infile')
parser.add_argument('--locus')
parser.add_argument('--param')
parser.add_argument('--nclust')
args = parser.parse_args()

glfo = glutils.read_glfo(args.param + '/hmm/germline-sets', locus=args.locus)

print(sys.argv)
print 'infile =', args.infile
print 'param =', args.param

cp = ClusterPath()
cp.readfile(args.infile)
best_partition = cp.partitions[cp.i_best]
# sorted_clusters = sorted(best_partition, key=len, reverse=True)  # sort by size

# clonal family attributes to print
print '''

score = interest score, indicating interesting attributes: size, SHM, SFS, bnAb VH usage

Size & SHM:
4 points for rank in top 25
3 points for rank 25-50
2 points for rank 50-75
1 point for rank 75-100
Example #15
0
def run_bios2mds(n_components, n_clusters, seqfos, base_workdir, seed, aligned=False, reco_info=None, region=None,
                 max_runs=100, max_iterations=1000, method='euclidean',
                 plotdir=None, plotname='mds', queries_to_include=None, color_scale_vals=None, labels=None, title=None, remove_duplicates=False, debug=False):
    workdir = base_workdir + '/mds'
    msafname = workdir + '/msa.fa'
    mdsfname = workdir + '/components.txt'
    clusterfname = workdir + '/clusters.txt'
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    if len(set([sfo['seq'] for sfo in seqfos])) < len(seqfos):  # it'll just crash when it's running mds later, but this is faster
        if remove_duplicates:
            seq_groups = [list(group) for _, group in itertools.groupby(sorted(seqfos, key=lambda x: x['seq']), key=lambda x: x['seq'])]
            seqs_to_remove = []
            for sgroup in seq_groups:
                seqs_to_remove += [sfo['name'] for sfo in sgroup[1:]]  # remove any after the first one
            seqfos = [sfo for sfo in seqfos if sfo['name'] not in seqs_to_remove]
        else:
            raise Exception('duplicate sequences in seqfos')

    if aligned:  # NOTE unlike the sklearn version below, this doesn't modify <seqfos>
        with open(msafname, 'w') as fastafile:
            for sfo in seqfos:
                fastafile.write('>%s\n%s\n' % (sfo['name'], sfo['seq']))
    else:
        utils.align_many_seqs(seqfos, outfname=msafname)

    # build the R cmd file
    cmdlines = [
        'options(rgl.useNULL=TRUE)',
        'require(bios2mds, quietly=TRUE)',
        'set.seed(%d)' % seed,
        'human <- import.fasta("%s")' % msafname,
        'active <- mat.dif(human, human)',  # mat.dif or mat.dis?
    ]

    if n_components is not None:
        cmdlines += ['mmds_active <- mmds(active, pc=%d)' % n_components]
        cmdlines += ['capture.output(mmds_active$coord, file="%s")' % mdsfname]
    else:
        raise Exception('need to implement')

    if n_clusters is not None:
        cmdlines += [
            'kmeans.run1 <- kmeans.run(mmds_active$coord, nb.clus=%d, nb.run=%d, iter.max=%d, method="%s")' % (n_clusters, max_runs, max_iterations, method),
            # 'kmeans.run1$clusters',
            # 'kmeans.run1$elements',
            'options(width=10000)',
            'capture.output(kmeans.run1$clusters, file="%s")' % clusterfname,
            # sil.score(mat, nb.clus = c(2:13), nb.run = 100, iter.max = 1000,  # run for every possible number of clusters (?)
            #               method = "euclidean")
            # random.msa  # builds a random [...]
        ]

    rstart = time.time()
    try:
        utils.run_r(cmdlines, workdir)  #, print_time='kmeans')
    except subprocess.CalledProcessError as e:  # typically happens because of complex eigenvalues
        print e
        print '   mds failed on cluster'  # NOTE will still crash in read_kmeans_clusterfile(), but I'm not using that a.t.m.
        title = (title if title is not None else '') + ' mds failed'
    pcvals = read_component_file(mdsfname, n_components, seqfos)
    partition = read_kmeans_clusterfile(clusterfname, seqfos) if n_clusters is not None else None
    rstop = time.time()
    if debug and partition is not None:
        print '  kmeans partition:'
        cp = ClusterPath(partition=partition)
        cp.print_partitions(abbreviate=False)

    os.remove(msafname)
    os.rmdir(workdir)

    plotstart = time.time()
    if plotdir is not None:
        # utils.prep_dir(plotdir, wildlings=['*.svg'])
        plot_mds(n_components, pcvals, plotdir, plotname, partition=partition if n_clusters is not None else None, queries_to_include=queries_to_include, color_scale_vals=color_scale_vals, labels=labels, title=title)
        if reco_info is not None:
            labels = {uid : reco_info[uid][region + '_gene'] for uid in pcvals}
            plot_mds(n_components, pcvals, plotdir, 'true-genes', labels=labels, queries_to_include=queries_to_include, color_scale_vals=color_scale_vals, title=title)
    if not debug:  # this isn't a great way to do this, but I don't want to deal with finding all the calling functions, I just want to add some debug printing to this fcn
        print '    %5.1f  %5.1f' % (rstop - rstart, time.time() - plotstart),

    return partition
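
# a minimal sketch of calling the function above (names, sequences and paths are made up;
# assumes R and the bios2mds package are installed, since utils.run_r() shells out to R):
#   seqfos = [{'name' : 'seq-1', 'seq' : 'CAGGTGCAGCTG'}, {'name' : 'seq-2', 'seq' : 'CAGCTGCAGCTG'}]
#   partition = run_bios2mds(n_components=2, n_clusters=2, seqfos=seqfos, base_workdir='/tmp/mds-work', seed=1)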
Example #16
0
        return None
    return hdist


# ----------------------------------------------------------------------------------------
def cdr3_translation(info):
    naive_cdr3_seq = naive_cdr3(info)
    naive_cdr3_seq = naive_cdr3_seq[3:len(naive_cdr3_seq) - 3]
    if len(naive_cdr3_seq) % 3 != 0:
        # print '  out of frame: adding %s' % ((3 - len(naive_cdr3_seq) % 3) * 'N')
        naive_cdr3_seq += (3 - len(naive_cdr3_seq) % 3) * 'N'
    return utils.ltranslate(naive_cdr3_seq)
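
# worked illustration of the padding arithmetic above, on a hypothetical 16-nt naive CDR3:
# trimming 3 nt from each end leaves 10 nt, and 10 % 3 == 1, so (3 - 1) == 2 Ns get appended:
#   'TGTGCGAGAGATTTTT'  -->  'GCGAGAGATT'  -->  'GCGAGAGATTNN'  (then translated)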


# ----------------------------------------------------------------------------------------
cpaths = [ClusterPath() for _ in range(len(args.infiles))]
for ifile in range(len(args.infiles)):
    cpaths[ifile].readfile(args.infiles[ifile])
partitions = [
    sorted(cp.partitions[cp.i_best], key=len, reverse=True) for cp in cpaths
]

repertoire_sizes = [
    sum([len(c) for c in partition]) for partition in partitions
]
min_inner_sizes = [
    args.min_inner_size if args.min_inner_rep_frac is None else
    args.min_inner_rep_frac * repertoire_sizes[isample]
    for isample in range(len(args.infiles))
]
min_outer_sizes = [
Example #17
0
    def merge_fileinfos(self, fileinfos, smc_particles, previous_info=None, debug=False):
        self.paths = [ClusterPath(None, seed_unique_id=self.seed_unique_id) for _ in range(smc_particles)]  # each path's initial_path_index is None since we're merging paths that, in general, have different initial path indices

        # DEAR FUTURE SELF this won't make any sense until you find that picture you took of the white board
        if previous_info is not None and smc_particles > 1:  # if we're doing smc, this has to happen *beforehand*, since the previous paths are separate for each process (cont'd at XX)
            assert len(previous_info) == len(fileinfos)  # both are the number of processes we're merging into one
            # TODO prevent this from adding duplicate adjacent partitions (well... not that important)
            if debug:
                print 'prepend previous history'
            for ifile in range(len(fileinfos)):
                if debug:
                    print 'ifile', ifile
                for ipath in range(smc_particles):
                    if debug:
                        print '  ipath', ipath
                        print '    before'
                        fileinfos[ifile][ipath].print_partitions(self.reco_info)
                    initial_path_index = fileinfos[ifile][ipath].initial_path_index  # which previous path are we hooking up to?
                    previous_path = previous_info[ifile][initial_path_index]
                    current_path = fileinfos[ifile][ipath]
                    # first_new_logprob = current_path.logprobs[0]
                    extended_path = ClusterPath(None, seed_unique_id=self.seed_unique_id)
                    for ip in range(len(previous_path.partitions)):
                        # if previous_path.logprobs[ip] >= first_new_logprob:  # skip the merges past which we rewound
                        #     continue
                        extended_path.add_partition(list(previous_path.partitions[ip]), previous_path.logprobs[ip], previous_path.n_procs[ip], logweight=previous_path.logweights[ip], adj_mi=previous_path.adj_mis[ip])
                    for ip in range(len(current_path.partitions)):
                        extended_path.add_partition(list(current_path.partitions[ip]), current_path.logprobs[ip], current_path.n_procs[ip], logweight=current_path.logweights[ip], adj_mi=current_path.adj_mis[ip])
                    fileinfos[ifile][ipath] = extended_path
                    fileinfos[ifile][ipath].set_synthetic_logweight_history(self.reco_info)  # need to multiply the combinatorial factors in the later partitions by the factors from the earlier partitions
                    if debug:
                        print '    after'
                        fileinfos[ifile][ipath].print_partitions(self.reco_info)

        # do the actual process-merging
        for ipath in range(smc_particles):

            if debug and len(fileinfos) > 1:
                print 'merge path %d from %d processes:' % (ipath, len(fileinfos))
                for ifile in range(len(fileinfos)):
                    fileinfos[ifile][ipath].print_partitions(self.reco_info, extrastr=('%d' % (ifile)))
                    print ''

            # merge all the steps in each path
            def last_one():
                last = True
                for ifile in range(len(fileinfos)):  # we're finished when all the files are out of glomeration steps (i.e. they all only have one [the last] line left)
                    last &= len(fileinfos[ifile][ipath].partitions) == 1
                return last

            def remove_one_of_the_first_partitions():
                maxdelta, ibestfile = None, None
                for ifile in range(len(fileinfos)):
                    if len(fileinfos[ifile][ipath].partitions) == 1:  # if this is the last line (i.e. there aren't any more glomeration steps in this file), leave it alone
                        continue
                    thisdelta = fileinfos[ifile][ipath].logprobs[1] - fileinfos[ifile][ipath].logprobs[0]  # logprob difference between the next partition and this one
                    if maxdelta is None or thisdelta > maxdelta:
                        maxdelta = thisdelta
                        ibestfile = ifile
                # print '    ibest %d with %f - %f = %f' % (ibestfile, fileinfos[ibestfile][ipath].logprobs[1], fileinfos[ibestfile][ipath].logprobs[0], fileinfos[ibestfile][ipath].logprobs[1] - fileinfos[ibestfile][ipath].logprobs[0])
                fileinfos[ibestfile][ipath].remove_first_partition()

            def add_next_global_partition():
                global_partition = []
                global_logprob = 0.
                for ifile in range(len(fileinfos)):  # combine the first line in each file to make a global partition
                    for cluster in fileinfos[ifile][ipath].partitions[0]:
                        global_partition.append(list(cluster))
                    global_logprob += fileinfos[ifile][ipath].logprobs[0]
                self.paths[ipath].add_partition(global_partition, global_logprob, n_procs=len(fileinfos), logweight=0.)  # don't know the logweight yet (or maybe at all!)

            while not last_one():
                add_next_global_partition()
                remove_one_of_the_first_partitions()
            add_next_global_partition()


            if smc_particles > 1:
                self.paths[ipath].set_synthetic_logweight_history(self.reco_info)
            if debug:
                print '  merged path %d with %d glomeration steps and %d final clusters' % (ipath, len(self.paths[ipath].partitions), len(self.paths[ipath].partitions[-1]))
                self.paths[ipath].print_partitions(self.reco_info)

        if smc_particles == 1:  # XX: ...whereas if we're *not* doing smc, we have to add the previous histories *afterward*, since the previous histories are all in one piece
            if previous_info is None:
                if debug:
                    print '  no previous history'
            else:
                # TODO prevent this from adding duplicate adjacent partitions
                if debug:
                    print 'prepend previous history'
                if debug:
                    print '    before'
                    assert len(self.paths) == 1  # in case gremlins sneak in and add some between lines of code
                    self.paths[0].print_partitions(self.reco_info)
                # initial_path_index = fileinfos[ifile][ipath].initial_path_index  # which previous path are we hooking up to?
                previous_path = previous_info
                current_path = self.paths[0]
                # first_new_logprob = UPDATEME current_path.logprobs[0]
                extended_path = ClusterPath(None, seed_unique_id=self.seed_unique_id)
                for ip in range(len(previous_path.partitions)):
                    # if previous_path.logprobs[ip] >= first_new_logprob:  # skip the merges past which we rewound
                    #     continue
                    extended_path.add_partition(list(previous_path.partitions[ip]), previous_path.logprobs[ip], previous_path.n_procs[ip], logweight=previous_path.logweights[ip], adj_mi=previous_path.adj_mis[ip])
                for ip in range(len(current_path.partitions)):
                    extended_path.add_partition(list(current_path.partitions[ip]), current_path.logprobs[ip], current_path.n_procs[ip], logweight=current_path.logweights[ip], adj_mi=current_path.adj_mis[ip])
                self.paths[0] = extended_path
                # self.paths[0].set_synthetic_logweight_history(self.reco_info)  # need to multiply the combinatorial factors in the later partitions by the factors from the earlier partitions
                if debug:
                    print '    after'
                    self.paths[0].print_partitions(self.reco_info)
import csv
import sys

partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')
import utils
import glutils
from clusterpath import ClusterPath

# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', chain='h')

print 'first parse an annotation csv file:'
with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        utils.process_input_line(line)
        utils.add_implicit_info(glfo, line)
        utils.print_reco_event(glfo['seqs'], line)
        cdr3_bounds = (line['codon_positions']['v'], line['codon_positions']['j'] + 3)
        print ''
        print '  should match the above:'
        print '    %s naive cdr3' % line['naive_seq'][cdr3_bounds[0] : cdr3_bounds[1]]
        print '    %s mature' % line['indel_reversed_seqs'][0][cdr3_bounds[0] : cdr3_bounds[1]]
        print ''
        break

print 'then parse a partition csv file:'
cp = ClusterPath()
cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv')
cp.print_partitions(abbreviate=True)