Example 1
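A game-automation loop: each iteration sends an adb tap to clear touch artifacts, calls solve(), and appends the run data to a pickled log (flushed every five runs); if the solver raises BFSError or HashError, utils.run_cmds presses the back key three times so the script recovers even after an accidental ad click.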
def main():
    transition_cmds = [
        # './adb shell input tap 383 1203',  # click on next (Challenge mode)
        './adb shell input tap 549 156',  # tap outside to remove touch artifacts (example: developer options)
    ]

    # Three times, so that even if we click on an ad by mistake we get back to the game
    back_cmds = ['./adb shell input keyevent 4' for _ in range(3)]

    pkl_filepath = './log.pkl'
    log = utils.load_if_pickled(pkl_filepath)

    new_in_log = 0
    while True:
        if new_in_log >= 5:
            utils.save_to_pickle(log, pkl_filepath)
            new_in_log = 0

        try:
            utils.run_cmds(transition_cmds)
            rundata = solve()
            log.append(rundata)
            new_in_log += 1
            time.sleep(4)  # Time in seconds.
        except (solver.BFSError, solver.HashError):
            utils.run_cmds(back_cmds)
Example 2
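add_mutants picks a random tree from self.treeinfo, extracts the per-region branch-length ratios appended after it, prepares one bppseqgen command per region, and executes them together with utils.run_cmds(..., sleep=False) before assembling the mutated sequence for each leaf.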
    def add_mutants(self, reco_event, irandom):
        chosen_treeinfo = self.treeinfo[random.randint(0, len(self.treeinfo)-1)]
        chosen_tree = chosen_treeinfo.split(';')[0] + ';'
        branch_length_ratios = {}  # NOTE a.t.m (and probably permanently) the mean branch lengths for each region are the *same* for all the trees in the file, I just don't have a better place to put them while I'm passing from TreeGenerator to here than at the end of each line in the file
        for tmpstr in chosen_treeinfo.split(';')[1].split(','):  # looks like e.g.: (t2:0.003751736951,t1:0.003751736951):0.001248262937;v:0.98,d:1.8,j:0.87, where the newick tree has branch lengths corresponding to the whole sequence (i.e. the weighted mean of v, d, and j)
            region = tmpstr.split(':')[0]
            assert region in utils.regions
            ratio = float(tmpstr.split(':')[1])
            if self.args.mutation_multiplier is not None:  # multiply the branch lengths by some factor
                # if self.args.debug:
                # print '    adding branch length factor %f ' % self.args.mutation_multiplier
                ratio *= self.args.mutation_multiplier
            branch_length_ratios[region] = ratio

        if self.args.debug:  # NOTE should be the same for t[0-9]... but I guess I should check at some point
            print '  using tree with total depth %f' % treegenerator.get_leaf_node_depths(chosen_tree)['t1']  # kind of hacky to just look at t1, but they're all the same anyway and it's just for printing purposes...
            if len(re.findall('t', chosen_tree)) > 1:  # if more than one leaf
                Phylo.draw_ascii(Phylo.read(StringIO(chosen_tree), 'newick'))
            else:
                print '    one leaf'
            print '    with branch length ratios ', ', '.join(['%s %f' % (region, branch_length_ratios[region]) for region in utils.regions])

        scaled_trees = self.get_rescaled_trees(chosen_tree, branch_length_ratios)
        treg = re.compile('t[0-9][0-9]*')
        n_leaf_nodes = len(treg.findall(chosen_tree))
        cmdfos = []
        for region in utils.regions:
            simstr = reco_event.eroded_seqs[region]
            if region == 'd':
                simstr = reco_event.insertions['vd'] + simstr + reco_event.insertions['dj']
            cmdfos.append(self.prepare_bppseqgen(simstr, scaled_trees[region], n_leaf_nodes, reco_event.genes[region], reco_event, seed=irandom))

        utils.run_cmds([cfo for cfo in cmdfos if cfo is not None], sleep=False)  # shenanigan is to handle zero-length regional seqs

        mseqs = {}
        for ireg in range(len(utils.regions)):
            if cmdfos[ireg] is None:
                mseqs[utils.regions[ireg]] = ['' for _ in range(n_leaf_nodes)]  # return an empty string for each leaf node
            else:
                mseqs[utils.regions[ireg]] = self.read_bppseqgen_output(cmdfos[ireg], n_leaf_nodes)

        assert len(reco_event.final_seqs) == 0
        for iseq in range(n_leaf_nodes):
            seq = mseqs['v'][iseq] + mseqs['d'][iseq] + mseqs['j'][iseq]
            seq = reco_event.revert_conserved_codons(seq)  # if mutation screwed up the conserved codons, just switch 'em back to what they were to start with
            reco_event.final_seqs.append(seq)  # set final sequence in reco_event

        self.add_shm_indels(reco_event)
Example 3
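multiple_tests clones sys.argv into one command per test index (stripping --n-tests/--iteststart and rewriting --outdir and --seed), optionally dry-runs each command, rotates any existing log files, and then dispatches the whole cmdfos list with utils.run_cmds(cmdfos, debug='write').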
def multiple_tests(args):
    def getlogdir(iproc):
        logdir = args.outdir + '/' + str(iproc) + '/logs'
        if args.plot_annotation_performance:
            logdir += '/annotation-performance-plots'
        return logdir + '/' + '-'.join(args.methods)

    def cmd_str(iproc):
        clist = copy.deepcopy(sys.argv)
        utils.remove_from_arglist(clist, '--n-tests', has_arg=True)
        utils.remove_from_arglist(clist, '--iteststart', has_arg=True)
        utils.replace_in_arglist(clist, '--outdir',
                                 args.outdir + '/' + str(iproc))
        utils.replace_in_arglist(clist, '--seed', str(args.seed + iproc))
        # clist.append('--slurm')
        return ' '.join(clist)

    for iproc in range(
            args.iteststart, args.n_tests
    ):  # don't overwrite old log files... need to eventually fix this so it isn't necessary

        def lfn(iproc, ilog):
            logfname = args.outdir + '/' + str(iproc) + '/log'
            if ilog > 0:
                logfname += '.' + str(ilog)
            return logfname

    cmdfos = [{
        'cmd_str': cmd_str(iproc),
        'workdir': args.workdir + '/' + str(iproc),
        'logdir': getlogdir(iproc),
        'outfname': args.outdir + '/' + str(iproc)
    } for iproc in range(args.iteststart, args.n_tests)]
    if args.dry_run:
        for iproc in range(args.iteststart, args.n_tests):
            utils.simplerun(cmdfos[iproc - args.iteststart]['cmd_str'],
                            dryrun=True)
        return
    for iproc in range(args.iteststart, args.n_tests):
        logd = getlogdir(iproc)
        if os.path.exists(logd + '/log'):
            ilog = 0
            while os.path.exists(logd + '/log.' + str(ilog)):
                ilog += 1
            check_call(['mv', '-v', logd + '/log', logd + '/log.' + str(ilog)])
    print '  look for logs in %s' % args.outdir
    utils.run_cmds(cmdfos, debug='write')
Example 4
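solve() grabs a screenshot over adb, computes the swipe sequence with solver.solve, converts it into a single adb shell command, executes it via utils.run_cmds, and returns timing data for each stage.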
def solve():

    t0 = time.time()

    # print 'Capturing and transferring screenshot...'
    screen_filename = 'screen.png'
    screen_cmds = [
        './adb shell screencap -p /sdcard/{}'.format(screen_filename),
        './adb pull /sdcard/{}'.format(screen_filename),
    ]
    utils.run_cmds(screen_cmds)
    t1 = time.time()

    # print 'Getting initial configuration...',
    start = solver.get_start_config(screen_filename)
    # print 'Done'
    t2 = time.time()

    # print 'Computing the moves...',
    swipes = solver.solve(start)
    if swipes is None:
        print 'Invalid grid'
        raise RuntimeError
    t3 = time.time()

    # print 'Converting to swipe commands...',
    X_vals = [127, 294, 461, 628, 795, 962]
    Y_vals = [633, 800, 967, 1134, 1301, 1468]
    pattern = 'input swipe {} {} {} {} {}; '
    move_cmds = [get_cmd(swipe, pattern, X_vals, Y_vals) for swipe in swipes]
    solution_cmd = './adb shell "{}"'.format('\n'.join(move_cmds))
    # print 'Done'

    # print 'Solving the puzzle...',
    utils.run_cmds([solution_cmd])
    # print 'Done\n'
    t4 = time.time()

    rundata = classes.Rundata(start, swipes, t1 - t0, t2 - t1, t3 - t2,
                              t4 - t3)
    rundata.pprint()

    return rundata
Example 5
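comprehensive_test is a simpler variant of the multi-test driver: it builds one command per test from sys.argv, wipes any existing output directories, and runs everything with utils.run_cmds(cmdfos, debug='write').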
def comprehensive_test(args):
    def cmd_str(iproc):
        clist = copy.deepcopy(sys.argv)
        utils.remove_from_arglist(clist, '--comprehensive')
        utils.remove_from_arglist(clist, '--n-tests', has_arg=True)
        utils.replace_in_arglist(clist, '--outdir', args.outdir + '/' + str(iproc))
        utils.replace_in_arglist(clist, '--seed', str(args.seed + iproc))
        # clist.append('--slurm')
        return ' '.join(clist)

    cmdfos = [{'cmd_str' : cmd_str(iproc),
               'workdir' : args.workdir + '/' + str(iproc),
               'logdir' : args.outdir + '/' + str(iproc),
               'outfname' : args.outdir + '/' + str(iproc)}
        for iproc in range(args.n_tests)]
    for iproc in range(args.n_tests):
        if os.path.exists(cmdfos[iproc]['outfname']):
            check_call(['rm', '-r', cmdfos[iproc]['outfname']])
    print '  look for logs in %s' % args.outdir
    utils.run_cmds(cmdfos, debug='write')
Example 6
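simulate() builds one bcr-phylo command per naive rearrangement event and, when more than one process is requested, runs them in parallel with utils.run_cmds(shell=True, n_max_procs=..., batch_system='slurm' if requested, allow_failure=True, debug='print'), then parses, writes, and optionally plots the mutated events.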
def simulate():

    rearrange()

    glfo, naive_event_list, cpath = utils.read_output(naive_fname())
    assert len(naive_event_list) == args.n_sim_events

    outdirs = ['%s/event-%d' % (simdir(), i) for i in range(len(naive_event_list))]

    start = time.time()
    cmdfos = []
    if args.n_procs > 1:
        print '    starting %d events' % len(naive_event_list)
    uid_str_len = 6 + int(math.log(len(naive_event_list), 10))  # if the final sample's going to contain many trees, it's worth making the uids longer so there are fewer collisions/duplicates
    for ievent, (naive_line, outdir) in enumerate(zip(naive_event_list, outdirs)):
        if args.n_sim_events > 1 and args.n_procs == 1:
            print '  %s %d' % (utils.color('blue', 'ievent'), ievent)
        cfo = run_bcr_phylo(naive_line, outdir, ievent, len(naive_event_list), uid_str_len=uid_str_len)  # if n_procs > 1, doesn't run, just returns cfo
        if cfo is not None:
            print '      %s %s' % (utils.color('red', 'run'), cfo['cmd_str'])
            cmdfos.append(cfo)
    if args.n_procs > 1 and len(cmdfos) > 0:
        utils.run_cmds(cmdfos, shell=True, n_max_procs=args.n_procs, batch_system='slurm' if args.slurm else None, allow_failure=True, debug='print')
    print '  bcr-phylo run time: %.1fs' % (time.time() - start)

    if utils.output_exists(args, simfname(), outlabel='mutated simu', offset=4):  # i guess if it crashes during the plotting just below, this'll get confused
        return

    start = time.time()
    mutated_events = []
    for ievent, (naive_line, outdir) in enumerate(zip(naive_event_list, outdirs)):
        mutated_events.append(parse_bcr_phylo_output(glfo, naive_line, outdir, ievent))
    print '  parsing time: %.1fs' % (time.time() - start)

    print '  writing annotations to %s' % simfname()
    utils.write_annotations(simfname(), glfo, mutated_events, utils.simulation_headers)

    if not args.only_csv_plots:
        import lbplotting
        for outdir, event in zip(outdirs, mutated_events):
            lbplotting.plot_bcr_phylo_simulation(outdir, event, args.extrastr, lbplotting.metric_for_target_distance_labels[args.metric_for_target_distance])
Example 7
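make_tree aligns all the genes with muscle and builds a tree with RAxML, dispatching each external command through utils.run_cmds(get_cmdfos(...), ignore_stderr=True) and removing the intermediate outputs afterwards.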
def make_tree(all_genes, workdir, use_cache=False):
    aligned_fname = workdir + '/all-aligned.fa'
    raxml_label = 'xxx'
    raxml_output_fnames = [
        '%s/RAxML_%s.%s' % (workdir, fn, raxml_label)
        for fn in ['parsimonyTree', 'log', 'result', 'info', 'bestTree']
    ]
    treefname = [fn for fn in raxml_output_fnames if 'result' in fn][0]
    if use_cache:  # don't re-run muscle & raxml, just use the previous run's output tree file
        return treefname
    utils.prep_dir(workdir,
                   wildlings=[
                       '*.' + raxml_label,
                       os.path.basename(aligned_fname), 'out', 'err',
                       os.path.basename(aligned_fname) + '.reduced'
                   ])

    # write and align an .fa with all alleles from any gl set
    with tempfile.NamedTemporaryFile() as tmpfile:
        for name, seq in all_genes.items():
            tmpfile.write('>%s\n%s\n' % (name, seq))
        tmpfile.flush()  # BEWARE: without this flush, muscle can see an empty or truncated input file
        cmdstr = '%s -in %s -out %s' % (args.muscle_path, tmpfile.name,
                                        aligned_fname)
        print '    %s %s' % (utils.color('red', 'run'), cmdstr)
        utils.run_cmds(get_cmdfos(cmdstr, workdir, aligned_fname),
                       ignore_stderr=True)

    # get a tree for the aligned .fa
    cmdstr = '%s -mGTRCAT -n%s -s%s -p1 -w%s' % (args.raxml_path, raxml_label,
                                                 aligned_fname, workdir)
    print '    %s %s' % (utils.color('red', 'run'), cmdstr)
    utils.run_cmds(get_cmdfos(cmdstr, workdir, treefname), ignore_stderr=True)

    os.remove(aligned_fname)  # rm muscle output
    for fn in [
            f for f in raxml_output_fnames if f != treefname
    ]:  # rm all the raxml outputs except the one file we really want
        os.remove(fn)

    return treefname
Example 8
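A leaner multiple_tests: per-test commands are again built from sys.argv, existing output directories are removed, and the batch is dispatched with utils.run_cmds(cmdfos, debug='write'); unlike Example 3 there is no dry-run option or log rotation.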
def multiple_tests(args):
    def cmd_str(iproc):
        clist = copy.deepcopy(sys.argv)
        utils.remove_from_arglist(clist, '--n-tests', has_arg=True)
        utils.replace_in_arglist(clist, '--outdir',
                                 args.outdir + '/' + str(iproc))
        utils.replace_in_arglist(clist, '--seed', str(args.seed + iproc))
        # clist.append('--slurm')
        return ' '.join(clist)

    cmdfos = [{
        'cmd_str': cmd_str(iproc),
        'workdir': args.workdir + '/' + str(iproc),
        'logdir': args.outdir + '/' + str(iproc),
        'outfname': args.outdir + '/' + str(iproc)
    } for iproc in range(args.n_tests)]
    for iproc in range(args.n_tests):
        if os.path.exists(cmdfos[iproc]['outfname']):
            check_call(['rm', '-r', cmdfos[iproc]['outfname']])
    print '  look for logs in %s' % args.outdir
    utils.run_cmds(cmdfos, debug='write')
Example 9
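A later version of add_mutants: the per-region mutation info tacked on after the newick tree is parsed explicitly, the tree is rescaled to a separate height for each region, the bppseqgen commands are run via utils.run_cmds(sleep=False), and the assembled sequences get SHM indels before the tree simulation is cross-checked.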
    def add_mutants(self, reco_event, irandom):
        if self.args.mutation_multiplier is not None and self.args.mutation_multiplier == 0.:  # some of the stuff below fails if mut mult is actually 0.
            reco_event.final_seqs.append(
                reco_event.recombined_seq)  # set final sequence in reco_event
            reco_event.indelfos = [
                indelutils.get_empty_indel()
                for _ in range(len(reco_event.final_seqs))
            ]
            return

        # When generating trees, each tree's number of leaves and total depth are chosen from the specified distributions (a.t.m., by default n-leaves is from a geometric/zipf, and depth is from data)
        # This chosen depth corresponds to the sequence-wide mutation frequency.
        # In order to account for varying mutation rates in v, d, and j we simulate these regions separately, by appropriately rescaling the tree for each region.
        # i.e.: here we get the sequence-wide mute freq from the tree, and rescale it by the repertoire-wide ratios from data (which are stored in the tree file).
        # looks like e.g.: (t2:0.003751736951,t1:0.003751736951):0.001248262937;v:0.98,d:1.8,j:0.87, where the newick tree has branch lengths corresponding to the whole sequence (i.e. the weighted mean of v, d, and j)
        # NOTE a.t.m (and probably permanently) the mean branch lengths for each region are the same for all the trees in the file, I just don't have a better place to put them while I'm passing from TreeGenerator to here than at the end of each line in the file
        treefostr = self.treeinfo[random.randint(
            0,
            len(self.treeinfo) - 1
        )]  # per-region mutation info is tacked on after the tree... sigh. kind of hacky but works ok.
        assert treefostr.count(';') == 1
        isplit = treefostr.find(';') + 1
        chosen_tree = treefostr[:isplit]  # includes semi-colon
        mutefo = [rstr for rstr in treefostr[isplit:].split(',')]
        mean_total_height = treegenerator.get_mean_height(chosen_tree)
        regional_heights = {
        }  # per-region height, including <self.args.mutation_multiplier>
        for tmpstr in mutefo:
            region, ratio = tmpstr.split(':')
            assert region in utils.regions
            ratio = float(ratio)
            if self.args.mutation_multiplier is not None:  # multiply the branch lengths by some factor
                ratio *= self.args.mutation_multiplier
            regional_heights[region] = mean_total_height * ratio

        scaled_trees = {
            r: treegenerator.rescale_tree(chosen_tree, regional_heights[r])
            for r in utils.regions
        }

        if self.args.debug:
            print '  chose tree with total height %f' % treegenerator.get_mean_height(
                chosen_tree)
            print '    regional trees rescaled to heights:  %s' % ('   '.join([
                '%s %.3f  (expected %.3f)' %
                (region, treegenerator.get_mean_height(
                    scaled_trees[region]), regional_heights[region])
                for region in utils.regions
            ]))
            print treegenerator.get_ascii_tree(chosen_tree, extra_str='    ')

        n_leaves = treegenerator.get_n_leaves(chosen_tree)
        cmdfos = []
        for region in utils.regions:
            simstr = reco_event.eroded_seqs[region]
            if region == 'd':
                simstr = reco_event.insertions[
                    'vd'] + simstr + reco_event.insertions['dj']
            cmdfos.append(
                self.prepare_bppseqgen(simstr,
                                       scaled_trees[region],
                                       n_leaves,
                                       reco_event.genes[region],
                                       reco_event,
                                       seed=irandom))

        utils.run_cmds(
            [cfo for cfo in cmdfos if cfo is not None],
            sleep=False)  # shenanigan is to handle zero-length regional seqs

        mseqs = {}
        for ireg in range(
                len(utils.regions)
        ):  # NOTE kind of sketchy just using index in <utils.regions> (although it just depends on the loop immediately above a.t.m.)
            if cmdfos[ireg] is None:
                mseqs[utils.regions[ireg]] = [
                    '' for _ in range(n_leaves)
                ]  # return an empty string for each leaf node
            else:
                mseqs[utils.regions[ireg]] = self.read_bppseqgen_output(
                    cmdfos[ireg], n_leaves)

        assert len(reco_event.final_seqs) == 0
        for iseq in range(n_leaves):
            seq = mseqs['v'][iseq] + mseqs['d'][iseq] + mseqs['j'][iseq]
            seq = reco_event.revert_conserved_codons(
                seq, debug=self.args.debug
            )  # if mutation screwed up the conserved codons, just switch 'em back to what they were to start with
            reco_event.final_seqs.append(
                seq)  # set final sequence in reco_event
            reco_event.final_codon_positions.append(
                copy.deepcopy(reco_event.post_erosion_codon_positions)
            )  # separate codon positions for each sequence, because of shm indels

        self.add_shm_indels(reco_event)

        reco_event.setline(
            irandom
        )  # set the line here because we use it when checking tree simulation, and want to make sure the uids are always set at the same point in the workflow

        self.check_tree_simulation(mean_total_height, regional_heights,
                                   scaled_trees, mseqs, reco_event)

        if self.args.debug:
            utils.print_reco_event(reco_event.line, extra_str='    ')
Example 10
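make_mds_plots builds an MDS plot per cluster: in parallel mode it writes each cluster's sequences to a temporary fasta, collects a cmdfo for ./bin/mds-run.py, and runs the whole batch with utils.run_cmds(cmdfos, clean_on_success=True); otherwise it calls the MDS routine directly in-process.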
    def make_mds_plots(self,
                       sorted_clusters,
                       annotations,
                       base_plotdir,
                       max_cluster_size=10000,
                       reco_info=None,
                       color_rule=None,
                       run_in_parallel=False,
                       debug=False):
        debug = True  # NOTE overrides the <debug> keyword argument above

        # ----------------------------------------------------------------------------------------
        def get_fname(ic):
            return 'icluster-%d' % ic

        # ----------------------------------------------------------------------------------------
        def get_cluster_info(full_cluster):
            full_info = annotations[':'.join(full_cluster)]
            title = '%s   (size: %d)' % (self.get_cdr3_title(full_info),
                                         len(full_cluster))

            all_seqs = set()
            kept_indices = []
            for iseq in range(len(full_cluster)):
                if full_info['seqs'][
                        iseq] in all_seqs:  # duplicates are from shm indels (I think I did this on purpose in sw)
                    continue
                if full_info['n_mutations'][
                        iseq] == 0:  # remove unmutated sequences since a) they'll crash mds after we add the naive seq below and b) they'd show up in the same spot anyway (note that the only way there can be more than one is if there's Ns either within the sequences or on either end)
                    continue
                kept_indices.append(iseq)
                all_seqs.add(full_info['seqs'][iseq])

            if len(kept_indices) > max_cluster_size:
                uids_to_choose_from = set([
                    full_cluster[i] for i in kept_indices
                ])  # note similarity to code in seqfileopener.post_process()
                if self.args.queries_to_include is not None:
                    uids_to_choose_from -= set(self.args.queries_to_include)
                n_to_remove = len(kept_indices) - max_cluster_size
                if n_to_remove >= len(
                        uids_to_choose_from
                ):  # i.e. if we'd have to start removing queries that are in <queries_to_include>
                    removed_uids = uids_to_choose_from
                else:
                    removed_uids = numpy.random.choice(
                        list(uids_to_choose_from), n_to_remove, replace=False
                    )  # i think this'll still crash if len(uids_to_choose_from) is zero, but, meh
                kept_indices = sorted(
                    set(kept_indices) -
                    set([full_cluster.index(uid) for uid in removed_uids]))
                title += ' (subset: %d / %d)' % (len(kept_indices),
                                                 len(full_cluster))

            seqfos = [{
                'name': full_info['unique_ids'][iseq],
                'seq': full_info['seqs'][iseq]
            } for iseq in kept_indices]
            color_scale_vals = {
                full_cluster[iseq]: full_info['n_mutations'][iseq]
                for iseq in kept_indices
            }

            seqfos.append(
                {
                    'name': '_naive',
                    'seq': full_info['naive_seq']
                }
            )  # note that if any naive sequences that were removed above are in self.args.queries_to_include, they won't be labeled in the plot (but, screw it, who's going to ask to specifically label a sequence that's already specifically labeled?)
            color_scale_vals[
                '_naive'] = 0  # leading underscore is 'cause the mds will crash if there's another sequence with the same name, and e.g. christian's simulation spits out the naive sequence with name 'naive'. No, this is not a good long term fix
            queries_to_include = ['_naive']
            if self.args.queries_to_include is not None:
                queries_to_include += self.args.queries_to_include

            return seqfos, color_scale_vals, queries_to_include, title

        # ----------------------------------------------------------------------------------------
        def get_labels_for_coloring(full_cluster, color_rule):
            full_info = annotations[':'.join(full_cluster)]
            if color_rule == 'nearest-target':  # color by the index of the nearest cluster index (bcr-phylo simulation only)
                if 'target_seqs' not in reco_info[full_cluster[0]]:
                    return
                labels = {
                    uid: str(reco_info[uid]['nearest_target_indices'][0])
                    for uid in full_cluster
                }
                labels['_naive'] = 'foop'
            elif color_rule == 'wtf':
                labels = {uid: uid.split('@')[1] for uid in full_cluster}
                labels['_naive'] = 'foop'
            else:
                assert False

            return labels

        # ----------------------------------------------------------------------------------------
        def prep_cmdfo(iclust, seqfos, queries_to_include, color_scale_vals,
                       title):
            subworkdir = '%s/mds-%d' % (self.args.workdir, iclust)
            utils.prep_dir(subworkdir)
            tmpfname = '%s/seqs.fa' % subworkdir
            with open(tmpfname, 'w') as tmpfile:
                for sfo in seqfos:
                    csval = None
                    if sfo['name'] in color_scale_vals:
                        csval = color_scale_vals[sfo['name']]
                    tmpfile.write(
                        '>%s%s\n%s\n' %
                        (sfo['name'],
                         (' %d' % csval) if csval is not None else '',
                         sfo['seq']))
            cmdstr = './bin/mds-run.py %s --aligned --plotdir %s --plotname %s --workdir %s --seed %d' % (
                tmpfname, plotdir, get_fname(iclust), subworkdir,
                self.args.seed)
            if queries_to_include is not None:
                cmdstr += ' --queries-to-include %s' % ':'.join(
                    queries_to_include)
            if title is not None:
                cmdstr += ' --title=%s' % title.replace(' ', '@')
            return {
                'cmd_str': cmdstr,
                'workdir': subworkdir,
                'outfname': '%s/%s.svg' % (plotdir, get_fname(iclust)),
                'workfnames': [tmpfname]
            }

        # ----------------------------------------------------------------------------------------
        subd, plotdir = self.init_subd('mds', base_plotdir)

        start = time.time()
        if debug:
            if not run_in_parallel:
                print '    making mds plots starting with %d clusters' % len(
                    sorted_clusters)
                print '       size (+naive)   mds    plot   total'
        skipped_cluster_lengths = []
        fnames = [[]]
        cmdfos = []
        for iclust in range(len(sorted_clusters)):
            if not self.plot_this_cluster(sorted_clusters, iclust):
                skipped_cluster_lengths.append(len(sorted_clusters[iclust]))
                continue

            seqfos, color_scale_vals, queries_to_include, title = get_cluster_info(
                sorted_clusters[iclust])

            labels = None
            if color_rule is not None:
                labels = get_labels_for_coloring(sorted_clusters[iclust],
                                                 color_rule)
                # print '   %s setting color_scale_vals to None so we can use colors for nearest target seq index' % utils.color('red', 'note')
                color_scale_vals = None  # not sure this is really the best way to do this

            if debug and not run_in_parallel:
                substart = time.time()
                subset_str = '' if len(
                    sorted_clusters[iclust]
                ) <= max_cluster_size else utils.color(
                    'red',
                    '/%d' % len(sorted_clusters[iclust]),
                    width=6,
                    padside='right')  # -1 is for the added naive seq
                tmpfo = annotations[':'.join(sorted_clusters[iclust])]
                # n_naive_in_cluster = len([iseq for iseq in range(len(sorted_clusters[iclust])) if tmpfo['n_mutations'][iseq] == 0])  # work out if there was a sequence already in the cluster that was the same as the naive sequence
                # print '      %4d%6s' % (len(seqfos) - 1 + n_naive_in_cluster, subset_str),
                print '      %4d%6s' % (len(seqfos), subset_str),

            if run_in_parallel:
                assert labels is None  # would need to implement this (or just switch to non-parallel version if you need to run with labels set)
                cmdfos.append(
                    prep_cmdfo(iclust, seqfos, queries_to_include,
                               color_scale_vals, title))
            else:
                mds.run_bios2mds(self.n_mds_components,
                                 None,
                                 seqfos,
                                 self.args.workdir,
                                 self.args.seed,
                                 aligned=True,
                                 plotdir=plotdir,
                                 plotname=get_fname(iclust),
                                 queries_to_include=queries_to_include,
                                 color_scale_vals=color_scale_vals,
                                 labels=labels,
                                 title=title)
                if debug:
                    print '  %5.1f' % (time.time() - substart)
            self.addfname(fnames, '%s' % get_fname(iclust))

        if run_in_parallel:
            utils.run_cmds(cmdfos, clean_on_success=True)  #, debug='print')

        if debug and len(skipped_cluster_lengths) > 0:
            print '    skipped %d clusters with lengths: %s (+%d singletons)' % (
                len(skipped_cluster_lengths), ' '.join([
                    '%d' % l for l in skipped_cluster_lengths if l > 1
                ]), skipped_cluster_lengths.count(1))

        if not self.args.only_csv_plots:
            self.plotting.make_html(plotdir, fnames=fnames)

        print '    made %d mds plots (%.1fs)' % (sum(
            len(x) for x in fnames), time.time() - start)

        return [[subd + '/' + fn for fn in fnames[0]]]
Example 11
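A fragment (the snippet starts mid-function): the first branch prints per-group summaries of the distance-to-perfect values, while the else branch launches the accumulated cmdfos with utils.run_cmds(n_max_procs=2, proc_limit_str='test/cf-tree-metrics', debug='write:cf-tree-metrics.log').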
                ytmpfo['perfect_vals']) if lbp > min_ptile_to_plot
        ]

    # ----------------------------------------------------------------------------------------
    for cg in treeutils.cgroups:
        for tv in treeutils.dtr_targets[cg]:
            print '  %s %s' % (cg, tv)
            print '    %s' % ' '.join(
                tuple('%20s' % k for k in sorted(plotfo[0]['cfg'])))
            for pfo in plotfo:
                yfn = treeutils.tmfname(pfo['plotdir'],
                                        'dtr',
                                        lbplotting.getptvar(tv),
                                        cg=cg,
                                        tv=tv)
                diff_vals = getptvals(yfn, cg, tv)
                diff_to_perfect = numpy.mean(diff_vals)
                print '     %s       %6.3f    %s' % ('    '.join(
                    tuple('%15d' % v
                          for k, v in sorted(pfo['cfg'].items(),
                                             key=operator.itemgetter(0)))),
                                                     diff_to_perfect,
                                                     pfo['plotdir'])
else:
    print '  starting %d jobs' % len(cmdfos)
    n_max_procs = 2  #utils.auto_n_procs()
    utils.run_cmds(cmdfos,
                   n_max_procs=n_max_procs,
                   proc_limit_str='test/cf-tree-metrics',
                   debug='write:cf-tree-metrics.log')
Example 12
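Essentially Example 2 with an early return when the mutation multiplier is zero; the per-region bppseqgen commands are again run together via utils.run_cmds(..., sleep=False).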
    def add_mutants(self, reco_event, irandom):
        if self.args.mutation_multiplier is not None and self.args.mutation_multiplier == 0.:  # some of the stuff below fails if mut mult is actually 0.
            reco_event.final_seqs.append(
                reco_event.recombined_seq)  # set final sequence in reco_event
            reco_event.indelfos = [
                utils.get_empty_indel()
                for _ in range(len(reco_event.final_seqs))
            ]
            return

        chosen_treeinfo = self.treeinfo[random.randint(0,
                                                       len(self.treeinfo) - 1)]
        chosen_tree = chosen_treeinfo.split(';')[0] + ';'
        branch_length_ratios = {
        }  # NOTE a.t.m (and probably permanently) the mean branch lengths for each region are the *same* for all the trees in the file, I just don't have a better place to put them while I'm passing from TreeGenerator to here than at the end of each line in the file
        for tmpstr in chosen_treeinfo.split(';')[1].split(
                ','
        ):  # looks like e.g.: (t2:0.003751736951,t1:0.003751736951):0.001248262937;v:0.98,d:1.8,j:0.87, where the newick tree has branch lengths corresponding to the whole sequence (i.e. the weighted mean of v, d, and j)
            region = tmpstr.split(':')[0]
            assert region in utils.regions
            ratio = float(tmpstr.split(':')[1])
            if self.args.mutation_multiplier is not None:  # multiply the branch lengths by some factor
                # if self.args.debug:
                # print '    adding branch length factor %f ' % self.args.mutation_multiplier
                ratio *= self.args.mutation_multiplier
            branch_length_ratios[region] = ratio

        if self.args.debug:  # NOTE should be the same for t[0-9]... but I guess I should check at some point
            print '  using tree with total depth %f' % treegenerator.get_leaf_node_depths(
                chosen_tree
            )['t1']  # kind of hacky to just look at t1, but they're all the same anyway and it's just for printing purposes...
            if len(re.findall('t', chosen_tree)) > 1:  # if more than one leaf
                Phylo.draw_ascii(Phylo.read(StringIO(chosen_tree), 'newick'))
            else:
                print '    one leaf'
            print '    with branch length ratios ', ', '.join([
                '%s %f' % (region, branch_length_ratios[region])
                for region in utils.regions
            ])

        scaled_trees = self.get_rescaled_trees(chosen_tree,
                                               branch_length_ratios)
        treg = re.compile('t[0-9][0-9]*')
        n_leaf_nodes = len(treg.findall(chosen_tree))
        cmdfos = []
        for region in utils.regions:
            simstr = reco_event.eroded_seqs[region]
            if region == 'd':
                simstr = reco_event.insertions[
                    'vd'] + simstr + reco_event.insertions['dj']
            cmdfos.append(
                self.prepare_bppseqgen(simstr,
                                       scaled_trees[region],
                                       n_leaf_nodes,
                                       reco_event.genes[region],
                                       reco_event,
                                       seed=irandom))

        utils.run_cmds(
            [cfo for cfo in cmdfos if cfo is not None],
            sleep=False)  # shenanigan is to handle zero-length regional seqs

        mseqs = {}
        for ireg in range(len(utils.regions)):
            if cmdfos[ireg] is None:
                mseqs[utils.regions[ireg]] = [
                    '' for _ in range(n_leaf_nodes)
                ]  # return an empty string for each leaf node
            else:
                mseqs[utils.regions[ireg]] = self.read_bppseqgen_output(
                    cmdfos[ireg], n_leaf_nodes)

        assert len(reco_event.final_seqs) == 0
        for iseq in range(n_leaf_nodes):
            seq = mseqs['v'][iseq] + mseqs['d'][iseq] + mseqs['j'][iseq]
            seq = reco_event.revert_conserved_codons(
                seq
            )  # if mutation screwed up the conserved codons, just switch 'em back to what they were to start with
            reco_event.final_seqs.append(
                seq)  # set final sequence in reco_event

        self.add_shm_indels(reco_event)
Example 13
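A paired-loci-aware version of the simulate() driver from Example 6: the naive sequence is assembled per event (padding the heavy chain when simulating paired loci), the bcr-phylo commands are run with utils.run_cmds(shell=True, n_max_procs=..., batch_system=..., allow_failure=True, debug='print'), and the mutated events are then parsed, written, and optionally plotted.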
def simulate():

    rearrange()

    glfos, naive_events = read_rearrangements()
    assert len(naive_events) == args.n_sim_events

    outdirs = [evtdir(i) for i in range(len(naive_events))]

    start = time.time()
    cmdfos = []
    if args.n_procs > 1:
        print '    starting %d events' % len(naive_events)
    uid_str_len = 6 + int(
        math.log(len(naive_events), 10)
    )  # if the final sample's going to contain many trees, it's worth making the uids longer so there are fewer collisions/duplicates
    for ievent, outdir in enumerate(outdirs):
        if args.n_sim_events > 1 and args.n_procs == 1:
            print '  %s %d' % (utils.color('blue', 'ievent'), ievent)
        if args.paired_loci:
            hline, lline = naive_events[ievent]
            naive_seq = utils.pad_nuc_seq(
                hline['naive_seq']) + lline['naive_seq']
        else:
            naive_seq = naive_events[ievent]['naive_seq']
        cfo = run_bcr_phylo(naive_seq, outdir, ievent, uid_str_len=uid_str_len
                            )  # if n_procs > 1, doesn't run, just returns cfo
        if cfo is not None:
            print '      %s %s' % (utils.color('red', 'run'), cfo['cmd_str'])
            cmdfos.append(cfo)
    if args.n_procs > 1 and len(cmdfos) > 0:
        utils.run_cmds(cmdfos,
                       shell=True,
                       n_max_procs=args.n_procs,
                       batch_system='slurm' if args.slurm else None,
                       allow_failure=True,
                       debug='print')
    print '  bcr-phylo run time: %.1fs' % (time.time() - start)

    if utils.output_exists(
            args, simfname('igh'), outlabel='mutated simu', offset=4
    ):  # i guess if it crashes during the plotting just below, this'll get confused
        return

    start = time.time()
    mutated_events = []
    for ievent, outdir in enumerate(outdirs):
        mutated_events.append(
            parse_bcr_phylo_output(glfos, naive_events, outdir, ievent))
    print '  parsing time: %.1fs' % (time.time() - start)

    print '  writing annotations to %s' % spath('mutated')
    write_simulation(glfos, mutated_events)

    if not args.only_csv_plots:
        import lbplotting
        for ievent, outdir in enumerate(outdirs):
            if args.paired_loci:
                lpair = [l['loci'][0] for l in mutated_events[ievent]]
                evtlist = mutated_events[ievent]
            else:
                lpair = None
                evtlist = [mutated_events[ievent]]
            lbplotting.plot_bcr_phylo_simulation(
                outdir + '/plots',
                outdir,
                evtlist,
                args.extrastr,
                lbplotting.metric_for_target_distance_labels[
                    args.metric_for_target_distance],
                lpair=lpair)
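
Taken together, the examples show two calling conventions for utils.run_cmds: a plain list of shell-command strings (Examples 1 and 4), or a list of "cmdfo" dicts plus a handful of keyword options (the remaining examples). The sketch below only collects the dict keys and keyword arguments that actually appear above; it is an inference from these call sites, not the authoritative signature (which lives in the project's utils module), and the paths and values are made-up placeholders.

# Minimal sketch, assuming only what the call sites above demonstrate.
import utils  # the project's own utils module, as imported by the examples

# simplest form: a list of command strings (Examples 1 and 4)
utils.run_cmds(['./adb shell input keyevent 4'])

# dict form: only 'cmd_str' appears at every call site; the other keys show up
# in some examples and so are presumably optional
cmdfo = {
    'cmd_str': './bin/mds-run.py seqs.fa --plotdir plots',  # shell command to run
    'workdir': '/tmp/work/0',                 # per-process working directory
    'logdir': '/tmp/logs/0',                  # where log files are written
    'outfname': '/tmp/plots/icluster-0.svg',  # expected output file
    'workfnames': ['/tmp/work/0/seqs.fa'],    # temporary input files (Example 10)
}

# keyword arguments observed across the examples (each comment notes where):
utils.run_cmds([cmdfo],
               n_max_procs=2,                 # cap on concurrent jobs (Examples 6, 11, 13)
               shell=True,                    # run through a shell (Examples 6, 13)
               batch_system='slurm',          # or None for local runs (Examples 6, 13)
               allow_failure=True,            # tolerate non-zero exit codes (Examples 6, 13)
               sleep=False,                   # bppseqgen call sites (Examples 2, 9, 12)
               ignore_stderr=True,            # muscle/raxml call sites (Example 7)
               clean_on_success=True,         # MDS plotting (Example 10)
               proc_limit_str='test/cf-tree-metrics',  # Example 11
               debug='write')                 # also 'print' and 'write:<logfile>' above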