Example #1
0
def parse_ramesh_seqs(glseqs, outdir, debug=False):
    for locus in glseqs:
        glutils.remove_glfo_files(outdir, locus)
        # write to a glfo dir without extra info
        for region in glseqs[locus]:
            fn = glutils.get_fname(outdir, locus, region)
            if not os.path.exists(os.path.dirname(fn)):
                os.makedirs(os.path.dirname(fn))
            with open(fn, 'w') as ofile:
                for gene, seq in glseqs[locus][region].items():
                    ofile.write('>%s\n%s\n' % (gene, seq))

        # figure out extra info
        template_glfo = glutils.read_glfo('data/germlines/macaque', locus)
        glfo = glutils.read_glfo(outdir,
                                 locus,
                                 template_glfo=template_glfo,
                                 remove_bad_genes=True,
                                 debug=True)

        # trim non-coding stuff upstream of v (and remove non-full-length ones)
        gene_groups = {}
        for region in ['v']:
            group_labels = sorted(
                set([utils.gene_family(g) for g in glfo['seqs'][region]]))
            gene_groups[region] = [(glabel, {
                g: glfo['seqs'][region][g]
                for g in glfo['seqs'][region] if utils.gene_family(g) == glabel
            }) for glabel in group_labels]
        for region in [r for r in utils.regions if r in gene_groups]:
            if debug:
                print '%s' % utils.color('reverse_video',
                                         utils.color('green', region))
            for group_label, group_seqs in gene_groups[
                    region]:  # ok, this isn't really doing anything any more
                if debug:
                    print '  %s' % utils.color('blue', group_label)
                for gene, seq in group_seqs.items():
                    trim_and_remove_genes(region,
                                          gene,
                                          seq,
                                          glfo,
                                          template_glfo,
                                          debug=debug)

        # remove any seqs with ambiguous bases
        for region in [r for r in utils.regions if r in glfo['seqs']]:
            for gene, seq in glfo['seqs'][region].items():
                if utils.ambig_frac(seq) > 0.:
                    if debug:
                        print '   %d ambiguous bases: %s' % (
                            len(seq) * utils.ambig_frac(seq),
                            utils.color_gene(gene))
                    glutils.remove_gene(glfo, gene)

        # glutils.print_glfo(glfo)

        # write final result
        glutils.write_glfo(outdir, glfo, debug=True)
Example #2
0
def run_partis_parameter_cache(args, method):
    if utils.output_exists(args, get_outfname(args, method)):
        return

    paramdir = args.outdir + '/' + method
    plotdir = args.outdir + '/' + method + '/plots'

    # remove any old sw cache files
    sw_cachefiles = glob.glob(paramdir + '/sw-cache-*.csv')
    if len(sw_cachefiles) > 0:
        for cachefname in sw_cachefiles:
            check_call(['rm', '-v', cachefname])
            sw_cache_gldir = cachefname.replace('.csv', '-glfo')
            if os.path.exists(
                    sw_cache_gldir
            ):  # if stuff fails halfway through, you can get one but not the other
                glutils.remove_glfo_files(sw_cache_gldir, args.locus)
                # os.rmdir(sw_cache_gldir)

    # generate germline set and cache parameters
    cmd_str = args.partis_path + ' cache-parameters --infname ' + args.simfname + ' --only-smith-waterman'
    cmd_str += ' --initial-germline-dir %s' % args.default_germline_dir
    if method == 'partis':
        cmd_str += ' --debug-allele-finding'  # --always-find-new-alleles'
        cmd_str += ' --is-simu --simulation-germline-dir ' + args.outdir + '/germlines/simulation'  # alleleclusterer is the only one that really uses this, but for now I want its dbg output to have the sim info
        if args.allele_cluster:
            cmd_str += ' --allele-cluster'
            if args.kmeans_allele_cluster:
                cmd_str += ' --kmeans-allele-cluster'
    elif method == 'full':
        cmd_str += ' --leave-default-germline'
    else:
        assert False

    if args.species != 'human':
        cmd_str += ' --species %s' % args.species

    cmd_str += ' --n-procs ' + str(args.n_procs)
    if args.n_max_queries is not None:
        cmd_str += ' --n-max-queries ' + str(
            args.n_max_queries
        )  # NOTE do *not* use --n-random-queries, since it'll change the cluster size distribution
    if args.slurm:
        cmd_str += ' --batch-system slurm'

    if not args.gls_gen:  # otherwise it uses the default (full) germline dir
        cmd_str += ' --initial-germline-dir ' + args.inf_glfo_dir  # --dont-remove-unlikely-alleles

    cmd_str += ' --parameter-dir ' + paramdir
    cmd_str += ' --plotdir ' + plotdir
    if args.seed is not None:
        cmd_str += ' --seed ' + str(args.seed)
    if args.plot_and_fit_absolutely_everything is not None:
        cmd_str += ' --plot-and-fit-absolutely-everything ' + str(
            args.plot_and_fit_absolutely_everything)
    utils.simplerun(cmd_str, dryrun=args.dryrun)
Example #3
0
    def write(
        self, base_outdir
    ):  # NOTE most of the time in here is taken up by mutefrequer.finalize() (if it plot() wasn't called first, that is)
        print '    writing parameters to %s' % base_outdir,
        sys.stdout.flush()
        start = time.time()

        if os.path.exists(base_outdir + '/' + glutils.glfo_dir):
            for tmploc in [
                    l for l in utils.loci
                    if os.path.exists(base_outdir + '/' + glutils.glfo_dir +
                                      '/' + l)
            ]:
                glutils.remove_glfo_files(base_outdir + '/' + glutils.glfo_dir,
                                          tmploc,
                                          print_warning=False)
        utils.prep_dir(
            base_outdir,
            subdirs=('hmms', 'mute-freqs', glutils.glfo_dir),
            wildlings=('*.csv', '*.yaml', '*.fasta')
        )  # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary

        self.mfreqer.write(
            base_outdir + '/mute-freqs',
            mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv'
        )  # REGION is replace by each region in the three output files)
        genes_with_counts = [
            g[0] for r in utils.regions
            for g in self.counts[r + '_gene'].keys()
        ]
        glutils.write_glfo(base_outdir + '/' + glutils.glfo_dir,
                           self.glfo,
                           only_genes=genes_with_counts,
                           debug=False)

        for column in self.counts:
            index = None
            outfname = None
            if column in self.no_write_columns:
                continue
            elif column == 'all':
                index = tuple(list(utils.index_columns) + [
                    'cdr3_length',
                ])
                outfname = base_outdir + '/' + utils.get_parameter_fname(
                    column='all')
            elif '_content' in column or column == 'cluster_size':
                index = [
                    column,
                ]
                outfname = base_outdir + '/' + column + '.csv'
            else:
                index = [
                    column,
                ] + utils.column_dependencies[column]
                outfname = base_outdir + '/' + utils.get_parameter_fname(
                    column_and_deps=index)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with open(outfname, 'w') as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append('count')
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    line = {}
                    for ic in range(len(key)):
                        line[index[ic]] = key[ic]
                    line['count'] = count
                    out_data.writerow(line)

        print '(%.1f sec)' % (time.time() - start)
def run_test(args):
    print 'seed %d' % args.seed
    label = 'test'  #get_label(existing_genes, new_allele)
    simfname = args.outdir + '/simu-' + label + '.csv'
    outpdir = args.outdir + '/simu-' + label
    plotdir = args.outdir + '/simu-' + label + '-plots'

    # simulate
    if not args.nosim:
        cmd_str = base_cmd + ' simulate --n-sim-events ' + str(args.n_sim_events) + ' --n-leaves ' + str(args.n_leaves) + ' --constant-number-of-leaves --rearrange-from-scratch --outfname ' + simfname
        cmd_str += ' --mutation-multiplier ' + str(args.mut_mult)

        cmd_str += ' --n-procs ' + str(args.n_procs)
        if args.slurm:
            cmd_str += ' --batch-system slurm --subsimproc'

        if args.gen_gset:
            cmd_str += ' --generate-germline-set'
            cmd_str += ' --n-genes-per-region 1:5:3'
            cmd_str += ' --n-alleles-per-gene 2,3:1,2:1,2'
        else:
            simulation_genes = ':'.join(args.sim_v_genes + args.dj_genes)
            sglfo = glutils.read_glfo('data/germlines/human', chain=chain, only_genes=simulation_genes.split(':'))

            added_snp_names = None
            if args.snp_positions is not None:
                snps_to_add = [{'gene' : args.sim_v_genes[ig], 'positions' : args.snp_positions[ig]} for ig in range(len(args.sim_v_genes))]
                added_snp_names = glutils.add_some_snps(snps_to_add, sglfo, debug=True, remove_template_genes=args.remove_template_genes)

            if args.allele_prevalence_freqs is not None:
                if len(args.allele_prevalence_freqs) != len(sglfo['seqs']['v']):
                    raise Exception('--allele-prevalence-freqs not the right length')
                gene_list = sorted(sglfo['seqs']['v']) if added_snp_names is None else list(set(args.sim_v_genes)) + added_snp_names
                prevalence_freqs = {'v' : {g : f for g, f in zip(gene_list, args.allele_prevalence_freqs)}, 'd' : {}, 'j' : {}}
                glutils.write_allele_prevalence_freqs(prevalence_freqs, args.workdir + '/allele-prevalence-freqs.csv')
                cmd_str += ' --allele-prevalence-fname ' + args.workdir + '/allele-prevalence-freqs.csv'

            glutils.write_glfo(args.outdir + '/germlines/simulation', sglfo)
            cmd_str += ' --initial-germline-dir ' + args.outdir + '/germlines/simulation'

        if args.seed is not None:
            cmd_str += ' --seed ' + str(args.seed)
        run(cmd_str)

    # remove any old sw cache files
    sw_cachefiles = glob.glob(outpdir + '/sw-cache-*.csv')
    if len(sw_cachefiles) > 0:
        for cachefname in sw_cachefiles:
            check_call(['rm', '-v', cachefname])
            sw_cache_gldir = cachefname.replace('.csv', '-glfo')
            if os.path.exists(sw_cache_gldir):  # if stuff fails halfway through, you can get one but not the other
                glutils.remove_glfo_files(sw_cache_gldir, chain)
                # os.rmdir(sw_cache_gldir)

    # generate germline set and cache parameters
    cmd_str = base_cmd + ' cache-parameters --infname ' + simfname + ' --only-smith-waterman --debug-allele-finding --always-find-new-alleles --n-max-allele-finding-iterations 2' # --dont-collapse-clones'
    # cmd_str = 'python -m cProfile -s tottime -o prof.out ' + cmd_str
    cmd_str += ' --n-procs ' + str(args.n_procs)
    if args.slurm:
        cmd_str += ' --batch-system slurm'

    if args.gen_gset:
        cmd_str += ' --find-new-alleles'
    else:
        inference_genes = ':'.join(args.inf_v_genes + args.dj_genes)
        iglfo = glutils.read_glfo('data/germlines/human', chain=chain, only_genes=inference_genes.split(':'), debug=True)
        glutils.write_glfo(args.outdir + '/germlines/inference', iglfo)
        cmd_str += ' --initial-germline-dir ' + args.outdir + '/germlines/inference'
        cmd_str += ' --find-new-alleles --dont-remove-unlikely-alleles'  # --new-allele-fname ' + args.outdir + '/new-alleles.fa'
        # cmd_str += ' --n-max-snps 12'

    cmd_str += ' --parameter-dir ' + outpdir
    cmd_str += ' --plotdir ' + plotdir
    if args.seed is not None:
        cmd_str += ' --seed ' + str(args.seed)
    run(cmd_str)
Example #5
0
def run_test(args):
    print 'seed %d' % args.seed
    label = 'test'  #get_label(existing_genes, new_allele)
    simfname = args.outdir + '/simu-' + label + '.csv'
    outpdir = args.outdir + '/simu-' + label
    plotdir = args.outdir + '/simu-' + label + '-plots'

    # simulate
    if not args.nosim:
        cmd_str = base_cmd + ' simulate --n-sim-events ' + str(
            args.n_sim_events) + ' --n-leaves ' + str(
                args.n_leaves
            ) + ' --rearrange-from-scratch --outfname ' + simfname
        if args.n_leaf_distribution is None:
            cmd_str += ' --constant-number-of-leaves'
        else:
            cmd_str += ' --n-leaf-distribution ' + args.n_leaf_distribution
        if args.mut_mult is not None:
            cmd_str += ' --mutation-multiplier ' + str(args.mut_mult)

        cmd_str += ' --n-procs ' + str(args.n_procs)
        if args.slurm:
            cmd_str += ' --batch-system slurm --subsimproc'

        # figure what genes we're using
        if args.gen_gset:
            cmd_str += ' --generate-germline-set'
            cmd_str += ' --n-genes-per-region 1:5:3'
            cmd_str += ' --n-alleles-per-gene 2,3:1,2:1,2'
        else:
            simulation_genes = ':'.join(args.sim_v_genes + args.dj_genes)
            sglfo = glutils.read_glfo('data/germlines/human',
                                      locus=locus,
                                      only_genes=simulation_genes.split(':'))

            added_snp_names = None
            if args.snp_positions is not None:
                snps_to_add = [{
                    'gene': args.sim_v_genes[ig],
                    'positions': args.snp_positions[ig]
                } for ig in range(len(args.sim_v_genes))]
                added_snp_names = glutils.add_some_snps(
                    snps_to_add,
                    sglfo,
                    debug=True,
                    remove_template_genes=args.remove_template_genes)

            if args.allele_prevalence_freqs is not None:
                if len(args.allele_prevalence_freqs) != len(
                        sglfo['seqs']['v']
                ):  # already checked when parsing args, but, you know...
                    raise Exception(
                        '--allele-prevalence-freqs not the right length')
                gene_list = sorted(
                    sglfo['seqs']['v']) if added_snp_names is None else list(
                        set(args.sim_v_genes)) + added_snp_names
                prevalence_freqs = {
                    'v': {
                        g: f
                        for g, f in zip(gene_list,
                                        args.allele_prevalence_freqs)
                    },
                    'd': {},
                    'j': {}
                }
                glutils.write_allele_prevalence_freqs(
                    prevalence_freqs,
                    args.workdir + '/allele-prevalence-freqs.csv')
                cmd_str += ' --allele-prevalence-fname ' + args.workdir + '/allele-prevalence-freqs.csv'

            print '  simulating with %d v: %s' % (len(
                sglfo['seqs']['v']), ' '.join(
                    [utils.color_gene(g) for g in sglfo['seqs']['v']]))
            glutils.write_glfo(args.outdir + '/germlines/simulation', sglfo)
            cmd_str += ' --initial-germline-dir ' + args.outdir + '/germlines/simulation'

        # run simulation
        if args.seed is not None:
            cmd_str += ' --seed ' + str(args.seed)
        run(cmd_str)

    # remove any old sw cache files
    sw_cachefiles = glob.glob(outpdir + '/sw-cache-*.csv')
    if len(sw_cachefiles) > 0:
        for cachefname in sw_cachefiles:
            check_call(['rm', '-v', cachefname])
            sw_cache_gldir = cachefname.replace('.csv', '-glfo')
            if os.path.exists(
                    sw_cache_gldir
            ):  # if stuff fails halfway through, you can get one but not the other
                glutils.remove_glfo_files(sw_cache_gldir, locus)
                # os.rmdir(sw_cache_gldir)

    # generate germline set and cache parameters
    cmd_str = base_cmd + ' cache-parameters --infname ' + simfname + ' --only-smith-waterman --debug-allele-finding --always-find-new-alleles --n-max-allele-finding-iterations 2'  # --dont-collapse-clones'
    # cmd_str = 'python -m cProfile -s tottime -o prof.out ' + cmd_str
    cmd_str += ' --n-procs ' + str(args.n_procs)
    if args.n_max_queries is not None:
        cmd_str += ' --n-max-queries ' + str(
            args.n_max_queries
        )  # NOTE do *not* use --n-random-queries, since it'll change the cluster size distribution
    if args.slurm:
        cmd_str += ' --batch-system slurm'

    cmd_str += ' --find-new-alleles'

    if args.gen_gset:
        pass  # i.e. uses default (full) germline dir
    else:
        cmd_str += ' --dont-remove-unlikely-alleles'  # --new-allele-fname ' + args.outdir + '/new-alleles.fa'
        inference_genes = ':'.join(args.inf_v_genes + args.dj_genes)
        iglfo = glutils.read_glfo('data/germlines/human',
                                  locus=locus,
                                  only_genes=inference_genes.split(':'))
        print '  starting inference with %d v: %s' % (len(
            iglfo['seqs']['v']), ' '.join(
                [utils.color_gene(g) for g in iglfo['seqs']['v']]))
        glutils.write_glfo(args.outdir + '/germlines/inference', iglfo)
        cmd_str += ' --initial-germline-dir ' + args.outdir + '/germlines/inference'
        # cmd_str += ' --n-max-snps 12'

    cmd_str += ' --parameter-dir ' + outpdir
    cmd_str += ' --only-overall-plots --plotdir ' + plotdir
    if args.seed is not None:
        cmd_str += ' --seed ' + str(args.seed)
    run(cmd_str)