Esempio n. 1
0
def run_igdiscover(infname, outfname, outdir):
    """Run igdiscover on <infname>, and write the germline set it infers so that <outfname> exists.

    Returns immediately if utils.output_exists() says that <outfname> is already there. Otherwise:
      - prepares <outdir> with prepare_igdiscover_outdir()
      - if --n-random-queries is set, runs on a subsampled fasta in <outdir> (an existing one is reused as-is)
      - runs the igdiscover shell commands with utils.simplerun() (passing <outdir>/run.sh as the cmdfname)
      - converts igdiscover's output fasta to a glfo, and writes it with glutils.write_glfo()

    Configuration is read from the module-level <args>.
    Raises AssertionError if <outfname> is not the file that glutils.get_fname() gives for the germline dir
    that we deduce from it.
    """
    if utils.output_exists(args, outfname):
        return

    prepare_igdiscover_outdir(outdir)

    if args.n_random_queries is not None:
        sub_infname = outdir + '/' + os.path.basename(infname.replace(utils.getsuffix(infname), '-n-random-queries-%d%s' % (args.n_random_queries, utils.getsuffix(infname))))
        if os.path.exists(sub_infname):
            print('    --n-random-queries: leaving existing fasta for igdiscover (hopefully it has %d queries)' % args.n_random_queries)
        else:
            print('    --n-random-queries: writing new fasta for igdiscover (%d queries)' % args.n_random_queries)
            seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq']))
        infname = sub_infname  # from here on, igdiscover only sees the subsampled file

    igdiscover_outfname = outdir + '/work/final/database/%s.fasta' % args.region.upper()

    cmds = getpathcmd()
    cmds += ['conda activate %s' % args.env_label]
    cmds += ['cd %s' % outdir]
    cmds += ['igdiscover init --db db --single-reads %s work' % infname]  # prepares to run, putting files into <outdir>
    cmds += ['cp %s work/' % os.path.basename(args.yamlfname)]  # NOTE(review): presumably prepare_igdiscover_outdir() already copied the yaml file into <outdir> -- confirm
    cmds += ['cd work']
    cmds += ['igdiscover run']
    utils.simplerun('\n'.join(cmds) + '\n', cmdfname=outdir + '/run.sh', print_time='igdiscover', debug=True)

    template_gldir = args.glfo_dir  # used to fall back to a default germline dir, but --glfo-dir is required now
    glfo = glutils.create_glfo_from_fasta(igdiscover_outfname, args.locus, args.region, template_gldir, simulation_germline_dir=args.simulation_germline_dir)

    # Remove the trailing '/<locus>' subdir to get the germline dir. NOTE str.rstrip() is *not* appropriate here: it
    # treats its argument as a set of characters, so it would also eat into any parent dir ending in those characters.
    out_gldir = os.path.dirname(outfname)
    locus_subdir = '/' + args.locus
    if out_gldir.endswith(locus_subdir):
        out_gldir = out_gldir[:-len(locus_subdir)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo, debug=True)
Esempio n. 2
0
def parse_ramesh_seqs(glseqs, outdir, debug=False):
    """Clean up the germline sequences in <glseqs>, and write them as a glfo dir to <outdir>.

    <glseqs> is indexed as glseqs[locus][region][gene], with the sequence as the value. For each locus we:
      - remove any existing glfo files in <outdir>, and write the bare sequences there as fasta files
      - read them back in with the macaque germline set as a template (removing bad genes)
      - pass each v gene to trim_and_remove_genes()
      - remove any genes that have ambiguous bases
      - write the final glfo back to <outdir>
    """
    for locus in glseqs:
        glutils.remove_glfo_files(outdir, locus)

        # first write a glfo dir that doesn't have any of the extra info
        for region in glseqs[locus]:
            region_seqs = glseqs[locus][region]
            fn = glutils.get_fname(outdir, locus, region)
            region_dir = os.path.dirname(fn)
            if not os.path.exists(region_dir):
                os.makedirs(region_dir)
            with open(fn, 'w') as ofile:
                ofile.writelines('>%s\n%s\n' % (gene, seq) for gene, seq in region_seqs.items())

        # then read it back in, which works out the extra info
        template_glfo = glutils.read_glfo('data/germlines/macaque', locus)
        glfo = glutils.read_glfo(outdir, locus, template_glfo=template_glfo, remove_bad_genes=True, debug=True)

        # trim non-coding stuff upstream of v (and remove non-full-length ones)
        gene_groups = {}
        for region in ['v']:
            seqs_by_family = {}
            for gene in glfo['seqs'][region]:
                seqs_by_family.setdefault(utils.gene_family(gene), {})[gene] = glfo['seqs'][region][gene]
            gene_groups[region] = [(family, seqs_by_family[family]) for family in sorted(seqs_by_family)]
        for region in utils.regions:
            if region not in gene_groups:
                continue
            if debug:
                print('%s' % utils.color('reverse_video', utils.color('green', region)))
            for group_label, group_seqs in gene_groups[region]:  # ok, this isn't really doing anything any more
                if debug:
                    print('  %s' % utils.color('blue', group_label))
                for gene, seq in group_seqs.items():
                    trim_and_remove_genes(region, gene, seq, glfo, template_glfo, debug=debug)

        # remove any seqs with ambiguous bases
        for region in utils.regions:
            if region not in glfo['seqs']:
                continue
            for gene, seq in glfo['seqs'][region].items():
                ambig_frac = utils.ambig_frac(seq)
                if not ambig_frac > 0.:
                    continue
                if debug:
                    print('   %d ambiguous bases: %s' % (len(seq) * ambig_frac, utils.color_gene(gene)))
                glutils.remove_gene(glfo, gene)

        # glutils.print_glfo(glfo)

        # write final result
        glutils.write_glfo(outdir, glfo, debug=True)
Esempio n. 3
0
    def _generate_germline_set(self,
                               n_genes_per_region="20:1:1",
                               n_sim_alleles_per_gene="1.5:1:1",
                               min_sim_allele_prevalence_freq=0.1):
        """
        Call partis's germline set simulation function and write to files

        The three arguments are passed straight through to glutils.generate_germline_set(), which is
        also given self.allele_freq_file; the resulting germline set is written to self.output_dir.

        Returns a tuple (germline_seqs, germline_freqs):
          germline_seqs: dict from allele name to sequence, read from self.ig_file. Each sequence is
              trimmed to a multiple of three bases (with a randomly chosen offset), and each "N" is
              replaced with a random nucleotide from NUCLEOTIDE_SET.
          germline_freqs: dict from allele name to prevalence, read from self.allele_freq_file, for the
              rows whose first column starts with <LOCUS>V.
        """
        PARTIS_PATH = './partis'
        sys.path.insert(1, PARTIS_PATH + '/python')
        import glutils
        glfo = glutils.read_glfo(self.GERMLINE_FOLDER + "/" + self.organism,
                                 self.locus)
        glutils.generate_germline_set(
            glfo,
            n_genes_per_region,
            n_sim_alleles_per_gene,
            min_sim_allele_prevalence_freq,
            self.allele_freq_file,
            debug=self.debug,
        )
        glutils.write_glfo(self.output_dir, glfo)

        # Read allele prevalences
        germline_freqs = dict()
        v_gene_prefix = self.locus.upper() + "V"
        with open(self.allele_freq_file, "r") as f:
            allele_reader = csv.reader(f)
            # skip the header row (the next() builtin works on both python 2 and 3, unlike the .next() method)
            next(allele_reader, None)
            for row in allele_reader:
                # csv.reader yields an empty list for a blank line, so check before indexing
                if row and row[0].startswith(v_gene_prefix):
                    germline_freqs[row[0]] = float(row[1])

        # Read the selected germline alleles
        germline_seqs = dict()
        with open(self.ig_file, "r") as f:
            lines = f.read().splitlines()
        # NOTE this assumes each record is exactly two lines: a ">name" line, followed by the whole sequence
        # (floor division, so that range() also gets an int on python 3)
        for line_idx in range(len(lines) // 2):
            allele = lines[line_idx * 2].replace(">", "")
            allele_seq = lines[line_idx * 2 + 1]

            # Trim allele until multiple of 3 - randomly pick a reading frame
            mod_seq_len = len(allele_seq) % 3
            if mod_seq_len != 0:
                # drop <offset> bases from the start, and the rest of the excess from the end
                offset = np.random.choice(mod_seq_len + 1)
                end = len(allele_seq) - (mod_seq_len - offset)
                allele_seq = allele_seq[offset:end]

            # Make sure no N in the germline sequence
            while "N" in allele_seq:
                allele_seq = allele_seq.replace(
                    "N", np.random.choice(list(NUCLEOTIDE_SET)), 1)

            germline_seqs[allele] = allele_seq

        return germline_seqs, germline_freqs
Esempio n. 4
0
def build_v_gene_set(glfo, introns):
    """Make a consensus "intronic" v gene for each d gene in <introns>, add them to <glfo>, and write the results.

    <introns> is indexed as introns[d_gene][seq], with the value being the number of times that <seq> was
    observed. For each d gene (in order of decreasing total count) the sequences are right-aligned to the longest
    one, and the consensus takes the most common base at each position. A d gene with no sequences is skipped.

    Modifies <glfo> in place, and writes two germline dirs below the module-level <outdir>:
      germlines/imgt-and-intronic: the original genes plus the new intronic v genes
      germlines/intronic: same, but after removing every v gene without 'xDx' in its name
    """
    total_d_counts = {d_gene: sum(counts.values()) for d_gene, counts in introns.items()}
    refseqs = {}
    for d_gene, _ in sorted(total_d_counts.items(), key=operator.itemgetter(1), reverse=True):
        counts = introns[d_gene]
        if len(counts) == 0:  # there's nothing to build a consensus from (and the code below would crash)
            print('  %s   no sequences, so skipping' % utils.color_gene(d_gene, width=10))
            continue

        seqs_by_length = sorted(counts, key=len, reverse=True)

        # first decide on the reference sequence: count the bases in each column, with all seqs right-aligned to the longest one
        longest_seq = seqs_by_length[0]
        column_counts = [{n: 0 for n in utils.nukes} for _ in range(len(longest_seq))]
        for seq in seqs_by_length:
            ioffset = len(longest_seq) - len(seq)  # the column of the first base of <seq>
            for ibase, base in enumerate(seq, start=ioffset):
                column_counts[ibase][base] += counts[seq]

        # max() returns the first of any tied bases, i.e. the same one as taking the first entry of a stable, reversed sort
        refseqs[d_gene] = ''.join(max(basecounts.items(), key=operator.itemgetter(1))[0] for basecounts in column_counts)

        # then summarize how many of the observed seqs match the reference
        n_ok = 0
        mutecounts = {}  # number of seqs with each number of mutations
        for seq in seqs_by_length:
            partial_refseq = refseqs[d_gene][len(refseqs[d_gene]) - len(seq):]
            if seq == partial_refseq:
                n_ok += counts[seq]
            else:
                n_mutes = utils.hamming_distance(partial_refseq, seq)
                mutecounts[n_mutes] = mutecounts.get(n_mutes, 0) + counts[seq]
        n_mutated = sum(mutecounts.values())
        print('  %s   %4d / %-4d ok' % (utils.color_gene(d_gene, width=10), n_ok, n_ok + n_mutated)),
        if len(mutecounts) > 0:
            # convert to lists, since numpy can't average over python 3's dict views
            mean_n_mutes = numpy.average(list(mutecounts.keys()), weights=list(mutecounts.values()))
            print('(mean of %.1f mutations among the other %d)' % (mean_n_mutes, n_mutated)),
        print('')

    # add the intronic v genes to glfo
    for d_gene, refseq in refseqs.items():
        dummy_v = utils.generate_dummy_v(d_gene)
        glfo['seqs']['v'][dummy_v] = refseq
        glfo['cyst-positions'][dummy_v] = len(refseq) - 3

    # write a glfo dir with everything
    glutils.write_glfo(outdir + '/germlines/imgt-and-intronic', glfo, debug=True)

    # remove the original v genes, and write a glfo dir with just the intronic ones
    # NOTE(review): presumably utils.generate_dummy_v() puts 'xDx' in the names it makes -- confirm
    glutils.remove_genes(glfo, [g for g in glfo['seqs']['v'] if 'xDx' not in g], debug=True)
    glutils.write_glfo(outdir + '/germlines/intronic', glfo, debug=True)
Esempio n. 5
0
def write_inf_glfo(args):
    """Read the default human glfo, restrict it to the specified alleles, and write it to args.inf_glfo_dir.

    The genes that are kept are args.inf_v_genes plus args.dj_genes. This writes to somewhere where all the
    methods can read it.
    """
    # NOTE this dir should *not* be modified by any of the methods
    genes_to_keep = args.inf_v_genes + args.dj_genes
    inf_glfo = glutils.read_glfo('data/germlines/human', locus=args.locus, only_genes=genes_to_keep)

    v_genes = inf_glfo['seqs']['v']
    n_v_genes = len(v_genes)
    v_gene_str = ' '.join([utils.color_gene(g) for g in v_genes])
    print('  writing initial inference glfo with %d v: %s' % (n_v_genes, v_gene_str))

    glutils.write_glfo(args.inf_glfo_dir, inf_glfo)
Esempio n. 6
0
def run_igdiscover(infname, outfname, outdir):
    """Run igdiscover on <infname>, and write the germline set it infers so that <outfname> exists.

    Returns immediately if utils.output_exists() says that <outfname> is already there. Otherwise:
      - prepares <outdir> with prepare_igdiscover_outdir()
      - if --n-random-queries is set, runs on a subsampled fasta in <outdir> (an existing one is reused as-is)
      - runs the igdiscover shell commands with utils.simplerun() (passing <outdir>/run.sh as the cmdfname)
      - converts igdiscover's output fasta to a glfo (using args.glfo_dir as the template if it's set,
        otherwise 'data/germlines/human'), and writes it with glutils.write_glfo()

    Configuration is read from the module-level <args>.
    Raises AssertionError if <outfname> is not the file that glutils.get_fname() gives for the germline dir
    that we deduce from it.
    """
    if utils.output_exists(args, outfname):
        return

    prepare_igdiscover_outdir(outdir)

    if args.n_random_queries is not None:
        sub_infname = outdir + '/' + os.path.basename(
            infname.replace(
                utils.getsuffix(infname), '-n-random-queries-%d%s' %
                (args.n_random_queries, utils.getsuffix(infname))))
        if os.path.exists(sub_infname):
            print('    --n-random-queries: leaving existing fasta for igdiscover (hopefully it has %d queries)' % args.n_random_queries)
        else:
            print('    --n-random-queries: writing new fasta for igdiscover (%d queries)' % args.n_random_queries)
            seqfos = utils.read_fastx(infname,
                                      n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' %
                                     (seqfo['name'], seqfo['seq']))
        infname = sub_infname  # from here on, igdiscover only sees the subsampled file

    igdiscover_outfname = outdir + '/work/final/database/%s.fasta' % args.region.upper()

    cmds = ['#!/bin/bash']
    cmds += ['export PATH=%s:$PATH' % args.condapath]
    # otherwise it finds the pip-installed packages in .local and breaks (see https://github.com/conda/conda/issues/448)
    cmds += ['export PYTHONNOUSERSITE=True']
    cmds += ['cd %s' % outdir]
    # prepares to run, putting files into <outdir>
    cmds += ['igdiscover init --db db --single-reads %s work' % infname]
    # NOTE(review): presumably prepare_igdiscover_outdir() already copied the yaml file into <outdir> -- confirm
    cmds += ['cp %s work/' % os.path.basename(args.yamlfname)]
    cmds += ['cd work']
    cmds += ['igdiscover run']
    utils.simplerun('\n'.join(cmds) + '\n',
                    cmdfname=outdir + '/run.sh',
                    print_time='igdiscover',
                    debug=True)

    template_gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.create_glfo_from_fasta(
        igdiscover_outfname,
        args.locus,
        args.region,
        template_gldir,
        simulation_germline_dir=args.simulation_germline_dir)

    # Remove the trailing '/<locus>' subdir to get the germline dir. NOTE str.rstrip() is *not* appropriate here: it
    # treats its argument as a set of characters, so it would also eat into any parent dir ending in those characters.
    out_gldir = os.path.dirname(outfname)
    locus_subdir = '/' + args.locus
    if out_gldir.endswith(locus_subdir):
        out_gldir = out_gldir[:-len(locus_subdir)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo, debug=True)
Esempio n. 7
0
def simulate(args):
    """Build the partis simulation command, write the germline set to simulate from, and run the command.

    Returns immediately if utils.output_exists() says that args.simfname is already there. The germline set is
    either generated with glutils.generate_germline_set() (if args.gls_gen is set), or else is made from
    args.sim_v_genes + args.dj_genes plus any new alleles from args.new_allele_info. In both cases it is written
    to <args.outdir>/germlines/simulation, which is passed to partis as --initial-germline-dir.

    Raises Exception if --allele-prevalence-freqs isn't normalized, or isn't the same length as the v gene set.
    """
    if utils.output_exists(args, args.simfname):
        return

    cmd_str = (args.partis_path
               + ' simulate --n-sim-events ' + str(args.n_sim_events)
               + ' --outfname ' + args.simfname
               + ' --n-leaves ' + str(args.n_leaves)
               + ' --rearrange-from-scratch --shm-parameter-dir ' + partis_dir + '/data/recombinator/scratch-parameters')
    if args.n_leaf_distribution is not None:
        cmd_str += ' --n-leaf-distribution ' + args.n_leaf_distribution
    else:
        cmd_str += ' --constant-number-of-leaves'
    optional_values = (
        (' --mutation-multiplier ', args.mut_mult),
        (' --root-mrca-weibull-parameter ', args.root_mrca_weibull_parameter),
    )
    for flag, value in optional_values:
        if value is not None:
            cmd_str += flag + str(value)

    cmd_str += ' --n-procs ' + str(args.n_procs)
    if args.slurm:
        cmd_str += ' --batch-system slurm --subsimproc'

    prevalence_fname = args.workdir + '/allele-prevalence-freqs.csv'

    # figure what genes we're using
    if args.gls_gen:
        assert args.sim_v_genes is None and args.allele_prevalence_freqs is None

        sim_glfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus)
        glutils.remove_v_genes_with_bad_cysteines(sim_glfo)
        glutils.generate_germline_set(sim_glfo,
                                      args.n_genes_per_region,
                                      args.n_sim_alleles_per_gene,
                                      args.min_allele_prevalence_freq,
                                      prevalence_fname,
                                      new_allele_info=args.new_allele_info,
                                      dont_remove_template_genes=args.dont_remove_template_genes,
                                      debug=True)
        cmd_str += ' --allele-prevalence-fname ' + prevalence_fname
    else:
        sim_glfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus, only_genes=(args.sim_v_genes + args.dj_genes))
        # NOTE template gene removal is the default for glutils.generate_germline_set
        added_snp_names = glutils.generate_new_alleles(sim_glfo,
                                                       args.new_allele_info,
                                                       debug=True,
                                                       remove_template_genes=(not args.dont_remove_template_genes))

        if args.allele_prevalence_freqs is not None:
            if not utils.is_normed(args.allele_prevalence_freqs):
                raise Exception('--allele-prevalence-freqs %s not normalized' % args.allele_prevalence_freqs)
            n_freqs, n_v_genes = len(args.allele_prevalence_freqs), len(sim_glfo['seqs']['v'])
            if n_freqs != n_v_genes:  # already checked when parsing args, but, you know...
                raise Exception('--allele-prevalence-freqs %d not the same length as sglfo %d' % (len(args.allele_prevalence_freqs), len(sim_glfo['seqs']['v'])))
            if len(added_snp_names) == 0:
                gene_list = sorted(sim_glfo['seqs']['v'])
            else:
                gene_list = list(set(args.sim_v_genes)) + added_snp_names
            prevalence_freqs = {
                'v': dict(zip(gene_list, args.allele_prevalence_freqs)),
                'd': {},
                'j': {},
            }
            glutils.write_allele_prevalence_freqs(prevalence_freqs, prevalence_fname)
            cmd_str += ' --allele-prevalence-fname ' + prevalence_fname

    glutils.write_glfo(args.outdir + '/germlines/simulation', sim_glfo)
    cmd_str += ' --initial-germline-dir ' + args.outdir + '/germlines/simulation'
    # glutils.print_glfo(sim_glfo)

    # run simulation
    if args.seed is not None:
        cmd_str += ' --seed ' + str(args.seed)
    utils.simplerun(cmd_str, dryrun=args.dry_run)
Esempio n. 8
0
def run_test(simulation_v_genes, inference_v_genes, dj_genes, seed=None):
    """Simulate with a germline set that has some extra snp'd alleles, then run allele finding on the result.

    <simulation_v_genes>, <inference_v_genes>, and <dj_genes> are colon-separated strings of gene names. The
    germline sets for simulation and for inference are written below the module-level <outdir>, and the two
    partis commands are built from the module-level <base_cmd> and passed to run(). If <seed> is set, it
    seeds python's random module and is passed to both commands.
    """
    if seed is not None:
        random.seed(seed)

    label = 'test'  #get_label(existing_genes, new_allele)
    simfname = outdir + '/simu-' + label + '.csv'
    outpdir = outdir + '/simu-' + label
    www_dir = os.getenv('www')
    if www_dir is None:
        www_dir = '_www'
    plotdir = www_dir + '/partis/allele-finding/' + label

    snps_to_add = [
        {'gene' : 'IGHV3-71*01', 'positions' : (35, )},
        {'gene' : 'IGHV3-71*01', 'positions' : (35, 50)},
        # {'gene' : 'IGHV3-71*01', 'positions' : (35, 45, 20, 50, 77)},
        # {'gene' : 'IGHV3-71*01', 'positions' : (35, 60, 50)},
        # {'gene' : 'IGHV1-18*01', 'positions' : (100, 101)},
        # {'gene' : 'IGHV1-18*01', 'positions' : (20, )}
    ]
    simulation_genes = (simulation_v_genes + ':' + dj_genes).split(':')
    sglfo = glutils.read_glfo('data/imgt', chain=chain, only_genes=simulation_genes, debug=True)
    glutils.add_some_snps(snps_to_add, sglfo, remove_template_genes=False, debug=True)
    glutils.write_glfo(outdir + '/germlines-for-simulation', sglfo)

    # simulate
    sim_cmd_parts = [
        base_cmd + ' simulate --n-sim-events 1000 --n-procs 10 --simulate-partially-from-scratch --mutation-multiplier 0.5',
        ' --initial-datadir ' + outdir + '/germlines-for-simulation',
        ' --outfname ' + simfname,
    ]
    if seed is not None:
        sim_cmd_parts.append(' --seed ' + str(seed))
    run(''.join(sim_cmd_parts))

    inference_genes = (inference_v_genes + ':' + dj_genes).split(':')
    iglfo = glutils.read_glfo('data/imgt', chain=chain, only_genes=inference_genes, debug=True)
    glutils.write_glfo(outdir + '/germlines-for-inference', iglfo)

    # generate germline set and cache parameters
    inf_cmd_parts = [
        base_cmd + ' cache-parameters --infname ' + simfname + ' --n-procs 10 --only-smith-waterman',
        ' --find-new-alleles --new-allele-fname ' + outdir + '/new-alleles.fa',
        # ' --generate-germline-set',
        '  --debug-new-allele-finding',
        ' --initial-datadir ' + outdir + '/germlines-for-inference',
        ' --parameter-dir ' + outpdir,
        ' --plotdir ' + plotdir,
    ]
    if seed is not None:
        inf_cmd_parts.append(' --seed ' + str(seed))
    run(''.join(inf_cmd_parts))
Esempio n. 9
0
def run_tigger(infname, outfname, outdir):
    """Run tigger (in R) on the annotations in <infname>, and write the genotype it infers so that <outfname> exists.

    Returns immediately if utils.output_exists() says that <outfname> is already there. Otherwise writes the R
    commands to a temporary file in args.workdir, runs them, and then post-processes the fasta that tigger wrote
    to <outdir>/tigger.fasta: starting from the glfo in args.glfo_dir (or 'data/germlines/human' if that's not
    set), alleles that tigger wrote but which we don't have are added, and genes that tigger didn't write are removed.

    Configuration is read from the module-level <args>.
    Raises AssertionError if <outfname> is not the file that glutils.get_fname() gives for the germline dir
    that we deduce from it.
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    rcmds = ['library(tigger)', 'library(dplyr)']
    # rcmds += ['data(sample_db, germline_ighv)']

    db_name = 'annotations'
    gls_name = 'gls'
    rcmds += ['%s = read.csv("%s", sep="\t")' % (db_name, infname)]
    rcmds += ['%s = readIgFasta("%s")' % (gls_name, get_glfname('v', aligned=True))]

    tigger_outfname = outdir + '/tigger.fasta'
    rcmds += ['novel_df = findNovelAlleles(%s, %s, germline_min=2, nproc=%d)' % (db_name, gls_name, args.n_procs)]
    rcmds += ['geno = inferGenotype(%s, find_unmutated = FALSE, germline_db = %s, novel_df = novel_df)' % (db_name, gls_name)]
    rcmds += ['genotype_seqs = genotypeFasta(geno, %s, novel_df)' % (gls_name)]
    rcmds += ['writeFasta(genotype_seqs, "%s")' % tigger_outfname]
    cmdfname = args.workdir + '/tigger-in.cmd'
    with open(cmdfname, 'w') as cmdfile:
        cmdfile.write('\n'.join(rcmds) + '\n')
    cmdstr = 'R --slave -f ' + cmdfname
    utils.simplerun(cmdstr, shell=True, print_time='tigger')

    # post-process tigger .fa
    gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.read_glfo(gldir, args.locus)
    tigger_alleles = set()
    for seqfo in utils.read_fastx(tigger_outfname):
        seq = seqfo['seq'].replace(utils.gap_chars[0], '')  # it should be just dots...
        tigger_alleles.add(seqfo['name'])
        if seqfo['name'] not in glfo['seqs'][args.region]:
            newfo = {'gene' : seqfo['name'], 'seq' : seq}
            use_template_for_codon_info = False
            if '+' in newfo['gene']:  # NOTE(review): presumably tigger names novel alleles <template gene>+<mutations> -- confirm
                newfo['template-gene'] = newfo['gene'].split('+')[0]
                use_template_for_codon_info = True
            glutils.add_new_allele(glfo, newfo, use_template_for_codon_info=use_template_for_codon_info, debug=True)
        elif glfo['seqs'][args.region][seqfo['name']] != seq:
            print('%s different sequences in glfo and tigger output for %s:\n    %s\n    %s' % (utils.color('red', 'error'), seqfo['name'], glfo['seqs'][args.region][seqfo['name']], seqfo['seq']))
    # remove them afterwards so we can use existing ones to get codon info
    # NOTE loop over a copy of the gene names, since removing genes from a dict while iterating over it raises a RuntimeError
    for gene in list(glfo['seqs'][args.region]):
        if gene not in tigger_alleles:
            glutils.remove_gene(glfo, gene)

    # Remove the trailing '/<locus>' subdir to get the germline dir. NOTE str.rstrip() is *not* appropriate here: it
    # treats its argument as a set of characters, so it would also eat into any parent dir ending in those characters.
    out_gldir = os.path.dirname(outfname)
    locus_subdir = '/' + args.locus
    if out_gldir.endswith(locus_subdir):
        out_gldir = out_gldir[:-len(locus_subdir)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo)

    os.remove(cmdfname)
Esempio n. 10
0
    def write(self, base_outdir):
        """Write everything we've counted to <base_outdir>.

        This is the mutation frequencies (via self.mfreqer), the germline set (restricted to the genes
        that have counts), and one csv file of counts for each column in self.counts.
        """
        print("    writing parameters"),
        sys.stdout.flush()
        t_start = time.time()

        # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary
        utils.prep_dir(base_outdir, subdirs=("hmms", "mute-freqs", "germline-sets"), wildlings=("*.csv", "*.yaml", "*.fasta"))

        # REGION is replaced by each region in the three output files
        self.mfreqer.write(base_outdir + "/mute-freqs", mean_freq_outfname=base_outdir + "/REGION-mean-mute-freqs.csv")

        genes_with_counts = []
        for region in utils.regions:
            for gene_key in self.counts[region + "_gene"].keys():
                genes_with_counts.append(gene_key[0])  # the gene name is the first entry in each key
        glutils.write_glfo(base_outdir + "/" + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=False)

        for column in self.counts:
            # work out the names of the csv columns that index the counts, and the name of the output file
            if column == "all":
                index = tuple(list(utils.index_columns) + ["cdr3_length"])
                outfname = base_outdir + "/" + utils.get_parameter_fname(column="all")
            elif "_content" in column:
                index = [column]
                outfname = base_outdir + "/" + column + ".csv"
            else:
                index = [column] + utils.column_dependencies[column]
                outfname = base_outdir + "/" + utils.get_parameter_fname(column_and_deps=index)

            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)

            with opener("w")(outfname) as outfile:
                writer = csv.DictWriter(outfile, list(index) + ["count"])
                writer.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    row = {}
                    for ic, key_val in enumerate(key):
                        row[index[ic]] = key_val
                    row["count"] = count
                    writer.writerow(row)

        print("(%.1f sec)" % (time.time() - t_start))
Esempio n. 11
0
def write_single_zenodo_subdir(zenodo_dir, args, study, dset, method, mfo):
    """Copy the germline set that <method> inferred on <study>/<dset> to <zenodo_dir>, plus some method-specific extras.

    The germline set is read with glutils.read_glfo() (removing orfs if 'partis' is in <method>), and written
    to <zenodo_dir>. Then, depending on <method>:
      'partis': copies the allele finding plot dirs to <zenodo_dir>/fits, and writes a gene prevalence csv for
          each region, in which the counts of any gene that isn't in the glfo are added to the gene returned by
          glutils.find_nearest_gene_using_names()
      'tigger-default': nothing else
      'igdiscover': copies the whole <gls_dir>/work/final dir

    Raises AssertionError for any other <method>, or if reassigning the counts changed their total.
    """
    method_outdir = heads.get_datadir(study, 'processed', extra_str=args.label) + '/' + dset
    gls_dir = get_gls_dir(method_outdir, method, data=True)
    print('            %s --> %s' % (gls_dir, zenodo_dir))
    glfo = glutils.read_glfo(gls_dir, mfo['locus'], remove_orfs='partis' in method)
    glutils.write_glfo(zenodo_dir, glfo)
    if method == 'partis':
        # allele finding plots
        plotdir = gls_dir.replace('hmm/germline-sets', 'plots/sw/allele-finding')
        if not os.path.exists(zenodo_dir + '/fits'):
            os.makedirs(zenodo_dir + '/fits')
        for genedir in glob.glob(plotdir + '/try-0/*'):  # would be nice to copy html, but links will be wrong
            subprocess.check_call(['cp', '-r', genedir, zenodo_dir + '/fits/'])

        # csv prevalence files
        for tmpreg in utils.regions:
            with open(gls_dir.replace('/germline-sets', '/%s_gene-probs.csv' % tmpreg)) as infile:
                reader = csv.DictReader(infile)
                countfo = {line['%s_gene' % tmpreg] : int(line['count']) for line in reader}
            old_total = sum(countfo.values())

            # this is kind of dangerous... but the genes are read from the same parameter dir that we're reading this prevalence file, so the only way it's gonna be missing is if we just removed it with the read_glfo() line above
            orf_genes = [g for g in countfo if g not in glfo['seqs'][tmpreg]]
            for ogene in orf_genes:
                nearest_gene = glutils.find_nearest_gene_using_names(glfo, ogene)
                # print '  adding %d to %s from %s' % (countfo[ogene], utils.color_gene(nearest_gene), utils.color_gene(ogene))
                # use .get() so that a nearest gene that didn't have any counts of its own doesn't crash with a KeyError
                countfo[nearest_gene] = countfo.get(nearest_gene, 0) + countfo[ogene]
            for ogene in orf_genes:
                del countfo[ogene]
            assert old_total == sum(countfo.values())  # we should only have moved counts around

            with open('%s/%s_gene-probs.csv' % (zenodo_dir, tmpreg), 'w') as outfile:
                writer = csv.DictWriter(outfile, ('%s_gene' % tmpreg, 'count'))
                writer.writeheader()
                for gene in countfo:
                    writer.writerow({'%s_gene' % tmpreg : gene, 'count' : countfo[gene]})
    elif method == 'tigger-default':
        # doesn't seem to have written anything
        pass
    elif method == 'igdiscover':
        # for fname in ['errorhistograms.pdf', 'V_usage.pdf', 'V_usage.tab']:
        #     subprocess.check_call(['cp', '%s/work/final/%s' % (gls_dir, fname), zenodo_dir + '/'])
        subprocess.check_call(['cp', '-r', '%s/work/final' % gls_dir, zenodo_dir + '/'])  # aw, screw it, just write everything. The simulation stuff is already huge, anyway
    else:
        # raise explicitly (rather than 'assert False'), so that this is still caught when running with -O
        raise AssertionError('unhandled method \'%s\'' % method)
Esempio n. 12
0
    def write(self, base_outdir, my_datadir=None):
        """Write everything we've counted to <base_outdir>.

        This is the mutation frequencies (via self.mfreqer), the germline set (restricted to the genes
        that have counts), and one csv file of counts for each column in self.counts.
        NOTE <my_datadir> isn't used in this function.
        """
        print('    writing parameters'),
        sys.stdout.flush()
        start_time = time.time()

        # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary
        utils.prep_dir(base_outdir,
                       subdirs=('hmms', 'mute-freqs', 'germline-sets'),
                       wildlings=('*.csv', '*.yaml', '*.fasta'))

        # REGION is replaced by each region in the three output files
        self.mfreqer.write(base_outdir + '/mute-freqs',
                           mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')

        # the gene name is the first entry in each key
        genes_with_counts = []
        for region in utils.regions:
            genes_with_counts += [gkey[0] for gkey in self.counts[region + '_gene'].keys()]
        glutils.write_glfo(base_outdir + '/' + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=True)

        for column in self.counts:
            # <index> is the names of the csv columns that correspond to the entries in each key
            if column == 'all':
                index = tuple(list(utils.index_columns) + ['cdr3_length'])
                outfname = base_outdir + '/' + utils.get_parameter_fname(column='all')
            elif '_content' in column:
                index = [column]
                outfname = base_outdir + '/' + column + '.csv'
            else:
                index = [column] + utils.column_dependencies[column]
                outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)

            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)

            header = list(index)
            header.append('count')
            with opener('w')(outfname) as outfile:
                csv_writer = csv.DictWriter(outfile, header)
                csv_writer.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    line = {index[ikey]: key[ikey] for ikey in range(len(key))}
                    line['count'] = count
                    csv_writer.writerow(line)

        print('(%.1f sec)' % (time.time() - start_time))
Esempio n. 13
0
    def write(self, base_outdir):
        """Write everything we've counted to <base_outdir>.

        This is the mutation frequencies (via self.mfreqer), the germline set (restricted to the genes
        that have counts), and one csv file of counts for each column in self.counts.
        """
        print('    writing parameters'),
        sys.stdout.flush()
        t0 = time.time()

        # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary
        subdirs = ('hmms', 'mute-freqs', 'germline-sets')
        wildlings = ('*.csv', '*.yaml', '*.fasta')
        utils.prep_dir(base_outdir, subdirs=subdirs, wildlings=wildlings)

        # REGION is replaced by each region in the three output files
        mean_freq_outfname = base_outdir + '/REGION-mean-mute-freqs.csv'
        self.mfreqer.write(base_outdir + '/mute-freqs', mean_freq_outfname=mean_freq_outfname)

        genes_with_counts = []
        for region in utils.regions:
            region_keys = self.counts[region + '_gene'].keys()
            genes_with_counts.extend([rkey[0] for rkey in region_keys])  # the gene name is the first entry in each key
        glutils.write_glfo(base_outdir + '/' + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=False)

        for column in self.counts:
            # decide which csv columns index these counts, and which file they go in
            if column == 'all':
                index = tuple(list(utils.index_columns) + ['cdr3_length'])
                outfname = base_outdir + '/' + utils.get_parameter_fname(column='all')
            elif '_content' in column:
                index = [column]
                outfname = base_outdir + '/' + column + '.csv'
            else:
                index = [column] + utils.column_dependencies[column]
                outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)

            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)

            with open(outfname, 'w') as outfile:
                out_data = csv.DictWriter(outfile, list(index) + ['count'])
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    csv_line = {}
                    for ic, val in enumerate(key):
                        csv_line[index[ic]] = val
                    csv_line['count'] = count
                    out_data.writerow(csv_line)

        print('(%.1f sec)' % (time.time() - t0))
Esempio n. 14
0
        for region in [r for r in utils.regions if r in glfo['seqs']]:
            for gene, seq in glfo['seqs'][region].items():
                if utils.ambig_frac(seq) > 0.:
                    if debug:
                        print '   %d ambiguous bases: %s' % (
                            len(seq) * utils.ambig_frac(seq),
                            utils.color_gene(gene))
                    glutils.remove_gene(glfo, gene)

        # glutils.print_glfo(glfo)

        # write final result
        glutils.write_glfo(outdir, glfo, debug=True)


# ----------------------------------------------------------------------------------------
# input fasta and output glfo dir for the (currently commented-out) cleaning step below
fname = 'macaque/ramesh-v1/coding.fa'
outdir = 'macaque/ramesh-cleaned'
# uncomment these two lines to regenerate <outdir> from <fname> (and then exit without merging)
# parse_ramesh_seqs(read_ramesh_file(fname, outdir), outdir, debug=True)
# sys.exit()

# ----------------------------------------------------------------------------------------
# for each locus, merge the germline set in <outdir> with the reference macaque one, and write the result
for locus in ['igh', 'igk', 'igl']:
    ref_glfo = glutils.read_glfo('data/germlines/macaque', locus, debug=True)
    glfo = glutils.read_glfo(outdir, locus, debug=True)  # NOTE this requires that <outdir> has already been written
    merged_glfo = glutils.get_merged_glfo(ref_glfo, glfo, debug=True)
    # glutils.print_glfo(merged_glfo, print_separate_cons_seqs=True)
    glutils.write_glfo('datascripts/meta/crotty-fna/imgt-plus-ramesh',
                       merged_glfo,
                       debug=True)
Esempio n. 15
0
def run_tigger(infname, outfname, outdir):
    """Run tigger (in R) on <infname> and write the resulting germline set.

    Writes an R script to <args.workdir>/tigger-in.cmd, runs it with `R --slave`, and then converts the fasta
    that the R script wrote (<outdir>/tigger.fasta) into a glfo, which is written to the germline directory
    derived from <outfname>.

    Arguments:
      infname: annotation file that the R script reads with read.csv(..., sep=<tab>)
      outfname: expected path of the final germline fasta; used for the "output exists" check and to work out
                the output germline directory
      outdir: directory in which the R script writes tigger.fasta

    Returns None. Returns immediately if utils.output_exists() says <outfname> is already there. If R exits with
    a non-zero code for any reason other than tigger's "Not enough sample sequences..." message, R's out/err
    files are dumped and the process exits with R's return code.

    Reads the module-level <args> (workdir, tuned_tigger_params, glfo_dir, locus, region, simulation_germline_dir).
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    rcmds = [
        'library(ggplot2)',
        'library(tigger, warn.conflicts=FALSE)',
        'library(dplyr, warn.conflicts=FALSE)',
    ]
    # rcmds += ['data(sample_db, germline_ighv)']

    # names of the R variables holding the annotations and the germline sequences
    db_name = 'annotations'
    gls_name = 'gls'
    rcmds += ['%s = read.csv("%s", sep="\t")' % (db_name, infname)]
    rcmds += ['%s = readIgFasta("%s")' % (gls_name, get_glfname('v', aligned=True))]

    tigger_outfname = outdir + '/tigger.fasta'
    find_novel_argstr = '%s, %s, nproc=%d' % (db_name, gls_name, utils.auto_n_procs())
    if args.tuned_tigger_params:
        germline_min = 5  # only analyze genes which correspond to at least this many V calls (default 200)
        min_seqs = 5  # minimum number of total sequences
        j_max = 0.95  # of sequences which align perfectly (i.e. zero mutation?) to a new allele, no more than this fraction can correspond to each junction length + j gene combination (default 0.15)
        find_novel_argstr += ', germline_min=%d, min_seqs=%d, j_max=%f' % (germline_min, min_seqs, j_max)
    rcmds += ['novel_df = findNovelAlleles(%s)' % find_novel_argstr]
    # rcmds += ['sessionInfo()']
    rcmds += ['print(novel_df)']
    rcmds += ['geno = inferGenotype(%s, find_unmutated = TRUE, germline_db = %s, novel_df = novel_df)' % (db_name, gls_name)]
    rcmds += ['genotype_seqs = genotypeFasta(geno, %s, novel_df)' % (gls_name)]
    rcmds += ['writeFasta(genotype_seqs, "%s")' % tigger_outfname]

    cmdfname = args.workdir + '/tigger-in.cmd'
    with open(cmdfname, 'w') as cmdfile:
        cmdfile.write('\n'.join(rcmds) + '\n')
    cmdstr = 'R --slave -f ' + cmdfname

    # NOTE(review): the code below expects the process' stdout/stderr in <logdir>/out and <logdir>/err -- presumably utils.run_cmd() puts them there
    cmdfo = {'cmd_str': cmdstr, 'logdir': args.workdir, 'env': os.environ}
    proc = utils.run_cmd(cmdfo)
    while proc.poll() is None:
        time.sleep(0.01)
    if proc.returncode != 0:  # damn thing crashes if it thinks the sample size is small
        with open(args.workdir + '/err') as ferr:
            errstr = ferr.read()
        if 'Not enough sample sequences were assigned to any germline' in errstr:
            # treat "sample too small" as "nothing inferred": hand an empty fasta to the post-processing below
            with open(tigger_outfname, 'w') as dummy_outfasta:
                dummy_outfasta.write('')
        else:
            subprocess.check_call(['cat', args.workdir + '/out'])
            subprocess.check_call(['cat', args.workdir + '/err'])
            sys.exit(proc.returncode)

    # echo, then clean up, the log files
    for oe in ['err', 'out']:
        with open(args.workdir + '/' + oe) as oefile:
            print(oefile.read())
        os.remove(args.workdir + '/' + oe)

    # post-process tigger .fa
    template_gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.create_glfo_from_fasta(
        tigger_outfname,
        args.locus,
        args.region,
        template_gldir,
        simulation_germline_dir=args.simulation_germline_dir)
    # Remove the trailing '/<locus>' path component to get the germline dir. This has to be a suffix removal:
    # str.rstrip() takes a *set of characters*, so it would also eat the end of any parent dir made of those
    # characters (e.g. '.../high/igh' --> '...').
    out_gldir = os.path.dirname(outfname)
    locus_suffix = '/' + args.locus
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo)

    os.remove(cmdfname)
Esempio n. 16
0
    print 'RUN', cmd_str
    sys.stdout.flush()
    check_call(cmd_str.split())

# Top-level driver: write a germline set, then (further down) simulate from it and cache parameters on the simulation.
# NOTE as it stands the script stops at the 'assert False' below, so nothing after it runs.
outdir = '_tmp/light-chain'
base_cmd = './bin/partis'

# colon-separated gene lists (the same format is split with ':' in the commented-out call below)
dj_genes = 'IGHD6-19*01:IGHJ4*02'
v_genes = 'IGHV3-71*01' #:IGHV1-18*01'
all_genes= v_genes + ':' + dj_genes

# glutils.write_glfo(outdir + '/germline-set', input_dir=outdir + '/imgt', only_genes=all_genes.split(':'), debug=True)

# glutils.write_glfo('imgt', chain='l', input_dir='data/imgt', debug=True, generate_new_alignment=True)
assert False  # needs updating
glutils.write_glfo('imgt', chain='k', input_dir='imgt', debug=True, generate_new_alignment=True)
# glutils.write_glfo(outdir + '/germline-set', input_dir=outdir + '/imgt', debug=True)
sys.exit()  # even without the assert above, the script would end here, before the simulate/cache-parameters steps

# simulate
cmd_str = base_cmd + ' simulate --n-sim-events 10 --simulate-partially-from-scratch --mutation-multiplier 0.5 --debug 1 --n-trees 10'
cmd_str += ' --initial-datadir ' + outdir + '/germline-set'
cmd_str += ' --outfname ' + outdir + '/simu.csv'
# run(cmd_str)

# cache parameters
cmd_str = base_cmd + ' cache-parameters --chain-weight light --light-chain-locus kappa --debug 1'
cmd_str += ' --infname ' + outdir + '/simu.csv'
cmd_str += ' --initial-datadir ' + outdir + '/germline-set'
cmd_str += ' --outfname ' + 'tmp.csv'
run(cmd_str)
Esempio n. 17
0
def run_test(args):
    """Simulate a sample (unless args.nosim), then run partis cache-parameters with allele finding on it.

    Builds two partis command lines as strings and hands each to run():
      1. 'simulate', writing <args.outdir>/simu-test.csv (skipped if args.nosim is set)
      2. 'cache-parameters' on that file, with parameters in <args.outdir>/simu-test and plots in
         <args.outdir>/simu-test-plots

    Unless args.gen_gset is set, the germline sets for simulation and inference are read from
    data/germlines/human (restricted to the requested genes) and written under <args.outdir>/germlines/.
    Any sw cache files left in the parameter dir by a previous run are removed before inference.

    Arguments:
      args: parsed command line namespace; the attributes read here are seed, outdir, workdir, nosim,
            n_sim_events, n_leaves, n_leaf_distribution, mut_mult, n_procs, slurm, gen_gset, sim_v_genes,
            inf_v_genes, dj_genes, snp_positions, remove_template_genes, allele_prevalence_freqs and n_max_queries

    Raises Exception if --allele-prevalence-freqs doesn't have one entry per simulation v gene.
    Also uses the module-level names base_cmd, locus and run().
    """
    # args.seed may be None (both run() calls below guard for that), and '%d' % None raises TypeError, so use %s
    print('seed %s' % args.seed)
    label = 'test'  #get_label(existing_genes, new_allele)
    simfname = args.outdir + '/simu-' + label + '.csv'
    outpdir = args.outdir + '/simu-' + label
    plotdir = args.outdir + '/simu-' + label + '-plots'

    # simulate
    if not args.nosim:
        cmd_str = base_cmd + ' simulate --n-sim-events ' + str(args.n_sim_events) + ' --n-leaves ' + str(args.n_leaves) + ' --rearrange-from-scratch --outfname ' + simfname
        if args.n_leaf_distribution is None:
            cmd_str += ' --constant-number-of-leaves'
        else:
            cmd_str += ' --n-leaf-distribution ' + args.n_leaf_distribution
        if args.mut_mult is not None:
            cmd_str += ' --mutation-multiplier ' + str(args.mut_mult)

        cmd_str += ' --n-procs ' + str(args.n_procs)
        if args.slurm:
            cmd_str += ' --batch-system slurm --subsimproc'

        # figure what genes we're using
        if args.gen_gset:
            cmd_str += ' --generate-germline-set'
            cmd_str += ' --n-genes-per-region 1:5:3'
            cmd_str += ' --n-alleles-per-gene 2,3:1,2:1,2'
        else:
            simulation_genes = ':'.join(args.sim_v_genes + args.dj_genes)
            sglfo = glutils.read_glfo('data/germlines/human', locus=locus, only_genes=simulation_genes.split(':'))

            added_snp_names = None
            if args.snp_positions is not None:
                # one entry per simulation v gene: the positions at which to add snps to that gene
                snps_to_add = [{'gene': args.sim_v_genes[ig], 'positions': args.snp_positions[ig]} for ig in range(len(args.sim_v_genes))]
                added_snp_names = glutils.add_some_snps(snps_to_add, sglfo, debug=True, remove_template_genes=args.remove_template_genes)

            if args.allele_prevalence_freqs is not None:
                if len(args.allele_prevalence_freqs) != len(sglfo['seqs']['v']):  # already checked when parsing args, but, you know...
                    raise Exception('--allele-prevalence-freqs not the right length')
                # NOTE(review): in the snp case the gene order comes from iterating over a set, so which freq goes with which template gene isn't obviously well defined -- confirm
                gene_list = sorted(sglfo['seqs']['v']) if added_snp_names is None else list(set(args.sim_v_genes)) + added_snp_names
                prevalence_freqs = {'v': dict(zip(gene_list, args.allele_prevalence_freqs)), 'd': {}, 'j': {}}
                prevalence_fname = args.workdir + '/allele-prevalence-freqs.csv'
                glutils.write_allele_prevalence_freqs(prevalence_freqs, prevalence_fname)
                cmd_str += ' --allele-prevalence-fname ' + prevalence_fname

            print('  simulating with %d v: %s' % (len(sglfo['seqs']['v']), ' '.join([utils.color_gene(g) for g in sglfo['seqs']['v']])))
            sim_gldir = args.outdir + '/germlines/simulation'
            glutils.write_glfo(sim_gldir, sglfo)
            cmd_str += ' --initial-germline-dir ' + sim_gldir

        # run simulation
        if args.seed is not None:
            cmd_str += ' --seed ' + str(args.seed)
        run(cmd_str)

    # remove any old sw cache files
    for cachefname in glob.glob(outpdir + '/sw-cache-*.csv'):
        check_call(['rm', '-v', cachefname])
        sw_cache_gldir = cachefname.replace('.csv', '-glfo')
        if os.path.exists(sw_cache_gldir):  # if stuff fails halfway through, you can get one but not the other
            glutils.remove_glfo_files(sw_cache_gldir, locus)
            # os.rmdir(sw_cache_gldir)

    # generate germline set and cache parameters
    cmd_str = base_cmd + ' cache-parameters --infname ' + simfname + ' --only-smith-waterman --debug-allele-finding --always-find-new-alleles --n-max-allele-finding-iterations 2'  # --dont-collapse-clones'
    # cmd_str = 'python -m cProfile -s tottime -o prof.out ' + cmd_str
    cmd_str += ' --n-procs ' + str(args.n_procs)
    if args.n_max_queries is not None:
        cmd_str += ' --n-max-queries ' + str(args.n_max_queries)  # NOTE do *not* use --n-random-queries, since it'll change the cluster size distribution
    if args.slurm:
        cmd_str += ' --batch-system slurm'

    cmd_str += ' --find-new-alleles'

    if not args.gen_gset:  # with --gen-gset we don't pass a germline dir, i.e. inference uses the default (full) one
        cmd_str += ' --dont-remove-unlikely-alleles'  # --new-allele-fname ' + args.outdir + '/new-alleles.fa'
        inference_genes = ':'.join(args.inf_v_genes + args.dj_genes)
        iglfo = glutils.read_glfo('data/germlines/human', locus=locus, only_genes=inference_genes.split(':'))
        print('  starting inference with %d v: %s' % (len(iglfo['seqs']['v']), ' '.join([utils.color_gene(g) for g in iglfo['seqs']['v']])))
        inf_gldir = args.outdir + '/germlines/inference'
        glutils.write_glfo(inf_gldir, iglfo)
        cmd_str += ' --initial-germline-dir ' + inf_gldir
        # cmd_str += ' --n-max-snps 12'

    cmd_str += ' --parameter-dir ' + outpdir
    cmd_str += ' --only-overall-plots --plotdir ' + plotdir
    if args.seed is not None:
        cmd_str += ' --seed ' + str(args.seed)
    run(cmd_str)
Esempio n. 18
0
def run_test(args):
    """Simulate a sample (unless args.nosim), then run partis cache-parameters with allele finding on it.

    Builds two partis command lines as strings and hands each to run():
      1. 'simulate', writing <args.outdir>/simu-test.csv (skipped if args.nosim is set)
      2. 'cache-parameters' on that file, with parameters in <args.outdir>/simu-test and plots in
         <args.outdir>/simu-test-plots

    Unless args.gen_gset is set, the germline sets for simulation and inference are read from
    data/germlines/human (restricted to the requested genes) and written under <args.outdir>/germlines/.
    Any sw cache files left in the parameter dir by a previous run are removed before inference.

    Arguments:
      args: parsed command line namespace; the attributes read here are seed, outdir, workdir, nosim,
            n_sim_events, n_leaves, mut_mult, n_procs, slurm, gen_gset, sim_v_genes, inf_v_genes, dj_genes,
            snp_positions, remove_template_genes and allele_prevalence_freqs

    Raises Exception if --allele-prevalence-freqs doesn't have one entry per simulation v gene.
    Also uses the module-level names base_cmd, chain and run().
    """
    # args.seed may be None (both run() calls below guard for that), and '%d' % None raises TypeError, so use %s
    print('seed %s' % args.seed)
    label = 'test'  #get_label(existing_genes, new_allele)
    simfname = args.outdir + '/simu-' + label + '.csv'
    outpdir = args.outdir + '/simu-' + label
    plotdir = args.outdir + '/simu-' + label + '-plots'

    # simulate
    if not args.nosim:
        cmd_str = base_cmd + ' simulate --n-sim-events ' + str(args.n_sim_events) + ' --n-leaves ' + str(args.n_leaves) + ' --constant-number-of-leaves --rearrange-from-scratch --outfname ' + simfname
        cmd_str += ' --mutation-multiplier ' + str(args.mut_mult)

        cmd_str += ' --n-procs ' + str(args.n_procs)
        if args.slurm:
            cmd_str += ' --batch-system slurm --subsimproc'

        if args.gen_gset:
            cmd_str += ' --generate-germline-set'
            cmd_str += ' --n-genes-per-region 1:5:3'
            cmd_str += ' --n-alleles-per-gene 2,3:1,2:1,2'
        else:
            simulation_genes = ':'.join(args.sim_v_genes + args.dj_genes)
            sglfo = glutils.read_glfo('data/germlines/human', chain=chain, only_genes=simulation_genes.split(':'))

            added_snp_names = None
            if args.snp_positions is not None:
                # one entry per simulation v gene: the positions at which to add snps to that gene
                snps_to_add = [{'gene' : args.sim_v_genes[ig], 'positions' : args.snp_positions[ig]} for ig in range(len(args.sim_v_genes))]
                added_snp_names = glutils.add_some_snps(snps_to_add, sglfo, debug=True, remove_template_genes=args.remove_template_genes)

            if args.allele_prevalence_freqs is not None:
                if len(args.allele_prevalence_freqs) != len(sglfo['seqs']['v']):
                    raise Exception('--allele-prevalence-freqs not the right length')
                # NOTE(review): in the snp case the gene order comes from iterating over a set, so which freq goes with which template gene isn't obviously well defined -- confirm
                gene_list = sorted(sglfo['seqs']['v']) if added_snp_names is None else list(set(args.sim_v_genes)) + added_snp_names
                prevalence_freqs = {'v' : dict(zip(gene_list, args.allele_prevalence_freqs)), 'd' : {}, 'j' : {}}
                prevalence_fname = args.workdir + '/allele-prevalence-freqs.csv'
                glutils.write_allele_prevalence_freqs(prevalence_freqs, prevalence_fname)
                cmd_str += ' --allele-prevalence-fname ' + prevalence_fname

            sim_gldir = args.outdir + '/germlines/simulation'
            glutils.write_glfo(sim_gldir, sglfo)
            cmd_str += ' --initial-germline-dir ' + sim_gldir

        if args.seed is not None:
            cmd_str += ' --seed ' + str(args.seed)
        run(cmd_str)

    # remove any old sw cache files
    for cachefname in glob.glob(outpdir + '/sw-cache-*.csv'):
        check_call(['rm', '-v', cachefname])
        sw_cache_gldir = cachefname.replace('.csv', '-glfo')
        if os.path.exists(sw_cache_gldir):  # if stuff fails halfway through, you can get one but not the other
            glutils.remove_glfo_files(sw_cache_gldir, chain)
            # os.rmdir(sw_cache_gldir)

    # generate germline set and cache parameters
    cmd_str = base_cmd + ' cache-parameters --infname ' + simfname + ' --only-smith-waterman --debug-allele-finding --always-find-new-alleles --n-max-allele-finding-iterations 2' # --dont-collapse-clones'
    # cmd_str = 'python -m cProfile -s tottime -o prof.out ' + cmd_str
    cmd_str += ' --n-procs ' + str(args.n_procs)
    if args.slurm:
        cmd_str += ' --batch-system slurm'

    if args.gen_gset:  # no germline dir passed, so only the allele finding flag is needed
        cmd_str += ' --find-new-alleles'
    else:
        inference_genes = ':'.join(args.inf_v_genes + args.dj_genes)
        iglfo = glutils.read_glfo('data/germlines/human', chain=chain, only_genes=inference_genes.split(':'), debug=True)
        inf_gldir = args.outdir + '/germlines/inference'
        glutils.write_glfo(inf_gldir, iglfo)
        cmd_str += ' --initial-germline-dir ' + inf_gldir
        cmd_str += ' --find-new-alleles --dont-remove-unlikely-alleles'  # --new-allele-fname ' + args.outdir + '/new-alleles.fa'
        # cmd_str += ' --n-max-snps 12'

    cmd_str += ' --parameter-dir ' + outpdir
    cmd_str += ' --plotdir ' + plotdir
    if args.seed is not None:
        cmd_str += ' --seed ' + str(args.seed)
    run(cmd_str)
#!/usr/bin/env python

# Script to process the extras.csv files from partis' germline directories
# Assumes the fasta files (e.g. ighv.fa) have been deduplicated

import sys

# partis' python modules aren't installed, so put them on the import path before importing glutils
partis_dir = "/home/bolson2/Software/partis"
sys.path.insert(1, partis_dir + '/python')

import glutils

igb_path = "/home/bolson2/Software/igblast"
igb_database_path = igb_path + "/bin_deduplicated"

# Read each locus' germline info from the deduplicated igblast database and write it back out to the
# partis-friendly directory (same directory for all three loci, in the order igh, igk, igl).
for locus in ('igh', 'igk', 'igl'):
    glfo = glutils.read_glfo(igb_database_path, locus=locus, debug=True)
    glutils.write_glfo(igb_path + '/partis_friendly_bin', glfo, debug=True)