Beispiel #1
0
def parse_ramesh_seqs(glseqs, outdir, debug=False):
    for locus in glseqs:
        glutils.remove_glfo_files(outdir, locus)
        # write to a glfo dir without extra info
        for region in glseqs[locus]:
            fn = glutils.get_fname(outdir, locus, region)
            if not os.path.exists(os.path.dirname(fn)):
                os.makedirs(os.path.dirname(fn))
            with open(fn, 'w') as ofile:
                for gene, seq in glseqs[locus][region].items():
                    ofile.write('>%s\n%s\n' % (gene, seq))

        # figure out extra info
        template_glfo = glutils.read_glfo('data/germlines/macaque', locus)
        glfo = glutils.read_glfo(outdir,
                                 locus,
                                 template_glfo=template_glfo,
                                 remove_bad_genes=True,
                                 debug=True)

        # trim non-coding stuff upstream of v (and remove non-full-length ones)
        gene_groups = {}
        for region in ['v']:
            group_labels = sorted(
                set([utils.gene_family(g) for g in glfo['seqs'][region]]))
            gene_groups[region] = [(glabel, {
                g: glfo['seqs'][region][g]
                for g in glfo['seqs'][region] if utils.gene_family(g) == glabel
            }) for glabel in group_labels]
        for region in [r for r in utils.regions if r in gene_groups]:
            if debug:
                print '%s' % utils.color('reverse_video',
                                         utils.color('green', region))
            for group_label, group_seqs in gene_groups[
                    region]:  # ok, this isn't really doing anything any more
                if debug:
                    print '  %s' % utils.color('blue', group_label)
                for gene, seq in group_seqs.items():
                    trim_and_remove_genes(region,
                                          gene,
                                          seq,
                                          glfo,
                                          template_glfo,
                                          debug=debug)

        # remove any seqs with ambiguous bases
        for region in [r for r in utils.regions if r in glfo['seqs']]:
            for gene, seq in glfo['seqs'][region].items():
                if utils.ambig_frac(seq) > 0.:
                    if debug:
                        print '   %d ambiguous bases: %s' % (
                            len(seq) * utils.ambig_frac(seq),
                            utils.color_gene(gene))
                    glutils.remove_gene(glfo, gene)

        # glutils.print_glfo(glfo)

        # write final result
        glutils.write_glfo(outdir, glfo, debug=True)
Beispiel #2
0
def simulate(args):
    """Build and run a `partis simulate` command line from <args>.

    Returns immediately if utils.output_exists() reports that <args.simfname> is
    already there. Otherwise assembles the command string option by option,
    writes the germline set used for simulation to
    <args.outdir>/germlines/simulation, and hands the command to
    utils.simplerun() (honoring <args.dry_run>).

    Raises:
        Exception: if --allele-prevalence-freqs isn't normalized, or its length
            doesn't match the number of v genes in the simulation glfo.
    """
    if utils.output_exists(args, args.simfname):
        return
    # base command; <partis_dir> is a module-level name that is not defined in this function
    cmd_str = args.partis_path + ' simulate --n-sim-events ' + str(args.n_sim_events) + ' --outfname ' + args.simfname + ' --n-leaves ' + str(args.n_leaves) + ' --rearrange-from-scratch --shm-parameter-dir ' + partis_dir + '/data/recombinator/scratch-parameters'
    if args.n_leaf_distribution is None:
        cmd_str += ' --constant-number-of-leaves'
    else:
        cmd_str += ' --n-leaf-distribution ' + args.n_leaf_distribution
    if args.mut_mult is not None:
        cmd_str += ' --mutation-multiplier ' + str(args.mut_mult)
    if args.root_mrca_weibull_parameter is not None:
        cmd_str += ' --root-mrca-weibull-parameter ' + str(args.root_mrca_weibull_parameter)

    cmd_str += ' --n-procs ' + str(args.n_procs)
    if args.slurm:
        cmd_str += ' --batch-system slurm --subsimproc'

    # file that the allele prevalence frequencies get written to (in both branches below)
    allele_prevalence_fname = args.workdir + '/allele-prevalence-freqs.csv'

    # figure what genes we're using
    if args.gls_gen:
        # generating a whole germline set is incompatible with specifying genes/frequencies by hand
        assert args.sim_v_genes is None and args.allele_prevalence_freqs is None

        sglfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus)
        glutils.remove_v_genes_with_bad_cysteines(sglfo)
        glutils.generate_germline_set(sglfo, args.n_genes_per_region, args.n_sim_alleles_per_gene, args.min_allele_prevalence_freq, allele_prevalence_fname, new_allele_info=args.new_allele_info, dont_remove_template_genes=args.dont_remove_template_genes, debug=True)
        cmd_str += ' --allele-prevalence-fname ' + allele_prevalence_fname
    else:
        # restrict the default germline set to the requested genes, then add the new alleles
        sglfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus, only_genes=(args.sim_v_genes + args.dj_genes))
        added_snp_names = glutils.generate_new_alleles(sglfo, args.new_allele_info, debug=True, remove_template_genes=(not args.dont_remove_template_genes))  # NOTE template gene removal is the default for glutils.generate_germline_set

        if args.allele_prevalence_freqs is not None:
            if not utils.is_normed(args.allele_prevalence_freqs):
                raise Exception('--allele-prevalence-freqs %s not normalized' % args.allele_prevalence_freqs)
            if len(args.allele_prevalence_freqs) != len(sglfo['seqs']['v']):  # already checked when parsing args, but, you know...
                raise Exception('--allele-prevalence-freqs %d not the same length as sglfo %d' % (len(args.allele_prevalence_freqs), len(sglfo['seqs']['v'])))
            # the frequencies are paired with genes by position in <gene_list>
            gene_list = sorted(sglfo['seqs']['v']) if len(added_snp_names) == 0 else list(set(args.sim_v_genes)) + added_snp_names
            prevalence_freqs = {'v' : {g : f for g, f in zip(gene_list, args.allele_prevalence_freqs)}, 'd' : {}, 'j' : {}}
            glutils.write_allele_prevalence_freqs(prevalence_freqs, allele_prevalence_fname)
            cmd_str += ' --allele-prevalence-fname ' + allele_prevalence_fname

    # write the germline set we settled on, and tell the simulation to start from it
    glutils.write_glfo(args.outdir + '/germlines/simulation', sglfo)
    cmd_str += ' --initial-germline-dir ' + args.outdir + '/germlines/simulation'
    # glutils.print_glfo(sglfo)

    # run simulation
    if args.seed is not None:
        cmd_str += ' --seed ' + str(args.seed)
    utils.simplerun(cmd_str, dryrun=args.dry_run)
Beispiel #3
0
def run_test(simulation_v_genes, inference_v_genes, dj_genes, seed=None):
    """Simulate with one set of v genes (plus added snps), then run allele finding starting from another.

    <simulation_v_genes>, <inference_v_genes> and <dj_genes> are colon-separated
    gene name strings (they are joined with ':' and split on ':' below). If
    <seed> is set it seeds python's random module and is passed to both partis
    commands.

    Relies on names that are not defined in this function: <outdir>, <chain>,
    <base_cmd> and run().
    """
    if seed is not None:
        random.seed(seed)

    label = 'test'  #get_label(existing_genes, new_allele)
    simfname = outdir + '/simu-' + label + '.csv'
    outpdir = outdir + '/simu-' + label
    # plots go under $www if that environment variable is set, otherwise under a local _www dir
    if os.getenv('www') is not None:
        plotdir = os.getenv('www') + '/partis/allele-finding/' + label
    else:
        plotdir = '_www/partis/allele-finding/' + label

    # new alleles to add to the simulation germline set: template gene plus the positions to snp
    snps_to_add = [
        {'gene' : 'IGHV3-71*01', 'positions' : (35, )},
        {'gene' : 'IGHV3-71*01', 'positions' : (35, 50)},
        # {'gene' : 'IGHV3-71*01', 'positions' : (35, 45, 20, 50, 77)},
        # {'gene' : 'IGHV3-71*01', 'positions' : (35, 60, 50)},
        # {'gene' : 'IGHV1-18*01', 'positions' : (100, 101)},
        # {'gene' : 'IGHV1-18*01', 'positions' : (20, )}
    ]
    simulation_genes = simulation_v_genes + ':' + dj_genes
    sglfo = glutils.read_glfo('data/imgt', chain=chain, only_genes=simulation_genes.split(':'), debug=True)
    glutils.add_some_snps(snps_to_add, sglfo, remove_template_genes=False, debug=True)
    glutils.write_glfo(outdir + '/germlines-for-simulation', sglfo)

    # simulate
    cmd_str = base_cmd + ' simulate --n-sim-events 1000 --n-procs 10 --simulate-partially-from-scratch --mutation-multiplier 0.5'
    cmd_str += ' --initial-datadir ' + outdir + '/germlines-for-simulation'
    cmd_str += ' --outfname ' + simfname
    if seed is not None:
        cmd_str += ' --seed ' + str(seed)
    run(cmd_str)

    # germline set that inference starts from (no snps added to this one)
    inference_genes = inference_v_genes + ':' + dj_genes
    iglfo = glutils.read_glfo('data/imgt', chain=chain, only_genes=inference_genes.split(':'), debug=True)
    glutils.write_glfo(outdir + '/germlines-for-inference', iglfo)

    # generate germline set and cache parameters
    cmd_str = base_cmd + ' cache-parameters --infname ' + simfname + ' --n-procs 10 --only-smith-waterman'
    cmd_str += ' --find-new-alleles --new-allele-fname ' + outdir + '/new-alleles.fa'
    # cmd_str += ' --generate-germline-set'
    cmd_str += '  --debug-new-allele-finding'
    cmd_str += ' --initial-datadir ' + outdir + '/germlines-for-inference'
    cmd_str += ' --parameter-dir ' + outpdir
    cmd_str += ' --plotdir ' + plotdir
    if seed is not None:
        cmd_str += ' --seed ' + str(seed)
    run(cmd_str)
Beispiel #4
0
def read_partis_output(partition_file, glfo_dir=None, locus=None):
    """Read a partis output file, returning (glfo, annotation_list, cpath).

    For anything other than a ".yaml" file, germline info is first read from
    <glfo_dir> (falling back to <default_glfo_dir> if it's empty/None) and
    passed along to utils.read_output().
    """
    if utils.getsuffix(partition_file) == ".yaml":
        initial_glfo = None
    else:
        initial_glfo = glutils.read_glfo(glfo_dir or default_glfo_dir, locus)

    # returns glfo from the file if it's there, otherwise it returns the one we passed in
    glfo, annotation_list, cpath = utils.read_output(partition_file, glfo=initial_glfo)
    return glfo, annotation_list, cpath
Beispiel #5
0
    def __init__(self, args, seed, sublabel=None):
        """Set up working dirs, germline info, rearrangement parameters, mutation models and trees.

        Args:
            args: parsed command line arguments; the attributes read here are
                workdir, outfname, simulate_partially_from_scratch, parameter_dir,
                scratch_mute_freq_dir, initial_datadir, chain, only_genes and gtrfname.
            seed: passed to the tree generator (both its constructor and generate_trees()).
            sublabel: if set, a separate '<workdir>/recombinator-<sublabel>' working
                dir is used, and output is written inside it.

        Raises:
            Exception: if the parameter dir is unset or doesn't exist.
        """
        self.args = args

        if sublabel is None:
            self.workdir = self.args.workdir + '/recombinator'
            self.outfname = self.args.outfname
        else:  # need a separate workdir for each subprocess
            self.workdir = self.args.workdir + '/recombinator-' + sublabel
            self.outfname = self.workdir + '/' + os.path.basename(self.args.outfname)
        utils.prep_dir(self.workdir)

        if not self.args.simulate_partially_from_scratch:
            parameter_dir = self.args.parameter_dir
        else:  # we start from scratch, except for the mute freq stuff
            parameter_dir = self.args.scratch_mute_freq_dir

        if parameter_dir is None or not os.path.exists(parameter_dir):
            # use %-formatting rather than '+', since concatenating None would raise a TypeError and hide this message
            raise Exception('parameter dir %s d.n.e' % parameter_dir)

        self.index_keys = {}  # this is kind of hackey, but I suspect indexing my huge table of freqs with a tuple is better than a dict
        self.mute_models = {}
        # self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end
        for region in utils.regions:
            self.mute_models[region] = {}
            for model in ['gtr', 'gamma']:
                self.mute_models[region][model] = {}

        self.glfo = glutils.read_glfo(self.args.initial_datadir, self.args.chain, only_genes=self.args.only_genes)

        self.allowed_genes = self.get_allowed_genes(parameter_dir)  # set of genes a) for which we read per-position mutation information and b) from which we choose when running partially from scratch
        self.version_freq_table = self.read_vdj_version_freqs(parameter_dir)  # list of the probabilities with which each VDJ combo (plus other rearrangement parameters) appears in data
        self.insertion_content_probs = self.read_insertion_content(parameter_dir)
        self.all_mute_freqs = {}
        self.parameter_dir = parameter_dir  # damnit, I guess I do need to save this in self

        # read shm info NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
        with opener('r')(self.args.gtrfname) as gtrfile:  # read gtr parameters
            reader = csv.DictReader(gtrfile)
            for line in reader:
                # the 'parameter' column is split on '.' into three parts: <something>.<model>.<parameter name>
                parameters = line['parameter'].split('.')
                region = parameters[0][3].lower()  # fourth character of the first part (presumably something like 'IGHV' -- TODO confirm)
                assert region == 'v' or region == 'd' or region == 'j'
                model = parameters[1].lower()
                parameter_name = parameters[2]
                assert model in self.mute_models[region]
                self.mute_models[region][model][parameter_name] = line['value']  # NOTE stored as the string from the csv

        # generate trees into a temporary file, read them in, then clean up
        treegen = treegenerator.TreeGenerator(args, parameter_dir, seed=seed)
        self.treefname = self.workdir + '/trees.tre'
        treegen.generate_trees(seed, self.treefname)
        with opener('r')(self.treefname) as treefile:  # read in the trees (and other info) that we just generated
            self.treeinfo = treefile.readlines()
        os.remove(self.treefname)

        # start from a clean output file, making its parent dir if necessary
        if os.path.exists(self.outfname):
            os.remove(self.outfname)
        elif not os.path.exists(os.path.dirname(os.path.abspath(self.outfname))):
            os.makedirs(os.path.dirname(os.path.abspath(self.outfname)))
Beispiel #6
0
def get_single_performance(region, outdir, method, debug=False):
    sglfo = glutils.read_glfo(outdir + '/germlines/simulation', locus=args.locus)
    iglfo = glutils.read_glfo(outdir + '/' + method + '/sw/germline-sets', locus=args.locus)
    glutils.synchronize_glfos(ref_glfo=sglfo, new_glfo=iglfo, region=region)
    missing_alleles = set(sglfo['seqs'][region]) - set(iglfo['seqs'][region])
    spurious_alleles = set(iglfo['seqs'][region]) - set(sglfo['seqs'][region])
    if debug:
        if len(missing_alleles) > 0:
            print '    %2d  missing %s' % (len(missing_alleles), ' '.join([utils.color_gene(g) for g in missing_alleles]))
        if len(spurious_alleles) > 0:
            print '    %2d spurious %s' % (len(spurious_alleles), ' '.join([utils.color_gene(g) for g in spurious_alleles]))
        if len(missing_alleles) == 0 and len(spurious_alleles) == 0:
            print '    none missing'
    return {
        'missing' : len(missing_alleles),
        'spurious' : len(spurious_alleles),
        'total' : len([g for g in sglfo['seqs'][region] if '+' in g]),  # anybody with a '+' should be a new allele
    }
Beispiel #7
0
def main():
    """Command line entry point: annotate one BCR sequence and write the resulting profile to csv.

    Parses the command line, reads the default germline info, runs partis on the
    input sequence, builds AHo-numbered profiles for the naive and input
    sequences, simulates a neutral substitution profile, and writes everything
    as a dataframe to --outfile.

    Side effects: sets the module-level globals <args> and <glfo>.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='Annotate BCR sequence for SPURF.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--sequence',
                        type=str,
                        required=True,
                        help='Sequence for annotation.')
    parser.add_argument('--outfile',
                        type=str,
                        default='out.csv',
                        help='Output csv filename.')
    parser.add_argument(
        '--SIM_SIZE',
        type=int,
        required=False,
        default=10000,
        help='Number of random draws to simulate the neutral profile.')
    parser.add_argument('--LOCUS',
                        type=str,
                        required=False,
                        default='igh',
                        help='Locus, either igh, igk or igl.')
    # NOTE(review): --SPECIES is parsed but not read anywhere in this function (the germline path below is hard-coded to human)
    parser.add_argument('--SPECIES',
                        type=str,
                        required=False,
                        default='human',
                        help='Species, either human.')
    global args
    args = parser.parse_args()
    # paths to the S5F mutability/substitution tables, relative to the module-level PATH2FILE
    mutability = PATH2FILE + '/S5F/Mutability.csv'
    substitution = PATH2FILE + '/S5F/Substitution.csv'

    # Read default germline info:
    global glfo
    glfo = glutils.read_glfo(partis_path + '/data/germlines/human',
                             locus=args.LOCUS)

    # Annotate with partis, then translate both nucleotide sequences to amino acids:
    naive, fixed_input_seq, VDJ = run_partis(args.sequence)
    naiveAA = str(Seq(naive, generic_dna).translate())
    fixed_input_seqAA = str(Seq(fixed_input_seq, generic_dna).translate())

    # AHo annotate on the naive amino acid sequence:
    AHo_naive, numb_profile = AHo_annotate_naive(naiveAA)

    # Use the AHo annotation to make a profile over the input sequence:
    AHo_input = AHo_annotate_input(fixed_input_seqAA, numb_profile)

    # Simulate a profile under a neutral substitution process:
    Nmuts = hamming_dist(naive, fixed_input_seq)
    sim_profile = simulate_profile([Nmuts], naive, numb_profile, mutability,
                                   substitution)

    df = make_dataframe(AHo_input, AHo_naive, sim_profile, VDJ)
    write_dataframe(df, args.outfile)
Beispiel #8
0
    def _generate_germline_set(self,
                               n_genes_per_region="20:1:1",
                               n_sim_alleles_per_gene="1.5:1:1",
                               min_sim_allele_prevalence_freq=0.1):
        """
        Call partis's germline set simulation function and write to files

        The three arguments are passed straight through to
        glutils.generate_germline_set(). After the germline set has been written
        to self.output_dir, the allele prevalence file and the fasta in
        self.ig_file are read back in.

        @return tuple of (germline_seqs, germline_freqs): dict from allele name to
            sequence (trimmed to a multiple of three, with any N replaced by a
            random nucleotide), and dict from allele name to prevalence (only for
            rows whose name starts with <LOCUS>V)
        """
        PARTIS_PATH = './partis'
        sys.path.insert(1, PARTIS_PATH + '/python')
        import glutils
        glfo = glutils.read_glfo(self.GERMLINE_FOLDER + "/" + self.organism,
                                 self.locus)
        glutils.generate_germline_set(
            glfo,
            n_genes_per_region,
            n_sim_alleles_per_gene,
            min_sim_allele_prevalence_freq,
            self.allele_freq_file,
            debug=self.debug,
        )
        glutils.write_glfo(self.output_dir, glfo)

        # Read allele prevalences
        v_gene_prefix = self.locus.upper() + "V"
        germline_freqs = dict()
        with open(self.allele_freq_file, "r") as f:
            allele_reader = csv.reader(f)
            next(allele_reader)  # skip header (builtin next() works on both python 2 and 3)
            for row in allele_reader:
                if row[0].startswith(v_gene_prefix):
                    germline_freqs[row[0]] = float(row[1])

        # Read the selected germline alleles
        germline_seqs = dict()
        with open(self.ig_file, "r") as f:
            lines = f.read().splitlines()

        # The fasta is assumed to alternate header and sequence lines; pairing up
        # the even and odd lines ignores any unpaired trailing line
        for header, allele_seq in zip(lines[0::2], lines[1::2]):
            allele = header.replace(">", "")

            # Trim allele until multiple of 3 - randomly pick a reading frame
            mod_seq_len = len(allele_seq) % 3
            if mod_seq_len != 0:
                offset = np.random.choice(mod_seq_len + 1)
                if mod_seq_len != offset:
                    allele_seq = allele_seq[offset:-(mod_seq_len - offset)]
                else:  # everything is trimmed from the front (a "-0" end index would give an empty string)
                    allele_seq = allele_seq[offset:]

            # Make sure no N in the germline sequence
            while "N" in allele_seq:
                allele_seq = allele_seq.replace(
                    "N", np.random.choice(list(NUCLEOTIDE_SET)), 1)

            germline_seqs[allele] = allele_seq

        return germline_seqs, germline_freqs
Beispiel #9
0
def write_inf_glfo(
    args
):  # read default glfo, restrict it to the specified alleles, and write to somewhere where all the methods can read it
    # NOTE this dir should *not* be modified by any of the methods
    inf_glfo = glutils.read_glfo('data/germlines/human',
                                 locus=args.locus,
                                 only_genes=args.inf_v_genes + args.dj_genes)
    print '  writing initial inference glfo with %d v: %s' % (len(
        inf_glfo['seqs']['v']), ' '.join(
            [utils.color_gene(g) for g in inf_glfo['seqs']['v']]))
    glutils.write_glfo(args.inf_glfo_dir, inf_glfo)
Beispiel #10
0
def run_tigger(infname, outfname, outdir):
    """Run tigger (via an R script) on <infname>, then convert its fasta output to a germline dir.

    Returns immediately if utils.output_exists() says <outfname> is already
    there. Otherwise writes the R commands to a temporary file in args.workdir,
    runs them, reads the fasta that tigger wrote to <outdir>/tigger.fasta, and
    writes a germline set containing only the alleles that tigger reported to
    the germline dir that <outfname> belongs to.

    Uses the module-level <args> (n_procs, workdir, glfo_dir, locus, region).
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    rcmds = ['library(tigger)', 'library(dplyr)']
    # rcmds += ['data(sample_db, germline_ighv)']

    db_name = 'annotations'
    gls_name = 'gls'
    rcmds += ['%s = read.csv("%s", sep="\t")' % (db_name, infname)]
    rcmds += ['%s = readIgFasta("%s")' % (gls_name, get_glfname('v', aligned=True))]

    tigger_outfname = outdir + '/tigger.fasta'
    rcmds += ['novel_df = findNovelAlleles(%s, %s, germline_min=2, nproc=%d)' % (db_name, gls_name, args.n_procs)]  #
    rcmds += ['geno = inferGenotype(%s, find_unmutated = FALSE, germline_db = %s, novel_df = novel_df)' % (db_name, gls_name)]
    rcmds += ['genotype_seqs = genotypeFasta(geno, %s, novel_df)' % (gls_name)]
    rcmds += ['writeFasta(genotype_seqs, "%s")' % tigger_outfname]
    cmdfname = args.workdir + '/tigger-in.cmd'
    with open(cmdfname, 'w') as cmdfile:
        cmdfile.write('\n'.join(rcmds) + '\n')
    cmdstr = 'R --slave -f ' + cmdfname
    utils.simplerun(cmdstr, shell=True, print_time='tigger')

    # post-process tigger .fa
    gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.read_glfo(gldir, args.locus)
    tigger_alleles = set()
    for seqfo in utils.read_fastx(tigger_outfname):
        seq = seqfo['seq'].replace(utils.gap_chars[0], '')  # it should be just dots...
        tigger_alleles.add(seqfo['name'])
        if seqfo['name'] not in glfo['seqs'][args.region]:
            newfo = {'gene' : seqfo['name'], 'seq' : seq}
            use_template_for_codon_info = False
            if '+' in newfo['gene']:  # names with a '+' are treated as <template gene>+<something>
                newfo['template-gene'] = newfo['gene'].split('+')[0]
                use_template_for_codon_info = True
            glutils.add_new_allele(glfo, newfo, use_template_for_codon_info=use_template_for_codon_info, debug=True)
        elif glfo['seqs'][args.region][seqfo['name']] != seq:
            print '%s different sequences in glfo and tigger output for %s:\n    %s\n    %s' % (utils.color('red', 'error'), seqfo['name'], glfo['seqs'][args.region][seqfo['name']], seqfo['seq'])
    # remove them afterwards so we can use existing ones to get codon info
    # NOTE loop over a copy of the gene names, since remove_gene() presumably deletes from the dict we'd otherwise be iterating over
    for gene in list(glfo['seqs'][args.region]):
        if gene not in tigger_alleles:
            glutils.remove_gene(glfo, gene)

    # the germline dir is the parent of the per-locus subdir that <outfname> is in
    # NOTE str.rstrip() removes a *set of characters*, not a suffix, so it can't be used to strip the '/<locus>' subdir
    out_gldir = os.path.dirname(outfname)
    locus_subdir = '/' + args.locus
    if out_gldir.endswith(locus_subdir):
        out_gldir = out_gldir[:-len(locus_subdir)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo)

    os.remove(cmdfname)
Beispiel #11
0
def run_vsearch(
    seqfos
):  # run vsearch to see if you can get a match for each locus for every sequence
    """Run vsearch on <seqfos> against the germline set for each locus, storing each result in sfo[<locus>].

    If args.reverse_negative_strands is set, vsearch is also run on <revfos>
    (not defined in this function), and wherever use_rev_comp() says so, the
    entry's 'seq' is replaced with the one from <revfos> and the reverse
    annotation is the one that's stored. Modifies <seqfos> in place; nothing
    is returned.
    """
    print '  running vsearch on %d sequences:' % len(seqfos)
    n_rev_compd, n_total = 0, 0  # counted over (sequence, locus) pairs
    for locus in utils.sub_loci(args.ig_or_tr):
        lglfo = glutils.read_glfo(args.germline_dir, locus)
        annotations = utils.run_vsearch_with_duplicate_uids(
            'search',
            seqfos,
            args.workdir + '/vsearch',
            args.vsearch_threshold,
            glfo=lglfo,
            print_time=True,
            vsearch_binary=args.vsearch_binary,
            get_annotations=True,
            expect_failure=True,
            extra_str='   %s  fwd:' % utils.color('blue', locus)
            if args.reverse_negative_strands else '    %s: ' % locus)
        assert len(annotations) == len(seqfos)
        if args.reverse_negative_strands:  # it might be nicer to use vsearch options to run on both senses at once, but otoh this might be nicer
            revnotations = utils.run_vsearch_with_duplicate_uids(
                'search',
                revfos,
                args.workdir + '/vsearch',
                args.vsearch_threshold,
                glfo=lglfo,
                print_time=True,
                vsearch_binary=args.vsearch_binary,
                get_annotations=True,
                expect_failure=True,
                extra_str='        rev:')
            assert len(revnotations) == len(seqfos)
        # the annotations are matched to the input seqs by position
        for il, (sfo, line) in enumerate(zip(seqfos, annotations)):
            assert sfo['name'] == line['unique_ids'][
                0]  # note that they're not full annotations, they just have a couple keys
            if args.reverse_negative_strands and use_rev_comp(
                    line, revnotations[il]):
                # switch both the stored sequence and the annotation over to the reverse version
                sfo['seq'] = revfos[il]['seq']
                line = revnotations[il]
                n_rev_compd += 1
            sfo[locus] = line  # add info for each locus to the input seqfos
            n_total += 1
    if args.reverse_negative_strands:
        print '  used rev comp for %d/%d locus results (for %d seqs)' % (
            n_rev_compd, n_total, len(seqfos))
Beispiel #12
0
def write_single_zenodo_subdir(zenodo_dir, args, study, dset, method, mfo):
    """Copy the germline set (plus some method-specific extras) that <method> inferred for <study>/<dset> into <zenodo_dir>.

    For 'partis', also copies the allele finding plot dirs into <zenodo_dir>/fits
    and rewrites the per-region gene count csvs so that the counts of genes that
    are no longer in the glfo are added to their nearest remaining gene. For
    'igdiscover', copies the whole work/final dir. Nothing extra is done for
    'tigger-default'. Any other method fails an assertion.
    """
    method_outdir = heads.get_datadir(study, 'processed', extra_str=args.label) + '/' + dset
    gls_dir = get_gls_dir(method_outdir, method, data=True)
    print '            %s --> %s' % (gls_dir, zenodo_dir)
    glfo = glutils.read_glfo(gls_dir, mfo['locus'], remove_orfs='partis' in method)  # orfs are only removed for methods with 'partis' in their name
    glutils.write_glfo(zenodo_dir, glfo)
    if method == 'partis':
        # allele finding plots
        plotdir = gls_dir.replace('hmm/germline-sets', 'plots/sw/allele-finding')
        if not os.path.exists(zenodo_dir + '/fits'):
            os.makedirs(zenodo_dir + '/fits')
        for genedir in glob.glob(plotdir + '/try-0/*'):  # would be nice to copy html, but links will be wrong
            subprocess.check_call(['cp', '-r', genedir, zenodo_dir + '/fits/'])

        # csv prevalence files
        for tmpreg in utils.regions:
            with open(gls_dir.replace('/germline-sets', '/%s_gene-probs.csv' % tmpreg)) as infile:
                reader = csv.DictReader(infile)
                countfo = {line['%s_gene' % tmpreg] : int(line['count']) for line in reader}
                old_total = sum(countfo.values())  # remembered so we can check below that no counts were lost
                orf_genes = [g for g in countfo if g not in glfo['seqs'][tmpreg]]  # this is kind of dangerous... but the genes are read from the same parameter dir that we're reading this prevalence file, so the only way it's gonna be missing is if we just removed it with the read_glfo() line above
                # first move each removed gene's counts over to its nearest gene...
                for ogene in orf_genes:
                    # if tmpreg == 'v':
                    #     _, nearest_gene, _ = glutils.find_nearest_gene_with_same_cpos(glfo, glfo['seqs'][tmpreg][ogene])  # oops, that's dumb... of course it isn't there
                    # else:
                    nearest_gene = glutils.find_nearest_gene_using_names(glfo, ogene)
                    # print '  adding %d to %s from %s' % (countfo[ogene], utils.color_gene(nearest_gene), utils.color_gene(ogene))
                    countfo[nearest_gene] += countfo[ogene]
                # ...then drop the removed genes (in a separate loop, after all the counts have been moved)
                for ogene in orf_genes:
                    del countfo[ogene]
                assert old_total == sum(countfo.values())
                with open('%s/%s_gene-probs.csv' % (zenodo_dir, tmpreg), 'w') as outfile:
                    writer = csv.DictWriter(outfile, ('%s_gene' % tmpreg, 'count'))
                    writer.writeheader()
                    for gene in countfo:
                        writer.writerow({'%s_gene' % tmpreg : gene, 'count' : countfo[gene]})
    elif method == 'tigger-default':
        # doesn't seem to have written anything
        pass
    elif method == 'igdiscover':
        # for fname in ['errorhistograms.pdf', 'V_usage.pdf', 'V_usage.tab']:
        #     subprocess.check_call(['cp', '%s/work/final/%s' % (gls_dir, fname), zenodo_dir + '/'])
        subprocess.check_call(['cp', '-r', '%s/work/final' % gls_dir, zenodo_dir + '/'])  # aw, screw it, just write everything. The simulation stuff is already huge, anyway
    else:
        assert False
def get_gene_sets(glsfnames, glslabels, ref_label=None):
    glfos = {}
    for label, fname in zip(glslabels, glsfnames):
        gldir = os.path.dirname(fname).replace('/' + args.locus, '')
        glfos[label] = glutils.read_glfo(
            gldir, args.locus
        )  # this is gonna fail for tigger since you only have the .fa

    if ref_label is not None:
        for label in [l for l in glslabels if l != ref_label]:
            print '    syncronizing %s names to match %s' % (label, ref_label)
            glutils.synchronize_glfos(ref_glfo=glfos[ref_label],
                                      new_glfo=glfos[label],
                                      region=args.region)

    gl_sets = {
        label:
        {g: seq
         for g, seq in glfos[label]['seqs'][args.region].items()}
        for label in glfos
    }
    all_genes = {g: s for gls in gl_sets.values() for g, s in gls.items()}

    return all_genes, gl_sets
import utils
import glutils
from clusterpath import ClusterPath

# command line: which output file to read, and where to get germline info if the file doesn't have it
parser = argparse.ArgumentParser()
parser.add_argument('--fname',
                    default=partis_dir +
                    '/test/reference-results/partition-ref-simu.yaml')
parser.add_argument('--glfo-dir', default=partis_dir + '/data/germlines/human')
parser.add_argument('--locus', default='igh')
args = parser.parse_args()

# germline info is only read separately for csv files (otherwise we pass None and take whatever read_output() returns)
glfo = None
if utils.getsuffix(args.fname) == '.csv':
    print '  reading deprecated csv format, so need to read germline info from somewhere else, using --glfo-dir %s, hopefully it works' % args.glfo_dir
    glfo = glutils.read_glfo(args.glfo_dir, locus=args.locus)

glfo, annotation_list, cpath = utils.read_output(args.fname, glfo=glfo)

# nothing more to do if there's no partition info: print one annotation and exit
if cpath is None or len(cpath.partitions) == 0:
    print 'no partitions read from %s, so just printing first annotation:' % args.fname
    utils.print_reco_event(annotation_list[0])
    sys.exit(0)

print utils.color('green', 'list of partitions:')
cpath.print_partitions(
    abbreviate=True
)  # 'abbreviate' print little 'o's instead of the full sequence ids

# print annotations for the biggest cluster in the most likely partition
annotations = {
Beispiel #15
0
    print 'WARNING current script dir %s doesn\'t exist, so python path may not be correctly set' % partis_dir
sys.path.insert(1, partis_dir + '/python')
import utils
from hist import Hist
import plotting
import glutils

# two required positional arguments: the input file and a plot dir
parser = argparse.ArgumentParser()
parser.add_argument('infile')
parser.add_argument('plotdir')
args = parser.parse_args()

def gk(uids):
    """Return a single key string made by joining the strings in <uids> with colons."""
    separator = ':'
    return separator.join(uids)

# germline info is read from a dir named like the input file, but with '.csv' replaced by '-glfo'
glfo = glutils.read_glfo(args.infile.replace('.csv', '-glfo'), locus='igh')

# read every successful annotation in the csv, keyed by its colon-joined uids
annotations = {}
with open(args.infile) as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        if line['v_gene'] == '':  # failed (i.e. couldn't find an annotation)
            continue
        utils.process_input_line(line)  # converts strings in the csv file to floats/ints/dicts/etc.
        utils.add_implicit_info(glfo, line)  # add stuff to <line> that's useful, isn't written to the csv since it's redundant
        annotations[gk(line['unique_ids'])] = line

# chimera info for the first sequence (iseq=0) in each annotation
chfo = {uid : utils.get_chimera_max_abs_diff(annotations[uid], iseq=0) for uid in annotations}
# print the ten entries with the largest value at index 1 of their chimera info
biggest_adiffs = sorted(chfo, key=lambda q: chfo[q][1], reverse=True)
for uid in biggest_adiffs[:10]:
    print chfo[uid]
from Bio.Seq import Seq

partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')
import utils
import glutils
from clusterpath import ClusterPath

# all four options are optional strings as far as argparse is concerned (no types or defaults are set)
parser = argparse.ArgumentParser()
parser.add_argument('--infile')
parser.add_argument('--locus')
parser.add_argument('--param')
parser.add_argument('--nclust')
args = parser.parse_args()

# germline info comes from the hmm germline sets inside the --param dir
glfo = glutils.read_glfo(args.param + '/hmm/germline-sets', locus=args.locus)

print(sys.argv)
print 'infile =', args.infile
print 'param =', args.param

# read the partitions, and pick out the one at the cluster path's best index
cp = ClusterPath()
cp.readfile(args.infile)
best_partition = cp.partitions[cp.i_best]
# sorted_clusters = sorted(best_partition, key=len, reverse=True)  # sort by size

# clonal family attributes to print
print '''

score = interest score, indicating interesting attributes: size, SHM, SFS, bnAb VH usage
Beispiel #17
0
# the two germline dirs to compare
parser.add_argument('gldir1')
parser.add_argument('gldir2')
parser.add_argument(
    '--names',
    default='+gl-1:+gl-2',
    help=
    'colon-separated list of length 2 with labels for gldir1 and gldir2, which will be appended to each gene name in the ascii output'
)
parser.add_argument('--locus', default='igh')
args = parser.parse_args()
args.names = utils.get_arg_list(args.names)  # convert the --names string to a list

# read the germline info from each dir, in the same order as <args.names>
glfos = []
for name, gldir in zip(args.names, [args.gldir1, args.gldir2]):
    print '%s:' % utils.color('yellow', name)
    glfos.append(glutils.read_glfo(gldir, args.locus, debug=True))

for region in [r for r in utils.regions if r in glfos[0]['seqs']]:
    aset, bset = [set(g['seqs'][region]) for g in glfos]

    tmpfo = glutils.get_empty_glfo(
        args.locus)  # make a new glfo that will only have non-shared genes
    for glabel, gset, gfo in zip(
            args.names, [aset - bset, bset - aset],
            glfos):  # <gset> is the genes that're only in <glabel>
        for ogene in gset:
            glutils.add_new_allele(tmpfo, {
                'gene': '+'.join([ogene, glabel]),
                'seq': gfo['seqs'][region][ogene],
                'cpos': utils.cdn_pos(gfo, region, ogene)
            },
#!/usr/bin/env python
import csv
import sys

partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')
import utils
import glutils
from clusterpath import ClusterPath

# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus='igh')

print 'first parse an annotation csv file:'
with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        if line['v_gene'] == '':  # failed (i.e. couldn't find an annotation)
            continue
        utils.process_input_line(line)
        utils.add_implicit_info(glfo, line)
        utils.print_reco_event(line)
        break

print 'then parse a partition csv file:'
cp = ClusterPath()
cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv')
cp.print_partitions(abbreviate=True)
Beispiel #19
0
current_script_dir = os.path.dirname(os.path.realpath(__file__)).replace('/bin', '/python')
if not os.path.exists(current_script_dir):
    print 'WARNING current script dir %s doesn\'t exist, so python path may not be correctly set' % current_script_dir
sys.path.insert(1, current_script_dir)

import utils
import glutils
import plotting

# ----------------------------------------------------------------------------------------
datadir = 'data/germlines/human'
xtitles = {
    'indels' : 'fraction of positions indel\'d',
    'subs' : 'substitution fraction'
}
glfo = glutils.read_glfo(datadir)
vgenes = glfo['aligned-genes']['v'].keys()
pversions = OrderedDict()
for vg in vgenes:
    pv = utils.primary_version(vg)
    if pv not in pversions:
        pversions[pv] = []
    pversions[pv].append(vg)

# remove primary versions that only have one gene
for pv in pversions:
    if len(pversions[pv]) == 1:
        print 'removing single-gene pv %s' % pv
        del pversions[pv]

# ----------------------------------------------------------------------------------------
Beispiel #20
0
# '@' in a name stands in for a space; note that this happens *before* the names are used as subdir names below
for iname in range(len(args.names)):
    args.names[iname] = args.names[iname].replace('@', ' ')

# if you just pass in one parent directory, we assume <args.names> contains the desired subdirs
if len(args.plotdirs) == 1:
    parentdir = args.plotdirs[0]
    args.plotdirs = [parentdir + '/' + n for n in args.names]

# from here on <args.plotdirs> and <args.names> are treated as parallel lists
if len(args.plotdirs) != len(args.names):
    raise Exception('poorly formatted args:\n  %s\n  %s' %
                    (' '.join(args.plotdirs), ' '.join(args.names)))

# if args.gldir is not 'none':
# <args.glfo> stays None unless <args.gldir> exists on disk
args.glfo = None
if os.path.exists(args.gldir):
    args.glfo = glutils.read_glfo(args.gldir, args.locus)

# figure out if there's subdirs we need to deal with
# <listof_plotdirlists> and <listof_outdirs> are parallel: one list of input dirs per output dir
listof_plotdirlists, listof_outdirs = [], []
firstdir = args.plotdirs[0]  # only the first plotdir is inspected for csvs/subdirs; the others are assumed to have the same layout
if len(glob.glob(firstdir +
                 '/*.csv')) > 0:  # add the parent dirs if they've got csvs
    listof_plotdirlists.append(args.plotdirs)
    listof_outdirs.append(args.outdir)
# every subdirectory of <firstdir> adds one more comparison, written to a subdir of <args.outdir> with the same name
for subdir in [
        d for d in os.listdir(firstdir) if os.path.isdir(firstdir + '/' + d)
]:
    listof_plotdirlists.append([d + '/' + subdir for d in args.plotdirs])
    listof_outdirs.append(args.outdir + '/' + subdir)

for dlist, outdir in zip(listof_plotdirlists, listof_outdirs):
Beispiel #21
0
    def get_alleles(self, swfo, plotdir=None, debug=False):
        print 'clustering for new alleles'

        # NOTE do *not* modify <self.glfo> (in the future it would be nice to just modify <self.glfo>, but for now we need it to be super clear in partitiondriver what is happening to <self.glfo>)
        default_initial_glfo = self.glfo
        if self.args.default_initial_germline_dir is not None:  # if this is set, we want to take any new allele names from this directory's glfo if they're in there
            default_initial_glfo = glutils.read_glfo(self.args.default_initial_germline_dir, self.glfo['locus'])
            glfo_to_modify = copy.deepcopy(default_initial_glfo)  # so we can add new genes to it, so we can check for equivalency more easily TODO fix that shit, obviously
        else:
            print '  %s --default-initial-germline-dir isn\'t set, so new allele names won\'t correspond to existing names' % utils.color('yellow', 'warning')

        qr_seqs, threshold = self.choose_clonal_representatives(swfo, debug=debug)
        if qr_seqs is None:
            return {}

        # self.check_for_donuts(debug=debug)

        if not self.args.kmeans_allele_cluster:
            clusterfos, msa_info = self.vsearch_cluster_v_seqs(qr_seqs, threshold, debug=debug)
        else:
            clusterfos = self.kmeans_cluster_v_seqs(qr_seqs, swfo, plotdir=plotdir, debug=debug)
            msa_info = clusterfos

        # and finally loop over each cluster, deciding if it corresponds to a new allele
        if debug:
            print '  looping over %d clusters with %d sequences' % (len(clusterfos), sum([len(cfo['seqfos']) for cfo in clusterfos]))
            print '   rank  seqs   v/j mfreq                 seqs      snps (%s)' % utils.color('blue', 'indels')
        new_alleles = {}
        n_existing_gene_clusters = 0
        for iclust in range(len(clusterfos)):
            clusterfo = clusterfos[iclust]

            # dot_products = [utils.dot_product(clusterfo['cons_seq'], seq1, seq2) for seq1, seq2 in itertools.combinations([seqfo['seq'] for seqfo in clusterfo['seqfos']], 2)]
            # mean_dot_product = numpy.average(dot_products)

            # choose the most common existing gene to use as a template (the most similar gene might be a better choice, but deciding on "most similar" would involve adjudicating between snps and indels, and it shouldn't really matter)
            sorted_glcounts, true_sorted_glcounts = self.get_glcounts(clusterfo)
            template_gene, template_counts = sorted_glcounts[0]
            template_seq = self.glfo['seqs'][self.region][template_gene]
            template_cpos = utils.cdn_pos(self.glfo, self.region, template_gene)

            assert '.' not in clusterfo['cons_seq']  # make sure you haven't switched to something that doesn't use '-' for gap chars
            new_seq = clusterfo['cons_seq'].replace('-', '')  # I'm not sure that I completely understand the dashes in this sequence, but it seems to be right to just remove 'em

            aligned_template_seq, aligned_new_seq = utils.align_seqs(template_seq, clusterfo['cons_seq'])
            has_indels = '-' in aligned_template_seq.strip('-') or '-' in aligned_new_seq.strip('-')  # only counts internal indels
            cluster_mfreqs = {r : [self.mfreqs[r][seqfo['name']] for seqfo in clusterfo['seqfos']] for r in self.mfreqs}  # regional mfreqs for each sequence in the cluster corresponding to the initially-assigned existing gene
            mean_cluster_mfreqs = {r : numpy.mean(cluster_mfreqs[r]) for r in cluster_mfreqs}

            equiv_name, equiv_seq = glutils.find_equivalent_gene_in_glfo(glfo_to_modify, new_seq, template_cpos)
            if equiv_name is not None:
                new_name = equiv_name
                new_seq = equiv_seq
            else:
                new_name, _ = glutils.choose_new_allele_name(template_gene, new_seq, indelfo={'indels' : ['xxx', 'xxx', 'xxx']} if has_indels else None)  # the fcn just checks to see if it's non-None and of length greater than zero...TODO it would be nice to figure out actual snp and indel info

            if debug:
                self.print_cluster(iclust, clusterfo, sorted_glcounts, new_seq, true_sorted_glcounts, mean_cluster_mfreqs, has_indels)

            if new_name in self.glfo['seqs'][self.region]:  # note that this only looks in <self.glfo>, not in <new_alleles>
                n_existing_gene_clusters += 1
                if debug:
                    print 'existing %s' % utils.color_gene(new_name)
                continue

            if new_name in new_alleles:  # already added it NOTE might make more sense to use <glfo_to_modify> here instead of <new_alleles> (or just not have freaking both of them)
                if debug:
                    print '%s (%s)' % (utils.color_gene(new_name), utils.color('red', 'new'))
                continue
            assert new_seq not in new_alleles.values()  # if it's the same seq, it should've got the same damn name

            if not has_indels:  # we assume that the presence of indels somewhat precludes false positives, which is equivalent to an assumption about the rarity of shm indels
                if self.too_close_to_existing_glfo_gene(clusterfo, new_seq, template_seq, template_cpos, template_gene, debug=debug):  # presumably if it were really close to another (non-template) existing glfo gene, that one would've been the template
                    continue

                if mean_cluster_mfreqs['j'] > 0. and self.mean_mfreqs['j'] > 0.:
                    this_cluster_ratio = mean_cluster_mfreqs['v'] / mean_cluster_mfreqs['j']
                    overall_ratio = self.mean_mfreqs['v'] / self.mean_mfreqs['j']
                    if this_cluster_ratio / overall_ratio < self.mfreq_ratio_threshold:
                        if debug:
                            print 'v / j cluster mfreqs too small %6.3f / %6.3f = %6.3f < %6.3f' % (this_cluster_ratio, overall_ratio, this_cluster_ratio / overall_ratio, self.mfreq_ratio_threshold)
                        continue

            if self.too_close_to_already_added_gene(new_seq, new_alleles, debug=debug):  # this needs to be applied even if there are indels, since the indels are with respect to the (existing glfo) template gene, not to the [potentially] previously-added gene
                continue

            print '%s %s%s' % (utils.color('red', 'new'), utils.color_gene(new_name), ' (exists in default germline dir)' if new_name in default_initial_glfo['seqs'][self.region] else '')
            new_alleles[new_name] = {'template-gene' : template_gene, 'gene' : new_name, 'seq' : new_seq}
            if new_alleles[new_name]['gene'] not in glfo_to_modify['seqs'][self.region]:  # if it's in <default_initial_glfo> it'll already be in there
                glutils.add_new_allele(glfo_to_modify, new_alleles[new_name])  # just so we can check for equivalency

        if debug:
            print '  %d / %d clusters consensed to existing genes' % (n_existing_gene_clusters, len(msa_info))

        self.reassign_template_counts(msa_info, new_alleles, debug=False)
        for new_name, newfo in new_alleles.items():
            # print '%s  %s  %.1f / %.1f = %.4f' % (new_name, newfo['template-gene'], self.adjusted_glcounts[newfo['template-gene']], float(sum(self.adjusted_glcounts.values())), self.adjusted_glcounts[newfo['template-gene']] / float(sum(self.adjusted_glcounts.values())))
            if self.adjusted_glcounts[newfo['template-gene']] / float(sum(self.adjusted_glcounts.values())) < self.args.min_allele_prevalence_fraction:  # NOTE self.adjusted_glcounts only includes large clusters, and the constituents of those clusters are clonal representatives, so this isn't quite the same as in alleleremover
                newfo['remove-template-gene'] = True

        return new_alleles
Beispiel #22
0
def partis_naive_seq(lseq, fnam):
    '''
    Given a number of sequences infer the naive sequence using partis.
    '''
    # Specify filenames:
    pretty_random_fnam = str(random.randint(1, 10**100))
    inpf = pretty_random_fnam + '_input'
    outf = pretty_random_fnam + '_output'
    # Write input fasta file for partis:
    with open(TMPDIR+'/'+inpf+'.fa', 'w') as fho:
        for i, s in enumerate(lseq):
            fho.write('>{}\n{}\n'.format(str(i), s))
    # Run partis:
    cmd = '{}/bin/partis partition --locus {} --species {} --infname {}/{}.fa --outfname {}/{}.csv'.format(partis_path, args.LOCUS, args.SPECIES, TMPDIR, inpf, TMPDIR, outf)
    # os.system(cmd)  # Print partis STDOUT to screen
    os.system('{} > {}/{}.log'.format(cmd, TMPDIR, pretty_random_fnam))

    try:
        # Read the partis output file and extract the naive sequence:
        with open(TMPDIR+'/'+outf+'-cluster-annotations.csv') as fh:
            reader = csv.DictReader(fh)
            data = list(reader)
        # assert(len(data) == 1)  # There should really only be one clonal family, but there often are, so just take the first (largest)
        # Extract germline bounds info and trim the naive DNA sequence:
        try:
            utils.process_input_line(data[0])       # Process dataframe row
            fnam_base = fnam.split('_partitions')[0].split('/')
            #glfo = glutils.read_glfo('{}/_output/{}/hmm/germline-sets'.format(fnam_base[0], fnam_base[-1]), locus=args.LOCUS)
            glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus=args.LOCUS)
            utils.add_implicit_info(glfo, data[0])  # Adding germline infor
        except Exception as e:
            print e
            raise e

        naiveDNA = data[0]['naive_seq'][:]
        first_lseq = data[0]['input_seqs'][:][0]
        vj_bounds = (data[0]['regional_bounds']['v'][0], data[0]['regional_bounds']['j'][1])
        naiveDNA = repair_new_naive(naiveDNA[:], naiveDNA[:], vj_bounds)
        first_lseq = repair_new_naive(first_lseq, naiveDNA[:], vj_bounds)
        try:
            assert(len(first_lseq) == len(naiveDNA))
        except:
            print 'len(first_lseq) != len(data[0]["naive_seq"])'
            print len(first_lseq)
            print first_lseq
            print len(naiveDNA)
            print naiveDNA
        # If the inferred naive sequence contains a stop codon replace it by the input sequence codon:
        if '*' in str(Seq(naiveDNA, generic_dna).translate()):
            print 'Found stop codon in inferred naive sequnce, will replace with input sequence codon.'
            print 'Before replacement:', naiveDNA
            naiveDNA_l = list(naiveDNA[:])
            for codon in range(vj_bounds[0], vj_bounds[1], 3):
                if '*' == str(Seq(naiveDNA[codon:codon+3], generic_dna).translate()):
                    naiveDNA_l[codon:codon+3] = first_lseq[codon:codon+3]
            naiveDNA = ''.join(naiveDNA_l)
            print 'After replacement:', naiveDNA
        if naiveDNA == first_lseq:
            print 'Complaining to say naiveDNA == first_lseq (nothing bad just to be sure the repair is not just replacing the naive sequence with the input entirely)'

        return(naiveDNA)
    finally:
        # Clean up:
        os.system('rm -r {}/{}* _output/*{}*'.format(TMPDIR, pretty_random_fnam, pretty_random_fnam))
Beispiel #23
0
def extract_seqs(fnam):
    '''
    Reads a partis cluster-annotations file and extracts relevant information and sequences.
    '''
    # Read cluster annotations into a data list of dictionaries:
    with open(fnam) as fh:
        reader = csv.DictReader(fh)
        data = list(reader)

    sequences_i = list()
    info_i = list()

    if args.allele_finding:
        fnam_base = fnam.split('_partitions')[0].split('/')
        glfo = glutils.read_glfo('{}/_output/{}/hmm/germline-sets'.format(fnam_base[0], fnam_base[-1]), locus=args.LOCUS)
    else:
        glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus=args.LOCUS)
    for row in data:
        # Process the partis data row and add germline information:
        try:
            utils.process_input_line(row)
            # Read default germline info
            utils.add_implicit_info(glfo, row)
        except Exception as e:  # Skip rows that cannot be processed
            if 'failed annotation' not in e:
                pass
                # print('First skip')
                # print(e)
            else:
                print 'Reading from'
                print '{}/_output/{}/hmm/germline-sets'.format(fnam_base[0], fnam_base[-1])
                print e
            continue

#        # Process the partis data row and add germline information:
#        try:
#            utils.process_input_line(row)
#            utils.add_implicit_info(glfo, row)
#        except:  # Skip rows that cannot be processed
#            continue

        # Extract the full N padded naive sequence,
        # and find the v -and j gene bound on this naive sequence:
        cdr3_bounds = (row['codon_positions']['v'], row['codon_positions']['j'] + 3)
        vj_bounds = (row['regional_bounds']['v'][0], row['regional_bounds']['j'][1])
        naiveDNA = row['naive_seq']
        # Skip naive sequences too short or with stop codons:
        if repair_seq(naiveDNA, naiveDNA, vj_bounds, keep_check=True) is False:
            continue
        trimmed_naiveDNA = repair_seq(naiveDNA[:], naiveDNA[:], vj_bounds)
        naiveAA = str(Seq(trimmed_naiveDNA, generic_dna).translate())

        # There has been a name change and this try/except is meant to provide backwards compatability:
        try:
            lseq = row['input_seqs'][:]
        except:
            lseq = row['seqs'][:]
        ir_lseq = row['indel_reversed_seqs']
        stop_seq = row['stops']
        assert(len(lseq) == len(ir_lseq))
        assert(len(lseq) == len(stop_seq))
        # Only keep sequences without indels and stop codons and minimum length amino acid length (QC):
        ### ir_lseq[i] == '' or lseq[i] == ir_lseq[i]  <-- No indels
        ### stop_seq[i]  <-- No partis annotated stops (there seems still to be stops after these are removed though)
        ### repair_seq(lseq[i], naiveDNA, vj_bounds, keep_check=True)  <-- Checks whether the sequence is long enougth or have stop codons
        keep_idx = [1 if ((ir_lseq[i] == '' or lseq[i] == ir_lseq[i]) and stop_seq[i] is False and repair_seq(lseq[i], naiveDNA, vj_bounds, keep_check=True)) else 0 for i in range(len(lseq))]

        # Now only keep those sequences that passed QC:
        lseq = [s for s, keep in zip(lseq, keep_idx) if keep == 1]
        # Get amino acid sequences:
        lAAseq = [str(Seq(repair_seq(s[:], naiveDNA[:], vj_bounds), generic_dna).translate()) for s in lseq]
        # And mutation frequencies:
        mut_freqs = [s for s, keep in zip(row['mut_freqs'], keep_idx) if keep == 1]
        assert(len(mut_freqs) == len(lseq))
        # Convert frequency to counts:
        Nmuts = [int(round(float(t[0])*len(t[1].strip('N')))) for i, t in enumerate(zip(mut_freqs, lseq))]

        # Deduplicate AAseqs and lseq according to the duplications on amino acid level:
        lAAseq_dict = dict()
        lseq_unique = list()
        for i, aa in enumerate(lAAseq):
            if aa in lAAseq_dict:
                lAAseq_dict[aa].append(i)
            else:
                lAAseq_dict[aa] = [i]
                lseq_unique.append(repair_seq(lseq[i][:], naiveDNA[:], vj_bounds))
        assert(len(lAAseq_dict) == len(lseq_unique))
        # Make the deduplicated sequence list and the mutation rates:
        lAAseq_dedup = list()
        Nmuts_dedup = list()
        for aa, idxs in lAAseq_dict.items():
            lAAseq_dedup.append(aa)
            Nmut_list = [float(Nmuts[i]) for i in idxs]
            Nmuts_dedup.append(int(round(sum(Nmut_list)/len(Nmut_list))))
        assert(len(lAAseq_dedup) == len(Nmuts_dedup))
        assert(len(lAAseq_dedup) == len(lseq_unique))

        # Exclude small clonal families after all the QC and deduplication:
        if len(lAAseq_dedup) < args.MIN_OBS:
            continue

        # Store the results in a list:
        sequences_i.append(['naive_seq', naiveAA])  # This format is for ANARCI numbering
        info_i.append({'fnam': fnam, 'v_gene': row['v_gene'], 'd_gene': row['d_gene'], 'j_gene': row['j_gene'],
                       'naive_seq': naiveAA, 'naive_seq_DNA': trimmed_naiveDNA, 'Nmuts': Nmuts_dedup[:],
                       'AAseqs': lAAseq_dedup[:], 'DNAseqs': lseq_unique[:]})
    return(sequences_i, info_i)
Beispiel #24
0
def _get_clonal_family_stats(path_to_annotations,
                             metadata,
                             use_np=False,
                             use_immunized=False,
                             locus=''):
    '''
    get data statistics from partis annotations

    @param path_to_annotations: path to partis annotations
    @param metadata: path to partis metadata
    @param use_np: use nonproductive seqs?
    @param use_immunized: for Cui data, use immunized mice?
    @param locus: which locus to use

    @return list of dicts (keys: n_taxa, germline_sequence, germline_name, v_call),
        one per clonal family that has at least one sequence passing the filter
    '''

    partition_info = get_partition_info(
        path_to_annotations,
        metadata,
    )

    if use_np:
        # return only nonproductive sequences
        # here "nonproductive" is defined as having a stop codon or being
        # out of frame or having a mutated conserved cysteine
        def good_seq(seqs):
            # 'stops', 'in_frames' and 'mutated_invariants' hold one value per
            # sequence, so the test has to be applied per sequence (applying
            # "or"/"not" to the lists themselves just hands back the 'stops' list)
            return [
                has_stop or not in_frame or mutated
                for has_stop, in_frame, mutated in zip(
                    seqs['stops'], seqs['in_frames'],
                    seqs['mutated_invariants'])
            ]
    else:
        # return all sequences
        def good_seq(seqs):
            return [True] * len(seqs['seqs'])

    all_germline_dicts = []
    for data_info in partition_info:
        if use_immunized and data_info['group'] != 'immunized':
            continue
        # NOTE(review): with the default locus='' this skips every dataset, so
        # the function returns an empty list unless a locus is passed; confirm
        # whether "no locus" was meant to select all loci instead
        if not locus or data_info['locus'] != locus:
            continue
        PARTIS_PATH = os.path.dirname(os.path.realpath(__file__)) + '/partis'
        sys.path.insert(1, PARTIS_PATH + '/python')
        from utils import add_implicit_info, process_input_line
        import glutils
        glfo = glutils.read_glfo(data_info['germline_file'],
                                 locus=data_info['locus'])
        with open(data_info['annotations_file'], "r") as csvfile:
            reader = csv.DictReader(csvfile)
            for idx, line in enumerate(reader):
                if len(line['input_seqs']) == 0:
                    # sometimes data will have empty clusters
                    continue
                # add goodies from partis
                process_input_line(line)
                add_implicit_info(glfo, line)
                good_seq_idx = [
                    i for i, is_good in enumerate(good_seq(line)) if is_good
                ]
                if not good_seq_idx:
                    # no sequences pass the filter... skip
                    continue
                all_germline_dicts.append({
                    'n_taxa':
                    len(good_seq_idx),
                    'germline_sequence':
                    disambiguate(line['v_gl_seq'].lower()),
                    # idx is the row number within this annotations file
                    'germline_name':
                    '-'.join([line['v_gene'], str(idx)]),
                    'v_call':
                    line['v_gene'],
                })

    return all_germline_dicts
Beispiel #25
0
def write_partis_data_from_annotations(
    output_genes,
    output_seqs,
    path_to_annotations,
    metadata,
    filters=None,
    seq_filters=None,
    min_clonal_family_size=0,
    min_seq_len=0,
    max_mut_pct=1.,
    min_mut_pct=0.,
    clone_str='',
    region='v',
    germline_family='v',
):
    """
    Function to read partis annotations csv

    @param output_genes: path of the csv to write germline names/sequences to (one row per kept clonal family)
    @param output_seqs: path of the csv to write the kept sequences to (one row per kept sequence)
    @param path_to_annotations: path to annotations files
    @param metadata: csv file of metadata; if None defaults will be used for chain/species
    @param filters: dictionary of lists with keys as column name and items as those values of the column variable to retain;
        filters out families, e.g., {'locus': ['igk']}, etc. (None, the default, means no filtering)
    @param seq_filters: same as filters, but for sequences, e.g., {'indel_reversed_seqs': [''], 'in_frames': [False]} will
        only retain sequences that are out of frame and did not have an indel (None, the default, means no filtering)
    @param min_clonal_family_size: minimum clonal family size
    @param min_seq_len: minimum sequence length (not counting 'n' characters); sequences must be strictly longer
    @param max_mut_pct: maximum mutation percentage (exclusive)
    @param min_mut_pct: minimum mutation percentage (inclusive)
    @param clone_str: string for identifying clones (useful if merging annotations from multiple datasets)
    @param region: B-cell receptor region ('v', 'd', 'j', or 'vdj')
    @param germline_family: for performing cross validation ('v', 'd', or 'j')

    @write genes to output_genes and seqs to output_seqs
    @raise ValueError if germline_family or region is not one of the allowed values
    """

    # None rather than {} as the default, so that no dict object is shared between calls
    filters = {} if filters is None else filters
    seq_filters = {} if seq_filters is None else seq_filters

    families = ['v', 'd', 'j']
    if germline_family not in families:
        raise ValueError("Invalid germline_family: %s. Must be one of %s" %
                         (germline_family, families))

    regions = ['v', 'd', 'j', 'vdj']
    if region not in regions:
        raise ValueError("Invalid region: %s. Must be one of %s" %
                         (region, regions))

    PARTIS_PATH = os.path.dirname(os.path.realpath(__file__)) + '/partis'
    sys.path.insert(1, PARTIS_PATH + '/python')
    from utils import add_implicit_info, process_input_line
    import glutils

    partition_info = get_partition_info(
        path_to_annotations,
        metadata,
    )

    with open(output_genes, 'w') as genes_file, open(output_seqs,
                                                     'w') as seqs_file:
        gene_writer = csv.DictWriter(genes_file,
                                     ['germline_name', 'germline_sequence'])
        gene_writer.writeheader()

        seq_header = [
            'germline_name',
            'sequence_name',
            'sequence',
            'germline_family',
            'v_gene',
            'region',
        ]
        # every per-dataset field is also written out for each sequence
        seq_header.extend(partition_info[0].keys())

        seq_writer = csv.DictWriter(seqs_file, seq_header)
        seq_writer.writeheader()
        for data_idx, data_info in enumerate(partition_info):
            if any(data_info[key] not in values
                   for key, values in filters.items()):
                continue
            glfo = glutils.read_glfo(data_info['germline_file'],
                                     locus=data_info['locus'])
            with open(data_info['annotations_file'], "r") as csvfile:
                reader = csv.DictReader(csvfile)
                for idx, line in enumerate(reader):
                    if line['v_gene'] == '':
                        # failed annotations
                        continue

                    # add goodies from partis
                    process_input_line(line)
                    add_implicit_info(glfo, line)
                    n_seqs = len(line['input_seqs'])
                    if n_seqs < min_clonal_family_size:
                        # don't take small clonal families---for data quality purposes
                        continue

                    if region == 'vdj':
                        gl_seq = line['naive_seq'].lower()
                        all_seqs = [seq.lower() for seq in line['seqs']]
                    else:
                        # NOTE(review): the v sequences are used for region 'd'
                        # and 'j' as well as for 'v'; confirm this is intended
                        gl_seq = line['v_gl_seq'].lower()
                        all_seqs = [seq.lower() for seq in line['v_qr_seqs']]

                    # each entry is the set of sequence indices passing one
                    # filter; a sequence is kept only if it is in all of them
                    idx_list = []
                    # frequency filter
                    idx_list.append(
                        set([
                            i for i, val in enumerate(line['mut_freqs'])
                            if min_mut_pct <= val < max_mut_pct
                        ]))
                    # sequence length filter ('n' characters are not counted)
                    idx_list.append(
                        set([
                            i for i, val in enumerate(all_seqs)
                            if len(val.replace('n', '')) > min_seq_len
                        ]))
                    for key, values in seq_filters.items():
                        idx_list.append(
                            set([
                                i for i, val in enumerate(line[key])
                                if val in values
                            ]))

                    good_seq_idx = set.intersection(*idx_list)
                    if not good_seq_idx:
                        # no sequences after filtering... skip
                        continue

                    gl_name = 'clone{}-{}-{}'.format(data_idx, idx, clone_str)
                    gene_writer.writerow({
                        'germline_name': gl_name,
                        'germline_sequence': gl_seq,
                    })

                    for good_idx in good_seq_idx:
                        base_dict = {
                            'germline_name':
                            gl_name,
                            'sequence_name':
                            '-'.join([gl_name, line['unique_ids'][good_idx]]),
                            'sequence':
                            all_seqs[good_idx],
                            # first five characters of the gene name
                            'germline_family':
                            line['{}_gene'.format(germline_family)][:5],
                            'v_gene':
                            line['v_gene'],
                            'region':
                            region,
                        }
                        # copy in the per-dataset fields (these can overwrite
                        # the keys above if the names collide)
                        base_dict.update(data_info)

                        seq_writer.writerow(base_dict)
0
parser.add_argument('--strings-to-ignore')  # remove this string from the plot names in each dir (e.g. '-mean-bins') NOTE replaces '_' with '-'

print 'TODO this should really be an importable module, not its own script'

args = parser.parse_args()
# swap underscores for dashes before the string gets converted to a list further down
if args.strings_to_ignore is not None:
    args.strings_to_ignore = args.strings_to_ignore.replace('_', '-')
# convert the list-valued command line args with utils.get_arg_list() (presumably from delimiter-separated strings to lists -- confirm in utils)
args.plotdirs = utils.get_arg_list(args.plotdirs)
args.scale_errors = utils.get_arg_list(args.scale_errors)
args.colors = utils.get_arg_list(args.colors, intify=True, translation={810 : 'red', 634 : 'darkred', 596 : 'mediumblue', 418 : 'green', 798 : 'goldenrod', 869 : 'lightseagreen'})
args.str_colors = utils.get_arg_list(args.str_colors)
# string colors, if given, take precedence over the integer color codes above
if args.str_colors is not None:
    args.colors = args.str_colors
args.linestyles = utils.get_arg_list(args.linestyles, intify=True, translation={1 : '-',2 : '--'})
args.names = utils.get_arg_list(args.names)
args.leaves_per_tree = utils.get_arg_list(args.leaves_per_tree, intify=True)
args.strings_to_ignore = utils.get_arg_list(args.strings_to_ignore)
args.markersizes = utils.get_arg_list(args.markersizes, intify=True)
args.linewidths = utils.get_arg_list(args.linewidths, intify=True)
args.alphas = utils.get_arg_list(args.alphas, floatify=True)
# '@' in a name stands in for a space
for iname in range(len(args.names)):
    args.names[iname] = args.names[iname].replace('@', ' ')

# plotdirs and names are treated as parallel lists
assert len(args.plotdirs) == len(args.names)

# hand the conserved codon positions from the germline info to the plotting code via <args>
glfo = glutils.read_glfo(args.datadir)
args.cyst_positions = glfo['cyst-positions']
args.tryp_positions = glfo['tryp-positions']

plotting.compare_directories(args)
Beispiel #27
0
def parse_args():
    """Build the command line parser, parse sys.argv, and return the namespace decorated with derived values.

    On top of the raw options, the returned namespace has:
      unique_ids: --unique-ids if given, otherwise the (stripped) contents of --unique-ids-file
      germline_sets: '<parameter_dir>/hmm/germline-sets', or the module-level default_germline_sets if
        --parameter-dir was omitted
      glfo: germline info read from germline_sets for --locus
    """
    def existing_file(fname):
        """Argparse type for an existing file"""
        if not os.path.isfile(fname):
            raise ValueError("Can't find file: " + str(fname))
        return fname

    def stripped_file_contents(fname):
        """Argparse type that returns the contents of <fname>, stripped of surrounding whitespace"""
        # use a context manager so the handle is closed right away (the previous file(x).read() never closed it)
        with open(fname) as infile:
            return infile.read().strip()

    parser = argparse.ArgumentParser(description=__doc__)
    inputs = parser.add_argument_group(title="Input files",
                                       description="(required)")
    inputs.add_argument('--partition-file',
                        help='partitions file as output by partis',
                        type=existing_file,
                        required=True)
    inputs.add_argument(
        '--upstream-seqmeta',
        help=
        'optionally, specify upstream seqmeta as a csv with cols: unique_id,timepoint,multiplicity',
        # Index rows by unique id
        type=csv_reader('unique_id'))

    outputs = parser.add_argument_group(title="Output files",
                                        description="(optional)")
    outputs.add_argument('--seqmeta-out',
                         help='per sequence metadata CSV file')
    outputs.add_argument('--seqs-out',
                         help='cluster sequences as a FASTA file')
    outputs.add_argument('--cluster-meta-out',
                         help='cluster sequences as a JSON file')
    # If we support a recursive option, we have to name I guess?
    #outputs.add_argument(
    #'--process-all-data-to',
    #help="writes all data for all partitions/clusters to the specified directory")

    partis_args = parser.add_argument_group(
        title="Partis args",
        description=
        """These arguments (as passed to partis) are required in order to process the data correctly."""
    )
    partis_args.add_argument(
        '--parameter-dir',
        help=
        'parameter dir path, as passed to partis (if omitted, gls assumed to be '
        + default_germline_sets + ')')
    partis_args.add_argument('--locus',
                             help='again, as passed to partis',
                             required=True)

    cluster_selection_args = parser.add_argument_group(
        title="Cluster selection args",
        description=
        """Given a partition file and associated cluster annotation file, there may be multiple
        clusters one might extract data for. These options allow you to specify a selection."""
    )
    cluster_selection_args.add_argument(
        '--partition',
        type=int,
        default=0,
        help=
        '"best plus" index; defaults to 0 (best partition); 1 selects the next partition step, etc.'
    )
    cluster_selection_args.add_argument(
        '--cluster',
        type=int,
        help=
        """index of cluster in partition-file after sorting by cluster size; defaults to seed cluster if
        seeded and 0 (the largest cluster) otherwise.""")
    # add a non sorted version?
    cluster_selection_args.add_argument(
        '--unique-ids',
        help='select a specific cluster using its unique_ids signature')
    cluster_selection_args.add_argument(
        '--unique-ids-file',
        help=
        'select a specific cluster using its unique_ids signature in a single line in a file',
        type=stripped_file_contents)

    other_args = parser.add_argument_group(title="Other options")
    other_args.add_argument(
        '--remove-frameshifts',
        help='if set, removes seqs with frameshifted indels from output',
        action="store_true")
    other_args.add_argument(
        '--remove-stops',
        help='if set, removes seqs with stop codons from output',
        action="store_true")
    other_args.add_argument(
        '--remove-mutated-invariants',
        help=
        'if set, removes seqs with mutated "invariant" regions from output',
        action="store_true")
    other_args.add_argument(
        '--indel-reversed-seqs',
        help=
        'if set, uses the "indel_reversed_seqs" output of partis instead of "seqs"',
        action="store_true")
    other_args.add_argument(
        '--max-sequences',
        help=
        """if set, downsamples semi-randomly, with preference towards sequences with higher multiplicity
        and order output by partis""",
        type=int)
    other_args.add_argument(
        '--always-include',
        type=lambda x: x.split(','),
        help='comma separated list of ids to keep if --max-sequences is set',
        default=[])
    other_args.add_argument(
        '--paths-relative-to',
        default='.',
        help=
        'files pointed to from metadata.json file will be specified relative to this path'
    )
    other_args.add_argument(
        '--namespace',
        help='namespace to be applied to cluster meta attr names')
    other_args.add_argument('--inferred-naive-name',
                            help='see scons option help')
    # --indel-reversed-seqs
    # --remove-mutated-invariants

    # parse args and decorate with derived values
    args = parser.parse_args()
    # ids given directly on the command line take precedence over the ones read from --unique-ids-file
    args.unique_ids = args.unique_ids or args.unique_ids_file
    # default germline set (discouraged)
    args.germline_sets = os.path.join(
        args.parameter_dir,
        'hmm/germline-sets') if args.parameter_dir else default_germline_sets
    args.glfo = glutils.read_glfo(args.germline_sets, args.locus)

    return args
    # add the intronic v genes to glfo
    for d_gene, refseq in refseqs.items():
        glfo['seqs']['v'][utils.generate_dummy_v(d_gene)] = refseq
        glfo['cyst-positions'][utils.generate_dummy_v(d_gene)] = len(refseq) - 3

    # write a glfo dir with everything
    glutils.write_glfo(outdir + '/germlines/imgt-and-intronic', glfo, debug=True)

    # remove the original v genes, and write a glfo dir with just the intronic ones
    glutils.remove_genes(glfo, [g for g in glfo['seqs']['v'] if 'xDx' not in g], debug=True)
    glutils.write_glfo(outdir + '/germlines/intronic', glfo, debug=True)


# tmpglfo = glutils.read_glfo('tmp-germlines', 'h')
glfo = glutils.read_glfo('data/germlines/human', 'h')

infname = '/fh/fast/matsen_e/data/2016-06-02-katie/VMO_Memory-3/VMO_Memory-3.tsv'
outdir = '/fh/fast/matsen_e/processed-data/partis/2016-06-02-katie'

n_failed, n_v_ok, n_total = 0, 0, 0
introns = {}

with open(infname) as infile:
    reader = csv.DictReader(infile, delimiter='\t')
    iline = 0
    introns = {}
    for line in reader:
        iline += 1
        # if iline > 1000:
        #     break
Beispiel #29
0
    args.names[iname] = args.names[iname].replace('@', ' ')

# if you just pass in one parent directory, we assume <args.names> contains the desired subdirs
if len(args.plotdirs) == 1:
    parentdir = args.plotdirs[0]
    args.plotdirs = [parentdir + '/' + n for n in args.names]

# after the expansion above there has to be exactly one plotdir per name
if len(args.plotdirs) != len(args.names):
    raise Exception('poorly formatted args:\n  %s\n  %s' %
                    (' '.join(args.plotdirs), ' '.join(args.names)))

# make a merged glfo from all the gldirs
# (gldirs that don't exist on disk are silently ignored, so args.glfo stays None if none of them exist)
args.glfo = None
if args.gldirs is not None:
    for gldir in [gd for gd in args.gldirs if os.path.exists(gd)]:
        tmpglfo = glutils.read_glfo(gldir, args.locus)
        if args.glfo is None:  # first existing gldir: nothing to merge with yet
            args.glfo = tmpglfo
        else:
            args.glfo = glutils.get_merged_glfo(args.glfo, tmpglfo)

# figure out if there's subdirs we need to deal with
# the two lists are filled in parallel: entry i of <listof_plotdirlists> goes with entry i of <listof_outdirs>
listof_plotdirlists, listof_outdirs = [], []
firstdir = args.plotdirs[0]  # only the first plotdir is inspected for csvs (and, below, for subdirs)
if len(glob.glob(firstdir +
                 '/*.csv')) > 0:  # add the parent dirs if they've got csvs
    listof_plotdirlists.append(args.plotdirs)
    listof_outdirs.append(args.outdir)
for subdir in [
        d for d in os.listdir(firstdir) if os.path.isdir(firstdir + '/' + d)
]:
#!/usr/bin/env python
import csv
import sys

partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')
import utils
import glutils
from clusterpath import ClusterPath

# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', chain='h')

# Example 1: read an annotation csv, and print the first annotation together with its cdr3
print 'first parse an annotation csv file:'
with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        # convert the csv strings in place, then add the info that isn't written to the file
        utils.process_input_line(line)
        utils.add_implicit_info(glfo, line)
        utils.print_reco_event(glfo['seqs'], line)
        # slice bounds for the cdr3: from the v codon position through three bases past the j codon position
        cdr3_bounds = (line['codon_positions']['v'], line['codon_positions']['j'] + 3)
        print ''
        print '  should match the above:'
        print '    %s naive cdr3' % line['naive_seq'][cdr3_bounds[0] : cdr3_bounds[1]]
        print '    %s mature' % line['indel_reversed_seqs'][0][cdr3_bounds[0] : cdr3_bounds[1]]  # only the first sequence in the line
        print ''
        break  # this is just an example, so stop after the first line in the file

# Example 2: read a partition csv into a ClusterPath
print 'then parse a partition csv file:'
cp = ClusterPath()
cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv')
Beispiel #31
0
current_script_dir = os.path.dirname(os.path.realpath(__file__)).replace('/bin', '/python')
if not os.path.exists(current_script_dir):
    print 'WARNING current script dir %s doesn\'t exist, so python path may not be correctly set' % current_script_dir
sys.path.insert(1, current_script_dir)

import utils
import glutils
import plotting

# ----------------------------------------------------------------------------------------
datadir = 'data/germlines/human'
xtitles = {
    'indels' : 'fraction of positions indel\'d',
    'subs' : 'substitution fraction'
}
glfo = glutils.read_glfo(datadir)
vgenes = glfo['aligned-genes']['v'].keys()
pversions = OrderedDict()
for vg in vgenes:
    pv = utils.primary_version(vg)
    if pv not in pversions:
        pversions[pv] = []
    pversions[pv].append(vg)

# remove primary versions that only have one gene
for pv in pversions:
    if len(pversions[pv]) == 1:
        print 'removing single-gene pv %s' % pv
        del pversions[pv]

# ----------------------------------------------------------------------------------------
def run_test(args):
    print 'seed %d' % args.seed
    label = 'test'  #get_label(existing_genes, new_allele)
    simfname = args.outdir + '/simu-' + label + '.csv'
    outpdir = args.outdir + '/simu-' + label
    plotdir = args.outdir + '/simu-' + label + '-plots'

    # simulate
    if not args.nosim:
        cmd_str = base_cmd + ' simulate --n-sim-events ' + str(args.n_sim_events) + ' --n-leaves ' + str(args.n_leaves) + ' --constant-number-of-leaves --rearrange-from-scratch --outfname ' + simfname
        cmd_str += ' --mutation-multiplier ' + str(args.mut_mult)

        cmd_str += ' --n-procs ' + str(args.n_procs)
        if args.slurm:
            cmd_str += ' --batch-system slurm --subsimproc'

        if args.gen_gset:
            cmd_str += ' --generate-germline-set'
            cmd_str += ' --n-genes-per-region 1:5:3'
            cmd_str += ' --n-alleles-per-gene 2,3:1,2:1,2'
        else:
            simulation_genes = ':'.join(args.sim_v_genes + args.dj_genes)
            sglfo = glutils.read_glfo('data/germlines/human', chain=chain, only_genes=simulation_genes.split(':'))

            added_snp_names = None
            if args.snp_positions is not None:
                snps_to_add = [{'gene' : args.sim_v_genes[ig], 'positions' : args.snp_positions[ig]} for ig in range(len(args.sim_v_genes))]
                added_snp_names = glutils.add_some_snps(snps_to_add, sglfo, debug=True, remove_template_genes=args.remove_template_genes)

            if args.allele_prevalence_freqs is not None:
                if len(args.allele_prevalence_freqs) != len(sglfo['seqs']['v']):
                    raise Exception('--allele-prevalence-freqs not the right length')
                gene_list = sorted(sglfo['seqs']['v']) if added_snp_names is None else list(set(args.sim_v_genes)) + added_snp_names
                prevalence_freqs = {'v' : {g : f for g, f in zip(gene_list, args.allele_prevalence_freqs)}, 'd' : {}, 'j' : {}}
                glutils.write_allele_prevalence_freqs(prevalence_freqs, args.workdir + '/allele-prevalence-freqs.csv')
                cmd_str += ' --allele-prevalence-fname ' + args.workdir + '/allele-prevalence-freqs.csv'

            glutils.write_glfo(args.outdir + '/germlines/simulation', sglfo)
            cmd_str += ' --initial-germline-dir ' + args.outdir + '/germlines/simulation'

        if args.seed is not None:
            cmd_str += ' --seed ' + str(args.seed)
        run(cmd_str)

    # remove any old sw cache files
    sw_cachefiles = glob.glob(outpdir + '/sw-cache-*.csv')
    if len(sw_cachefiles) > 0:
        for cachefname in sw_cachefiles:
            check_call(['rm', '-v', cachefname])
            sw_cache_gldir = cachefname.replace('.csv', '-glfo')
            if os.path.exists(sw_cache_gldir):  # if stuff fails halfway through, you can get one but not the other
                glutils.remove_glfo_files(sw_cache_gldir, chain)
                # os.rmdir(sw_cache_gldir)

    # generate germline set and cache parameters
    cmd_str = base_cmd + ' cache-parameters --infname ' + simfname + ' --only-smith-waterman --debug-allele-finding --always-find-new-alleles --n-max-allele-finding-iterations 2' # --dont-collapse-clones'
    # cmd_str = 'python -m cProfile -s tottime -o prof.out ' + cmd_str
    cmd_str += ' --n-procs ' + str(args.n_procs)
    if args.slurm:
        cmd_str += ' --batch-system slurm'

    if args.gen_gset:
        cmd_str += ' --find-new-alleles'
    else:
        inference_genes = ':'.join(args.inf_v_genes + args.dj_genes)
        iglfo = glutils.read_glfo('data/germlines/human', chain=chain, only_genes=inference_genes.split(':'), debug=True)
        glutils.write_glfo(args.outdir + '/germlines/inference', iglfo)
        cmd_str += ' --initial-germline-dir ' + args.outdir + '/germlines/inference'
        cmd_str += ' --find-new-alleles --dont-remove-unlikely-alleles'  # --new-allele-fname ' + args.outdir + '/new-alleles.fa'
        # cmd_str += ' --n-max-snps 12'

    cmd_str += ' --parameter-dir ' + outpdir
    cmd_str += ' --plotdir ' + plotdir
    if args.seed is not None:
        cmd_str += ' --seed ' + str(args.seed)
    run(cmd_str)
Beispiel #33
0
        for region in [r for r in utils.regions if r in glfo['seqs']]:
            for gene, seq in glfo['seqs'][region].items():
                if utils.ambig_frac(seq) > 0.:
                    if debug:
                        print '   %d ambiguous bases: %s' % (
                            len(seq) * utils.ambig_frac(seq),
                            utils.color_gene(gene))
                    glutils.remove_gene(glfo, gene)

        # glutils.print_glfo(glfo)

        # write final result
        glutils.write_glfo(outdir, glfo, debug=True)


# ----------------------------------------------------------------------------------------
# input fasta for, and output dir of, the (currently commented-out) parsing step below
fname = 'macaque/ramesh-v1/coding.fa'
outdir = 'macaque/ramesh-cleaned'
# parse_ramesh_seqs(read_ramesh_file(fname, outdir), outdir, debug=True)
# sys.exit()

# ----------------------------------------------------------------------------------------
# For each locus: merge the glfo in <outdir> with the reference macaque glfo, and write the result.
# NOTE(review): every locus is written to the same output directory -- presumably write_glfo() keeps the loci in separate files, confirm
for locus in ['igh', 'igk', 'igl']:
    ref_glfo = glutils.read_glfo('data/germlines/macaque', locus, debug=True)
    glfo = glutils.read_glfo(outdir, locus, debug=True)
    merged_glfo = glutils.get_merged_glfo(ref_glfo, glfo, debug=True)
    # glutils.print_glfo(merged_glfo, print_separate_cons_seqs=True)
    glutils.write_glfo('datascripts/meta/crotty-fna/imgt-plus-ramesh',
                       merged_glfo,
                       debug=True)
Beispiel #34
0
def run_test(args):
    print 'seed %d' % args.seed
    label = 'test'  #get_label(existing_genes, new_allele)
    simfname = args.outdir + '/simu-' + label + '.csv'
    outpdir = args.outdir + '/simu-' + label
    plotdir = args.outdir + '/simu-' + label + '-plots'

    # simulate
    if not args.nosim:
        cmd_str = base_cmd + ' simulate --n-sim-events ' + str(
            args.n_sim_events) + ' --n-leaves ' + str(
                args.n_leaves
            ) + ' --rearrange-from-scratch --outfname ' + simfname
        if args.n_leaf_distribution is None:
            cmd_str += ' --constant-number-of-leaves'
        else:
            cmd_str += ' --n-leaf-distribution ' + args.n_leaf_distribution
        if args.mut_mult is not None:
            cmd_str += ' --mutation-multiplier ' + str(args.mut_mult)

        cmd_str += ' --n-procs ' + str(args.n_procs)
        if args.slurm:
            cmd_str += ' --batch-system slurm --subsimproc'

        # figure what genes we're using
        if args.gen_gset:
            cmd_str += ' --generate-germline-set'
            cmd_str += ' --n-genes-per-region 1:5:3'
            cmd_str += ' --n-alleles-per-gene 2,3:1,2:1,2'
        else:
            simulation_genes = ':'.join(args.sim_v_genes + args.dj_genes)
            sglfo = glutils.read_glfo('data/germlines/human',
                                      locus=locus,
                                      only_genes=simulation_genes.split(':'))

            added_snp_names = None
            if args.snp_positions is not None:
                snps_to_add = [{
                    'gene': args.sim_v_genes[ig],
                    'positions': args.snp_positions[ig]
                } for ig in range(len(args.sim_v_genes))]
                added_snp_names = glutils.add_some_snps(
                    snps_to_add,
                    sglfo,
                    debug=True,
                    remove_template_genes=args.remove_template_genes)

            if args.allele_prevalence_freqs is not None:
                if len(args.allele_prevalence_freqs) != len(
                        sglfo['seqs']['v']
                ):  # already checked when parsing args, but, you know...
                    raise Exception(
                        '--allele-prevalence-freqs not the right length')
                gene_list = sorted(
                    sglfo['seqs']['v']) if added_snp_names is None else list(
                        set(args.sim_v_genes)) + added_snp_names
                prevalence_freqs = {
                    'v': {
                        g: f
                        for g, f in zip(gene_list,
                                        args.allele_prevalence_freqs)
                    },
                    'd': {},
                    'j': {}
                }
                glutils.write_allele_prevalence_freqs(
                    prevalence_freqs,
                    args.workdir + '/allele-prevalence-freqs.csv')
                cmd_str += ' --allele-prevalence-fname ' + args.workdir + '/allele-prevalence-freqs.csv'

            print '  simulating with %d v: %s' % (len(
                sglfo['seqs']['v']), ' '.join(
                    [utils.color_gene(g) for g in sglfo['seqs']['v']]))
            glutils.write_glfo(args.outdir + '/germlines/simulation', sglfo)
            cmd_str += ' --initial-germline-dir ' + args.outdir + '/germlines/simulation'

        # run simulation
        if args.seed is not None:
            cmd_str += ' --seed ' + str(args.seed)
        run(cmd_str)

    # remove any old sw cache files
    sw_cachefiles = glob.glob(outpdir + '/sw-cache-*.csv')
    if len(sw_cachefiles) > 0:
        for cachefname in sw_cachefiles:
            check_call(['rm', '-v', cachefname])
            sw_cache_gldir = cachefname.replace('.csv', '-glfo')
            if os.path.exists(
                    sw_cache_gldir
            ):  # if stuff fails halfway through, you can get one but not the other
                glutils.remove_glfo_files(sw_cache_gldir, locus)
                # os.rmdir(sw_cache_gldir)

    # generate germline set and cache parameters
    cmd_str = base_cmd + ' cache-parameters --infname ' + simfname + ' --only-smith-waterman --debug-allele-finding --always-find-new-alleles --n-max-allele-finding-iterations 2'  # --dont-collapse-clones'
    # cmd_str = 'python -m cProfile -s tottime -o prof.out ' + cmd_str
    cmd_str += ' --n-procs ' + str(args.n_procs)
    if args.n_max_queries is not None:
        cmd_str += ' --n-max-queries ' + str(
            args.n_max_queries
        )  # NOTE do *not* use --n-random-queries, since it'll change the cluster size distribution
    if args.slurm:
        cmd_str += ' --batch-system slurm'

    cmd_str += ' --find-new-alleles'

    if args.gen_gset:
        pass  # i.e. uses default (full) germline dir
    else:
        cmd_str += ' --dont-remove-unlikely-alleles'  # --new-allele-fname ' + args.outdir + '/new-alleles.fa'
        inference_genes = ':'.join(args.inf_v_genes + args.dj_genes)
        iglfo = glutils.read_glfo('data/germlines/human',
                                  locus=locus,
                                  only_genes=inference_genes.split(':'))
        print '  starting inference with %d v: %s' % (len(
            iglfo['seqs']['v']), ' '.join(
                [utils.color_gene(g) for g in iglfo['seqs']['v']]))
        glutils.write_glfo(args.outdir + '/germlines/inference', iglfo)
        cmd_str += ' --initial-germline-dir ' + args.outdir + '/germlines/inference'
        # cmd_str += ' --n-max-snps 12'

    cmd_str += ' --parameter-dir ' + outpdir
    cmd_str += ' --only-overall-plots --plotdir ' + plotdir
    if args.seed is not None:
        cmd_str += ' --seed ' + str(args.seed)
    run(cmd_str)
Beispiel #35
0
    help=
    'ignore clusters with a cdr3 that differs by more than this many nucleotides'
)
args = parser.parse_args()

# turn the command line strings into lists
args.infiles = utils.get_arg_list(args.infiles)
args.labels = utils.get_arg_list(args.labels)
args.parameter_dirs = utils.get_arg_list(args.parameter_dirs)
assert len(args.infiles) == len(args.labels)  # need one label per input file
# a single parameter dir is shared by all the input files (it's repeated once per label)
if len(args.parameter_dirs) == 1:
    print '  note: using same glfo for all infiles'
    args.parameter_dirs = [args.parameter_dirs[0] for _ in args.labels]
assert len(args.parameter_dirs) == len(args.labels)

# one glfo per parameter dir, in the same order as <args.parameter_dirs> (and so, given the asserts above, as <args.labels>)
glfos = [
    glutils.read_glfo(pdir + '/hmm/germline-sets', locus=args.locus)
    for pdir in args.parameter_dirs
]


# ----------------------------------------------------------------------------------------
def getkey(uid_list):
    """Return the colon-joined string made from the ids in <uid_list>."""
    separator = ':'
    key = separator.join(uid_list)
    return key


# ----------------------------------------------------------------------------------------
def read_annotations(fname, glfo):
    annotations = {}
    with open(fname.replace('.csv', '-cluster-annotations.csv')) as csvfile:
        reader = csv.DictReader(csvfile)
        for line in reader:  # there's a line for each cluster
Beispiel #36
0
args.colors = utils.get_arg_list(args.colors)
args.linewidths = utils.get_arg_list(args.linewidths)
# '@' stands in for a space in the names given on the command line
for iname, name in enumerate(args.names):
    args.names[iname] = name.replace('@', ' ')

# a single plotdir is taken to be the parent directory, with one subdir for each entry in <args.names>
if len(args.plotdirs) == 1:
    parentdir = args.plotdirs[0]
    args.plotdirs = ['/'.join([parentdir, n]) for n in args.names]

# by now there has to be exactly one plotdir per name
n_plotdirs, n_names = len(args.plotdirs), len(args.names)
if n_plotdirs != n_names:
    raise Exception('poorly formatted args:\n  %s\n  %s' % (' '.join(args.plotdirs), ' '.join(args.names)))

# germline info is only read if the germline dir actually exists
args.glfo = glutils.read_glfo(args.gldir, args.chain) if os.path.exists(args.gldir) else None

# Work out which groups of directories to compare. The two lists are filled in parallel:
# each list of plotdirs goes with the output dir at the same index.
listof_plotdirlists, listof_outdirs = [], []
firstdir = args.plotdirs[0]
if glob.glob(firstdir + '/*.csv'):  # the top-level dirs themselves, if they've got csvs
    listof_plotdirlists.append(args.plotdirs)
    listof_outdirs.append(args.outdir)
for subdir in os.listdir(firstdir):  # plus every subdirectory of the first plotdir
    if not os.path.isdir(firstdir + '/' + subdir):
        continue
    listof_plotdirlists.append([d + '/' + subdir for d in args.plotdirs])
    listof_outdirs.append(args.outdir + '/' + subdir)

for dlist, outdir in zip(listof_plotdirlists, listof_outdirs):
    compare_directories(args, dlist, outdir)
#!/usr/bin/env python

# Script to process the extras.csv files from partis' germline directories
# Assumes the fasta files (e.g. ighv.fa) have been deduplicated

import sys

partis_dir = "/home/bolson2/Software/partis"
sys.path.insert(1, partis_dir + '/python')

import glutils

igb_path = "/home/bolson2/Software/igblast"
igb_database_path = igb_path + "/bin_deduplicated"

# Read the germline info for each locus from the deduplicated igblast directory,
# and write it back out (with glutils) to the 'partis_friendly_bin' directory.
for locus in ('igh', 'igk', 'igl'):
    glfo = glutils.read_glfo(igb_database_path, locus=locus, debug=True)
    glutils.write_glfo(igb_path + '/partis_friendly_bin', glfo, debug=True)
Beispiel #38
0
def extract_seqs(fnam, uid2iso):
    '''Read a partis cluster-annotations file and extract relevant information and sequences.

    Each row of the csv is treated as one clonal family. A row is skipped if it cannot be processed, is
    flagged invalid, has a cdr3 whose length is not a multiple of three, has a naive sequence that
    repair_seq() rejects, or is left with fewer than MIN_OBS sequences after the quality filter.

    Args:
        fnam: path to the cluster annotation csv. The part of the name before '_partitions' locates the
            germline directory ('<cwd>/_output/<base>/hmm/germline-sets'), and an 'IgK' or 'IgL' in it
            selects the 'igk' or 'igl' locus (otherwise 'igh' is used).
        uid2iso: mapping from sequence id to isotype; it needs an entry for every id that is kept.

    Returns:
        A tuple (sequences_i, info_i) of two lists with one entry per kept clonal family: sequences_i
        holds ['naive_seq', <naive amino acid sequence>] pairs, and info_i holds a dict with the genes,
        the naive sequence, the cdr3 bounds, and the per-sequence lists.
    '''
    # Read cluster annotations into a data list of dictionaries:
    with open(fnam) as fh:
        data = list(csv.DictReader(fh))

    # The locus and the germline directory depend only on the file name, so work them out once:
    fnam_base = fnam.split('_partitions')[0]
    if 'IgK' in fnam_base:
        locus = 'igk'
    elif 'IgL' in fnam_base:
        locus = 'igl'
    else:
        locus = 'igh'
    gldir = '{}/_output/{}/hmm/germline-sets'.format(os.getcwd(), fnam_base)

    sequences_i = list()
    info_i = list()
    for row in data:
        # Process the partis data row and add germline information:
        try:
            utils.process_input_line(row)
            # Read default germline info
            # NOTE(review): this is re-read for every row; it could move out of the loop if add_implicit_info() leaves glfo untouched -- confirm
            glfo = glutils.read_glfo(gldir, locus=locus)
            utils.add_implicit_info(glfo, row)
        except Exception as e:  # Skip rows that cannot be processed
            print('First skip')
            print(e)
            continue

        # One list of ids per sequence: its duplicates (if there are any) followed by its own id
        uids = [dl + [u] if (len(dl) > 0 and dl[0] != '') else [u] for dl, u in zip(row['duplicates'], row['unique_ids'])]

        # Extract the full N padded naive sequence,
        # and find the v -and j gene bound on this naive sequence:
        cdr3_bounds = (row['codon_positions']['v'], row['codon_positions']['j'] + 3)
        vj_bounds = (row['regional_bounds']['v'][0], row['regional_bounds']['j'][1])
        if row['invalid'] is True or (cdr3_bounds[0]-cdr3_bounds[1])%3 != 0:
            print('Invalid clonal family, skipping.')
            continue

        # There has been a name change and this try/except is meant to provide backwards compatibility.
        # It has to come before the first use of the sequences (the family size check just below),
        # otherwise files that use the old name crash there with a KeyError.
        try:
            lseq = row['input_seqs'][:]
        except KeyError:
            lseq = row['seqs'][:]

        naiveDNA = row['naive_seq']
        if repair_seq(naiveDNA, naiveDNA, vj_bounds, keep_check=True) is False:  # Skip naive sequences too short or with stop codons:
            if len(lseq) > 100:  # only make noise about it for large families
                print('Bad naive even after 100 seqs in clonal family.')
                repair_seq_debug(naiveDNA, naiveDNA, vj_bounds)
            continue
        trimmed_naiveDNA = repair_seq(naiveDNA[:], naiveDNA[:], vj_bounds)
        naiveAA = str(Seq(trimmed_naiveDNA, generic_dna).translate())

        ir_lseq = row['indel_reversed_seqs']
        stop_seq = row['stops']
        assert(len(lseq) == len(ir_lseq))
        assert(len(lseq) == len(stop_seq))
        # Only keep sequences that pass all three of these:
        ### ir_lseq[i] == '' or lseq[i] == ir_lseq[i]  <-- No indels
        ### stop_seq[i] is False  <-- No partis annotated stops (there still seem to be stops after these are removed though)
        ### repair_seq(lseq[i], naiveDNA, vj_bounds, keep_check=True)  <-- Checks whether the sequence is long enough and has no stop codons
        keep_idx = [(ir_lseq[i] == '' or lseq[i] == ir_lseq[i])
                    and stop_seq[i] is False
                    and bool(repair_seq(lseq[i], naiveDNA, vj_bounds, keep_check=True))
                    for i in range(len(lseq))]

        # Now only keep those sequences that passed QC:
        lseq = [s for s, keep in zip(lseq, keep_idx) if keep]
        # Exclude small clonal families:
        if len(lseq) < MIN_OBS:
            continue
        # Get amino acid sequences:
        lAAseq = [str(Seq(repair_seq(s[:], naiveDNA[:], vj_bounds), generic_dna).translate()) for s in lseq]
        # Throw out the per-sequence info for the discarded sequences (abundance is the number of ids, including duplicates):
        Nmuts = [int(s) for s, keep in zip(row['n_mutations'].split(':'), keep_idx) if keep]
        abundance = [len(d) for d, keep in zip(uids, keep_idx) if keep]
        uids = [s for s, keep in zip(uids, keep_idx) if keep]
        assert(len(Nmuts) == len(lseq))
        assert(len(abundance) == len(lseq))
        assert(len(uids) == len(lseq))

        # NOTE: an amino-acid-level deduplication step used to sit here as a disabled (string-quoted) block;
        # it never executed, so it has been removed.

        iso_list = [[uid2iso[u] for u in ul] for ul in uids]
        # Store the results in a list:
        sequences_i.append(['naive_seq', naiveAA])  # This format is for ANARCI numbering
        info_i.append({'fnam': fnam, 'v_gene': row['v_gene'], 'd_gene': row['d_gene'], 'j_gene': row['j_gene'],
                       'naive_seq': naiveAA, 'naive_seq_DNA': trimmed_naiveDNA, 'Nmuts': Nmuts[:], 'abundance': abundance[:],
                       'AAseqs': lAAseq[:], 'DNAseqs': lseq[:], 'UID': uids[:], 'isotype': iso_list[:],
                       'CDR3_start': cdr3_bounds[0], 'CDR3_end': cdr3_bounds[1]})

    return sequences_i, info_i