def parse_ramesh_seqs(glseqs, outdir, debug=False):
    """
    Write the sequences in <glseqs> to <outdir> as germline info, then clean them up.

    <glseqs> is indexed as glseqs[locus][region][gene] = seq. For each locus we: write bare fasta files, read them
    back using the macaque germline set as a template, call trim_and_remove_genes() on every v gene, remove any
    gene that has ambiguous bases, and write the resulting glfo back to <outdir>.
    """
    for locus in glseqs:
        glutils.remove_glfo_files(outdir, locus)

        # write to a glfo dir without extra info
        for region in glseqs[locus]:
            fasta_path = glutils.get_fname(outdir, locus, region)
            if not os.path.exists(os.path.dirname(fasta_path)):
                os.makedirs(os.path.dirname(fasta_path))
            with open(fasta_path, 'w') as ofile:
                for gene, seq in glseqs[locus][region].items():
                    ofile.write('>%s\n%s\n' % (gene, seq))

        # figure out extra info
        template_glfo = glutils.read_glfo('data/germlines/macaque', locus)
        glfo = glutils.read_glfo(outdir, locus, template_glfo=template_glfo, remove_bad_genes=True, debug=True)

        # trim non-coding stuff upstream of v (and remove non-full-length ones)
        gene_groups = {}
        for region in ['v']:
            by_family = {}  # gene family label --> {gene : seq}
            for g in glfo['seqs'][region]:
                by_family.setdefault(utils.gene_family(g), {})[g] = glfo['seqs'][region][g]
            gene_groups[region] = [(glabel, by_family[glabel]) for glabel in sorted(by_family)]
        for region in utils.regions:
            if region not in gene_groups:
                continue
            if debug:
                print('%s' % utils.color('reverse_video', utils.color('green', region)))
            for group_label, group_seqs in gene_groups[region]:  # ok, this isn't really doing anything any more
                if debug:
                    print(' %s' % utils.color('blue', group_label))
                for gene, seq in group_seqs.items():
                    trim_and_remove_genes(region, gene, seq, glfo, template_glfo, debug=debug)

        # remove any seqs with ambiguous bases
        regions_in_glfo = [r for r in utils.regions if r in glfo['seqs']]
        for region in regions_in_glfo:
            for gene, seq in list(glfo['seqs'][region].items()):  # loop over a snapshot, since we remove genes as we go
                if not utils.ambig_frac(seq) > 0.:
                    continue
                if debug:
                    print(' %d ambiguous bases: %s' % (len(seq) * utils.ambig_frac(seq), utils.color_gene(gene)))
                glutils.remove_gene(glfo, gene)

        # glutils.print_glfo(glfo)

        # write final result
        glutils.write_glfo(outdir, glfo, debug=True)
def simulate(args):
    """
    Assemble the partis 'simulate' command line from <args>, write the germline set to simulate from, and run it.

    Returns immediately if utils.output_exists() says <args.simfname> is already there.
    """
    if utils.output_exists(args, args.simfname):
        return

    cmd_str = args.partis_path + ' simulate --n-sim-events ' + str(args.n_sim_events) + ' --outfname ' + args.simfname + ' --n-leaves ' + str(args.n_leaves)
    cmd_str += ' --rearrange-from-scratch --shm-parameter-dir ' + partis_dir + '/data/recombinator/scratch-parameters'
    cmd_str += ' --constant-number-of-leaves' if args.n_leaf_distribution is None else ' --n-leaf-distribution ' + args.n_leaf_distribution
    if args.mut_mult is not None:
        cmd_str += ' --mutation-multiplier ' + str(args.mut_mult)
    if args.root_mrca_weibull_parameter is not None:
        cmd_str += ' --root-mrca-weibull-parameter ' + str(args.root_mrca_weibull_parameter)
    cmd_str += ' --n-procs ' + str(args.n_procs)
    if args.slurm:
        cmd_str += ' --batch-system slurm --subsimproc'

    prevalence_fname = args.workdir + '/allele-prevalence-freqs.csv'
    prevalence_opt = ' --allele-prevalence-fname ' + prevalence_fname

    # figure what genes we're using
    if args.gls_gen:
        assert args.sim_v_genes is None and args.allele_prevalence_freqs is None

        sim_glfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus)
        glutils.remove_v_genes_with_bad_cysteines(sim_glfo)
        glutils.generate_germline_set(sim_glfo, args.n_genes_per_region, args.n_sim_alleles_per_gene, args.min_allele_prevalence_freq, prevalence_fname,
                                      new_allele_info=args.new_allele_info, dont_remove_template_genes=args.dont_remove_template_genes, debug=True)
        cmd_str += prevalence_opt
    else:
        sim_glfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus, only_genes=(args.sim_v_genes + args.dj_genes))
        # NOTE template gene removal is the default for glutils.generate_germline_set
        new_names = glutils.generate_new_alleles(sim_glfo, args.new_allele_info, debug=True, remove_template_genes=(not args.dont_remove_template_genes))

        freqs = args.allele_prevalence_freqs
        if freqs is not None:
            if not utils.is_normed(freqs):
                raise Exception('--allele-prevalence-freqs %s not normalized' % freqs)
            n_v_genes = len(sim_glfo['seqs']['v'])
            if len(freqs) != n_v_genes:  # already checked when parsing args, but, you know...
                raise Exception('--allele-prevalence-freqs %d not the same length as sglfo %d' % (len(freqs), n_v_genes))
            if len(new_names) == 0:
                gene_list = sorted(sim_glfo['seqs']['v'])
            else:
                gene_list = list(set(args.sim_v_genes)) + new_names
            prevalence_freqs = {'v' : dict(zip(gene_list, freqs)), 'd' : {}, 'j' : {}}
            glutils.write_allele_prevalence_freqs(prevalence_freqs, prevalence_fname)
            cmd_str += prevalence_opt

    sim_gldir = args.outdir + '/germlines/simulation'
    glutils.write_glfo(sim_gldir, sim_glfo)
    cmd_str += ' --initial-germline-dir ' + sim_gldir
    # glutils.print_glfo(sim_glfo)

    # run simulation
    if args.seed is not None:
        cmd_str += ' --seed ' + str(args.seed)
    utils.simplerun(cmd_str, dryrun=args.dry_run)
def run_test(simulation_v_genes, inference_v_genes, dj_genes, seed=None):
    """
    Simulate with one set of v genes (plus a few added snps), then run new-allele finding starting from another.

    <simulation_v_genes>, <inference_v_genes> and <dj_genes> are colon-separated gene name strings. Uses the
    module-level <outdir>, <chain>, <base_cmd> and run().
    """
    if seed is not None:
        random.seed(seed)
    seed_opt = '' if seed is None else ' --seed ' + str(seed)

    label = 'test'  # get_label(existing_genes, new_allele)
    simfname = outdir + '/simu-' + label + '.csv'
    outpdir = outdir + '/simu-' + label
    www_dir = os.getenv('www')
    if www_dir is None:
        plotdir = '_www/partis/allele-finding/' + label
    else:
        plotdir = www_dir + '/partis/allele-finding/' + label

    snps_to_add = [
        {'gene' : 'IGHV3-71*01', 'positions' : (35, )},
        {'gene' : 'IGHV3-71*01', 'positions' : (35, 50)},
        # {'gene' : 'IGHV3-71*01', 'positions' : (35, 45, 20, 50, 77)},
        # {'gene' : 'IGHV3-71*01', 'positions' : (35, 60, 50)},
        # {'gene' : 'IGHV1-18*01', 'positions' : (100, 101)},
        # {'gene' : 'IGHV1-18*01', 'positions' : (20, )}
    ]

    # germline set to simulate from: the requested genes, plus the new snp'd alleles
    sim_gldir = outdir + '/germlines-for-simulation'
    sim_gene_list = (simulation_v_genes + ':' + dj_genes).split(':')
    sglfo = glutils.read_glfo('data/imgt', chain=chain, only_genes=sim_gene_list, debug=True)
    glutils.add_some_snps(snps_to_add, sglfo, remove_template_genes=False, debug=True)
    glutils.write_glfo(sim_gldir, sglfo)

    # simulate
    cmd_str = base_cmd + ' simulate --n-sim-events 1000 --n-procs 10 --simulate-partially-from-scratch --mutation-multiplier 0.5'
    cmd_str += ' --initial-datadir ' + sim_gldir
    cmd_str += ' --outfname ' + simfname
    cmd_str += seed_opt
    run(cmd_str)

    # germline set to start inference from
    inf_gldir = outdir + '/germlines-for-inference'
    inf_gene_list = (inference_v_genes + ':' + dj_genes).split(':')
    iglfo = glutils.read_glfo('data/imgt', chain=chain, only_genes=inf_gene_list, debug=True)
    glutils.write_glfo(inf_gldir, iglfo)

    # generate germline set and cache parameters
    cmd_str = base_cmd + ' cache-parameters --infname ' + simfname + ' --n-procs 10 --only-smith-waterman'
    cmd_str += ' --find-new-alleles --new-allele-fname ' + outdir + '/new-alleles.fa'
    # cmd_str += ' --generate-germline-set'
    cmd_str += ' --debug-new-allele-finding'
    cmd_str += ' --initial-datadir ' + inf_gldir
    cmd_str += ' --parameter-dir ' + outpdir
    cmd_str += ' --plotdir ' + plotdir
    cmd_str += seed_opt
    run(cmd_str)
def read_partis_output(partition_file, glfo_dir=None, locus=None):
    """
    Read <partition_file> with utils.read_output() and return (glfo, annotation_list, cpath).

    For anything other than a .yaml file we first read germline info from <glfo_dir> (or the module-level
    <default_glfo_dir> if <glfo_dir> isn't set) and pass it in.
    """
    if utils.getsuffix(partition_file) == ".yaml":
        initial_glfo = None
    else:
        initial_glfo = glutils.read_glfo(glfo_dir or default_glfo_dir, locus)

    # returns glfo from the file if it's there, otherwise it returns the one we passed in
    glfo, annotation_list, cpath = utils.read_output(partition_file, glfo=initial_glfo)
    return glfo, annotation_list, cpath
def __init__(self, args, seed, sublabel=None):
    """
    Set up work/output locations, read germline and parameter info, and generate the trees.

    <args>: parsed command line arguments (kept as self.args)
    <seed>: passed on to the tree generator
    <sublabel>: if set, this instance gets its own workdir and writes its output file there

    Raises Exception if the parameter directory is unset or doesn't exist.
    """
    self.args = args

    if sublabel is None:
        self.workdir = self.args.workdir + '/recombinator'
        self.outfname = self.args.outfname
    else:  # need a separate workdir for each subprocess
        self.workdir = self.args.workdir + '/recombinator-' + sublabel
        self.outfname = self.workdir + '/' + os.path.basename(self.args.outfname)

    utils.prep_dir(self.workdir)

    if not self.args.simulate_partially_from_scratch:
        parameter_dir = self.args.parameter_dir
    else:  # we start from scratch, except for the mute freq stuff
        parameter_dir = self.args.scratch_mute_freq_dir

    if parameter_dir is None or not os.path.exists(parameter_dir):
        # use string formatting rather than '+', since <parameter_dir> can be None here (which would give a TypeError instead of this message)
        raise Exception('parameter dir %s d.n.e' % parameter_dir)

    self.index_keys = {}  # this is kind of hackey, but I suspect indexing my huge table of freqs with a tuple is better than a dict
    self.mute_models = {}
    for region in utils.regions:
        self.mute_models[region] = {}
        for model in ['gtr', 'gamma']:
            self.mute_models[region][model] = {}

    self.glfo = glutils.read_glfo(self.args.initial_datadir, self.args.chain, only_genes=self.args.only_genes)

    self.allowed_genes = self.get_allowed_genes(parameter_dir)  # set of genes a) for which we read per-position mutation information and b) from which we choose when running partially from scratch
    self.version_freq_table = self.read_vdj_version_freqs(parameter_dir)  # list of the probabilities with which each VDJ combo (plus other rearrangement parameters) appears in data
    self.insertion_content_probs = self.read_insertion_content(parameter_dir)
    self.all_mute_freqs = {}
    self.parameter_dir = parameter_dir  # damnit, I guess I do need to save this in self

    # read shm info NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
    with opener('r')(self.args.gtrfname) as gtrfile:  # read gtr parameters
        reader = csv.DictReader(gtrfile)
        for line in reader:
            # 'parameter' column is split on '.': the fourth character of the first piece is the region, then model, then parameter name
            parameters = line['parameter'].split('.')
            region = parameters[0][3].lower()
            assert region == 'v' or region == 'd' or region == 'j'
            model = parameters[1].lower()
            parameter_name = parameters[2]
            assert model in self.mute_models[region]
            self.mute_models[region][model][parameter_name] = line['value']

    treegen = treegenerator.TreeGenerator(args, parameter_dir, seed=seed)
    self.treefname = self.workdir + '/trees.tre'
    treegen.generate_trees(seed, self.treefname)
    with opener('r')(self.treefname) as treefile:  # read in the trees (and other info) that we just generated
        self.treeinfo = treefile.readlines()  # list of newick-formatted tree strings with region-specific branch info tacked at the end
    os.remove(self.treefname)

    # start from a clean slate: remove any old output file, or else make sure its directory exists
    if os.path.exists(self.outfname):
        os.remove(self.outfname)
    elif not os.path.exists(os.path.dirname(os.path.abspath(self.outfname))):
        os.makedirs(os.path.dirname(os.path.abspath(self.outfname)))
def get_single_performance(region, outdir, method, debug=False):
    """
    Compare the simulated germline set under <outdir> to the one inferred by <method> for <region>.

    Returns a dict with the number of 'missing' alleles (simulated but not inferred), 'spurious' alleles (inferred
    but not simulated), and 'total' (simulated genes with a '+' in their name).
    """
    sglfo = glutils.read_glfo(outdir + '/germlines/simulation', locus=args.locus)
    iglfo = glutils.read_glfo(outdir + '/' + method + '/sw/germline-sets', locus=args.locus)
    glutils.synchronize_glfos(ref_glfo=sglfo, new_glfo=iglfo, region=region)

    simulated, inferred = set(sglfo['seqs'][region]), set(iglfo['seqs'][region])
    missing_alleles = simulated - inferred
    spurious_alleles = inferred - simulated

    if debug:
        if len(missing_alleles) > 0:
            print(' %2d missing %s' % (len(missing_alleles), ' '.join([utils.color_gene(g) for g in missing_alleles])))
        if len(spurious_alleles) > 0:
            print(' %2d spurious %s' % (len(spurious_alleles), ' '.join([utils.color_gene(g) for g in spurious_alleles])))
        if len(missing_alleles) == 0 and len(spurious_alleles) == 0:
            print(' none missing')

    n_new_alleles = len([g for g in sglfo['seqs'][region] if '+' in g])  # anybody with a '+' should be a new allele
    return {
        'missing' : len(missing_alleles),
        'spurious' : len(spurious_alleles),
        'total' : n_new_alleles,
    }
def main():
    """
    Command line entry point: annotate one BCR sequence and write the resulting profile to a csv.

    Sets the module-level <args> and <glfo>, which the helper functions called below presumably read.
    """
    import argparse
    parser = argparse.ArgumentParser(description='Annotate BCR sequence for SPURF.',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_specs = [
        ('--sequence', dict(type=str, required=True, help='Sequence for annotation.')),
        ('--outfile', dict(type=str, default='out.csv', help='Output csv filename.')),
        ('--SIM_SIZE', dict(type=int, required=False, default=10000, help='Number of random draws to simulate the neutral profile.')),
        ('--LOCUS', dict(type=str, required=False, default='igh', help='Locus, either igh, igk or igl.')),
        ('--SPECIES', dict(type=str, required=False, default='human', help='Species, either human.')),
    ]
    for flag, spec in arg_specs:
        parser.add_argument(flag, **spec)
    global args
    args = parser.parse_args()

    # S5F model files
    mutability = PATH2FILE + '/S5F/Mutability.csv'
    substitution = PATH2FILE + '/S5F/Substitution.csv'

    # Read default germline info:
    global glfo
    glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus=args.LOCUS)

    # Annotate with partis, then translate both the naive and the input sequence:
    naive, fixed_input_seq, VDJ = run_partis(args.sequence)
    naive_aa = str(Seq(naive, generic_dna).translate())
    input_aa = str(Seq(fixed_input_seq, generic_dna).translate())

    # AHo annotate on the naive amino acid sequence:
    AHo_naive, numb_profile = AHo_annotate_naive(naive_aa)
    # Use the AHo annotation to make a profile over the input sequence:
    AHo_input = AHo_annotate_input(input_aa, numb_profile)

    # Simulate a profile under a neutral substitution process:
    n_muts = hamming_dist(naive, fixed_input_seq)
    sim_profile = simulate_profile([n_muts], naive, numb_profile, mutability, substitution)

    write_dataframe(make_dataframe(AHo_input, AHo_naive, sim_profile, VDJ), args.outfile)
def _generate_germline_set(self, n_genes_per_region="20:1:1", n_sim_alleles_per_gene="1.5:1:1", min_sim_allele_prevalence_freq=0.1):
    """
    Call partis's germline set simulation function and write to files

    Returns (germline_seqs, germline_freqs): allele name --> sequence read back from <self.ig_file>, and
    allele name --> prevalence read back from <self.allele_freq_file> (V alleles of this locus only).
    """
    PARTIS_PATH = './partis'
    sys.path.insert(1, PARTIS_PATH + '/python')
    import glutils

    glfo = glutils.read_glfo(self.GERMLINE_FOLDER + "/" + self.organism, self.locus)
    glutils.generate_germline_set(
        glfo,
        n_genes_per_region,
        n_sim_alleles_per_gene,
        min_sim_allele_prevalence_freq,
        self.allele_freq_file,
        debug=self.debug,
    )
    glutils.write_glfo(self.output_dir, glfo)

    # Read allele prevalences
    v_prefix = self.locus.upper() + "V"
    germline_freqs = {}
    with open(self.allele_freq_file, "r") as freq_file:
        allele_reader = csv.reader(freq_file)
        next(allele_reader)  # skip the first row
        for row in allele_reader:
            if row[0].startswith(v_prefix):
                germline_freqs[row[0]] = float(row[1])

    # Read the selected germline alleles
    germline_seqs = {}
    with open(self.ig_file, "r") as seq_file:
        lines = seq_file.read().splitlines()
    # lines alternate between name and sequence (a trailing unpaired line is ignored)
    for name_line, allele_seq in zip(lines[0::2], lines[1::2]):
        allele = name_line.replace(">", "")

        # Trim allele until multiple of 3 - randomly pick a reading frame
        n_extra = len(allele_seq) % 3
        if n_extra != 0:
            offset = np.random.choice(n_extra + 1)  # number of bases to drop from the start, the rest come off the end
            allele_seq = allele_seq[offset:len(allele_seq) - (n_extra - offset)]

        # Make sure no N in the germline sequence
        while "N" in allele_seq:
            allele_seq = allele_seq.replace("N", np.random.choice(list(NUCLEOTIDE_SET)), 1)
        germline_seqs[allele] = allele_seq

    return germline_seqs, germline_freqs
def write_inf_glfo(args):
    """
    Read default glfo, restrict it to the specified alleles, and write to somewhere where all the methods can read it.

    NOTE this dir should *not* be modified by any of the methods
    """
    allowed_genes = args.inf_v_genes + args.dj_genes
    inf_glfo = glutils.read_glfo('data/germlines/human', locus=args.locus, only_genes=allowed_genes)
    v_gene_str = ' '.join([utils.color_gene(g) for g in inf_glfo['seqs']['v']])
    print(' writing initial inference glfo with %d v: %s' % (len(inf_glfo['seqs']['v']), v_gene_str))
    glutils.write_glfo(args.inf_glfo_dir, inf_glfo)
def run_tigger(infname, outfname, outdir):
    """
    Run tigger (in R) on the annotations in <infname>, then convert its fasta output into a germline dir.

    <infname>: tab-separated annotation file that R reads with read.csv()
    <outfname>: final germline fasta; has to be the path that glutils.get_fname() gives for the output germline dir
    <outdir>: where tigger writes its own fasta

    Returns immediately if utils.output_exists() says <outfname> is already there.
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    rcmds = ['library(tigger)', 'library(dplyr)']
    # rcmds += ['data(sample_db, germline_ighv)']

    db_name = 'annotations'
    gls_name = 'gls'
    rcmds += ['%s = read.csv("%s", sep="\t")' % (db_name, infname)]
    rcmds += ['%s = readIgFasta("%s")' % (gls_name, get_glfname('v', aligned=True))]

    tigger_outfname = outdir + '/tigger.fasta'
    rcmds += ['novel_df = findNovelAlleles(%s, %s, germline_min=2, nproc=%d)' % (db_name, gls_name, args.n_procs)]
    rcmds += ['geno = inferGenotype(%s, find_unmutated = FALSE, germline_db = %s, novel_df = novel_df)' % (db_name, gls_name)]
    rcmds += ['genotype_seqs = genotypeFasta(geno, %s, novel_df)' % (gls_name)]
    rcmds += ['writeFasta(genotype_seqs, "%s")' % tigger_outfname]
    cmdfname = args.workdir + '/tigger-in.cmd'
    with open(cmdfname, 'w') as cmdfile:
        cmdfile.write('\n'.join(rcmds) + '\n')
    cmdstr = 'R --slave -f ' + cmdfname
    utils.simplerun(cmdstr, shell=True, print_time='tigger')

    # post-process tigger .fa
    gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.read_glfo(gldir, args.locus)
    tigger_alleles = set()
    for seqfo in utils.read_fastx(tigger_outfname):
        seq = seqfo['seq'].replace(utils.gap_chars[0], '')  # it should be just dots...
        tigger_alleles.add(seqfo['name'])
        if seqfo['name'] not in glfo['seqs'][args.region]:
            newfo = {'gene' : seqfo['name'], 'seq' : seq}
            use_template_for_codon_info = False
            if '+' in newfo['gene']:
                newfo['template-gene'] = newfo['gene'].split('+')[0]
                use_template_for_codon_info = True
            glutils.add_new_allele(glfo, newfo, use_template_for_codon_info=use_template_for_codon_info, debug=True)
        elif glfo['seqs'][args.region][seqfo['name']] != seq:
            print('%s different sequences in glfo and tigger output for %s:\n %s\n %s' % (utils.color('red', 'error'), seqfo['name'], glfo['seqs'][args.region][seqfo['name']], seqfo['seq']))

    # remove them afterwards so we can use existing ones to get codon info
    # NOTE loop over a snapshot of the names: remove_gene() presumably deletes from this same mapping, and it isn't safe to change a dict's size while iterating over it
    for gene in list(glfo['seqs'][args.region]):
        if gene not in tigger_alleles:
            glutils.remove_gene(glfo, gene)

    # strip the trailing '/<locus>' subdir to get the germline dir
    # NOTE str.rstrip() removes a *set of characters*, not a suffix, so it would also eat the end of a parent dir name made of those characters (e.g. '.../high/igh')
    out_gldir = os.path.dirname(outfname)
    locus_subdir = '/' + args.locus
    if out_gldir.endswith(locus_subdir):
        out_gldir = out_gldir[:-len(locus_subdir)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo)

    os.remove(cmdfname)
def run_vsearch(seqfos):
    """
    Run vsearch to see if you can get a match for each locus for every sequence.

    Modifies <seqfos> in place: the (partial) annotation for each locus is added to each seqfo under that locus's
    key, and if --reverse-negative-strands is set and use_rev_comp() prefers the reverse match, the seqfo's 'seq'
    is replaced with the one from <revfos>.
    NOTE(review): <revfos> isn't defined in this function, so it presumably has to exist at module level -- confirm.
    """
    print(' running vsearch on %d sequences:' % len(seqfos))
    check_rev = args.reverse_negative_strands
    n_rev_compd, n_total = 0, 0
    for locus in utils.sub_loci(args.ig_or_tr):
        lglfo = glutils.read_glfo(args.germline_dir, locus)
        vs_workdir = args.workdir + '/vsearch'
        shared_kwargs = dict(glfo=lglfo, print_time=True, vsearch_binary=args.vsearch_binary, get_annotations=True, expect_failure=True)

        if check_rev:
            fwd_str = ' %s fwd:' % utils.color('blue', locus)
        else:
            fwd_str = ' %s: ' % locus
        annotations = utils.run_vsearch_with_duplicate_uids('search', seqfos, vs_workdir, args.vsearch_threshold, extra_str=fwd_str, **shared_kwargs)
        assert len(annotations) == len(seqfos)

        if check_rev:  # it might be nicer to user vsearch options to run on both senses at once, but otoh this might be nicer
            revnotations = utils.run_vsearch_with_duplicate_uids('search', revfos, vs_workdir, args.vsearch_threshold, extra_str=' rev:', **shared_kwargs)
            assert len(revnotations) == len(seqfos)

        for il, sfo in enumerate(seqfos):
            line = annotations[il]
            assert sfo['name'] == line['unique_ids'][0]  # note that they're not full annotations, they just have a couple keys
            if check_rev and use_rev_comp(line, revnotations[il]):
                sfo['seq'] = revfos[il]['seq']
                line = revnotations[il]
                n_rev_compd += 1
            sfo[locus] = line  # add info for each locus to the input seqfos
            n_total += 1

    if check_rev:
        print(' used rev comp for %d/%d locus results (for %d seqs)' % (n_rev_compd, n_total, len(seqfos)))
def write_single_zenodo_subdir(zenodo_dir, args, study, dset, method, mfo):
    """
    Copy the germline set that <method> inferred for <study>/<dset> into <zenodo_dir>, plus some method-specific extras.

    For partis the extras are the allele finding fit plots and per-region gene count csvs (with counts for genes
    that aren't in the written glfo reassigned to the nearest gene by name); for igdiscover it's the whole
    work/final dir; for tigger-default, nothing. Any other <method> hits an assertion.
    """
    method_outdir = heads.get_datadir(study, 'processed', extra_str=args.label) + '/' + dset
    gls_dir = get_gls_dir(method_outdir, method, data=True)
    print(' %s --> %s' % (gls_dir, zenodo_dir))
    glfo = glutils.read_glfo(gls_dir, mfo['locus'], remove_orfs='partis' in method)
    glutils.write_glfo(zenodo_dir, glfo)

    if method == 'partis':
        # allele finding plots
        fit_dir = zenodo_dir + '/fits'
        plotdir = gls_dir.replace('hmm/germline-sets', 'plots/sw/allele-finding')
        if not os.path.exists(fit_dir):
            os.makedirs(fit_dir)
        for genedir in glob.glob(plotdir + '/try-0/*'):  # would be nice to copy html, but links will be wrong
            subprocess.check_call(['cp', '-r', genedir, zenodo_dir + '/fits/'])

        # csv prevalence files
        for tmpreg in utils.regions:
            gene_column = '%s_gene' % tmpreg
            with open(gls_dir.replace('/germline-sets', '/%s_gene-probs.csv' % tmpreg)) as infile:
                countfo = {line[gene_column] : int(line['count']) for line in csv.DictReader(infile)}
            old_total = sum(countfo.values())

            # this is kind of dangerous... but the genes are read from the same parameter dir that we're reading this prevalence file, so the only way it's gonna be missing is if we just removed it with the read_glfo() line above
            orf_genes = [g for g in countfo if g not in glfo['seqs'][tmpreg]]
            for ogene in orf_genes:
                # NOTE can't use glutils.find_nearest_gene_with_same_cpos() here, since <ogene> isn't in <glfo>
                nearest_gene = glutils.find_nearest_gene_using_names(glfo, ogene)
                # print ' adding %d to %s from %s' % (countfo[ogene], utils.color_gene(nearest_gene), utils.color_gene(ogene))
                countfo[nearest_gene] += countfo[ogene]
            for ogene in orf_genes:
                del countfo[ogene]
            assert old_total == sum(countfo.values())

            with open('%s/%s_gene-probs.csv' % (zenodo_dir, tmpreg), 'w') as outfile:
                writer = csv.DictWriter(outfile, (gene_column, 'count'))
                writer.writeheader()
                for gene in countfo:
                    writer.writerow({gene_column : gene, 'count' : countfo[gene]})
    elif method == 'tigger-default':  # doesn't seem to have written anything
        pass
    elif method == 'igdiscover':
        # for fname in ['errorhistograms.pdf', 'V_usage.pdf', 'V_usage.tab']:
        #     subprocess.check_call(['cp', '%s/work/final/%s' % (gls_dir, fname), zenodo_dir + '/'])
        subprocess.check_call(['cp', '-r', '%s/work/final' % gls_dir, zenodo_dir + '/'])  # aw, screw it, just write everything. The simulation stuff is already huge, anyway
    else:
        assert False
def get_gene_sets(glsfnames, glslabels, ref_label=None):
    """
    Read the germline set corresponding to each file in <glsfnames>, labeled by the matching entry in <glslabels>.

    If <ref_label> is set, gene names in all the other sets are synchronized to that one first. Returns
    (all_genes, gl_sets): gene --> seq over all sets together, and label --> {gene : seq} for <args.region>.
    """
    glfos = {}
    for label, fname in zip(glslabels, glsfnames):
        # NOTE(review): replace() removes every occurrence of '/<locus>' in the path, not just a trailing one
        gldir = os.path.dirname(fname).replace('/' + args.locus, '')
        glfos[label] = glutils.read_glfo(gldir, args.locus)  # this is gonna fail for tigger since you only have the .fa

    if ref_label is not None:
        other_labels = [l for l in glslabels if l != ref_label]
        for label in other_labels:
            print(' syncronizing %s names to match %s' % (label, ref_label))
            glutils.synchronize_glfos(ref_glfo=glfos[ref_label], new_glfo=glfos[label], region=args.region)

    gl_sets = {}
    for label in glfos:
        gl_sets[label] = dict(glfos[label]['seqs'][args.region].items())

    all_genes = {}
    for gls in gl_sets.values():  # if a gene's in more than one set, the last one we see wins
        all_genes.update(gls.items())

    return all_genes, gl_sets
import utils import glutils from clusterpath import ClusterPath parser = argparse.ArgumentParser() parser.add_argument('--fname', default=partis_dir + '/test/reference-results/partition-ref-simu.yaml') parser.add_argument('--glfo-dir', default=partis_dir + '/data/germlines/human') parser.add_argument('--locus', default='igh') args = parser.parse_args() glfo = None if utils.getsuffix(args.fname) == '.csv': print ' reading deprecated csv format, so need to read germline info from somewhere else, using --glfo-dir %s, hopefully it works' % args.glfo_dir glfo = glutils.read_glfo(args.glfo_dir, locus=args.locus) glfo, annotation_list, cpath = utils.read_output(args.fname, glfo=glfo) if cpath is None or len(cpath.partitions) == 0: print 'no partitions read from %s, so just printing first annotation:' % args.fname utils.print_reco_event(annotation_list[0]) sys.exit(0) print utils.color('green', 'list of partitions:') cpath.print_partitions( abbreviate=True ) # 'abbreviate' print little 'o's instead of the full sequence ids # print annotations for the biggest cluster in the most likely partition annotations = {
print 'WARNING current script dir %s doesn\'t exist, so python path may not be correctly set' % partis_dir sys.path.insert(1, partis_dir + '/python') import utils from hist import Hist import plotting import glutils parser = argparse.ArgumentParser() parser.add_argument('infile') parser.add_argument('plotdir') args = parser.parse_args() def gk(uids): return ':'.join(uids) glfo = glutils.read_glfo(args.infile.replace('.csv', '-glfo'), locus='igh') annotations = {} with open(args.infile) as csvfile: reader = csv.DictReader(csvfile) for line in reader: if line['v_gene'] == '': # failed (i.e. couldn't find an annotation) continue utils.process_input_line(line) # converts strings in the csv file to floats/ints/dicts/etc. utils.add_implicit_info(glfo, line) # add stuff to <line> that's useful, isn't written to the csv since it's redundant annotations[gk(line['unique_ids'])] = line chfo = {uid : utils.get_chimera_max_abs_diff(annotations[uid], iseq=0) for uid in annotations} biggest_adiffs = sorted(chfo, key=lambda q: chfo[q][1], reverse=True) for uid in biggest_adiffs[:10]: print chfo[uid]
from Bio.Seq import Seq partis_path = '.' # edit this if you're not running from the main partis dir sys.path.insert(1, partis_path + '/python') import utils import glutils from clusterpath import ClusterPath parser = argparse.ArgumentParser() parser.add_argument('--infile') parser.add_argument('--locus') parser.add_argument('--param') parser.add_argument('--nclust') args = parser.parse_args() glfo = glutils.read_glfo(args.param + '/hmm/germline-sets', locus=args.locus) print(sys.argv) print 'infile =', args.infile print 'param =', args.param cp = ClusterPath() cp.readfile(args.infile) best_partition = cp.partitions[cp.i_best] # sorted_clusters = sorted(best_partition, key=len, reverse=True) # sort by size # clonal family attributes to print print ''' score = interest score, indicating interesting attributes: size, SHM, SFS, bnAb VH usage
parser.add_argument('gldir1') parser.add_argument('gldir2') parser.add_argument( '--names', default='+gl-1:+gl-2', help= 'colon-separated list of length 2 with labels for gldir1 and gldir2, which will be appended to each gene name in the ascii output' ) parser.add_argument('--locus', default='igh') args = parser.parse_args() args.names = utils.get_arg_list(args.names) glfos = [] for name, gldir in zip(args.names, [args.gldir1, args.gldir2]): print '%s:' % utils.color('yellow', name) glfos.append(glutils.read_glfo(gldir, args.locus, debug=True)) for region in [r for r in utils.regions if r in glfos[0]['seqs']]: aset, bset = [set(g['seqs'][region]) for g in glfos] tmpfo = glutils.get_empty_glfo( args.locus) # make a new glfo that will only have non-shared genes for glabel, gset, gfo in zip( args.names, [aset - bset, bset - aset], glfos): # <gset> is the genes that're only in <glabel> for ogene in gset: glutils.add_new_allele(tmpfo, { 'gene': '+'.join([ogene, glabel]), 'seq': gfo['seqs'][region][ogene], 'cpos': utils.cdn_pos(gfo, region, ogene) },
#!/usr/bin/env python
# Example script: read the default germline info, print the first successful annotation from a reference
# annotation csv, then print the partitions from a reference partition csv.
import csv
import sys

partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')
import utils
import glutils
from clusterpath import ClusterPath

# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus='igh')

print('first parse an annotation csv file:')
with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    # an empty v_gene means it failed (i.e. couldn't find an annotation), so take the first line that has one
    line = next((l for l in reader if l['v_gene'] != ''), None)
    if line is not None:
        utils.process_input_line(line)
        utils.add_implicit_info(glfo, line)
        utils.print_reco_event(line)

print('then parse a partition csv file:')
cp = ClusterPath()
cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv')
cp.print_partitions(abbreviate=True)
# put the partis python dir (this script's dir with '/bin' swapped for '/python') on the import path
current_script_dir = os.path.dirname(os.path.realpath(__file__)).replace('/bin', '/python')
if not os.path.exists(current_script_dir):
    print('WARNING current script dir %s doesn\'t exist, so python path may not be correctly set' % current_script_dir)
sys.path.insert(1, current_script_dir)

import utils
import glutils
import plotting

# ----------------------------------------------------------------------------------------
datadir = 'data/germlines/human'
xtitles = {
    'indels' : 'fraction of positions indel\'d',
    'subs' : 'substitution fraction'
}
glfo = glutils.read_glfo(datadir)
vgenes = glfo['aligned-genes']['v'].keys()

# group the v genes by primary version (in the order in which each primary version first shows up)
pversions = OrderedDict()
for vg in vgenes:
    pversions.setdefault(utils.primary_version(vg), []).append(vg)

# remove primary versions that only have one gene
# NOTE loop over a copy of the keys, since deleting from a dict while iterating over it isn't safe (it's a RuntimeError in newer python versions)
for pv in list(pversions):
    if len(pversions[pv]) == 1:
        print('removing single-gene pv %s' % pv)
        del pversions[pv]
# ----------------------------------------------------------------------------------------
for iname in range(len(args.names)): args.names[iname] = args.names[iname].replace('@', ' ') # if you just pass in one parent directory, we assume <args.names> contains the desired subdirs if len(args.plotdirs) == 1: parentdir = args.plotdirs[0] args.plotdirs = [parentdir + '/' + n for n in args.names] if len(args.plotdirs) != len(args.names): raise Exception('poorly formatted args:\n %s\n %s' % (' '.join(args.plotdirs), ' '.join(args.names))) # if args.gldir is not 'none': args.glfo = None if os.path.exists(args.gldir): args.glfo = glutils.read_glfo(args.gldir, args.locus) # figure out if there's subdirs we need to deal with listof_plotdirlists, listof_outdirs = [], [] firstdir = args.plotdirs[0] if len(glob.glob(firstdir + '/*.csv')) > 0: # add the parent dirs if they've got csvs listof_plotdirlists.append(args.plotdirs) listof_outdirs.append(args.outdir) for subdir in [ d for d in os.listdir(firstdir) if os.path.isdir(firstdir + '/' + d) ]: listof_plotdirlists.append([d + '/' + subdir for d in args.plotdirs]) listof_outdirs.append(args.outdir + '/' + subdir) for dlist, outdir in zip(listof_plotdirlists, listof_outdirs):
def get_alleles(self, swfo, plotdir=None, debug=False): print 'clustering for new alleles' # NOTE do *not* modify <self.glfo> (in the future it would be nice to just modify <self.glfo>, but for now we need it to be super clear in partitiondriver what is happening to <self.glfo>) default_initial_glfo = self.glfo if self.args.default_initial_germline_dir is not None: # if this is set, we want to take any new allele names from this directory's glfo if they're in there default_initial_glfo = glutils.read_glfo(self.args.default_initial_germline_dir, self.glfo['locus']) glfo_to_modify = copy.deepcopy(default_initial_glfo) # so we can add new genes to it, so we can check for equivalency more easily TODO fix that shit, obviously else: print ' %s --default-initial-germline-dir isn\'t set, so new allele names won\'t correspond to existing names' % utils.color('yellow', 'warning') qr_seqs, threshold = self.choose_clonal_representatives(swfo, debug=debug) if qr_seqs is None: return {} # self.check_for_donuts(debug=debug) if not self.args.kmeans_allele_cluster: clusterfos, msa_info = self.vsearch_cluster_v_seqs(qr_seqs, threshold, debug=debug) else: clusterfos = self.kmeans_cluster_v_seqs(qr_seqs, swfo, plotdir=plotdir, debug=debug) msa_info = clusterfos # and finally loop over each cluster, deciding if it corresponds to a new allele if debug: print ' looping over %d clusters with %d sequences' % (len(clusterfos), sum([len(cfo['seqfos']) for cfo in clusterfos])) print ' rank seqs v/j mfreq seqs snps (%s)' % utils.color('blue', 'indels') new_alleles = {} n_existing_gene_clusters = 0 for iclust in range(len(clusterfos)): clusterfo = clusterfos[iclust] # dot_products = [utils.dot_product(clusterfo['cons_seq'], seq1, seq2) for seq1, seq2 in itertools.combinations([seqfo['seq'] for seqfo in clusterfo['seqfos']], 2)] # mean_dot_product = numpy.average(dot_products) # choose the most common existing gene to use as a template (the most similar gene might be a better choice, but 
deciding on "most similar" would involve adjudicating between snps and indels, and it shouldn't really matter) sorted_glcounts, true_sorted_glcounts = self.get_glcounts(clusterfo) template_gene, template_counts = sorted_glcounts[0] template_seq = self.glfo['seqs'][self.region][template_gene] template_cpos = utils.cdn_pos(self.glfo, self.region, template_gene) assert '.' not in clusterfo['cons_seq'] # make sure you haven't switched to something that doesn't use '-' for gap chars new_seq = clusterfo['cons_seq'].replace('-', '') # I'm not sure that I completely understand the dashes in this sequence, but it seems to be right to just remove 'em aligned_template_seq, aligned_new_seq = utils.align_seqs(template_seq, clusterfo['cons_seq']) has_indels = '-' in aligned_template_seq.strip('-') or '-' in aligned_new_seq.strip('-') # only counts internal indels cluster_mfreqs = {r : [self.mfreqs[r][seqfo['name']] for seqfo in clusterfo['seqfos']] for r in self.mfreqs} # regional mfreqs for each sequence in the cluster corresponding to the initially-assigned existing gene mean_cluster_mfreqs = {r : numpy.mean(cluster_mfreqs[r]) for r in cluster_mfreqs} equiv_name, equiv_seq = glutils.find_equivalent_gene_in_glfo(glfo_to_modify, new_seq, template_cpos) if equiv_name is not None: new_name = equiv_name new_seq = equiv_seq else: new_name, _ = glutils.choose_new_allele_name(template_gene, new_seq, indelfo={'indels' : ['xxx', 'xxx', 'xxx']} if has_indels else None) # the fcn just checks to see if it's non-None and of length greater than zero...TODO it would be nice to figure out actual snp and indel info if debug: self.print_cluster(iclust, clusterfo, sorted_glcounts, new_seq, true_sorted_glcounts, mean_cluster_mfreqs, has_indels) if new_name in self.glfo['seqs'][self.region]: # note that this only looks in <self.glfo>, not in <new_alleles> n_existing_gene_clusters += 1 if debug: print 'existing %s' % utils.color_gene(new_name) continue if new_name in new_alleles: # already added it 
NOTE might make more sense to use <glfo_to_modify> here instead of <new_alleles> (or just not have freaking both of them) if debug: print '%s (%s)' % (utils.color_gene(new_name), utils.color('red', 'new')) continue assert new_seq not in new_alleles.values() # if it's the same seq, it should've got the same damn name if not has_indels: # we assume that the presence of indels somewhat precludes false positives, which is equivalent to an assumption about the rarity of shm indels if self.too_close_to_existing_glfo_gene(clusterfo, new_seq, template_seq, template_cpos, template_gene, debug=debug): # presumably if it were really close to another (non-template) existing glfo gene, that one would've been the template continue if mean_cluster_mfreqs['j'] > 0. and self.mean_mfreqs['j'] > 0.: this_cluster_ratio = mean_cluster_mfreqs['v'] / mean_cluster_mfreqs['j'] overall_ratio = self.mean_mfreqs['v'] / self.mean_mfreqs['j'] if this_cluster_ratio / overall_ratio < self.mfreq_ratio_threshold: if debug: print 'v / j cluster mfreqs too small %6.3f / %6.3f = %6.3f < %6.3f' % (this_cluster_ratio, overall_ratio, this_cluster_ratio / overall_ratio, self.mfreq_ratio_threshold) continue if self.too_close_to_already_added_gene(new_seq, new_alleles, debug=debug): # this needs to be applied even if there are indels, since the indels are with respect to the (existing glfo) template gene, not to the [potentially] previously-added gene continue print '%s %s%s' % (utils.color('red', 'new'), utils.color_gene(new_name), ' (exists in default germline dir)' if new_name in default_initial_glfo['seqs'][self.region] else '') new_alleles[new_name] = {'template-gene' : template_gene, 'gene' : new_name, 'seq' : new_seq} if new_alleles[new_name]['gene'] not in glfo_to_modify['seqs'][self.region]: # if it's in <default_initial_glfo> it'll already be in there glutils.add_new_allele(glfo_to_modify, new_alleles[new_name]) # just so we can check for equivalency if debug: print ' %d / %d clusters consensed to 
existing genes' % (n_existing_gene_clusters, len(msa_info)) self.reassign_template_counts(msa_info, new_alleles, debug=False) for new_name, newfo in new_alleles.items(): # print '%s %s %.1f / %.1f = %.4f' % (new_name, newfo['template-gene'], self.adjusted_glcounts[newfo['template-gene']], float(sum(self.adjusted_glcounts.values())), self.adjusted_glcounts[newfo['template-gene']] / float(sum(self.adjusted_glcounts.values()))) if self.adjusted_glcounts[newfo['template-gene']] / float(sum(self.adjusted_glcounts.values())) < self.args.min_allele_prevalence_fraction: # NOTE self.adjusted_glcounts only includes large clusters, and the constituents of those clusters are clonal representatives, so this isn't quite the same as in alleleremover newfo['remove-template-gene'] = True return new_alleles
def partis_naive_seq(lseq, fnam): ''' Given a number of sequences infer the naive sequence using partis. ''' # Specify filenames: pretty_random_fnam = str(random.randint(1, 10**100)) inpf = pretty_random_fnam + '_input' outf = pretty_random_fnam + '_output' # Write input fasta file for partis: with open(TMPDIR+'/'+inpf+'.fa', 'w') as fho: for i, s in enumerate(lseq): fho.write('>{}\n{}\n'.format(str(i), s)) # Run partis: cmd = '{}/bin/partis partition --locus {} --species {} --infname {}/{}.fa --outfname {}/{}.csv'.format(partis_path, args.LOCUS, args.SPECIES, TMPDIR, inpf, TMPDIR, outf) # os.system(cmd) # Print partis STDOUT to screen os.system('{} > {}/{}.log'.format(cmd, TMPDIR, pretty_random_fnam)) try: # Read the partis output file and extract the naive sequence: with open(TMPDIR+'/'+outf+'-cluster-annotations.csv') as fh: reader = csv.DictReader(fh) data = list(reader) # assert(len(data) == 1) # There should really only be one clonal family, but there often are, so just take the first (largest) # Extract germline bounds info and trim the naive DNA sequence: try: utils.process_input_line(data[0]) # Process dataframe row fnam_base = fnam.split('_partitions')[0].split('/') #glfo = glutils.read_glfo('{}/_output/{}/hmm/germline-sets'.format(fnam_base[0], fnam_base[-1]), locus=args.LOCUS) glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus=args.LOCUS) utils.add_implicit_info(glfo, data[0]) # Adding germline infor except Exception as e: print e raise e naiveDNA = data[0]['naive_seq'][:] first_lseq = data[0]['input_seqs'][:][0] vj_bounds = (data[0]['regional_bounds']['v'][0], data[0]['regional_bounds']['j'][1]) naiveDNA = repair_new_naive(naiveDNA[:], naiveDNA[:], vj_bounds) first_lseq = repair_new_naive(first_lseq, naiveDNA[:], vj_bounds) try: assert(len(first_lseq) == len(naiveDNA)) except: print 'len(first_lseq) != len(data[0]["naive_seq"])' print len(first_lseq) print first_lseq print len(naiveDNA) print naiveDNA # If the inferred naive sequence 
contains a stop codon replace it by the input sequence codon: if '*' in str(Seq(naiveDNA, generic_dna).translate()): print 'Found stop codon in inferred naive sequnce, will replace with input sequence codon.' print 'Before replacement:', naiveDNA naiveDNA_l = list(naiveDNA[:]) for codon in range(vj_bounds[0], vj_bounds[1], 3): if '*' == str(Seq(naiveDNA[codon:codon+3], generic_dna).translate()): naiveDNA_l[codon:codon+3] = first_lseq[codon:codon+3] naiveDNA = ''.join(naiveDNA_l) print 'After replacement:', naiveDNA if naiveDNA == first_lseq: print 'Complaining to say naiveDNA == first_lseq (nothing bad just to be sure the repair is not just replacing the naive sequence with the input entirely)' return(naiveDNA) finally: # Clean up: os.system('rm -r {}/{}* _output/*{}*'.format(TMPDIR, pretty_random_fnam, pretty_random_fnam))
def extract_seqs(fnam): ''' Reads a partis cluster-annotations file and extracts relevant information and sequences. ''' # Read cluster annotations into a data list of dictionaries: with open(fnam) as fh: reader = csv.DictReader(fh) data = list(reader) sequences_i = list() info_i = list() if args.allele_finding: fnam_base = fnam.split('_partitions')[0].split('/') glfo = glutils.read_glfo('{}/_output/{}/hmm/germline-sets'.format(fnam_base[0], fnam_base[-1]), locus=args.LOCUS) else: glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus=args.LOCUS) for row in data: # Process the partis data row and add germline information: try: utils.process_input_line(row) # Read default germline info utils.add_implicit_info(glfo, row) except Exception as e: # Skip rows that cannot be processed if 'failed annotation' not in e: pass # print('First skip') # print(e) else: print 'Reading from' print '{}/_output/{}/hmm/germline-sets'.format(fnam_base[0], fnam_base[-1]) print e continue # # Process the partis data row and add germline information: # try: # utils.process_input_line(row) # utils.add_implicit_info(glfo, row) # except: # Skip rows that cannot be processed # continue # Extract the full N padded naive sequence, # and find the v -and j gene bound on this naive sequence: cdr3_bounds = (row['codon_positions']['v'], row['codon_positions']['j'] + 3) vj_bounds = (row['regional_bounds']['v'][0], row['regional_bounds']['j'][1]) naiveDNA = row['naive_seq'] # Skip naive sequences too short or with stop codons: if repair_seq(naiveDNA, naiveDNA, vj_bounds, keep_check=True) is False: continue trimmed_naiveDNA = repair_seq(naiveDNA[:], naiveDNA[:], vj_bounds) naiveAA = str(Seq(trimmed_naiveDNA, generic_dna).translate()) # There has been a name change and this try/except is meant to provide backwards compatability: try: lseq = row['input_seqs'][:] except: lseq = row['seqs'][:] ir_lseq = row['indel_reversed_seqs'] stop_seq = row['stops'] assert(len(lseq) == len(ir_lseq)) 
assert(len(lseq) == len(stop_seq)) # Only keep sequences without indels and stop codons and minimum length amino acid length (QC): ### ir_lseq[i] == '' or lseq[i] == ir_lseq[i] <-- No indels ### stop_seq[i] <-- No partis annotated stops (there seems still to be stops after these are removed though) ### repair_seq(lseq[i], naiveDNA, vj_bounds, keep_check=True) <-- Checks whether the sequence is long enougth or have stop codons keep_idx = [1 if ((ir_lseq[i] == '' or lseq[i] == ir_lseq[i]) and stop_seq[i] is False and repair_seq(lseq[i], naiveDNA, vj_bounds, keep_check=True)) else 0 for i in range(len(lseq))] # Now only keep those sequences that passed QC: lseq = [s for s, keep in zip(lseq, keep_idx) if keep == 1] # Get amino acid sequences: lAAseq = [str(Seq(repair_seq(s[:], naiveDNA[:], vj_bounds), generic_dna).translate()) for s in lseq] # And mutation frequencies: mut_freqs = [s for s, keep in zip(row['mut_freqs'], keep_idx) if keep == 1] assert(len(mut_freqs) == len(lseq)) # Convert frequency to counts: Nmuts = [int(round(float(t[0])*len(t[1].strip('N')))) for i, t in enumerate(zip(mut_freqs, lseq))] # Deduplicate AAseqs and lseq according to the duplications on amino acid level: lAAseq_dict = dict() lseq_unique = list() for i, aa in enumerate(lAAseq): if aa in lAAseq_dict: lAAseq_dict[aa].append(i) else: lAAseq_dict[aa] = [i] lseq_unique.append(repair_seq(lseq[i][:], naiveDNA[:], vj_bounds)) assert(len(lAAseq_dict) == len(lseq_unique)) # Make the deduplicated sequence list and the mutation rates: lAAseq_dedup = list() Nmuts_dedup = list() for aa, idxs in lAAseq_dict.items(): lAAseq_dedup.append(aa) Nmut_list = [float(Nmuts[i]) for i in idxs] Nmuts_dedup.append(int(round(sum(Nmut_list)/len(Nmut_list)))) assert(len(lAAseq_dedup) == len(Nmuts_dedup)) assert(len(lAAseq_dedup) == len(lseq_unique)) # Exclude small clonal families after all the QC and deduplication: if len(lAAseq_dedup) < args.MIN_OBS: continue # Store the results in a list: 
sequences_i.append(['naive_seq', naiveAA]) # This format is for ANARCI numbering info_i.append({'fnam': fnam, 'v_gene': row['v_gene'], 'd_gene': row['d_gene'], 'j_gene': row['j_gene'], 'naive_seq': naiveAA, 'naive_seq_DNA': trimmed_naiveDNA, 'Nmuts': Nmuts_dedup[:], 'AAseqs': lAAseq_dedup[:], 'DNAseqs': lseq_unique[:]}) return(sequences_i, info_i)
def _get_clonal_family_stats(path_to_annotations, metadata, use_np=False, use_immunized=False, locus=''):
    '''
    get data statistics from partis annotations

    @param path_to_annotations: path to partis annotations
    @param metadata: path to partis metadata
    @param use_np: use nonproductive seqs?
    @param use_immunized: for Cui data, use immunized mice?
    @param locus: which locus to use

    @return list of dicts with clonal family sizes and naive seqs from processed data
        (keys: 'n_taxa', 'germline_sequence', 'germline_name', 'v_call')
    '''
    partition_info = get_partition_info(
        path_to_annotations,
        metadata,
    )

    if use_np:
        # return only nonproductive sequences
        # here "nonproductive" is defined as having a stop codon or being
        # out of frame or having a mutated conserved cysteine
        # The flags are combined per sequence; applying "or" to the whole lists
        # would just pick one of the lists instead of combining them.
        # (assumes the three entries are parallel per-sequence lists once the
        # line has been processed by partis -- TODO confirm)
        def good_seq(seqs):
            return [
                stop or not in_frame or mutated
                for stop, in_frame, mutated in zip(
                    seqs['stops'], seqs['in_frames'], seqs['mutated_invariants'])
            ]
    else:
        # return all sequences
        def good_seq(seqs):
            return [True for seq in seqs['seqs']]

    all_germline_dicts = []
    for data_idx, data_info in enumerate(partition_info):
        if use_immunized and data_info['group'] != 'immunized':
            continue
        # NOTE(review): with the default locus='' every dataset is skipped here,
        # so the function returns an empty list -- confirm that this is intended
        if not locus or data_info['locus'] != locus:
            continue

        # make the bundled partis modules importable (only add the path once)
        PARTIS_PATH = os.path.dirname(os.path.realpath(__file__)) + '/partis'
        partis_python_dir = PARTIS_PATH + '/python'
        if partis_python_dir not in sys.path:
            sys.path.insert(1, partis_python_dir)
        from utils import add_implicit_info, process_input_line
        import glutils
        glfo = glutils.read_glfo(data_info['germline_file'], locus=data_info['locus'])

        with open(data_info['annotations_file'], "r") as csvfile:
            reader = csv.DictReader(csvfile)
            for idx, line in enumerate(reader):
                # add goodies from partis
                if len(line['input_seqs']) == 0:
                    # sometimes data will have empty clusters
                    continue
                process_input_line(line)
                add_implicit_info(glfo, line)
                good_seq_idx = [
                    i for i, is_good in enumerate(good_seq(line)) if is_good
                ]
                if not good_seq_idx:
                    # no nonproductive sequences... skip
                    continue

                all_germline_dicts.append({
                    'n_taxa': len(good_seq_idx),
                    'germline_sequence': disambiguate(line['v_gl_seq'].lower()),
                    'germline_name': '-'.join([line['v_gene'], str(idx)]),
                    'v_call': line['v_gene'],
                })

    return all_germline_dicts
def write_partis_data_from_annotations(
        output_genes,
        output_seqs,
        path_to_annotations,
        metadata,
        filters=None,
        seq_filters=None,
        min_clonal_family_size=0,
        min_seq_len=0,
        max_mut_pct=1.,
        min_mut_pct=0.,
        clone_str='',
        region='v',
        germline_family='v',
):
    """
    Function to read partis annotations csv

    @param output_genes: path of the csv file the germline (gene) rows are written to
    @param output_seqs: path of the csv file the sequence rows are written to
    @param path_to_annotations: path to annotations files
    @param metadata: csv file of metadata; if None defaults will be used for chain/species
    @param filters: dictionary of lists with keys as column name and items as those values of the column variable to retain;
        filters out families, e.g., {'locus': ['igk']}, etc. (None means no filtering)
    @param seq_filters: same as filters, but for sequences, e.g., {'indel_reversed_seqs': [''], 'in_frames': [False]} will
        only retain sequences that are out of frame and did not have an indel (None means no filtering)
    @param min_clonal_family_size: minimum clonal family size
    @param min_seq_len: minimum sequence length
    @param max_mut_pct: maximum mutation percentage
    @param min_mut_pct: minimum mutation percentage
    @param clone_str: string for identifying clones (useful if merging annotations from multiple datasets)
    @param region: B-cell receptor region ('v', 'd', 'j', or 'vdj')
    @param germline_family: for performing cross validation ('v', 'd', or 'j')

    @raise ValueError: if region or germline_family is not one of the allowed values

    @write genes to output_genes and seqs to output_seqs
    """
    families = ['v', 'd', 'j']
    if germline_family not in families:
        raise ValueError("Invalid germline_family: %s. Must be one of %s" % (germline_family, families))

    regions = ['v', 'd', 'j', 'vdj']
    if region not in regions:
        raise ValueError("Invalid region: %s. Must be one of %s" % (region, regions))

    # None stands in for "no filters", which avoids mutable default arguments
    filters = {} if filters is None else filters
    seq_filters = {} if seq_filters is None else seq_filters

    PARTIS_PATH = os.path.dirname(os.path.realpath(__file__)) + '/partis'
    sys.path.insert(1, PARTIS_PATH + '/python')
    from utils import add_implicit_info, process_input_line
    import glutils

    partition_info = get_partition_info(
        path_to_annotations,
        metadata,
    )

    with open(output_genes, 'w') as genes_file, open(output_seqs, 'w') as seqs_file:
        gene_writer = csv.DictWriter(genes_file, ['germline_name', 'germline_sequence'])
        gene_writer.writeheader()

        seq_header = [
            'germline_name',
            'sequence_name',
            'sequence',
            'germline_family',
            'v_gene',
            'region',
        ]
        # the remaining columns are the metadata keys of the first dataset
        # (if there are no datasets at all we just write the two headers)
        if partition_info:
            seq_header += list(partition_info[0])

        seq_writer = csv.DictWriter(seqs_file, seq_header)
        seq_writer.writeheader()
        for data_idx, data_info in enumerate(partition_info):
            # family-level filter: skip datasets with a metadata value that isn't in the retained list
            if any([
                    data_info[key] not in values
                    for key, values in filters.items()
            ]):
                continue
            glfo = glutils.read_glfo(data_info['germline_file'], locus=data_info['locus'])
            with open(data_info['annotations_file'], "r") as csvfile:
                reader = csv.DictReader(csvfile)
                for idx, line in enumerate(reader):
                    if line['v_gene'] == '':
                        # failed annotations
                        continue

                    # add goodies from partis
                    process_input_line(line)
                    add_implicit_info(glfo, line)
                    n_seqs = len(line['input_seqs'])
                    if n_seqs < min_clonal_family_size:
                        # don't take small clonal families---for data quality purposes
                        continue

                    if region == 'vdj':
                        gl_seq = line['naive_seq'].lower()
                        all_seqs = [seq.lower() for seq in line['seqs']]
                    else:
                        gl_seq = line['v_gl_seq'].lower()
                        all_seqs = [seq.lower() for seq in line['v_qr_seqs']]

                    # each filter contributes the set of sequence indices it accepts
                    idx_list = []

                    # frequency filter
                    idx_list.append(
                        set([
                            i for i, val in enumerate(line['mut_freqs'])
                            if val < max_mut_pct and val >= min_mut_pct
                        ]))

                    # sequence length filter (ambiguous 'n' bases don't count towards the length)
                    idx_list.append(
                        set([
                            i for i, val in enumerate(all_seqs)
                            if len(val.replace('n', '')) > min_seq_len
                        ]))

                    for key, values in seq_filters.items():
                        idx_list.append(
                            set([
                                i for i, val in enumerate(line[key])
                                if val in values
                            ]))

                    good_seq_idx = set.intersection(*idx_list)
                    if not good_seq_idx:
                        # no sequences after filtering... skip
                        continue

                    gl_name = 'clone{}-{}-{}'.format(
                        *[data_idx, idx, clone_str])
                    gene_writer.writerow({
                        'germline_name': gl_name,
                        'germline_sequence': gl_seq,
                    })

                    # sorted so that the row order doesn't depend on set iteration order
                    for good_idx in sorted(good_seq_idx):
                        base_dict = {
                            'germline_name': gl_name,
                            'sequence_name': '-'.join([gl_name, line['unique_ids'][good_idx]]),
                            'sequence': all_seqs[good_idx].lower(),
                            'germline_family': line['{}_gene'.format(germline_family)][:5],
                            'v_gene': line['v_gene'],
                            'region': region,
                        }
                        # copy the dataset's metadata into every sequence row
                        for key, value in data_info.items():
                            base_dict[key] = value

                        seq_writer.writerow(base_dict)
parser.add_argument('--strings-to-ignore') # remove this string from the plot names in each dir (e.g. '-mean-bins') NOTE replaces '_' with '-' print 'TODO this should really be an importable module, not its own script' args = parser.parse_args() if args.strings_to_ignore is not None: args.strings_to_ignore = args.strings_to_ignore.replace('_', '-') args.plotdirs = utils.get_arg_list(args.plotdirs) args.scale_errors = utils.get_arg_list(args.scale_errors) args.colors = utils.get_arg_list(args.colors, intify=True, translation={810 : 'red', 634 : 'darkred', 596 : 'mediumblue', 418 : 'green', 798 : 'goldenrod', 869 : 'lightseagreen'}) args.str_colors = utils.get_arg_list(args.str_colors) if args.str_colors is not None: args.colors = args.str_colors args.linestyles = utils.get_arg_list(args.linestyles, intify=True, translation={1 : '-',2 : '--'}) args.names = utils.get_arg_list(args.names) args.leaves_per_tree = utils.get_arg_list(args.leaves_per_tree, intify=True) args.strings_to_ignore = utils.get_arg_list(args.strings_to_ignore) args.markersizes = utils.get_arg_list(args.markersizes, intify=True) args.linewidths = utils.get_arg_list(args.linewidths, intify=True) args.alphas = utils.get_arg_list(args.alphas, floatify=True) for iname in range(len(args.names)): args.names[iname] = args.names[iname].replace('@', ' ') assert len(args.plotdirs) == len(args.names) glfo = glutils.read_glfo(args.datadir) args.cyst_positions = glfo['cyst-positions'] args.tryp_positions = glfo['tryp-positions'] plotting.compare_directories(args)
def parse_args():
    """
    Build the command line parser, parse the arguments and decorate them with derived values.

    Derived values set on the returned namespace:
      - unique_ids: falls back to the contents of --unique-ids-file
      - germline_sets: <parameter_dir>/hmm/germline-sets, or the default germline sets if --parameter-dir is omitted
      - glfo: the germline info read from germline_sets for the given locus
    """
    def existing_file(fname):
        """Argparse type for an existing file"""
        if not os.path.isfile(fname):
            raise ValueError("Can't find file: " + str(fname))
        return fname

    def stripped_file_contents(fname):
        """Argparse type that returns the whitespace-stripped contents of a file (and closes the file again)"""
        with open(fname) as infile:
            return infile.read().strip()

    parser = argparse.ArgumentParser(description=__doc__)

    inputs = parser.add_argument_group(title="Input files", description="(required)")
    inputs.add_argument('--partition-file',
                        help='partitions file as output by partis',
                        type=existing_file,
                        required=True)
    inputs.add_argument(
        '--upstream-seqmeta',
        help='optionally, specify upstream seqmeta as a csv with cols: unique_id,timepoint,multiplicity',
        # Index rows by unique id
        type=csv_reader('unique_id'))

    outputs = parser.add_argument_group(title="Output files", description="(optional)")
    outputs.add_argument('--seqmeta-out', help='per sequence metadata CSV file')
    outputs.add_argument('--seqs-out', help='cluster sequences as a FASTA file')
    outputs.add_argument('--cluster-meta-out', help='cluster sequences as a JSON file')
    # If we support a recursive option, we have to name I guess?
    #outputs.add_argument(
        #'--process-all-data-to',
        #help="writes all data for all partitions/clusters to the specified directory")

    partis_args = parser.add_argument_group(
        title="Partis args",
        description="""These arguments (as passed to partis) are required in order to process the data correctly.""")
    partis_args.add_argument(
        '--parameter-dir',
        help='parameter dir path, as passed to partis (if omitted, gls assumed to be ' + default_germline_sets + ')')
    partis_args.add_argument('--locus', help='again, as passed to partis', required=True)

    cluster_selection_args = parser.add_argument_group(
        title="Cluster selection args",
        description="""Given a partition file and associated cluster annotation file, there may be multiple clusters one might extract data for. These options allow you to specify a selection.""")
    cluster_selection_args.add_argument(
        '--partition',
        type=int,
        default=0,
        help='"best plus" index; defaults to 0 (best partition); 1 selects the next partition step, etc.')
    cluster_selection_args.add_argument(
        '--cluster',
        type=int,
        help="""index of cluster in partition-file after sorting by cluster size; defaults to seed cluster if seeded and 0 (the largest cluster) otherwise.""")
    # add a non sorted version?
    cluster_selection_args.add_argument(
        '--unique-ids',
        help='select a specific cluster using its unique_ids signature')
    cluster_selection_args.add_argument(
        '--unique-ids-file',
        help='select a specific cluster using its unique_ids signature in a single line in a file',
        type=stripped_file_contents)

    other_args = parser.add_argument_group(title="Other options")
    other_args.add_argument(
        '--remove-frameshifts',
        help='if set, removes seqs with frameshifted indels from output',
        action="store_true")
    other_args.add_argument(
        '--remove-stops',
        help='if set, removes seqs with stop codons from output',
        action="store_true")
    other_args.add_argument(
        '--remove-mutated-invariants',
        help='if set, removes seqs with mutated "invariant" regions from output',
        action="store_true")
    other_args.add_argument(
        '--indel-reversed-seqs',
        help='if set, uses the "indel_reversed_seqs" output of partis instead of "seqs"',
        action="store_true")
    other_args.add_argument(
        '--max-sequences',
        help="""if set, downsamples semi-randomly, with preference towards sequences with higher multiplicity and order output by partis""",
        type=int)
    other_args.add_argument(
        '--always-include',
        type=lambda x: x.split(','),
        help='comma separated list of ids to keep if --max-sequences is set',
        default=[])
    other_args.add_argument(
        '--paths-relative-to',
        default='.',
        help='files pointed to from metadata.json file will be specified relative to this path')
    other_args.add_argument(
        '--namespace',
        help='namespace to be applied to cluster meta attr names')
    other_args.add_argument('--inferred-naive-name', help='see scons option help')
    # --indel-reversed-seqs
    # --remove-mutated-invariants

    # parse args and decorate with derived values
    args = parser.parse_args()
    # an explicit --unique-ids takes precedence over the contents of --unique-ids-file
    args.unique_ids = args.unique_ids or args.unique_ids_file
    # default germline set (discouraged)
    args.germline_sets = os.path.join(
        args.parameter_dir,
        'hmm/germline-sets') if args.parameter_dir else default_germline_sets
    args.glfo = glutils.read_glfo(args.germline_sets, args.locus)

    return args
# add the intronic v genes to glfo for d_gene, refseq in refseqs.items(): glfo['seqs']['v'][utils.generate_dummy_v(d_gene)] = refseq glfo['cyst-positions'][utils.generate_dummy_v(d_gene)] = len(refseq) - 3 # write a glfo dir with everything glutils.write_glfo(outdir + '/germlines/imgt-and-intronic', glfo, debug=True) # remove the original v genes, and write a glfo dir with just the intronic ones glutils.remove_genes(glfo, [g for g in glfo['seqs']['v'] if 'xDx' not in g], debug=True) glutils.write_glfo(outdir + '/germlines/intronic', glfo, debug=True) # tmpglfo = glutils.read_glfo('tmp-germlines', 'h') glfo = glutils.read_glfo('data/germlines/human', 'h') infname = '/fh/fast/matsen_e/data/2016-06-02-katie/VMO_Memory-3/VMO_Memory-3.tsv' outdir = '/fh/fast/matsen_e/processed-data/partis/2016-06-02-katie' n_failed, n_v_ok, n_total = 0, 0, 0 introns = {} with open(infname) as infile: reader = csv.DictReader(infile, delimiter='\t') iline = 0 introns = {} for line in reader: iline += 1 # if iline > 1000: # break
args.names[iname] = args.names[iname].replace('@', ' ') # if you just pass in one parent directory, we assume <args.names> contains the desired subdirs if len(args.plotdirs) == 1: parentdir = args.plotdirs[0] args.plotdirs = [parentdir + '/' + n for n in args.names] if len(args.plotdirs) != len(args.names): raise Exception('poorly formatted args:\n %s\n %s' % (' '.join(args.plotdirs), ' '.join(args.names))) # make a merged glfo from all the gldirs args.glfo = None if args.gldirs is not None: for gldir in [gd for gd in args.gldirs if os.path.exists(gd)]: tmpglfo = glutils.read_glfo(gldir, args.locus) if args.glfo is None: args.glfo = tmpglfo else: args.glfo = glutils.get_merged_glfo(args.glfo, tmpglfo) # figure out if there's subdirs we need to deal with listof_plotdirlists, listof_outdirs = [], [] firstdir = args.plotdirs[0] if len(glob.glob(firstdir + '/*.csv')) > 0: # add the parent dirs if they've got csvs listof_plotdirlists.append(args.plotdirs) listof_outdirs.append(args.outdir) for subdir in [ d for d in os.listdir(firstdir) if os.path.isdir(firstdir + '/' + d) ]:
#!/usr/bin/env python import csv import sys partis_path = '.' # edit this if you're not running from the main partis dir sys.path.insert(1, partis_path + '/python') import utils import glutils from clusterpath import ClusterPath # read default germline info glfo = glutils.read_glfo(partis_path + '/data/germlines/human', chain='h') print 'first parse an annotation csv file:' with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile: reader = csv.DictReader(csvfile) for line in reader: utils.process_input_line(line) utils.add_implicit_info(glfo, line) utils.print_reco_event(glfo['seqs'], line) cdr3_bounds = (line['codon_positions']['v'], line['codon_positions']['j'] + 3) print '' print ' should match the above:' print ' %s naive cdr3' % line['naive_seq'][cdr3_bounds[0] : cdr3_bounds[1]] print ' %s mature' % line['indel_reversed_seqs'][0][cdr3_bounds[0] : cdr3_bounds[1]] print '' break print 'then parse a partition csv file:' cp = ClusterPath() cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv')
def run_test(args): print 'seed %d' % args.seed label = 'test' #get_label(existing_genes, new_allele) simfname = args.outdir + '/simu-' + label + '.csv' outpdir = args.outdir + '/simu-' + label plotdir = args.outdir + '/simu-' + label + '-plots' # simulate if not args.nosim: cmd_str = base_cmd + ' simulate --n-sim-events ' + str(args.n_sim_events) + ' --n-leaves ' + str(args.n_leaves) + ' --constant-number-of-leaves --rearrange-from-scratch --outfname ' + simfname cmd_str += ' --mutation-multiplier ' + str(args.mut_mult) cmd_str += ' --n-procs ' + str(args.n_procs) if args.slurm: cmd_str += ' --batch-system slurm --subsimproc' if args.gen_gset: cmd_str += ' --generate-germline-set' cmd_str += ' --n-genes-per-region 1:5:3' cmd_str += ' --n-alleles-per-gene 2,3:1,2:1,2' else: simulation_genes = ':'.join(args.sim_v_genes + args.dj_genes) sglfo = glutils.read_glfo('data/germlines/human', chain=chain, only_genes=simulation_genes.split(':')) added_snp_names = None if args.snp_positions is not None: snps_to_add = [{'gene' : args.sim_v_genes[ig], 'positions' : args.snp_positions[ig]} for ig in range(len(args.sim_v_genes))] added_snp_names = glutils.add_some_snps(snps_to_add, sglfo, debug=True, remove_template_genes=args.remove_template_genes) if args.allele_prevalence_freqs is not None: if len(args.allele_prevalence_freqs) != len(sglfo['seqs']['v']): raise Exception('--allele-prevalence-freqs not the right length') gene_list = sorted(sglfo['seqs']['v']) if added_snp_names is None else list(set(args.sim_v_genes)) + added_snp_names prevalence_freqs = {'v' : {g : f for g, f in zip(gene_list, args.allele_prevalence_freqs)}, 'd' : {}, 'j' : {}} glutils.write_allele_prevalence_freqs(prevalence_freqs, args.workdir + '/allele-prevalence-freqs.csv') cmd_str += ' --allele-prevalence-fname ' + args.workdir + '/allele-prevalence-freqs.csv' glutils.write_glfo(args.outdir + '/germlines/simulation', sglfo) cmd_str += ' --initial-germline-dir ' + args.outdir + '/germlines/simulation' if 
args.seed is not None: cmd_str += ' --seed ' + str(args.seed) run(cmd_str) # remove any old sw cache files sw_cachefiles = glob.glob(outpdir + '/sw-cache-*.csv') if len(sw_cachefiles) > 0: for cachefname in sw_cachefiles: check_call(['rm', '-v', cachefname]) sw_cache_gldir = cachefname.replace('.csv', '-glfo') if os.path.exists(sw_cache_gldir): # if stuff fails halfway through, you can get one but not the other glutils.remove_glfo_files(sw_cache_gldir, chain) # os.rmdir(sw_cache_gldir) # generate germline set and cache parameters cmd_str = base_cmd + ' cache-parameters --infname ' + simfname + ' --only-smith-waterman --debug-allele-finding --always-find-new-alleles --n-max-allele-finding-iterations 2' # --dont-collapse-clones' # cmd_str = 'python -m cProfile -s tottime -o prof.out ' + cmd_str cmd_str += ' --n-procs ' + str(args.n_procs) if args.slurm: cmd_str += ' --batch-system slurm' if args.gen_gset: cmd_str += ' --find-new-alleles' else: inference_genes = ':'.join(args.inf_v_genes + args.dj_genes) iglfo = glutils.read_glfo('data/germlines/human', chain=chain, only_genes=inference_genes.split(':'), debug=True) glutils.write_glfo(args.outdir + '/germlines/inference', iglfo) cmd_str += ' --initial-germline-dir ' + args.outdir + '/germlines/inference' cmd_str += ' --find-new-alleles --dont-remove-unlikely-alleles' # --new-allele-fname ' + args.outdir + '/new-alleles.fa' # cmd_str += ' --n-max-snps 12' cmd_str += ' --parameter-dir ' + outpdir cmd_str += ' --plotdir ' + plotdir if args.seed is not None: cmd_str += ' --seed ' + str(args.seed) run(cmd_str)
for region in [r for r in utils.regions if r in glfo['seqs']]: for gene, seq in glfo['seqs'][region].items(): if utils.ambig_frac(seq) > 0.: if debug: print ' %d ambiguous bases: %s' % ( len(seq) * utils.ambig_frac(seq), utils.color_gene(gene)) glutils.remove_gene(glfo, gene) # glutils.print_glfo(glfo) # write final result glutils.write_glfo(outdir, glfo, debug=True) # ---------------------------------------------------------------------------------------- fname = 'macaque/ramesh-v1/coding.fa' outdir = 'macaque/ramesh-cleaned' # parse_ramesh_seqs(read_ramesh_file(fname, outdir), outdir, debug=True) # sys.exit() # ---------------------------------------------------------------------------------------- for locus in ['igh', 'igk', 'igl']: ref_glfo = glutils.read_glfo('data/germlines/macaque', locus, debug=True) glfo = glutils.read_glfo(outdir, locus, debug=True) merged_glfo = glutils.get_merged_glfo(ref_glfo, glfo, debug=True) # glutils.print_glfo(merged_glfo, print_separate_cons_seqs=True) glutils.write_glfo('datascripts/meta/crotty-fna/imgt-plus-ramesh', merged_glfo, debug=True)
def run_test(args):
    """Run one simulate-then-infer allele-finding test.

    Unless args.nosim is set, a `simulate` command is assembled from <args>
    and handed to run(); it writes <args.outdir>/simu-test.csv. Any existing
    sw cache files in the parameter dir are then removed, and a
    `cache-parameters` command with new-allele finding is assembled and
    handed to run().

    Relies on the module-level names <base_cmd>, <locus> and run().

    Args:
        args: parsed command line namespace. Attributes read here: seed,
            outdir, workdir, nosim, n_sim_events, n_leaves,
            n_leaf_distribution, mut_mult, n_procs, slurm, gen_gset,
            sim_v_genes, inf_v_genes, dj_genes, snp_positions,
            remove_template_genes, allele_prevalence_freqs, n_max_queries.

    Raises:
        Exception: if --allele-prevalence-freqs does not have one entry per
            v gene in the simulation germline set.
    """
    # %s rather than %d: a seed of None is explicitly allowed further down, and %d would raise TypeError on it
    print('seed %s' % args.seed)
    label = 'test'  #get_label(existing_genes, new_allele)
    simfname = args.outdir + '/simu-' + label + '.csv'
    outpdir = args.outdir + '/simu-' + label
    plotdir = args.outdir + '/simu-' + label + '-plots'

    # simulate
    if not args.nosim:
        cmd_str = base_cmd + ' simulate --n-sim-events ' + str(args.n_sim_events) + ' --n-leaves ' + str(args.n_leaves) + ' --rearrange-from-scratch --outfname ' + simfname
        if args.n_leaf_distribution is None:
            cmd_str += ' --constant-number-of-leaves'
        else:
            cmd_str += ' --n-leaf-distribution ' + args.n_leaf_distribution
        if args.mut_mult is not None:
            cmd_str += ' --mutation-multiplier ' + str(args.mut_mult)
        cmd_str += ' --n-procs ' + str(args.n_procs)
        if args.slurm:
            cmd_str += ' --batch-system slurm --subsimproc'

        # figure what genes we're using
        if args.gen_gset:
            cmd_str += ' --generate-germline-set'
            cmd_str += ' --n-genes-per-region 1:5:3'
            cmd_str += ' --n-alleles-per-gene 2,3:1,2:1,2'
        else:
            simulation_genes = ':'.join(args.sim_v_genes + args.dj_genes)
            sglfo = glutils.read_glfo('data/germlines/human', locus=locus, only_genes=simulation_genes.split(':'))

            added_snp_names = None
            if args.snp_positions is not None:
                snps_to_add = [{'gene': args.sim_v_genes[ig], 'positions': args.snp_positions[ig]}
                               for ig in range(len(args.sim_v_genes))]
                added_snp_names = glutils.add_some_snps(snps_to_add, sglfo, debug=True, remove_template_genes=args.remove_template_genes)

            if args.allele_prevalence_freqs is not None:
                if len(args.allele_prevalence_freqs) != len(sglfo['seqs']['v']):  # already checked when parsing args, but, you know...
                    raise Exception('--allele-prevalence-freqs not the right length')
                # NOTE the conditional binds more loosely than '+', i.e. the snp names are only appended in the else branch
                gene_list = sorted(sglfo['seqs']['v']) if added_snp_names is None else list(set(args.sim_v_genes)) + added_snp_names
                prevalence_freqs = {'v': {g: f for g, f in zip(gene_list, args.allele_prevalence_freqs)}, 'd': {}, 'j': {}}
                prevalence_fname = args.workdir + '/allele-prevalence-freqs.csv'
                glutils.write_allele_prevalence_freqs(prevalence_freqs, prevalence_fname)
                cmd_str += ' --allele-prevalence-fname ' + prevalence_fname

            print(' simulating with %d v: %s' % (len(sglfo['seqs']['v']), ' '.join([utils.color_gene(g) for g in sglfo['seqs']['v']])))
            glutils.write_glfo(args.outdir + '/germlines/simulation', sglfo)
            cmd_str += ' --initial-germline-dir ' + args.outdir + '/germlines/simulation'

        # run simulation
        if args.seed is not None:
            cmd_str += ' --seed ' + str(args.seed)
        run(cmd_str)

    # remove any old sw cache files
    for cachefname in glob.glob(outpdir + '/sw-cache-*.csv'):
        check_call(['rm', '-v', cachefname])
        sw_cache_gldir = cachefname.replace('.csv', '-glfo')
        if os.path.exists(sw_cache_gldir):  # if stuff fails halfway through, you can get one but not the other
            glutils.remove_glfo_files(sw_cache_gldir, locus)
            # os.rmdir(sw_cache_gldir)

    # generate germline set and cache parameters
    cmd_str = base_cmd + ' cache-parameters --infname ' + simfname + ' --only-smith-waterman --debug-allele-finding --always-find-new-alleles --n-max-allele-finding-iterations 2'  # --dont-collapse-clones'
    # cmd_str = 'python -m cProfile -s tottime -o prof.out ' + cmd_str
    cmd_str += ' --n-procs ' + str(args.n_procs)
    if args.n_max_queries is not None:
        cmd_str += ' --n-max-queries ' + str(args.n_max_queries)  # NOTE do *not* use --n-random-queries, since it'll change the cluster size distribution
    if args.slurm:
        cmd_str += ' --batch-system slurm'

    cmd_str += ' --find-new-alleles'
    if not args.gen_gset:  # if we *are* generating a germline set, no initial germline dir is passed here
        cmd_str += ' --dont-remove-unlikely-alleles'  # --new-allele-fname ' + args.outdir + '/new-alleles.fa'
        inference_genes = ':'.join(args.inf_v_genes + args.dj_genes)
        iglfo = glutils.read_glfo('data/germlines/human', locus=locus, only_genes=inference_genes.split(':'))
        print(' starting inference with %d v: %s' % (len(iglfo['seqs']['v']), ' '.join([utils.color_gene(g) for g in iglfo['seqs']['v']])))
        glutils.write_glfo(args.outdir + '/germlines/inference', iglfo)
        cmd_str += ' --initial-germline-dir ' + args.outdir + '/germlines/inference'
        # cmd_str += ' --n-max-snps 12'

    cmd_str += ' --parameter-dir ' + outpdir
    cmd_str += ' --only-overall-plots --plotdir ' + plotdir
    if args.seed is not None:
        cmd_str += ' --seed ' + str(args.seed)
    run(cmd_str)
help= 'ignore clusters with a cdr3 that differs by more than this many nucleotides' ) args = parser.parse_args() args.infiles = utils.get_arg_list(args.infiles) args.labels = utils.get_arg_list(args.labels) args.parameter_dirs = utils.get_arg_list(args.parameter_dirs) assert len(args.infiles) == len(args.labels) if len(args.parameter_dirs) == 1: print ' note: using same glfo for all infiles' args.parameter_dirs = [args.parameter_dirs[0] for _ in args.labels] assert len(args.parameter_dirs) == len(args.labels) glfos = [ glutils.read_glfo(pdir + '/hmm/germline-sets', locus=args.locus) for pdir in args.parameter_dirs ] # ---------------------------------------------------------------------------------------- def getkey(uid_list): return ':'.join(uid_list) # ---------------------------------------------------------------------------------------- def read_annotations(fname, glfo): annotations = {} with open(fname.replace('.csv', '-cluster-annotations.csv')) as csvfile: reader = csv.DictReader(csvfile) for line in reader: # there's a line for each cluster
# turn the list-valued command line args into actual lists
args.colors = utils.get_arg_list(args.colors)
args.linewidths = utils.get_arg_list(args.linewidths)

# '@' in a name is replaced by a space (names are edited in place)
for iname, raw_name in enumerate(args.names):
    args.names[iname] = raw_name.replace('@', ' ')

# if you just pass in one parent directory, we assume <args.names> contains the desired subdirs
if len(args.plotdirs) == 1:
    parentdir = args.plotdirs[0]
    args.plotdirs = ['/'.join((parentdir, n)) for n in args.names]

if len(args.names) != len(args.plotdirs):
    raise Exception('poorly formatted args:\n %s\n %s' % (' '.join(args.plotdirs), ' '.join(args.names)))

# only read germline info if the germline dir is actually there
# if args.gldir is not 'none':
args.glfo = glutils.read_glfo(args.gldir, args.chain) if os.path.exists(args.gldir) else None

# figure out if there's subdirs we need to deal with
listof_plotdirlists, listof_outdirs = [], []
firstdir = args.plotdirs[0]
if glob.glob(firstdir + '/*.csv'):  # add the parent dirs if they've got csvs
    listof_plotdirlists.append(args.plotdirs)
    listof_outdirs.append(args.outdir)
for subdir in os.listdir(firstdir):
    if not os.path.isdir(firstdir + '/' + subdir):
        continue
    listof_plotdirlists.append([d + '/' + subdir for d in args.plotdirs])
    listof_outdirs.append(args.outdir + '/' + subdir)

# one comparison per (list of plotdirs, output dir) pair
for dlist, outdir in zip(listof_plotdirlists, listof_outdirs):
    compare_directories(args, dlist, outdir)
#!/usr/bin/env python
# Script to process the extras.csv files from partis' germline directories
# Assumes the fasta files (e.g. ighv.fa) have been deduplicated
import sys

partis_dir = "/home/bolson2/Software/partis"
sys.path.insert(1, partis_dir + '/python')  # has to happen before glutils can be imported
import glutils

igb_path = "/home/bolson2/Software/igblast"
igb_database_path = igb_path + "/bin_deduplicated"

# read each locus from the deduplicated igblast dir, and write it back out to the output dir
for locus in ('igh', 'igk', 'igl'):
    glfo = glutils.read_glfo(igb_database_path, locus=locus, debug=True)
    glutils.write_glfo(igb_path + '/partis_friendly_bin', glfo, debug=True)
def extract_seqs(fnam, uid2iso):
    '''
    Read a partis cluster-annotations file and extract relevant information and sequences.

    Each row of the csv (one clonal family) is processed with the partis utils,
    QC filtered, and translated. Rows that fail processing, are flagged invalid,
    have an unusable naive sequence, or keep fewer than MIN_OBS sequences are skipped.

    The locus is taken from the file name ('IgK' -> igk, 'IgL' -> igl, otherwise igh),
    and the germline set is read from <cwd>/_output/<file name up to '_partitions'>/hmm/germline-sets.

    Args:
        fnam: path of the cluster-annotations csv file.
        uid2iso: mapping from sequence uid to isotype; indexed with every kept uid.

    Returns:
        (sequences_i, info_i): two lists with one entry per kept clonal family.
        sequences_i holds ['naive_seq', <naive amino acid sequence>] pairs,
        info_i holds dicts with the genes, sequences, mutation counts,
        abundances, uids, isotypes and CDR3 bounds of the family.
    '''
    # Read cluster annotations into a data list of dictionaries:
    with open(fnam) as fh:
        data = list(csv.DictReader(fh))

    # None of this depends on the row, so work it out once rather than per clonal family:
    fnam_base = fnam.split('_partitions')[0]
    if 'IgK' in fnam_base:
        locus = 'igk'
    elif 'IgL' in fnam_base:
        locus = 'igl'
    else:
        locus = 'igh'
    gldir = '{}/_output/{}/hmm/germline-sets'.format(os.getcwd(), fnam_base)
    # The germline info is read on first use (inside the try block below, so a failed
    # read still just skips the row and is retried on the next one) and then reused.
    # NOTE(review): assumes utils.add_implicit_info() does not modify glfo -- confirm.
    glfo = None

    sequences_i = list()
    info_i = list()
    for row in data:
        # Process the partis data row and add germline information:
        try:
            utils.process_input_line(row)
            if glfo is None:
                glfo = glutils.read_glfo(gldir, locus=locus)
            utils.add_implicit_info(glfo, row)
        except Exception as e:  # Skip rows that cannot be processed
            print('First skip')
            print(e)
            continue

        # One uid list per sequence: its duplicates (if any) followed by its own uid
        uids = [dl + [u] if (len(dl) > 0 and dl[0] != '') else [u] for dl, u in zip(row['duplicates'], row['unique_ids'])]

        # Extract the full N padded naive sequence,
        # and find the v- and j gene bounds on this naive sequence:
        cdr3_bounds = (row['codon_positions']['v'], row['codon_positions']['j'] + 3)
        vj_bounds = (row['regional_bounds']['v'][0], row['regional_bounds']['j'][1])
        if row['invalid'] is True or (cdr3_bounds[0] - cdr3_bounds[1]) % 3 != 0:
            print('Invalid clonal family, skipping.')
            continue

        # There has been a name change and this try/except
        # is meant to provide backwards compatibility.
        # Done before the naive sequence check so that check can use the fallback as well:
        try:
            lseq = row['input_seqs'][:]
        except (KeyError, TypeError):
            lseq = row['seqs'][:]

        naiveDNA = row['naive_seq']
        if repair_seq(naiveDNA, naiveDNA, vj_bounds, keep_check=True) is False:
            # Skip naive sequences too short or with stop codons:
            # print('Third skip')
            if len(lseq) > 100:
                print('Bad naive even after 100 seqs in clonal family.')
                repair_seq_debug(naiveDNA, naiveDNA, vj_bounds)
            continue
        trimmed_naiveDNA = repair_seq(naiveDNA[:], naiveDNA[:], vj_bounds)
        naiveAA = str(Seq(trimmed_naiveDNA, generic_dna).translate())

        ir_lseq = row['indel_reversed_seqs']
        stop_seq = row['stops']
        assert(len(lseq) == len(ir_lseq))
        assert(len(lseq) == len(stop_seq))
        # Only keep sequences without indels and stop codons and with minimum amino acid length:
        ### ir_lseq[i] == '' or lseq[i] == ir_lseq[i]  <-- No indels
        ### stop_seq[i]  <-- No partis annotated stops (there seems still to be stops after these are removed though)
        ### repair_seq(lseq[i], naiveDNA, vj_bounds, keep_check=True)  <-- Checks whether the sequence is long enough or has stop codons
        keep_idx = [1 if ((ir_lseq[i] == '' or lseq[i] == ir_lseq[i]) and stop_seq[i] is False and repair_seq(lseq[i], naiveDNA, vj_bounds, keep_check=True)) else 0 for i in range(len(lseq))]

        # Now only keep those sequences that passed QC:
        lseq = [s for s, keep in zip(lseq, keep_idx) if keep == 1]
        # Exclude small clonal families:
        if len(lseq) < MIN_OBS:
            # print(len(lseq))
            # print('Fourth skip')
            continue

        # Get amino acid sequences:
        lAAseq = [str(Seq(repair_seq(s[:], naiveDNA[:], vj_bounds), generic_dna).translate()) for s in lseq]
        # Throw out the per-sequence info for the discarded sequences:
        Nmuts = [int(s) for s, keep in zip(row['n_mutations'].split(':'), keep_idx) if keep == 1]
        abundance = [len(d) for d, keep in zip(uids, keep_idx) if keep == 1]
        uids = [s for s, keep in zip(uids, keep_idx) if keep == 1]
        assert(len(Nmuts) == len(lseq))
        assert(len(abundance) == len(lseq))
        assert(len(uids) == len(lseq))

        # NOTE sequences are deliberately *not* deduplicated on amino acid level here.

        iso_list = [[uid2iso[u] for u in ul] for ul in uids]
        # Store the results in a list:
        sequences_i.append(['naive_seq', naiveAA])  # This format is for ANARCI numbering
        info_i.append({'fnam': fnam, 'v_gene': row['v_gene'], 'd_gene': row['d_gene'], 'j_gene': row['j_gene'],
                       'naive_seq': naiveAA, 'naive_seq_DNA': trimmed_naiveDNA, 'Nmuts': Nmuts[:],
                       'abundance': abundance[:], 'AAseqs': lAAseq[:], 'DNAseqs': lseq[:], 'UID': uids[:],
                       'isotype': iso_list[:], 'CDR3_start': cdr3_bounds[0], 'CDR3_end': cdr3_bounds[1]})

    return (sequences_i, info_i)