def run_igdiscover(infname, outfname, outdir):
    """Run igdiscover on <infname> and write the germline set it infers as a glfo dir.

    Returns immediately if utils.output_exists() reports that <outfname> is already there.
    If --n-random-queries is set, igdiscover is run on a subsampled fasta written to
    <outdir> instead (an existing subsampled file is left as it is).

    infname: sequence file handed to igdiscover (read with utils.read_fastx() when subsampling)
    outfname: final germline fasta; must equal glutils.get_fname(<gl dir>, args.locus, args.region)
    outdir: directory in which igdiscover is run (it puts its files in <outdir>/work)

    NOTE reads the module-level <args> (n_random_queries, region, env_label, yamlfname, glfo_dir,
    locus, simulation_germline_dir).
    """
    if utils.output_exists(args, outfname):
        return

    prepare_igdiscover_outdir(outdir)

    if args.n_random_queries is not None:
        suffix = utils.getsuffix(infname)
        sub_basename = os.path.basename(infname.replace(suffix, '-n-random-queries-%d%s' % (args.n_random_queries, suffix)))
        sub_infname = outdir + '/' + sub_basename
        if os.path.exists(sub_infname):
            print(' --n-random-queries: leaving existing fasta for igdiscover (hopefully it has %d queries)' % args.n_random_queries)
        else:
            print(' --n-random-queries: writing new fasta for igdiscover (%d queries)' % args.n_random_queries)
            seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq']))
        infname = sub_infname

    igdiscover_outfname = outdir + '/work/final/database/%s.fasta' % args.region.upper()

    # build the shell script that actually runs igdiscover
    cmds = getpathcmd()
    cmds += ['conda activate %s' % args.env_label]
    cmds += ['cd %s' % outdir]
    cmds += ['igdiscover init --db db --single-reads %s work' % infname]  # prepares to run, putting files into <outdir>
    cmds += ['cp %s work/' % os.path.basename(args.yamlfname)]
    cmds += ['cd work']
    cmds += ['igdiscover run']
    utils.simplerun('\n'.join(cmds) + '\n', cmdfname=outdir + '/run.sh', print_time='igdiscover', debug=True)

    # convert igdiscover's fasta to a glfo dir
    template_gldir = args.glfo_dir  # used to fall back to a default dir when this was None, but --glfo-dir is now required
    glfo = glutils.create_glfo_from_fasta(igdiscover_outfname, args.locus, args.region, template_gldir, simulation_germline_dir=args.simulation_germline_dir)

    # NOTE str.rstrip() treats its argument as a *set of characters*, not a suffix (so e.g. 'work/igk' would
    # get stripped down to 'wor'), so remove the trailing '/<locus>' component explicitly
    out_gldir = os.path.dirname(outfname)
    locus_suffix = '/' + args.locus
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo, debug=True)
def parse_ramesh_seqs(glseqs, outdir, debug=False):
    """Turn the raw sequences in <glseqs> into a cleaned-up glfo dir in <outdir>, one locus at a time.

    glseqs: nested dict, indexed as glseqs[locus][region][gene] = sequence
    outdir: glfo dir to write; existing glfo files for each locus are removed first
    debug: if set, print each region/family that's processed and each gene removed for ambiguous bases
    """
    for locus in glseqs:
        glutils.remove_glfo_files(outdir, locus)

        # dump the bare sequences (i.e. a glfo dir without extra info)
        for region, region_seqs in glseqs[locus].items():
            fasta_path = glutils.get_fname(outdir, locus, region)
            fasta_dir = os.path.dirname(fasta_path)
            if not os.path.exists(fasta_dir):
                os.makedirs(fasta_dir)
            with open(fasta_path, 'w') as ofile:
                ofile.writelines('>%s\n%s\n' % (gene, seq) for gene, seq in region_seqs.items())

        # read them back in, figuring out the extra info with help from the macaque template
        template_glfo = glutils.read_glfo('data/germlines/macaque', locus)
        glfo = glutils.read_glfo(outdir, locus, template_glfo=template_glfo, remove_bad_genes=True, debug=True)

        # group the v genes by family
        gene_groups = {}
        for region in ['v']:
            region_seqs = glfo['seqs'][region]
            families = sorted(set(utils.gene_family(g) for g in region_seqs))
            gene_groups[region] = []
            for family in families:
                members = {g: region_seqs[g] for g in region_seqs if utils.gene_family(g) == family}
                gene_groups[region].append((family, members))

        # trim non-coding stuff upstream of v (and remove non-full-length ones)
        grouped_regions = [r for r in utils.regions if r in gene_groups]
        for region in grouped_regions:
            if debug:
                print('%s' % utils.color('reverse_video', utils.color('green', region)))
            for group_label, group_seqs in gene_groups[region]:  # ok, this isn't really doing anything any more
                if debug:
                    print(' %s' % utils.color('blue', group_label))
                for gene, seq in group_seqs.items():
                    trim_and_remove_genes(region, gene, seq, glfo, template_glfo, debug=debug)

        # remove any seqs with ambiguous bases
        remaining_regions = [r for r in utils.regions if r in glfo['seqs']]
        for region in remaining_regions:
            for gene, seq in glfo['seqs'][region].items():
                if not utils.ambig_frac(seq) > 0.:
                    continue
                if debug:
                    print(' %d ambiguous bases: %s' % (len(seq) * utils.ambig_frac(seq), utils.color_gene(gene)))
                glutils.remove_gene(glfo, gene)

        # write final result
        glutils.write_glfo(outdir, glfo, debug=True)
def _generate_germline_set(self, n_genes_per_region="20:1:1", n_sim_alleles_per_gene="1.5:1:1", min_sim_allele_prevalence_freq=0.1):
    """
    Have partis simulate a germline set, write it to self.output_dir, then read back what was chosen.

    The three keyword arguments are passed straight through to glutils.generate_germline_set().

    @return tuple (germline_seqs, germline_freqs):
        germline_seqs maps each allele name in self.ig_file to its sequence, trimmed to a
        multiple of three bases and with every "N" replaced by a random nucleotide;
        germline_freqs maps each allele in self.allele_freq_file whose name starts with
        <LOCUS>V to its prevalence.
    """
    PARTIS_PATH = './partis'
    sys.path.insert(1, PARTIS_PATH + '/python')
    import glutils

    germline_dir = self.GERMLINE_FOLDER + "/" + self.organism
    glfo = glutils.read_glfo(germline_dir, self.locus)
    glutils.generate_germline_set(
        glfo,
        n_genes_per_region,
        n_sim_alleles_per_gene,
        min_sim_allele_prevalence_freq,
        self.allele_freq_file,
        debug=self.debug,
    )
    glutils.write_glfo(self.output_dir, glfo)

    # Read allele prevalences
    germline_freqs = {}
    with open(self.allele_freq_file, "r") as freq_file:
        rows = csv.reader(freq_file)
        next(rows)  # discard the first row (presumably a header)
        for row in rows:
            name = row[0]
            if name.startswith(self.locus.upper() + "V"):
                germline_freqs[name] = float(row[1])

    # Read the selected germline alleles: lines are taken in (name, sequence) pairs
    with open(self.ig_file, "r") as ig_file:
        lines = ig_file.read().splitlines()

    germline_seqs = {}
    for name_line, allele_seq in zip(lines[0::2], lines[1::2]):
        allele = name_line.replace(">", "")

        # Trim allele until multiple of 3 - randomly pick a reading frame
        remainder = len(allele_seq) % 3
        if remainder != 0:
            offset = np.random.choice(remainder + 1)
            # drop <offset> bases from the front and the rest of the remainder from the back
            allele_seq = allele_seq[offset:len(allele_seq) - (remainder - offset)]

        # Make sure no N in the germline sequence
        while "N" in allele_seq:
            substitute = np.random.choice(list(NUCLEOTIDE_SET))
            allele_seq = allele_seq.replace("N", substitute, 1)

        germline_seqs[allele] = allele_seq

    return germline_seqs, germline_freqs
def build_v_gene_set(glfo, introns): total_d_counts = {} refseqs = {} for d_gene, counts in introns.items(): total_d_counts[d_gene] = sum(counts.values()) for d_gene, _ in sorted(total_d_counts.items(), key=operator.itemgetter(1), reverse=True): counts = introns[d_gene] # first decide on the reference sequences refseq, column_counts = None, None for seq in sorted(counts, key=len, reverse=True): if refseq is None: # first one, i.e. the longest refseq = seq column_counts = [{n : 0 for n in utils.nukes} for i in range(len(refseq))] ioffset = len(refseq) - len(seq) partial_refseq = refseq[ioffset:] assert len(partial_refseq) == len(seq) for ibase in range(ioffset, len(refseq)): column_counts[ibase][seq[ibase - ioffset]] += counts[seq] refseqs[d_gene] = [] for basecounts in column_counts: most_common_base = sorted(basecounts.items(), key=operator.itemgetter(1), reverse=True)[0][0] refseqs[d_gene].append(most_common_base) refseqs[d_gene] = ''.join(refseqs[d_gene]) n_ok = 0 mutecounts = {} for seq in sorted(counts, key=len, reverse=True): # print ' %3d %150s' % (count, seq) partial_refseq = refseqs[d_gene][len(refseqs[d_gene]) - len(seq):] if seq == partial_refseq: n_ok += counts[seq] else: # utils.color_mutants(partial_refseq, seq, print_result=True, extra_str=' ') n_mutes = utils.hamming_distance(partial_refseq, seq) if n_mutes not in mutecounts: mutecounts[n_mutes] = 0 mutecounts[n_mutes] += counts[seq] print ' %s %4d / %-4d ok' % (utils.color_gene(d_gene, width=10), n_ok, n_ok + sum(mutecounts.values())), if len(mutecounts) > 0: print '(mean of %.1f mutations among the other %d' % (numpy.average(mutecounts.keys(), weights=mutecounts.values()), sum(mutecounts.values())), print '' # add the intronic v genes to glfo for d_gene, refseq in refseqs.items(): glfo['seqs']['v'][utils.generate_dummy_v(d_gene)] = refseq glfo['cyst-positions'][utils.generate_dummy_v(d_gene)] = len(refseq) - 3 # write a glfo dir with everything glutils.write_glfo(outdir + 
'/germlines/imgt-and-intronic', glfo, debug=True) # remove the original v genes, and write a glfo dir with just the intronic ones glutils.remove_genes(glfo, [g for g in glfo['seqs']['v'] if 'xDx' not in g], debug=True) glutils.write_glfo(outdir + '/germlines/intronic', glfo, debug=True)
def write_inf_glfo(args):
    """Read the default human glfo, restrict it to the specified alleles, and write it to <args.inf_glfo_dir>.

    The genes that are kept are args.inf_v_genes + args.dj_genes.
    NOTE the output dir is somewhere all the methods can read it, and should *not* be modified by any of them.
    """
    allowed_genes = args.inf_v_genes + args.dj_genes
    inf_glfo = glutils.read_glfo('data/germlines/human', locus=args.locus, only_genes=allowed_genes)

    v_genes = inf_glfo['seqs']['v']
    colored_names = ' '.join(utils.color_gene(g) for g in v_genes)
    print(' writing initial inference glfo with %d v: %s' % (len(v_genes), colored_names))

    glutils.write_glfo(args.inf_glfo_dir, inf_glfo)
def run_igdiscover(infname, outfname, outdir):
    """Run igdiscover (from the conda install at args.condapath) on <infname>, and write the result as a glfo dir.

    Returns immediately if utils.output_exists() reports that <outfname> is already there.
    If --n-random-queries is set, igdiscover is run on a subsampled fasta written to
    <outdir> instead (an existing subsampled file is left as it is).

    infname: sequence file handed to igdiscover (read with utils.read_fastx() when subsampling)
    outfname: final germline fasta; must equal glutils.get_fname(<gl dir>, args.locus, args.region)
    outdir: directory in which igdiscover is run (it puts its files in <outdir>/work)

    NOTE reads the module-level <args> (n_random_queries, region, condapath, yamlfname, glfo_dir,
    locus, simulation_germline_dir).
    """
    if utils.output_exists(args, outfname):
        return

    prepare_igdiscover_outdir(outdir)

    if args.n_random_queries is not None:
        suffix = utils.getsuffix(infname)
        sub_basename = os.path.basename(infname.replace(suffix, '-n-random-queries-%d%s' % (args.n_random_queries, suffix)))
        sub_infname = outdir + '/' + sub_basename
        if os.path.exists(sub_infname):
            print(' --n-random-queries: leaving existing fasta for igdiscover (hopefully it has %d queries)' % args.n_random_queries)
        else:
            print(' --n-random-queries: writing new fasta for igdiscover (%d queries)' % args.n_random_queries)
            seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq']))
        infname = sub_infname

    igdiscover_outfname = outdir + '/work/final/database/%s.fasta' % args.region.upper()

    # build the shell script that actually runs igdiscover
    cmds = ['#!/bin/bash']
    cmds += ['export PATH=%s:$PATH' % args.condapath]
    cmds += ['export PYTHONNOUSERSITE=True']  # otherwise it finds the pip-installed packages in .local and breaks (see https://github.com/conda/conda/issues/448)
    cmds += ['cd %s' % outdir]
    cmds += ['igdiscover init --db db --single-reads %s work' % infname]  # prepares to run, putting files into <outdir>
    cmds += ['cp %s work/' % os.path.basename(args.yamlfname)]
    cmds += ['cd work']
    cmds += ['igdiscover run']
    utils.simplerun('\n'.join(cmds) + '\n', cmdfname=outdir + '/run.sh', print_time='igdiscover', debug=True)

    # convert igdiscover's fasta to a glfo dir
    template_gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.create_glfo_from_fasta(igdiscover_outfname, args.locus, args.region, template_gldir, simulation_germline_dir=args.simulation_germline_dir)

    # NOTE str.rstrip() treats its argument as a *set of characters*, not a suffix (so e.g. 'work/igk' would
    # get stripped down to 'wor'), so remove the trailing '/<locus>' component explicitly
    out_gldir = os.path.dirname(outfname)
    locus_suffix = '/' + args.locus
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo, debug=True)
def simulate(args):
    """Put together the germline set for simulation, write it to disk, and run partis simulate on it.

    Does nothing if utils.output_exists() says <args.simfname> is already there.
    With --gls-gen the germline set is generated by glutils.generate_germline_set(); otherwise it's
    restricted to args.sim_v_genes + args.dj_genes, with new alleles added according to
    args.new_allele_info. Either way it's written to <args.outdir>/germlines/simulation.

    Raises Exception if --allele-prevalence-freqs isn't normalized, or isn't the same length as the v gene set.
    NOTE <partis_dir> is taken from the enclosing scope.
    """
    if utils.output_exists(args, args.simfname):
        return

    # the command is accumulated as a list of words, and joined with spaces right before running
    cmd = [args.partis_path, 'simulate',
           '--n-sim-events', str(args.n_sim_events),
           '--outfname', args.simfname,
           '--n-leaves', str(args.n_leaves),
           '--rearrange-from-scratch',
           '--shm-parameter-dir', partis_dir + '/data/recombinator/scratch-parameters']
    if args.n_leaf_distribution is None:
        cmd.append('--constant-number-of-leaves')
    else:
        cmd += ['--n-leaf-distribution', args.n_leaf_distribution]
    if args.mut_mult is not None:
        cmd += ['--mutation-multiplier', str(args.mut_mult)]
    if args.root_mrca_weibull_parameter is not None:
        cmd += ['--root-mrca-weibull-parameter', str(args.root_mrca_weibull_parameter)]
    cmd += ['--n-procs', str(args.n_procs)]
    if args.slurm:
        cmd += ['--batch-system', 'slurm', '--subsimproc']

    allele_prevalence_fname = args.workdir + '/allele-prevalence-freqs.csv'

    # figure what genes we're using
    if args.gls_gen:
        assert args.sim_v_genes is None and args.allele_prevalence_freqs is None
        sglfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus)
        glutils.remove_v_genes_with_bad_cysteines(sglfo)
        glutils.generate_germline_set(sglfo, args.n_genes_per_region, args.n_sim_alleles_per_gene, args.min_allele_prevalence_freq, allele_prevalence_fname,
                                      new_allele_info=args.new_allele_info, dont_remove_template_genes=args.dont_remove_template_genes, debug=True)
        cmd += ['--allele-prevalence-fname', allele_prevalence_fname]
    else:
        sglfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus, only_genes=(args.sim_v_genes + args.dj_genes))
        # NOTE template gene removal is the default for glutils.generate_germline_set
        added_snp_names = glutils.generate_new_alleles(sglfo, args.new_allele_info, debug=True, remove_template_genes=(not args.dont_remove_template_genes))

        if args.allele_prevalence_freqs is not None:
            n_v_genes = len(sglfo['seqs']['v'])
            if not utils.is_normed(args.allele_prevalence_freqs):
                raise Exception('--allele-prevalence-freqs %s not normalized' % args.allele_prevalence_freqs)
            if len(args.allele_prevalence_freqs) != n_v_genes:  # already checked when parsing args, but, you know...
                raise Exception('--allele-prevalence-freqs %d not the same length as sglfo %d' % (len(args.allele_prevalence_freqs), n_v_genes))
            if len(added_snp_names) == 0:
                gene_list = sorted(sglfo['seqs']['v'])
            else:
                gene_list = list(set(args.sim_v_genes)) + added_snp_names
            prevalence_freqs = {'v': dict(zip(gene_list, args.allele_prevalence_freqs)), 'd': {}, 'j': {}}
            glutils.write_allele_prevalence_freqs(prevalence_freqs, allele_prevalence_fname)
            cmd += ['--allele-prevalence-fname', allele_prevalence_fname]

    sim_gldir = args.outdir + '/germlines/simulation'
    glutils.write_glfo(sim_gldir, sglfo)
    cmd += ['--initial-germline-dir', sim_gldir]

    # run simulation
    if args.seed is not None:
        cmd += ['--seed', str(args.seed)]
    utils.simplerun(' '.join(cmd), dryrun=args.dry_run)
def run_test(simulation_v_genes, inference_v_genes, dj_genes, seed=None):
    """Simulate with one germline set (plus some hard-coded snps), then cache parameters starting from another.

    simulation_v_genes: colon-separated v genes to use for simulation
    inference_v_genes: colon-separated v genes to start inference with
    dj_genes: colon-separated d and j genes, used for both
    seed: if set, seeds python's random module and is passed to both partis commands

    NOTE <outdir>, <chain>, <base_cmd>, and run() are taken from the enclosing scope.
    """
    if seed is not None:
        random.seed(seed)

    label = 'test'  # get_label(existing_genes, new_allele)
    simfname = outdir + '/simu-' + label + '.csv'
    outpdir = outdir + '/simu-' + label
    www_dir = os.getenv('www')
    plotdir = (www_dir if www_dir is not None else '_www') + '/partis/allele-finding/' + label

    snps_to_add = [
        {'gene' : 'IGHV3-71*01', 'positions' : (35, )},
        {'gene' : 'IGHV3-71*01', 'positions' : (35, 50)},
    ]

    # write the germline set for simulation (the specified genes, plus the new snpd alleles)
    sim_gldir = outdir + '/germlines-for-simulation'
    simulation_genes = simulation_v_genes + ':' + dj_genes
    sglfo = glutils.read_glfo('data/imgt', chain=chain, only_genes=simulation_genes.split(':'), debug=True)
    glutils.add_some_snps(snps_to_add, sglfo, remove_template_genes=False, debug=True)
    glutils.write_glfo(sim_gldir, sglfo)

    # simulate
    sim_cmd = base_cmd + ' simulate --n-sim-events 1000 --n-procs 10 --simulate-partially-from-scratch --mutation-multiplier 0.5'
    sim_cmd += ' --initial-datadir ' + sim_gldir
    sim_cmd += ' --outfname ' + simfname
    if seed is not None:
        sim_cmd += ' --seed ' + str(seed)
    run(sim_cmd)

    # write the germline set that inference starts from
    inf_gldir = outdir + '/germlines-for-inference'
    inference_genes = inference_v_genes + ':' + dj_genes
    iglfo = glutils.read_glfo('data/imgt', chain=chain, only_genes=inference_genes.split(':'), debug=True)
    glutils.write_glfo(inf_gldir, iglfo)

    # generate germline set and cache parameters
    inf_cmd = base_cmd + ' cache-parameters --infname ' + simfname + ' --n-procs 10 --only-smith-waterman'
    inf_cmd += ' --find-new-alleles --new-allele-fname ' + outdir + '/new-alleles.fa'
    inf_cmd += ' --debug-new-allele-finding'
    inf_cmd += ' --initial-datadir ' + inf_gldir
    inf_cmd += ' --parameter-dir ' + outpdir
    inf_cmd += ' --plotdir ' + plotdir
    if seed is not None:
        inf_cmd += ' --seed ' + str(seed)
    run(inf_cmd)
def run_tigger(infname, outfname, outdir):
    """Run tigger (in R) on the annotations in <infname>, and write the resulting genotype as a glfo dir.

    Returns immediately if utils.output_exists() reports that <outfname> is already there.

    infname: tab-separated annotation file, read in R with read.csv()
    outfname: final germline fasta; must equal glutils.get_fname(<gl dir>, args.locus, args.region)
    outdir: directory to which tigger's own fasta (tigger.fasta) is written

    NOTE reads the module-level <args> (n_procs, workdir, glfo_dir, locus, region).
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    # write the R script
    db_name = 'annotations'
    gls_name = 'gls'
    tigger_outfname = outdir + '/tigger.fasta'
    rcmds = ['library(tigger)', 'library(dplyr)']
    # rcmds += ['data(sample_db, germline_ighv)']
    rcmds += ['%s = read.csv("%s", sep="\t")' % (db_name, infname)]
    rcmds += ['%s = readIgFasta("%s")' % (gls_name, get_glfname('v', aligned=True))]
    rcmds += ['novel_df = findNovelAlleles(%s, %s, germline_min=2, nproc=%d)' % (db_name, gls_name, args.n_procs)]
    rcmds += ['geno = inferGenotype(%s, find_unmutated = FALSE, germline_db = %s, novel_df = novel_df)' % (db_name, gls_name)]  # <geno> is needed by genotypeFasta() on the next line
    rcmds += ['genotype_seqs = genotypeFasta(geno, %s, novel_df)' % (gls_name)]
    rcmds += ['writeFasta(genotype_seqs, "%s")' % tigger_outfname]

    cmdfname = args.workdir + '/tigger-in.cmd'
    with open(cmdfname, 'w') as cmdfile:
        cmdfile.write('\n'.join(rcmds) + '\n')
    cmdstr = 'R --slave -f ' + cmdfname
    utils.simplerun(cmdstr, shell=True, print_time='tigger')

    # post-process tigger .fa
    gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.read_glfo(gldir, args.locus)
    tigger_alleles = set()
    for seqfo in utils.read_fastx(tigger_outfname):
        name = seqfo['name']
        seq = seqfo['seq'].replace(utils.gap_chars[0], '')  # it should be just dots...
        tigger_alleles.add(name)
        if name not in glfo['seqs'][args.region]:  # new allele
            newfo = {'gene' : name, 'seq' : seq}
            use_template_for_codon_info = False
            if '+' in newfo['gene']:  # use whatever's before the '+' as the template gene
                newfo['template-gene'] = newfo['gene'].split('+')[0]
                use_template_for_codon_info = True
            glutils.add_new_allele(glfo, newfo, use_template_for_codon_info=use_template_for_codon_info, debug=True)
        elif glfo['seqs'][args.region][name] != seq:
            print('%s different sequences in glfo and tigger output for %s:\n %s\n %s' % (utils.color('red', 'error'), name, glfo['seqs'][args.region][name], seqfo['seq']))

    # remove them afterwards so we can use existing ones to get codon info
    # NOTE loop over a snapshot of the gene names, since remove_gene() presumably deletes from the thing we'd otherwise be iterating over
    for gene in list(glfo['seqs'][args.region]):
        if gene not in tigger_alleles:
            glutils.remove_gene(glfo, gene)

    # NOTE str.rstrip() treats its argument as a *set of characters*, not a suffix (so e.g. 'work/igk' would
    # get stripped down to 'wor'), so remove the trailing '/<locus>' component explicitly
    out_gldir = os.path.dirname(outfname)
    locus_suffix = '/' + args.locus
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo)

    os.remove(cmdfname)
def write(self, base_outdir): print " writing parameters", sys.stdout.flush() start = time.time() utils.prep_dir( base_outdir, subdirs=("hmms", "mute-freqs", "germline-sets"), wildlings=("*.csv", "*.yaml", "*.fasta") ) # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary self.mfreqer.write( base_outdir + "/mute-freqs", mean_freq_outfname=base_outdir + "/REGION-mean-mute-freqs.csv" ) # REGION is replace by each region in the three output files) genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + "_gene"].keys()] glutils.write_glfo(base_outdir + "/" + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=False) for column in self.counts: index = None outfname = None if column == "all": index = tuple(list(utils.index_columns) + ["cdr3_length"]) outfname = base_outdir + "/" + utils.get_parameter_fname(column="all") elif "_content" in column: index = [column] outfname = base_outdir + "/" + column + ".csv" else: index = [column] + utils.column_dependencies[column] outfname = base_outdir + "/" + utils.get_parameter_fname(column_and_deps=index) if os.path.isfile(outfname): os.remove(outfname) elif not os.path.exists(base_outdir): os.makedirs(base_outdir) with opener("w")(outfname) as outfile: out_fieldnames = list(index) out_fieldnames.append("count") out_data = csv.DictWriter(outfile, out_fieldnames) out_data.writeheader() # NOTE this will in general not be sorted for key, count in self.counts[column].iteritems(): line = {} for ic in range(len(key)): line[index[ic]] = key[ic] line["count"] = count out_data.writerow(line) print "(%.1f sec)" % (time.time() - start)
def write_single_zenodo_subdir(zenodo_dir, args, study, dset, method, mfo):
    """Copy the germline set that <method> inferred on <study>/<dset>, plus some method-specific extras, to <zenodo_dir>.

    zenodo_dir: destination dir
    args: only args.label is used here (to find the processed data dir)
    study, dset: which data set
    method: 'partis' (also copies allele finding plots and per-region gene count csvs), 'tigger-default'
            (nothing extra), or 'igdiscover' (copies the whole work/final dir); anything else fails an assertion
    mfo: only mfo['locus'] is used here
    """
    method_outdir = heads.get_datadir(study, 'processed', extra_str=args.label) + '/' + dset
    gls_dir = get_gls_dir(method_outdir, method, data=True)
    print(' %s --> %s' % (gls_dir, zenodo_dir))

    glfo = glutils.read_glfo(gls_dir, mfo['locus'], remove_orfs='partis' in method)
    glutils.write_glfo(zenodo_dir, glfo)

    if method == 'partis':
        # allele finding plots
        fits_dir = zenodo_dir + '/fits'
        plotdir = gls_dir.replace('hmm/germline-sets', 'plots/sw/allele-finding')
        if not os.path.exists(fits_dir):
            os.makedirs(fits_dir)
        for genedir in glob.glob(plotdir + '/try-0/*'):  # would be nice to copy html, but links will be wrong
            subprocess.check_call(['cp', '-r', genedir, fits_dir + '/'])

        # csv prevalence files
        for tmpreg in utils.regions:
            gene_column = '%s_gene' % tmpreg
            countfo = {}
            with open(gls_dir.replace('/germline-sets', '/%s_gene-probs.csv' % tmpreg)) as infile:
                for line in csv.DictReader(infile):
                    countfo[line[gene_column]] = int(line['count'])
            old_total = sum(countfo.values())

            # this is kind of dangerous... but the genes are read from the same parameter dir that we're reading this prevalence file, so the only way it's gonna be missing is if we just removed it with the read_glfo() line above
            orf_genes = [g for g in countfo if g not in glfo['seqs'][tmpreg]]

            # give each removed gene's counts to its nearest (by name) remaining gene, then drop the removed genes
            for ogene in orf_genes:
                nearest_gene = glutils.find_nearest_gene_using_names(glfo, ogene)
                countfo[nearest_gene] += countfo[ogene]
            for ogene in orf_genes:
                del countfo[ogene]
            assert old_total == sum(countfo.values())

            with open('%s/%s_gene-probs.csv' % (zenodo_dir, tmpreg), 'w') as outfile:
                writer = csv.DictWriter(outfile, (gene_column, 'count'))
                writer.writeheader()
                for gene, count in countfo.items():
                    writer.writerow({gene_column : gene, 'count' : count})
    elif method == 'tigger-default':  # doesn't seem to have written anything
        pass
    elif method == 'igdiscover':
        # aw, screw it, just write everything. The simulation stuff is already huge, anyway
        subprocess.check_call(['cp', '-r', '%s/work/final' % gls_dir, zenodo_dir + '/'])
    else:
        assert False
def write(self, base_outdir, my_datadir=None): print ' writing parameters', sys.stdout.flush() start = time.time() utils.prep_dir(base_outdir, subdirs=('hmms', 'mute-freqs', 'germline-sets'), wildlings=('*.csv', '*.yaml', '*.fasta')) # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary self.mfreqer.write(base_outdir + '/mute-freqs', mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv') # REGION is replace by each region in the three output files) genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + '_gene'].keys()] glutils.write_glfo(base_outdir + '/' + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=True) for column in self.counts: index = None outfname = None if column == 'all': index = tuple(list(utils.index_columns) + ['cdr3_length', ]) outfname = base_outdir + '/' + utils.get_parameter_fname(column='all') elif '_content' in column: index = [column,] outfname = base_outdir + '/' + column + '.csv' else: index = [column,] + utils.column_dependencies[column] outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index) if os.path.isfile(outfname): os.remove(outfname) elif not os.path.exists(base_outdir): os.makedirs(base_outdir) with opener('w')(outfname) as outfile: out_fieldnames = list(index) out_fieldnames.append('count') out_data = csv.DictWriter(outfile, out_fieldnames) out_data.writeheader() # NOTE this will in general not be sorted for key, count in self.counts[column].iteritems(): line = {} for ic in range(len(key)): line[index[ic]] = key[ic] line['count'] = count out_data.writerow(line) print '(%.1f sec)' % (time.time()-start)
def write(self, base_outdir): print ' writing parameters', sys.stdout.flush() start = time.time() utils.prep_dir(base_outdir, subdirs=('hmms', 'mute-freqs', 'germline-sets'), wildlings=('*.csv', '*.yaml', '*.fasta')) # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary self.mfreqer.write(base_outdir + '/mute-freqs', mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv') # REGION is replace by each region in the three output files) genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + '_gene'].keys()] glutils.write_glfo(base_outdir + '/' + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=False) for column in self.counts: index = None outfname = None if column == 'all': index = tuple(list(utils.index_columns) + ['cdr3_length', ]) outfname = base_outdir + '/' + utils.get_parameter_fname(column='all') elif '_content' in column: index = [column,] outfname = base_outdir + '/' + column + '.csv' else: index = [column,] + utils.column_dependencies[column] outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index) if os.path.isfile(outfname): os.remove(outfname) elif not os.path.exists(base_outdir): os.makedirs(base_outdir) with open(outfname, 'w') as outfile: out_fieldnames = list(index) out_fieldnames.append('count') out_data = csv.DictWriter(outfile, out_fieldnames) out_data.writeheader() # NOTE this will in general not be sorted for key, count in self.counts[column].iteritems(): line = {} for ic in range(len(key)): line[index[ic]] = key[ic] line['count'] = count out_data.writerow(line) print '(%.1f sec)' % (time.time()-start)
for region in [r for r in utils.regions if r in glfo['seqs']]: for gene, seq in glfo['seqs'][region].items(): if utils.ambig_frac(seq) > 0.: if debug: print ' %d ambiguous bases: %s' % ( len(seq) * utils.ambig_frac(seq), utils.color_gene(gene)) glutils.remove_gene(glfo, gene) # glutils.print_glfo(glfo) # write final result glutils.write_glfo(outdir, glfo, debug=True) # ---------------------------------------------------------------------------------------- fname = 'macaque/ramesh-v1/coding.fa' outdir = 'macaque/ramesh-cleaned' # parse_ramesh_seqs(read_ramesh_file(fname, outdir), outdir, debug=True) # sys.exit() # ---------------------------------------------------------------------------------------- for locus in ['igh', 'igk', 'igl']: ref_glfo = glutils.read_glfo('data/germlines/macaque', locus, debug=True) glfo = glutils.read_glfo(outdir, locus, debug=True) merged_glfo = glutils.get_merged_glfo(ref_glfo, glfo, debug=True) # glutils.print_glfo(merged_glfo, print_separate_cons_seqs=True) glutils.write_glfo('datascripts/meta/crotty-fna/imgt-plus-ramesh', merged_glfo, debug=True)
def run_tigger(infname, outfname, outdir):
    """Run tigger (in R) on the annotations in <infname>, and write the resulting genotype as a glfo dir.

    Returns immediately if utils.output_exists() reports that <outfname> is already there.
    If R fails because tigger says there aren't enough sample sequences, an empty fasta is used
    in place of tigger's output; on any other failure, R's out and err files are printed and we
    exit with R's return code.

    infname: tab-separated annotation file, read in R with read.csv()
    outfname: final germline fasta; must equal glutils.get_fname(<gl dir>, args.locus, args.region)
    outdir: directory to which tigger's own fasta (tigger.fasta) is written

    NOTE reads the module-level <args> (tuned_tigger_params, workdir, glfo_dir, locus, region,
    simulation_germline_dir).
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    # write the R script
    db_name = 'annotations'
    gls_name = 'gls'
    tigger_outfname = outdir + '/tigger.fasta'
    rcmds = ['library(ggplot2)', 'library(tigger, warn.conflicts=FALSE)', 'library(dplyr, warn.conflicts=FALSE)']
    # rcmds += ['data(sample_db, germline_ighv)']
    rcmds += ['%s = read.csv("%s", sep="\t")' % (db_name, infname)]
    rcmds += ['%s = readIgFasta("%s")' % (gls_name, get_glfname('v', aligned=True))]

    find_novel_argstr = '%s, %s, nproc=%d' % (db_name, gls_name, utils.auto_n_procs())
    if args.tuned_tigger_params:
        germline_min = 5  # only analyze genes which correspond to at least this many V calls (default 200)
        min_seqs = 5  # minimum number of total sequences
        j_max = 0.95  # of sequences which align perfectly (i.e. zero mutation?) to a new allele, no more than this fraction can correspond to each junction length + j gene combination (default 0.15)
        find_novel_argstr += ', germline_min=%d, min_seqs=%d, j_max=%f' % (germline_min, min_seqs, j_max)
    rcmds += ['novel_df = findNovelAlleles(%s)' % find_novel_argstr]
    # rcmds += ['sessionInfo()']
    rcmds += ['print(novel_df)']
    rcmds += ['geno = inferGenotype(%s, find_unmutated = TRUE, germline_db = %s, novel_df = novel_df)' % (db_name, gls_name)]
    rcmds += ['genotype_seqs = genotypeFasta(geno, %s, novel_df)' % (gls_name)]
    rcmds += ['writeFasta(genotype_seqs, "%s")' % tigger_outfname]

    cmdfname = args.workdir + '/tigger-in.cmd'
    with open(cmdfname, 'w') as cmdfile:
        cmdfile.write('\n'.join(rcmds) + '\n')

    # run it, and wait for it to finish
    cmdstr = 'R --slave -f ' + cmdfname
    cmdfo = {'cmd_str': cmdstr, 'logdir': args.workdir, 'env': os.environ}
    proc = utils.run_cmd(cmdfo)
    while proc.poll() is None:
        time.sleep(0.01)

    if proc.returncode != 0:  # damn thing crashes if it thinks the sample size is small
        with open(args.workdir + '/err') as ferr:
            errstr = ferr.read()
        if 'Not enough sample sequences were assigned to any germline' in errstr:
            with open(tigger_outfname, 'w') as dummy_outfasta:  # carry on with an empty fasta
                dummy_outfasta.write('')
        else:
            subprocess.check_call(['cat', args.workdir + '/out'])
            subprocess.check_call(['cat', args.workdir + '/err'])
            sys.exit(proc.returncode)

    # print, then clean up, R's std err and std out
    for oe in ['err', 'out']:
        oefname = args.workdir + '/' + oe
        with open(oefname) as oefile:
            print(oefile.read())
        os.remove(oefname)

    # post-process tigger .fa
    template_gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.create_glfo_from_fasta(tigger_outfname, args.locus, args.region, template_gldir, simulation_germline_dir=args.simulation_germline_dir)

    # NOTE str.rstrip() treats its argument as a *set of characters*, not a suffix (so e.g. 'work/igk' would
    # get stripped down to 'wor'), so remove the trailing '/<locus>' component explicitly
    out_gldir = os.path.dirname(outfname)
    locus_suffix = '/' + args.locus
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo)

    os.remove(cmdfname)
print 'RUN', cmd_str sys.stdout.flush() check_call(cmd_str.split()) outdir = '_tmp/light-chain' base_cmd = './bin/partis' dj_genes = 'IGHD6-19*01:IGHJ4*02' v_genes = 'IGHV3-71*01' #:IGHV1-18*01' all_genes= v_genes + ':' + dj_genes # glutils.write_glfo(outdir + '/germline-set', input_dir=outdir + '/imgt', only_genes=all_genes.split(':'), debug=True) # glutils.write_glfo('imgt', chain='l', input_dir='data/imgt', debug=True, generate_new_alignment=True) assert False # needs updating glutils.write_glfo('imgt', chain='k', input_dir='imgt', debug=True, generate_new_alignment=True) # glutils.write_glfo(outdir + '/germline-set', input_dir=outdir + '/imgt', debug=True) sys.exit() # simulate cmd_str = base_cmd + ' simulate --n-sim-events 10 --simulate-partially-from-scratch --mutation-multiplier 0.5 --debug 1 --n-trees 10' cmd_str += ' --initial-datadir ' + outdir + '/germline-set' cmd_str += ' --outfname ' + outdir + '/simu.csv' # run(cmd_str) # cache parameters cmd_str = base_cmd + ' cache-parameters --chain-weight light --light-chain-locus kappa --debug 1' cmd_str += ' --infname ' + outdir + '/simu.csv' cmd_str += ' --initial-datadir ' + outdir + '/germline-set' cmd_str += ' --outfname ' + 'tmp.csv' run(cmd_str)
def run_test(args):
    """Simulate a sample (unless --nosim), then run partis cache-parameters with allele finding on it.

    Everything is written under <args.outdir>: the simulation file (simu-test.csv), the germline
    sets used for simulation and inference (germlines/), the parameter dir (simu-test), and plots
    (simu-test-plots). Any sw cache files left in the parameter dir by a previous run are removed
    before inference.

    Raises Exception if --allele-prevalence-freqs isn't the same length as the simulation v gene set.
    NOTE <base_cmd>, <locus>, and run() are taken from the enclosing scope.
    """
    if args.seed is not None:  # the seed is optional everywhere below, so don't crash on the '%d' here if it's None
        print('seed %d' % args.seed)

    label = 'test'  # get_label(existing_genes, new_allele)
    simfname = args.outdir + '/simu-' + label + '.csv'
    outpdir = args.outdir + '/simu-' + label
    plotdir = args.outdir + '/simu-' + label + '-plots'

    # simulate
    if not args.nosim:
        cmd_str = base_cmd + ' simulate --n-sim-events ' + str(args.n_sim_events) + ' --n-leaves ' + str(args.n_leaves) + ' --rearrange-from-scratch --outfname ' + simfname
        if args.n_leaf_distribution is None:
            cmd_str += ' --constant-number-of-leaves'
        else:
            cmd_str += ' --n-leaf-distribution ' + args.n_leaf_distribution
        if args.mut_mult is not None:
            cmd_str += ' --mutation-multiplier ' + str(args.mut_mult)
        cmd_str += ' --n-procs ' + str(args.n_procs)
        if args.slurm:
            cmd_str += ' --batch-system slurm --subsimproc'

        # figure what genes we're using
        if args.gen_gset:  # let partis generate the germline set
            cmd_str += ' --generate-germline-set'
            cmd_str += ' --n-genes-per-region 1:5:3'
            cmd_str += ' --n-alleles-per-gene 2,3:1,2:1,2'
        else:  # write our own: the specified genes, plus any new alleles from --snp-positions
            simulation_genes = ':'.join(args.sim_v_genes + args.dj_genes)
            sglfo = glutils.read_glfo('data/germlines/human', locus=locus, only_genes=simulation_genes.split(':'))
            added_snp_names = None
            if args.snp_positions is not None:
                # NOTE assumes there's one entry in --snp-positions for each gene in --sim-v-genes
                snps_to_add = [{'gene' : gene, 'positions' : args.snp_positions[ig]} for ig, gene in enumerate(args.sim_v_genes)]
                added_snp_names = glutils.add_some_snps(snps_to_add, sglfo, debug=True, remove_template_genes=args.remove_template_genes)

            if args.allele_prevalence_freqs is not None:
                if len(args.allele_prevalence_freqs) != len(sglfo['seqs']['v']):  # already checked when parsing args, but, you know...
                    raise Exception('--allele-prevalence-freqs not the right length')
                if added_snp_names is None:
                    gene_list = sorted(sglfo['seqs']['v'])
                else:
                    gene_list = list(set(args.sim_v_genes)) + added_snp_names
                prevalence_freqs = {'v' : {g : f for g, f in zip(gene_list, args.allele_prevalence_freqs)}, 'd' : {}, 'j' : {}}
                prevalence_fname = args.workdir + '/allele-prevalence-freqs.csv'
                glutils.write_allele_prevalence_freqs(prevalence_freqs, prevalence_fname)
                cmd_str += ' --allele-prevalence-fname ' + prevalence_fname

            print(' simulating with %d v: %s' % (len(sglfo['seqs']['v']), ' '.join([utils.color_gene(g) for g in sglfo['seqs']['v']])))
            glutils.write_glfo(args.outdir + '/germlines/simulation', sglfo)
            cmd_str += ' --initial-germline-dir ' + args.outdir + '/germlines/simulation'

        # run simulation
        if args.seed is not None:
            cmd_str += ' --seed ' + str(args.seed)
        run(cmd_str)

    # remove any old sw cache files
    for cachefname in glob.glob(outpdir + '/sw-cache-*.csv'):
        check_call(['rm', '-v', cachefname])
        sw_cache_gldir = cachefname.replace('.csv', '-glfo')
        if os.path.exists(sw_cache_gldir):  # if stuff fails halfway through, you can get one but not the other
            glutils.remove_glfo_files(sw_cache_gldir, locus)
            # os.rmdir(sw_cache_gldir)

    # generate germline set and cache parameters
    cmd_str = base_cmd + ' cache-parameters --infname ' + simfname + ' --only-smith-waterman --debug-allele-finding --always-find-new-alleles --n-max-allele-finding-iterations 2'  # --dont-collapse-clones'
    # cmd_str = 'python -m cProfile -s tottime -o prof.out ' + cmd_str
    cmd_str += ' --n-procs ' + str(args.n_procs)
    if args.n_max_queries is not None:
        cmd_str += ' --n-max-queries ' + str(args.n_max_queries)  # NOTE do *not* use --n-random-queries, since it'll change the cluster size distribution
    if args.slurm:
        cmd_str += ' --batch-system slurm'

    cmd_str += ' --find-new-alleles'
    if not args.gen_gset:  # if we generated the germline set, use the default (full) germline dir, otherwise start from the specified genes
        cmd_str += ' --dont-remove-unlikely-alleles'  # --new-allele-fname ' + args.outdir + '/new-alleles.fa'
        inference_genes = ':'.join(args.inf_v_genes + args.dj_genes)
        iglfo = glutils.read_glfo('data/germlines/human', locus=locus, only_genes=inference_genes.split(':'))
        print(' starting inference with %d v: %s' % (len(iglfo['seqs']['v']), ' '.join([utils.color_gene(g) for g in iglfo['seqs']['v']])))
        glutils.write_glfo(args.outdir + '/germlines/inference', iglfo)
        cmd_str += ' --initial-germline-dir ' + args.outdir + '/germlines/inference'

    # cmd_str += ' --n-max-snps 12'

    cmd_str += ' --parameter-dir ' + outpdir
    cmd_str += ' --only-overall-plots --plotdir ' + plotdir
    if args.seed is not None:
        cmd_str += ' --seed ' + str(args.seed)
    run(cmd_str)
def run_test(args):
    """Simulate a sample (unless args.nosim is set), then run cache-parameters on it with allele finding on.

    Two command lines are built on top of the module-level base_cmd and each is handed to run():
      1. 'simulate', writing to <args.outdir>/simu-test.csv (skipped entirely if args.nosim is set)
      2. 'cache-parameters', reading that file, with --parameter-dir <args.outdir>/simu-test
         and --plotdir <args.outdir>/simu-test-plots
    In between, any sw-cache-*.csv files already in the parameter dir are removed, along with the
    germline files in the corresponding '-glfo' dirs.

    If args.gen_gset is not set, the germline sets used for simulation and inference are read from
    data/germlines/human (restricted to the requested genes) and written to
    <args.outdir>/germlines/simulation and <args.outdir>/germlines/inference.

    args: argument namespace. Attributes read here: outdir, workdir, seed, nosim, n_sim_events,
          n_leaves, mut_mult, n_procs, slurm, gen_gset, sim_v_genes, inf_v_genes, dj_genes,
          snp_positions, remove_template_genes, allele_prevalence_freqs.

    Raises Exception if args.allele_prevalence_freqs is set but its length differs from the number
    of v genes in the simulation germline set.
    """
    # '%d' raises TypeError on None, and the rest of this function treats a missing seed as valid
    if args.seed is not None:
        print('seed %d' % args.seed)

    label = 'test'  # get_label(existing_genes, new_allele)
    simfname = args.outdir + '/simu-' + label + '.csv'
    outpdir = args.outdir + '/simu-' + label
    plotdir = args.outdir + '/simu-' + label + '-plots'

    # simulate
    if not args.nosim:
        cmd_str = base_cmd + ' simulate --n-sim-events ' + str(args.n_sim_events) + ' --n-leaves ' + str(args.n_leaves) + ' --constant-number-of-leaves --rearrange-from-scratch --outfname ' + simfname
        cmd_str += ' --mutation-multiplier ' + str(args.mut_mult)
        cmd_str += ' --n-procs ' + str(args.n_procs)
        if args.slurm:
            cmd_str += ' --batch-system slurm --subsimproc'
        if args.gen_gset:
            cmd_str += ' --generate-germline-set'
            cmd_str += ' --n-genes-per-region 1:5:3'
            cmd_str += ' --n-alleles-per-gene 2,3:1,2:1,2'
        else:
            # NOTE the join/split round trip is kept as-is, in case an entry itself contains ':'
            simulation_genes = ':'.join(args.sim_v_genes + args.dj_genes)
            sglfo = glutils.read_glfo('data/germlines/human', chain=chain, only_genes=simulation_genes.split(':'))

            added_snp_names = None
            if args.snp_positions is not None:
                # the i'th entry of --snp-positions goes with the i'th simulation v gene
                snps_to_add = [{'gene' : gene, 'positions' : args.snp_positions[ig]} for ig, gene in enumerate(args.sim_v_genes)]
                added_snp_names = glutils.add_some_snps(snps_to_add, sglfo, debug=True, remove_template_genes=args.remove_template_genes)

            if args.allele_prevalence_freqs is not None:
                if len(args.allele_prevalence_freqs) != len(sglfo['seqs']['v']):
                    raise Exception('--allele-prevalence-freqs not the right length')
                if added_snp_names is None:
                    gene_list = sorted(sglfo['seqs']['v'])
                else:
                    gene_list = list(set(args.sim_v_genes)) + added_snp_names
                prevalence_freqs = {'v' : dict(zip(gene_list, args.allele_prevalence_freqs)), 'd' : {}, 'j' : {}}
                prevalence_fname = args.workdir + '/allele-prevalence-freqs.csv'
                glutils.write_allele_prevalence_freqs(prevalence_freqs, prevalence_fname)
                cmd_str += ' --allele-prevalence-fname ' + prevalence_fname

            sim_gldir = args.outdir + '/germlines/simulation'
            glutils.write_glfo(sim_gldir, sglfo)
            cmd_str += ' --initial-germline-dir ' + sim_gldir
        if args.seed is not None:
            cmd_str += ' --seed ' + str(args.seed)
        run(cmd_str)

    # remove any old sw cache files
    for cachefname in glob.glob(outpdir + '/sw-cache-*.csv'):
        check_call(['rm', '-v', cachefname])
        sw_cache_gldir = cachefname.replace('.csv', '-glfo')
        if os.path.exists(sw_cache_gldir):  # if stuff fails halfway through, you can get one but not the other
            glutils.remove_glfo_files(sw_cache_gldir, chain)
            # os.rmdir(sw_cache_gldir)

    # generate germline set and cache parameters
    cmd_str = base_cmd + ' cache-parameters --infname ' + simfname + ' --only-smith-waterman --debug-allele-finding --always-find-new-alleles --n-max-allele-finding-iterations 2'  # --dont-collapse-clones'
    # cmd_str = 'python -m cProfile -s tottime -o prof.out ' + cmd_str
    cmd_str += ' --n-procs ' + str(args.n_procs)
    if args.slurm:
        cmd_str += ' --batch-system slurm'
    if args.gen_gset:
        cmd_str += ' --find-new-alleles'
    else:
        inference_genes = ':'.join(args.inf_v_genes + args.dj_genes)
        iglfo = glutils.read_glfo('data/germlines/human', chain=chain, only_genes=inference_genes.split(':'), debug=True)
        inf_gldir = args.outdir + '/germlines/inference'
        glutils.write_glfo(inf_gldir, iglfo)
        cmd_str += ' --initial-germline-dir ' + inf_gldir
        cmd_str += ' --find-new-alleles --dont-remove-unlikely-alleles'  # --new-allele-fname ' + args.outdir + '/new-alleles.fa'
        # cmd_str += ' --n-max-snps 12'
    cmd_str += ' --parameter-dir ' + outpdir
    cmd_str += ' --plotdir ' + plotdir
    if args.seed is not None:
        cmd_str += ' --seed ' + str(args.seed)
    run(cmd_str)
#!/usr/bin/env python
# Script to process the extras.csv files from partis' germline directories
# Assumes the fasta files (e.g. ighv.fa) have been deduplicated
#
# For each of igh, igk, igl (in that order): read the germline info from
# <igb_path>/bin_deduplicated with glutils.read_glfo(), then write it back out
# under <igb_path>/partis_friendly_bin with glutils.write_glfo().
import sys

# glutils lives in partis' python dir, so that has to be on the path before importing it
partis_dir = "/home/bolson2/Software/partis"
sys.path.insert(1, partis_dir + '/python')
import glutils

igb_path = "/home/bolson2/Software/igblast"
igb_database_path = igb_path + "/bin_deduplicated"

for locus in ('igh', 'igk', 'igl'):
    glfo = glutils.read_glfo(igb_database_path, locus=locus, debug=True)
    glutils.write_glfo(igb_path + '/partis_friendly_bin', glfo, debug=True)