def clean(self):
    """ remove all the parameter files """
    self.mutefreqer.clean()
    for column in self.counts:
        if column == 'all':
            os.remove(self.base_outdir + '/' + utils.get_parameter_fname(column='all'))
        else:
            index = [column, ] + utils.column_dependencies[column]
            os.remove(self.base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index))
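# Several functions in this file build file paths with utils.get_parameter_fname(). For
# orientation, here's a minimal sketch of what that helper plausibly looks like (a
# hypothetical reconstruction, not necessarily the real utils implementation): 'all' maps to
# a single summary file, and otherwise the column and its dependencies get joined into a
# hyphenated csv name.
def get_parameter_fname(column=None, deps=None, column_and_deps=None):
    if column == 'all':
        return 'all-probs.csv'
    if column_and_deps is None:  # caller passes either <column> plus <deps>, or the combined list
        assert column is not None and deps is not None
        column_and_deps = [column, ] + list(deps)
    outfname = 'probs.csv'
    for ic in column_and_deps:  # prepend each in turn, e.g. ['v_3p_del', 'v_gene'] --> 'v_gene-v_3p_del-probs.csv'
        outfname = ic + '-' + outfname
    return outfname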
def write(self, base_outdir):
    print ' writing parameters'
    start = time.time()

    utils.prep_dir(base_outdir, multilings=('*.csv', '*.svg'))

    mute_start = time.time()
    self.mutefreqer.write(base_outdir, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replaced by each region in the three output files
    print ' mut freq write time: %.3f' % (time.time() - mute_start)
    # print ' %d / %d cached' % (self.mutefreqer.n_cached, self.mutefreqer.n_cached + self.mutefreqer.n_not_cached)

    for column in self.counts:
        index = None
        outfname = None
        if column == 'all':
            index = utils.index_columns
            outfname = base_outdir + '/' + utils.get_parameter_fname(column='all')
        elif '_content' in column:
            index = [column, ]
            outfname = base_outdir + '/' + column + '.csv'
        else:
            index = [column, ] + utils.column_dependencies[column]
            outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)
        if os.path.isfile(outfname):
            os.remove(outfname)
        elif not os.path.exists(base_outdir):
            os.makedirs(base_outdir)
        with opener('w')(outfname) as outfile:
            out_fieldnames = list(index)
            out_fieldnames.append('count')
            out_data = csv.DictWriter(outfile, out_fieldnames)
            out_data.writeheader()
            # NOTE this will in general not be sorted
            for key, count in self.counts[column].iteritems():
                line = {}
                for ic in range(len(key)):
                    line[index[ic]] = key[ic]
                line['count'] = count
                out_data.writerow(line)

    print ' parameter write time: %.3f' % (time.time() - start)
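# For reference, a hedged sketch of reading one of these parameter csvs back into a counts
# dict keyed by tuples (it just mirrors the write loop above; assumes csv is imported as
# elsewhere in this file, and <fname>/<index> are whatever was used at write time):
def read_counts_csv(fname, index):
    counts = {}
    with open(fname) as infile:
        for line in csv.DictReader(infile):
            key = tuple(line[col] for col in index)  # same column order the DictWriter used
            counts[key] = int(line['count'])
    return counts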
def read_vdj_version_freqs(self):
    """ Read the frequencies at which various VDJ combinations appeared in data """
    if self.args.rearrange_from_scratch:
        return None

    version_freq_table = {}
    fname = self.reco_parameter_dir + '/' + utils.get_parameter_fname('all')
    with open(fname, 'r') as infile:
        in_data = csv.DictReader(infile)
        total = 0.0
        for line in in_data:  # NOTE do *not* assume the file is sorted
            skip = False
            for region in utils.regions:
                if line[region + '_gene'] not in self.glfo['seqs'][region]:
                    skip = True
                    break
            if skip:
                continue
            total += float(line['count'])
            index = self.freqtable_index(line)
            assert index not in version_freq_table
            version_freq_table[index] = float(line['count'])

    if len(version_freq_table) == 0:
        raise Exception('didn\'t find any gene combinations in %s' % fname)

    # then normalize
    test_total = 0.0
    for index in version_freq_table:
        version_freq_table[index] /= total
        test_total += version_freq_table[index]
    assert utils.is_normed(test_total, this_eps=1e-8)
    assert len(version_freq_table) < 1e8  # if it gets *too* large, choose_vdj_combo() below isn't going to work because of numerical underflow. Note there's nothing special about 1e8, it's just that I'm pretty sure we're fine *up* to that point, and once we get beyond it we should think about doing things differently
    return version_freq_table
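# freqtable_index() above turns a csv line into a hashable key for the freq table. A minimal
# sketch of what it plausibly does (hypothetical; the real method may use different columns):
def freqtable_index(self, line):
    return tuple(line[column] for column in utils.index_columns)  # e.g. (v gene, d gene, j gene, deletion lengths, ...)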
def read_insertion_info(self, this_gene, approved_genes=None):
    if approved_genes is None:  # if we aren't explicitly passed a list of genes to use, we just use the gene for which we're actually writing the hmm
        approved_genes = [this_gene, ]

    genes_used = set()
    for insertion in self.insertions:
        self.insertion_probs[insertion] = {}
        deps = utils.column_dependencies[insertion + '_insertion']
        with opener('r')(self.indir + '/' + utils.get_parameter_fname(column=insertion + '_insertion', deps=deps)) as infile:
            reader = csv.DictReader(infile)
            for line in reader:
                # first see if we want to use this line (if <region>_gene isn't in the line, this insertion doesn't depend on gene version)
                if self.region + '_gene' in line and line[self.region + '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                    continue

                # then add in this insertion's counts
                n_inserted = int(line[insertion + '_insertion'])
                if n_inserted not in self.insertion_probs[insertion]:
                    self.insertion_probs[insertion][n_inserted] = 0.0
                self.insertion_probs[insertion][n_inserted] += float(line['count'])

                if self.region + '_gene' in line:
                    genes_used.add(line[self.region + '_gene'])

        assert len(self.insertion_probs[insertion]) > 0

        # print ' interpolate insertions'
        interpolate_bins(self.insertion_probs[insertion], self.n_max_to_interpolate, bin_eps=self.eps)  #, max_bin=len(self.germline_seq))  # NOTE that we normalize *after* this

        if 0 not in self.insertion_probs[insertion] or len(self.insertion_probs[insertion]) < 2:  # all hell breaks loose lower down if we haven't got shit in the way of information
            if self.args.debug:
                print ' WARNING adding pseudocount to 1-bin in insertion probs'
            self.insertion_probs[insertion][0] = 1
            self.insertion_probs[insertion][1] = 1
            if self.args.debug:
                print ' ', self.insertion_probs[insertion]

        assert 0 in self.insertion_probs[insertion] and len(self.insertion_probs[insertion]) >= 2  # all hell breaks loose lower down if we haven't got shit in the way of information

        # and finally, normalize
        total = 0.0
        for _, val in self.insertion_probs[insertion].iteritems():
            total += val
        test_total = 0.0
        for n_inserted in self.insertion_probs[insertion]:
            self.insertion_probs[insertion][n_inserted] /= total
            test_total += self.insertion_probs[insertion][n_inserted]
        assert utils.is_normed(test_total)

        if 0 not in self.insertion_probs[insertion] or self.insertion_probs[insertion][0] == 1.0:
            print 'ERROR cannot have all or none of the probability mass in the zero bin:', self.insertion_probs[insertion]
            assert False

        # self.insertion_content_probs = {}
        self.read_insertion_content(insertion)  # also read the base content of the insertions

    if len(genes_used) > 1:  # if length is 1, we will have just used the actual gene
        if self.args.debug:
            print ' insertions used:', ' '.join(genes_used)
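# A hedged sketch of the read_insertion_content() helper called above, inferred from the
# '<column>.csv' files that the '_content' branch of write() produces (the file name and
# column layout here are assumptions, not confirmed): tally how often each base shows up in
# observed insertions, then normalize to per-base probabilities.
def read_insertion_content(self, insertion):
    icontentprobs = {n: 0 for n in utils.nukes}
    with open(self.indir + '/' + insertion + '_insertion_content.csv') as infile:
        for line in csv.DictReader(infile):
            icontentprobs[line[insertion + '_insertion_content']] = int(line['count'])
    total = float(sum(icontentprobs.values()))
    assert total > 0.
    for nuke in icontentprobs:
        icontentprobs[nuke] /= total  # counts --> probabilities
    return icontentprobs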
def read_erosion_info(self, this_gene, approved_genes=None):
    # NOTE that d erosion lengths depend on each other... but I don't think that's modellable with an hmm. At least for the moment we integrate over the other erosion
    if approved_genes is None:
        approved_genes = [this_gene, ]

    eprobs = {}
    genes_used = set()
    for erosion in utils.real_erosions + utils.effective_erosions:
        if erosion[0] != self.region:
            continue
        eprobs[erosion] = {}
        if this_gene == glutils.dummy_d_genes[self.args.chain]:
            eprobs[erosion][0] = 1.  # always erode zero bases
            continue
        deps = utils.column_dependencies[erosion + '_del']
        with opener('r')(self.indir + '/' + utils.get_parameter_fname(column=erosion + '_del', deps=deps)) as infile:
            reader = csv.DictReader(infile)
            for line in reader:
                # first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
                if self.region + '_gene' in line and line[self.region + '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                    continue

                # then skip nonsense erosions that're too long for this gene, but were ok for another
                if int(line[erosion + '_del']) >= len(self.germline_seq):
                    continue

                # then add in this erosion's counts
                n_eroded = int(line[erosion + '_del'])
                if n_eroded not in eprobs[erosion]:
                    eprobs[erosion][n_eroded] = 0.0
                eprobs[erosion][n_eroded] += float(line['count'])

                if self.region + '_gene' in line:
                    genes_used.add(line[self.region + '_gene'])

        if len(eprobs[erosion]) == 0:
            raise Exception('didn\'t read any %s erosion probs from %s' % (erosion, self.indir + '/' + utils.get_parameter_fname(column=erosion + '_del', deps=deps)))

        # do some smoothingy things NOTE that we normalize *after* interpolating
        if erosion in utils.real_erosions:  # for real erosions, don't interpolate if we have lots of information about neighboring bins (i.e. we're pretty confident this bin should actually be zero)
            n_max = self.n_max_to_interpolate
        else:  # for fake erosions, always interpolate
            n_max = -1
        # print ' interpolate erosions'
        interpolate_bins(eprobs[erosion], n_max, bin_eps=self.eps, max_bin=len(self.germline_seq))
        self.add_pseudocounts(eprobs[erosion])

        # and finally, normalize
        total = 0.0
        for _, val in eprobs[erosion].iteritems():
            total += val
        test_total = 0.0
        for n_eroded in eprobs[erosion]:
            eprobs[erosion][n_eroded] /= total
            test_total += eprobs[erosion][n_eroded]
        assert utils.is_normed(test_total)

    if len(genes_used) > 1 and self.debug:  # if length is 1, we will have just used the actual gene
        print ' used erosion info from:', ' '.join(genes_used)

    return eprobs
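# interpolate_bins() and add_pseudocounts() smooth the raw count histograms before the
# normalization step. To illustrate the idea (this is a simplified stand-in, not the actual
# implementations): occupied bins keep their counts, and empty bins in between get a small
# epsilon so rarely-observed erosion lengths don't end up with exactly zero probability.
def fill_empty_bins(counts, bin_eps, max_bin):
    for ibin in range(max_bin + 1):
        if ibin not in counts:  # never observed: tiny pseudocount instead of zero
            counts[ibin] = bin_eps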
def read_vdj_version_freqs(self):
    """ Read the frequencies at which various VDJ combinations appeared in data """
    if self.args.rearrange_from_scratch:
        return None

    version_freq_table = {}
    fname = self.parameter_dir + '/' + utils.get_parameter_fname('all')
    with opener('r')(fname) as infile:
        in_data = csv.DictReader(infile)
        total = 0.0
        for line in in_data:  # NOTE do *not* assume the file is sorted
            skip = False
            for region in utils.regions:
                if line[region + '_gene'] not in self.glfo['seqs'][region]:
                    skip = True
                    break
            if skip:
                continue
            total += float(line['count'])
            index = self.freqtable_index(line)
            assert index not in version_freq_table
            version_freq_table[index] = float(line['count'])

    if len(version_freq_table) == 0:
        raise Exception('didn\'t find any gene combinations in %s' % fname)

    # then normalize
    test_total = 0.0
    for index in version_freq_table:
        version_freq_table[index] /= total
        test_total += version_freq_table[index]
    assert utils.is_normed(test_total, this_eps=1e-8)
    assert len(version_freq_table) < 1e8  # if it gets *too* large, choose_vdj_combo() below isn't going to work because of numerical underflow. Note there's nothing special about 1e8, it's just that I'm pretty sure we're fine *up* to that point, and once we get beyond it we should think about doing things differently
    return version_freq_table
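# The comment above mentions choose_vdj_combo(). A hedged sketch of how such a sampler could
# draw from the normalized table (illustrative only; the real method presumably also records
# the choice on a recombination event object):
import random

def choose_vdj_combo(version_freq_table):
    chosen_prob = random.uniform(0, 1)
    sum_prob = 0.0
    for vdj_choice in version_freq_table:  # cumulative-sum sampling; numerical underflow here is why the table can't get too big
        sum_prob += version_freq_table[vdj_choice]
        if chosen_prob < sum_prob:
            return vdj_choice
    assert False  # shouldn't fall through if the table is properly normalized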
def read_erosion_info(self, this_gene, approved_genes=None):
    # NOTE that d erosion lengths depend on each other... but I don't think that's modellable with an hmm. At least for the moment we integrate over the other erosion
    if approved_genes is None:
        approved_genes = [this_gene, ]

    eprobs = {}
    genes_used = set()
    for erosion in utils.all_erosions:
        if erosion[0] != self.region:
            continue
        eprobs[erosion] = {}
        if this_gene == glutils.dummy_d_genes[self.args.locus]:
            eprobs[erosion][0] = 1.  # always erode zero bases
            continue
        deps = utils.column_dependencies[erosion + '_del']
        with open(self.indir + '/' + utils.get_parameter_fname(column=erosion + '_del', deps=deps), 'r') as infile:
            reader = csv.DictReader(infile)
            for line in reader:
                # first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
                if self.region + '_gene' in line and line[self.region + '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                    continue

                # then skip nonsense erosions that're too long for this gene, but were ok for another
                if int(line[erosion + '_del']) >= len(self.germline_seq):
                    continue

                # then add in this erosion's counts
                n_eroded = int(line[erosion + '_del'])
                if n_eroded not in eprobs[erosion]:
                    eprobs[erosion][n_eroded] = 0.0
                eprobs[erosion][n_eroded] += float(line['count'])

                if self.region + '_gene' in line:
                    genes_used.add(line[self.region + '_gene'])

        if len(eprobs[erosion]) == 0:
            raise Exception('didn\'t read any %s erosion probs from %s' % (erosion, self.indir + '/' + utils.get_parameter_fname(column=erosion + '_del', deps=deps)))

        # do some smoothingy things NOTE that we normalize *after* interpolating
        if erosion in utils.real_erosions:  # for real erosions, don't interpolate if we have lots of information about neighboring bins (i.e. we're pretty confident this bin should actually be zero)
            n_max = self.n_max_to_interpolate
        else:  # for fake erosions, always interpolate
            n_max = -1
        # print ' interpolate erosions'
        interpolate_bins(eprobs[erosion], n_max, bin_eps=self.eps, max_bin=len(self.germline_seq))
        self.add_pseudocounts(eprobs[erosion])

        # and finally, normalize
        total = 0.0
        for _, val in eprobs[erosion].iteritems():
            total += val
        test_total = 0.0
        for n_eroded in eprobs[erosion]:
            eprobs[erosion][n_eroded] /= total
            test_total += eprobs[erosion][n_eroded]
        assert utils.is_normed(test_total)

    if len(genes_used) > 1 and self.debug:  # if length is 1, we will have just used the actual gene
        print ' used erosion info from:', ' '.join(genes_used)

    return eprobs
def read_erosion_info(self, this_gene, approved_genes=None):
    # NOTE that d erosion lengths depend on each other... but I don't think that's modellable with an hmm. At least for the moment we integrate over the other erosion
    if approved_genes is None:
        approved_genes = [this_gene]

    genes_used = set()
    for erosion in utils.real_erosions + utils.effective_erosions:
        if erosion[0] != self.region:
            continue
        self.erosion_probs[erosion] = {}
        deps = utils.column_dependencies[erosion + "_del"]
        with opener("r")(self.indir + "/" + utils.get_parameter_fname(column=erosion + "_del", deps=deps)) as infile:
            reader = csv.DictReader(infile)
            for line in reader:
                # first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
                if self.region + "_gene" in line and line[self.region + "_gene"] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                    continue

                # then skip nonsense erosions that're too long for this gene, but were ok for another
                if int(line[erosion + "_del"]) >= len(self.germline_seq):
                    continue

                # then add in this erosion's counts
                n_eroded = int(line[erosion + "_del"])
                if n_eroded not in self.erosion_probs[erosion]:
                    self.erosion_probs[erosion][n_eroded] = 0.0
                self.erosion_probs[erosion][n_eroded] += float(line["count"])

                if self.region + "_gene" in line:
                    genes_used.add(line[self.region + "_gene"])

        assert len(self.erosion_probs[erosion]) > 0

        # do some smoothingy things NOTE that we normalize *after* interpolating
        if erosion in utils.real_erosions:  # for real erosions, don't interpolate if we have lots of information about neighboring bins (i.e. we're pretty confident this bin should actually be zero)
            n_max = self.n_max_to_interpolate
        else:  # for fake erosions, always interpolate
            n_max = -1
        # print ' interpolate erosions'
        interpolate_bins(self.erosion_probs[erosion], n_max, bin_eps=self.eps, max_bin=len(self.germline_seq))
        self.add_pseudocounts(self.erosion_probs[erosion])

        # and finally, normalize
        total = 0.0
        for _, val in self.erosion_probs[erosion].iteritems():
            total += val
        test_total = 0.0
        for n_eroded in self.erosion_probs[erosion]:
            self.erosion_probs[erosion][n_eroded] /= total
            test_total += self.erosion_probs[erosion][n_eroded]
        assert utils.is_normed(test_total)

    if len(genes_used) > 1:  # if length is 1, we will have just used the actual gene
        if self.args.debug:
            print " erosions used:", " ".join(genes_used)
def write(self, base_outdir):
    print " writing parameters",
    sys.stdout.flush()
    start = time.time()

    utils.prep_dir(base_outdir, subdirs=("hmms", "mute-freqs", "germline-sets"), wildlings=("*.csv", "*.yaml", "*.fasta"))  # it's kind of hacky to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary

    self.mfreqer.write(base_outdir + "/mute-freqs", mean_freq_outfname=base_outdir + "/REGION-mean-mute-freqs.csv")  # REGION is replaced by each region in the three output files

    genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + "_gene"].keys()]
    glutils.write_glfo(base_outdir + "/" + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=False)

    for column in self.counts:
        index = None
        outfname = None
        if column == "all":
            index = tuple(list(utils.index_columns) + ["cdr3_length"])
            outfname = base_outdir + "/" + utils.get_parameter_fname(column="all")
        elif "_content" in column:
            index = [column]
            outfname = base_outdir + "/" + column + ".csv"
        else:
            index = [column] + utils.column_dependencies[column]
            outfname = base_outdir + "/" + utils.get_parameter_fname(column_and_deps=index)
        if os.path.isfile(outfname):
            os.remove(outfname)
        elif not os.path.exists(base_outdir):
            os.makedirs(base_outdir)
        with opener("w")(outfname) as outfile:
            out_fieldnames = list(index)
            out_fieldnames.append("count")
            out_data = csv.DictWriter(outfile, out_fieldnames)
            out_data.writeheader()
            # NOTE this will in general not be sorted
            for key, count in self.counts[column].iteritems():
                line = {}
                for ic in range(len(key)):
                    line[index[ic]] = key[ic]
                line["count"] = count
                out_data.writerow(line)

    print "(%.1f sec)" % (time.time() - start)
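# The genes_with_counts comprehension above relies on self.counts['<region>_gene'] being
# keyed by one-element tuples, so g[0] recovers the bare gene name. A toy illustration with
# made-up numbers:
toy_counts = {'v_gene': {('IGHV1-2*02',): 41, ('IGHV3-23*01',): 7}}
toy_genes = [g[0] for g in toy_counts['v_gene'].keys()]  # --> ['IGHV1-2*02', 'IGHV3-23*01']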
def __init__(self, args, seed, sublabel=None):
    self.args = args

    if sublabel is None:
        self.workdir = self.args.workdir + '/recombinator'
        self.outfname = self.args.outfname
    else:  # need a separate workdir for each subprocess
        self.workdir = self.args.workdir + '/recombinator-' + sublabel
        self.outfname = self.workdir + '/' + os.path.basename(self.args.outfname)
    utils.prep_dir(self.workdir)

    if not os.path.exists(self.args.parameter_dir):
        raise Exception('parameter dir ' + self.args.parameter_dir + ' d.n.e')

    # parameters that control recombination, erosion, and whatnot
    self.index_keys = {}  # this is kind of hacky, but I suspect indexing my huge table of freqs with a tuple is better than a dict
    self.version_freq_table = {}  # list of the probabilities with which each VDJ combo appears in data
    self.mute_models = {}
    # self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end
    for region in utils.regions:
        self.mute_models[region] = {}
        for model in ['gtr', 'gamma']:
            self.mute_models[region][model] = {}

    # first read info that doesn't depend on which person we're looking at
    self.glfo = utils.read_germline_set(self.args.datadir)

    # then read stuff that's specific to each person
    self.read_vdj_version_freqs(self.args.parameter_dir + '/' + utils.get_parameter_fname('all'))
    self.allowed_genes = self.get_allowed_genes(self.args.parameter_dir)  # only really used if <self.args.uniform_vj_choice_probs> is set, but it also checks the sensibility of <self.args.only_genes>
    self.insertion_content_probs = None
    self.read_insertion_content()

    # read shm info NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
    with opener('r')(self.args.gtrfname) as gtrfile:  # read gtr parameters
        reader = csv.DictReader(gtrfile)
        for line in reader:
            parameters = line['parameter'].split('.')
            region = parameters[0][3].lower()
            assert region == 'v' or region == 'd' or region == 'j'
            model = parameters[1].lower()
            parameter_name = parameters[2]
            assert model in self.mute_models[region]
            self.mute_models[region][model][parameter_name] = line['value']

    treegen = treegenerator.TreeGenerator(args, self.args.parameter_dir, seed=seed)
    self.treefname = self.workdir + '/trees.tre'
    treegen.generate_trees(seed, self.treefname)
    with opener('r')(self.treefname) as treefile:  # read in the trees (and other info) that we just generated
        self.treeinfo = treefile.readlines()
    if not self.args.no_clean:
        os.remove(self.treefname)

    if os.path.exists(self.outfname):
        os.remove(self.outfname)
    elif not os.path.exists(os.path.dirname(os.path.abspath(self.outfname))):
        os.makedirs(os.path.dirname(os.path.abspath(self.outfname)))
def get_parameter_dir_genes(self, parameter_dir):
    parameter_dir_genes = set()
    for region in utils.regions:
        col = region + '_gene'
        column_and_deps = [col, ] + utils.column_dependencies[col]
        with open(parameter_dir + '/' + utils.get_parameter_fname(column_and_deps=column_and_deps)) as infile:
            reader = csv.DictReader(infile)
            for line in reader:
                parameter_dir_genes.add(line[region + '_gene'])
    return parameter_dir_genes
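# Usage sketch (attribute names on the calling side are illustrative):
# pdir_genes = self.get_parameter_dir_genes(self.args.parameter_dir)
# missing = set(self.glfo['seqs']['v']) - pdir_genes  # v genes with no observed counts in the parameter dir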
def write(self, base_outdir, my_datadir=None):
    print ' writing parameters',
    sys.stdout.flush()
    start = time.time()

    utils.prep_dir(base_outdir, subdirs=('hmms', 'mute-freqs', 'germline-sets'), wildlings=('*.csv', '*.yaml', '*.fasta'))  # it's kind of hacky to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary

    self.mfreqer.write(base_outdir + '/mute-freqs', mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replaced by each region in the three output files

    genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + '_gene'].keys()]
    glutils.write_glfo(base_outdir + '/' + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=True)

    for column in self.counts:
        index = None
        outfname = None
        if column == 'all':
            index = tuple(list(utils.index_columns) + ['cdr3_length', ])
            outfname = base_outdir + '/' + utils.get_parameter_fname(column='all')
        elif '_content' in column:
            index = [column, ]
            outfname = base_outdir + '/' + column + '.csv'
        else:
            index = [column, ] + utils.column_dependencies[column]
            outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)
        if os.path.isfile(outfname):
            os.remove(outfname)
        elif not os.path.exists(base_outdir):
            os.makedirs(base_outdir)
        with opener('w')(outfname) as outfile:
            out_fieldnames = list(index)
            out_fieldnames.append('count')
            out_data = csv.DictWriter(outfile, out_fieldnames)
            out_data.writeheader()
            # NOTE this will in general not be sorted
            for key, count in self.counts[column].iteritems():
                line = {}
                for ic in range(len(key)):
                    line[index[ic]] = key[ic]
                line['count'] = count
                out_data.writerow(line)

    print '(%.1f sec)' % (time.time() - start)
def write(self, base_outdir):
    print ' writing parameters',
    sys.stdout.flush()
    start = time.time()

    utils.prep_dir(base_outdir, subdirs=('hmms', 'mute-freqs', 'germline-sets'), wildlings=('*.csv', '*.yaml', '*.fasta'))  # it's kind of hacky to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary

    self.mfreqer.write(base_outdir + '/mute-freqs', mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replaced by each region in the three output files

    genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + '_gene'].keys()]
    glutils.write_glfo(base_outdir + '/' + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=False)

    for column in self.counts:
        index = None
        outfname = None
        if column == 'all':
            index = tuple(list(utils.index_columns) + ['cdr3_length', ])
            outfname = base_outdir + '/' + utils.get_parameter_fname(column='all')
        elif '_content' in column:
            index = [column, ]
            outfname = base_outdir + '/' + column + '.csv'
        else:
            index = [column, ] + utils.column_dependencies[column]
            outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)
        if os.path.isfile(outfname):
            os.remove(outfname)
        elif not os.path.exists(base_outdir):
            os.makedirs(base_outdir)
        with open(outfname, 'w') as outfile:
            out_fieldnames = list(index)
            out_fieldnames.append('count')
            out_data = csv.DictWriter(outfile, out_fieldnames)
            out_data.writeheader()
            # NOTE this will in general not be sorted
            for key, count in self.counts[column].iteritems():
                line = {}
                for ic in range(len(key)):
                    line[index[ic]] = key[ic]
                line['count'] = count
                out_data.writerow(line)

    print '(%.1f sec)' % (time.time() - start)
def write(self, base_outdir):
    print ' writing parameters',
    sys.stdout.flush()
    start = time.time()

    utils.prep_dir(base_outdir, multilings=('*.csv', '*.svg'))

    # mute_start = time.time()
    self.mutefreqer.write(base_outdir, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replaced by each region in the three output files
    # print ' mut freq write time: %.3f' % (time.time() - mute_start)
    # print ' %d / %d cached' % (self.mutefreqer.n_cached, self.mutefreqer.n_cached + self.mutefreqer.n_not_cached)

    for column in self.counts:
        index = None
        outfname = None
        if column == 'all':
            index = utils.index_columns
            outfname = base_outdir + '/' + utils.get_parameter_fname(column='all')
        elif '_content' in column:
            index = [column, ]
            outfname = base_outdir + '/' + column + '.csv'
        else:
            index = [column, ] + utils.column_dependencies[column]
            outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)
        if os.path.isfile(outfname):
            os.remove(outfname)
        elif not os.path.exists(base_outdir):
            os.makedirs(base_outdir)
        with opener('w')(outfname) as outfile:
            out_fieldnames = list(index)
            out_fieldnames.append('count')
            out_data = csv.DictWriter(outfile, out_fieldnames)
            out_data.writeheader()
            # NOTE this will in general not be sorted
            for key, count in self.counts[column].iteritems():
                line = {}
                for ic in range(len(key)):
                    line[index[ic]] = key[ic]
                line['count'] = count
                out_data.writerow(line)

    print '(%.1f sec)' % (time.time() - start)
def get_allowed_genes(self, parameter_dir):
    allowed_genes = {}
    for region in [r for r in utils.regions if r != 'd']:
        genes_in_file = set()
        with open(parameter_dir + '/' + utils.get_parameter_fname(column=region + '_gene', deps=utils.column_dependencies[region + '_gene'])) as csvfile:
            reader = csv.DictReader(csvfile)
            for line in reader:
                genes_in_file.add(line[region + '_gene'])
        allowed_genes[region] = genes_in_file
        if self.args.only_genes is not None:  # if --only-genes was specified, not only does the gene have to be in the parameter file, but it has to be among --only-genes
            regional_only_genes = set(g for g in self.args.only_genes if utils.get_region(g) == region)
            if len(regional_only_genes - genes_in_file) > 0:  # if the command line asked for genes that aren't in the file
                raise Exception('genes %s specified with --only-genes are not present in %s, so there\'s no information with which to simulate' % (' '.join(regional_only_genes - genes_in_file), parameter_dir))
            allowed_genes[region] &= regional_only_genes
    return allowed_genes
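# Worked toy example of the --only-genes filtering above (made-up gene names): everything
# requested has to appear in the parameter file, and the allowed set is the intersection.
genes_in_file = set(['IGHV1-2*02', 'IGHV3-23*01'])
regional_only_genes = set(['IGHV1-2*02'])
assert len(regional_only_genes - genes_in_file) == 0  # otherwise we'd raise above
allowed = genes_in_file & regional_only_genes  # --> set(['IGHV1-2*02'])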
def read_insertion_info(self, approved_genes):
    iprobs, icontentprobs = {}, {}
    genes_used = set()
    for insertion in self.insertions:
        iprobs[insertion] = {}
        if approved_genes[0] == glutils.dummy_d_genes[self.args.locus]:
            iprobs[insertion][0] = 1.  # always insert zero bases
            icontentprobs[insertion] = {n: 0.25 for n in utils.nukes}
            continue
        deps = utils.column_dependencies[insertion + '_insertion']
        with open(self.indir + '/' + utils.get_parameter_fname(column=insertion + '_insertion', deps=deps), 'r') as infile:
            reader = csv.DictReader(infile)
            for line in reader:
                # first see if we want to use this line (if <region>_gene isn't in the line, this insertion doesn't depend on gene version)
                if self.region + '_gene' in line and line[self.region + '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                    continue

                # then add in this insertion's counts
                n_inserted = int(line[insertion + '_insertion'])
                if n_inserted not in iprobs[insertion]:
                    iprobs[insertion][n_inserted] = 0.0
                iprobs[insertion][n_inserted] += float(line['count'])

                if self.region + '_gene' in line:
                    genes_used.add(line[self.region + '_gene'])

        if len(iprobs[insertion]) == 0:
            raise Exception('didn\'t read any %s insertion probs from %s' % (insertion, self.indir + '/' + utils.get_parameter_fname(column=insertion + '_insertion', deps=deps)))

        # print ' interpolate insertions'
        interpolate_bins(iprobs[insertion], self.n_max_to_interpolate, bin_eps=self.eps)  #, max_bin=len(self.germline_seq))  # NOTE that we normalize *after* this

        if 0 not in iprobs[insertion] or len(iprobs[insertion]) < 2:  # all hell breaks loose lower down if we haven't got shit in the way of information
            if self.debug:
                print ' WARNING adding pseudocount to 1-bin in insertion probs'
            iprobs[insertion][0] = 1
            iprobs[insertion][1] = 1
            if self.debug:
                print ' ', iprobs[insertion]

        assert 0 in iprobs[insertion] and len(iprobs[insertion]) >= 2  # all hell breaks loose lower down if we haven't got shit in the way of information

        # and finally, normalize
        total = 0.0
        for _, val in iprobs[insertion].iteritems():
            total += val
        test_total = 0.0
        for n_inserted in iprobs[insertion]:
            iprobs[insertion][n_inserted] /= total
            test_total += iprobs[insertion][n_inserted]
        assert utils.is_normed(test_total)

        if 0 not in iprobs[insertion] or iprobs[insertion][0] == 1.0:
            print 'ERROR cannot have all or none of the probability mass in the zero bin:', iprobs[insertion]
            assert False

        icontentprobs[insertion] = self.read_insertion_content(insertion)  # also read the base content of the insertions

    if len(genes_used) > 1:  # if length is 1, we will have just used the actual gene
        if self.debug:
            print ' insertions used:', ' '.join(genes_used)

    return iprobs, icontentprobs
def __init__(self, args, seed, sublabel=None, total_length_from_right=-1):
    self.args = args

    if sublabel is None:
        self.workdir = self.args.workdir + '/recombinator'
        self.outfname = self.args.outfname
    else:  # need a separate workdir for each subprocess
        self.workdir = self.args.workdir + '/recombinator-' + sublabel
        self.outfname = self.workdir + '/' + os.path.basename(self.args.outfname)
    utils.prep_dir(self.workdir)

    if not os.path.exists(self.args.parameter_dir):
        raise Exception('ERROR ' + self.args.parameter_dir + ' d.n.e')

    # parameters that control recombination, erosion, and whatnot
    self.total_length_from_right = total_length_from_right  # measured from right edge of j, only write to file this much of the sequence (our read lengths are 130 by this def'n a.t.m.)

    self.all_seqs = {}  # all the Vs, all the Ds...
    self.index_keys = {}  # this is kind of hacky, but I suspect indexing my huge table of freqs with a tuple is better than a dict
    self.version_freq_table = {}  # list of the probabilities with which each VDJ combo appears in data
    self.mute_models = {}
    # self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end
    for region in utils.regions:
        self.mute_models[region] = {}
        for model in ['gtr', 'gamma']:
            self.mute_models[region][model] = {}

    # first read info that doesn't depend on which person we're looking at
    self.all_seqs = utils.read_germlines(self.args.datadir)
    with opener('r')(self.args.datadir + '/v-meta.json') as json_file:  # get location of <begin> cysteine in each v region
        self.cyst_positions = json.load(json_file)
    with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file:  # get location of <end> tryptophan in each j region (TGG)
        tryp_reader = csv.reader(csv_file)
        self.tryp_positions = {row[0]: row[1] for row in tryp_reader}  # WARNING: this doesn't filter out the header line

    # then read stuff that's specific to each person
    self.read_vdj_version_freqs(self.args.parameter_dir + '/' + utils.get_parameter_fname('all'))
    self.read_insertion_content()

    if self.args.naivety == 'M':  # read shm info if non-naive is requested
        # NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
        with opener('r')(self.args.gtrfname) as gtrfile:  # read gtr parameters
            reader = csv.DictReader(gtrfile)
            for line in reader:
                parameters = line['parameter'].split('.')
                region = parameters[0][3].lower()
                assert region == 'v' or region == 'd' or region == 'j'
                model = parameters[1].lower()
                parameter_name = parameters[2]
                assert model in self.mute_models[region]
                self.mute_models[region][model][parameter_name] = line['value']

    treegen = treegenerator.TreeGenerator(args, self.args.parameter_dir, seed=seed)
    self.treefname = self.workdir + '/trees.tre'
    treegen.generate_trees(seed, self.treefname)
    with opener('r')(self.treefname) as treefile:  # read in the trees (and other info) that we just generated
        self.treeinfo = treefile.readlines()
    if not self.args.no_clean:
        os.remove(self.treefname)

    if os.path.exists(self.outfname):
        os.remove(self.outfname)
    elif not os.path.exists(os.path.dirname(os.path.abspath(self.outfname))):
        os.makedirs(os.path.dirname(os.path.abspath(self.outfname)))
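# The WARNING above notes that the j_tryp.csv header line ends up in self.tryp_positions. A
# minimal sketch of one way to fix it (assumes the first row really is a header; this is an
# illustration, not the original code):
def read_tryp_positions(fname):
    with open(fname) as csv_file:
        tryp_reader = csv.reader(csv_file)
        next(tryp_reader)  # drop the header row before building the dict
        return {row[0]: row[1] for row in tryp_reader}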