def __init__(self, base_indir, outdir, gene_name, naivety, glfo, args):
    """Set up an hmm writer for a single germline gene: read counts/probabilities from <base_indir> and build the HMM skeleton."""
    self.region = utils.get_region(gene_name)
    self.raw_name = gene_name  # i.e. unsanitized
    self.germline_seqs = glfo['seqs']  # all germline alleles
    self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
    self.indir = base_indir
    self.args = args
    self.cyst_positions = glfo['cyst-positions']
    self.tryp_positions = glfo['tryp-positions']

    # parameters with values that I more or less made up
    self.precision = '16'  # number of digits after the decimal for probabilities
    self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
    self.n_max_to_interpolate = 20
    self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}  # jf has to be quite a bit bigger, since besides accounting for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths
    self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

    # self.insert_mute_prob = 0.0
    # self.mean_mute_freq = 0.0

    self.outdir = outdir
    self.naivety = naivety
    self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

    # which insertions flank this region's gene (fv/jf are the "effective" ones at the read boundaries)
    # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)] OOPS that's not what I want to do
    self.insertions = []
    if self.region == 'v':
        self.insertions.append('fv')
    elif self.region == 'd':
        self.insertions.append('vd')
    elif self.region == 'j':
        self.insertions.append('dj')
        self.insertions.append('jf')

    self.erosion_probs = {}
    self.insertion_probs = {}
    self.insertion_content_probs = {}

    self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
    replacement_genes = None
    if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
        if self.args.debug:
            print ' only saw it %d times, use info from other genes' % self.n_occurences
        replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug)

    self.read_erosion_info(gene_name, replacement_genes)  # try this exact gene, but...
    self.read_insertion_info(gene_name, replacement_genes)

    if self.naivety == 'M':  # mutate if not naive
        self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=replacement_genes)

    self.track = Track('nukes', utils.nukes)
    self.saniname = utils.sanitize_name(gene_name)
    self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
    self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
    mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv')
    self.hmm.extras['overall_mute_freq'] = mean_freq_hist.get_mean()
def __init__(self, base_indir, outdir, gene_name, naivety, germline_seq, args):
    """Set up an hmm writer for a single germline gene (variant that receives the germline sequence directly rather than a glfo dict)."""
    self.indir = base_indir
    self.args = args

    # parameters with values that I more or less made up
    self.precision = '16'  # number of digits after the decimal for probabilities
    self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
    self.n_max_to_interpolate = 20
    self.allow_unphysical_insertions = self.args.allow_unphysical_insertions  # allow fv and jf insertions. NOTE this slows things down by a factor of 6 or so
    # self.allow_external_deletions = args.allow_external_deletions  # allow v left and j right deletions. I.e. if your reads extend beyond v or j boundaries
    self.v_3p_del_pseudocount_limit = 10  # add at least one entry

    # self.insert_mute_prob = 0.0
    # self.mean_mute_freq = 0.0

    self.outdir = outdir
    self.region = utils.get_region(gene_name)
    self.naivety = naivety
    self.germline_seq = germline_seq
    self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

    # which insertions flank this region's gene (fv/jf only when unphysical insertions are enabled)
    # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)] OOPS that's not what I want to do
    self.insertions = []
    if self.region == 'v':
        if self.allow_unphysical_insertions:
            self.insertions.append('fv')
    elif self.region == 'd':
        self.insertions.append('vd')
    elif self.region == 'j':
        self.insertions.append('dj')
        if self.allow_unphysical_insertions:
            self.insertions.append('jf')

    self.erosion_probs = {}
    self.insertion_probs = {}

    self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
    replacement_genes = None
    if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
        if self.args.debug:
            print ' only saw it %d times, use info from other genes' % self.n_occurences
        replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug)

    self.read_erosion_info(gene_name, replacement_genes)  # try this exact gene, but...
    self.read_insertion_info(gene_name, replacement_genes)

    if self.naivety == 'M':  # mutate if not naive
        self.mute_freqs = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=replacement_genes)

    self.track = Track('nukes', list(utils.nukes))
    self.saniname = utils.sanitize_name(gene_name)
    self.hmm = HMM(self.saniname, {'nukes' : list(utils.nukes)})  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
    self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
def read_mute_freq_stuff(self, gene):
    """Fill self.all_mute_freqs[<gene>], averaging over replacement genes if <gene> was observed too few times."""
    assert gene[:2] not in utils.boundaries  # make sure <gene> isn't actually an insertion (insertions used to be passed in here separately, but now they're smooshed onto either end of d)

    if self.args.mutate_from_scratch:  # no parameters on disk: just use the single scratch freq everywhere
        self.all_mute_freqs[gene] = {'overall_mean' : self.args.scratch_mute_freq}
        return

    # figure out how often we saw <gene>, even when we only have an shm parameter dir
    # (will crash if the shm parameter dir doesn't have gene count info... but we should only end up using it on data/recombinator/scratch-parameters)
    count_dir = self.shm_parameter_dir if self.reco_parameter_dir is None else self.reco_parameter_dir
    n_observed = utils.read_overall_gene_probs(count_dir, only_gene=gene, normalize=False, expect_zero_counts=True)

    genes_to_use = [gene]
    if n_observed < self.args.min_observations_per_gene:  # too rare: average over the genes that find_replacement_genes() gives us
        genes_to_use += utils.find_replacement_genes(count_dir, min_counts=self.args.min_observations_per_gene, gene_name=gene)

    self.all_mute_freqs[gene] = paramutils.read_mute_freqs_with_weights(self.shm_parameter_dir, genes_to_use)
def __init__(self, args, input_info, reco_info, germline_seqs, parameter_dir, write_parameters=False, plotdir=None): self.parameter_dir = parameter_dir self.plotdir = plotdir self.args = args self.input_info = input_info self.reco_info = reco_info self.germline_seqs = germline_seqs self.pcounter, self.true_pcounter = None, None if write_parameters: self.pcounter = ParameterCounter(self.germline_seqs) if not self.args.is_data: self.true_pcounter = ParameterCounter(self.germline_seqs) self.info = {} self.info['all_best_matches'] = set() # set of all the matches we found (for *all* queries) self.info['skipped_unproductive_queries'] = [] # list of unproductive queries if self.args.apply_choice_probs_in_sw: if self.args.debug: print ' reading gene choice probs from',parameter_dir self.gene_choice_probs = utils.read_overall_gene_probs(parameter_dir) with opener('r')(self.args.datadir + '/v-meta.json') as json_file: # get location of <begin> cysteine in each v region self.cyst_positions = json.load(json_file) with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file: # get location of <end> tryptophan in each j region (TGG) tryp_reader = csv.reader(csv_file) self.tryp_positions = {row[0]:row[1] for row in tryp_reader} # WARNING: this doesn't filter out the header line self.outfile = None if self.args.outfname != None: self.outfile = open(self.args.outfname, 'a') self.n_unproductive = 0 self.n_total = 0
def read_mute_freq_stuff(self, gene_or_insert_name):
    """Fill self.all_mute_freqs[<gene_or_insert_name>], handling the scratch-mutation, insertion, and rare-gene cases."""
    if self.args.mutate_from_scratch:  # XXX GODDAMMIT i remember putting this 'xxx' here for a reason and I have no f*****g clue what it was
        self.all_mute_freqs[gene_or_insert_name] = {'overall_mean' : self.args.flat_mute_freq}
        return

    if gene_or_insert_name[:2] in utils.boundaries:  # it's an insertion, not a gene: average over every v gene
        approved = utils.find_replacement_genes(self.parameter_dir, min_counts=-1, all_from_region='v')
    else:
        approved = None
        # NOTE if <gene_or_insert_name> isn't in the dict, it's because it's in <args.datadir> but not in the parameter dir UPDATE not using datadir like this any more, so previous statement may not be true
        n_observed = utils.read_overall_gene_probs(self.parameter_dir, only_gene=gene_or_insert_name, normalize=False, expect_zero_counts=True)
        if n_observed < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            approved = utils.find_replacement_genes(self.parameter_dir, min_counts=self.args.min_observations_to_write, gene_name=gene_or_insert_name)

    self.all_mute_freqs[gene_or_insert_name], _ = paramutils.read_mute_info(self.parameter_dir, this_gene=gene_or_insert_name, locus=self.args.locus, approved_genes=approved)
def read_mute_freq_stuff(self, gene_or_insert_name):
    """Fill self.all_mute_freqs[<gene_or_insert_name>]: insertions average over all v genes, rare genes average over replacements."""
    if gene_or_insert_name[:2] in utils.boundaries:  # insertion rather than a gene
        approved = utils.find_replacement_genes(self.parameter_dir, min_counts=-1, all_from_region='v')
    else:
        approved = None
        # NOTE if <gene_or_insert_name> isn't in the dict, it's because it's in <args.datadir> but not in the parameter dir UPDATE not using datadir like this any more, so previous statement may not be true
        n_observed = utils.read_overall_gene_probs(self.parameter_dir, only_gene=gene_or_insert_name, normalize=False, expect_zero_counts=True)
        if n_observed < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            approved = utils.find_replacement_genes(self.parameter_dir, min_counts=self.args.min_observations_to_write, gene_name=gene_or_insert_name, single_gene=False)

    self.all_mute_freqs[gene_or_insert_name], _ = paramutils.read_mute_info(self.parameter_dir, this_gene=gene_or_insert_name, approved_genes=approved)
def read_mute_freq_stuff(self, gene_or_insert_name):
    """Fill self.all_mute_freqs[<gene_or_insert_name>] (chain-aware variant), handling scratch, insertion, and rare-gene cases."""
    if self.args.mutate_from_scratch:  # XXX GODDAMMIT i remember putting this 'xxx' here for a reason and I have no f*****g clue what it was
        self.all_mute_freqs[gene_or_insert_name] = {'overall_mean' : self.args.flat_mute_freq}
        return

    if gene_or_insert_name[:2] in utils.boundaries:  # insertion, not a gene: average over every v gene
        approved = utils.find_replacement_genes(self.parameter_dir, min_counts=-1, all_from_region='v')
    else:
        approved = None
        # NOTE if <gene_or_insert_name> isn't in the dict, it's because it's in <args.datadir> but not in the parameter dir UPDATE not using datadir like this any more, so previous statement may not be true
        n_observed = utils.read_overall_gene_probs(self.parameter_dir, only_gene=gene_or_insert_name, normalize=False, expect_zero_counts=True)
        if n_observed < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            approved = utils.find_replacement_genes(self.parameter_dir, min_counts=self.args.min_observations_to_write, gene_name=gene_or_insert_name)

    self.all_mute_freqs[gene_or_insert_name], _ = paramutils.read_mute_info(self.parameter_dir, this_gene=gene_or_insert_name, chain=self.args.chain, approved_genes=approved)
def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False):
    """Set up an hmm writer for a single germline gene (glfo/chain variant): read counts from <base_indir> and build the HMM skeleton."""
    self.region = utils.get_region(gene_name)
    self.raw_name = gene_name  # i.e. unsanitized
    self.germline_seqs = glfo['seqs']  # all germline alleles
    self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
    self.indir = base_indir
    self.args = args
    self.debug = debug
    self.codon_positions = {r : glfo[c + '-positions'] for r, c in utils.conserved_codons[args.chain].items()}  # conserved cyst/tryp (or equivalent) position for each region

    # parameters with values that I more or less made up
    self.precision = '16'  # number of digits after the decimal for probabilities
    self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
    self.n_max_to_interpolate = args.min_observations_to_write
    self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}  # jf has to be quite a bit bigger, since besides accounting for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths
    self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

    self.outdir = outdir
    self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

    # which insertions flank this region's gene
    self.insertions = []
    if self.region == 'v':
        self.insertions.append('fv')
    elif self.region == 'd':
        self.insertions.append('vd')
    elif self.region == 'j':
        self.insertions.append('dj')
        self.insertions.append('jf')

    assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[0] == 'N'  # maybe need to update some stuff below if this changes

    if self.debug:
        print '%s' % utils.color_gene(gene_name)

    self.n_occurences = utils.read_single_gene_count(self.indir, gene_name, debug=self.debug)  # how many times did we observe this gene in data?
    replacement_genes = None
    if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
        if self.debug:
            print ' only saw it %d times (wanted %d), so use info from all other genes' % (self.n_occurences, self.args.min_observations_to_write)
        replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, debug=self.debug)

    self.erosion_probs = self.read_erosion_info(gene_name, replacement_genes)
    self.insertion_probs, self.insertion_content_probs = self.read_insertion_info(gene_name, replacement_genes)
    self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, chain=self.args.chain, approved_genes=replacement_genes)  # actual info in <self.mute_obs> isn't actually used a.t.m.

    self.track = Track('nukes', utils.nukes)
    self.saniname = utils.sanitize_name(gene_name)
    self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
    self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
    tmp_mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv')
    self.hmm.extras['overall_mute_freq'] = tmp_mean_freq_hist.get_mean()
def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False):
    """Set up an hmm writer for a single germline gene (glfo/locus variant): read counts from <base_indir> and build the HMM skeleton."""
    self.region = utils.get_region(gene_name)
    self.raw_name = gene_name  # i.e. unsanitized
    self.germline_seqs = glfo['seqs']  # all germline alleles
    self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
    self.indir = base_indir
    self.args = args
    self.debug = debug
    self.codon_positions = {r : glfo[c + '-positions'] for r, c in utils.conserved_codons[args.locus].items()}  # conserved cyst/tryp (or equivalent) position for each region

    # parameters with values that I more or less made up
    self.precision = '16'  # number of digits after the decimal for probabilities
    self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
    self.n_max_to_interpolate = args.min_observations_to_write
    self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}  # jf has to be quite a bit bigger, since besides accounting for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths
    self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

    self.outdir = outdir
    self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

    # which insertions flank this region's gene
    self.insertions = []
    if self.region == 'v':
        self.insertions.append('fv')
    elif self.region == 'd':
        self.insertions.append('vd')
    elif self.region == 'j':
        self.insertions.append('dj')
        self.insertions.append('jf')

    assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[0] == 'N'  # maybe need to update some stuff below if this changes

    if self.debug:
        print '%s' % utils.color_gene(gene_name)

    self.n_occurences = utils.read_single_gene_count(self.indir, gene_name, debug=self.debug)  # how many times did we observe this gene in data?
    replacement_genes = None
    if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
        if self.debug:
            print ' only saw it %d times (wanted %d), so use info from all other genes' % (self.n_occurences, self.args.min_observations_to_write)
        replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, debug=self.debug)

    self.erosion_probs = self.read_erosion_info(gene_name, replacement_genes)
    self.insertion_probs, self.insertion_content_probs = self.read_insertion_info(gene_name, replacement_genes)
    self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, locus=self.args.locus, approved_genes=replacement_genes)  # actual info in <self.mute_obs> isn't actually used a.t.m.

    self.track = Track('nukes', utils.nukes)
    self.saniname = utils.sanitize_name(gene_name)
    self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
    self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
    tmp_mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv')
    self.hmm.extras['overall_mute_freq'] = tmp_mean_freq_hist.get_mean()
def __init__(self, args, input_info, reco_info, glfo, parameter_dir, write_parameters, genes_to_use):
    """Set up smith-waterman annotation: counters, per-query bookkeeping, and (optionally) a rewritten germline set restricted to <genes_to_use>."""
    print 'smith-waterman'
    sys.stdout.flush()

    self.parameter_dir = parameter_dir
    self.args = args
    self.debug = self.args.debug if self.args.sw_debug is None else self.args.sw_debug  # sw_debug overrides the global debug level when set

    self.max_insertion_length = 35  # if vdjalign reports an insertion longer than this, rerun the query (typically with different match/mismatch ratio)
    self.absolute_max_insertion_length = 200  # just ignore them if it's longer than this

    self.input_info = input_info
    self.remaining_queries = set([q for q in self.input_info.keys()])  # we remove queries from this set when we're satisfied with the current output (in general we may have to rerun some queries with different match/mismatch scores)
    self.new_indels = 0  # number of new indels that were kicked up this time through
    self.reco_info = reco_info
    self.glfo = glfo

    self.pcounter, self.true_pcounter, self.perfplotter = None, None, None
    if write_parameters:
        self.pcounter = ParameterCounter(self.glfo['seqs'])
        if not self.args.is_data:  # simulation: also count the true parameters
            self.true_pcounter = ParameterCounter(self.glfo['seqs'])
    if self.args.plot_performance:
        self.perfplotter = PerformancePlotter(self.glfo['seqs'], 'sw')

    self.info = {}
    self.info['queries'] = []  # list of queries that *passed* sw, i.e. for which we have information
    self.info['all_best_matches'] = set()  # set of all the matches we found (for *all* queries)
    self.info['indels'] = {}

    if self.args.apply_choice_probs_in_sw:
        if self.debug:
            print ' reading gene choice probs from', parameter_dir
        self.gene_choice_probs = utils.read_overall_gene_probs(parameter_dir)

    self.outfile = None
    if self.args.outfname is not None:
        self.outfile = open(self.args.outfname, 'a')

    self.nth_try = 1
    self.unproductive_queries = set()

    # rewrite input germline sets (if needed) NOTE do *not* use self.args.only_genes in this file (see partitiondriver)
    self.genes_to_use = genes_to_use  # if None, we use all of 'em
    self.my_datadir = self.args.datadir  # make sure to use *only* use <self.my_datadir> elsewhere
    if self.genes_to_use is not None:
        self.my_datadir = self.args.workdir + '/germline-sets'
        self.rewritten_files = utils.rewrite_germline_fasta(self.args.datadir, self.my_datadir, self.genes_to_use)

    if not os.path.exists(self.args.ighutil_dir + '/bin/vdjalign'):
        raise Exception('ERROR ighutil path d.n.e: ' + self.args.ighutil_dir + '/bin/vdjalign')
def __init__(self, args, input_info, reco_info, germline_seqs, parameter_dir, write_parameters=False, plotdir=None): self.parameter_dir = parameter_dir self.plotdir = plotdir self.args = args self.input_info = input_info self.reco_info = reco_info self.germline_seqs = germline_seqs self.pcounter, self.true_pcounter = None, None if write_parameters: self.pcounter = ParameterCounter(self.germline_seqs) if not self.args.is_data: self.true_pcounter = ParameterCounter(self.germline_seqs) self.info = {} self.info['all_best_matches'] = set( ) # set of all the matches we found (for *all* queries) self.info['skipped_unproductive_queries'] = [ ] # list of unproductive queries if self.args.apply_choice_probs_in_sw: if self.args.debug: print ' reading gene choice probs from', parameter_dir self.gene_choice_probs = utils.read_overall_gene_probs( parameter_dir) with opener('r')( self.args.datadir + '/v-meta.json' ) as json_file: # get location of <begin> cysteine in each v region self.cyst_positions = json.load(json_file) with opener('r')( self.args.datadir + '/j_tryp.csv' ) as csv_file: # get location of <end> tryptophan in each j region (TGG) tryp_reader = csv.reader(csv_file) self.tryp_positions = { row[0]: row[1] for row in tryp_reader } # WARNING: this doesn't filter out the header line self.outfile = None if self.args.outfname != None: self.outfile = open(self.args.outfname, 'a') self.n_unproductive = 0 self.n_total = 0
def __init__(self, args, input_info, reco_info, germline_seqs, parameter_dir, write_parameters=False): self.parameter_dir = parameter_dir self.args = args self.debug = self.args.debug if self.args.sw_debug is None else self.args.sw_debug self.input_info = input_info self.remaining_queries = [query for query in self.input_info.keys()] # we remove queries from this list when we're satisfied with the current output (in general we may have to rerun some queries with different match/mismatch scores) self.new_indels = 0 # number of new indels that were kicked up this time through self.reco_info = reco_info self.germline_seqs = germline_seqs self.pcounter, self.true_pcounter, self.perfplotter = None, None, None if write_parameters: self.pcounter = ParameterCounter(self.germline_seqs) if not self.args.is_data: self.true_pcounter = ParameterCounter(self.germline_seqs) if self.args.plot_performance: self.perfplotter = PerformancePlotter(self.germline_seqs, 'sw') self.info = {} self.info['queries'] = [] self.info['all_best_matches'] = set() # set of all the matches we found (for *all* queries) self.info['skipped_unproductive_queries'] = [] # list of unproductive queries # self.info['skipped_indel_queries'] = [] # list of queries that had indels self.info['skipped_unknown_queries'] = [] self.info['indels'] = {} if self.args.apply_choice_probs_in_sw: if self.debug: print ' reading gene choice probs from', parameter_dir self.gene_choice_probs = utils.read_overall_gene_probs(parameter_dir) with opener('r')(self.args.datadir + '/v-meta.json') as json_file: # get location of <begin> cysteine in each v region self.cyst_positions = json.load(json_file) with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file: # get location of <end> tryptophan in each j region (TGG) tryp_reader = csv.reader(csv_file) self.tryp_positions = {row[0]:row[1] for row in tryp_reader} # WARNING: this doesn't filter out the header line self.outfile = None if self.args.outfname is not None: self.outfile 
= open(self.args.outfname, 'a') self.n_unproductive = 0 self.n_total = 0 print 'smith-waterman'
def write_mute_freqs(self, region, gene_name, seq, reco_event, reco_seq_fname, is_insertion=False):
    """ Read position-by-position mute freqs from disk for <gene_name>, renormalize, then write to a file for bppseqgen. """
    replacement_genes = None
    if is_insertion:  # insertions have no per-gene info: average over every v gene
        replacement_genes = utils.find_replacement_genes(self.args.parameter_dir, min_counts=-1, all_from_region='v')
    else:
        n_occurences = utils.read_overall_gene_probs(self.args.parameter_dir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
        if n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            # print '      only saw %s %d times, use info from other genes' % (utils.color_gene(gene_name), n_occurences)
            replacement_genes = utils.find_replacement_genes(self.args.parameter_dir, min_counts=self.args.min_observations_to_write, gene_name=gene_name, single_gene=False)

    mute_freqs, mute_counts = paramutils.read_mute_info(self.args.parameter_dir, this_gene=gene_name, approved_genes=replacement_genes)

    # performance fix: the erosion-offset dict used to be rebuilt on *every* loop iteration below; build it once here
    offset = dict(reco_event.erosions.items() + reco_event.effective_erosions.items())[region + '_5p']

    rates = []  # list with a relative mutation rate for each position in <seq>
    total = 0.0
    # assert len(mute_freqs) == len(seq)  # only equal length if no erosions NO oh right but mute_freqs only covers areas we could align to...
    for inuke in range(len(seq)):  # append a freq for each nuke
        position = inuke + offset
        freq = mute_freqs[position] if position in mute_freqs else mute_freqs['overall_mean']  # fall back to the overall mean for positions we have no info for
        rates.append(freq)
        total += freq

    # normalize to the number of sites (i.e. so an average site is given value 1.0)
    assert total != 0.0  # I am not hip enough to divide by zero
    for inuke in range(len(seq)):
        rates[inuke] *= float(len(seq)) / total

    # and... double check it, just for shits and giggles
    total = sum(rates)
    assert utils.is_normed(total / float(len(seq)))
    assert len(rates) == len(seq)  # you just can't be too careful. what if gremlins ate a few while python wasn't looking?

    # write the input file for bppseqgen, one base per line
    with opener('w')(reco_seq_fname) as reco_seq_file:
        reco_seq_file.write('state\trate\n')
        for inuke in range(len(seq)):
            reco_seq_file.write('%s\t%.15f\n' % (seq[inuke], rates[inuke]))
def print_data_table(dsetfos, method, latex=False, emph_genes=['IGHV1-2*02+G35A', 'IGHD3-10*01', 'IGHJ4*02', 'IGKV3-15*01', 'IGKJ3*01']): latex = True def getvalstr(gene, val): if gene is None or (utils.get_region(gene) == 'd' and not utils.has_d_gene(utils.get_locus(gene))): return '%s %5.2s %s %-16s%s' % (cstr, ' - ', cstr, ' - ', 4 * ' ' if latex else '') else: if latex: gstr = utils.shorten_gene_name(gene, use_one_based_indexing=True, n_max_mutstrs=5) if emph_genes is not None and gene in emph_genes: gstr = '\\color{red}{\\textbf{%s}}' % gstr else: gstr = utils.color_gene(gene, width=18) return '%s %s%5.2f%s %s %-20s' % (cstr, estr, 100 * val, estr, cstr, gstr) def print_line(rfos): print ' %s%s' % (' '.join([getvalstr(g, v) for g, v in rfos]), lstr) def ds_str(ds, region): lstr = ds.split('-')[1] return ('IG%s%s' % (('h' if lstr in ['g', 'm'] else lstr).upper(), region.upper())) if latex else ds cstr = '&' if latex else '' estr = '$' if latex else '' lstr = '\\\\' if latex else '' for region in utils.regions: param_dirs = [get_param_dir(heads.get_datadir(study, 'processed', extra_str=args.label) + '/' + dset, method) for study, dset in dsetfos] countfos = [utils.read_overall_gene_probs(pdir, normalize=True)[region] for pdir in param_dirs] gene_val_str = (' %s ' % cstr).join([(' %s %s %-20s' % ('\\%' if latex else '', cstr, ds_str(ds, region))) for _, ds in dsetfos]) tmpline = ' %s %s %s' % (cstr, gene_val_str, lstr) if latex: hstr = '\\hline' tmpline = ' %s\n%s\n %s' % (hstr, tmpline, hstr) print tmpline rowfos = [sorted(cfo.items(), key=operator.itemgetter(1), reverse=True) for cfo in countfos] irow = 0 while True: rfos = [rfo[irow] if irow < len(rfo) else (None, None) for rfo in rowfos] if set(rfos) == set([(None, None)]): break print_line(rfos) irow += 1
def read_mute_freq_stuff(self, gene):
    """Fill self.all_mute_freqs[<gene>], averaging over replacement genes if <gene> was observed too few times."""
    assert gene[:2] not in utils.boundaries  # make sure <gene> isn't actually an insertion (insertions used to be passed in here separately, but now they're smooshed onto either end of d)

    if self.args.mutate_from_scratch:  # flat_mute_freq takes precedence over the default scratch freq when it's set
        if self.args.flat_mute_freq is None:
            scratch_freq = self.args.default_scratch_mute_freq
        else:
            scratch_freq = self.args.flat_mute_freq
        self.all_mute_freqs[gene] = {'overall_mean' : scratch_freq}
        return

    # NOTE if <gene> isn't in the dict, it's because it's in <args.datadir> but not in the parameter dir UPDATE not using datadir like this any more, so previous statement may not be true
    n_counts = utils.read_overall_gene_probs(self.parameter_dir, only_gene=gene, normalize=False, expect_zero_counts=True)
    chosen_genes = [gene]
    if n_counts < self.args.min_observations_to_write:  # too rare: average over the genes that find_replacement_genes() gives us
        chosen_genes += utils.find_replacement_genes(self.parameter_dir, min_counts=self.args.min_observations_to_write, gene_name=gene)

    self.all_mute_freqs[gene] = paramutils.read_mute_freqs_with_weights(self.parameter_dir, chosen_genes)
def __init__(self, args, input_info, reco_info, germline_seqs, parameter_dir, write_parameters=False):
    """Set up the smith-waterman aligner: bookkeeping containers, optional parameter
    counters/performance plotter, and the conserved-codon position files.

    args: parsed command line options; input_info: query sequences keyed by name;
    reco_info: true rearrangement info (simulation only); germline_seqs: germline alleles;
    parameter_dir: where to read (and, if <write_parameters>, eventually write) parameters.
    """
    self.parameter_dir = parameter_dir
    self.args = args
    # sw-specific debug flag overrides the global one when set
    self.debug = self.args.debug if self.args.sw_debug is None else self.args.sw_debug
    self.input_info = input_info
    self.remaining_queries = [query for query in self.input_info.keys()]  # we remove queries from this list when we're satisfied with the current output (in general we may have to rerun some queries with different match/mismatch scores)
    self.new_indels = 0  # number of new indels that were kicked up this time through
    self.reco_info = reco_info
    self.germline_seqs = germline_seqs
    self.pcounter, self.true_pcounter, self.perfplotter = None, None, None
    if write_parameters:
        self.pcounter = ParameterCounter(self.germline_seqs)
        if not self.args.is_data:  # also count the true (simulation) parameters
            self.true_pcounter = ParameterCounter(self.germline_seqs)
    if self.args.plot_performance:
        self.perfplotter = PerformancePlotter(self.germline_seqs, 'sw')
    # per-run results, filled in as queries are processed
    self.info = {}
    self.info['queries'] = []
    self.info['all_best_matches'] = set()  # set of all the matches we found (for *all* queries)
    self.info['skipped_unproductive_queries'] = []  # list of unproductive queries
    # self.info['skipped_indel_queries'] = []  # list of queries that had indels
    self.info['skipped_unknown_queries'] = []
    self.info['indels'] = {}
    if self.args.apply_choice_probs_in_sw:
        if self.debug:
            print ' reading gene choice probs from', parameter_dir
        self.gene_choice_probs = utils.read_overall_gene_probs(parameter_dir)
    with opener('r')(self.args.datadir + '/v-meta.json') as json_file:  # get location of <begin> cysteine in each v region
        self.cyst_positions = json.load(json_file)
    with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file:  # get location of <end> tryptophan in each j region (TGG)
        tryp_reader = csv.reader(csv_file)
        self.tryp_positions = {row[0]: row[1] for row in tryp_reader}  # WARNING: this doesn't filter out the header line
    # NOTE(review): opened in append mode and deliberately left open for the run's lifetime
    self.outfile = None
    if self.args.outfname is not None:
        self.outfile = open(self.args.outfname, 'a')
    self.n_unproductive = 0
    self.n_total = 0
    print 'smith-waterman'
def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False):
    """Build the hmm model file for a single germline gene.

    Reads erosion, insertion, and mutation parameter info from <base_indir>
    (averaging over replacement genes if <gene_name> wasn't observed often enough),
    then assembles the HMM object whose yaml will be written to <outdir>.
    glfo: germline info dict with 'seqs' and conserved-codon positions.
    """
    self.region = utils.get_region(gene_name)
    self.raw_name = gene_name  # i.e. unsanitized
    self.germline_seqs = glfo['seqs']  # all germline alleles
    self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
    self.indir = base_indir
    self.args = args
    self.debug = debug
    # conserved cysteine/tryptophan (codon) positions, keyed by region
    self.codon_positions = {r: glfo[c + '-positions'] for r, c in utils.conserved_codons[args.locus].items()}

    # parameters with values that I more or less made up
    self.precision = '16'  # number of digits after the decimal for probabilities
    self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
    self.n_max_to_interpolate = args.min_observations_to_write
    self.min_mean_unphysical_insertion_length = {'fv': 1.5, 'jf': 25}  # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths
    self.mute_freq_bounds = {'lo': 0.01, 'hi': 0.5}  # don't let any position mutate less frequently than 1% of the time, or more frequently than half the time
    self.enforced_flat_mfreq_length = {  # i.e. distance over which the mute freqs are typically screwed up. I'm not really sure why these vary so much, but it's probably to do with how the s-w step works
        'v_3p': 9,
        'd_5p': 9,
        'd_3p': 9,
        'j_5p': 20,
    }
    self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

    self.outdir = outdir
    self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

    # which insertions bracket this region's states
    self.insertions = []
    if self.region == 'v':
        self.insertions.append('fv')
    elif self.region == 'd':
        self.insertions.append('vd')
    elif self.region == 'j':
        self.insertions.append('dj')
        self.insertions.append('jf')

    assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[0] == 'N'  # maybe need to update some stuff below if this changes

    if self.debug:
        print '%s' % utils.color_gene(gene_name)

    self.n_occurences = utils.read_single_gene_count(self.indir, gene_name, debug=self.debug)  # how many times did we observe this gene in data?
    approved_genes = [gene_name]
    # NOTE this never happens any more, since partitiondriver.cache_parameters() resets <args.min_observations_to_write> if it's larger than 10*(number of sequences)
    if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average also over all the genes that find_replacement_genes() gives us
        if self.debug:
            print ' only saw it %d times (wanted %d), so use info from all other genes' % (self.n_occurences, self.args.min_observations_to_write)
        approved_genes += utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, debug=self.debug)

    self.erosion_probs = self.read_erosion_info(approved_genes)
    self.insertion_probs, self.insertion_content_probs = self.read_insertion_info(approved_genes)
    self.mute_freqs = paramutils.read_mute_freqs_with_weights(self.indir, approved_genes)  # weighted averages over genes
    self.mute_counts = paramutils.read_mute_counts(self.indir, gene_name, self.args.locus)  # raw per-{ACGT} counts
    self.process_mutation_info()  # smooth/interpolation/whatnot for <self.mute_freqs> and <self.mute_counts>  # NOTE i'm using a hybrid approach with mute_freqs and mute_counts -- the only thing I get from mute_counts is the ratios of the different bases, whereas the actual freq comes from mute_freqs (which has all the corrections/smooth/bullshit)

    self.track = Track('nukes', utils.nukes)
    self.saniname = utils.sanitize_name(gene_name)
    self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
    self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
    tmp_mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv')
    self.hmm.extras['overall_mute_freq'] = tmp_mean_freq_hist.get_mean()
    self.hmm.extras['per_gene_mute_freq'] = self.mute_freqs['unweighted_overall_mean']  # the other (weighted) one might be technically more accurate, depending on what you want, but it's probably not what anyone is expecting, so we write the unweighted one
def __init__(self, base_indir, outdir, gene_name, naivety, germline_seq, args): self.indir = base_indir self.args = args # parameters with values that I more or less made up self.precision = '16' # number of digits after the decimal for probabilities self.eps = 1e-6 # NOTE I also have an eps defined in utils, and they should in principle be combined self.n_max_to_interpolate = 20 self.allow_unphysical_insertions = self.args.allow_unphysical_insertions # allow fv and jf insertions. NOTE this slows things down by a factor of 6 or so # self.allow_external_deletions = args.allow_external_deletions # allow v left and j right deletions. I.e. if your reads extend beyond v or j boundaries self.v_3p_del_pseudocount_limit = 10 # add at least one entry # self.insert_mute_prob = 0.0 # self.mean_mute_freq = 0.0 self.outdir = outdir self.region = utils.get_region(gene_name) self.naivety = naivety self.germline_seq = germline_seq self.smallest_entry_index = -1 # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)] OOPS that's not what I want to do self.insertions = [] if self.region == 'v': if self.allow_unphysical_insertions: self.insertions.append('fv') elif self.region == 'd': self.insertions.append('vd') elif self.region == 'j': self.insertions.append('dj') if self.allow_unphysical_insertions: self.insertions.append('jf') self.erosion_probs = {} self.insertion_probs = {} self.n_occurences = utils.read_overall_gene_probs( self.indir, only_gene=gene_name, normalize=False ) # how many times did we observe this gene in data? 
replacement_genes = None if self.n_occurences < self.args.min_observations_to_write: # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us if self.args.debug: print ' only saw it %d times, use info from other genes' % self.n_occurences replacement_genes = utils.find_replacement_genes( self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug) self.read_erosion_info( gene_name, replacement_genes) # try this exact gene, but... self.read_insertion_info(gene_name, replacement_genes) if self.naivety == 'M': # mutate if not naive self.mute_freqs = paramutils.read_mute_info( self.indir, this_gene=gene_name, approved_genes=replacement_genes) self.track = Track('nukes', list(utils.nukes)) self.saniname = utils.sanitize_name(gene_name) self.hmm = HMM( self.saniname, {'nukes': list(utils.nukes)} ) # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable self.hmm.extras['gene_prob'] = max( self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name) ) # if we really didn't see this gene at all, take pity on it and kick it an eps
def __init__(self, base_indir, outdir, gene_name, naivety, germline_seqs, args, cyst_positions, tryp_positions):
    """Build the hmm model file for a single germline gene.

    Reads erosion/insertion (and, for non-naive repertoires, mutation) parameter
    info from <base_indir>, averaging over replacement genes if <gene_name> wasn't
    observed often enough, then assembles the HMM object to be written under <outdir>.
    """
    self.region = utils.get_region(gene_name)
    self.raw_name = gene_name  # i.e. unsanitized
    self.germline_seqs = germline_seqs  # all germline alleles
    self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
    self.indir = base_indir
    self.args = args
    self.cyst_positions = cyst_positions
    self.tryp_positions = tryp_positions

    # parameters with values that I more or less made up
    self.precision = '16'  # number of digits after the decimal for probabilities
    self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
    self.n_max_to_interpolate = 20
    # self.allow_external_deletions = args.allow_external_deletions  # allow v left and j right deletions. I.e. if your reads extend beyond v or j boundaries
    self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}  # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths
    self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero
    # self.insert_mute_prob = 0.0
    # self.mean_mute_freq = 0.0

    self.outdir = outdir
    self.naivety = naivety
    self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

    # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)] OOPS that's not what I want to do
    # which insertions bracket this region's states ('fv'/'jf' only if unphysical insertions are allowed)
    self.insertions = []
    if self.region == 'v':
        if not self.args.dont_allow_unphysical_insertions:
            self.insertions.append('fv')
    elif self.region == 'd':
        self.insertions.append('vd')
    elif self.region == 'j':
        self.insertions.append('dj')
        if not self.args.dont_allow_unphysical_insertions:
            self.insertions.append('jf')

    self.erosion_probs = {}
    self.insertion_probs = {}
    self.insertion_content_probs = {}

    self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
    replacement_genes = None
    if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
        if self.args.debug:
            print ' only saw it %d times, use info from other genes' % self.n_occurences
        replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug)

    self.read_erosion_info(gene_name, replacement_genes)  # try this exact gene, but...
    self.read_insertion_info(gene_name, replacement_genes)
    if self.naivety == 'M':  # mutate if not naive
        self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=replacement_genes)

    self.track = Track('nukes', utils.nukes)
    self.saniname = utils.sanitize_name(gene_name)
    self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
    self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
    # NOTE(review): this variant uses the ROOT-style hist (plotting/GetMean); the newer variant uses Hist/get_mean
    mean_freq_hist = plotting.make_hist_from_bin_entry_file(self.indir + '/all-mean-mute-freqs.csv')
    self.hmm.extras['overall_mute_freq'] = mean_freq_hist.GetMean()