def plot(self, base_plotdir, cyst_positions=None, tryp_positions=None):
    if not self.finalized:
        self.finalize()

    plotdir = base_plotdir + '/mute-freqs'
    utils.prep_dir(plotdir + '/plots', multilings=('*.csv', '*.svg'))
    for region in utils.regions:
        utils.prep_dir(plotdir + '/' + region + '/plots', multilings=('*.csv', '*.svg'))
        utils.prep_dir(plotdir + '/' + region + '-per-base/plots', multilings=('*.csv', '*.png'))

    for gene in self.counts:
        counts, plotting_info = self.counts[gene], self.plotting_info[gene]
        sorted_positions = sorted(counts)
        hist = TH1D('hist_' + utils.sanitize_name(gene), '',
                    sorted_positions[-1] - sorted_positions[0] + 1,
                    sorted_positions[0] - 0.5, sorted_positions[-1] + 0.5)
        for position in sorted_positions:
            hist.SetBinContent(hist.FindBin(position), counts[position]['freq'])
            hi_diff = abs(counts[position]['freq'] - counts[position]['freq_hi_err'])
            lo_diff = abs(counts[position]['freq'] - counts[position]['freq_lo_err'])
            err = 0.5*(hi_diff + lo_diff)
            hist.SetBinError(hist.FindBin(position), err)
        plotfname = plotdir + '/' + utils.get_region(gene) + '/plots/' + utils.sanitize_name(gene) + '.svg'
        xline = None
        if utils.get_region(gene) == 'v' and cyst_positions is not None:
            xline = cyst_positions[gene]['cysteine-position']
        elif utils.get_region(gene) == 'j' and tryp_positions is not None:
            xline = int(tryp_positions[gene])
        plotting.draw(hist, 'int', plotdir=plotdir + '/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, draw_str='e')  #, cwidth=4000, cheight=1000)
        paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)

    # for region in utils.regions:
    #     utils.prep_dir(plotdir + '/' + region + '/tmp/plots', multilings=('*.csv', '*.svg'))
    # for gene in self.tmpcounts:
    #     for position in self.tmpcounts[gene]:
    #         roothist = plotting.make_hist_from_my_hist_class(self.tmpcounts[gene][position]['muted'], gene + '_' + str(position))
    #         plotting.draw(roothist, 'int', plotdir=plotdir + '/' + utils.get_region(gene) + '/tmp', plotname=utils.sanitize_name(gene) + '_' + str(position), errors=True, write_csv=True)  #, cwidth=4000, cheight=1000)

    # make mean mute freq hists
    hist = plotting.make_hist_from_my_hist_class(self.mean_rates['all'], 'all-mean-freq')
    plotting.draw(hist, 'float', plotname='all-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True)
    for region in utils.regions:
        hist = plotting.make_hist_from_my_hist_class(self.mean_rates[region], region+'-mean-freq')
        plotting.draw(hist, 'float', plotname=region+'-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True)

    check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])  # then write html file and fix permissions
    for region in utils.regions:
        check_call(['./bin/makeHtml', plotdir + '/' + region, '1', 'null', 'svg'])
        check_call(['./bin/makeHtml', plotdir + '/' + region + '-per-base', '1', 'null', 'png'])
    check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up
def plot(self, plotdir, subset_by_gene=False, cyst_positions=None, tryp_positions=None): print ' plotting parameters' start = time.time() utils.prep_dir(plotdir + '/plots') #, multilings=('*.csv', '*.svg')) for column in self.counts: if column == 'all': continue values, gene_values = {}, {} if len(self.counts[column]) == 0: print 'ERROR no counts in %s' % column assert False for index, count in self.counts[column].iteritems(): gene = None if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'): # option to subset deletion and (real) insertion plots by gene if '_del' in column: region = column[0] else: region = column[1] assert region in utils.regions assert 'IGH' + region.upper() in index[1] # NOTE this is hackey, but it works find now and will fail obviously gene = index[1] # if I ever change the correlations to be incompatible. so screw it if gene not in gene_values: gene_values[gene] = {} column_val = index[0] if gene is not None: if column_val not in gene_values[gene]: gene_values[gene][column_val] = 0.0 gene_values[gene][column_val] += count if column_val not in values: values[column_val] = 0.0 values[column_val] += count try: # figure out whether this is an integer or string (only used outside this loop when we make the plots) int(column_val) var_type = 'int' except: var_type = 'string' if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'): # option to subset deletion and (real) insertion plots by gene thisplotdir = plotdir + '/' + column utils.prep_dir(thisplotdir + '/plots', multilings=['*.csv', '*.svg']) for gene in gene_values: plotname = utils.sanitize_name(gene) + '-' + column hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True) plotting.draw(hist, var_type, plotname=plotname, plotdir=thisplotdir, errors=True, write_csv=True) check_call(['./bin/makeHtml', thisplotdir, '3', 'null', 'svg']) check_call(['./bin/permissify-www', thisplotdir]) # NOTE this should really permissify starting a few directories higher up plotname = column hist = plotting.make_hist_from_dict_of_counts(values, var_type, plotname, sort=True) plotting.draw(hist, var_type, plotname=plotname, plotdir=plotdir, errors=True, write_csv=True) self.mutefreqer.plot(plotdir, cyst_positions, tryp_positions) #, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv') # REGION is replace by each region in the three output files if has_root: check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg']) check_call(['./bin/permissify-www', plotdir]) # NOTE this should really permissify starting a few directories higher up print ' parameter plot time: %.3f' % (time.time()-start)
def write(self, base_outdir, mean_freq_outfname):
    if not self.finalized:
        self.finalize()

    outdir = base_outdir + '/mute-freqs'
    utils.prep_dir(outdir, '*.csv')
    for gene in self.counts:
        counts, freqs, plotting_info = self.counts[gene], self.freqs[gene], self.plotting_info[gene]
        sorted_positions = sorted(counts)
        outfname = outdir + '/' + utils.sanitize_name(gene) + '.csv'
        with opener('w')(outfname) as outfile:
            nuke_header = []
            for nuke in utils.nukes:
                nuke_header.append(nuke)
                nuke_header.append(nuke + '_lo_err')
                nuke_header.append(nuke + '_hi_err')
            writer = csv.DictWriter(outfile, ('position', 'mute_freq', 'lo_err', 'hi_err') + tuple(nuke_header))
            writer.writeheader()
            for position in sorted_positions:
                row = {'position' : position, 'mute_freq' : counts[position]['freq'],
                       'lo_err' : counts[position]['freq_lo_err'], 'hi_err' : counts[position]['freq_hi_err']}
                for nuke in utils.nukes:
                    row[nuke] = freqs[position][nuke]
                    row[nuke + '_lo_err'] = freqs[position][nuke + '_lo_err']
                    row[nuke + '_hi_err'] = freqs[position][nuke + '_hi_err']
                writer.writerow(row)

    assert 'REGION' in mean_freq_outfname
    self.mean_rates['all'].write(mean_freq_outfname.replace('REGION', 'all'))  # hackey hackey hackey replacement... *sigh*
    for region in utils.regions:
        self.mean_rates[region].write(mean_freq_outfname.replace('REGION', region))
def write(self, outdir, mean_freq_outfname):
    if not self.finalized:
        self.finalize()

    for gene in self.counts:
        gcounts, freqs = self.counts[gene], self.freqs[gene]
        outfname = outdir + '/' + utils.sanitize_name(gene) + '.csv'
        with opener('w')(outfname) as outfile:
            nuke_header = [n + xtra for n in utils.nukes for xtra in ('', '_obs', '_lo_err', '_hi_err')]
            writer = csv.DictWriter(outfile, ('position', 'mute_freq', 'lo_err', 'hi_err') + tuple(nuke_header))
            writer.writeheader()
            for position in sorted(gcounts.keys()):
                row = {'position' : position, 'mute_freq' : freqs[position]['freq'],
                       'lo_err' : freqs[position]['freq_lo_err'], 'hi_err' : freqs[position]['freq_hi_err']}
                for nuke in utils.nukes:
                    row[nuke] = freqs[position][nuke]
                    row[nuke + '_obs'] = gcounts[position][nuke]
                    row[nuke + '_lo_err'] = freqs[position][nuke + '_lo_err']
                    row[nuke + '_hi_err'] = freqs[position][nuke + '_hi_err']
                writer.writerow(row)

    assert 'REGION' in mean_freq_outfname
    self.mean_rates['all'].write(mean_freq_outfname.replace('REGION', 'all'))  # hackey hackey hackey replacement... *sigh*
    for region in utils.regions:
        self.mean_rates[region].write(mean_freq_outfname.replace('REGION', region))
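# For reference, the per-gene csv written above has one row per germline position.
# Assuming utils.nukes is ('A', 'C', 'G', 'T') (not shown in this section), the header
# produced by the list comprehension would be:
#
#   position,mute_freq,lo_err,hi_err,A,A_obs,A_lo_err,A_hi_err,C,C_obs,C_lo_err,C_hi_err,G,G_obs,G_lo_err,G_hi_err,T,T_obs,T_lo_err,T_hi_err
#
# which is the layout that the read_mute_info()/read_single_file() readers further down
# expect when they index line[nuke + '_obs'].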
def plot(self, base_plotdir, only_csv=False):
    if not self.finalized:
        self.finalize()
    plotdir = base_plotdir + '/allele-finding'

    for old_gene_dir in glob.glob(plotdir + '/*'):  # has to be a bit more hackey than elsewhere, since we have no way of knowing what genes might have had their own directories written last time we wrote to this dir
        if not os.path.isdir(old_gene_dir):
            raise Exception('not a directory: %s' % old_gene_dir)
        utils.prep_dir(old_gene_dir, wildlings=('*.csv', '*.svg'))
        os.rmdir(old_gene_dir)
    utils.prep_dir(plotdir, wildlings=('*.csv', '*.svg'))

    if only_csv:  # not implemented
        return

    start = time.time()
    for gene in self.plotvals:
        if utils.get_region(gene) != 'v':
            continue
        for position in self.plotvals[gene]:
            if position not in self.fitted_positions[gene]:  # we can make plots for the positions we didn't fit, but there's a *lot* of them and they're slow
                continue
            # if 'allele-finding' not in self.TMPxyvals[gene][position] or self.TMPxyvals[gene][position]['allele-finding'] is None:
            #     continue
            plotting.make_allele_finding_plot(plotdir + '/' + utils.sanitize_name(gene), gene, position, self.plotvals[gene][position])
    print '    allele finding plot time: %.1f' % (time.time()-start)
def __init__(self, base_indir, outdir, gene_name, naivety, glfo, args): self.region = utils.get_region(gene_name) self.raw_name = gene_name # i.e. unsanitized self.germline_seqs = glfo['seqs'] # all germline alleles self.germline_seq = self.germline_seqs[self.region][gene_name] # germline sequence for this hmm self.indir = base_indir self.args = args self.cyst_positions = glfo['cyst-positions'] self.tryp_positions = glfo['tryp-positions'] # parameters with values that I more or less made up self.precision = '16' # number of digits after the decimal for probabilities self.eps = 1e-6 # NOTE I also have an eps defined in utils, and they should in principle be combined self.n_max_to_interpolate = 20 self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25} # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths self.erosion_pseudocount_length = 10 # if we're closer to the end of the gene than this, make sure erosion probability isn't zero # self.insert_mute_prob = 0.0 # self.mean_mute_freq = 0.0 self.outdir = outdir self.naivety = naivety self.smallest_entry_index = -1 # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)] OOPS that's not what I want to do self.insertions = [] if self.region == 'v': self.insertions.append('fv') elif self.region == 'd': self.insertions.append('vd') elif self.region == 'j': self.insertions.append('dj') self.insertions.append('jf') self.erosion_probs = {} self.insertion_probs = {} self.insertion_content_probs = {} self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False) # how many times did we observe this gene in data? replacement_genes = None if self.n_occurences < self.args.min_observations_to_write: # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us if self.args.debug: print ' only saw it %d times, use info from other genes' % self.n_occurences replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug) self.read_erosion_info(gene_name, replacement_genes) # try this exact gene, but... self.read_insertion_info(gene_name, replacement_genes) if self.naivety == 'M': # mutate if not naive self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=replacement_genes) self.track = Track('nukes', utils.nukes) self.saniname = utils.sanitize_name(gene_name) self.hmm = HMM(self.saniname, self.track.getdict()) # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name)) # if we really didn't see this gene at all, take pity on it and kick it an eps mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv') self.hmm.extras['overall_mute_freq'] = mean_freq_hist.get_mean()
def finalize(self, calculate_uncertainty=True):
    """ convert from counts to mut freqs """
    assert not self.finalized

    self.n_cached, self.n_not_cached = 0, 0
    for gene in self.counts:
        self.freqs[gene], self.plotting_info[gene] = {}, []  # NOTE <counts> hold the overall (not per-base) frequencies, while <freqs> holds the per-base frequencies
        counts, freqs, plotting_info = self.counts[gene], self.freqs[gene], self.plotting_info[gene]
        sorted_positions = sorted(counts)
        for position in sorted_positions:
            freqs[position] = {}
            plotting_info.append({})
            plotting_info[-1]['name'] = utils.sanitize_name(gene) + '_' + str(position)
            plotting_info[-1]['nuke_freqs'] = {}
            n_conserved, n_mutated = 0, 0
            for nuke in utils.nukes:
                nuke_freq = float(counts[position][nuke]) / counts[position]['total']
                freqs[position][nuke] = nuke_freq
                plotting_info[-1]['nuke_freqs'][nuke] = nuke_freq
                if calculate_uncertainty:  # it's kinda slow
                    errs = fraction_uncertainty.err(counts[position][nuke], counts[position]['total'])
                    if errs[2]:
                        self.n_cached += 1
                    else:
                        self.n_not_cached += 1
                    # print nuke_freq, errs[0], errs[1], '(', counts[position][nuke], ',', counts[position]['total'], ')'
                    assert errs[0] <= nuke_freq  # these checks are probably unnecessary. EDIT and totally saved my ass about ten minutes after writing the previous statement
                    assert nuke_freq <= errs[1]
                    freqs[position][nuke + '_lo_err'] = errs[0]
                    freqs[position][nuke + '_hi_err'] = errs[1]

                if nuke == counts[position]['gl_nuke']:
                    n_conserved += counts[position][nuke]
                else:
                    n_mutated += counts[position][nuke]  # sum over A,C,G,T
            # uncert = fraction_uncertainty.err(obs, total)  # uncertainty for each nuke
            counts[position]['freq'] = float(n_mutated) / counts[position]['total']
            mutated_fraction_err = (0.0, 0.0)
            if calculate_uncertainty:  # it's kinda slow
                mutated_fraction_err = fraction_uncertainty.err(n_mutated, counts[position]['total'])
                if mutated_fraction_err[2]:
                    self.n_cached += 1
                else:
                    self.n_not_cached += 1
            counts[position]['freq_lo_err'] = mutated_fraction_err[0]
            counts[position]['freq_hi_err'] = mutated_fraction_err[1]

    self.mean_rates['all'].normalize(overflow_warn=False)  # we expect overflows in mute freq hists, so no need to warn us
    for region in utils.regions:
        self.mean_rates[region].normalize(overflow_warn=False)

    # for gene in self.tmpcounts:
    #     for position in self.tmpcounts[gene]:
    #         self.tmpcounts[gene][position]['muted'].divide_by(self.tmpcounts[gene][position]['total'], debug=False)

    self.finalized = True
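# fraction_uncertainty.err() isn't defined in this section. A rough, hypothetical
# stand-in with the call signature used above -- returning (lower bound, upper bound,
# whether a cached value was used) -- might be a clipped normal-approximation binomial
# interval; the real implementation may well differ.
import math

def err(n_obs, n_total):
    frac = float(n_obs) / n_total
    sigma = math.sqrt(max(frac * (1. - frac), 1. / n_total) / n_total)  # crude variance floor so 0/N and N/N don't get zero width
    return max(0., frac - sigma), min(1., frac + sigma), False  # False: this sketch never caches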
def __init__(self, base_indir, outdir, gene_name, naivety, germline_seq, args): self.indir = base_indir self.args = args # parameters with values that I more or less made up self.precision = '16' # number of digits after the decimal for probabilities self.eps = 1e-6 # NOTE I also have an eps defined in utils, and they should in principle be combined self.n_max_to_interpolate = 20 self.allow_unphysical_insertions = self.args.allow_unphysical_insertions # allow fv and jf insertions. NOTE this slows things down by a factor of 6 or so # self.allow_external_deletions = args.allow_external_deletions # allow v left and j right deletions. I.e. if your reads extend beyond v or j boundaries self.v_3p_del_pseudocount_limit = 10 # add at least one entry # self.insert_mute_prob = 0.0 # self.mean_mute_freq = 0.0 self.outdir = outdir self.region = utils.get_region(gene_name) self.naivety = naivety self.germline_seq = germline_seq self.smallest_entry_index = -1 # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)] OOPS that's not what I want to do self.insertions = [] if self.region == 'v': if self.allow_unphysical_insertions: self.insertions.append('fv') elif self.region == 'd': self.insertions.append('vd') elif self.region == 'j': self.insertions.append('dj') if self.allow_unphysical_insertions: self.insertions.append('jf') self.erosion_probs = {} self.insertion_probs = {} self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False) # how many times did we observe this gene in data? replacement_genes = None if self.n_occurences < self.args.min_observations_to_write: # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us if self.args.debug: print ' only saw it %d times, use info from other genes' % self.n_occurences replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug) self.read_erosion_info(gene_name, replacement_genes) # try this exact gene, but... self.read_insertion_info(gene_name, replacement_genes) if self.naivety == 'M': # mutate if not naive self.mute_freqs = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=replacement_genes) self.track = Track('nukes', list(utils.nukes)) self.saniname = utils.sanitize_name(gene_name) self.hmm = HMM(self.saniname, {'nukes':list(utils.nukes)}) # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name)) # if we really didn't see this gene at all, take pity on it and kick it an eps
def callback():
    name = utils.sanitize_name(transition.get_name_or_id())
    if phases == 2:
        self.sequence.add_fire(process_id, name)
    else:
        self.sequence.add_transition_start(process_id, name)
        if not transition.has_code():
            self.sequence.add_transition_finish(process_id)
    if query_reports:
        self.query_reports(ok_callback)
    elif ok_callback:
        ok_callback()
def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False): self.region = utils.get_region(gene_name) self.raw_name = gene_name # i.e. unsanitized self.germline_seqs = glfo['seqs'] # all germline alleles self.germline_seq = self.germline_seqs[self.region][gene_name] # germline sequence for this hmm self.indir = base_indir self.args = args self.debug = debug self.codon_positions = {r : glfo[c + '-positions'] for r, c in utils.conserved_codons[args.chain].items()} # parameters with values that I more or less made up self.precision = '16' # number of digits after the decimal for probabilities self.eps = 1e-6 # NOTE I also have an eps defined in utils, and they should in principle be combined self.n_max_to_interpolate = args.min_observations_to_write self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25} # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths self.erosion_pseudocount_length = 10 # if we're closer to the end of the gene than this, make sure erosion probability isn't zero self.outdir = outdir self.smallest_entry_index = -1 # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there self.insertions = [] if self.region == 'v': self.insertions.append('fv') elif self.region == 'd': self.insertions.append('vd') elif self.region == 'j': self.insertions.append('dj') self.insertions.append('jf') assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[0] == 'N' # maybe need to update some stuff below if this changes if self.debug: print '%s' % utils.color_gene(gene_name) self.n_occurences = utils.read_single_gene_count(self.indir, gene_name, debug=self.debug) # how many times did we observe this gene in data? replacement_genes = None if self.n_occurences < self.args.min_observations_to_write: # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us if self.debug: print ' only saw it %d times (wanted %d), so use info from all other genes' % (self.n_occurences, self.args.min_observations_to_write) replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, debug=self.debug) self.erosion_probs = self.read_erosion_info(gene_name, replacement_genes) self.insertion_probs, self.insertion_content_probs = self.read_insertion_info(gene_name, replacement_genes) self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, chain=self.args.chain, approved_genes=replacement_genes) # actual info in <self.mute_obs> isn't actually used a.t.m. self.track = Track('nukes', utils.nukes) self.saniname = utils.sanitize_name(gene_name) self.hmm = HMM(self.saniname, self.track.getdict()) # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name)) # if we really didn't see this gene at all, take pity on it and kick it an eps tmp_mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv') self.hmm.extras['overall_mute_freq'] = tmp_mean_freq_hist.get_mean()
def read_mute_info(indir, this_gene, chain, approved_genes=None):  # NOTE this would probably be more accurate if we made some effort to align the genes before combining all the approved ones
    if approved_genes is None:
        approved_genes = [this_gene, ]
    if this_gene == glutils.dummy_d_genes[chain]:
        return {'overall_mean' : 0.5}, {}
    observed_freqs, observed_counts = {}, {}
    total_counts = 0

    # add an observation for each position, for each gene where we observed that position NOTE this would be more sensible if they were aligned first
    for gene in approved_genes:
        mutefname = indir + '/mute-freqs/' + utils.sanitize_name(gene) + '.csv'
        if not os.path.exists(mutefname):
            continue
        with opener('r')(mutefname) as mutefile:
            reader = csv.DictReader(mutefile)
            for line in reader:
                pos = int(line['position'])
                freq = float(line['mute_freq'])
                lo_err = float(line['lo_err'])  # NOTE lo_err in the file is really the lower *bound*
                hi_err = float(line['hi_err'])  # same deal
                assert freq >= 0.0 and lo_err >= 0.0 and hi_err >= 0.0  # you just can't be too careful
                if freq < utils.eps or abs(1.0 - freq) < utils.eps:  # if <freq> too close to 0 or 1, replace it with the midpoint of its uncertainty band
                    freq = 0.5 * (lo_err + hi_err)
                if pos not in observed_freqs:
                    observed_freqs[pos] = []
                    observed_counts[pos] = {n : 0 for n in utils.nukes}
                observed_freqs[pos].append({'freq' : freq, 'err' : max(abs(freq - lo_err), abs(freq - hi_err))})
                for nuke in utils.nukes:
                    observed_counts[pos][nuke] += int(line[nuke + '_obs'])
                    total_counts += int(line[nuke + '_obs'])

    # set final mute_freqs[pos] to the (inverse error-weighted) average over all the observations for each position
    mute_freqs = {}
    overall_total, overall_sum_of_weights = 0.0, 0.0  # also calculate the mean over all positions
    for pos in observed_freqs:
        total, sum_of_weights = 0.0, 0.0
        for obs in observed_freqs[pos]:
            assert obs['err'] > 0.0
            weight = 1.0 / obs['err']
            total += weight * obs['freq']
            sum_of_weights += weight
        assert sum_of_weights > 0.0
        mean_freq = total / sum_of_weights
        mute_freqs[pos] = mean_freq
        overall_total += total
        overall_sum_of_weights += sum_of_weights

    mute_freqs['overall_mean'] = 0.
    if overall_sum_of_weights > 0.:
        mute_freqs['overall_mean'] = overall_total / overall_sum_of_weights
    observed_counts['total_counts'] = total_counts
    return mute_freqs, observed_counts
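# A small worked example (made-up numbers) of the inverse error-weighted average
# computed above: each gene's observation of a position is weighted by 1/err, so
# better-measured observations dominate the per-position mean.
observations = [{'freq' : 0.10, 'err' : 0.02}, {'freq' : 0.30, 'err' : 0.10}]
total = sum(obs['freq'] / obs['err'] for obs in observations)    # 5.0 + 3.0 = 8.0
sum_of_weights = sum(1.0 / obs['err'] for obs in observations)   # 50.0 + 10.0 = 60.0
print total / sum_of_weights                                     # 0.1333..., pulled toward the better-measured 0.10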
def run_sequence(self, sequence):
    transitions = {}
    command = [0]
    for t in self.runinstance.net.transitions():
        transitions["#{0}".format(t.id)] = t
    for t in self.runinstance.net.transitions():
        transitions[utils.sanitize_name(t.get_name())] = t

    def next_command():
        if command[0] >= sequence.get_commands_size():
            self.query_reports()
            return
        sequence.execute_command(command[0], fire, start, finish, receive)
        command[0] += 1

    def fail_callback():
        self.emit_event("command-failed", sequence, command[0] - 1)

    def fire(process_id, transition):
        t = transitions.get(transition)
        if t is None:
            raise SimulationException("Transition '{0}' not found".format(transition))
        self.fire_transition(t.id, process_id, 2, ok_callback=next_command)

    def start(process_id, transition):
        t = transitions.get(transition)
        if t is None:
            raise SimulationException("Transition '{0}' not found".format(transition))
        self.fire_transition(t.id, process_id, 1, ok_callback=next_command)

    def finish(process_id):
        self.finish_transition(process_id, ok_callback=next_command, fail_callback=fail_callback)

    def receive(process_id, from_process):
        self.receive(process_id, from_process, ok_callback=next_command, fail_callback=fail_callback)

    next_command()
def read_single_file(gtmp):
    mfname = indir + '/mute-freqs/' + utils.sanitize_name(gtmp) + '.csv'
    if not os.path.exists(mfname):
        return None
    observed_counts = {}
    with open(mfname, 'r') as mutefile:
        reader = csv.DictReader(mutefile)
        for line in reader:
            pos = int(line['position'])
            assert pos not in observed_counts
            observed_counts[pos] = {n : int(line[n + '_obs']) for n in utils.nukes}
    if debug:
        print '    read %d per-base mute counts from %s' % (len(observed_counts), mfname)
    return observed_counts
def write(self, base_outdir, mean_freq_outfname):
    if not self.finalized:
        self.finalize()

    outdir = base_outdir + '/mute-freqs'
    utils.prep_dir(outdir, '*.csv')
    for gene in self.counts:
        counts, freqs, plotting_info = self.counts[gene], self.freqs[gene], self.plotting_info[gene]
        sorted_positions = sorted(counts)
        outfname = outdir + '/' + utils.sanitize_name(gene) + '.csv'
        with opener('w')(outfname) as outfile:
            nuke_header = []
            for nuke in utils.nukes:
                nuke_header.append(nuke)
                nuke_header.append(nuke + '_lo_err')
                nuke_header.append(nuke + '_hi_err')
            writer = csv.DictWriter(outfile, ('position', 'mute_freq', 'lo_err', 'hi_err') + tuple(nuke_header))
            writer.writeheader()
            for position in sorted_positions:
                row = {'position': position,
                       'mute_freq': counts[position]['freq'],
                       'lo_err': counts[position]['freq_lo_err'],
                       'hi_err': counts[position]['freq_hi_err']}
                for nuke in utils.nukes:
                    row[nuke] = freqs[position][nuke]
                    row[nuke + '_lo_err'] = freqs[position][nuke + '_lo_err']
                    row[nuke + '_hi_err'] = freqs[position][nuke + '_hi_err']
                writer.writerow(row)

    assert 'REGION' in mean_freq_outfname
    self.mean_rates['all'].write(mean_freq_outfname.replace('REGION', 'all'))  # hackey hackey hackey replacement... *sigh*
    for region in utils.regions:
        self.mean_rates[region].write(mean_freq_outfname.replace('REGION', region))
def check_hmm_existence(self, gene_list, skipped_gene_matches, parameter_dir, query_name, second_query_name=None):
    """ Check if hmm model file exists, and if not remove gene from <gene_list> and print a warning """
    # first get the list of genes for which we don't have hmm files
    if len(glob.glob(parameter_dir + '/hmms/*.yaml')) == 0:
        print 'ERROR no yamels in %s' % parameter_dir
        sys.exit()

    genes_to_remove = []
    for gene in gene_list:
        hmmfname = parameter_dir + '/hmms/' + utils.sanitize_name(gene) + '.yaml'
        if not os.path.exists(hmmfname):
            # if self.args.debug:
            #     print '    WARNING %s removed from match list for %s %s (not in %s)' % (utils.color_gene(gene), query_name, '' if second_query_name==None else second_query_name, os.path.dirname(hmmfname))
            skipped_gene_matches.add(gene)
            genes_to_remove.append(gene)

    # then remove 'em from <gene_list>
    for gene in genes_to_remove:
        gene_list.remove(gene)
def read_mute_info(indir, this_gene, approved_genes=None):
    if approved_genes == None:
        approved_genes = [this_gene, ]
    observed_freqs = {}
    # add an observation for each position, for each gene where we observed that position
    for gene in approved_genes:
        mutefname = indir + '/mute-freqs/' + utils.sanitize_name(gene) + '.csv'
        if not os.path.exists(mutefname):
            continue
        with opener('r')(mutefname) as mutefile:
            reader = csv.DictReader(mutefile)
            for line in reader:
                pos = int(line['position'])
                freq = float(line['mute_freq'])
                lo_err = float(line['lo_err'])  # NOTE lo_err in the file is really the lower *bound*
                hi_err = float(line['hi_err'])  # same deal
                assert freq >= 0.0 and lo_err >= 0.0 and hi_err >= 0.0  # you just can't be too careful
                if freq < utils.eps or abs(1.0 - freq) < utils.eps:  # if <freq> too close to 0 or 1, replace it with the midpoint of its uncertainty band
                    freq = 0.5 * (lo_err + hi_err)
                if pos not in observed_freqs:
                    observed_freqs[pos] = []
                observed_freqs[pos].append({'freq' : freq, 'err' : max(abs(freq - lo_err), abs(freq - hi_err))})

    # set final mute_freqs[pos] to the (inverse error-weighted) average over all the observations for each position
    mute_freqs = {}
    overall_total, overall_sum_of_weights = 0.0, 0.0  # also calculate the mean over all positions
    for pos in observed_freqs:
        total, sum_of_weights = 0.0, 0.0
        for obs in observed_freqs[pos]:
            assert obs['err'] > 0.0
            weight = 1.0 / obs['err']
            total += weight * obs['freq']
            sum_of_weights += weight
        assert sum_of_weights > 0.0
        mean_freq = total / sum_of_weights
        mute_freqs[pos] = mean_freq
        overall_total += total
        overall_sum_of_weights += sum_of_weights

    mute_freqs['overall_mean'] = overall_total / overall_sum_of_weights
    return mute_freqs
def write(self, outdir, mean_freq_outfname):
    if not self.finalized:
        self.finalize()

    for gene in self.counts:
        gcounts, freqs = self.counts[gene], self.freqs[gene]
        outfname = outdir + '/' + utils.sanitize_name(gene) + '.csv'
        with open(outfname, 'w') as outfile:
            nuke_header = [n + xtra for n in utils.nukes for xtra in ('', '_obs', '_lo_err', '_hi_err')]
            writer = csv.DictWriter(outfile, ('position', 'mute_freq', 'lo_err', 'hi_err') + tuple(nuke_header))
            writer.writeheader()
            for position in sorted(gcounts.keys()):
                row = {'position': position,
                       'mute_freq': freqs[position]['freq'],
                       'lo_err': freqs[position]['freq_lo_err'],
                       'hi_err': freqs[position]['freq_hi_err']}
                for nuke in utils.nukes:
                    row[nuke] = freqs[position][nuke]
                    row[nuke + '_obs'] = gcounts[position][nuke]
                    row[nuke + '_lo_err'] = freqs[position][nuke + '_lo_err']
                    row[nuke + '_hi_err'] = freqs[position][nuke + '_hi_err']
                writer.writerow(row)

    assert 'REGION' in mean_freq_outfname
    self.mean_rates['all'].write(mean_freq_outfname.replace('REGION', 'all'))  # hackey hackey hackey replacement... *sigh*
    for region in utils.regions:
        self.mean_rates[region].write(mean_freq_outfname.replace('REGION', region))
def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False): self.region = utils.get_region(gene_name) self.raw_name = gene_name # i.e. unsanitized self.germline_seqs = glfo['seqs'] # all germline alleles self.germline_seq = self.germline_seqs[self.region][ gene_name] # germline sequence for this hmm self.indir = base_indir self.args = args self.debug = debug self.codon_positions = { r: glfo[c + '-positions'] for r, c in utils.conserved_codons[args.locus].items() } # parameters with values that I more or less made up self.precision = '16' # number of digits after the decimal for probabilities self.eps = 1e-6 # NOTE I also have an eps defined in utils, and they should in principle be combined self.n_max_to_interpolate = args.min_observations_to_write self.min_mean_unphysical_insertion_length = { 'fv': 1.5, 'jf': 25 } # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths self.mute_freq_bounds = { 'lo': 0.01, 'hi': 0.5 } # don't let any position mutate less frequently than 1% of the time, or more frequently than half the time self.enforced_flat_mfreq_length = { # i.e. distance over which the mute freqs are typically screwed up. I'm not really sure why these vary so much, but it's probably to do with how the s-w step works 'v_3p' : 9, 'd_5p' : 9, 'd_3p' : 9, 'j_5p' : 20, } self.erosion_pseudocount_length = 10 # if we're closer to the end of the gene than this, make sure erosion probability isn't zero self.outdir = outdir self.smallest_entry_index = -1 # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there self.insertions = [] if self.region == 'v': self.insertions.append('fv') elif self.region == 'd': self.insertions.append('vd') elif self.region == 'j': self.insertions.append('dj') self.insertions.append('jf') assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[ 0] == 'N' # maybe need to update some stuff below if this changes if self.debug: print '%s' % utils.color_gene(gene_name) self.n_occurences = utils.read_single_gene_count( self.indir, gene_name, debug=self.debug ) # how many times did we observe this gene in data? 
approved_genes = [gene_name] # NOTE this never happens any more, since partitiondriver.cache_parameters() resets <args.min_observations_to_write> if it's arger than 10*(number of sequences) if self.n_occurences < self.args.min_observations_to_write: # if we didn't see it enough, average also over all the genes that find_replacement_genes() gives us if self.debug: print ' only saw it %d times (wanted %d), so use info from all other genes' % ( self.n_occurences, self.args.min_observations_to_write) approved_genes += utils.find_replacement_genes( self.indir, self.args.min_observations_to_write, gene_name, debug=self.debug) self.erosion_probs = self.read_erosion_info(approved_genes) self.insertion_probs, self.insertion_content_probs = self.read_insertion_info( approved_genes) self.mute_freqs = paramutils.read_mute_freqs_with_weights( self.indir, approved_genes) # weighted averages over genes self.mute_counts = paramutils.read_mute_counts( self.indir, gene_name, self.args.locus) # raw per-{ACGT} counts self.process_mutation_info( ) # smooth/interpolation/whatnot for <self.mute_freqs> and <self.mute_counts> # NOTE i'm using a hybrid approach with mute_freqs and mute_counts -- the only thing I get from mute_counts is the ratios of the different bases, whereas the actual freq comes from mute_freqs (which has all the corrections/smooth/bullshit) self.track = Track('nukes', utils.nukes) self.saniname = utils.sanitize_name(gene_name) self.hmm = HMM( self.saniname, self.track.getdict() ) # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable self.hmm.extras['gene_prob'] = max( self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name) ) # if we really didn't see this gene at all, take pity on it and kick it an eps tmp_mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv') self.hmm.extras['overall_mute_freq'] = tmp_mean_freq_hist.get_mean() self.hmm.extras['per_gene_mute_freq'] = self.mute_freqs[ 'unweighted_overall_mean'] # the other (weighted) one might be technically more accurate, depending on what you want, but it's probably not what anyone is expecting, so we write the unweighted one
def join_gene_names(gene_name_str): return ':'.join([utils.sanitize_name(g) for g in gene_name_str.split(':')])
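# join_gene_names() above just sanitizes each colon-separated gene name. utils.sanitize_name()
# isn't shown in this section; assuming it only replaces the characters that are awkward in
# file names ('*' and '/'), a minimal stand-in and example usage would be:
def sanitize_name(name):
    return name.replace('*', '_star_').replace('/', '_slash_')

print ':'.join([sanitize_name(g) for g in 'IGHV3-23*01:IGHD3-10*01'.split(':')])  # -> IGHV3-23_star_01:IGHD3-10_star_01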
def write_freqs(self, baseplotdir, baseoutdir, total_frequency=True, only_gene_name='', calculate_uncertainty=True): cvn = TCanvas("cvn", "", 1700, 600) for gene_name in self.freqs: if only_gene_name != '' and gene_name != only_gene_name: continue print ' %-20s' % (gene_name) mute_freqs = self.freqs[gene_name] sorted_positions = sorted(mute_freqs) # calculate mute freq and its uncertainty for position in sorted_positions: n_conserved, n_mutated = 0, 0 total = mute_freqs[position]['n_reads'] for nuke in utils.nukes: obs = int(round(mute_freqs[position][nuke] * total)) if nuke == mute_freqs[position]['ref']: n_conserved += obs else: n_mutated += obs # uncert = fraction_uncertainty(obs, total) # uncertainty for each nuke assert n_mutated + n_conserved == total mute_freqs[position]['mute_freq'] = float(n_mutated) / total mutated_fraction_err = (0.0, 0.0) if calculate_uncertainty: # it's kinda slow mutated_fraction_err = fraction_uncertainty(n_mutated, total) mute_freqs[position]['mute_freq_lo_err'] = mutated_fraction_err[0] mute_freqs[position]['mute_freq_hi_err'] = mutated_fraction_err[1] # write to csv outdir = baseoutdir + '/' + self.human + '/' + self.naivety + '/mute-freqs' if not os.path.exists(outdir): os.makedirs(outdir) outfname = outdir + '/' + utils.sanitize_name(gene_name) + '.csv' # TODO there's kind of starting to be a lot of differenct scripts producing inputs for recombinator. I should unify them with opener('w')(outfname) as outfile: # write out mutation freqs for use by recombinator outfile.write('position,mute_freq,lo_err,hi_err\n') for position in sorted_positions: outfile.write('%d,%f,%f,%f\n' % (position, mute_freqs[position]['mute_freq'], mute_freqs[position]['mute_freq_lo_err'],mute_freqs[position]['mute_freq_hi_err'])) # and make a plot hist = TH1F('hist_' + utils.sanitize_name(gene_name), '', sorted_positions[-1] - sorted_positions[0] + 1, sorted_positions[0] - 0.5, sorted_positions[-1] + 0.5) lo_err_hist = TH1F(hist) hi_err_hist = TH1F(hist) for position in sorted_positions: hist.SetBinContent(hist.FindBin(position), mute_freqs[position]['mute_freq']) lo_err_hist.SetBinContent(hist.FindBin(position), mute_freqs[position]['mute_freq_lo_err']) hi_err_hist.SetBinContent(hist.FindBin(position), mute_freqs[position]['mute_freq_hi_err']) hframe = TH1F(hist) hframe.SetTitle(gene_name + ';;') hframe.Reset() hframe.SetMinimum(lo_err_hist.GetMinimum() - 0.03) hframe.SetMaximum(1.1*hi_err_hist.GetMaximum()) hframe.Draw('') line = TLine(hist.GetXaxis().GetXmin(), 0., hist.GetXaxis().GetXmax(), 0.) line.SetLineColor(0) line.Draw() # can't figure out how to convince hframe not to draw a horizontal line at y=0, so... cover it up hist.SetLineColor(419) hist.Draw('same') lo_err_hist.SetLineColor(kRed+2) hi_err_hist.SetLineColor(kRed+2) lo_err_hist.SetMarkerColor(kRed+2) hi_err_hist.SetMarkerColor(kRed+2) lo_err_hist.SetMarkerStyle(22) hi_err_hist.SetMarkerStyle(23) lo_err_hist.Draw('p same') hi_err_hist.Draw('p same') plotdir = baseplotdir + '/' + self.human + '/' + self.naivety + '/plots' if not os.path.exists(plotdir): os.makedirs(plotdir) outfname = plotdir + '/' + utils.sanitize_name(gene_name) + '.png' cvn.SaveAs(outfname)
# plt.xlabel(legends.get(meth2, meth2) + ' cluster size')  # I don't know why it's reversed, it just is
# plt.ylabel(legends.get(meth1, meth1) + ' cluster size')
# ax.set_xlim(0, n_biggest_clusters)
# ax.set_ylim(0, n_biggest_clusters)
plt.title(title)

if not os.path.exists(plotdir + '/plots'):
    os.makedirs(plotdir + '/plots')
plt.savefig(plotdir + '/plots/' + plotname + '.svg')
plt.close()

# ----------------------------------------------------------------------------------------
baseplotdir = os.getenv('www') + '/tmp'
for difftype in ['indels', 'subs']:
    print difftype
    # individual primary version plots
    for pv in pversions:
        print '   ', pv
        plotheatmap(baseplotdir + '/' + difftype, utils.sanitize_name(pv), difftype, genelist=pversions[pv], title='primary version \"' + pv + '\"', xtitle=xtitles[difftype])

    # plots comparing two different primary versions
    plotheatmap(baseplotdir + '/' + difftype, 'compare-pvs', difftype, genesets=pversions, title='compare means over pairs of primary versions', xtitle=xtitles[difftype])

    check_call(['./bin/makeHtml', baseplotdir + '/' + difftype, '2', 'foop', 'svg'])

check_call(['./bin/permissify-www', baseplotdir])
def plot(self, base_plotdir, cyst_positions=None, tryp_positions=None, only_csv=False):
    if not self.finalized:
        self.finalize()

    plotdir = base_plotdir + '/mute-freqs'
    overall_plotdir = plotdir + '/overall'
    utils.prep_dir(overall_plotdir, multilings=('*.csv', '*.svg'))
    for region in utils.regions:
        utils.prep_dir(plotdir + '/' + region, multilings=('*.csv', '*.svg'))
        # utils.prep_dir(plotdir + '/' + region + '-per-base/plots', multilings=('*.csv', '*.png'))

    for gene in self.counts:
        counts, plotting_info = self.counts[gene], self.plotting_info[gene]
        sorted_positions = sorted(counts)
        genehist = Hist(sorted_positions[-1] - sorted_positions[0] + 1, sorted_positions[0] - 0.5, sorted_positions[-1] + 0.5, xtitle='fixme', ytitle='fixme')  #, title=utils.sanitize_name(gene))
        for position in sorted_positions:
            hi_diff = abs(counts[position]['freq'] - counts[position]['freq_hi_err'])
            lo_diff = abs(counts[position]['freq'] - counts[position]['freq_lo_err'])
            err = 0.5*(hi_diff + lo_diff)
            genehist.set_ibin(genehist.find_bin(position), counts[position]['freq'], error=err)
        xline = None
        figsize = [3, 3]
        if utils.get_region(gene) == 'v' and cyst_positions is not None:
            xline = cyst_positions[gene]
            figsize[0] *= 3.5
        elif utils.get_region(gene) == 'j' and tryp_positions is not None:
            xline = tryp_positions[gene]
            figsize[0] *= 2
        plotting.draw_no_root(genehist, plotdir=plotdir + '/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv)
        # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)  # needs translation to mpl

    # make mean mute freq hists
    plotting.draw_no_root(self.mean_rates['all'], plotname='all-mean-freq', plotdir=overall_plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)
    for region in utils.regions:
        plotting.draw_no_root(self.mean_rates[region], plotname=region+'-mean-freq', plotdir=overall_plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)

    if not only_csv:  # write html file and fix permissions
        plotting.make_html(overall_plotdir)
        for region in utils.regions:
            plotting.make_html(plotdir + '/' + region, n_columns=1)
def plot( self, plotdir, only_csv=False, only_overall=False, make_per_base_plots=False ): # NOTE most of the time in here is taken up by mutefrequer.finalize() (if it write() wasn't called first, that is) import plotting print ' plotting parameters in %s' % plotdir, sys.stdout.flush() start = time.time() self.clean_plots(plotdir) self.mfreqer.plot(plotdir + '/mute-freqs', only_csv=only_csv, only_overall=only_overall, make_per_base_plots=make_per_base_plots) overall_plotdir = plotdir + '/overall' for column in self.counts: if column == 'all': continue values, gene_values = {}, {} for index, count in self.counts[column].iteritems(): column_val = index[0] if column_val not in values: values[column_val] = 0.0 values[column_val] += count if column in self.columns_to_subset_by_gene: gene = index[ 1] # NOTE this is hackey, but it works find now and will fail obviously if I ever change the correlations to be incompatible. so screw it utils.split_gene(gene) # checks validity of gene if gene not in gene_values: gene_values[gene] = {} if column_val not in gene_values[gene]: gene_values[gene][column_val] = 0.0 gene_values[gene][column_val] += count var_type = 'string' if column in self.string_columns else 'int' hist = hutils.make_hist_from_dict_of_counts( values, var_type, column) plotting.draw_no_root( hist, plotname=column, plotdir=overall_plotdir, xtitle=plotconfig.xtitles.get(column, column), plottitle=plotconfig.plot_titles.get(column, column), errors=True, write_csv=True, only_csv=only_csv, stats='mean' if column in self.mean_columns else None, normalize=True) if column in self.columns_to_subset_by_gene and not only_overall: thisplotdir = plotdir + '/' + column for gene in gene_values: plotname = utils.sanitize_name(gene) + '-' + column hist = hutils.make_hist_from_dict_of_counts( gene_values[gene], var_type, plotname) plotting.draw_no_root(hist, plotname=plotname, plotdir=thisplotdir, xtitle=plotconfig.plot_titles.get( column, column), plottitle=gene, errors=True, write_csv=True, only_csv=only_csv) if not only_csv: plotting.make_html(thisplotdir) if not only_csv: plotting.make_html(overall_plotdir) print '(%.1f sec)' % (time.time() - start)
def clean(self):
    """ remove all the parameter files """
    for gene in self.counts:
        outfname = self.outdir + '/' + utils.sanitize_name(gene) + '.csv'
        os.remove(outfname)
    os.rmdir(self.outdir)
def convert_model(s1_model, s2fm_addon_folder): print(f'\033[94mWorking on {s1_model.stem} model\033[0m') s1_mdl = Mdl(s1_model) s1_mdl.read() eye_conv = EyeConverter() content_manager = ContentManager() content_manager.scan_for_content(s1_model) mod_path = get_mod_path(s1_model) rel_model_path = normalize_path(s1_model.relative_to(mod_path)) print('\033[94mCollecting materials\033[0m') s1_materials = collect_materials(s1_mdl) os.makedirs(s2fm_addon_folder / rel_model_path.with_suffix(''), exist_ok=True) eyes = eye_conv.process_mdl( s1_mdl, s2fm_addon_folder / rel_model_path.with_suffix('')) print('\033[94mDecompiling model\033[0m') model_decompiler = ModelDecompiler(s1_model) model_decompiler.decompile(remove_eyes=True) model_decompiler.save(s2fm_addon_folder / rel_model_path.with_suffix('')) s2_vmodel = (s2fm_addon_folder / rel_model_path.with_suffix('.vmdl')) os.makedirs(s2_vmodel.parent, exist_ok=True) print('\033[94mWriting VMDL\033[0m') vmdl = KV3mdl() for dmx_model in model_decompiler.dmx_models: vmdl.add_render_mesh( sanitize_name(dmx_model.mdl_model.name), normalize_path( rel_model_path.with_suffix('') / f'{Path(dmx_model.mdl_model.name).stem}.dmx')) for eyeball_name, eyeball_path in eyes: vmdl.add_render_mesh( sanitize_name(eyeball_name), normalize_path(eyeball_path.relative_to(s2fm_addon_folder))) for bone in s1_mdl.bones: if bone.procedural_rule_type == ProceduralBoneType.JIGGLE: procedural_rule = bone.procedural_rule # type:JiggleRule jiggle_type = 0 if procedural_rule.flags & JiggleRuleFlags.IS_RIGID: jiggle_type = 0 elif procedural_rule.flags & JiggleRuleFlags.IS_FLEXIBLE: jiggle_type = 1 elif procedural_rule.flags & JiggleRuleFlags.HAS_BASE_SPRING: jiggle_type = 2 jiggle_data = { "name": f"{bone.name}_jiggle", "jiggle_root_bone": bone.name, "jiggle_type": jiggle_type, 'length': procedural_rule.length, 'tip_mass': procedural_rule.tip_mass, 'has_yaw_constraint': bool(procedural_rule.flags & JiggleRuleFlags.HAS_YAW_CONSTRAINT), 'has_pitch_constraint': bool(procedural_rule.flags & JiggleRuleFlags.HAS_PITCH_CONSTRAINT), 'has_angle_constraint': bool(procedural_rule.flags & JiggleRuleFlags.HAS_ANGLE_CONSTRAINT), 'allow_flex_length ': bool(procedural_rule.flags & JiggleRuleFlags.HAS_LENGTH_CONSTRAINT), 'invert_axes': bone.position[0] < 0, 'angle_limit': math.degrees(procedural_rule.angle_limit), 'max_yaw': procedural_rule.max_yaw, 'min_yaw': procedural_rule.min_yaw, 'yaw_bounce': procedural_rule.yaw_bounce, 'yaw_damping': procedural_rule.yaw_damping or 10, 'yaw_stiffness': procedural_rule.yaw_stiffness or 10, 'yaw_friction': procedural_rule.yaw_friction or 10, 'max_pitch': procedural_rule.max_pitch, 'min_pitch': procedural_rule.min_pitch, 'pitch_bounce': procedural_rule.pitch_bounce or 10, 'pitch_damping': procedural_rule.pitch_damping or 10, 'pitch_stiffness': procedural_rule.pitch_stiffness or 10, 'pitch_friction': procedural_rule.pitch_friction or 10, 'base_left_max': procedural_rule.base_max_left, 'base_left_min': procedural_rule.base_min_left, 'base_left_friction': procedural_rule.base_left_friction, 'base_up_max': procedural_rule.base_max_up, 'base_up_min': procedural_rule.base_min_up, 'base_up_friction': procedural_rule.base_up_friction, 'base_forward_max': procedural_rule.base_min_forward, 'base_forward_min': procedural_rule.base_min_forward, 'base_forward_friction': procedural_rule.base_forward_friction, 'along_stiffness': procedural_rule.along_stiffness / 10, 'along_damping': procedural_rule.along_damping or 15, } vmdl.add_jiggle_bone(jiggle_data) for s1_bodygroup in 
s1_mdl.body_parts: if 'clamped' in s1_bodygroup.name: continue bodygroup = vmdl.add_bodygroup(sanitize_name(s1_bodygroup.name)) for mesh in s1_bodygroup.models: if len(mesh.meshes) == 0 or mesh.name == 'blank': vmdl.add_bodygroup_choice(bodygroup, []) continue vmdl.add_bodygroup_choice(bodygroup, sanitize_name(mesh.name)) reference_skin = s1_mdl.skin_groups[0] for n, skin in enumerate(s1_mdl.skin_groups[1:]): vmdl_skin = vmdl.add_skin(f'skin_{n}') for ref_mat, skin_mat in zip(reference_skin, skin): if ref_mat != skin_mat: ref_mat = get_s2_material_path(normalize_path(ref_mat), s1_materials) skin_mat = get_s2_material_path(normalize_path(skin_mat), s1_materials) if ref_mat and skin_mat: vmdl.add_skin_remap(vmdl_skin, ref_mat, skin_mat) else: print( '\033[91mFailed to create skin!\nMissing source or destination material!\033[0m' ) with s2_vmodel.open('w') as f: f.write(vmdl.dump()) print('\033[94mConverting materials\033[0m') for mat in s1_materials: mat_name = normalize_path(mat[0]) print('\033[92mConverting {}\033[0m'.format(mat_name)) result, shader = convert_material(mat, s2fm_addon_folder) if result: pass else: print(f'\033[91mUnsupported Source1 shader "{shader}"!\033[0m') return s2_vmodel
def __init__(self, base_indir, outdir, gene_name, naivety, germline_seq, args): self.indir = base_indir self.args = args # parameters with values that I more or less made up self.precision = '16' # number of digits after the decimal for probabilities self.eps = 1e-6 # NOTE I also have an eps defined in utils, and they should in principle be combined self.n_max_to_interpolate = 20 self.allow_unphysical_insertions = self.args.allow_unphysical_insertions # allow fv and jf insertions. NOTE this slows things down by a factor of 6 or so # self.allow_external_deletions = args.allow_external_deletions # allow v left and j right deletions. I.e. if your reads extend beyond v or j boundaries self.v_3p_del_pseudocount_limit = 10 # add at least one entry # self.insert_mute_prob = 0.0 # self.mean_mute_freq = 0.0 self.outdir = outdir self.region = utils.get_region(gene_name) self.naivety = naivety self.germline_seq = germline_seq self.smallest_entry_index = -1 # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)] OOPS that's not what I want to do self.insertions = [] if self.region == 'v': if self.allow_unphysical_insertions: self.insertions.append('fv') elif self.region == 'd': self.insertions.append('vd') elif self.region == 'j': self.insertions.append('dj') if self.allow_unphysical_insertions: self.insertions.append('jf') self.erosion_probs = {} self.insertion_probs = {} self.n_occurences = utils.read_overall_gene_probs( self.indir, only_gene=gene_name, normalize=False ) # how many times did we observe this gene in data? replacement_genes = None if self.n_occurences < self.args.min_observations_to_write: # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us if self.args.debug: print ' only saw it %d times, use info from other genes' % self.n_occurences replacement_genes = utils.find_replacement_genes( self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug) self.read_erosion_info( gene_name, replacement_genes) # try this exact gene, but... self.read_insertion_info(gene_name, replacement_genes) if self.naivety == 'M': # mutate if not naive self.mute_freqs = paramutils.read_mute_info( self.indir, this_gene=gene_name, approved_genes=replacement_genes) self.track = Track('nukes', list(utils.nukes)) self.saniname = utils.sanitize_name(gene_name) self.hmm = HMM( self.saniname, {'nukes': list(utils.nukes)} ) # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable self.hmm.extras['gene_prob'] = max( self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name) ) # if we really didn't see this gene at all, take pity on it and kick it an eps
def plot(self, plotdir, only_csv=False, only_overall=False): print " plotting parameters", sys.stdout.flush() start = time.time() self.clean_plots(plotdir) self.mfreqer.plot(plotdir + "/mute-freqs", only_csv=only_csv, only_overall=only_overall) overall_plotdir = plotdir + "/overall" for column in self.counts: if column == "all": continue values, gene_values = {}, {} for index, count in self.counts[column].iteritems(): column_val = index[0] if column_val not in values: values[column_val] = 0.0 values[column_val] += count if column in self.columns_to_subset_by_gene: gene = index[ 1 ] # NOTE this is hackey, but it works find now and will fail obviously if I ever change the correlations to be incompatible. so screw it utils.split_gene(gene) # checks validity of gene if gene not in gene_values: gene_values[gene] = {} if column_val not in gene_values[gene]: gene_values[gene][column_val] = 0.0 gene_values[gene][column_val] += count var_type = "string" if column in self.string_columns else "int" hist = plotting.make_hist_from_dict_of_counts(values, var_type, column, sort=True) plotting.draw_no_root( hist, plotname=column, plotdir=overall_plotdir, xtitle=plotconfig.xtitles.get(column, column), plottitle=plotconfig.plot_titles.get(column, column), errors=True, write_csv=True, only_csv=only_csv, ) if column in self.columns_to_subset_by_gene and not only_overall: thisplotdir = plotdir + "/" + column for gene in gene_values: plotname = utils.sanitize_name(gene) + "-" + column hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True) plotting.draw_no_root( hist, plotname=plotname, plotdir=thisplotdir, xtitle=plotconfig.plot_titles.get(column, column), plottitle=gene, errors=True, write_csv=True, only_csv=only_csv, ) if not only_csv: plotting.make_html(thisplotdir) if not only_csv: plotting.make_html(overall_plotdir) print "(%.1f sec)" % (time.time() - start)
def __init__(self, base_indir, outdir, gene_name, naivety, germline_seqs, args, cyst_positions, tryp_positions): self.region = utils.get_region(gene_name) self.raw_name = gene_name # i.e. unsanitized self.germline_seqs = germline_seqs # all germline alleles self.germline_seq = self.germline_seqs[self.region][gene_name] # germline sequence for this hmm self.indir = base_indir self.args = args self.cyst_positions = cyst_positions self.tryp_positions = tryp_positions # parameters with values that I more or less made up self.precision = '16' # number of digits after the decimal for probabilities self.eps = 1e-6 # NOTE I also have an eps defined in utils, and they should in principle be combined self.n_max_to_interpolate = 20 # self.allow_external_deletions = args.allow_external_deletions # allow v left and j right deletions. I.e. if your reads extend beyond v or j boundaries self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25} # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths self.erosion_pseudocount_length = 10 # if we're closer to the end of the gene than this, make sure erosion probability isn't zero # self.insert_mute_prob = 0.0 # self.mean_mute_freq = 0.0 self.outdir = outdir self.naivety = naivety self.smallest_entry_index = -1 # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)] OOPS that's not what I want to do self.insertions = [] if self.region == 'v': if not self.args.dont_allow_unphysical_insertions: self.insertions.append('fv') elif self.region == 'd': self.insertions.append('vd') elif self.region == 'j': self.insertions.append('dj') if not self.args.dont_allow_unphysical_insertions: self.insertions.append('jf') self.erosion_probs = {} self.insertion_probs = {} self.insertion_content_probs = {} self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False) # how many times did we observe this gene in data? replacement_genes = None if self.n_occurences < self.args.min_observations_to_write: # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us if self.args.debug: print ' only saw it %d times, use info from other genes' % self.n_occurences replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug) self.read_erosion_info(gene_name, replacement_genes) # try this exact gene, but... 
self.read_insertion_info(gene_name, replacement_genes) if self.naivety == 'M': # mutate if not naive self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=replacement_genes) self.track = Track('nukes', utils.nukes) self.saniname = utils.sanitize_name(gene_name) self.hmm = HMM(self.saniname, self.track.getdict()) # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name)) # if we really didn't see this gene at all, take pity on it and kick it an eps mean_freq_hist = plotting.make_hist_from_bin_entry_file(self.indir + '/all-mean-mute-freqs.csv') self.hmm.extras['overall_mute_freq'] = mean_freq_hist.GetMean()
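The fallback above (if a gene was observed fewer than min_observations_to_write times, pool information from the replacement genes instead) can be sketched in isolation. This is an illustrative reimplementation of the idea, not the actual utils.find_replacement_genes() / read_erosion_info() logic, and <per_gene_counts> is a hypothetical data structure:

# Hedged sketch of the "too few observations, so pool over replacement genes" fallback.
# <per_gene_counts> is a hypothetical {gene: {value: count}} dict, not the project's real structure.
def pooled_counts(gene, per_gene_counts, replacement_genes, min_obs):
    n_obs = sum(per_gene_counts.get(gene, {}).values())
    if n_obs >= min_obs:
        return dict(per_gene_counts[gene])  # enough observations: use the per-gene counts directly
    pooled = {}
    for rgene in replacement_genes:  # otherwise sum counts over the replacement genes
        for val, count in per_gene_counts.get(rgene, {}).items():
            pooled[val] = pooled.get(val, 0) + count
    return pooled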
def scrape_book_data(driver, book_url, match_language="", category={"label": "Uncategorized"}, force=False): # check if this book has already been dumped, unless we are forcing # scraping, if so return the content of the dump, alongside a flag # saying it already existed if os.path.exists(get_book_dump_filename(book_url)) and not force: log.debug(f"Json dump for book {book_url} already exists, skipping " "scraping...") with open(get_book_dump_filename(book_url)) as f: return json.load(f), True # if not, proceed scraping the reader page log.info(f"Scraping book at {book_url}") if "/nc/reader/" not in book_url: book_url = book_url.replace("/books/", "/nc/reader/") if not driver.current_url == book_url: driver.get(book_url) # check for re-direct to the upgrade page detect_needs_upgrade(driver) reader = driver.find_element_by_class_name("reader__container") # get the book's metadata from the blinkist API using its ID book_id = reader.get_attribute("data-book-id") book_json = requests.get( url=f"https://api.blinkist.com/v4/books/{book_id}").json() book = book_json["book"] if match_language and book["language"] != match_language: log.warning( f"Book not available in the selected language ({match_language}), " "skipping scraping...") return None, False # sanitize the book's title and author since they will be used for paths # and such book["title"] = sanitize_name(book["title"]) book["author"] = sanitize_name(book["author"]) # check if the book's metadata already has chapter content # (this is the case for the free book of the day) json_needs_content = False for chapter_json in book["chapters"]: if "text" not in chapter_json: json_needs_content = True break else: # change the text content key name for compatibility with the # script methods chapter_json["content"] = chapter_json.pop("text") if json_needs_content: # scrape the chapter's content on the reader page # and extend the book json data by inserting the scraped content # in the appropriate chapter section to get a complete data file book_chapters = driver.find_elements(By.CSS_SELECTOR, ".chapter.chapter") for chapter in book_chapters: chapter_no = chapter.get_attribute("data-chapterno") chapter_content = chapter.find_element_by_class_name( "chapter__content") for chapter_json in book["chapters"]: if chapter_json["order_no"] == int(chapter_no): chapter_json["content"] = chapter_content.get_attribute( "innerHTML") break # look for any supplement sections book_supplements = driver.find_elements(By.CSS_SELECTOR, ".chapter.supplement") for supplement in book_supplements: chapter_no = supplement.get_attribute("data-chapterno") supplement_content = supplement.find_element_by_class_name( "chapter__content") for chapter_json in book["chapters"]: if chapter_json["order_no"] == int(chapter_no): if not chapter_json.get("supplement", None): supplement_text = supplement_content.get_attribute( "innerHTML") chapter_json["supplement"] = supplement_text break # if we are scraping by category, add it to the book metadata book["category"] = category["label"] # store the book json metadata for future use dump_book(book) # return a tuple with the book json metadata, and a boolean indicating # whether the json dump already existed or not return book, False
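A hedged usage example for scrape_book_data(): it assumes a selenium driver that is already logged in to Blinkist and that the surrounding module (log, requests, dump_book, etc.) is importable; the URL below is made up.

# Hedged usage example (assumes a logged-in selenium session and the helpers used above).
from selenium import webdriver

driver = webdriver.Chrome()
try:
    # ... log in to blinkist with the driver here ...
    book, dump_existed = scrape_book_data(
        driver, "https://www.blinkist.com/en/nc/reader/example-book-url")  # hypothetical URL
    if book is not None:
        print("scraped '{}' by {} ({} chapters)".format(
            book["title"], book["author"], len(book["chapters"])))
finally:
    driver.quit()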
germlines = utils.read_germlines('../../../recombinator') reader = csv.DictReader(infile) for inline in reader: print 'searching' # inline['seq'] = inline['seq'][-130:] searcher = Searcher(inline['seq'], debug=True, n_matches_max=2) searcher.search() inferred_group_str = '' true_group_str = '' outline = {} outline['seq'] = inline['seq'] print 'RESULT ', for region in utils.regions: inferred_name = searcher.get_best_match_name(region) outline[region + '_gene'] = utils.unsanitize_name(inferred_name) true_name = utils.sanitize_name(inline[region + '_gene']) inferred_group_str += inferred_name true_group_str += true_name if inferred_name == 'none': print ' none', elif inferred_name == true_name: print ' - ', else: print ' x ', for region in utils.regions: print '%3d' % searcher.n_tries[region], print '' print ' true' utils.print_reco_event(germlines, inline, -1, -1) if searcher.all_matched():
def plot(self, plotdir, only_csv=False, only_overall=False): if not self.finalized: self.finalize() overall_plotdir = plotdir + '/overall' for gene in self.freqs: if only_overall: continue freqs = self.freqs[gene] if len(freqs) == 0: if gene not in glutils.dummy_d_genes.values(): print '  %s no mutefreqer obs for %s' % (utils.color('red', 'warning'), utils.color_gene(gene)) continue sorted_positions = sorted(freqs.keys()) genehist = Hist(sorted_positions[-1] - sorted_positions[0] + 1, sorted_positions[0] - 0.5, sorted_positions[-1] + 0.5, xtitle='position', ytitle='mut freq', title=gene) for position in sorted_positions: hi_diff = abs(freqs[position]['freq'] - freqs[position]['freq_hi_err']) lo_diff = abs(freqs[position]['freq'] - freqs[position]['freq_lo_err']) err = 0.5*(hi_diff + lo_diff) genehist.set_ibin(genehist.find_bin(position), freqs[position]['freq'], error=err) xline = None figsize = [7, 4] if utils.get_region(gene) in utils.conserved_codons[self.glfo['chain']]: codon = utils.conserved_codons[self.glfo['chain']][utils.get_region(gene)] xline = self.glfo[codon + '-positions'][gene] if utils.get_region(gene) == 'v': figsize[0] *= 3.5 elif utils.get_region(gene) == 'j': figsize[0] *= 2 plotting.draw_no_root(self.per_gene_mean_rates[gene], plotdir=plotdir + '/per-gene/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, only_csv=only_csv, shift_overflows=True) # per-position plots: plotting.draw_no_root(genehist, plotdir=plotdir + '/per-gene-per-position/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv, shift_overflows=True) # # per-position, per-base plots: # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info) # needs translation to mpl UPDATE fcn is fixed, but I can't be bothered uncommenting this at the moment # make mean mute freq hists for rstr in ['all', 'cdr3'] + utils.regions: if rstr == 'all': bounds = (0.0, 0.4) else: bounds = (0.0, 0.6 if rstr == 'd' else 0.4) plotting.draw_no_root(self.mean_rates[rstr], plotname=rstr+'_mean-freq', plotdir=overall_plotdir, stats='mean', bounds=bounds, write_csv=True, only_csv=only_csv, shift_overflows=True) plotting.draw_no_root(self.mean_n_muted[rstr], plotname=rstr+'_mean-n-muted', plotdir=overall_plotdir, stats='mean', write_csv=True, only_csv=only_csv, shift_overflows=True) if not only_csv: # write html file and fix permissions for substr in self.subplotdirs: plotting.make_html(plotdir + '/' + substr)
def plot(self, plotdir, only_csv=False, only_overall=False): import plotting if not self.finalized: self.finalize() overall_plotdir = plotdir + '/overall' for gene in self.freqs: if only_overall: continue freqs = self.freqs[gene] if len(freqs) == 0: if gene not in glutils.dummy_d_genes.values(): print '  %s no mutefreqer obs for %s' % (utils.color( 'red', 'warning'), utils.color_gene(gene)) continue sorted_positions = sorted(freqs.keys()) genehist = Hist(sorted_positions[-1] - sorted_positions[0] + 1, sorted_positions[0] - 0.5, sorted_positions[-1] + 0.5, xtitle='position', ytitle='mut freq', title=gene) for position in sorted_positions: hi_diff = abs(freqs[position]['freq'] - freqs[position]['freq_hi_err']) lo_diff = abs(freqs[position]['freq'] - freqs[position]['freq_lo_err']) err = 0.5 * (hi_diff + lo_diff) genehist.set_ibin(genehist.find_bin(position), freqs[position]['freq'], error=err) xline = None figsize = [7, 4] if utils.get_region(gene) in utils.conserved_codons[ self.glfo['locus']]: xline = utils.cdn_pos(self.glfo, utils.get_region(gene), gene) if utils.get_region(gene) == 'v': figsize[0] *= 3.5 elif utils.get_region(gene) == 'j': figsize[0] *= 2 plotting.draw_no_root(self.per_gene_mean_rates[gene], plotdir=plotdir + '/per-gene/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, only_csv=only_csv, shift_overflows=True) # per-position plots: plotting.draw_no_root(genehist, plotdir=plotdir + '/per-gene-per-position/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv, shift_overflows=True) # # per-position, per-base plots: # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info) # needs translation to mpl UPDATE fcn is fixed, but I can't be bothered uncommenting this at the moment # make mean mute freq hists for rstr in ['all', 'cdr3'] + utils.regions: if rstr == 'all': bounds = (0.0, 0.4) else: bounds = (0.0, 0.6 if rstr == 'd' else 0.4) plotting.draw_no_root(self.mean_rates[rstr], plotname=rstr + '_mean-freq', plotdir=overall_plotdir, stats='mean', bounds=bounds, write_csv=True, only_csv=only_csv, shift_overflows=True) plotting.draw_no_root(self.mean_n_muted[rstr], plotname=rstr + '_mean-n-muted', plotdir=overall_plotdir, stats='mean', write_csv=True, only_csv=only_csv, shift_overflows=True) if not only_csv: # write html file and fix permissions for substr in self.subplotdirs: plotting.make_html(plotdir + '/' + substr)
def plot(self, plotdir, subset_by_gene=False, cyst_positions=None, tryp_positions=None, only_csv=False): print '  plotting parameters', sys.stdout.flush() start = time.time() self.clean_plots(plotdir, subset_by_gene) self.mfreqer.plot(plotdir + '/mute-freqs', cyst_positions, tryp_positions, only_csv=only_csv) #, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv') # REGION is replaced by each region in the three output files overall_plotdir = plotdir + '/overall' for column in self.counts: if column == 'all': continue values, gene_values = {}, {} if len(self.counts[column]) == 0: raise Exception('no counts in %s' % column) for index, count in self.counts[column].iteritems(): gene = None if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'): # option to subset deletion and (real) insertion plots by gene if '_del' in column: region = column[0] else: region = column[1] assert region in utils.regions assert 'IGH' + region.upper() in index[1] # NOTE this is hackey, but it works fine now and will fail obviously gene = index[1] # if I ever change the correlations to be incompatible. so screw it if gene not in gene_values: gene_values[gene] = {} column_val = index[0] if gene is not None: if column_val not in gene_values[gene]: gene_values[gene][column_val] = 0.0 gene_values[gene][column_val] += count if column_val not in values: values[column_val] = 0.0 values[column_val] += count try: # figure out whether this is an integer or string (only used outside this loop when we make the plots) int(column_val) var_type = 'int' except: var_type = 'string' if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'): # option to subset deletion and (real) insertion plots by gene thisplotdir = plotdir + '/' + column for gene in gene_values: plotname = utils.sanitize_name(gene) + '-' + column hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True) plotting.draw_no_root(hist, plotname=plotname, plotdir=thisplotdir, errors=True, write_csv=True, only_csv=only_csv) if not only_csv: plotting.make_html(thisplotdir) plotname = column hist = plotting.make_hist_from_dict_of_counts(values, var_type, plotname, sort=True) plotting.draw_no_root(hist, plotname=plotname, plotdir=overall_plotdir, errors=True, write_csv=True, only_csv=only_csv) if not only_csv: plotting.make_html(overall_plotdir) print '(%.1f sec)' % (time.time()-start)
def _export_control_sequence(self, sequence): VERSION = "1.0" TYPE = "transition_id" if self.export_transition_id else "transition_name" if sequence: dialog = gtk.FileChooserDialog("Export Control Sequence", self.app.window, gtk.FILE_CHOOSER_ACTION_SAVE, (gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL, gtk.STOCK_SAVE, gtk.RESPONSE_OK)) dialog.set_default_response(gtk.RESPONSE_OK) dialog.set_current_name("{0}.kcs.xml".format(sequence.name)) skcs_filter = gtk.FileFilter() # Kaira Control Sequence skcs_filter.set_name("Control Sequence (.kcs.xml)") dialog.add_filter(skcs_filter) try: response = dialog.run() filename = dialog.get_filename() finally: dialog.destroy() net = self.project.build_net transitions = {} for t in net.transitions(): transitions["#{0}".format(t.id)] = t for t in net.transitions(): transitions[utils.sanitize_name(t.get_name())] = t running_transitions = {} if response == gtk.RESPONSE_OK: cmdlines = "\n" for command in sequence.commands: match = command_parser.match(command) if match is None: raise ControlSequenceException("Invalid format: ", command) process = int(match.group("process")) action = match.group("action") if action == "T" or action == "S": arg = match.group("arg_int") if arg is None: arg = match.group("arg_str") if not transitions.has_key(arg): raise ControlSequenceException( "Transition '{0}' not found.".format(arg)) t = transitions[arg] if self.export_transition_id: tid = t.id else: tid = utils.sanitize_name(t.get_name_or_id()) cmdlines += "{0} {1} {2}\n".format(process, action, tid) if action == "S": if running_transitions.has_key(process): running_transitions[process].append(t.id) else: running_transitions[process] = [t.id] elif action == "R": arg_int = match.group("arg_int") if arg_int is None: raise ControlSequenceException("Invalid format of receive.") cmdlines += "{0}\n".format(command) else: assert action == "F" if not running_transitions.has_key(process) or \ not running_transitions[process]: raise ControlSequenceException( "Invalid sequence. Transition fire action is missing.") tid = running_transitions[process].pop() cmdlines += "{0} {1} {2}\n".format(process, action, tid) element = xml.Element("sequence") element.set("name", sequence.name) element.set("type", TYPE) element.set("version", VERSION) element.text = cmdlines tree = xml.ElementTree(element) tree.write(filename)
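For reference, the exported .kcs.xml file is a single <sequence> element whose attributes are set above and whose text is the accumulated command lines. A minimal sketch of that output shape, with made-up values:

# Hedged illustration of the file written above (names and command lines are made up).
import xml.etree.ElementTree as xml

element = xml.Element("sequence")
element.set("name", "example-sequence")   # corresponds to sequence.name above
element.set("type", "transition_name")    # or "transition_id", depending on export_transition_id
element.set("version", "1.0")
element.text = "\n0 T transition_a\n1 S transition_b\n1 F transition_b\n"  # "<process> <action> <transition>" lines
xml.ElementTree(element).write("example-sequence.kcs.xml")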
plt.xticks(ticks, xticklabels, rotation=90) plt.yticks(ticks, yticklabels) # plt.xlabel(legends.get(meth2, meth2) + ' cluster size') # I don't know why it's reversed, it just is # plt.ylabel(legends.get(meth1, meth1) + ' cluster size') # ax.set_xlim(0, n_biggest_clusters) # ax.set_ylim(0, n_biggest_clusters) plt.title(title) if not os.path.exists(plotdir + '/plots'): os.makedirs(plotdir + '/plots') plt.savefig(plotdir + '/plots/' + plotname + '.svg') plt.close() # ---------------------------------------------------------------------------------------- baseplotdir = os.getenv('www') + '/tmp' for difftype in ['indels', 'subs']: print difftype # individual primary version plots for pv in pversions: print ' ', pv plotheatmap(baseplotdir + '/' + difftype, utils.sanitize_name(pv), difftype, genelist=pversions[pv], title='primary version \"' + pv + '\"', xtitle=xtitles[difftype]) # plots comparing two different primary versions plotheatmap(baseplotdir + '/' + difftype, 'compare-pvs', difftype, genesets=pversions, title='compare means over pairs of primary versions', xtitle=xtitles[difftype]) check_call(['./bin/makeHtml', baseplotdir + '/' + difftype, '2', 'foop', 'svg'])
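plotheatmap() itself isn't shown above, only its tick/label/save tail and the driver loop; as a hedged sketch, the core of such a plot with matplotlib could look like this (function name and inputs are illustrative, not the original code):

# Hedged sketch of the kind of labeled heatmap the tick/label code above decorates.
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

def simple_heatmap(matrix, xticklabels, yticklabels, title, plotfname):
    fig, ax = plt.subplots()
    im = ax.imshow(np.asarray(matrix), interpolation='nearest')  # one cell per (row, column) pair
    plt.xticks(range(len(xticklabels)), xticklabels, rotation=90)
    plt.yticks(range(len(yticklabels)), yticklabels)
    plt.title(title)
    fig.colorbar(im)
    plt.savefig(plotfname)
    plt.close()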
def read_mute_freqs_with_weights( indir, approved_genes, debug=False ): # it would be nice to eventually align the genes before combining # returns: # - mute_freqs: inverse error-weighted average mute freq over all genes for each position # - also includes weighted and unweighted means over positions if len(approved_genes) == 0: raise Exception('no approved genes') if approved_genes[0] == glutils.dummy_d_genes[utils.get_locus( approved_genes[0])]: return {'overall_mean': 0.5, 'unweighted_overall_mean': 0.5} if debug: print '    reading mute freqs from %s for %d gene%s: %s' % ( indir, len(approved_genes), utils.plural( len(approved_genes)), utils.color_genes(approved_genes)) # add an observation for each position, for each gene where we observed that position NOTE this would be more sensible if they were aligned first observed_freqs = {} for gene in approved_genes: mutefname = indir + '/mute-freqs/' + utils.sanitize_name(gene) + '.csv' if not os.path.exists(mutefname): continue with open(mutefname, 'r') as mutefile: reader = csv.DictReader(mutefile) for line in reader: pos = int(line['position']) freq = float(line['mute_freq']) lo_err = float( line['lo_err'] ) # NOTE lo_err in the file is really the lower *bound* hi_err = float(line['hi_err']) # same deal assert freq >= 0.0 and lo_err >= 0.0 and hi_err >= 0.0 # you just can't be too careful if freq < utils.eps or abs( 1.0 - freq ) < utils.eps: # if <freq> too close to 0 or 1, replace it with the midpoint of its uncertainty band freq = 0.5 * (lo_err + hi_err) if pos not in observed_freqs: observed_freqs[pos] = [] observed_freqs[pos].append({ 'freq': freq, 'err': max(abs(freq - lo_err), abs(freq - hi_err)) }) # append one for each gene # set final mute_freqs[pos] to the (inverse error-weighted) average over all the observations [i.e. genes] for each position mute_freqs = {} for pos in observed_freqs: total, sum_of_weights = 0.0, 0.0 for obs in observed_freqs[pos]: # loop over genes assert obs['err'] > 0.0 weight = 1.0 / obs['err'] total += weight * obs['freq'] sum_of_weights += weight assert sum_of_weights > 0.0 mean_freq = total / sum_of_weights mute_freqs[pos] = mean_freq # NOTE I'm sure that this weighting scheme makes sense for comparing differing genes at the same position, but I'm less sure it makes sense for the overall mean. But, I don't want to track down all the places that changing it might affect right now mute_freqs['overall_mean'] = 0. weighted_denom = sum([ 1. / obs['err'] for pos in observed_freqs for obs in observed_freqs[pos] ]) if weighted_denom > 0.: mute_freqs['overall_mean'] = sum([ obs['freq'] / obs['err'] for pos in observed_freqs for obs in observed_freqs[pos] ]) / weighted_denom # I need the inverse-error-weighted numbers to sensibly combine genes, but then I also need unweighted values that I can easily write to the yaml files for other people to use mute_freqs['unweighted_overall_mean'] = 0. unweighted_denom = sum( [len(observed_freqs[pos]) for pos in observed_freqs]) if unweighted_denom > 0.: mute_freqs['unweighted_overall_mean'] = sum([ obs['freq'] for pos in observed_freqs for obs in observed_freqs[pos] ]) / unweighted_denom if debug: iskipstart = 35 # i.e. for v genes skip the middle positions positions = sorted(observed_freqs) if len(positions) > 2 * iskipstart: print '      %s%s%s' % (' '.join([ ('%4d' % p) for p in positions[:iskipstart] ]), utils.color('blue', ' [...] 
'), ' '.join([ ('%4d' % p) for p in positions[len(positions) - iskipstart:] ])) print ' %s%s%s' % (' '.join([ ('%4.2f' % mute_freqs[p]) for p in positions[:iskipstart] ]), utils.color('blue', ' [...] '), ' '.join( [('%4.2f' % mute_freqs[p]) for p in positions[len(positions) - iskipstart:]])) else: print ' %s' % ' '.join([('%4d' % p) for p in positions]) print ' %s' % ' '.join([('%4.2f' % mute_freqs[p]) for p in positions]) print ' overall mean: %5.3f (unweighted %5.3f)' % ( mute_freqs['overall_mean'], mute_freqs['unweighted_overall_mean']) return mute_freqs
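The per-position averaging above weights each gene's observation by the inverse of its uncertainty, so better-measured genes count for more. A minimal self-contained sketch of that weighting (names here are illustrative):

# Illustrative sketch of the inverse-error weighting used above: each gene's frequency
# at a position is weighted by 1/err, so observations with smaller uncertainty dominate.
def inverse_error_weighted_mean(observations):
    """<observations>: list of {'freq': f, 'err': e} dicts, with every e > 0."""
    total, sum_of_weights = 0.0, 0.0
    for obs in observations:
        assert obs['err'] > 0.0
        weight = 1.0 / obs['err']
        total += weight * obs['freq']
        sum_of_weights += weight
    return total / sum_of_weights

# e.g. a well-measured gene at 0.10 +/- 0.01 dominates a poorly-measured one at 0.30 +/- 0.10:
# inverse_error_weighted_mean([{'freq': 0.10, 'err': 0.01}, {'freq': 0.30, 'err': 0.10}]) ~= 0.118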
def read_mute_info( indir, this_gene, approved_genes=None ): # NOTE this would probably be more accurate if we made some effort to align the genes before combining all the approved ones if approved_genes == None: approved_genes = [ this_gene, ] observed_freqs, observed_counts = {}, {} total_counts = 0 # add an observation for each position, for each gene where we observed that position for gene in approved_genes: mutefname = indir + '/mute-freqs/' + utils.sanitize_name(gene) + '.csv' if not os.path.exists(mutefname): continue with opener('r')(mutefname) as mutefile: reader = csv.DictReader(mutefile) for line in reader: pos = int(line['position']) freq = float(line['mute_freq']) lo_err = float( line['lo_err'] ) # NOTE lo_err in the file is really the lower *bound* hi_err = float(line['hi_err']) # same deal assert freq >= 0.0 and lo_err >= 0.0 and hi_err >= 0.0 # you just can't be too careful if freq < utils.eps or abs( 1.0 - freq ) < utils.eps: # if <freq> too close to 0 or 1, replace it with the midpoint of its uncertainty band freq = 0.5 * (lo_err + hi_err) if pos not in observed_freqs: observed_freqs[pos] = [] observed_counts[pos] = {n: 0 for n in utils.nukes} observed_freqs[pos].append({ 'freq': freq, 'err': max(abs(freq - lo_err), abs(freq - hi_err)) }) for nuke in utils.nukes: observed_counts[pos][nuke] += int(line[nuke + '_obs']) total_counts += int(line[nuke + '_obs']) # set final mute_freqs[pos] to the (inverse error-weighted) average over all the observations for each position mute_freqs = {} overall_total, overall_sum_of_weights = 0.0, 0.0 # also calculate the mean over all positions for pos in observed_freqs: total, sum_of_weights = 0.0, 0.0 for obs in observed_freqs[pos]: assert obs['err'] > 0.0 weight = 1.0 / obs['err'] total += weight * obs['freq'] sum_of_weights += weight assert sum_of_weights > 0.0 mean_freq = total / sum_of_weights mute_freqs[pos] = mean_freq overall_total += total overall_sum_of_weights += sum_of_weights mute_freqs['overall_mean'] = 0. if overall_sum_of_weights > 0.: mute_freqs['overall_mean'] = overall_total / overall_sum_of_weights observed_counts['total_counts'] = total_counts return mute_freqs, observed_counts
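Both read_mute_info() and read_mute_freqs_with_weights() parse the same per-gene csv layout: one row per position with 'position', 'mute_freq', 'lo_err', 'hi_err', plus one '<nuke>_obs' column per base. A hedged example of writing a file in that shape (column names taken from the readers above; values, base ordering, and filename are made up):

# Hedged sketch of the per-position csv layout consumed above (all values are made up).
import csv

nukes = ['A', 'C', 'G', 'T']  # assumption: same ordering as utils.nukes
fieldnames = ['position', 'mute_freq', 'lo_err', 'hi_err'] + [n + '_obs' for n in nukes]
rows = [
    {'position': 0, 'mute_freq': 0.02, 'lo_err': 0.01, 'hi_err': 0.04, 'A_obs': 48, 'C_obs': 1, 'G_obs': 0, 'T_obs': 1},
    {'position': 1, 'mute_freq': 0.10, 'lo_err': 0.06, 'hi_err': 0.16, 'A_obs': 2, 'C_obs': 45, 'G_obs': 3, 'T_obs': 0},
]
with open('IGHV1-18_star_01.csv', 'w') as mutefile:  # hypothetical sanitized gene name
    writer = csv.DictWriter(mutefile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)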
def finalize(self, calculate_uncertainty=True): """ convert from counts to mut freqs """ assert not self.finalized self.n_cached, self.n_not_cached = 0, 0 for gene in self.counts: self.freqs[gene], self.plotting_info[gene] = {}, [] # NOTE <counts> hold the overall (not per-base) frequencies, while <freqs> holds the per-base frequencies counts, freqs, plotting_info = self.counts[gene], self.freqs[ gene], self.plotting_info[gene] sorted_positions = sorted(counts) for position in sorted_positions: freqs[position] = {} plotting_info.append({}) plotting_info[-1]['name'] = utils.sanitize_name( gene) + '_' + str(position) plotting_info[-1]['nuke_freqs'] = {} n_conserved, n_mutated = 0, 0 for nuke in utils.nukes: nuke_freq = float( counts[position][nuke]) / counts[position]['total'] freqs[position][nuke] = nuke_freq plotting_info[-1]['nuke_freqs'][nuke] = nuke_freq if calculate_uncertainty: # it's kinda slow errs = fraction_uncertainty.err( counts[position][nuke], counts[position]['total']) if errs[2]: self.n_cached += 1 else: self.n_not_cached += 1 # print nuke_freq, errs[0], errs[1], '(', counts[position][nuke], ',', counts[position]['total'], ')' assert errs[ 0] <= nuke_freq # these checks are probably unnecessary. EDIT and totally saved my ass about ten minutes after writing the previous statement assert nuke_freq <= errs[1] freqs[position][nuke + '_lo_err'] = errs[0] freqs[position][nuke + '_hi_err'] = errs[1] if nuke == counts[position]['gl_nuke']: n_conserved += counts[position][nuke] else: n_mutated += counts[position][nuke] # sum over A,C,G,T # uncert = fraction_uncertainty.err(obs, total) # uncertainty for each nuke counts[position]['freq'] = float( n_mutated) / counts[position]['total'] mutated_fraction_err = (0.0, 0.0) if calculate_uncertainty: # it's kinda slow mutated_fraction_err = fraction_uncertainty.err( n_mutated, counts[position]['total']) if mutated_fraction_err[2]: self.n_cached += 1 else: self.n_not_cached += 1 counts[position]['freq_lo_err'] = mutated_fraction_err[0] counts[position]['freq_hi_err'] = mutated_fraction_err[1] self.mean_rates['all'].normalize( overflow_warn=False ) # we expect overflows in mute freq hists, so no need to warn us for region in utils.regions: self.mean_rates[region].normalize(overflow_warn=False) # for gene in self.tmpcounts: # for position in self.tmpcounts[gene]: # self.tmpcounts[gene][position]['muted'].divide_by(self.tmpcounts[gene][position]['total'], debug=False) self.finalized = True
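fraction_uncertainty.err() isn't shown here; from the way it's used above it returns (lower, upper, was_cached) with lower <= obs/total <= upper. A crude, hedged stand-in using a normal-approximation binomial interval (the real module may well compute something different, e.g. a Bayesian or Clopper-Pearson interval):

# Hedged stand-in for fraction_uncertainty.err(): bounds on the fraction obs/total.
# Only the properties relied on above are reproduced: lower <= obs/total <= upper,
# plus a trailing flag mimicking the "was it cached" element.
import math

def fraction_err(obs, total):
    frac = float(obs) / total
    # crude floor keeps the interval from collapsing to zero width when obs is 0 or total
    sigma = math.sqrt(max(frac * (1.0 - frac), 1.0 / total) / total)
    lower = max(0.0, frac - sigma)
    upper = min(1.0, frac + sigma)
    return lower, upper, False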