def write_glfo(output_dir, glfo, only_genes=None, debug=False):
    """
    Write the germline info in <glfo> to the directory <output_dir>/<glfo['chain']>.

    One fasta file is written for each name from glfo_fasta_fnames(), and one csv (columns
    'gene', 'istart') for each name from glfo_csv_fnames(). If <only_genes> is not None, genes
    that are not in it are left out of every file. If the chain directory already exists it is
    first cleared with remove_glfo_files().

    Raises Exception if, after writing, the chain directory holds any path that is not among
    the names returned by glfo_fnames().
    """
    def is_wanted(gene):
        return only_genes is None or gene in only_genes

    if debug:
        restriction_str = '' if only_genes is None else (' (restricting to %d genes)' % len(only_genes))
        print(' writing glfo to %s%s' % (output_dir, restriction_str))

    chain = glfo['chain']
    chain_dir = output_dir + '/' + chain
    if os.path.exists(chain_dir):
        remove_glfo_files(output_dir, chain)  # also removes output_dir
    os.makedirs(chain_dir)

    # sequences: one fasta file per name from glfo_fasta_fnames()
    for fasta_name in glfo_fasta_fnames(chain):
        with open(chain_dir + '/' + fasta_name, 'w') as fastafile:
            region_seqs = glfo['seqs'][utils.get_region(fasta_name)]
            for gene in region_seqs:
                if is_wanted(gene):
                    fastafile.write('>' + gene + '\n')
                    fastafile.write(region_seqs[gene] + '\n')

    # codon positions: one csv file per name from glfo_csv_fnames()
    for csv_name in glfo_csv_fnames():
        with open(chain_dir + '/' + csv_name, 'w') as codonfile:
            writer = csv.DictWriter(codonfile, ('gene', 'istart'))
            writer.writeheader()
            positions = glfo[utils.get_codon(csv_name) + '-positions']
            for gene, istart in positions.items():
                if is_wanted(gene):
                    writer.writerow({'gene' : gene, 'istart' : istart})

    # make sure there weren't any files lingering in the output dir when we started
    # NOTE this will ignore the dirs corresponding to any *other* chains (which is what we want now, I think)
    found_paths = set(glob.glob(chain_dir + '/*'))
    expected_paths = set(chain_dir + '/' + fn for fn in glfo_fnames(chain))
    unexpected_files = found_paths - expected_paths
    if unexpected_files:
        raise Exception('unexpected file(s) while writing germline set: %s' % (' '.join(unexpected_files)))
def write_glfo(output_dir, glfo, only_genes=None, debug=False):
    """
    Write the germline info in <glfo> to the directory <output_dir>/<glfo['locus']>.

    A fasta file is written for each name from glfo_fasta_fnames(), followed by a single csv
    (<extra_fname>, header <csv_headers>) with one row per gene per conserved codon. If
    <only_genes> is not None, genes that are not in it are skipped. An existing locus directory
    is first cleared with remove_glfo_files().

    Raises Exception if the locus directory ends up containing anything other than the names
    returned by glfo_fnames().
    """
    if debug:
        if only_genes is None:
            restriction_str = ''
        else:
            restriction_str = ' (restricting to %d genes)' % len(only_genes)
        print(' writing glfo to %s%s' % (output_dir, restriction_str))

    locus = glfo['locus']
    locus_dir = output_dir + '/' + locus
    if os.path.exists(locus_dir):
        remove_glfo_files(output_dir, locus)  # also removes output_dir
    os.makedirs(locus_dir)

    for fasta_name in glfo_fasta_fnames(locus):
        with open(locus_dir + '/' + fasta_name, 'w') as fastafile:
            region_seqs = glfo['seqs'][utils.get_region(fasta_name)]
            for gene in region_seqs:
                if only_genes is not None and gene not in only_genes:
                    continue
                fastafile.write('>' + gene + '\n')
                fastafile.write(region_seqs[gene] + '\n')

    with open(locus_dir + '/' + extra_fname, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, csv_headers)
        writer.writeheader()
        # only the codon names are needed here, not the regions they belong to
        for codon in utils.conserved_codons[locus].values():
            for gene, istart in glfo[codon + '-positions'].items():
                if only_genes is None or gene in only_genes:
                    writer.writerow({'gene' : gene, codon + '_position' : istart})

    # make sure there weren't any files lingering in the output dir when we started
    # NOTE this will ignore the dirs corresponding to any *other* loci (which is what we want now, I think)
    found_paths = set(glob.glob(locus_dir + '/*'))
    expected_paths = set([locus_dir + '/' + fn for fn in glfo_fnames(locus)])
    unexpected_files = found_paths - expected_paths
    if len(unexpected_files) > 0:
        raise Exception('unexpected file(s) while writing germline set: %s' % (' '.join(unexpected_files)))
def add_some_snps(snps_to_add, glfo, remove_template_genes=False, debug=False):
    """
    Generate some snp'd genes and add them to glfo, specified with <snps_to_add>.
    e.g. [{'gene' : 'IGHV3-71*01', 'positions' : (35, None)}, ] will add a snp at position 35 and at a random location.
    The resulting snp'd gene will have a name like IGHV3-71*01+C35T.T47G

    If <remove_template_genes>, the template genes are collected and removed in one go after all
    of the new alleles have been added. Raises Exception if 100 attempts in a row produce a gene
    name that is already in <glfo>.
    """
    templates_to_remove = set()

    for snpinfo in snps_to_add:
        gene = snpinfo['gene']
        positions = snpinfo['positions']
        print(' adding %d %s to %s' % (len(positions), utils.plural_str('snp', len(positions)), gene))
        seq = glfo['seqs'][utils.get_region(gene)][gene]
        assert utils.get_region(gene) == 'v'
        cpos = glfo['cyst-positions'][gene]

        # keep generating until we get a gene name that isn't already in glfo
        n_tries = 0
        while True:
            if n_tries > 0:
                print(' already in glfo, try again')
            if n_tries > 99:
                raise Exception('too many tries while trying to generate new snps -- did you specify a lot of snps on the same position?')
            snpfo = generate_snpd_gene(gene, cpos, seq, positions)
            n_tries += 1
            if snpfo['gene'] not in glfo['seqs'][utils.get_region(gene)]:
                break

        if remove_template_genes:
            templates_to_remove.add(gene)
        # *don't* remove the templates here, since we don't know if there's another snp later that needs them
        add_new_allele(glfo, snpfo, remove_template_genes=False, debug=debug)

    # works fine with zero-length <templates_to_remove>
    remove_the_stupid_godamn_template_genes_all_at_once(glfo, templates_to_remove)
def write_glfo(output_dir, glfo, only_genes=None, debug=False):
    """
    Write the germline info in <glfo> to the directory <output_dir>/<glfo["chain"]>.

    Writes a fasta file for each name from glfo_fasta_fnames(), then one csv (<extra_fname>,
    header <csv_headers>) holding a row per gene for each conserved codon of this chain.
    Genes are skipped when <only_genes> is not None and does not contain them. A pre-existing
    chain directory is cleared with remove_glfo_files() before anything is written.

    Raises Exception if the chain directory contains any path other than those named by
    glfo_fnames() once writing is done.
    """

    def kept(genes):
        # yield the genes that pass the <only_genes> restriction, in their original order
        for gene in genes:
            if only_genes is not None and gene not in only_genes:
                continue
            yield gene

    if debug:
        suffix = "" if only_genes is None else (" (restricting to %d genes)" % len(only_genes))
        print(" writing glfo to %s%s" % (output_dir, suffix))

    chain = glfo["chain"]
    target_dir = output_dir + "/" + chain
    if os.path.exists(target_dir):
        remove_glfo_files(output_dir, chain)  # also removes output_dir
    os.makedirs(target_dir)

    for fasta_name in glfo_fasta_fnames(chain):
        with open(target_dir + "/" + fasta_name, "w") as fastafile:
            region_seqs = glfo["seqs"][utils.get_region(fasta_name)]
            for gene in kept(region_seqs):
                fastafile.write(">" + gene + "\n")
                fastafile.write(region_seqs[gene] + "\n")

    with open(target_dir + "/" + extra_fname, "w") as csvfile:
        writer = csv.DictWriter(csvfile, csv_headers)
        writer.writeheader()
        for codon in utils.conserved_codons[chain].values():
            codon_positions = glfo[codon + "-positions"]
            for gene, istart in codon_positions.items():
                if only_genes is not None and gene not in only_genes:
                    continue
                writer.writerow({"gene": gene, codon + "_position": istart})

    # make sure there weren't any files lingering in the output dir when we started
    # NOTE this will ignore the dirs corresponding to any *other* chains (which is what we want now, I think)
    present = set(glob.glob(target_dir + "/*"))
    allowed = set(target_dir + "/" + fn for fn in glfo_fnames(chain))
    unexpected_files = present - allowed
    if unexpected_files:
        raise Exception("unexpected file(s) while writing germline set: %s" % (" ".join(unexpected_files)))
def read_fasta_file(seqs, fname, skip_pseudogenes, aligned=False):
    """
    Read the fasta file <fname> and store each sequence in <seqs>[region][gene].

    Two header layouts are handled: a plain header whose first '|'-separated field is the gene
    name (recognized by its leading 'IG'), and an imgt-style header whose fields are located
    via <imgt_info_indices>. For imgt-style headers, records with functionality 'P' are skipped
    (and counted) if <skip_pseudogenes>. Unless <aligned>, gaps are removed and the gene's
    region has to match the region derived from the file's base name.

    Raises Exception for an unknown functionality, a region mismatch, or a sequence containing
    characters that are not in utils.expected_characters.
    """
    n_skipped_pseudogenes = 0
    for record in SeqIO.parse(fname, 'fasta'):
        fields = [field.strip() for field in record.description.split('|')]

        # first get gene name
        if linefo_is_plain(fields):
            gene = fields[0]  # plain fasta with just the gene name after the '>'
        else:  # it's an imgt file, with a bunch of header info (and the accession number first)
            gene = fields[imgt_info_indices.index('gene')]
            functionality = fields[imgt_info_indices.index('functionality')]
            if functionality not in functionalities:
                raise Exception('unexpected functionality %s in %s' % (functionality, fname))
            if skip_pseudogenes and functionality == 'P':
                n_skipped_pseudogenes += 1
                continue

        utils.split_gene(gene)  # just to check if it's a valid gene name
        if not aligned:  # if <aligned> is True, file name is expected to be whatever
            if utils.get_region(gene) != utils.get_region(os.path.basename(fname)):
                raise Exception('gene %s from %s has unexpected region %s' % (gene, os.path.basename(fname), utils.get_region(gene)))

        # then the sequence
        seq = str(record.seq).upper()
        if not aligned:
            seq = utils.remove_gaps(seq)
        # strip() peels expected characters off both ends, so anything left over starts and ends with an unexpected one
        if len(seq.strip(''.join(utils.expected_characters))) > 0:
            raise Exception('unexpected character %s in %s (expected %s)' % (seq.strip(''.join(utils.expected_characters)), seq, ' '.join(utils.expected_characters)))

        seqs[utils.get_region(gene)][gene] = seq

    if n_skipped_pseudogenes > 0:
        print(' skipped %d pseudogenes' % n_skipped_pseudogenes)


def linefo_is_plain(fields):
    """ Return True if the first header field starts with 'IG', i.e. the header is a bare gene name. """
    return fields[0][:2] == 'IG'
def convert_to_duplicate_name(glfo, gene):
    """
    Return another name for <gene> that is present in <glfo>.

    Looks through the equivalence classes in <duplicate_names> for <gene>'s region, and returns
    the first other member of a class containing <gene> that is among <glfo>'s sequences for that
    region. Raises Exception if there is no such name.
    """
    region = utils.get_region(gene)
    for names in duplicate_names[region]:
        if gene not in names:
            continue
        for candidate in names:
            if candidate == gene:
                continue
            if candidate in glfo["seqs"][region]:
                # print 'converting %s --> %s' % (gene, candidate)
                return candidate

    raise Exception("couldn't find alternate name for %s" % gene)
def convert_to_duplicate_name(glfo, gene):
    """
    Find an equivalent name for <gene> that <glfo> knows about.

    For each equivalence class (from <duplicate_names>, for <gene>'s region) that contains
    <gene>, the other members are checked in order and the first one found in
    glfo['seqs'][region] is returned. If nothing is found, raises Exception.
    """
    region = utils.get_region(gene)
    classes_with_gene = (eclass for eclass in duplicate_names[region] if gene in eclass)
    for eclass in classes_with_gene:
        for other_name in eclass:
            is_usable = other_name != gene and other_name in glfo['seqs'][region]
            if is_usable:
                # print 'converting %s --> %s' % (gene, other_name)
                return other_name

    raise Exception('couldn\'t find alternate name for %s' % gene)
def simcountstr(gene, ws):
    """
    Return a colored string with the counts in simulation for <gene>, padded to width <ws>.

    Note that this is _not_ the same as sim_gene_count_str(), since this takes no account of
    _which_ queries these counts occur in [plus it's coming from the opposite point of view].
    Returns the empty string if there is no simulation glfo, the count in blue if <gene> is in
    the simulation glfo, and a red 'x' otherwise.
    NOTE(review): <self> is not a parameter -- presumably this is nested in a method and picks
    it up from the enclosing scope; confirm against the surrounding code.
    """
    if self.simglfo is None:
        return ''

    region = utils.get_region(gene)
    if gene not in self.simglfo['seqs'][region]:
        return utils.color('red', (' %' + ws + 's') % 'x')

    count_fmt = ' %' + ws + 'd'
    return utils.color('blue', count_fmt % self.simcounts[region][gene])
def prepare_bppseqgen(self, seq, chosen_tree, n_leaf_nodes, gene, reco_event, seed):
    """
    Write input files and get command line options necessary to run bppseqgen on <seq> (which is a part of the full query sequence).

    Returns None if <seq> is empty. Otherwise creates a per-region work dir under self.workdir,
    writes the tree and the starting-sequence files there, and returns a dict with keys
    'cmd_str' (the full command line), 'outfname' (where bppseqgen is told to write the leaf
    sequences), 'workdir', 'other-files' (the input files written here) and 'env' (a copy of
    the environment with the bpp lib dir appended to LD_LIBRARY_PATH).

    Raises Exception if the bppseqgen binary is not found under self.args.partis_dir, and
    OSError (from os.makedirs) if the work dir already exists.
    """
    if len(seq) == 0:
        return None

    # write the tree to a tmp file
    region = utils.get_region(gene)
    workdir = self.workdir + '/' + region
    os.makedirs(workdir)
    treefname = workdir + '/tree.tre'
    reco_seq_fname = workdir + '/start-seq.txt'
    leaf_seq_fname = workdir + '/leaf-seqs.fa'
    if n_leaf_nodes == 1:  # add an extra leaf to one-leaf trees so bppseqgen doesn't barf (when we read the output, we ignore the second leaf)
        lreg = re.compile(r't1:[0-9]\.[0-9][0-9]*')  # raw string, so the backslash reaches the regex engine untouched
        leafstr = lreg.findall(chosen_tree)
        assert len(leafstr) == 1
        leafstr = leafstr[0]
        chosen_tree = chosen_tree.replace(leafstr, '(' + leafstr + ',' + leafstr + '):0.0')
    with opener('w')(treefname) as treefile:
        treefile.write(chosen_tree)
    self.write_mute_freqs(gene, seq, reco_event, reco_seq_fname)

    env = os.environ.copy()
    # use .get() so that an unset LD_LIBRARY_PATH doesn't raise KeyError
    env['LD_LIBRARY_PATH'] = env.get('LD_LIBRARY_PATH', '') + ':' + self.args.partis_dir + '/packages/bpp/lib'

    # build up the command line
    # docs: http://biopp.univ-montp2.fr/apidoc/bpp-phyl/html/classbpp_1_1GTR.html that page is too darn hard to google
    bpp_binary = self.args.partis_dir + '/packages/bpp/bin/bppseqgen'
    if not os.path.exists(bpp_binary):
        raise Exception('bpp not found in %s' % os.path.dirname(bpp_binary))

    command = bpp_binary  # NOTE should I use the "equilibrium frequencies" option?
    command += ' alphabet=DNA'
    command += ' --seed=' + str(seed)
    command += ' input.infos=' + reco_seq_fname  # input file (specifies initial "state" for each position, and possibly also the mutation rate at that position)
    command += ' input.infos.states=state'  # column name in input file BEWARE bio++ undocumented defaults (i.e. look in the source code)
    command += ' input.tree.file=' + treefname
    command += ' input.tree.format=Newick'
    command += ' output.sequence.file=' + leaf_seq_fname
    command += ' output.sequence.format=Fasta'
    if self.args.mutate_from_scratch:
        command += ' model=JC69'
        command += ' input.infos.rates=none'  # BEWARE bio++ undocumented defaults (i.e. look in the source code)
        if self.args.flat_mute_freq is not None:
            command += ' rate_distribution=Constant'
        else:
            command += ' rate_distribution=Gamma(n=4,alpha=' + self.mute_models[region]['gamma']['alpha'] + ')'
    else:
        command += ' input.infos.rates=rate'  # column name in input file
        pvpairs = [p + '=' + v for p, v in self.mute_models[region]['gtr'].items()]
        command += ' model=GTR(' + ','.join(pvpairs) + ')'

    return {'cmd_str' : command, 'outfname' : leaf_seq_fname, 'workdir' : workdir, 'other-files' : [reco_seq_fname, treefname], 'env' : env}
def plot(self, base_plotdir, cyst_positions=None, tryp_positions=None):
    """
    Write the per-gene and mean mutation frequency plots under <base_plotdir>/mute-freqs.

    Calls self.finalize() first if that hasn't happened yet. For each gene in self.counts, a
    per-position histogram of counts[position]['freq'] is drawn into the gene's region subdir,
    and paramutils.make_mutefreq_plot() writes to the region's '-per-base' subdir. Then the
    mean-rate hists in self.mean_rates ('all' plus one per region) are drawn, and the makeHtml
    and permissify-www scripts are run on the resulting directories.

    If <cyst_positions> (for v genes) or <tryp_positions> (for j genes) is given, the gene's
    position is passed to plotting.draw() as <xline>.
    NOTE(review): the scripts are invoked with paths relative to the current working dir
    ('./bin/...'), so this presumably has to be run from the top-level dir -- confirm.
    """
    if not self.finalized:
        self.finalize()

    plotdir = base_plotdir + '/mute-freqs'
    utils.prep_dir(plotdir + '/plots', multilings=('*.csv', '*.svg'))
    for region in utils.regions:
        utils.prep_dir(plotdir + '/' + region + '/plots', multilings=('*.csv', '*.svg'))
        utils.prep_dir(plotdir + '/' + region + '-per-base/plots', multilings=('*.csv', '*.png'))

    for gene in self.counts:
        counts, plotting_info = self.counts[gene], self.plotting_info[gene]
        sorted_positions = sorted(counts)
        # one bin per integer position, with bin edges at the half-integers either side of the observed range
        hist = TH1D('hist_' + utils.sanitize_name(gene), '', sorted_positions[-1] - sorted_positions[0] + 1, sorted_positions[0] - 0.5, sorted_positions[-1] + 0.5)
        for position in sorted_positions:
            hist.SetBinContent(hist.FindBin(position), counts[position]['freq'])
            # symmetrize the error: average the distances from the frequency to its upper and lower bounds
            hi_diff = abs(counts[position]['freq'] - counts[position]['freq_hi_err'])
            lo_diff = abs(counts[position]['freq'] - counts[position]['freq_lo_err'])
            err = 0.5*(hi_diff + lo_diff)
            hist.SetBinError(hist.FindBin(position), err)
        plotfname = plotdir + '/' + utils.get_region(gene) + '/plots/' + utils.sanitize_name(gene) + '.svg'  # NOTE(review): not used below
        xline = None
        if utils.get_region(gene) == 'v' and cyst_positions is not None:
            xline = cyst_positions[gene]['cysteine-position']
        elif utils.get_region(gene) == 'j' and tryp_positions is not None:
            xline = int(tryp_positions[gene])
        plotting.draw(hist, 'int', plotdir=plotdir + '/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, draw_str='e')  #, cwidth=4000, cheight=1000)
        paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)

    # (disabled) per-position plots from self.tmpcounts:
    # for region in utils.regions:
    #     utils.prep_dir(plotdir + '/' + region + '/tmp/plots', multilings=('*.csv', '*.svg'))
    # for gene in self.tmpcounts:
    #     for position in self.tmpcounts[gene]:
    #         roothist = plotting.make_hist_from_my_hist_class(self.tmpcounts[gene][position]['muted'], gene + '_' + str(position))
    #         plotting.draw(roothist, 'int', plotdir=plotdir + '/' + utils.get_region(gene) + '/tmp', plotname=utils.sanitize_name(gene) + '_' + str(position), errors=True, write_csv=True)  #, cwidth=4000, cheight=1000)

    # make mean mute freq hists
    hist = plotting.make_hist_from_my_hist_class(self.mean_rates['all'], 'all-mean-freq')
    plotting.draw(hist, 'float', plotname='all-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True)
    for region in utils.regions:
        hist = plotting.make_hist_from_my_hist_class(self.mean_rates[region], region+'-mean-freq')
        plotting.draw(hist, 'float', plotname=region+'-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True)
    check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])

    # then write html file and fix permissions
    for region in utils.regions:
        check_call(['./bin/makeHtml', plotdir + '/' + region, '1', 'null', 'svg'])
        check_call(['./bin/makeHtml', plotdir + '/' + region + '-per-base', '1', 'null', 'png'])
    check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up
def plot(self, plotdir, only_csv=False, only_overall=False):
    """
    Write the mutation frequency plots (and/or csvs) under <plotdir>.

    Calls self.finalize() first if that hasn't happened yet. Unless <only_overall>, each gene
    in self.freqs with at least one position gets two plots: its mean rate (under
    per-gene/<region>) and its per-position frequencies (under per-gene-per-position/<region>).
    The overall mean-frequency and mean-n-muted hists ('all', 'cdr3', and each region) are
    always written to <plotdir>/overall. <only_csv> is passed through to plotting.draw_no_root(),
    and also turns off the html-writing step at the end.
    """
    if not self.finalized:
        self.finalize()

    overall_plotdir = plotdir + '/overall'

    for gene in self.freqs:
        if only_overall:  # skip all of the per-gene plots
            continue
        freqs = self.freqs[gene]
        if len(freqs) == 0:
            if gene not in glutils.dummy_d_genes.values():  # no warning for the dummy d genes
                print ' %s no mutefreqer obs for %s' % (utils.color('red', 'warning'), utils.color_gene(gene))
            continue
        sorted_positions = sorted(freqs.keys())
        # one bin per integer position, with bin edges at the half-integers either side of the observed range
        genehist = Hist(sorted_positions[-1] - sorted_positions[0] + 1, sorted_positions[0] - 0.5, sorted_positions[-1] + 0.5, xtitle='position', ytitle='mut freq', title=gene)
        for position in sorted_positions:
            # symmetrize the error: average the distances from the frequency to its upper and lower bounds
            hi_diff = abs(freqs[position]['freq'] - freqs[position]['freq_hi_err'])
            lo_diff = abs(freqs[position]['freq'] - freqs[position]['freq_lo_err'])
            err = 0.5*(hi_diff + lo_diff)
            genehist.set_ibin(genehist.find_bin(position), freqs[position]['freq'], error=err)
        xline = None
        figsize = [7, 4]
        if utils.get_region(gene) in utils.conserved_codons[self.glfo['chain']]:  # pass the conserved codon's position to the per-position plot as <xline>
            codon = utils.conserved_codons[self.glfo['chain']][utils.get_region(gene)]
            xline = self.glfo[codon + '-positions'][gene]
        # NOTE(review): the flattened source doesn't show whether this width scaling was nested under the previous <if> -- confirm
        if utils.get_region(gene) == 'v':
            figsize[0] *= 3.5
        elif utils.get_region(gene) == 'j':
            figsize[0] *= 2
        plotting.draw_no_root(self.per_gene_mean_rates[gene], plotdir=plotdir + '/per-gene/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, only_csv=only_csv, shift_overflows=True)
        # per-position plots:
        plotting.draw_no_root(genehist, plotdir=plotdir + '/per-gene-per-position/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv, shift_overflows=True)
        # # per-position, per-base plots:
        # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)  # needs translation to mpl UPDATE fcn is fixed, but I can't be bothered uncommenting this at the moment

    # make mean mute freq hists
    for rstr in ['all', 'cdr3'] + utils.regions:
        if rstr == 'all':
            bounds = (0.0, 0.4)
        else:
            bounds = (0.0, 0.6 if rstr == 'd' else 0.4)  # d gets a larger upper bound than the rest
        plotting.draw_no_root(self.mean_rates[rstr], plotname=rstr+'_mean-freq', plotdir=overall_plotdir, stats='mean', bounds=bounds, write_csv=True, only_csv=only_csv, shift_overflows=True)
        plotting.draw_no_root(self.mean_n_muted[rstr], plotname=rstr+'_mean-n-muted', plotdir=overall_plotdir, stats='mean', write_csv=True, only_csv=only_csv, shift_overflows=True)

    if not only_csv:  # write html file and fix permissions
        for substr in self.subplotdirs:
            plotting.make_html(plotdir + '/' + substr)
def prepare_bppseqgen(self, seq, chosen_tree, n_leaf_nodes, gene, reco_event, seed):
    """
    Write input files and get command line options necessary to run bppseqgen on <seq> (which is a part of the full query sequence).

    Returns None for an empty <seq>. Otherwise makes a per-region work dir under self.workdir,
    writes the (modified) tree and the starting-sequence file into it, and returns a dict with
    the command string ('cmd_str'), the file the leaf sequences will be written to ('outfname'),
    the work dir ('workdir'), the input files written here ('other-files'), and the environment
    to run in ('env'). Raises Exception if the bppseqgen binary doesn't exist.
    <n_leaf_nodes> is not used in this function.
    """
    if len(seq) == 0:
        return None

    region = utils.get_region(gene)

    # write the tree to a tmp file
    workdir = self.workdir + '/' + region
    os.makedirs(workdir)
    treefname, reco_seq_fname, leaf_seq_fname = [workdir + '/' + name for name in ('tree.tre', 'start-seq.txt', 'leaf-seqs.fa')]
    # add dummy leaf that we'll subsequently ignore (such are the vagaries of bppseqgen)
    tree_without_semicolon = chosen_tree.rstrip(';')
    mean_height = treeutils.get_mean_leaf_height(treestr=chosen_tree)
    chosen_tree = '(%s,%s:%.15f):0.0;' % (tree_without_semicolon, dummy_name_so_bppseqgen_doesnt_break, mean_height)
    with open(treefname, 'w') as treefile:
        treefile.write(chosen_tree)
    self.write_mute_freqs(gene, seq, reco_event, reco_seq_fname)

    # append the bpp lib dir to the library search path (which may not be set at all)
    env = os.environ.copy()
    previous_ld_path = env.get('LD_LIBRARY_PATH', '')
    env['LD_LIBRARY_PATH'] = previous_ld_path + ':' + self.args.partis_dir + '/packages/bpp/lib'

    # build up the command line
    # docs: http://biopp.univ-montp2.fr/apidoc/bpp-phyl/html/classbpp_1_1GTR.html that page is too darn hard to google
    bpp_binary = self.args.partis_dir + '/packages/bpp/bin/bppseqgen'
    if not os.path.exists(bpp_binary):
        raise Exception('bpp not found in %s' % os.path.dirname(bpp_binary))

    # NOTE should I use the "equilibrium frequencies" option?
    pieces = [
        bpp_binary,
        'alphabet=DNA',
        '--seed=' + str(seed),
        'input.infos=' + reco_seq_fname,  # input file (specifies initial "state" for each position, and possibly also the mutation rate at that position)
        'input.infos.states=state',  # column name in input file BEWARE bio++ undocumented defaults (i.e. look in the source code)
        'input.tree.file=' + treefname,
        'input.tree.format=Newick',
        'output.sequence.file=' + leaf_seq_fname,
        'output.sequence.format=Fasta',
    ]
    if self.args.mutate_from_scratch:
        pieces.append('model=JC69')
        pieces.append('input.infos.rates=none')  # BEWARE bio++ undocumented defaults (i.e. look in the source code)
        if self.args.flat_mute_freq:
            pieces.append('rate_distribution=Constant')
        else:
            pieces.append('rate_distribution=Gamma(n=4,alpha=' + self.mute_models[region]['gamma']['alpha'] + ')')
    else:
        pieces.append('input.infos.rates=rate')  # column name in input file
        pvpairs = [param + '=' + val for param, val in self.mute_models[region]['gtr'].items()]
        pieces.append('model=GTR(' + ','.join(pvpairs) + ')')
    command = ' '.join(pieces)

    return {
        'cmd_str' : command,
        'outfname' : leaf_seq_fname,
        'workdir' : workdir,
        'other-files' : [reco_seq_fname, treefname],
        'env' : env,
    }
def get_token(username, password, domain):
    """
    Request an auth token for <username> in <domain>, scoped to the current region's project.

    POSTs a password-method auth request (as json) to the /v3/auth/tokens url of the IAM
    endpoint, and returns the value of the response's 'X-Subject-Token' header.
    """
    region_name = utils.get_region()

    # assemble the request body from the inside out
    user = {
        "name": username,
        "password": password,
        "domain": {"name": domain},
    }
    identity = {
        "password": {"user": user},
        "methods": ["password"],
    }
    scope = {"project": {"name": region_name}}
    auth_data = {"auth": {"identity": identity, "scope": scope}}

    token_url = 'https://%s/v3/auth/tokens' % ais.AisEndpoint.IAM_ENPOINT
    request = urllib2.Request(url=token_url)
    request.add_header('Content-Type', 'application/json')
    request.add_data(json.dumps(auth_data))

    response = urllib2.urlopen(request)
    return response.headers['X-Subject-Token']
def generate_message(update: dict):
    """
    Generate the telegram message announcing the firmware update described by <update>.

    <update> has to have the keys 'android', 'codename', 'device', 'download', 'filename',
    'size' and 'version'. The branch, region and rom type shown in the message come from
    get_branch(), get_region() and get_type(). Only the part of the codename before the
    first '_' is used in the message and its links.
    """
    android, full_codename, device, download, filename, filesize, version = (
        update[key]
        for key in ('android', 'codename', 'device', 'download', 'filename', 'size', 'version')
    )
    branch = get_branch(version).capitalize()
    region = get_region(filename, full_codename, version)
    rom_type = get_type(filename)
    codename = full_codename.split('_')[0]

    site = "https://xiaomifirmwareupdater.com"
    parts = [
        f"New {branch} {rom_type} update available!\n",
        f"*Device:* {device} \n",
        f"*Codename:* #{codename} \n",
        f"*Region:* {region} \n",
        f"*Version:* `{version}` \n",
        f"*Android:* {android} \n",
        f"*Size*: {filesize} \n",
        f"*Download*: [Here]({download})\n\n",
        f"[Latest Updates]({site}/miui/{codename}) - ",
        f"[All Updates]({site}/archive/miui/{codename})\n",
        "@MIUIUpdatesTracker | @XiaomiFirmwareUpdater",
    ]
    return "".join(parts)
def read_ramesh_file(fname, outdir, debug=False):
    """
    Read germline sequences from the fasta file <fname> and return them as
    {locus : {region : {gene : seq}}}, for each locus in utils.loci with 'ig' in its name.

    If the file's base name is 'coding.fa', the gene name is taken from the 'gene=...' entry in
    the sequence's info strings (sequences without one are skipped, with a message); otherwise
    the sequence's name is used. Constant genes are skipped, as are sequences that duplicate one
    already stored for the same locus and region. <outdir> is not used.
    """
    seqfos = utils.read_fastx(fname)

    glseqs = {}
    for locus in utils.loci:
        if 'ig' in locus:
            glseqs[locus] = dict((region, {}) for region in utils.loci[locus])

    for sfo in seqfos:
        if os.path.basename(fname) != 'coding.fa':
            mdict = {}
            gene = sfo['name']
        else:
            # info strings look like '[key=val]', so strip the brackets and split on '='
            pairs = [istr.strip('[]').split('=') for istr in sfo['infostrs']]
            mdict = dict((pair[0], pair[1]) for pair in pairs if len(pair) == 2)
            if 'gene' not in mdict:
                print('no gene for %s' % sfo['infostrs'])
                continue
            gene = mdict['gene']

        if debug:
            print(gene)

        if utils.is_constant_gene(gene):
            if debug:
                print(' constant')
            continue

        region = utils.get_region(gene)
        utils.split_gene(gene)
        # if 'partial' in mdict:
        #     gene += '_partial_%s' % mdict['partial'].replace('\'', '').replace(',', '')

        already_have_seq = sfo['seq'] in glseqs[utils.get_locus(gene)][region].values()
        if already_have_seq:
            if debug:
                print(' duplicate')
            continue

        glseqs[utils.get_locus(gene)][region][gene] = sfo['seq']

    return glseqs
def add_new_allele(glfo, newfo, remove_template_genes=False, debug=False):
    """
    Add a new allele to <glfo>, specified by <newfo> which is of the form:
    {'gene' : 'IGHV3-71*01+C35T.T47G',
     'seq' : 'ACTG yadda yadda CGGGT',
     'template-gene' : 'IGHV3-71*01'}
    The new allele gets the same conserved codon position as its template (for v and j).
    If <remove_template_genes>, we also remove 'template-gene' from <glfo>.
    Raises Exception if the template gene isn't in <glfo>.
    """
    template_gene = newfo['template-gene']
    region = utils.get_region(template_gene)
    region_seqs = glfo['seqs'][region]
    if template_gene not in region_seqs:
        raise Exception('unknown template gene %s' % template_gene)

    new_gene = newfo['gene']

    # which codon position dict to update for each region (nothing for other regions)
    position_keys = {'v' : 'cyst-positions', 'j' : 'tryp-positions'}
    if region in position_keys:
        codon_positions = glfo[position_keys[region]]
        codon_positions[new_gene] = codon_positions[template_gene]

    region_seqs[new_gene] = newfo['seq']

    if debug:
        template_seq = region_seqs[template_gene]
        print(' adding new allele to glfo:')
        print(' template %s %s' % (template_seq, utils.color_gene(template_gene)))
        print(' new %s %s' % (utils.color_mutants(template_seq, newfo['seq']), utils.color_gene(new_gene)))

    if remove_template_genes:
        remove_gene(glfo, template_gene, debug=True)
def add_new_allele(self, gene, fitfo, n_candidate_snps, debug=False):
    """
    Work out the sequence of the new allele inferred from <gene>, and append its info to self.new_allele_info.

    For each candidate position in fitfo['candidates'][n_candidate_snps], the germline base is
    replaced by the most frequently observed base that differs from it. The new allele's name
    comes from glutils.get_new_allele_name_and_change_mutfo(). The old and new sequences are
    always printed; <debug> is not used in this function.
    """
    # figure out what the new nukes are
    old_seq = self.glfo['seqs'][utils.get_region(gene)][gene]
    new_seq = old_seq
    mutfo = {}
    for pos in sorted(fitfo['candidates'][n_candidate_snps]):
        obs_counts = {nuke : self.counts[gene][pos][n_candidate_snps][nuke] for nuke in utils.nukes}  # NOTE it's super important to only use the counts from sequences with <n_candidate_snps> total mutations
        sorted_obs_counts = sorted(obs_counts.items(), key=operator.itemgetter(1), reverse=True)  # most frequently observed first
        original_nuke = self.mfreqer.counts[gene][pos]['gl_nuke']
        new_nuke = None
        for nuke, _ in sorted_obs_counts:  # take the most common one that isn't the existing gl nuke
            if nuke != original_nuke:
                new_nuke = nuke
                break
        print ' %3d (%s --> %s)' % (pos, original_nuke, new_nuke),  # trailing comma: stay on the same line for the next position
        assert old_seq[pos] == original_nuke
        mutfo[pos] = {'original' : original_nuke, 'new' : new_nuke}
        new_seq = new_seq[:pos] + new_nuke + new_seq[pos+1:]

    new_name, mutfo = glutils.get_new_allele_name_and_change_mutfo(gene, mutfo)
    print ''  # finish the line of positions printed above
    print ' %s %s' % (old_seq, utils.color_gene(gene))
    print ' %s %s' % (utils.color_mutants(old_seq, new_seq), utils.color_gene(new_name))

    # and add it to the set of new alleles for this gene
    self.new_allele_info.append({
        'template-gene' : gene,
        'gene' : new_name,
        'seq' : new_seq,
        'aligned-seq' : None
    })
def add_new_allele(glfo, newfo, remove_template_genes, debug=False):
    """
    Add a new allele to <glfo>, specified by <newfo> which is of the form:
    {'template-gene' : 'IGHV3-71*01',
     'gene' : 'IGHV3-71*01+C35T.T47G',
     'seq' : 'ACTG yadda yadda CGGGT'}
    For v and j genes the new allele is given its template's cyst/tryp position.
    If <remove_template_genes>, we also remove 'template-gene' from <glfo>.
    Raises Exception if 'template-gene' is not among <glfo>'s sequences for its region.
    """
    template_gene, new_gene = newfo['template-gene'], None
    region = utils.get_region(template_gene)
    if template_gene not in glfo['seqs'][region]:
        raise Exception('unknown template gene %s' % template_gene)
    new_gene = newfo['gene']

    # copy over the template's conserved codon position
    if region == 'v':
        cyst_positions = glfo['cyst-positions']
        cyst_positions[new_gene] = cyst_positions[template_gene]
    elif region == 'j':
        tryp_positions = glfo['tryp-positions']
        tryp_positions[new_gene] = tryp_positions[template_gene]

    glfo['seqs'][region][new_gene] = newfo['seq']

    if debug:
        lines = [
            ' adding new allele to glfo:',
            ' template %s %s' % (glfo['seqs'][region][template_gene], utils.color_gene(template_gene)),
            ' new %s %s' % (utils.color_mutants(glfo['seqs'][region][template_gene], newfo['seq']), utils.color_gene(new_gene)),
        ]
        for line in lines:
            print(line)

    if remove_template_genes:
        remove_gene(glfo, template_gene, debug=True)
def make_transition_plot(self, gene_name, model):
    """
    Plot the transition probabilities out of each state in <model>, and write the result to
    <self.base_plotdir>/transitions with plot name <gene_name>.

    Each state gets one bin on the x axis, in which its transitions are stacked on top of each
    other: blue for transitions to insert states, red for the transition to 'end', and green for
    transitions to internal (regional) states. Internal states are recognized by a name that
    starts with a locus prefix ('IG' or 'TR').

    NOTE shares a lot with make_mutefreq_plot() in python/paramutils.py
    """
    locus_prefixes = ('IG', 'TR')  # names of regional/internal states start with one of these

    fig, ax = plotting.mpl_init()
    fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)])

    ibin = 0
    print(utils.color_gene(utils.unsanitize_name(gene_name)))
    legend_colors = set()  # add a color to this the first time you plot it
    for state in model.states:

        # bin label
        ax.text(-0.5 + ibin, -0.075, paramutils.simplify_state_name(state.name), rotation='vertical', size=8)

        # sort internal states by the integer in their simplified name, and everything else by name
        # (NOTE this relies on python 2 being willing to order ints and strs with respect to each other)
        sorted_to_states = {}
        for name in state.transitions.keys():
            if name.startswith(locus_prefixes):
                sorted_to_states[name] = int(paramutils.simplify_state_name(name))
            else:
                sorted_to_states[name] = name
        sorted_to_states = sorted(sorted_to_states.items(), key=operator.itemgetter(1))

        total = 0.0
        for to_state, simple_to_state in sorted_to_states:

            prob = state.transitions[to_state]

            alpha = 0.6
            width = 3

            if 'insert' in str(simple_to_state):
                label = 'insert'
                color = '#3498db'  # blue
            elif str(simple_to_state) == 'end':
                label = 'end'
                color = 'red'
            else:  # regional/internal states
                assert to_state.startswith(locus_prefixes)
                label = 'internal'
                color = 'green'

            # only pass a label the first time a color is used, so that each one shows up once in the legend
            label_to_use = None
            if color not in legend_colors:
                label_to_use = label
                legend_colors.add(color)

            # horizontal line at height total+prob
            ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob], color=color, linewidth=width, alpha=alpha, label=label_to_use)

            # vertical line from total to total + prob
            ax.plot([ibin, ibin], [total + 0.01, total + prob], color=color, alpha=alpha, linewidth=width)

            # midpoint = 0.5*(prob + 2*total)
            # ax.text(ibin, midpoint, paramutils.simplify_state_name(to_state))  # nicely labels the midpoint of the chunk between lines, but there isn't really room for it

            total += prob

        ibin += 1

    ax.get_xaxis().set_visible(False)
    plotting.mpl_finish(ax, self.base_plotdir + '/transitions', gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(model.states) + 3), leg_loc=(0.95, 0.1), adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
def generate_snpd_gene(gene, cpos, seq, positions): assert utils.get_region(gene) == 'v' # others not yet handled def choose_position(): snp_pos = None while snp_pos is None or snp_pos in snpd_positions or not utils.codon_unmutated('cyst', tmpseq, cpos, debug=True): snp_pos = random.randint(0, len(seq) - 1) # note that randint() is inclusive tmpseq = seq[: snp_pos] + 'X' + seq[snp_pos + 1 :] # for checking cyst position return snp_pos snpd_positions = set() # only used if a position wasn't specified (i.e. was None) in <snps_to_add> mutfo = OrderedDict() for snp_pos in positions: if snp_pos is None: snp_pos = choose_position() snpd_positions.add(snp_pos) new_base = None while new_base is None or new_base == seq[snp_pos]: new_base = utils.nukes[random.randint(0, len(utils.nukes) - 1)] print ' %3d %s --> %s' % (snp_pos, seq[snp_pos], new_base) mutfo[snp_pos] = {'original' : seq[snp_pos], 'new' : new_base} seq = seq[: snp_pos] + new_base + seq[snp_pos + 1 :] assert utils.codon_unmutated('cyst', seq, cpos, debug=True) # this is probably unnecessary snpd_name, mutfo = get_new_allele_name_and_change_mutfo(gene, mutfo) return {'template-gene' : gene, 'gene' : snpd_name, 'seq' : seq}
def plot(self, base_plotdir, only_csv=False, debug=False):
    """
    Write the allele finding plots to <base_plotdir>/allele-finding, with one subdir per gene.

    Calls self.finalize() first if that hasn't happened yet, passing along <debug>. Whatever
    per-gene subdirs are left over from a previous run are cleaned out and removed before
    plotting. Only v genes are plotted, and only at the positions in self.fitted_positions.
    If <only_csv>, the dirs are still cleaned but nothing is written (csv output is not
    implemented).

    Raises Exception if <base_plotdir>/allele-finding contains something that isn't a directory.
    """
    if not self.finalized:
        self.finalize(debug=debug)  # <debug> has to be a parameter: there's nothing else by that name in this function

    plotdir = base_plotdir + '/allele-finding'

    for old_gene_dir in glob.glob(plotdir + '/*'):  # has to be a bit more hackey than elsewhere, since we have no way of knowing what genes might have had their own directories written last time we wrote to this dir
        if not os.path.isdir(old_gene_dir):
            raise Exception('not a directory: %s' % old_gene_dir)
        utils.prep_dir(old_gene_dir, wildlings=('*.csv', '*.svg'))
        os.rmdir(old_gene_dir)
    utils.prep_dir(plotdir, wildlings=('*.csv', '*.svg'))

    if only_csv:  # not implemented
        return

    start = time.time()
    for gene in self.plotvals:
        if utils.get_region(gene) != 'v':
            continue

        for position in self.plotvals[gene]:
            if position not in self.fitted_positions[gene]:  # we can make plots for the positions we didn't fit, but there's a *lot* of them and they're slow
                continue
            # if 'allele-finding' not in self.TMPxyvals[gene][position] or self.TMPxyvals[gene][position]['allele-finding'] is None:
            #     continue
            plotting.make_allele_finding_plot(plotdir + '/' + utils.sanitize_name(gene), gene, position, self.plotvals[gene][position])

    print(' allele finding plot time: %.1f' % (time.time()-start))
def make_transition_plot(self, gene_name, model): """ NOTE shares a lot with make_mutefreq_plot() in python/paramutils.py """ fig, ax = plotting.mpl_init() fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)]) ibin = 0 print utils.color_gene(utils.unsanitize_name(gene_name)) legend_colors = set() # add a color to this the first time you plot it for state in model.states: # bin label ax.text(-0.5 + ibin, -0.075, paramutils.simplify_state_name(state.name), rotation='vertical', size=8) sorted_to_states = {} for name in state.transitions.keys(): if name.find('IG') == 0 or name.find('TR') == 0: sorted_to_states[name] = int(paramutils.simplify_state_name(name)) else: sorted_to_states[name] = name sorted_to_states = sorted(sorted_to_states.items(), key=operator.itemgetter(1)) total = 0.0 for to_state, simple_to_state in sorted_to_states: prob = state.transitions[to_state] alpha = 0.6 width = 3 if 'insert' in str(simple_to_state): label = 'insert' color = '#3498db' # blue elif str(simple_to_state) == 'end': label = 'end' color = 'red' else: # regional/internal states assert to_state.find('IG') == 0 or to_state.find('TR') == 0 label = 'internal' color = 'green' label_to_use = None if color not in legend_colors: label_to_use = label legend_colors.add(color) # horizontal line at height total+prob ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob], color=color, linewidth=width, alpha=alpha, label=label_to_use) # vertical line from total to total + prob ax.plot([ibin, ibin], [total + 0.01, total + prob], color=color, alpha=alpha, linewidth=width) midpoint = 0.5*(prob + 2*total) # ax.text(ibin, midpoint, paramutils.simplify_state_name(to_state)) # nicely labels the midpoint of the chunk between lines, but there isn't really room for it total += prob ibin += 1 ax.get_xaxis().set_visible(False) plotting.mpl_finish(ax, self.base_plotdir + '/transitions', gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(model.states) + 3), leg_loc=(0.95, 0.1), 
adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
async def send_message(self, update: dict):
    """
    Build a Discord embed describing <update> and post it to the matching channel.

    The embed goes to the first entry of self.channels whose name is a prefix of the lower-cased device
    name, or to self.channels['other'] if none matches.
    """
    android = update['android']
    codename = update['codename']
    device = update['device']
    filename = update['filename']
    filesize = update['size']
    version = update['version']
    download = update['download']

    branch = get_branch(version)
    region = get_region(filename, codename, version)
    rom_type = get_type(filename)

    codename = codename.split('_')[0]
    device = device.replace(f' {region}', '')

    desc_lines = [
        f"**Device**: {device} \n",
        f"**Codename**: `{codename}` \n",
        f"**Region**: {region} \n",
        f"**Version**: `{version} | {android}` \n",
        f"**Size**: {filesize} \n",
        f"**Download**: [Here]({download})",
    ]
    embed = discord.Embed(
        title=f"New {branch} {rom_type} update available!",
        color=discord.Colour.orange(),
        description="".join(desc_lines),
    )
    embed.set_footer(text=f"https://xiaomifirmwareupdater.com/miui/{codename}")

    # pick the first channel whose name prefixes the device, falling back to 'other'
    device = device.lower()
    for name, channel in self.channels.items():
        if device.startswith(name):
            target = channel
            break
    else:
        target = self.channels['other']

    await target.send(embed=embed)
    print(f"Posted update for {codename} in Discord")
def bundle(rootfs, size=10, filesystem='ext4'):
    """
    Copy <rootfs> onto a freshly created volume and snapshot that volume.

    The volume is created with <size>, attached to the current instance, formatted with <filesystem> and
    mounted at <rootfs> + '.mount'; <rootfs> is rsync'd onto it, then it is unmounted, detached,
    snapshotted and deleted.

    Returns a (snapshot id, snapshot description) tuple.

    NOTE(review): there is no cleanup on failure -- if any step raises, the volume is left in whatever
    state it reached (possibly attached/mounted, and not deleted).
    """
    log.debug('getting unique snapshot name')
    app = utils.get_turnkey_version(rootfs)
    snapshot_name = utils.get_uniquename(utils.get_region(), app + '.ebs')
    log.info('target snapshot - %s ', snapshot_name)

    log.info('creating volume, attaching, formatting and mounting')
    volume = Volume()
    volume.create(size)

    device = Device()
    volume.attach(utils.get_instanceid(), device)

    device.mkfs(filesystem)
    mount_path = rootfs + '.mount'
    device.mount(mount_path)

    log.info('syncing rootfs to volume')
    utils.rsync(rootfs, mount_path)

    device.umount()
    volume.detach()
    # os.removedirs() removes the mount point and then also prunes any parent directories that are left empty
    os.removedirs(mount_path)

    log.info('creating snapshot from volume')
    snapshot = Snapshot()
    snapshot.create(volume.vol.id, snapshot_name)
    volume.delete()  # the volume is deleted once the snapshot has been created from it

    log.info("complete - %s %s", snapshot.snap.id, snapshot.snap.description)
    return snapshot.snap.id, snapshot.snap.description
def add_new_allele(glfo, newfo, remove_template_genes=False, debug=False): """ Add a new allele to <glfo>, specified by <newfo> which is of the form: {'gene' : 'IGHV3-71*01+C35T.T47G', 'seq' : 'ACTG yadda yadda CGGGT', 'template-gene' : 'IGHV3-71*01'} If <remove_template_genes>, we also remove 'template-gene' from <glfo>. """ template_gene = newfo["template-gene"] region = utils.get_region(template_gene) if template_gene not in glfo["seqs"][region]: raise Exception("unknown template gene %s" % template_gene) new_gene = newfo["gene"] if region == "v": glfo["cyst-positions"][new_gene] = glfo["cyst-positions"][template_gene] elif region == "j": glfo["tryp-positions"][new_gene] = glfo["tryp-positions"][template_gene] glfo["seqs"][region][new_gene] = newfo["seq"] if debug: print " adding new allele to glfo:" print " template %s %s" % (glfo["seqs"][region][template_gene], utils.color_gene(template_gene)) print " new %s %s" % ( utils.color_mutants(glfo["seqs"][region][template_gene], newfo["seq"]), utils.color_gene(new_gene), ) if remove_template_genes: remove_gene(glfo, template_gene, debug=True)
def generate_snpd_gene(gene, cpos, seq, positions): assert utils.get_region(gene) == 'v' # others not yet handled def choose_position(): snp_pos = None while snp_pos is None or snp_pos in snpd_positions or not utils.check_conserved_cysteine(tmpseq, cpos, debug=True, assert_on_fail=False): snp_pos = random.randint(10, len(seq) - 15) # note that randint() is inclusive tmpseq = seq[: snp_pos] + 'X' + seq[snp_pos + 1 :] # for checking cyst position return snp_pos snpd_positions = set() # only used if a position wasn't specified (i.e. was None) in <snps_to_add> mutfo = OrderedDict() for snp_pos in positions: if snp_pos is None: snp_pos = choose_position() snpd_positions.add(snp_pos) new_base = None while new_base is None or new_base == seq[snp_pos]: new_base = utils.nukes[random.randint(0, len(utils.nukes) - 1)] print ' %3d %s --> %s' % (snp_pos, seq[snp_pos], new_base) mutfo[snp_pos] = {'original' : seq[snp_pos], 'new' : new_base} seq = seq[: snp_pos] + new_base + seq[snp_pos + 1 :] utils.check_conserved_cysteine(seq, cpos) snpd_name, mutfo = get_new_allele_name_and_change_mutfo(gene, mutfo) return {'template-gene' : gene, 'gene' : snpd_name, 'seq' : seq}
def generate_snpd_gene(gene, cpos, seq, positions): assert utils.get_region(gene) == "v" # others not yet handled def choose_position(): snp_pos = None while snp_pos is None or snp_pos in snpd_positions or not utils.codon_ok("cyst", tmpseq, cpos, debug=True): snp_pos = random.randint(10, len(seq) - 15) # note that randint() is inclusive tmpseq = seq[:snp_pos] + "X" + seq[snp_pos + 1 :] # for checking cyst position return snp_pos snpd_positions = set() # only used if a position wasn't specified (i.e. was None) in <snps_to_add> mutfo = OrderedDict() for snp_pos in positions: if snp_pos is None: snp_pos = choose_position() snpd_positions.add(snp_pos) new_base = None while new_base is None or new_base == seq[snp_pos]: new_base = utils.nukes[random.randint(0, len(utils.nukes) - 1)] print " %3d %s --> %s" % (snp_pos, seq[snp_pos], new_base) mutfo[snp_pos] = {"original": seq[snp_pos], "new": new_base} seq = seq[:snp_pos] + new_base + seq[snp_pos + 1 :] assert utils.codon_ok("cyst", seq, cpos, debug=True) # this is probably unnecessary snpd_name, mutfo = get_new_allele_name_and_change_mutfo(gene, mutfo) return {"template-gene": gene, "gene": snpd_name, "seq": seq}
def finalize(self, debug=False):
    """
    Look for new alleles in each V gene that self.mfreqer has counts for, then mark this object finalized.

    For each gene we fit increasing numbers of snps (1 up to self.n_max_snps - 1) with fit_istart(), ask
    is_a_candidate() about each snp hypothesis that got fitted, and if any pass, hand the one with the
    fewest snps to self.add_new_allele().
    May only be called once (asserts that we aren't already finalized).
    """
    assert not self.finalized

    self.mfreqer.finalize()

    start = time.time()
    # bookkeeping for the summary line at the end; <gene_results> is also passed to get_positions_to_fit()
    # (presumably that's what fills 'not_enough_obs_to_fit' -- nothing in this function adds to it)
    gene_results = {'not_enough_obs_to_fit' : set(), 'didnt_find_anything_with_fit' : set(), 'new_allele' : set()}
    if debug:
        print '\nlooking for new alleles:'
    for gene in sorted(self.mfreqer.counts):
        if utils.get_region(gene) != 'v':  # only V genes are handled
            continue
        if debug:
            print '\n%s (observed %d %s)' % (utils.color_gene(gene), self.gene_obs_counts[gene], utils.plural_str('time', self.gene_obs_counts[gene]))

        positions_to_try_to_fit, xyvals = self.get_positions_to_fit(gene, gene_results, debug=debug)
        if positions_to_try_to_fit is None:  # nothing to fit for this gene
            continue

        # both sub-dicts are keyed by <istart> (the number of snps in the hypothesis) and filled by fit_istart()
        fitfo = {n : {} for n in ('min_snp_ratios', 'candidates')}
        for istart in range(1, self.n_max_snps):
            if debug:
                if istart == 1:  # table header, printed once per gene
                    print ' resid. / ndof'
                    print ' position ratio (m=0 / m>%5.2f) muted / obs ' % self.big_y_icpt_bounds[0]
                print ' %d %s' % (istart, utils.plural_str('snp', istart))

            # restrict each position's values to the window [istart, istart + self.max_fit_length)
            subxyvals = {pos : {k : v[istart : istart + self.max_fit_length] for k, v in xyvals[pos].items()} for pos in positions_to_try_to_fit}
            self.fit_istart(gene, istart, positions_to_try_to_fit, subxyvals, fitfo, debug=debug)
            if istart not in fitfo['candidates']:  # if it didn't get filled, we didn't have enough observations to do the fit
                break

        istart_candidates = []
        if debug:
            print ' evaluating each snp hypothesis'
            print ' snps min ratio'
        for istart in fitfo['candidates']:
            if debug:
                # trailing comma: no newline, so anything is_a_candidate() prints continues on the same line
                print ' %2d %9s' % (istart, fstr(fitfo['min_snp_ratios'][istart])),
            if self.is_a_candidate(gene, fitfo, istart, debug=debug):
                istart_candidates.append(istart)

        if len(istart_candidates) > 0:
            n_candidate_snps = min(istart_candidates)  # add the candidate with the smallest number of snps to the germline set
            gene_results['new_allele'].add(gene)
            # printed regardless of <debug>; trailing comma so that add_new_allele() can finish the line
            print '\n found a new allele candidate separated from %s by %d %s at %s:' % (utils.color_gene(gene), n_candidate_snps, utils.plural_str('snp', n_candidate_snps), utils.plural_str('position', n_candidate_snps)),
            self.add_new_allele(gene, fitfo, n_candidate_snps, debug=debug)
        else:
            gene_results['didnt_find_anything_with_fit'].add(gene)
            if debug:
                print ' no new alleles'

    if debug:
        print 'found new alleles for %d %s (there were also %d without new alleles, and %d without enough observations to fit)' % (len(gene_results['new_allele']), utils.plural_str('gene', len(gene_results['new_allele'])), len(gene_results['didnt_find_anything_with_fit']), len(gene_results['not_enough_obs_to_fit']))
        print ' allele finding time: %.1f' % (time.time()-start)

    self.finalized = True
def __init__(self, base_indir, outdir, gene_name, naivety, glfo, args): self.region = utils.get_region(gene_name) self.raw_name = gene_name # i.e. unsanitized self.germline_seqs = glfo['seqs'] # all germline alleles self.germline_seq = self.germline_seqs[self.region][gene_name] # germline sequence for this hmm self.indir = base_indir self.args = args self.cyst_positions = glfo['cyst-positions'] self.tryp_positions = glfo['tryp-positions'] # parameters with values that I more or less made up self.precision = '16' # number of digits after the decimal for probabilities self.eps = 1e-6 # NOTE I also have an eps defined in utils, and they should in principle be combined self.n_max_to_interpolate = 20 self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25} # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths self.erosion_pseudocount_length = 10 # if we're closer to the end of the gene than this, make sure erosion probability isn't zero # self.insert_mute_prob = 0.0 # self.mean_mute_freq = 0.0 self.outdir = outdir self.naivety = naivety self.smallest_entry_index = -1 # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)] OOPS that's not what I want to do self.insertions = [] if self.region == 'v': self.insertions.append('fv') elif self.region == 'd': self.insertions.append('vd') elif self.region == 'j': self.insertions.append('dj') self.insertions.append('jf') self.erosion_probs = {} self.insertion_probs = {} self.insertion_content_probs = {} self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False) # how many times did we observe this gene in data? 
replacement_genes = None if self.n_occurences < self.args.min_observations_to_write: # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us if self.args.debug: print ' only saw it %d times, use info from other genes' % self.n_occurences replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug) self.read_erosion_info(gene_name, replacement_genes) # try this exact gene, but... self.read_insertion_info(gene_name, replacement_genes) if self.naivety == 'M': # mutate if not naive self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=replacement_genes) self.track = Track('nukes', utils.nukes) self.saniname = utils.sanitize_name(gene_name) self.hmm = HMM(self.saniname, self.track.getdict()) # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name)) # if we really didn't see this gene at all, take pity on it and kick it an eps mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv') self.hmm.extras['overall_mute_freq'] = mean_freq_hist.get_mean()
def remove_gene(glfo, gene, debug=False): """ remove <gene> from <glfo> """ if debug: print ' removing %s from glfo' % utils.color_gene(gene) region = utils.get_region(gene) if region in utils.conserved_codons: del glfo[utils.conserved_codons[region] + '-positions'][gene] del glfo['seqs'][region][gene]
def read_fasta_file(seqs, fname, skip_pseudogenes, aligned=False): n_skipped_pseudogenes = 0 seq_to_gene_map = {} for seqfo in utils.read_fastx(fname): # first get gene name if seqfo['name'][:2] != 'IG' and seqfo['name'][:2] != 'TR': # if it's an imgt file, with a bunch of header info (and the accession number first) gene = seqfo['infostrs'][imgt_info_indices.index('gene')] functionality = seqfo['infostrs'][imgt_info_indices.index('functionality')] if functionality not in functionalities: raise Exception('unexpected functionality %s in %s' % (functionality, fname)) if skip_pseudogenes and functionality in pseudogene_funcionalities: n_skipped_pseudogenes += 1 continue else: # plain fasta with just the gene name after the '>' gene = seqfo['name'] utils.split_gene(gene) # just to check if it's a valid gene name if not aligned and utils.get_region(gene) != utils.get_region(os.path.basename(fname)): # if <aligned> is True, file name is expected to be whatever raise Exception('gene %s from %s has unexpected region %s' % (gene, os.path.basename(fname), utils.get_region(gene))) if gene in seqs[utils.get_region(gene)]: raise Exception('gene name %s appears twice in %s' % (gene, fname)) # then the sequence seq = seqfo['seq'] if not aligned: seq = utils.remove_gaps(seq) if 'Y' in seq: print ' replacing Y --> N (%d of \'em) in %s' % (seq.count('Y'), utils.color_gene(gene)) seq = seq.replace('Y', 'N') if len(seq.strip(''.join(utils.expected_characters))) > 0: # return the empty string if it only contains expected characters raise Exception('unexpected character %s in %s (expected %s)' % (seq.strip(''.join(utils.expected_characters)), seq, ' '.join(utils.expected_characters))) if seq not in seq_to_gene_map: seq_to_gene_map[seq] = [] seq_to_gene_map[seq].append(gene) seqs[utils.get_region(gene)][gene] = seq tmpcounts = [len(gl) for gl in seq_to_gene_map.values()] # number of names corresponding to each sequence (should all be ones) if tmpcounts.count(1) != len(tmpcounts): print ' 
mutliple names in %s for the following sequences:' % fname for seq, genelist in seq_to_gene_map.items(): if len(genelist) > 1: print ' %-50s %s' % (' '.join(genelist), seq) raise Exception('please de-duplicate the fasta and re-run.') if n_skipped_pseudogenes > 0: print ' skipped %d %s pseudogenes (leaving %d)' % (n_skipped_pseudogenes, utils.get_region(os.path.basename(fname)), len(seqs[utils.get_region(os.path.basename(fname))]))
def plot(self, base_plotdir, cyst_positions=None, tryp_positions=None, only_csv=False):
    """
    Write the per-gene mutation frequency plots and the mean frequency hists below <base_plotdir>/mute-freqs.

    Finalizes first if that hasn't happened yet. If <cyst_positions>/<tryp_positions> are given, the
    conserved codon position is passed to the v/j gene plots as <xline>. Unless <only_csv>, html files
    are written as well.
    """
    if not self.finalized:
        self.finalize()

    file_globs = ('*.csv', '*.svg')
    plotdir = base_plotdir + '/mute-freqs'
    overall_plotdir = plotdir + '/overall'

    # set up the output directories
    utils.prep_dir(overall_plotdir, multilings=file_globs)
    for region in utils.regions:
        utils.prep_dir(plotdir + '/' + region, multilings=file_globs)
    if self.tigger:
        utils.prep_dir(plotdir + '/tigger', multilings=file_globs)

    # one hist per gene, with one bin per position
    for gene in self.freqs:
        gene_freqs = self.freqs[gene]
        positions = sorted(gene_freqs.keys())
        first_pos, last_pos = positions[0], positions[-1]
        genehist = Hist(last_pos - first_pos + 1, first_pos - 0.5, last_pos + 0.5, xtitle='fixme', ytitle='fixme')
        for position in positions:
            info = gene_freqs[position]
            hi_diff = abs(info['freq'] - info['freq_hi_err'])
            lo_diff = abs(info['freq'] - info['freq_lo_err'])
            err = 0.5*(hi_diff + lo_diff)  # average of the upper and lower uncertainties
            genehist.set_ibin(genehist.find_bin(position), info['freq'], error=err)

        # v and j plots are made wider, and get a vertical line at the conserved codon
        gene_region = utils.get_region(gene)
        xline = None
        figsize = [3, 3]
        if gene_region == 'v' and cyst_positions is not None:
            xline = cyst_positions[gene]
            figsize[0] *= 3.5
        elif gene_region == 'j' and tryp_positions is not None:
            xline = tryp_positions[gene]
            figsize[0] *= 2
        plotting.draw_no_root(genehist, plotdir=plotdir + '/' + gene_region, plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv)

    # make mean mute freq hists
    plotting.draw_no_root(self.mean_rates['all'], plotname='all-mean-freq', plotdir=overall_plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)
    for region in utils.regions:
        plotting.draw_no_root(self.mean_rates[region], plotname=region+'-mean-freq', plotdir=overall_plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)

    if self.tigger:
        self.tigger_plot(only_csv)

    if only_csv:
        return

    # write html files
    plotting.make_html(overall_plotdir)
    for region in utils.regions:
        plotting.make_html(plotdir + '/' + region, n_columns=1)
def figure_out_which_damn_gene(germline_seqs, gene_name, seq, debug=False): region = utils.get_region(gene_name) seq = seq.replace(' ', '') if gene_name in germline_seqs[region]: # already have it, but maybe when we added it before it was a shorter match, so substitute with the new longer match if len(seq) > len(germline_seqs[region][gene_name]): print ' gl match longer than gl!' print ' ', seq print ' ', germline_seqs[region][gene_name] germline_seqs[region][gene_name] = seq return gene_name candidates = [] # if it doesn't specify an allele, see if any of the alleles we've got have the same sequence in the match region if gene_name.find('*') == -1: for candidate_gene in germline_seqs[region]: if candidate_gene.find(gene_name) == 0: if seq in germline_seqs[region][candidate_gene]: candidates.append(candidate_gene) # if it *does* specify an allele, see if any of the other allele have the same sequence in the match region if len(candidates) == 0: # didn't find anything... try other alleles for candidate_gene in germline_seqs[region]: if utils.are_alleles(candidate_gene, gene_name): if seq in germline_seqs[region][candidate_gene]: candidates.append(candidate_gene) # sometimes it's 3-9, but sometimes 3-09. *grrrrrr*. if len(candidates) == 0: for candidate_gene in germline_seqs[region]: if gene_name.replace('-0', '-') == candidate_gene: if seq in germline_seqs[region][candidate_gene]: candidates.append(candidate_gene) # try adding _F and _P to the end of j names if len(candidates) == 0: for candidate_gene in germline_seqs[region]: if gene_name + '_F' == candidate_gene or gene_name + '_P' == candidate_gene: if seq[ : len(germline_seqs[region][candidate_gene])] in germline_seqs[region][candidate_gene]: # shorten <seq> to account for extra bases on right of imgt j versions candidates.append(candidate_gene) # try removing the darn R at the end (and remove the zero). 
I hope it doesn't mean anything important if len(candidates) == 0: for candidate_gene in germline_seqs[region]: if gene_name.replace('R', '').replace('-0', '-') == candidate_gene: if seq in germline_seqs[region][candidate_gene]: candidates.append(candidate_gene) if len(candidates) == 0: print ' ERROR didn\'t find jack for', gene_name, seq assert False # elif len(candidates) > 1: # print 'NOTE found',len(candidates),'candidates, just using the first one' if debug: print ' swapping', gene_name, '-->', candidates[0] return candidates[0]
def add_some_snps(snps_to_add, glfo, remove_template_genes=False, debug=False): """ Generate some snp'd genes and add them to glfo, specified with <snps_to_add>. e.g. [{'gene' : 'IGHV3-71*01', 'positions' : (35, None)}, ] will add a snp at position 35 and at a random location. The resulting snp'd gene will have a name like IGHV3-71*01+C35T.T47G """ templates_to_remove = set() added_snp_names = [] for isnp in range(len(snps_to_add)): snpinfo = snps_to_add[isnp] gene, positions = snpinfo["gene"], snpinfo["positions"] print " adding %d %s to %s" % (len(positions), utils.plural_str("snp", len(positions)), gene) seq = glfo["seqs"][utils.get_region(gene)][gene] assert utils.get_region(gene) == "v" cpos = glfo["cyst-positions"][gene] snpfo = None itry = 0 while snpfo is None or snpfo["gene"] in glfo["seqs"][utils.get_region(gene)]: if itry > 0: print " already in glfo, try again" if itry > 99: raise Exception( "too many tries while trying to generate new snps -- did you specify a lot of snps on the same position?" ) snpfo = generate_snpd_gene(gene, cpos, seq, positions) itry += 1 if remove_template_genes: templates_to_remove.add(gene) add_new_allele( glfo, snpfo, remove_template_genes=False, debug=debug ) # *don't* remove the templates here, since we don't know if there's another snp later that needs them added_snp_names.append(snpfo["gene"]) remove_the_stupid_godamn_template_genes_all_at_once( glfo, templates_to_remove ) # works fine with zero-length <templates_to_remove> return ( added_snp_names ) # need the order of the names so we can get allele prevalence freqs from the command line right
def process_query(self, bam, reads, perfplotter=None):
    """
    Collect the matches in <reads> (all for the same query sequence) and pass them to self.summarize_query().

    For each read we look up the matched gene name in <bam>.references, record the query and germline
    bounds of the match, and append (score, gene) to the list for the gene's region.
    """
    # the first non-secondary read supplies the query sequence and name
    # NOTE(review): if every read is secondary, <primary> is None and the next line raises AttributeError
    primary = next((r for r in reads if not r.is_secondary), None)
    query_seq = primary.seq
    try:
        query_name = int(primary.qname)  # if it's just one of my hashes, we want it as an int
    except ValueError:
        query_name = primary.qname  # but if it's someone else's arbitrary string we'll just leave it as-is

    raw_best = {}  # region --> gene of the first read seen for that region
    all_match_names = {}  # region --> list of (score, gene), in the order the reads come in
    warnings = {}  # ick, this is a messy way to pass stuff around
    for region in utils.regions:
        all_match_names[region] = []
    all_query_bounds, all_germline_bounds = {}, {}  # both keyed by gene
    for read in reads:  # loop over the matches found for each query sequence
        read.seq = query_seq  # only the first one has read.seq set by default, so we need to set the rest by hand
        gene = bam.references[read.tid]
        region = utils.get_region(gene)
        warnings[gene] = ''

        if region not in raw_best:  # best v, d, and j before multiplying by gene choice probs. needed 'cause *these* are the v and j that get excised
            raw_best[region] = gene

        raw_score = read.tags[0][1]  # raw because they don't include the gene choice probs
        score = raw_score
        if self.args.apply_choice_probs_in_sw:  # NOTE I stopped applying the gene choice probs here because the smith-waterman scores don't correspond to log-probs, so throwing on the gene choice probs was dubious (and didn't seem to work that well)
            score = self.get_choice_prob(region, gene) * raw_score  # multiply by the probability to choose this gene

        # set bounds
        qrbounds = (read.qstart, read.qend)
        glbounds = (read.pos, read.aend)
        # the matched stretch has to have the same length in the query and in the germline
        assert qrbounds[1] - qrbounds[0] == glbounds[1] - glbounds[0]
        assert qrbounds[1] <= len(query_seq)
        if glbounds[1] > len(self.germline_seqs[region][gene]):
            # print some info before the assertion just below fails
            print ' ', gene
            print ' ', glbounds[1], len(self.germline_seqs[region][gene])
            print ' ', self.germline_seqs[region][gene]
        assert glbounds[1] <= len(self.germline_seqs[region][gene])
        assert qrbounds[1] - qrbounds[0] == glbounds[1] - glbounds[0]  # NOTE(review): repeats the first assertion above

        all_match_names[region].append((score, gene))  # NOTE it is important that this is ordered such that the best match is first
        all_query_bounds[gene] = qrbounds
        all_germline_bounds[gene] = glbounds

    self.summarize_query(query_name, query_seq, raw_best, all_match_names, all_query_bounds, all_germline_bounds, perfplotter, warnings)
def __init__(self, base_indir, outdir, gene_name, naivety, germline_seq, args): self.indir = base_indir self.args = args # parameters with values that I more or less made up self.precision = '16' # number of digits after the decimal for probabilities self.eps = 1e-6 # NOTE I also have an eps defined in utils, and they should in principle be combined self.n_max_to_interpolate = 20 self.allow_unphysical_insertions = self.args.allow_unphysical_insertions # allow fv and jf insertions. NOTE this slows things down by a factor of 6 or so # self.allow_external_deletions = args.allow_external_deletions # allow v left and j right deletions. I.e. if your reads extend beyond v or j boundaries self.v_3p_del_pseudocount_limit = 10 # add at least one entry # self.insert_mute_prob = 0.0 # self.mean_mute_freq = 0.0 self.outdir = outdir self.region = utils.get_region(gene_name) self.naivety = naivety self.germline_seq = germline_seq self.smallest_entry_index = -1 # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)] OOPS that's not what I want to do self.insertions = [] if self.region == 'v': if self.allow_unphysical_insertions: self.insertions.append('fv') elif self.region == 'd': self.insertions.append('vd') elif self.region == 'j': self.insertions.append('dj') if self.allow_unphysical_insertions: self.insertions.append('jf') self.erosion_probs = {} self.insertion_probs = {} self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False) # how many times did we observe this gene in data? 
replacement_genes = None if self.n_occurences < self.args.min_observations_to_write: # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us if self.args.debug: print ' only saw it %d times, use info from other genes' % self.n_occurences replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug) self.read_erosion_info(gene_name, replacement_genes) # try this exact gene, but... self.read_insertion_info(gene_name, replacement_genes) if self.naivety == 'M': # mutate if not naive self.mute_freqs = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=replacement_genes) self.track = Track('nukes', list(utils.nukes)) self.saniname = utils.sanitize_name(gene_name) self.hmm = HMM(self.saniname, {'nukes':list(utils.nukes)}) # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name)) # if we really didn't see this gene at all, take pity on it and kick it an eps
def getvalstr(gene, val):
    """
    Format one table cell for <gene> with the value <val> (shown multiplied by 100).

    A placeholder cell is returned if <gene> is None, or if it's a d gene from a locus without d genes.
    Uses <cstr>, <estr>, <latex> and <emph_genes> from the enclosing scope.
    """
    no_gene = gene is None or (utils.get_region(gene) == 'd' and not utils.has_d_gene(utils.get_locus(gene)))
    if no_gene:
        return '%s %5.2s %s %-16s%s' % (cstr, ' - ', cstr, ' - ', 4 * ' ' if latex else '')

    if not latex:
        gstr = utils.color_gene(gene, width=18)
    else:
        gstr = utils.shorten_gene_name(gene, use_one_based_indexing=True, n_max_mutstrs=5)
        if emph_genes is not None and gene in emph_genes:  # emphasized genes are typeset bold and red
            gstr = '\\color{red}{\\textbf{%s}}' % gstr
    return '%s %s%5.2f%s %s %-20s' % (cstr, estr, 100 * val, estr, cstr, gstr)
def fit_istart(self, gene, istart, positions_to_try_to_fit, subxyvals, fitfo, debug=False):
    """
    Fit each position in <positions_to_try_to_fit> for the hypothesis that <gene> has <istart> snps.

    Every position that passes the cuts is fit twice with self.get_curvefit(): once with the y intercept
    pinned to zero, and once with it restricted to self.big_y_icpt_bounds. The ratio of the two
    residuals-over-ndof is used to rank positions: the <istart> positions with the largest ratios are the
    candidate snps. Results go into fitfo['min_snp_ratios'][istart] and fitfo['candidates'][istart]; if
    there weren't more than <istart> positions to fit we return without filling <fitfo>.
    """
    residuals = {}  # pos --> residuals over ndof for the two fits
    for pos in positions_to_try_to_fit:
        # skip positions that are too close to the 5' end of V (misassigned insertions look like snps)
        # NOTE(review): the condition below excludes the *highest* positions (the last ones in the
        # sequence), although the comment above and the attribute name say 5' -- confirm which is intended
        if pos > len(self.glfo['seqs'][utils.get_region(gene)][gene]) - self.n_five_prime_positions_to_exclude - 1:
            continue

        # as long as we already have a few non-candidate positions, skip positions that have no frequencies greater than the min y intercept (note that they could in principle still have a large y intercept, but we don't really care)
        if len(residuals) > istart + self.min_non_candidate_positions_to_fit and len([f for f in subxyvals[pos]['freqs'] if f > self.min_y_intercept]) == 0:
            continue

        # skip positions whose summed 'total' is too small
        if sum(subxyvals[pos]['total']) < self.n_total_min:
            continue

        # also skip positions that only have a few points to fit (i.e. genes that were very rare, or I guess maybe if they were always eroded past this position)
        if len(subxyvals[pos]['n_mutelist']) < 3:
            continue

        # the "zero" fit only lets the y intercept vary within +/- self.small_number of zero
        zero_icpt_fit = self.get_curvefit(subxyvals[pos]['n_mutelist'], subxyvals[pos]['freqs'], subxyvals[pos]['errs'], y_icpt_bounds=(0. - self.small_number, 0. + self.small_number))
        big_icpt_fit = self.get_curvefit(subxyvals[pos]['n_mutelist'], subxyvals[pos]['freqs'], subxyvals[pos]['errs'], y_icpt_bounds=self.big_y_icpt_bounds)

        residuals[pos] = {'zero_icpt' : zero_icpt_fit['residuals_over_ndof'], 'big_icpt' : big_icpt_fit['residuals_over_ndof']}

        self.fitted_positions[gene].add(pos)  # if we already did the fit for another <istart>, it'll already be in there

    if len(residuals) <= istart:  # needs to be at least one longer, so we have the first-non-snp
        if debug:
            print ' not enough observations to fit more than %d snps' % (istart - 1)
        return

    # ratio of the zero-intercept residual to the big-intercept residual (infinite if the latter is exactly zero)
    residual_ratios = {pos : float('inf') if r['big_icpt'] == 0. else r['zero_icpt'] / r['big_icpt'] for pos, r in residuals.items()}
    sorted_ratios = sorted(residual_ratios.items(), key=operator.itemgetter(1), reverse=True)  # sort the positions in decreasing order of residual ratio
    candidate_snps = [pos for pos, _ in sorted_ratios[:istart]]  # the first <istart> positions are the "candidate snps"
    max_non_snp, max_non_snp_ratio = sorted_ratios[istart]  # position and ratio for largest non-candidate
    min_candidate_ratio = min([residual_ratios[cs] for cs in candidate_snps])  # only referenced by the commented-out score below

    # fitfo['scores'][istart] = (min_candidate_ratio - max_non_snp_ratio) / max(self.small_number, max_non_snp_ratio)
    fitfo['min_snp_ratios'][istart] = min([residual_ratios[cs] for cs in candidate_snps])
    fitfo['candidates'][istart] = {cp : residual_ratios[cp] for cp in candidate_snps}

    if debug:
        # if debug > 1:
        #     print '%70s %s' % ('', ''.join(['%11d' % nm for nm in subxyvals[max_non_snp]['n_mutelist']]))
        for pos in candidate_snps + [max_non_snp, ]:
            xtrastrs = ('[', ']') if pos == max_non_snp else (' ', ' ')  # the largest non-candidate is printed in brackets
            pos_str = '%3s' % str(pos)
            if residual_ratios[pos] > self.min_min_candidate_ratio:
                pos_str = utils.color('yellow', pos_str)
            # trailing comma: no newline yet, the line is ended by the empty print below
            print ' %s %s %5s (%5s / %-5s) %4d / %-4d %s' % (xtrastrs[0], pos_str, fstr(residual_ratios[pos]), fstr(residuals[pos]['zero_icpt']), fstr(residuals[pos]['big_icpt']), sum(subxyvals[pos]['obs']), sum(subxyvals[pos]['total']), xtrastrs[1]),
            # if debug > 1:
            #     print ' ', ''.join(['%4d / %-4d' % (subxyvals[pos]['obs'][inm], subxyvals[pos]['total'][inm]) for inm in range(len(subxyvals[pos]['n_mutelist']))])
            print ''
def plot(self, base_plotdir, cyst_positions=None, tryp_positions=None, only_csv=False):
    """
    Draw the per-gene mutation frequency histograms, and the mean frequency histograms, under <base_plotdir>/mute-freqs.

    Calls self.finalize() first if that hasn't happened yet. For v genes a vertical line is drawn at
    cyst_positions[gene]['cysteine-position'] (if <cyst_positions> is given), and for j genes at
    int(tryp_positions[gene]) (if <tryp_positions> is given). Unless <only_csv> is set, ./bin/makeHtml
    and ./bin/permissify-www are run on the plot dirs afterwards.
    """
    if not self.finalized:
        self.finalize()

    plotdir = base_plotdir + '/mute-freqs'
    file_patterns = ('*.csv', '*.svg')
    utils.prep_dir(plotdir + '/plots', multilings=file_patterns)
    for region in utils.regions:
        utils.prep_dir(plotdir + '/' + region + '/plots', multilings=file_patterns)

    for gene in self.counts:
        counts = self.counts[gene]
        plotting_info = self.plotting_info[gene]  # only used by the (currently disabled) per-base plots
        gene_region = utils.get_region(gene)

        # one bin per position, centered on the position
        sorted_positions = sorted(counts)
        first_pos, last_pos = sorted_positions[0], sorted_positions[-1]
        genehist = Hist(last_pos - first_pos + 1, first_pos - 0.5, last_pos + 0.5, xtitle='fixme', ytitle='fixme')
        for position in sorted_positions:
            posfo = counts[position]
            hi_diff = abs(posfo['freq'] - posfo['freq_hi_err'])
            lo_diff = abs(posfo['freq'] - posfo['freq_lo_err'])
            genehist.set_ibin(genehist.find_bin(position), posfo['freq'], error=0.5*(hi_diff + lo_diff))  # symmetrize the error

        # v and j plots are wider, and get a line at the conserved codon if we were given its position
        xline = None
        figsize = [3, 3]
        if gene_region == 'v' and cyst_positions is not None:
            xline = cyst_positions[gene]['cysteine-position']
            figsize[0] *= 3.5
        elif gene_region == 'j' and tryp_positions is not None:
            xline = int(tryp_positions[gene])
            figsize[0] *= 2

        plotting.draw_no_root(genehist, plotdir=plotdir + '/' + gene_region, plotname=utils.sanitize_name(gene),
                              errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv)

    # make mean mute freq hists
    mean_hist_names = ['all', ] + list(utils.regions)
    for name in mean_hist_names:
        plotting.draw_no_root(self.mean_rates[name], plotname=name + '-mean-freq', plotdir=plotdir, stats='mean',
                              bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)

    if only_csv:
        return

    # write html files and fix permissions
    check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])
    for region in utils.regions:
        check_call(['./bin/makeHtml', plotdir + '/' + region, '1', 'null', 'svg'])
    check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up
def remove_gene(glfo, gene, debug=False):
    """ Delete <gene>'s sequence from <glfo> (in place), along with its conserved codon position if its region has a conserved codon. """
    region = utils.get_region(gene)
    region_seqs = glfo['seqs'][region]

    # nothing to do if it isn't there
    if gene not in region_seqs:
        if debug:
            print(' can\'t remove %s from glfo, it\'s not there' % utils.color_gene(gene))
        return

    if debug:
        print(' removing %s from glfo' % utils.color_gene(gene))
    del region_seqs[gene]

    # regions without an entry in utils.conserved_codons have no position to remove
    codons = utils.conserved_codons[glfo['locus']]
    if region in codons:
        del glfo[codons[region] + '-positions'][gene]
def remove_gene(glfo, gene, debug=False):
    """ Delete <gene>'s sequence from <glfo> (in place), along with its conserved codon position if its region has a conserved codon. """
    region = utils.get_region(gene)
    region_seqs = glfo['seqs'][region]

    # nothing to do if it isn't there
    if gene not in region_seqs:
        if debug:
            print(" can't remove %s from glfo, it's not there" % utils.color_gene(gene))
        return

    if debug:
        print(' removing %s from glfo' % utils.color_gene(gene))
    del region_seqs[gene]

    # regions without an entry in utils.conserved_codons have no position to remove
    codons = utils.conserved_codons[glfo['chain']]
    if region in codons:
        del glfo[codons[region] + '-positions'][gene]
def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False):
    """
    Read the parameter info for <gene_name> from <base_indir> and set up the HMM object that will be written to <outdir>.

    If the gene was observed fewer than args.min_observations_to_write times, the erosion, insertion and
    mutation info is instead read for the genes returned by utils.find_replacement_genes().
    """
    # basic bookkeeping
    self.raw_name = gene_name  # i.e. unsanitized
    self.region = utils.get_region(gene_name)
    self.germline_seqs = glfo['seqs']  # all germline alleles
    self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
    self.indir = base_indir
    self.args = args
    self.debug = debug
    self.codon_positions = {r : glfo[c + '-positions'] for r, c in utils.conserved_codons[args.locus].items()}

    # parameters with values that I more or less made up
    self.precision = '16'  # number of digits after the decimal for probabilities
    self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
    self.n_max_to_interpolate = args.min_observations_to_write
    # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths
    self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}
    self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

    self.outdir = outdir
    # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there
    self.smallest_entry_index = -1

    # which insertions go with this region's hmm (any other region gets none)
    insertions_by_region = {'v' : ['fv'], 'd' : ['vd'], 'j' : ['dj', 'jf']}
    self.insertions = list(insertions_by_region.get(self.region, []))

    assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[0] == 'N'  # maybe need to update some stuff below if this changes

    if self.debug:
        print('%s' % utils.color_gene(gene_name))

    # how many times did we observe this gene in data?
    self.n_occurences = utils.read_single_gene_count(self.indir, gene_name, debug=self.debug)
    min_observations = self.args.min_observations_to_write
    replacement_genes = None
    if self.n_occurences < min_observations:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
        if self.debug:
            print(' only saw it %d times (wanted %d), so use info from all other genes' % (self.n_occurences, min_observations))
        replacement_genes = utils.find_replacement_genes(self.indir, min_observations, gene_name, debug=self.debug)

    self.erosion_probs = self.read_erosion_info(gene_name, replacement_genes)
    self.insertion_probs, self.insertion_content_probs = self.read_insertion_info(gene_name, replacement_genes)
    # actual info in <self.mute_obs> isn't actually used a.t.m.
    self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, locus=self.args.locus, approved_genes=replacement_genes)

    # and finally the hmm itself
    self.track = Track('nukes', utils.nukes)
    self.saniname = utils.sanitize_name(gene_name)
    self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
    overall_gene_prob = utils.read_overall_gene_probs(self.indir, only_gene=gene_name)
    self.hmm.extras['gene_prob'] = max(self.eps, overall_gene_prob)  # if we really didn't see this gene at all, take pity on it and kick it an eps
    mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv')
    self.hmm.extras['overall_mute_freq'] = mean_freq_hist.get_mean()
def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False):
    """
    Read the parameter info for <gene_name> from <base_indir> and set up the HMM object that will be written to <outdir>.

    If the gene was observed fewer than args.min_observations_to_write times, the erosion, insertion and
    mutation info is instead read for the genes returned by utils.find_replacement_genes().
    """
    # basic bookkeeping
    self.raw_name = gene_name  # i.e. unsanitized
    self.region = utils.get_region(gene_name)
    self.germline_seqs = glfo['seqs']  # all germline alleles
    self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
    self.indir = base_indir
    self.args = args
    self.debug = debug
    self.codon_positions = {r : glfo[c + '-positions'] for r, c in utils.conserved_codons[args.chain].items()}

    # parameters with values that I more or less made up
    self.precision = '16'  # number of digits after the decimal for probabilities
    self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
    self.n_max_to_interpolate = args.min_observations_to_write
    # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths
    self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}
    self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

    self.outdir = outdir
    # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there
    self.smallest_entry_index = -1

    # which insertions go with this region's hmm (any other region gets none)
    insertions_by_region = {'v' : ['fv'], 'd' : ['vd'], 'j' : ['dj', 'jf']}
    self.insertions = list(insertions_by_region.get(self.region, []))

    assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[0] == 'N'  # maybe need to update some stuff below if this changes

    if self.debug:
        print('%s' % utils.color_gene(gene_name))

    # how many times did we observe this gene in data?
    self.n_occurences = utils.read_single_gene_count(self.indir, gene_name, debug=self.debug)
    min_observations = self.args.min_observations_to_write
    replacement_genes = None
    if self.n_occurences < min_observations:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
        if self.debug:
            print(' only saw it %d times (wanted %d), so use info from all other genes' % (self.n_occurences, min_observations))
        replacement_genes = utils.find_replacement_genes(self.indir, min_observations, gene_name, debug=self.debug)

    self.erosion_probs = self.read_erosion_info(gene_name, replacement_genes)
    self.insertion_probs, self.insertion_content_probs = self.read_insertion_info(gene_name, replacement_genes)
    # actual info in <self.mute_obs> isn't actually used a.t.m.
    self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, chain=self.args.chain, approved_genes=replacement_genes)

    # and finally the hmm itself
    self.track = Track('nukes', utils.nukes)
    self.saniname = utils.sanitize_name(gene_name)
    self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
    overall_gene_prob = utils.read_overall_gene_probs(self.indir, only_gene=gene_name)
    self.hmm.extras['gene_prob'] = max(self.eps, overall_gene_prob)  # if we really didn't see this gene at all, take pity on it and kick it an eps
    mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv')
    self.hmm.extras['overall_mute_freq'] = mean_freq_hist.get_mean()
def finalize_tigger(self):
    """
    Run self.tigger_calcs() on every position of every v gene in self.counts, printing intermediate results as it goes.

    NOTE(review): this looks like work-in-progress/debugging code -- it contains an unconditional <assert False>, so as
    laid out here it always ends in an AssertionError and the second loop never runs. Confirm the intended nesting.
    """
    # NOTE os.getenv() returns None if $www isn't set, in which case the concatenation raises a TypeError
    utils.prep_dir(os.getenv('www') + '/partis/tmp', wildling='*.svg')
    for gene in self.counts:
        if utils.get_region(gene) != 'v':  # only v genes are handled
            continue
        print '\n%s' % gene
        print ' position x-icpt y-icpt slope mut / total'
        # handed to tigger_calcs() for every position of this gene (presumably it accumulates into it -- TODO confirm)
        mean_x_icpt = {'sum' : 0., 'total' : 0.}
        for position in sorted(self.counts[gene].keys()):
            self.freqs[gene][position]['tigger'] = self.tigger_calcs(position, self.counts[gene][position], mean_x_icpt)
        print mean_x_icpt
        if mean_x_icpt['total'] > 0.:  # avoid dividing by zero
            print mean_x_icpt['sum'] / mean_x_icpt['total']
    # NOTE(review): unconditional stop -- nothing below this line is reachable
    assert False
    for gene in self.freqs:
        if utils.get_region(gene) != 'v':
            continue
        # NOTE(review): this reads the 'tigger-fits' key, whereas the loop above writes 'tigger' -- confirm which is right
        info = {p : self.freqs[gene][p]['tigger-fits'] for p in self.freqs[gene]}
        # x intercept of each fitted line, skipping positions with no intercept or with an intercept of 0.3 or more
        x_intercepts = [-v['intercept'] / v['slope'] for k, v in info.items() if v['intercept'] is not None and v['intercept'] < 0.3]
        print sorted(x_intercepts)
        print sum(x_intercepts) / float(len(x_intercepts))  # mean (would divide by zero for an empty list)
        print numpy.median(x_intercepts)
def write_mute_freqs(self, gene, seq, reco_event, reco_seq_fname):  # TODO unsurprisingly, this function profiles out to be kind of a dumb way to do it, in terms of run time
    """
    Write the bppseqgen input file <reco_seq_fname> for <seq>: a header line, then one line per base.

    The rate for each base is looked up in self.get_mute_freqs(gene) at (index in <seq>) + (the region's
    5p erosion length from <reco_event>), falling back to the 'overall_mean' entry for positions that
    aren't there. Rates are rescaled so that the average site has rate 1.0. The rate column is left out
    if self.args.mutate_from_scratch is set.
    """
    mute_freqs = self.get_mute_freqs(gene)
    n_sites = len(seq)

    # entries in effective_erosions take precedence over those in erosions
    all_erosions = dict(reco_event.erosions.items())
    all_erosions.update(reco_event.effective_erosions.items())
    left_erosion_length = all_erosions[utils.get_region(gene) + '_5p']

    # NOTE mute_freqs only covers areas we could align to, so it isn't in general the same length as <seq>
    rates = []  # relative mutation rate for each position in <seq>
    for inuke in range(n_sites):
        position = inuke + left_erosion_length
        rates.append(mute_freqs[position] if position in mute_freqs else mute_freqs['overall_mean'])
    total = 0.0
    for rate in rates:
        total += rate

    # normalize to the number of sites (i.e. so an average site is given value 1.0)
    assert total != 0.0  # I am not hip enough to divide by zero
    scale = float(n_sites) / total
    rates = [rate * scale for rate in rates]

    # and... double check it
    check_total = 0.0
    for rate in rates:
        check_total += rate
    assert utils.is_normed(check_total / float(n_sites))
    assert len(rates) == n_sites

    # write the input file for bppseqgen, one base per line
    # NOTE really not sure why this doesn't really work with csv.DictWriter [seems to require an "extra" column], but it doesn't -- bppseqgen barfs
    # (I think maybe it expects a different newline character? don't feel like working it out)
    with open(reco_seq_fname, 'w') as reco_seq_file:
        header = 'state'
        if not self.args.mutate_from_scratch:
            header += '\trate'
        reco_seq_file.write(header + '\n')
        for nuke, rate in zip(seq, rates):
            line = nuke
            if not self.args.mutate_from_scratch:
                line += '\t%f' % rate
            reco_seq_file.write(line + '\n')
def add_some_snps(snps_to_add, glfo, remove_template_genes=False, debug=False):
    """
    Generate some snp'd genes and add them to glfo, specified with <snps_to_add>.
    e.g. [{'gene' : 'IGHV3-71*01', 'positions' : (35, None)}, ] will add a snp at position 35 and at a random location.
    The resulting snp'd gene will have a name like IGHV3-71*01+C35T.T47G
    Returns the names of the new genes, in the same order as <snps_to_add>.
    """
    max_tries = 100  # give up if this many generated genes in a row were already in glfo
    templates_to_remove = set()
    added_snp_names = []
    for snpinfo in snps_to_add:
        gene = snpinfo['gene']
        positions = snpinfo['positions']
        print(' adding %d %s to %s' % (len(positions), utils.plural_str('snp', len(positions)), gene))

        region = utils.get_region(gene)
        seq = glfo['seqs'][region][gene]
        assert region == 'v'  # only v genes are supported
        cpos = glfo['cyst-positions'][gene]

        # keep generating until we get a gene that isn't already in glfo
        snpfo = generate_snpd_gene(gene, cpos, seq, positions)
        n_tries = 1
        while snpfo['gene'] in glfo['seqs'][region]:
            print(' already in glfo, try again')
            if n_tries >= max_tries:
                raise Exception('too many tries while trying to generate new snps -- did you specify a lot of snps on the same position?')
            snpfo = generate_snpd_gene(gene, cpos, seq, positions)
            n_tries += 1

        if remove_template_genes:
            templates_to_remove.add(gene)
        # *don't* remove the templates here, since we don't know if there's another snp later that needs them
        add_new_allele(glfo, snpfo, remove_template_genes=False, debug=debug)
        added_snp_names.append(snpfo['gene'])

    remove_the_stupid_godamn_template_genes_all_at_once(glfo, templates_to_remove)  # works fine with zero-length <templates_to_remove>

    return added_snp_names  # need the order of the names so we can get allele prevalence freqs from the command line right
def read_allele_prevalence_freqs(fname, debug=False):
    """
    Read the 'gene' and 'freq' columns of the csv <fname>, and return {region : {gene : freq}}.
    Every region in utils.regions gets an entry, and each non-empty region's freqs must pass utils.is_normed().
    """
    # NOTE kinda weird to mash all the regions into one file here (as compared to parametercounter), but it seems to make more sense
    allele_prevalence_freqs = {}
    for region in utils.regions:
        allele_prevalence_freqs[region] = {}

    with open(fname) as pfile:
        for line in csv.DictReader(pfile):
            gene = line["gene"]
            allele_prevalence_freqs[utils.get_region(gene)][gene] = float(line["freq"])

    for region in utils.regions:
        region_freqs = allele_prevalence_freqs[region]
        if not region_freqs:  # nothing in the file for this region
            continue
        if debug:
            for gene, freq in region_freqs.items():
                print("%14.8f %s" % (freq, utils.color_gene(gene)))
        assert utils.is_normed(region_freqs)

    return allele_prevalence_freqs
def read_allele_prevalence_freqs(fname, debug=False):
    """
    Read the 'gene' and 'freq' columns of the csv <fname>, and return {region : {gene : freq}}.
    Every region in utils.regions gets an entry, and each non-empty region's freqs must pass utils.is_normed().
    """
    # NOTE kinda weird to mash all the regions into one file here (as compared to parametercounter), but it seems to make more sense
    allele_prevalence_freqs = {}
    for region in utils.regions:
        allele_prevalence_freqs[region] = {}

    with open(fname) as pfile:
        for line in csv.DictReader(pfile):
            gene = line['gene']
            allele_prevalence_freqs[utils.get_region(gene)][gene] = float(line['freq'])

    for region in utils.regions:
        region_freqs = allele_prevalence_freqs[region]
        if not region_freqs:  # nothing in the file for this region
            continue
        if debug:
            for gene, freq in region_freqs.items():
                print('%14.8f %s' % (freq, utils.color_gene(gene)))
        assert utils.is_normed(region_freqs)

    return allele_prevalence_freqs
def get_allowed_genes(self, parameter_dir):
    """
    Return {region : set of genes} for every region except d, where the genes are those listed in the
    <region>_gene parameter file in <parameter_dir>. If --only-genes was specified, each set is further
    restricted to the --only-genes genes of that region.

    Raises an Exception if --only-genes includes a gene (from a non-d region) that isn't in the parameter file.
    """
    allowed_genes = {}
    for region in utils.regions:
        if region == 'd':
            continue

        column = region + '_gene'
        param_fname = parameter_dir + '/' + utils.get_parameter_fname(column=column, deps=utils.column_dependencies[column])
        with open(param_fname) as csvfile:
            genes_in_file = set(line[column] for line in csv.DictReader(csvfile))
        allowed_genes[region] = genes_in_file

        if self.args.only_genes is None:
            continue

        # if --only-genes was specified, not only does the gene have to be in the parameter file, but it has to be among --only-genes
        regional_only_genes = set(g for g in self.args.only_genes if utils.get_region(g) == region)
        missing_genes = regional_only_genes - genes_in_file
        if len(missing_genes) > 0:  # if command line asked for genes that aren't in the file
            raise Exception('genes %s specified with --only-genes are not present in %s, so there\'s no information with which to simulate' % (' '.join(missing_genes), parameter_dir))
        allowed_genes[region] &= regional_only_genes

    return allowed_genes
def make_allele_finding_plot(plotdir, gene, position, values, xmax, fitfos=None):
    """
    Plot mutation frequency (with error bars) vs number of mutations for <position> in <gene>, and write it to <plotdir>.
    If <fitfos> is given, also draw the fitted line for each of its 'prefo', 'postfo' and 'onefo' entries that has x values.
    """
    xmin = -0.3
    _, ax = mpl_init()
    ax.errorbar(values['n_mutelist'], values['freqs'], yerr=values['errs'], markersize=15, linewidth=2, marker='.')

    if fitfos is not None:  # fitted lines
        colors = {'prefo' : 'red', 'postfo' : 'red', 'onefo' : 'green'}
        for ftype, color in colors.items():
            fitfo = fitfos[ftype]
            if fitfo['xvals'] is None:  # not really sure why this happens... probably zero-point fits?
                continue
            linevals = [fitfo['slope']*x + fitfo['y_icpt'] for x in fitfo['xvals']]
            ax.plot(fitfo['xvals'], linevals, color=color)

    # dashed line along y = 0
    ax.plot([xmin, xmax], [0, 0], linestyle='dashed', alpha=0.5, color='black')

    ymax = max(values['freqs']) + max(values['errs'])
    position_str = str(position)
    mpl_finish(ax, plotdir, position_str,
               xlabel='mutations in %s segment' % utils.get_region(gene),
               ylabel='position\'s mut freq',
               xbounds=(xmin, xmax),
               ybounds=(-0.01, ymax),
               leg_loc=(0.95, 0.1),
               adjust={'right' : 0.85},
               title='position ' + position_str + ' in ' + gene)
def restrict_to_genes(glfo, only_genes, debug=False):
    """
    Remove from <glfo> any genes which are not in <only_genes>.
    Any regions which are not represented in a non-None <only_genes> will be unrestricted (i.e. any gene from that region is fair game).
    Does nothing if <only_genes> is None. Prints a warning for any genes in <only_genes> that aren't in <glfo>.
    """
    if only_genes is None:
        return

    requested_genes = set(only_genes)
    restricted_regions = set(utils.get_region(g) for g in requested_genes)
    # everything that glfo currently has for the regions we're restricting (only built once)
    glfo_genes = set(g for r in restricted_regions for g in glfo['seqs'][r])

    only_genes_not_in_glfo = requested_genes - glfo_genes
    if len(only_genes_not_in_glfo) > 0:
        print(' %s genes %s in <only_genes> aren\'t in glfo to begin with' % (utils.color('red', 'warning'), ' '.join(only_genes_not_in_glfo)))

    genes_to_remove = glfo_genes - requested_genes
    if debug:
        print(' removing %d genes from glfo' % len(genes_to_remove))
    remove_genes(glfo, genes_to_remove)
def make_mutefreq_plot(plotdir, gene_name, positions):
    """
    Draw the per-base frequencies at each position for <gene_name>, and write the plot to <plotdir>.

    <positions> is a list with one entry per bin, each with a 'name' (used for the label below the bin) and
    'nuke_freqs' ({base : frequency}). Within each bin the bases are stacked in order of decreasing frequency.
    NOTE shares a lot with make_transition_plot() in bin/plot-hmms.py.
    """
    import plotting  # imported here rather than at the top of the file, as before (presumably so it's only loaded when we actually plot)

    nuke_colors = {'A' : 'red', 'C' : 'blue', 'G' : 'orange', 'T' : 'green'}
    alpha = 0.6
    fig, ax = plotting.mpl_init()
    fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)])
    print(utils.color_gene(utils.unsanitize_name(gene_name)))

    legend_colors = set()  # colors that already have a legend entry
    for ibin, info in enumerate(positions):
        # make label below bin
        ax.text(-0.5 + ibin, -0.075, simplify_state_name(info['name']), rotation='vertical', size=8)

        total = 0.0  # height of the stack so far
        for nuke, prob in sorted(info['nuke_freqs'].items(), key=operator.itemgetter(1), reverse=True):
            color = nuke_colors[nuke]
            label_to_use = None
            if color not in legend_colors:  # only label each color the first time we see it
                label_to_use = nuke
                legend_colors.add(color)

            # horizontal line at height total+prob
            ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob], color=color, alpha=alpha, linewidth=3, label=label_to_use)
            # vertical line from total to total + prob
            ax.plot([ibin, ibin], [total + 0.01, total + prob], color=color, alpha=alpha, linewidth=3)

            total += prob

    ax.get_xaxis().set_visible(False)
    plotting.mpl_finish(ax, plotdir, gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(positions) + 3),
                        leg_loc=(0.95, 0.1), adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
def write_hmms(self, parameter_dir, sw_matches):
    """
    Write an hmm yaml file to <parameter_dir>/hmms for each gene, using the parameters in <parameter_dir>.

    The genes are those in self.args.only_genes if it's set, otherwise every germline gene that's in
    <sw_matches> (or every germline gene, if <sw_matches> is None).
    """
    print('writing hmms with info from %s' % parameter_dir)
    start = time.time()
    from hmmwriter import HmmWriter
    hmm_dir = parameter_dir + '/hmms'
    utils.prep_dir(hmm_dir, '*.yaml')

    gene_list = self.args.only_genes
    if gene_list is None:  # if specific genes weren't specified, do the ones for which we have matches
        gene_list = [gene
                     for region in utils.regions
                     for gene in self.germline_seqs[region]
                     if sw_matches is None or gene in sw_matches]  # <sw_matches> shouldn't be None really, but I'm testing something

    for gene in gene_list:
        if self.args.debug:
            print(' %s' % utils.color_gene(gene))
        writer = HmmWriter(parameter_dir, hmm_dir, gene, self.args.naivety, self.germline_seqs[utils.get_region(gene)][gene], self.args)
        writer.write()

    print(' time to write hmms: %.3f' % (time.time() - start))
def run_bppseqgen(self, seq, chosen_tree, gene_name, reco_event, seed, is_insertion=False):
    """
    Run bppseqgen on <seq>, and return the list of mutated sequences (one for each leaf of <chosen_tree>).

    Note that <seq> is in general a piece of the full sequence (say, the V region), since we have
    different mutation models for different regions. A zero-length <seq> is not passed to bppseqgen:
    an empty string is returned for each leaf node instead. The temporary files in self.workdir are
    removed afterwards unless self.args.no_clean is set.
    """
    # NOTE should really do something other than just use the v model for insertion mutations
    region = 'v' if is_insertion else utils.get_region(gene_name)

    if len(seq) == 0:  # zero length insertion (or d)
        n_leaf_nodes = len(re.compile('t[0-9][0-9]*').findall(chosen_tree))  # find number of leaf nodes
        return [''] * n_leaf_nodes  # return an empty string for each leaf node

    # work out the temporary file names
    label = gene_name[:2] if is_insertion else utils.get_region(gene_name)
    fname_base = self.workdir + '/' + label
    treefname = fname_base + '-tree.tre'
    reco_seq_fname = fname_base + '-start-seq.txt'
    leaf_seq_fname = fname_base + '-leaf-seqs.fa'

    # write the tree and the per-position rates to tmp files
    with opener('w')(treefname) as treefile:
        treefile.write(chosen_tree)
    self.write_mute_freqs(region, gene_name, seq, reco_event, reco_seq_fname, is_insertion=is_insertion)

    # build up the command line
    # docs: http://biopp.univ-montp2.fr/apidoc/bpp-phyl/html/classbpp_1_1GTR.html that page is too darn hard to google
    cwd = os.getcwd()
    bpp_binary = cwd + '/packages/bpp/bin/bppseqgen'
    if not os.path.exists(bpp_binary):
        print('ERROR bpp not found in %s' % os.path.dirname(bpp_binary))
        assert False

    gtr_pars = self.mute_models[region]['gtr']
    command = 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:' + cwd + '/packages/bpp/lib\n'
    command += bpp_binary
    command += ''.join([
        ' input.tree.file=' + treefname,
        ' output.sequence.file=' + leaf_seq_fname,
        ' number_of_sites=' + str(len(seq)),
        ' input.tree.format=Newick',
        ' output.sequence.format=Fasta\\(\\)',
        ' alphabet=DNA',
        ' --seed=' + str(seed),
        ' model=GTR\\(',
    ])
    command += ''.join(par + '=' + gtr_pars[par] + ',' for par in gtr_pars)
    command = command.rstrip(',')  # drop the comma after the last parameter
    command += '\\)'
    # NOTE should I use the "equilibrium frequencies" option?
    command += " rate_distribution='Gamma(n=4,alpha=" + self.mute_models[region]['gamma']['alpha'] + ")'"
    command += ''.join([
        ' input.infos.states=state',
        ' input.infos=' + reco_seq_fname,
        ' input.infos.rates=rate',
    ])
    check_output(command, shell=True)

    # get the leaf node sequences from the file that bppseqgen wrote
    mutated_seqs = [str(seq_record.seq) for seq_record in SeqIO.parse(leaf_seq_fname, "fasta")]

    if not self.args.no_clean:  # clean up temp files
        for fname in (reco_seq_fname, treefname, leaf_seq_fname):
            os.remove(fname)

    return mutated_seqs
def plot_single_variable(args, varname, hlist, outdir, pathnameclues):
    """
    Draw the histograms in <hlist> on one plot called <varname> in <outdir>, with plotting.draw_no_root().

    Most of the function works out the cosmetics (titles, bounds, figure size, legend position, stats) from
    <varname>, <pathnameclues> and the command line <args>.
    Raises an Exception if <varname> contains '-mean-bins'.
    """
    if varname in plotconfig.gene_usage_columns:
        hlist = plotting.add_bin_labels_not_in_all_hists(hlist)

    # defaults, which get overridden below
    no_labels = False
    xline, bounds, figsize = None, None, None
    stats = args.extra_stats
    translegend = [0.0, -0.2]
    xtitle, ytitle = hlist[0].xtitle, hlist[0].ytitle
    if xtitle == '':  # arg, plotting.py thinks default should be None, hist.py thinks it's ''
        xtitle = None
    if '-mean-bins' in varname:
        raise Exception('darn, I was hoping I wasn\'t making these plots any more')
    plottitle = plotconfig.plot_titles[varname] if varname in plotconfig.plot_titles else varname

    ytitle = 'frequency' if args.normalize else 'counts'  # NOTE this replaces the y title that was read from the first hist above

    if 'mute-freqs/v' in pathnameclues or 'mute-freqs/d' in pathnameclues or 'mute-freqs/j' in pathnameclues:
        assert not args.normalize
        ytitle = 'mutation freq'

    if varname in plotconfig.gene_usage_columns:
        xtitle = 'allele'
        if hlist[0].n_bins == 2:
            stats = ' 0-bin'  # print the fraction of entries in the zero bin into the legend (i.e. the fraction correct)
    # elif hlist[0].bin_labels.count('') == hlist[0].n_bins + 2:
    #     xtitle = '???'

    line_width_override = None
    # NOTE setdefault() below also inserts the key (with value None) if it isn't there yet (assuming these plotconfig objects are plain dicts)
    if args.performance_plots:
        if 'hamming_to_true_naive' in varname:
            xtitle = 'hamming distance'
            if '_normed' in varname:
                xtitle = 'fractional ' + xtitle
        elif '_vs_mute_freq' in varname:
            xtitle = 'mutation freq'
            ytitle = 'fraction correct'
            if varname[0] == 'v' or varname[0] == 'j':
                translegend = [-0.4, -0.4]
        elif varname.find('_gene') == 1:  # i.e. '_gene' comes right after the first character
            xtitle = ''
            ytitle = 'fraction correct'
        else:
            xtitle = 'inferred - true'
        # NOTE(review): laid out here as applying to all performance plots, not just the final <else> -- confirm
        bounds = plotconfig.true_vs_inferred_hard_bounds.setdefault(varname, None)
    else:
        bounds = plotconfig.default_hard_bounds.setdefault(varname, None)
        if bounds is None and 'insertion' in varname:
            bounds = plotconfig.default_hard_bounds.setdefault('all_insertions', None)
        if varname in plotconfig.gene_usage_columns:
            no_labels = True
            if 'j_' not in varname:
                figsize = (10, 5)
            # NOTE(review): laid out here as applying to every gene usage column, not just the non-j ones -- confirm
            line_width_override = 1
        elif 'per-gene-per-position/v' in pathnameclues:
            figsize = (20, 5)
            bounds = plotconfig.default_hard_bounds.setdefault(utils.unsanitize_name(varname), None)

    if 'IG' in varname or 'TR' in varname:  # i.e. <varname> contains a gene name
        if 'mute-freqs' in pathnameclues:  # in this case <varname> is just the (sanitized) gene name
            gene = utils.unsanitize_name(varname)
            plottitle = gene  # + ' -- mutation frequency'
            xtitle = 'position'
            if utils.get_region(gene) == 'j':
                translegend = [0.1, 0.]  #(-0.35, -0.02)
            else:
                translegend = [0.15, -0.02]
            xline = None
            # draw a line at the conserved codon position, if we have a glfo and this region has a conserved codon
            if args.glfo is not None:
                if utils.get_region(gene) in utils.conserved_codons[args.locus]:
                    xline = args.glfo[utils.conserved_codons[args.locus][utils.get_region(gene)] + '-positions'][gene]
        else:  # otherwise <varname> is the gene name, then a dash, then the base variable name
            ilastdash = varname.rfind('-')
            gene = utils.unsanitize_name(varname[:ilastdash])
            base_varname = varname[ilastdash + 1:]
            base_plottitle = plotconfig.plot_titles[base_varname] if base_varname in plotconfig.plot_titles else ''
            plottitle = gene + ' -- ' + base_plottitle

    if len(hlist) > 9:  # skootch it down so they (maybe) all fit
        translegend[1] -= 0.5
    if args.translegend is not None:  # override with the command line
        translegend = args.translegend
    if args.extra_stats == 'auto':  # kind of hackey
        if xtitle == 'inferred - true':
            stats = 'absmean'
        else:
            stats = 'mean'
    # draw that little #$*(!
    linewidths = [line_width_override, ] if line_width_override is not None else args.linewidths
    alphas = [0.6 for _ in range(len(hlist))]
    plotting.draw_no_root(hlist[0], plotname=varname, plotdir=outdir, more_hists=hlist[1:], write_csv=False, stats=stats, bounds=bounds,
                          shift_overflows=(os.path.basename(outdir) != 'gene-call'), plottitle=plottitle, colors=args.colors,
                          xtitle=xtitle, ytitle=ytitle, xline=xline, normalize=(args.normalize and '_vs_mute_freq' not in varname),
                          linewidths=linewidths, alphas=alphas, errors=True, figsize=figsize, no_labels=no_labels, log=args.log, translegend=translegend)
if opt == "--region": region = val if opt == "--size": kwargs['size'] = int(val) if opt == "--name": kwargs['name'] = val if opt == "--desc": kwargs['desc'] = val if len(args) != 2: usage("incorrect number of arguments") snapshot_id = args[0] virt = args[1] arch = arch if arch else utils.get_arch() region = region if region else utils.get_region() if not virt in ('hvm', 'pvm'): fatal("virtualization type not supported: %s" % virt) ami_id, ami_name = register(snapshot_id, region, virt, arch, **kwargs) print ami_id, ami_name if __name__ == "__main__": main()
tree = input_tfile.Get('Nominal') entries = tree.GetEntries() for entry in range(entries): if entry % 10000 == 0: print('*** processed {0} out of {1}'.format(entry, entries)) # get the next tree in the chain and verify ientry = tree.LoadTree(entry) if ientry < 0: break # copy next entry into memory and verify nb = tree.GetEntry(entry) if nb <= 0: continue for idv, nTracks in enumerate(tree.DV_nTracks): rIndex = utils.get_region(tree, idv) if rIndex < 0: continue if not tree.DV_passFidCuts[idv] or not tree.DV_passChisqCut[ idv] or not tree.DV_passDistCut[idv]: continue if nTracks == 2: h_mass_2[rIndex].Fill(tree.DV_m[idv]) elif nTracks == 3: get_2track_mass_from_3track_dv(tree, idv, h_mass_2_in_3[rIndex]) else: continue #print('3-track mass = ' + str(tree.DV_m[idv])) output_tfile = TFile('output_ks_method.root', 'recreate') for region in range(12):
def __init__(self, region=None):
    """ Connect to <region>, or to whatever utils.get_region() returns if <region> is None or empty. No snapshot is attached yet. """
    if not region:
        region = utils.get_region()
    self.region = region
    self.conn = utils.connect(self.region)
    self.snap = None
def __init__(self, region=None):
    """ Connect to <region>, or to whatever utils.get_region() returns if <region> is None or empty. No volume or device is attached yet. """
    if not region:
        region = utils.get_region()
    self.region = region
    self.conn = utils.connect(self.region)
    self.vol = None
    self.device = None