def __init__(self, args, input_info, reco_info, germline_seqs, parameter_dir, write_parameters=False, plotdir=None):
    self.parameter_dir = parameter_dir
    self.plotdir = plotdir
    self.args = args
    self.input_info = input_info
    self.reco_info = reco_info
    self.germline_seqs = germline_seqs
    self.pcounter, self.true_pcounter = None, None
    if write_parameters:
        self.pcounter = ParameterCounter(self.germline_seqs)
        if not self.args.is_data:
            self.true_pcounter = ParameterCounter(self.germline_seqs)
    self.info = {}
    self.info['all_best_matches'] = set()  # set of all the matches we found (for *all* queries)
    self.info['skipped_unproductive_queries'] = []  # list of unproductive queries
    if self.args.apply_choice_probs_in_sw:
        if self.args.debug:
            print '  reading gene choice probs from', parameter_dir
        self.gene_choice_probs = utils.read_overall_gene_probs(parameter_dir)

    with opener('r')(self.args.datadir + '/v-meta.json') as json_file:  # get location of <begin> cysteine in each v region
        self.cyst_positions = json.load(json_file)
    with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file:  # get location of <end> tryptophan in each j region (TGG)
        tryp_reader = csv.reader(csv_file)
        self.tryp_positions = {row[0]: row[1] for row in tryp_reader}  # WARNING: this doesn't filter out the header line

    self.outfile = None
    if self.args.outfname != None:
        self.outfile = open(self.args.outfname, 'a')

    self.n_unproductive = 0
    self.n_total = 0
def __init__(self, args):
    self.args = args
    self.germline_seqs = utils.read_germlines(self.args.datadir)  #, add_fp=True
    with opener('r')(self.args.datadir + '/v-meta.json') as json_file:  # get location of <begin> cysteine in each v region
        self.cyst_positions = json.load(json_file)
    with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file:  # get location of <end> tryptophan in each j region (TGG)
        tryp_reader = csv.reader(csv_file)
        self.tryp_positions = {row[0]: row[1] for row in tryp_reader}  # WARNING: this doesn't filter out the header line

    self.precluster_info = {}
    if self.args.seqfile is not None:
        self.input_info, self.reco_info = get_seqfile_info(self.args.seqfile, self.args.is_data, self.germline_seqs, self.cyst_positions, self.tryp_positions,
                                                           self.args.n_max_queries, self.args.queries, self.args.reco_ids)

    self.outfile = None
    if self.args.outfname != None:
        if os.path.exists(self.args.outfname):
            os.remove(self.args.outfname)
        self.outfile = open(self.args.outfname, 'a')
def __init__(self, args):
    self.args = args
    self.germline_seqs = utils.read_germlines(self.args.datadir, remove_N_nukes=True)
    self.perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'ihhhmmm')
    self.details = OrderedDict()
    self.failtails = {}
    self.n_partially_failed = 0

    # get sequence info that was passed to ihhhmmm
    self.siminfo = OrderedDict()
    self.sim_need = []  # list of queries that we still need to find
    with opener('r')(self.args.simfname) as seqfile:
        reader = csv.DictReader(seqfile)
        iline = 0
        for line in reader:
            if self.args.queries != None and line['unique_id'] not in self.args.queries:
                continue
            self.siminfo[line['unique_id']] = line
            self.sim_need.append(line['unique_id'])
            iline += 1
            if args.n_queries > 0 and iline >= args.n_queries:
                break

    fostream_names = glob.glob(self.args.indir + '/*.fostream')
    if len(fostream_names) == 0:
        raise Exception('no fostreams found in %s' % args.indir)
    fostream_names.sort()  # maybe already sorted?
    for infname in fostream_names:
        if len(self.sim_need) == 0:
            break

        # try to get whatever you can for the failures
        unique_ids = self.find_partial_failures(infname)  # returns list of unique ids in this file

        with opener('r')(infname) as infile:
            self.parse_file(infile, unique_ids)

    # now check that we got results for all the queries we wanted
    n_failed = 0
    for unique_id in self.siminfo:
        if unique_id not in self.details and unique_id not in self.failtails:
            print '%-20s  no info' % unique_id
            self.perfplotter.add_fail()
            n_failed += 1

    print ''
    print 'partially failed: %d / %d = %.2f' % (self.n_partially_failed, len(self.siminfo), float(self.n_partially_failed) / len(self.siminfo))
    print 'failed:           %d / %d = %.2f' % (n_failed, len(self.siminfo), float(n_failed) / len(self.siminfo))
    print ''

    self.perfplotter.plot()
def check_tree_simulation(self, leaf_seq_fname, chosen_tree_str, reco_event=None):
    """ See how well we can reconstruct the true tree """
    clean_up = False
    if leaf_seq_fname == '':  # we need to make the leaf seq file based on info in reco_event
        clean_up = True
        leaf_seq_fname = self.workdir + '/leaf-seqs.fa'
        with opener('w')(leaf_seq_fname) as leafseqfile:
            for iseq in range(len(reco_event.final_seqs)):
                leafseqfile.write('>t' + str(iseq + 1) + '\n')  # NOTE the *order* of the seqs doesn't correspond to the tN number. does it matter?
                leafseqfile.write(reco_event.final_seqs[iseq] + '\n')

    with opener('w')(os.devnull) as fnull:
        inferred_tree_str = check_output('FastTree -gtr -nt ' + leaf_seq_fname, shell=True, stderr=fnull)
    if clean_up and not self.args.no_clean:
        os.remove(leaf_seq_fname)

    chosen_tree = dendropy.Tree.get_from_string(chosen_tree_str, 'newick')
    inferred_tree = dendropy.Tree.get_from_string(inferred_tree_str, 'newick')
    if self.args.debug:
        print '        tree diff -- symmetric %d   euke %f   rf %f' % (chosen_tree.symmetric_difference(inferred_tree), chosen_tree.euclidean_distance(inferred_tree), chosen_tree.robinson_foulds_distance(inferred_tree))
def get_seqfile_info(fname, is_data, germline_seqs=None, cyst_positions=None, tryp_positions=None, n_max_queries=-1, queries=None, reco_ids=None):
    """ return list of sequence info from files of several types """

    if not is_data:
        assert germline_seqs is not None
        assert cyst_positions is not None
        assert tryp_positions is not None

    if '.csv' in fname:
        delimiter = ','
        name_column = 'unique_id'
        seq_column = 'seq'
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    elif '.tsv' in fname:
        delimiter = '\t'
        name_column = 'name'
        seq_column = 'nucleotide'
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    elif '.fasta' in fname or '.fa' in fname or '.fastq' in fname or '.fq' in fname:
        name_column = 'unique_id'
        seq_column = 'seq'
        reader = []
        n_fasta_queries = 0
        ftype = 'fasta' if ('.fasta' in fname or '.fa' in fname) else 'fastq'
        for seq_record in SeqIO.parse(fname, ftype):
            reader.append({})
            reader[-1][name_column] = seq_record.name
            reader[-1][seq_column] = str(seq_record.seq).upper()
            n_fasta_queries += 1
            if n_max_queries > 0 and n_fasta_queries >= n_max_queries:
                break
    else:
        print 'ERROR unrecognized file format %s' % fname
        assert False

    input_info, reco_info = OrderedDict(), OrderedDict()
    n_queries = 0
    for line in reader:
        utils.intify(line)
        # if command line specified query or reco ids, skip other ones
        if queries is not None and line[name_column] not in queries:
            continue
        if reco_ids is not None and line['reco_id'] not in reco_ids:
            continue

        input_info[line[name_column]] = {'unique_id': line[name_column], 'seq': line[seq_column]}
        if not is_data:
            reco_info[line['unique_id']] = line
            utils.add_match_info(germline_seqs, line, cyst_positions, tryp_positions)
        n_queries += 1
        if n_max_queries > 0 and n_queries >= n_max_queries:
            break

    if len(input_info) == 0:
        print 'ERROR didn\'t end up pulling any input info out of %s' % fname
        assert False

    return (input_info, reco_info)
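def example_get_seqfile_info_usage(simfname, datadir):
    # Hedged usage sketch (not part of the original code) showing how get_seqfile_info() above is
    # typically wired up, mirroring the caller a few functions back; <simfname> and <datadir> are
    # hypothetical paths, and the germline/codon-position readers are the same ones used there.
    germline_seqs = utils.read_germlines(datadir)
    with opener('r')(datadir + '/v-meta.json') as json_file:
        cyst_positions = json.load(json_file)
    with opener('r')(datadir + '/j_tryp.csv') as csv_file:
        tryp_positions = {row[0]: row[1] for row in csv.reader(csv_file)}
    # returns two OrderedDicts keyed by unique_id: the raw input seqs, and (for simulation) the true rearrangement info
    input_info, reco_info = get_seqfile_info(simfname, is_data=False, germline_seqs=germline_seqs,
                                             cyst_positions=cyst_positions, tryp_positions=tryp_positions,
                                             n_max_queries=100)
    return input_info, reco_info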
def __init__(self, args, seed, sublabel=None):
    self.args = args

    if sublabel == None:
        self.workdir = self.args.workdir + '/recombinator'
        self.outfname = self.args.outfname
    else:  # need a separate workdir for each subprocess
        self.workdir = self.args.workdir + '/recombinator-' + sublabel
        self.outfname = self.workdir + '/' + os.path.basename(self.args.outfname)
    utils.prep_dir(self.workdir)

    if not self.args.simulate_partially_from_scratch:
        parameter_dir = self.args.parameter_dir
    else:  # we start from scratch, except for the mute freq stuff
        parameter_dir = self.args.scratch_mute_freq_dir

    if parameter_dir is None or not os.path.exists(parameter_dir):
        raise Exception('parameter dir %s d.n.e.' % parameter_dir)  # use %s so a None value doesn't blow up the error message itself

    self.index_keys = {}  # this is kind of hackey, but I suspect indexing my huge table of freqs with a tuple is better than a dict
    self.mute_models = {}
    # self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end
    for region in utils.regions:
        self.mute_models[region] = {}
        for model in ['gtr', 'gamma']:
            self.mute_models[region][model] = {}

    self.glfo = glutils.read_glfo(self.args.initial_datadir, self.args.chain, only_genes=self.args.only_genes)

    self.allowed_genes = self.get_allowed_genes(parameter_dir)  # set of genes a) for which we read per-position mutation information and b) from which we choose when running partially from scratch
    self.version_freq_table = self.read_vdj_version_freqs(parameter_dir)  # list of the probabilities with which each VDJ combo (plus other rearrangement parameters) appears in data
    self.insertion_content_probs = self.read_insertion_content(parameter_dir)
    self.all_mute_freqs = {}
    self.parameter_dir = parameter_dir  # damnit, I guess I do need to save this in self

    # read shm info NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
    with opener('r')(self.args.gtrfname) as gtrfile:  # read gtr parameters
        reader = csv.DictReader(gtrfile)
        for line in reader:
            parameters = line['parameter'].split('.')
            region = parameters[0][3].lower()
            assert region == 'v' or region == 'd' or region == 'j'
            model = parameters[1].lower()
            parameter_name = parameters[2]
            assert model in self.mute_models[region]
            self.mute_models[region][model][parameter_name] = line['value']

    treegen = treegenerator.TreeGenerator(args, parameter_dir, seed=seed)
    self.treefname = self.workdir + '/trees.tre'
    treegen.generate_trees(seed, self.treefname)
    with opener('r')(self.treefname) as treefile:  # read in the trees (and other info) that we just generated
        self.treeinfo = treefile.readlines()
    os.remove(self.treefname)

    if os.path.exists(self.outfname):
        os.remove(self.outfname)
    elif not os.path.exists(os.path.dirname(os.path.abspath(self.outfname))):
        os.makedirs(os.path.dirname(os.path.abspath(self.outfname)))
def __init__(self, args, seed, sublabel=None):
    self.args = args

    if sublabel == None:
        self.workdir = self.args.workdir + '/recombinator'
        self.outfname = self.args.outfname
    else:  # need a separate workdir for each subprocess
        self.workdir = self.args.workdir + '/recombinator-' + sublabel
        self.outfname = self.workdir + '/' + os.path.basename(self.args.outfname)
    utils.prep_dir(self.workdir)

    if not os.path.exists(self.args.parameter_dir):
        raise Exception('parameter dir ' + self.args.parameter_dir + ' d.n.e')

    # parameters that control recombination, erosion, and whatnot
    self.index_keys = {}  # this is kind of hackey, but I suspect indexing my huge table of freqs with a tuple is better than a dict
    self.version_freq_table = {}  # list of the probabilities with which each VDJ combo appears in data
    self.mute_models = {}
    # self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end
    for region in utils.regions:
        self.mute_models[region] = {}
        for model in ['gtr', 'gamma']:
            self.mute_models[region][model] = {}

    # first read info that doesn't depend on which person we're looking at
    self.glfo = utils.read_germline_set(self.args.datadir)

    # then read stuff that's specific to each person
    self.read_vdj_version_freqs(self.args.parameter_dir + '/' + utils.get_parameter_fname('all'))
    self.allowed_genes = self.get_allowed_genes(self.args.parameter_dir)  # only really used if <self.args.uniform_vj_choice_probs> is set, but it also checks the sensibility of <self.args.only_genes>
    self.insertion_content_probs = None
    self.read_insertion_content()

    # read shm info NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
    with opener('r')(self.args.gtrfname) as gtrfile:  # read gtr parameters
        reader = csv.DictReader(gtrfile)
        for line in reader:
            parameters = line['parameter'].split('.')
            region = parameters[0][3].lower()
            assert region == 'v' or region == 'd' or region == 'j'
            model = parameters[1].lower()
            parameter_name = parameters[2]
            assert model in self.mute_models[region]
            self.mute_models[region][model][parameter_name] = line['value']

    treegen = treegenerator.TreeGenerator(args, self.args.parameter_dir, seed=seed)
    self.treefname = self.workdir + '/trees.tre'
    treegen.generate_trees(seed, self.treefname)
    with opener('r')(self.treefname) as treefile:  # read in the trees (and other info) that we just generated
        self.treeinfo = treefile.readlines()
    if not self.args.no_clean:
        os.remove(self.treefname)

    if os.path.exists(self.outfname):
        os.remove(self.outfname)
    elif not os.path.exists(os.path.dirname(os.path.abspath(self.outfname))):
        os.makedirs(os.path.dirname(os.path.abspath(self.outfname)))
def __init__(self, args, glfo, seed, workdir, outfname):  # NOTE <gldir> is not in general the same as <args.initial_germline_dir>
    self.args = args
    self.glfo = glfo

    # NOTE in general *not* the same as <self.args.workdir> and <self.args.outfname>
    self.workdir = workdir
    self.outfname = outfname
    utils.prep_dir(self.workdir)

    # set <self.parameter_dir> (note that this is in general *not* the same as self.args.parameter_dir)
    if self.args.rearrange_from_scratch:  # currently not allowed to mutate from scratch without also rearranging from scratch (enforced in bin/partis)
        if self.args.mutate_from_scratch:
            self.parameter_dir = None
        else:
            self.parameter_dir = self.args.scratch_mute_freq_dir  # if you make up mute freqs from scratch, unless you're really careful you tend to get nonsense results for a lot of things (e.g. allele finding). So it's easier to copy over a reasonable set of mut freq parameters from somewhere.
    else:
        self.parameter_dir = self.args.parameter_dir + '/' + self.args.parameter_type

    self.index_keys = {}  # this is kind of hackey, but I suspect indexing my huge table of freqs with a tuple is better than a dict
    self.mute_models = {}
    # self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end
    for region in utils.regions:
        self.mute_models[region] = {}
        for model in ['gtr', 'gamma']:
            self.mute_models[region][model] = {}

    self.allele_prevalence_freqs = glutils.read_allele_prevalence_freqs(args.allele_prevalence_fname) if args.allele_prevalence_fname is not None else {}
    self.version_freq_table = self.read_vdj_version_freqs()  # list of the probabilities with which each VDJ combo (plus other rearrangement parameters) appears in data (none if rearranging from scratch)
    self.insertion_content_probs = self.read_insertion_content()  # dummy/uniform if rearranging from scratch
    self.all_mute_freqs = {}

    # read shm info NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
    with opener('r')(self.args.gtrfname) as gtrfile:  # read gtr parameters
        reader = csv.DictReader(gtrfile)
        for line in reader:
            parameters = line['parameter'].split('.')
            region = parameters[0][3].lower()
            assert region == 'v' or region == 'd' or region == 'j'
            model = parameters[1].lower()
            parameter_name = parameters[2]
            assert model in self.mute_models[region]
            self.mute_models[region][model][parameter_name] = line['value']

    treegen = treegenerator.TreeGenerator(args, self.parameter_dir, seed=seed)
    self.treefname = self.workdir + '/trees.tre'
    treegen.generate_trees(seed, self.treefname)
    with opener('r')(self.treefname) as treefile:  # read in the trees (and other info) that we just generated
        self.treeinfo = treefile.readlines()
    os.remove(self.treefname)

    if os.path.exists(self.outfname):
        os.remove(self.outfname)
    elif not os.path.exists(os.path.dirname(os.path.abspath(self.outfname))):
        os.makedirs(os.path.dirname(os.path.abspath(self.outfname)))
def __init__(self, seqfname, joinfnames, datadir):  # <seqfname>: input to joinsolver, <joinfname> output from joinsolver (I only need both because they don't seem to put the full query seq in the output)
    self.debug = 0
    self.n_max_queries = -1
    self.queries = []

    self.germline_seqs = utils.read_germline_set(datadir, remove_N_nukes=False)['seqs']
    assert os.path.exists(os.getenv('www'))
    self.perfplotter = PerformancePlotter(self.germline_seqs, os.getenv('www') + '/partis/joinsolver_performance', 'js')

    # get info that was passed to joinsolver
    self.seqinfo = {}
    with opener('r')(seqfname) as seqfile:
        reader = csv.DictReader(seqfile)
        iline = 0
        for line in reader:
            if len(self.queries) > 0 and line['unique_id'] not in self.queries:
                continue
            self.seqinfo[line['unique_id']] = line
            iline += 1
            if self.n_max_queries > 0 and iline >= self.n_max_queries:
                break

    self.n_failed, self.n_total = 0, 0
    for joinfname in joinfnames:
        self.parse_file(joinfname)

    self.perfplotter.plot()
    print 'failed: %d / %d = %f' % (self.n_failed, self.n_total, float(self.n_failed) / self.n_total)
def readfile(self, fname):
    if os.stat(fname).st_size == 0:
        raise Exception('partition file %s has size zero' % fname)
    with opener('r')(fname) as infile:
        reader = csv.DictReader(infile)
        lines = [line for line in reader]
        self.readlines(lines)
def write_event(self, outfile, total_length_from_right=0, irandom=None):
    """
    Write out all info to csv file.
    NOTE/RANT so, in calculating each sequence's unique id, we need to hash more than the information about the rearrangement event and mutation, because if we create identical events and sequences in independent recombinator threads, we *need* them to have different unique ids (otherwise all hell will break loose when you try to analyze them). The easy way to avoid this is to add a random number to the information before you hash it... but then you have no way to reproduce that random number when you want to run again with a set random seed to get identical output. The FIX for this at the moment is to pass in <irandom>, i.e. the calling proc tells write_event() that we're writing the <irandom>th event that that calling event is working on. Which effectively means we (drastically) reduce the period of our random number generator for hashing in exchange for reproducibility. Should be ok...
    """
    columns = ('unique_id', 'reco_id') + utils.index_columns + ('seq', )
    mode = ''
    if os.path.isfile(outfile):
        mode = 'ab'
    else:
        mode = 'wb'
    with opener(mode)(outfile) as csvfile:
        writer = csv.DictWriter(csvfile, columns)
        if mode == 'wb':  # write the header if file wasn't there before
            writer.writeheader()
        # fill the row with values
        row = {}
        # first the stuff that's common to the whole recombination event
        row['cdr3_length'] = self.cdr3_length
        for region in utils.regions:
            row[region + '_gene'] = self.genes[region]
        for boundary in utils.boundaries:
            row[boundary + '_insertion'] = self.insertions[boundary]
        for erosion in utils.real_erosions:
            row[erosion + '_del'] = self.erosions[erosion]
        for erosion in utils.effective_erosions:
            row[erosion + '_del'] = self.effective_erosions[erosion]
        # hash the information that uniquely identifies each recombination event
        reco_id = ''
        for column in row:
            assert 'unique_id' not in row
            assert 'seq' not in row
            reco_id += str(row[column])
        row['reco_id'] = hash(reco_id)
        assert 'fv_insertion' not in row  # well, in principle it's ok if they're there, but in that case I'll need to at least think about updating some things
        assert 'jf_insertion' not in row
        row['fv_insertion'] = ''
        row['jf_insertion'] = ''
        # then the stuff that's particular to each mutant/clone
        for imute in range(len(self.final_seqs)):
            row['seq'] = self.final_seqs[imute]
            if total_length_from_right > 0:
                row['seq'] = row['seq'][len(row['seq']) - total_length_from_right : ]
            unique_id = ''  # Hash to uniquely identify the sequence.
            for column in row:
                unique_id += str(row[column])
            if irandom is None:  # NOTE see note above
                unique_id += str(numpy.random.uniform())
            else:
                # print 'ievt', irandom
                unique_id += str(irandom)
            row['unique_id'] = hash(unique_id)
            # print row['unique_id'], unique_id
            writer.writerow(row)
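def example_write_events(events, outfname):
    # Hedged sketch (not original code) of the calling pattern the write_event() docstring above
    # describes: the caller enumerates the events it's responsible for and passes the index as
    # <irandom>, so the unique_id hash input is deterministic for a fixed random seed while still
    # differing between otherwise-identical events. <events> is a hypothetical list of event objects.
    for ievent in range(len(events)):
        events[ievent].write_event(outfname, irandom=ievent)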
def read_insertion_info(self, this_gene, approved_genes=None):
    if approved_genes == None:  # if we aren't explicitly passed a list of genes to use, we just use the gene for which we're actually writing the hmm
        approved_genes = [this_gene, ]

    genes_used = set()
    for insertion in self.insertions:
        self.insertion_probs[insertion] = {}
        deps = utils.column_dependencies[insertion + '_insertion']
        with opener('r')(self.indir + '/' + utils.get_parameter_fname(column=insertion + '_insertion', deps=deps)) as infile:
            reader = csv.DictReader(infile)
            for line in reader:
                # first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
                if self.region + '_gene' in line and line[self.region + '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                    continue

                # then add in this insertion's counts
                n_inserted = 0
                n_inserted = int(line[insertion + '_insertion'])
                if n_inserted not in self.insertion_probs[insertion]:
                    self.insertion_probs[insertion][n_inserted] = 0.0
                self.insertion_probs[insertion][n_inserted] += float(line['count'])

                if self.region + '_gene' in line:
                    genes_used.add(line[self.region + '_gene'])

        assert len(self.insertion_probs[insertion]) > 0

        # print ' interpolate insertions'
        interpolate_bins(self.insertion_probs[insertion], self.n_max_to_interpolate, bin_eps=self.eps)  #, max_bin=len(self.germline_seq))  # NOTE that we normalize *after* this

        if 0 not in self.insertion_probs[insertion] or len(self.insertion_probs[insertion]) < 2:  # all hell breaks loose lower down if we haven't got shit in the way of information
            if self.args.debug:
                print '    WARNING adding pseudocount to 1-bin in insertion probs'
            self.insertion_probs[insertion][0] = 1
            self.insertion_probs[insertion][1] = 1
            if self.args.debug:
                print '      ', self.insertion_probs[insertion]

        assert 0 in self.insertion_probs[insertion] and len(self.insertion_probs[insertion]) >= 2  # all hell breaks loose lower down if we haven't got shit in the way of information

        # and finally, normalize
        total = 0.0
        for _, val in self.insertion_probs[insertion].iteritems():
            total += val
        test_total = 0.0
        for n_inserted in self.insertion_probs[insertion]:
            self.insertion_probs[insertion][n_inserted] /= total
            test_total += self.insertion_probs[insertion][n_inserted]
        assert utils.is_normed(test_total)

        if 0 not in self.insertion_probs[insertion] or self.insertion_probs[insertion][0] == 1.0:
            print 'ERROR cannot have all or none of the probability mass in the zero bin:', self.insertion_probs[insertion]
            assert False

        # self.insertion_content_probs = {}
        self.read_insertion_content(insertion)  # also read the base content of the insertions

    if len(genes_used) > 1:  # if length is 1, we will have just used the actual gene
        if self.args.debug:
            print '    insertions used:', ' '.join(genes_used)
def read_file_info(self, infname, n_paths):
    paths = [None for _ in range(n_paths)]
    lines_list = [[] for _ in range(n_paths)]
    with opener('r')(infname) as csvfile:
        reader = csv.DictReader(csvfile)
        for line in reader:
            if line['partition'] == '':
                print '    %s null partition (one of the processes probably got passed zero sequences)' % utils.color('red', 'warning')
                return paths
            path_index = int(line['path_index']) if 'path_index' in line else 0
            initial_path_index = int(line['initial_path_index']) if 'initial_path_index' in line else 0
            if paths[path_index] is None:  # is this the first line for this path?
                paths[path_index] = ClusterPath(initial_path_index, seed_unique_id=self.seed_unique_id)  # NOTE I may have screwed up the initial_path_index/path_index distinction here... it's been too long since I wrote the smc stuff and I'm not sure
            else:
                assert paths[path_index].initial_path_index == initial_path_index
            lines_list[path_index].append(line)

    if paths.count(None) > 0:
        raise Exception('couldn\'t find the required number of paths in file %s' % infname)

    for path_index in range(n_paths):
        paths[path_index].readlines(lines_list[path_index])

    for cp in paths:
        if cp is None:
            raise Exception('None type path read from %s' % infname)
        for ptn in cp.partitions:
            if len(ptn) == 0:
                raise Exception('zero length partition read from %s' % infname)

    return paths
def read_insertion_content(self):
    self.insertion_content_probs = {}
    for bound in utils.boundaries:
        self.insertion_content_probs[bound] = {}
        if self.args.insertion_base_content:
            with opener('r')(self.args.parameter_dir + '/' + bound + '_insertion_content.csv') as icfile:
                reader = csv.DictReader(icfile)
                total = 0
                for line in reader:
                    self.insertion_content_probs[bound][line[bound + '_insertion_content']] = int(line['count'])
                    total += int(line['count'])
                for nuke in utils.nukes:
                    if nuke not in self.insertion_content_probs[bound]:
                        print '    %s not in insertion content probs, adding with zero' % nuke
                        self.insertion_content_probs[bound][nuke] = 0
                    self.insertion_content_probs[bound][nuke] /= float(total)
        else:
            self.insertion_content_probs[bound] = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}

        assert utils.is_normed(self.insertion_content_probs[bound])
def read_vdj_version_freqs(self, fname):
    """ Read the frequencies at which various VDJ combinations appeared in data """
    with opener('r')(fname) as infile:
        in_data = csv.DictReader(infile)
        total = 0.0
        for line in in_data:  # NOTE do *not* assume the file is sorted
            # if int(line['cdr3_length']) == -1:
            #     continue  # couldn't find conserved codons when we were inferring things
            if self.args.only_genes != None:  # are we restricting ourselves to a subset of genes?
                if line['v_gene'] not in self.args.only_genes:
                    continue  # oops, don't change this to a loop, 'cause you won't continue out of the right thing then
                if line['d_gene'] not in self.args.only_genes:
                    continue
                if line['j_gene'] not in self.args.only_genes:
                    continue
            total += float(line['count'])
            index = tuple(line[column] for column in utils.index_columns)
            assert index not in self.version_freq_table
            self.version_freq_table[index] = float(line['count'])

    if len(self.version_freq_table) == 0:
        print 'ERROR didn\'t find any matching gene combinations'
        assert False

    # then normalize
    test_total = 0.0
    for index in self.version_freq_table:
        self.version_freq_table[index] /= total
        test_total += self.version_freq_table[index]
    assert utils.is_normed(test_total, this_eps=1e-8)
    assert len(self.version_freq_table) < 1e8  # if it gets *too* large, choose_vdj_combo() below isn't going to work because of numerical underflow. Note there's nothing special about 1e8, it's just that I'm pretty sure we're fine *up* to that point, and once we get beyond it we should think about doing things differently
def read_file_info(self, infname, n_paths, calc_adj_mi):
    paths = [None for _ in range(n_paths)]
    with opener('r')(infname) as csvfile:
        reader = csv.DictReader(csvfile)
        for line in reader:
            if line['partition'] == '':
                raise Exception('ERROR null partition (one of the processes probably got passed zero sequences)')  # shouldn't happen any more FLW
            uids = []
            for cluster in line['partition'].split(';'):
                uids.append([unique_id for unique_id in cluster.split(':')])
            path_index = int(line['path_index'])
            if paths[path_index] is None:
                paths[path_index] = ClusterPath(int(line['initial_path_index']))
            else:
                assert paths[path_index].initial_path_index == int(line['initial_path_index'])
            n_procs = int(line['n_procs']) if 'n_procs' in line else 1
            logweight = float(line['logweight']) if 'logweight' in line else None
            adj_mi = -1
            if calc_adj_mi:
                adj_mi = utils.mutual_information(uids, self.reco_info, debug=False) if self.reco_info is not None else -1
            paths[path_index].add_partition(uids, float(line['logprob']), n_procs=n_procs, logweight=logweight, adj_mi=adj_mi)

    for cp in paths:
        if cp is None:
            raise Exception('None type path read from %s' % infname)
        for ptn in cp.partitions:
            if len(ptn) == 0:
                raise Exception('zero length partition read from %s' % infname)

    return paths
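def example_partition_string_parsing():
    # Tiny illustration (not original code) of the partition-string format parsed in
    # read_file_info() above: clusters are separated by ';' and sequence ids within a
    # cluster by ':'.
    line = {'partition': 'a:b;c'}
    uids = [[unique_id for unique_id in cluster.split(':')] for cluster in line['partition'].split(';')]
    assert uids == [['a', 'b'], ['c']]
    return uids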
def read_vdj_version_freqs(self):
    """ Read the frequencies at which various VDJ combinations appeared in data """
    if self.args.rearrange_from_scratch:
        return None

    version_freq_table = {}
    fname = self.parameter_dir + '/' + utils.get_parameter_fname('all')  # assign this here so the error message below can refer to it
    with opener('r')(fname) as infile:
        in_data = csv.DictReader(infile)
        total = 0.0
        for line in in_data:  # NOTE do *not* assume the file is sorted
            skip = False
            for region in utils.regions:
                if line[region + '_gene'] not in self.glfo['seqs'][region]:
                    skip = True
                    break
            if skip:
                continue
            total += float(line['count'])
            index = self.freqtable_index(line)
            assert index not in version_freq_table
            version_freq_table[index] = float(line['count'])

    if len(version_freq_table) == 0:
        raise Exception('didn\'t find any gene combinations in %s' % fname)

    # then normalize
    test_total = 0.0
    for index in version_freq_table:
        version_freq_table[index] /= total
        test_total += version_freq_table[index]
    assert utils.is_normed(test_total, this_eps=1e-8)
    assert len(version_freq_table) < 1e8  # if it gets *too* large, choose_vdj_combo() below isn't going to work because of numerical underflow. Note there's nothing special about 1e8, it's just that I'm pretty sure we're fine *up* to that point, and once we get beyond it we should think about doing things differently

    return version_freq_table
def write(self, base_outdir, mean_freq_outfname):
    if not self.finalized:
        self.finalize()

    outdir = base_outdir + '/mute-freqs'
    utils.prep_dir(outdir, '*.csv')

    for gene in self.counts:
        counts, freqs, plotting_info = self.counts[gene], self.freqs[gene], self.plotting_info[gene]
        sorted_positions = sorted(counts)
        outfname = outdir + '/' + utils.sanitize_name(gene) + '.csv'
        with opener('w')(outfname) as outfile:
            nuke_header = []
            for nuke in utils.nukes:
                nuke_header.append(nuke)
                nuke_header.append(nuke + '_lo_err')
                nuke_header.append(nuke + '_hi_err')
            writer = csv.DictWriter(outfile, ('position', 'mute_freq', 'lo_err', 'hi_err') + tuple(nuke_header))
            writer.writeheader()
            for position in sorted_positions:
                row = {'position': position, 'mute_freq': counts[position]['freq'], 'lo_err': counts[position]['freq_lo_err'], 'hi_err': counts[position]['freq_hi_err']}
                for nuke in utils.nukes:
                    row[nuke] = freqs[position][nuke]
                    row[nuke + '_lo_err'] = freqs[position][nuke + '_lo_err']
                    row[nuke + '_hi_err'] = freqs[position][nuke + '_hi_err']
                writer.writerow(row)

    assert 'REGION' in mean_freq_outfname
    self.mean_rates['all'].write(mean_freq_outfname.replace('REGION', 'all'))  # hackey hackey hackey replacement... *sigh*
    for region in utils.regions:
        self.mean_rates[region].write(mean_freq_outfname.replace('REGION', region))
def __init__(self, inputdir, human, naivety, imax=-1):
    self.human = human
    self.naivety = naivety
    self.freqs = {}
    infname = inputdir + '/' + self.human + '/' + self.naivety + '/mute-counts.csv.bz2'
    print '  opening', infname
    with opener('r')(infname) as infile:
        reader = csv.DictReader(infile)
        il = 0
        for line in reader:
            il += 1
            assert line['subject'] == self.human
            gene_name = line['reference']
            if gene_name not in self.freqs:
                self.freqs[gene_name] = {}
            assert utils.maturity_to_naivety(line['subset']) == self.naivety
            position = int(line['position'])
            assert position not in self.freqs[gene_name]
            self.freqs[gene_name][position] = {}
            self.freqs[gene_name][position]['ref'] = line['ref_base']
            self.freqs[gene_name][position]['n_reads'] = int(line['n_reads'])
            # assert line['N'] == ''
            for nuke in utils.nukes:
                self.freqs[gene_name][position][nuke] = float(line[nuke]) / int(line['n_reads'])
            if imax > 0 and il > imax:
                break
def read_insertion_content(self, insertion):
    icontentprobs = {}  # NOTE this is only the probs for <insertion>, even though name is the same as in the previous function
    if insertion in utils.boundaries:  # i.e. if it's a real insertion
        with opener('r')(self.indir + '/' + insertion + '_insertion_content.csv') as icfile:
            reader = csv.DictReader(icfile)
            total = 0
            for line in reader:
                icontentprobs[line[insertion + '_insertion_content']] = int(line['count'])
                total += int(line['count'])

            if total == 0. and self.debug:
                print '\n    WARNING zero insertion content probs read from %s, so setting to uniform distribution' % self.indir + '/' + insertion + '_insertion_content.csv'
            for nuke in utils.nukes:
                if total == 0.:
                    icontentprobs[nuke] = 1. / len(utils.nukes)
                else:
                    if nuke not in icontentprobs:
                        print '    %s not in insertion content probs, adding with zero' % nuke
                        icontentprobs[nuke] = 0
                    icontentprobs[nuke] /= float(total)
    else:  # just return uniform probs for effective (fv and jf) insertions
        icontentprobs = {n: 0.25 for n in utils.nukes}

    assert utils.is_normed(icontentprobs)

    return icontentprobs
def read_erosion_info(self, this_gene, approved_genes=None):
    # NOTE that d erosion lengths depend on each other... but I don't think that's modellable with an hmm. At least for the moment we integrate over the other erosion
    if approved_genes is None:
        approved_genes = [this_gene, ]
    eprobs = {}
    genes_used = set()
    for erosion in utils.real_erosions + utils.effective_erosions:
        if erosion[0] != self.region:
            continue
        eprobs[erosion] = {}
        if this_gene == glutils.dummy_d_genes[self.args.chain]:
            eprobs[erosion][0] = 1.  # always erode zero bases
            continue
        deps = utils.column_dependencies[erosion + '_del']
        with opener('r')(self.indir + '/' + utils.get_parameter_fname(column=erosion + '_del', deps=deps)) as infile:
            reader = csv.DictReader(infile)
            for line in reader:
                # first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
                if self.region + '_gene' in line and line[self.region + '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                    continue

                # then skip nonsense erosions that're too long for this gene, but were ok for another
                if int(line[erosion + '_del']) >= len(self.germline_seq):
                    continue

                # then add in this erosion's counts
                n_eroded = int(line[erosion + '_del'])
                if n_eroded not in eprobs[erosion]:
                    eprobs[erosion][n_eroded] = 0.0
                eprobs[erosion][n_eroded] += float(line['count'])

                if self.region + '_gene' in line:
                    genes_used.add(line[self.region + '_gene'])

        if len(eprobs[erosion]) == 0:
            raise Exception('didn\'t read any %s erosion probs from %s' % (erosion, self.indir + '/' + utils.get_parameter_fname(column=erosion + '_del', deps=deps)))

        # do some smoothingy things NOTE that we normalize *after* interpolating
        if erosion in utils.real_erosions:  # for real erosions, don't interpolate if we have lots of information about neighboring bins (i.e. we're pretty confident this bin should actually be zero)
            n_max = self.n_max_to_interpolate
        else:  # for fake erosions, always interpolate
            n_max = -1
        # print ' interpolate erosions'
        interpolate_bins(eprobs[erosion], n_max, bin_eps=self.eps, max_bin=len(self.germline_seq))
        self.add_pseudocounts(eprobs[erosion])

        # and finally, normalize
        total = 0.0
        for _, val in eprobs[erosion].iteritems():
            total += val
        test_total = 0.0
        for n_eroded in eprobs[erosion]:
            eprobs[erosion][n_eroded] /= total
            test_total += eprobs[erosion][n_eroded]
        assert utils.is_normed(test_total)

    if len(genes_used) > 1 and self.debug:  # if length is 1, we will have just used the actual gene
        print '    used erosion info from:', ' '.join(genes_used)

    return eprobs
def file_init(self, fname):
    self.errors, self.sum_weights_squared = [], []  # kill the unused one after reading file
    with opener('r')(fname) as infile:
        reader = csv.DictReader(infile)
        for line in reader:
            self.low_edges.append(float(line['bin_low_edge']))
            self.bin_contents.append(float(line['contents']))
            if 'sum-weights-squared' in line:
                self.sum_weights_squared.append(float(line['sum-weights-squared']))
            if 'error' in line or 'binerror' in line:  # in theory I should go find all the code that writes these files and make 'em use the same header for this
                assert 'sum-weights-squared' not in line
                tmp_error = float(line['error']) if 'error' in line else float(line['binerror'])
                self.errors.append(tmp_error)
            if 'binlabel' in line:
                self.bin_labels.append(line['binlabel'])
            else:
                self.bin_labels.append('')
            if 'xtitle' in line:  # should be the same for every line in the file... but this avoids complicating the file format
                self.xtitle = line['xtitle']

    self.n_bins = len(self.low_edges) - 2  # file should have a line for the under- and overflow bins
    self.xmin, self.xmax = self.low_edges[1], self.low_edges[-1]  # *upper* bound of underflow, *lower* bound of overflow
    assert sorted(self.low_edges) == self.low_edges
    assert len(self.bin_contents) == len(self.low_edges)
    assert len(self.low_edges) == len(self.bin_labels)
    if len(self.errors) == 0:  # (re)set to None if the file didn't have errors listed
        self.errors = None
        assert len(self.sum_weights_squared) == len(self.low_edges)
    if len(self.sum_weights_squared) == 0:
        self.sum_weights_squared = None
        assert len(self.errors) == len(self.low_edges)
def write_vdjalign_input(self, base_infname, n_procs):
    n_remaining = len(self.remaining_queries)
    queries_per_proc = float(n_remaining) / n_procs
    n_queries_per_proc = int(math.ceil(queries_per_proc))
    written_queries = set()  # make sure we actually write each query TODO remove this when you work out where they're disappearing to
    if n_procs == 1:  # double check for rounding problems or whatnot
        assert n_queries_per_proc == n_remaining
    for iproc in range(n_procs):
        workdir = self.subworkdir(iproc, n_procs)
        if n_procs > 1:
            utils.prep_dir(workdir)
        with opener('w')(workdir + '/' + base_infname) as sub_infile:
            iquery = 0
            for query_name in self.remaining_queries:  # NOTE it's wasteful to loop over all the remaining queries for each process... but maybe not that wasteful
                if iquery >= n_remaining:
                    break
                if iquery < iproc*n_queries_per_proc or iquery >= (iproc + 1)*n_queries_per_proc:  # not for this process
                    iquery += 1
                    continue
                sub_infile.write('>' + query_name + ' NUKES\n')

                seq = self.input_info[query_name]['seq']
                if query_name in self.info['indels']:
                    seq = self.info['indels'][query_name]['reversed_seq']  # use the query sequence with shm insertions and deletions reversed
                sub_infile.write(seq + '\n')
                written_queries.add(query_name)
                iquery += 1
    not_written = self.remaining_queries - written_queries
    if len(not_written) > 0:
        raise Exception('didn\'t write %s to %s' % (':'.join(not_written), self.args.workdir))
def write_mute_freqs(self, region, gene_or_insert_name, seq, reco_event, reco_seq_fname, is_insertion=False):
    """ Read position-by-position mute freqs from disk for <gene_or_insert_name>, renormalize, then write to a file for bppseqgen. """
    mute_freqs = self.get_mute_freqs(gene_or_insert_name)

    rates = []  # list with a relative mutation rate for each position in <seq>
    total = 0.0
    # assert len(mute_freqs) == len(seq)  # only equal length if no erosions NO oh right but mute_freqs only covers areas we could align to...
    left_erosion_length = dict(reco_event.erosions.items() + reco_event.effective_erosions.items())[region + '_5p']
    for inuke in range(len(seq)):  # append a freq for each nuke
        position = inuke + left_erosion_length
        freq = 0.0
        if position in mute_freqs:
            freq = mute_freqs[position]
        else:
            freq = mute_freqs['overall_mean']
        rates.append(freq)
        total += freq

    # normalize to the number of sites (i.e. so an average site is given value 1.0)
    assert total != 0.0  # I am not hip enough to divide by zero
    for inuke in range(len(seq)):
        rates[inuke] *= float(len(seq)) / total
    total = 0.0

    # and... double check it, just for shits and giggles
    for inuke in range(len(seq)):
        total += rates[inuke]
    assert utils.is_normed(total / float(len(seq)))
    assert len(rates) == len(seq)  # you just can't be too careful. what if gremlins ate a few while python wasn't looking?

    # write the input file for bppseqgen, one base per line
    with opener('w')(reco_seq_fname) as reco_seq_file:
        reco_seq_file.write('state\trate\n')
        for inuke in range(len(seq)):
            reco_seq_file.write('%s\t%.15f\n' % (seq[inuke], rates[inuke]))
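def example_rate_renormalization():
    # Toy example (not original code) of the renormalization step in write_mute_freqs() above:
    # per-position rates are scaled so that their mean over the sequence is 1.0 (values made up).
    rates = [0.01, 0.03, 0.02]  # raw per-position mute freqs
    scale = float(len(rates)) / sum(rates)  # 3 / 0.06 = 50.0
    rates = [r * scale for r in rates]  # -> [0.5, 1.5, 1.0], i.e. mean 1.0
    assert abs(sum(rates) / len(rates) - 1.0) < 1e-9
    return rates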
def write(self, outdir, mean_freq_outfname):
    if not self.finalized:
        self.finalize()

    for gene in self.counts:
        gcounts, freqs = self.counts[gene], self.freqs[gene]
        outfname = outdir + '/' + utils.sanitize_name(gene) + '.csv'
        with opener('w')(outfname) as outfile:
            nuke_header = [n + xtra for n in utils.nukes for xtra in ('', '_obs', '_lo_err', '_hi_err')]
            writer = csv.DictWriter(outfile, ('position', 'mute_freq', 'lo_err', 'hi_err') + tuple(nuke_header))
            writer.writeheader()
            for position in sorted(gcounts.keys()):
                row = {'position': position, 'mute_freq': freqs[position]['freq'], 'lo_err': freqs[position]['freq_lo_err'], 'hi_err': freqs[position]['freq_hi_err']}
                for nuke in utils.nukes:
                    row[nuke] = freqs[position][nuke]
                    row[nuke + '_obs'] = gcounts[position][nuke]
                    row[nuke + '_lo_err'] = freqs[position][nuke + '_lo_err']
                    row[nuke + '_hi_err'] = freqs[position][nuke + '_hi_err']
                writer.writerow(row)

    assert 'REGION' in mean_freq_outfname
    self.mean_rates['all'].write(mean_freq_outfname.replace('REGION', 'all'))  # hackey hackey hackey replacement... *sigh*
    for region in utils.regions:
        self.mean_rates[region].write(mean_freq_outfname.replace('REGION', region))
def write_hmm_input(self, csv_fname, sw_info, parameter_dir, preclusters=None, hmm_type='', pair_hmm=False, stripped=False):
    print '    writing input'
    csvfile = opener('w')(csv_fname)
    start = time.time()

    # write header
    header = ['names', 'k_v_min', 'k_v_max', 'k_d_min', 'k_d_max', 'only_genes', 'seqs']  # I wish I had a good c++ csv reader
    csvfile.write(' '.join(header) + '\n')

    skipped_gene_matches = set()
    assert hmm_type != ''
    if hmm_type == 'k=1':  # single vanilla hmm
        nsets = [[qn] for qn in self.input_info.keys()]
    elif hmm_type == 'k=2':  # pair hmm
        nsets = self.get_pairs(preclusters)
    elif hmm_type == 'k=preclusters':  # run the k-hmm on each cluster in <preclusters>
        assert preclusters != None
        nsets = [val for key, val in preclusters.id_clusters.items() if len(val) > 1]  # <nsets> is a list of sets (well, lists) of query names
        # nsets = []
        # for cluster in preclusters.id_clusters.values():
        #     nsets += itertools.combinations(cluster, 5)
    elif hmm_type == 'k=nsets':  # run on *every* combination of queries which has length <self.args.n_sets>
        if self.args.all_combinations:
            nsets = itertools.combinations(self.input_info.keys(), self.args.n_sets)
        else:  # put the first n together, and the second group of n (note that self.input_info is an OrderedDict)
            nsets = []
            keylist = self.input_info.keys()
            this_set = []
            for iquery in range(len(keylist)):
                if iquery % self.args.n_sets == 0:  # every nth query, start a new group
                    if len(this_set) > 0:
                        nsets.append(this_set)
                    this_set = []
                this_set.append(keylist[iquery])
            if len(this_set) > 0:
                nsets.append(this_set)
    else:
        assert False

    for query_names in nsets:
        non_failed_names = self.remove_sw_failures(query_names, sw_info)
        if len(non_failed_names) == 0:
            continue
        combined_query = self.combine_queries(sw_info, non_failed_names, parameter_dir, stripped=stripped, skipped_gene_matches=skipped_gene_matches)
        if len(combined_query) == 0:  # didn't find all regions
            continue
        csvfile.write('%s %d %d %d %d %s %s\n' %  # NOTE csv.DictWriter can handle tsvs, so this should really be switched to use that
                      (':'.join([str(qn) for qn in non_failed_names]),
                       combined_query['k_v']['min'], combined_query['k_v']['max'],
                       combined_query['k_d']['min'], combined_query['k_d']['max'],
                       ':'.join(combined_query['only_genes']),
                       ':'.join(combined_query['seqs'])))

    if len(skipped_gene_matches) > 0:
        print '    not found in %s, i.e. were never the best sw match for any query, so removing from consideration for hmm:' % (parameter_dir)
        for region in utils.regions:
            print '      %s: %s' % (region, ' '.join([utils.color_gene(gene) for gene in skipped_gene_matches if utils.get_region(gene) == region]))

    csvfile.close()
    print '        input write time: %.3f' % (time.time() - start)
def check_tree_lengths(self, treefname, ages):
    treestrs = []
    with opener('r')(treefname) as treefile:
        for line in treefile:
            treestrs.append(line.split(';')[0] + ';')  # ignore the info I added after the ';'
    if self.args.debug > 1:
        print '  checking branch lengths... '
    assert len(treestrs) == len(ages)
    total_length, total_leaves = 0.0, 0
    for itree in range(len(ages)):
        if self.args.debug > 1:
            print '    asked for', ages[itree],
        for name, depth in get_leaf_node_depths(treestrs[itree]).items():
            if self.args.debug > 1:
                print '%s:%f' % (name, depth),
            if not utils.is_normed(depth / ages[itree], this_eps=1e-6):
                raise Exception('asked for branch length %f but got %f\n   %s' % (ages[itree], depth, treestrs[itree]))  # ratio of <age> (requested length) and <length> (length in the tree file) should be 1 within float precision
        total_length += ages[itree]
        total_leaves += len(re.findall('t', treestrs[itree]))
        if self.args.debug > 1:
            print ''
    if self.args.debug:
        print '    mean branch length %.5f' % (total_length / len(ages))
        print '    mean n leaves %.2f' % (float(total_leaves) / len(ages))
def read_insertion_content(self, insertion):
    self.insertion_content_probs[insertion] = {}
    if insertion in utils.boundaries:  # i.e. if it's a real insertion (effective fv and jf insertions just get uniform probs below)
        with opener('r')(self.indir + '/' + insertion + '_insertion_content.csv') as icfile:
            reader = csv.DictReader(icfile)
            total = 0
            for line in reader:
                self.insertion_content_probs[insertion][line[insertion + '_insertion_content']] = int(line['count'])
                total += int(line['count'])

            if total == 0.:
                print '\n    WARNING zero insertion content probs read from %s, so setting to uniform distribution' % self.indir + '/' + insertion + '_insertion_content.csv'
            for nuke in utils.nukes:
                if total == 0.:
                    self.insertion_content_probs[insertion][nuke] = 1. / len(utils.nukes)
                else:
                    if nuke not in self.insertion_content_probs[insertion]:
                        print '    %s not in insertion content probs, adding with zero' % nuke
                        self.insertion_content_probs[insertion][nuke] = 0
                    self.insertion_content_probs[insertion][nuke] /= float(total)
    else:  # just return uniform probs for fv and jf insertions
        self.insertion_content_probs[insertion] = {n: 0.25 for n in utils.nukes}

    assert utils.is_normed(self.insertion_content_probs[insertion])
    if self.args.debug:
        print '  insertion content for', insertion, self.insertion_content_probs[insertion]
def read_insertion_content(self, insertion):
    self.insertion_content_probs[insertion] = {}
    if self.args.insertion_base_content:
        with opener('r')(self.indir + '/' + insertion + '_insertion_content.csv') as icfile:
            reader = csv.DictReader(icfile)
            total = 0
            for line in reader:
                self.insertion_content_probs[insertion][line[insertion + '_insertion_content']] = int(line['count'])
                total += int(line['count'])
            for nuke in utils.nukes:
                if nuke not in self.insertion_content_probs[insertion]:
                    print '    %s not in insertion content probs, adding with zero' % nuke
                    self.insertion_content_probs[insertion][nuke] = 0
                self.insertion_content_probs[insertion][nuke] /= float(total)
    else:
        self.insertion_content_probs[insertion] = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}

    assert utils.is_normed(self.insertion_content_probs[insertion])
    if self.args.debug:
        print '  insertion content for', insertion, self.insertion_content_probs[insertion]
def __init__(self, seqfname, joinfnames, datadir):  # <seqfname>: input to joinsolver, <joinfname> output from joinsolver (I only need both because they don't seem to put the full query seq in the output)
    self.debug = 0
    self.n_max_queries = -1
    self.queries = []

    self.germline_seqs = utils.read_germlines(datadir, remove_N_nukes=False)
    assert os.path.exists(os.getenv('www'))
    self.perfplotter = PerformancePlotter(self.germline_seqs, os.getenv('www') + '/partis/joinsolver_performance', 'js')

    # get info that was passed to joinsolver
    self.seqinfo = {}
    with opener('r')(seqfname) as seqfile:
        reader = csv.DictReader(seqfile)
        iline = 0
        for line in reader:
            if len(self.queries) > 0 and line['unique_id'] not in self.queries:
                continue
            self.seqinfo[line['unique_id']] = line
            iline += 1
            if self.n_max_queries > 0 and iline >= self.n_max_queries:
                break

    self.n_failed, self.n_total = 0, 0
    for joinfname in joinfnames:
        self.parse_file(joinfname)

    self.perfplotter.plot()
    print 'failed: %d / %d = %f' % (self.n_failed, self.n_total, float(self.n_failed) / self.n_total)
def read_file_info(self, infname, n_paths):
    paths = [None for _ in range(n_paths)]
    lines_list = [[] for _ in range(n_paths)]
    with opener('r')(infname) as csvfile:
        reader = csv.DictReader(csvfile)
        for line in reader:
            if line['partition'] == '':
                raise Exception('ERROR null partition (one of the processes probably got passed zero sequences)')  # shouldn't happen any more FLW
            path_index = int(line['path_index'])
            if paths[path_index] is None:  # is this the first line for this path?
                paths[path_index] = ClusterPath(int(line['initial_path_index']))  # NOTE I may have screwed up the initial_path_index/path_index distinction here... it's been too long since I wrote the smc stuff and I'm not sure
            else:
                assert paths[path_index].initial_path_index == int(line['initial_path_index'])
            lines_list[path_index].append(line)

    for path_index in range(n_paths):
        paths[path_index].readlines(lines_list[path_index])

    for cp in paths:
        if cp is None:
            raise Exception('None type path read from %s' % infname)
        for ptn in cp.partitions:
            if len(ptn) == 0:
                raise Exception('zero length partition read from %s' % infname)

    return paths
def cdr3_length_precluster(self, waterer, preclusters=None):
    cdr3lengthfname = self.args.workdir + '/cdr3lengths.csv'
    with opener('w')(cdr3lengthfname) as outfile:
        writer = csv.DictWriter(outfile, ('unique_id', 'second_unique_id', 'cdr3_length', 'second_cdr3_length', 'score'))
        writer.writeheader()
        for query_name, second_query_name in self.get_pairs(preclusters):
            cdr3_length = waterer.info[query_name]['cdr3_length']
            second_cdr3_length = waterer.info[second_query_name]['cdr3_length']
            same_length = cdr3_length == second_cdr3_length
            if not self.args.is_data:
                assert cdr3_length == int(self.reco_info[query_name]['cdr3_length'])
                if second_cdr3_length != int(self.reco_info[second_query_name]['cdr3_length']):
                    print 'WARNING did not infer correct cdr3 length'
                    assert False
            writer.writerow({'unique_id': query_name, 'second_unique_id': second_query_name,
                             'cdr3_length': cdr3_length, 'second_cdr3_length': second_cdr3_length,
                             'score': int(same_length)})

    clust = Clusterer(0.5, greater_than=True)  # i.e. cluster together if same_length == True
    clust.cluster(cdr3lengthfname, debug=False)
    os.remove(cdr3lengthfname)
    return clust
def readfile(self, fname):
    with opener('r')(fname) as infile:
        reader = csv.DictReader(infile)
        for line in reader:
            partition = [cl.split(':') for cl in line['clusters'].split(';')]
            logweight = float(line['logweight']) if 'logweight' in line else None
            adj_mi = float(line['adj_mi']) if 'adj_mi' in line else None
            self.add_partition(partition, float(line['logprob']), int(line['n_procs']), logweight=logweight, adj_mi=adj_mi)
def add_branch_lengths(self, treefname):
    """
    Each tree is written with branch length the mean branch length over the whole sequence,
    so we need to add the length for each region afterward, so each line looks e.g. like
    (t2:0.003751736951,t1:0.003751736951):0.001248262937;v:0.98,d:1.8,j:0.87
    """
    # first read the newick info for each tree
    with opener('r')(treefname) as treefile:
        treestrings = treefile.readlines()
    # then add the region-specific branch info
    length_list = ['%s:%f' % (region, self.branch_lengths[region]['mean'] / self.branch_lengths['all']['mean']) for region in utils.regions]
    for iline in range(len(treestrings)):
        treestrings[iline] = treestrings[iline].replace(';', ';' + ','.join(length_list))
    # and finally write out the final lines
    with opener('w')(treefname) as treefile:
        for line in treestrings:
            treefile.write(line)
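# Illustrative sketch (not part of the original code): the string transformation that
# add_branch_lengths() performs on one newick line.  The tree string, the v/d/j region list,
# and the 0.98/1.8/0.87 ratios are assumptions made up for demonstration only; in the real
# code the ratios come from branch_lengths[region]['mean'] / branch_lengths['all']['mean'].
treestring = '(t2:0.003751736951,t1:0.003751736951):0.001248262937;\n'
ratios = {'v': 0.98, 'd': 1.8, 'j': 0.87}  # hypothetical per-region mean-length ratios
length_list = ['%s:%f' % (region, ratios[region]) for region in ('v', 'd', 'j')]
annotated = treestring.replace(';', ';' + ','.join(length_list))
print (annotated.strip())
# -> (t2:0.003751736951,t1:0.003751736951):0.001248262937;v:0.980000,d:1.800000,j:0.870000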
def check_tree_simulation(self, leaf_seq_fname, chosen_tree_str, reco_event=None):
    """ See how well we can reconstruct the true tree """
    clean_up = False
    if leaf_seq_fname == '':  # we need to make the leaf seq file based on info in reco_event
        clean_up = True
        leaf_seq_fname = self.workdir + '/leaf-seqs.fa'
        with opener('w')(leaf_seq_fname) as leafseqfile:
            for iseq in range(len(reco_event.final_seqs)):
                leafseqfile.write('>t' + str(iseq + 1) + '\n')  # NOTE the *order* of the seqs doesn't correspond to the tN number. does it matter?
                leafseqfile.write(reco_event.final_seqs[iseq] + '\n')

    with opener('w')(os.devnull) as fnull:
        inferred_tree_str = check_output('FastTree -gtr -nt ' + leaf_seq_fname, shell=True, stderr=fnull)
    if clean_up:  # only remove the leaf seq file if we made it ourselves
        os.remove(leaf_seq_fname)

    chosen_tree = dendropy.Tree.get_from_string(chosen_tree_str, 'newick')
    inferred_tree = dendropy.Tree.get_from_string(inferred_tree_str, 'newick')
    if self.args.debug:
        print '  tree diff -- symmetric %d   euke %f   rf %f' % (chosen_tree.symmetric_difference(inferred_tree), chosen_tree.euclidean_distance(inferred_tree), chosen_tree.robinson_foulds_distance(inferred_tree))
def readfile(self, fname):
    if fname is None:
        raise Exception("can't read NoneType partition file")
    if os.stat(fname).st_size == 0:
        raise Exception("partition file %s has size zero" % fname)
    with opener("r")(fname) as infile:
        reader = csv.DictReader(infile)
        lines = [line for line in reader]
        self.readlines(lines)
def merge_hmm_outputs(self, outfname):
    header = None
    outfo = []
    for iproc in range(self.args.n_procs):
        workdir = self.args.workdir + '/hmm-' + str(iproc)
        with opener('r')(workdir + '/' + os.path.basename(outfname)) as sub_outfile:
            reader = csv.DictReader(sub_outfile)
            header = reader.fieldnames
            for line in reader:
                outfo.append(line)
        if not self.args.no_clean:
            os.remove(workdir + '/' + os.path.basename(outfname))
            os.rmdir(workdir)

    with opener('w')(outfname) as outfile:
        writer = csv.DictWriter(outfile, header)
        writer.writeheader()
        for line in outfo:
            writer.writerow(line)
def read_erosion_info(self, this_gene, approved_genes=None):
    # NOTE that d erosion lengths depend on each other... but I don't think that's modellable with an hmm. At least for the moment we integrate over the other erosion
    if approved_genes is None:
        approved_genes = [this_gene, ]
    genes_used = set()
    for erosion in utils.real_erosions + utils.effective_erosions:
        if erosion[0] != self.region:
            continue
        self.erosion_probs[erosion] = {}
        deps = utils.column_dependencies[erosion + '_del']
        with opener('r')(self.indir + '/' + utils.get_parameter_fname(column=erosion + '_del', deps=deps)) as infile:
            reader = csv.DictReader(infile)
            for line in reader:
                # first see if we want to use this line (if <region>_gene isn't in the line, this erosion doesn't depend on gene version)
                if self.region + '_gene' in line and line[self.region + '_gene'] not in approved_genes:  # NOTE you'll need to change this if you want it to depend on another region's genes
                    continue
                # then skip nonsense erosions that're too long for this gene, but were ok for another
                if int(line[erosion + '_del']) >= len(self.germline_seq):
                    continue
                # then add in this erosion's counts
                n_eroded = int(line[erosion + '_del'])
                if n_eroded not in self.erosion_probs[erosion]:
                    self.erosion_probs[erosion][n_eroded] = 0.0
                self.erosion_probs[erosion][n_eroded] += float(line['count'])

                if self.region + '_gene' in line:
                    genes_used.add(line[self.region + '_gene'])

        assert len(self.erosion_probs[erosion]) > 0

        # do some smoothingy things NOTE that we normalize *after* interpolating
        if erosion in utils.real_erosions:  # for real erosions, don't interpolate if we have lots of information about neighboring bins (i.e. we're pretty confident this bin should actually be zero)
            n_max = self.n_max_to_interpolate
        else:  # for fake erosions, always interpolate
            n_max = -1
        # print ' interpolate erosions'
        interpolate_bins(self.erosion_probs[erosion], n_max, bin_eps=self.eps, max_bin=len(self.germline_seq))
        self.add_pseudocounts(self.erosion_probs[erosion])

        # and finally, normalize
        total = 0.0
        for _, val in self.erosion_probs[erosion].iteritems():
            total += val
        test_total = 0.0
        for n_eroded in self.erosion_probs[erosion]:
            self.erosion_probs[erosion][n_eroded] /= total
            test_total += self.erosion_probs[erosion][n_eroded]
        assert utils.is_normed(test_total)

    if len(genes_used) > 1:  # if length is 1, we will have just used the actual gene
        if self.args.debug:
            print '    erosions used:', ' '.join(genes_used)
def make_hist_from_bin_entry_file(fname, hist_label='', log=''):
    """
    Return root histogram with each bin low edge and bin content read from <fname>,
    e.g. from the results of hist.Hist.write()
    """
    low_edges, contents, bin_labels, bin_errors, sum_weights_squared = [], [], [], [], []
    xtitle = ''
    with opener('r')(fname) as infile:
        reader = csv.DictReader(infile)
        for line in reader:
            low_edges.append(float(line['bin_low_edge']))
            contents.append(float(line['contents']))
            if 'sum-weights-squared' in line:
                sum_weights_squared.append(float(line['sum-weights-squared']))
            if 'error' in line or 'binerror' in line:
                assert 'sum-weights-squared' not in line
                tmp_error = float(line['error']) if 'error' in line else float(line['binerror'])
                bin_errors.append(tmp_error)
            if 'binlabel' in line:
                bin_labels.append(line['binlabel'])
            else:
                bin_labels.append('')
            if 'xtitle' in line:
                xtitle = line['xtitle']

    n_bins = len(low_edges) - 2  # file should have a line for the under- and overflow bins
    xbins = array('f', [0.0 for i in range(n_bins + 1)])  # NOTE has to be n bins *plus* 1
    low_edges = sorted(low_edges)
    for ib in range(n_bins + 1):
        xbins[ib] = low_edges[ib + 1]  # low_edges[1] is the lower edge of the first bin, i.e. the first bin after the underflow bin, and this will set the last entry in xbins to low_edges[n_bins+1], i.e. the lower edge of the overflow bin. Which, I bloody well think, is correct

    hist = TH1D(hist_label, '', n_bins, xbins)  # this will barf if the csv file wasn't sorted by bin low edge
    hist.GetXaxis().SetTitle(xtitle)
    for ib in range(n_bins + 2):
        hist.SetBinContent(ib, contents[ib])
        if len(sum_weights_squared) > 0:
            hist.SetBinError(ib, math.sqrt(sum_weights_squared[ib]))
        elif len(bin_errors) > 0:
            hist.SetBinError(ib, bin_errors[ib])
        else:
            hist.SetBinError(ib, math.sqrt(contents[ib]))
        if bin_labels[ib] != '':
            hist.GetXaxis().SetBinLabel(ib, bin_labels[ib])

    return hist
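# Illustrative sketch (not from the original code): the row layout this reader expects,
# i.e. one row per bin including the under- and overflow bins, which is why
# n_bins = len(low_edges) - 2 and the real bin edges are low_edges[1] .. low_edges[n_bins + 1].
# All numbers here are made up for demonstration.
example_low_edges = [-1.0, 0.0, 1.0, 2.0, 3.0]  # hypothetical bin_low_edge column: underflow, three real bins, overflow
n_bins = len(example_low_edges) - 2
print (n_bins)                                   # 3 real bins
print (example_low_edges[1:n_bins + 2])          # [0.0, 1.0, 2.0, 3.0] -- the n_bins + 1 edges passed to TH1D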
def read_mute_info(indir, this_gene, approved_genes=None):
    if approved_genes is None:
        approved_genes = [this_gene, ]
    observed_freqs = {}
    # add an observation for each position, for each gene where we observed that position
    for gene in approved_genes:
        mutefname = indir + '/mute-freqs/' + utils.sanitize_name(gene) + '.csv'
        if not os.path.exists(mutefname):
            continue
        with opener('r')(mutefname) as mutefile:
            reader = csv.DictReader(mutefile)
            for line in reader:
                pos = int(line['position'])
                freq = float(line['mute_freq'])
                lo_err = float(line['lo_err'])  # NOTE lo_err in the file is really the lower *bound*
                hi_err = float(line['hi_err'])  # same deal
                assert freq >= 0.0 and lo_err >= 0.0 and hi_err >= 0.0  # you just can't be too careful
                if freq < utils.eps or abs(1.0 - freq) < utils.eps:  # if <freq> too close to 0 or 1, replace it with the midpoint of its uncertainty band
                    freq = 0.5 * (lo_err + hi_err)
                if pos not in observed_freqs:
                    observed_freqs[pos] = []
                observed_freqs[pos].append({'freq': freq, 'err': max(abs(freq - lo_err), abs(freq - hi_err))})

    # set final mute_freqs[pos] to the (inverse error-weighted) average over all the observations for each position
    mute_freqs = {}
    overall_total, overall_sum_of_weights = 0.0, 0.0  # also calculate the mean over all positions
    for pos in observed_freqs:
        total, sum_of_weights = 0.0, 0.0
        for obs in observed_freqs[pos]:
            assert obs['err'] > 0.0
            weight = 1.0 / obs['err']
            total += weight * obs['freq']
            sum_of_weights += weight
        assert sum_of_weights > 0.0
        mean_freq = total / sum_of_weights
        mute_freqs[pos] = mean_freq
        overall_total += total
        overall_sum_of_weights += sum_of_weights

    mute_freqs['overall_mean'] = overall_total / overall_sum_of_weights
    return mute_freqs
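# Illustrative sketch (not from the original code): the inverse error-weighted average used
# above, applied to made-up observations for a single position.  The freq/err numbers are
# assumptions chosen only to make the arithmetic easy to follow.
observations = [{'freq': 0.10, 'err': 0.01},   # precise observation, weight 1/0.01 = 100
                {'freq': 0.20, 'err': 0.05}]   # less precise observation, weight 1/0.05 = 20
total = sum(obs['freq'] / obs['err'] for obs in observations)    # 10.0 + 4.0 = 14.0
sum_of_weights = sum(1.0 / obs['err'] for obs in observations)   # 100.0 + 20.0 = 120.0
print ('weighted mean freq: %.4f' % (total / sum_of_weights))    # 0.1167 -- pulled toward the more precise value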
def split_input(self, n_procs, infname=None, info=None, prefix='sub'):
    """
    If <infname> is specified, split the csv info from it into <n_procs> input files in subdirectories labelled with '<prefix>-' within <self.args.workdir>.
    If <info> is specified, instead split the list <info> into pieces and return a list of the resulting lists.
    """
    if info is None:
        assert infname is not None
        info = []
        with opener('r')(infname) as infile:
            reader = csv.DictReader(infile)
            for line in reader:
                info.append(line)
    else:
        assert infname is None  # make sure only *one* of 'em is specified

    outlists = []
    queries_per_proc = float(len(info)) / n_procs
    n_queries_per_proc = int(math.ceil(queries_per_proc))
    for iproc in range(n_procs):
        if infname is None:
            outlists.append([])
        else:
            subworkdir = self.args.workdir + '/' + prefix + '-' + str(iproc)
            utils.prep_dir(subworkdir)
            sub_outfile = opener('w')(subworkdir + '/' + os.path.basename(infname))
            writer = csv.DictWriter(sub_outfile, reader.fieldnames)
            writer.writeheader()
        for iquery in range(iproc * n_queries_per_proc, (iproc + 1) * n_queries_per_proc):
            if iquery >= len(info):
                break
            if infname is None:
                outlists[-1].append(info[iquery])
            else:
                writer.writerow(info[iquery])

    if infname is None:
        return outlists
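# Illustrative sketch (not from the original code): how the ceil-divided chunk size above
# splits a query list across processes.  The query names and n_procs are made up.
import math
info = ['q%d' % i for i in range(7)]  # seven hypothetical queries
n_procs = 3
n_queries_per_proc = int(math.ceil(float(len(info)) / n_procs))  # ceil(7/3) = 3
chunks = [info[iproc * n_queries_per_proc : (iproc + 1) * n_queries_per_proc] for iproc in range(n_procs)]
print (chunks)  # [['q0', 'q1', 'q2'], ['q3', 'q4', 'q5'], ['q6']] -- the last chunk can be shorter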
def make_mean_plots(plotdir, subdirs, outdir):
    meanlist, variancelist = [], []
    normalized_means = []
    for sd in subdirs:
        with opener('r')(plotdir + '/' + sd + '/plots/means.csv') as meanfile:
            reader = csv.DictReader(meanfile)
            for line in reader:
                means = [float(m) for m in line['means'].split(':')]
                meanlist.append(numpy.mean(means))
                variancelist.append(numpy.var(means))
                nmvals = [float(nm) for nm in line['normalized-means'].split(':')]
                normalized_means += nmvals

    import matplotlib
    matplotlib.use('Agg')
    from matplotlib import pyplot

    # ----------------------------------------------------------------------------------------
    # first make hexbin plot
    pyplot.subplot(111)
    pyplot.hexbin(meanlist, variancelist, gridsize=20, cmap=matplotlib.cm.gist_yarg, bins=None)
    # pyplot.axis([0, 5, 0, 2])
    pyplot.xlabel('mean')
    pyplot.ylabel('variance')
    cb = pyplot.colorbar()
    cb.set_label('mean value')
    utils.prep_dir(outdir + '/plots', multilings=['*.png', '*.svg', '*.csv'])
    pyplot.savefig(outdir + '/plots/hexmeans.png')
    pyplot.clf()

    # ----------------------------------------------------------------------------------------
    # then make normalized mean plot
    n, bins, patches = pyplot.hist(normalized_means, 50)
    pyplot.xlabel(r'$(x_i - \mu) / \sigma_i$')
    pyplot.title(r'$\sigma=' + str(math.sqrt(numpy.var(normalized_means))) + '$')
    # pyplot.axis([-10, 10, 0, 220])
    pyplot.savefig(outdir + '/plots/means.png')

    check_call(['./permissify-www', outdir])  # NOTE this should really permissify starting a few directories higher up
def merge_csvs(outfname, csv_list, cleanup=True):
    """ NOTE copy of merge_hmm_outputs in partitiondriver, I should really combine the two functions """
    header = None
    outfo = []
    # print 'merging'
    for infname in csv_list:
        # print '  ', infname
        workdir = os.path.dirname(infname)
        with opener('r')(infname) as sub_outfile:
            reader = csv.DictReader(sub_outfile)
            header = reader.fieldnames
            for line in reader:
                outfo.append(line)
        if cleanup:
            os.remove(infname)
            os.rmdir(workdir)

    if not os.path.exists(os.path.dirname(outfname)):
        os.makedirs(os.path.dirname(outfname))
    with opener('w')(outfname) as outfile:
        writer = csv.DictWriter(outfile, header)
        writer.writeheader()
        for line in outfo:
            writer.writerow(line)
def write(self, base_outdir):
    print '  writing parameters'
    start = time.time()

    utils.prep_dir(base_outdir, multilings=('*.csv', '*.svg'))
    mute_start = time.time()
    self.mutefreqer.write(base_outdir, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replaced by each region in the three output files
    print '    mut freq write time: %.3f' % (time.time() - mute_start)
    # print '    %d / %d cached' % (self.mutefreqer.n_cached, self.mutefreqer.n_cached + self.mutefreqer.n_not_cached)

    for column in self.counts:
        index = None
        outfname = None
        if column == 'all':
            index = utils.index_columns
            outfname = base_outdir + '/' + utils.get_parameter_fname(column='all')
        elif '_content' in column:
            index = [column, ]
            outfname = base_outdir + '/' + column + '.csv'
        else:
            index = [column, ] + utils.column_dependencies[column]
            outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)
        if os.path.isfile(outfname):
            os.remove(outfname)
        elif not os.path.exists(base_outdir):
            os.makedirs(base_outdir)
        with opener('w')(outfname) as outfile:
            out_fieldnames = list(index)
            out_fieldnames.append('count')
            out_data = csv.DictWriter(outfile, out_fieldnames)
            out_data.writeheader()
            # NOTE this will in general not be sorted
            for key, count in self.counts[column].iteritems():
                line = {}
                for ic in range(len(key)):
                    line[index[ic]] = key[ic]
                line['count'] = count
                out_data.writerow(line)

    print '    parameter write time: %.3f' % (time.time() - start)
def write_hist_to_file(fname, hist):
    """ see the make_hist_from* functions to reverse this operation """
    with opener('w')(fname) as histfile:
        writer = csv.DictWriter(histfile, ('bin_low_edge', 'contents', 'binerror', 'xtitle', 'binlabel'))  # this is a really crummy way of writing style information, but root files *suck*, so this is what I do for now
        writer.writeheader()
        for ibin in range(hist.GetNbinsX() + 2):
            writer.writerow({
                'bin_low_edge': hist.GetXaxis().GetBinLowEdge(ibin),
                'contents': hist.GetBinContent(ibin),
                'binerror': hist.GetBinError(ibin),
                'xtitle': hist.GetXaxis().GetTitle(),
                'binlabel': hist.GetXaxis().GetBinLabel(ibin)
            })
def write(self, outfname):
    with opener('w')(outfname) as outfile:
        header = ['bin_low_edge', 'contents', 'binlabel']
        if self.errors is not None:
            header.append('error')
        else:
            header.append('sum-weights-squared')
        writer = csv.DictWriter(outfile, header)
        writer.writeheader()
        for ib in range(self.n_bins + 2):
            row = {'bin_low_edge': self.low_edges[ib], 'contents': self.bin_contents[ib], 'binlabel': self.bin_labels[ib]}
            if self.errors is not None:
                row['error'] = self.errors[ib] if self.errors[ib] is not None else 0.0
            else:
                row['sum-weights-squared'] = self.sum_weights_squared[ib]
            writer.writerow(row)
def write(self, base_outdir, mean_freq_outfname):
    if not self.finalized:
        self.finalize()

    outdir = base_outdir + '/mute-freqs'
    utils.prep_dir(outdir, '*.csv')

    for gene in self.counts:
        counts, freqs, plotting_info = self.counts[gene], self.freqs[gene], self.plotting_info[gene]
        sorted_positions = sorted(counts)
        outfname = outdir + '/' + utils.sanitize_name(gene) + '.csv'
        with opener('w')(outfname) as outfile:
            nuke_header = []
            for nuke in utils.nukes:
                nuke_header.append(nuke)
                nuke_header.append(nuke + '_obs')
                nuke_header.append(nuke + '_lo_err')
                nuke_header.append(nuke + '_hi_err')
            writer = csv.DictWriter(outfile, ('position', 'mute_freq', 'lo_err', 'hi_err') + tuple(nuke_header))
            writer.writeheader()
            for position in sorted_positions:
                row = {'position': position,
                       'mute_freq': counts[position]['freq'],
                       'lo_err': counts[position]['freq_lo_err'],
                       'hi_err': counts[position]['freq_hi_err']}
                for nuke in utils.nukes:
                    row[nuke] = freqs[position][nuke]
                    row[nuke + '_obs'] = counts[position][nuke]
                    row[nuke + '_lo_err'] = freqs[position][nuke + '_lo_err']
                    row[nuke + '_hi_err'] = freqs[position][nuke + '_hi_err']
                writer.writerow(row)

    assert 'REGION' in mean_freq_outfname
    self.mean_rates['all'].write(mean_freq_outfname.replace('REGION', 'all'))  # hackey hackey hackey replacement... *sigh*
    for region in utils.regions:
        self.mean_rates[region].write(mean_freq_outfname.replace('REGION', region))
def read_overall_gene_probs(indir, only_gene='', normalize=True):
    """
    Return the observed counts/probabilities of choosing each gene version.
    If <normalize>, return probabilities; otherwise return counts.
    If <only_gene> is specified, just return the prob/count for that gene.
    """
    counts = {region: {} for region in regions}
    probs = {region: {} for region in regions}
    for region in regions:
        total = 0
        with opener('r')(indir + '/' + region + '_gene-probs.csv') as infile:  # NOTE this ignores correlations... which I think is actually ok, but it wouldn't hurt to think through it again at some point
            reader = csv.DictReader(infile)
            for line in reader:
                line_count = int(line['count'])
                gene = line[region + '_gene']
                total += line_count
                if gene not in counts[region]:
                    counts[region][gene] = 0
                counts[region][gene] += line_count
        if total < 1:
            assert total == 0
            print 'ERROR zero counts in %s' % (indir + '/' + region + '_gene-probs.csv')
            assert False
        for gene in counts[region]:
            probs[region][gene] = float(counts[region][gene]) / total

    if only_gene == '':  # return the whole dict (check this *before* looking up the gene's region, since get_region() can't handle an empty string)
        if normalize:
            return probs
        else:
            return counts

    if only_gene not in counts[get_region(only_gene)]:
        print '    WARNING %s not found in overall gene probs, returning zero' % only_gene
        if normalize:
            return 0.0
        else:
            return 0

    if normalize:
        return probs[get_region(only_gene)][only_gene]
    else:
        return counts[get_region(only_gene)][only_gene]
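# Illustrative sketch (not from the original code): the count -> probability normalization
# performed above for one region.  The gene names and counts are made up for demonstration.
counts = {'IGHV1-2*02': 30, 'IGHV3-23*01': 70}  # hypothetical observation counts
total = sum(counts.values())                     # 100
probs = {gene: float(n) / total for gene, n in counts.items()}
print (probs)                                    # probabilities 0.3 and 0.7, which sum to 1.0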
def write_mute_freqs(self, region, gene_name, seq, reco_event, reco_seq_fname, is_insertion=False):
    """ Read position-by-position mute freqs from disk for <gene_name>, renormalize, then write to a file for bppseqgen. """
    replacement_genes = None
    if is_insertion:
        replacement_genes = utils.find_replacement_genes(self.args.parameter_dir, min_counts=-1, all_from_region='v')
    else:
        n_occurences = utils.read_overall_gene_probs(self.args.parameter_dir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
        if n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            # print '  only saw %s %d times, use info from other genes' % (utils.color_gene(gene_name), n_occurences)
            replacement_genes = utils.find_replacement_genes(self.args.parameter_dir, min_counts=self.args.min_observations_to_write, gene_name=gene_name, single_gene=False)

    mute_freqs, mute_counts = paramutils.read_mute_info(self.args.parameter_dir, this_gene=gene_name, approved_genes=replacement_genes)

    rates = []  # list with a relative mutation rate for each position in <seq>
    total = 0.0
    # assert len(mute_freqs) == len(seq)  # only equal length if no erosions NO oh right but mute_freqs only covers areas we could align to...
    for inuke in range(len(seq)):  # append a freq for each nuke
        position = inuke + dict(reco_event.erosions.items() + reco_event.effective_erosions.items())[region + '_5p']
        freq = 0.0
        if position in mute_freqs:
            freq = mute_freqs[position]
        else:
            freq = mute_freqs['overall_mean']
        rates.append(freq)
        total += freq

    # normalize to the number of sites (i.e. so an average site is given value 1.0)
    assert total != 0.0  # I am not hip enough to divide by zero
    for inuke in range(len(seq)):
        rates[inuke] *= float(len(seq)) / total

    # and... double check it, just for shits and giggles
    total = 0.0
    for inuke in range(len(seq)):
        total += rates[inuke]
    assert utils.is_normed(total / float(len(seq)))
    assert len(rates) == len(seq)  # you just can't be too careful. what if gremlins ate a few while python wasn't looking?

    # write the input file for bppseqgen, one base per line
    with opener('w')(reco_seq_fname) as reco_seq_file:
        reco_seq_file.write('state\trate\n')
        for inuke in range(len(seq)):
            reco_seq_file.write('%s\t%.15f\n' % (seq[inuke], rates[inuke]))
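# Illustrative sketch (not from the original code): the per-site rate normalization above,
# i.e. rescaling so the *mean* relative rate over the sequence is 1.0.  The frequencies are
# made up for demonstration.
freqs = [0.02, 0.04, 0.10, 0.04]                # hypothetical per-position mutation freqs
total = sum(freqs)                               # 0.20
rates = [f * len(freqs) / total for f in freqs]  # [0.4, 0.8, 2.0, 0.8]
print (sum(rates) / len(rates))                  # 1.0 -- an average site gets relative rate 1.0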
def file_init(self, fname):
    self.errors, self.sum_weights_squared = [], []  # kill the unused one after reading file
    with opener('r')(fname) as infile:
        reader = csv.DictReader(infile)
        for line in reader:
            self.low_edges.append(float(line['bin_low_edge']))
            self.bin_contents.append(float(line['contents']))
            if 'sum-weights-squared' in line:
                self.sum_weights_squared.append(float(line['sum-weights-squared']))
            if 'error' in line or 'binerror' in line:  # in theory I should go find all the code that writes these files and make 'em use the same header for this
                assert 'sum-weights-squared' not in line
                tmp_error = float(line['error']) if 'error' in line else float(line['binerror'])
                self.errors.append(tmp_error)
            if 'binlabel' in line:
                self.bin_labels.append(line['binlabel'])
            else:
                self.bin_labels.append('')
            if 'xtitle' in line:  # should be the same for every line in the file... but this avoids complicating the file format
                self.xtitle = line['xtitle']

    self.n_bins = len(self.low_edges) - 2  # file should have a line for the under- and overflow bins
    self.xmin, self.xmax = self.low_edges[1], self.low_edges[-1]  # *upper* bound of underflow, *lower* bound of overflow
    assert sorted(self.low_edges) == self.low_edges
    assert len(self.bin_contents) == len(self.low_edges)
    assert len(self.low_edges) == len(self.bin_labels)
    if len(self.errors) == 0:  # (re)set to None if the file didn't have errors listed
        self.errors = None
        assert len(self.sum_weights_squared) == len(self.low_edges)
    if len(self.sum_weights_squared) == 0:
        self.sum_weights_squared = None
        assert len(self.errors) == len(self.low_edges)
def make_hist_from_observation_file(fname, column, hist_label='', n_bins=30, log=''):
    """ Return root histogram filled with each value from <column> in csv file <fname> """
    if not has_root:
        return None
    values = []
    with opener('r')(fname) as infile:
        reader = csv.DictReader(infile)
        for line in reader:
            values.append(float(line[column]))

    values = sorted(values)
    xbins = array('f', [0 for i in range(n_bins + 1)])  # NOTE has to be n_bins *plus* 1
    set_bins(values, n_bins, 'x' in log, xbins, var_type='float')
    hist = TH1D(hist_label, '', n_bins, xbins)
    for value in values:
        hist.Fill(value)

    return hist
def write_vdjalign_input(self, base_infname, n_procs):
    queries_per_proc = float(len(self.remaining_queries)) / n_procs
    n_queries_per_proc = int(math.ceil(queries_per_proc))
    if n_procs == 1:  # double check for rounding problems or whatnot
        assert n_queries_per_proc == len(self.remaining_queries)
    for iproc in range(n_procs):
        workdir = self.args.workdir
        if n_procs > 1:
            workdir += '/sw-' + str(iproc)
            utils.prep_dir(workdir)
        with opener('w')(workdir + '/' + base_infname) as sub_infile:
            for iquery in range(iproc * n_queries_per_proc, (iproc + 1) * n_queries_per_proc):
                if iquery >= len(self.remaining_queries):
                    break
                query_name = self.remaining_queries[iquery]
                sub_infile.write('>' + query_name + ' NUKES\n')
                seq = self.input_info[query_name]['seq']
                if query_name in self.info['indels']:
                    seq = self.info['indels'][query_name]['reversed_seq']  # use the query sequence with shm insertions and deletions reversed
                sub_infile.write(seq + '\n')