import sys
import os
import re
import math
import time
import copy
import json
import csv
import itertools
import operator
import contextlib
import pysam
from subprocess import Popen, check_output, PIPE

# NOTE the local-module import paths below are assumed from how the names are used in this file;
# adjust them to the actual package layout.
import utils
from opener import opener
from parametercounter import ParameterCounter
from performanceplotter import PerformancePlotter
from allelefinder import AlleleFinder

# ----------------------------------------------------------------------------------------
# NOTE: the standalone __init__ definitions below appear to be alternative revisions of
# Waterer.__init__ (see the class definition further down).
# ----------------------------------------------------------------------------------------
def __init__(self, args, input_info, reco_info, germline_seqs, parameter_dir, write_parameters=False, plotdir=None):
    self.parameter_dir = parameter_dir
    self.plotdir = plotdir
    self.args = args
    self.input_info = input_info
    self.reco_info = reco_info
    self.germline_seqs = germline_seqs
    self.pcounter, self.true_pcounter = None, None
    if write_parameters:
        self.pcounter = ParameterCounter(self.germline_seqs)
        if not self.args.is_data:
            self.true_pcounter = ParameterCounter(self.germline_seqs)
    self.info = {}
    self.info['all_best_matches'] = set()  # set of all the matches we found (for *all* queries)
    self.info['skipped_unproductive_queries'] = []  # list of unproductive queries
    if self.args.apply_choice_probs_in_sw:
        if self.args.debug:
            print '  reading gene choice probs from', parameter_dir
        self.gene_choice_probs = utils.read_overall_gene_probs(parameter_dir)
    with opener('r')(self.args.datadir + '/v-meta.json') as json_file:  # get location of <begin> cysteine in each v region
        self.cyst_positions = json.load(json_file)
    with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file:  # get location of <end> tryptophan in each j region (TGG)
        tryp_reader = csv.reader(csv_file)
        self.tryp_positions = {row[0] : row[1] for row in tryp_reader}  # WARNING: this doesn't filter out the header line
    self.outfile = None
    if self.args.outfname is not None:
        self.outfile = open(self.args.outfname, 'a')
    self.n_unproductive = 0
    self.n_total = 0
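# ----------------------------------------------------------------------------------------
# Hedged sketch (added for illustration, not part of the original code): one way to address the
# WARNING above about the j_tryp.csv header line. The header spellings checked below are guesses;
# csv.DictReader would also work if the real column names are known.
def read_tryp_positions(csvfname):
    """ Return {j gene name : tryptophan position}, skipping a header row if one is present. """
    tryp_positions = {}
    with open(csvfname) as csvfile:
        for row in csv.reader(csvfile):
            if row[0].lower() in ('gene', 'name'):  # assumed header spellings -- adjust to the real file
                continue
            tryp_positions[row[0]] = row[1]
    return tryp_positions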
# ----------------------------------------------------------------------------------------
def __init__(self, args, input_info, reco_info, glfo, parameter_dir, write_parameters, genes_to_use):
    print 'smith-waterman'
    sys.stdout.flush()
    self.parameter_dir = parameter_dir
    self.args = args
    self.debug = self.args.debug if self.args.sw_debug is None else self.args.sw_debug

    self.max_insertion_length = 35  # if vdjalign reports an insertion longer than this, rerun the query (typically with a different match/mismatch ratio)
    self.absolute_max_insertion_length = 200  # just ignore the query if the insertion is longer than this

    self.input_info = input_info
    self.remaining_queries = set([q for q in self.input_info.keys()])  # we remove queries from this set when we're satisfied with the current output (in general we may have to rerun some queries with different match/mismatch scores)
    self.new_indels = 0  # number of new indels that were kicked up this time through

    self.reco_info = reco_info
    self.glfo = glfo
    self.pcounter, self.true_pcounter, self.perfplotter = None, None, None
    if write_parameters:
        self.pcounter = ParameterCounter(self.glfo['seqs'])
        if not self.args.is_data:
            self.true_pcounter = ParameterCounter(self.glfo['seqs'])
    if self.args.plot_performance:
        self.perfplotter = PerformancePlotter(self.glfo['seqs'], 'sw')

    self.info = {}
    self.info['queries'] = []  # list of queries that *passed* sw, i.e. for which we have information
    self.info['all_best_matches'] = set()  # set of all the matches we found (for *all* queries)
    self.info['indels'] = {}

    if self.args.apply_choice_probs_in_sw:
        if self.debug:
            print '  reading gene choice probs from', parameter_dir
        self.gene_choice_probs = utils.read_overall_gene_probs(parameter_dir)

    self.outfile = None
    if self.args.outfname is not None:
        self.outfile = open(self.args.outfname, 'a')

    self.nth_try = 1
    self.unproductive_queries = set()

    # rewrite input germline sets (if needed)
    self.genes_to_use = genes_to_use  # if None, we use all of them.  NOTE do *not* use self.args.only_genes in this file (see partitiondriver)
    self.my_datadir = self.args.datadir  # make sure to *only* use <self.my_datadir> elsewhere in this file
    if self.genes_to_use is not None:
        self.my_datadir = self.args.workdir + '/germline-sets'
        self.rewritten_files = utils.rewrite_germline_fasta(self.args.datadir, self.my_datadir, self.genes_to_use)

    if not os.path.exists(self.args.ighutil_dir + '/bin/vdjalign'):
        raise Exception('ERROR ighutil path d.n.e: ' + self.args.ighutil_dir + '/bin/vdjalign')
# ----------------------------------------------------------------------------------------
def __init__(self, args, input_info, reco_info, glfo, my_datadir, parameter_dir, write_parameters=False, find_new_alleles=False):
    self.parameter_dir = parameter_dir.rstrip('/')
    self.args = args
    self.debug = self.args.debug if self.args.sw_debug is None else self.args.sw_debug

    self.max_insertion_length = 35  # if vdjalign reports an insertion longer than this, rerun the query (typically with a different match/mismatch ratio)
    self.absolute_max_insertion_length = 200  # just ignore the query if the insertion is longer than this

    self.input_info = input_info
    self.remaining_queries = set([q for q in self.input_info.keys()])  # we remove queries from this set when we're satisfied with the current output (in general we may have to rerun some queries with different match/mismatch scores)
    self.new_indels = 0  # number of new indels that were kicked up this time through
    self.match_mismatch = copy.deepcopy(self.args.initial_match_mismatch)  # copy, since we don't want to modify the arg itself
    self.gap_open_penalty = self.args.gap_open_penalty  # not modifying it now, but just to make sure we don't in the future

    self.reco_info = reco_info
    self.glfo = glfo
    self.info = {}
    self.info['queries'] = []  # list of queries that *passed* sw, i.e. for which we have information
    self.info['all_best_matches'] = set()  # every gene that was a best match for at least one query
    self.info['all_matches'] = {r : set() for r in utils.regions}  # every gene that was *any* match for at least one query
    self.info['indels'] = {}

    self.nth_try = 1
    self.unproductive_queries = set()

    self.my_datadir = my_datadir  # rewritten input germline sets (if needed)

    self.alfinder, self.pcounter, self.true_pcounter, self.perfplotter = None, None, None, None
    if find_new_alleles:  # NOTE *not* the same as <self.args.find_new_alleles>
        self.alfinder = AlleleFinder(self.glfo, self.args)
    if write_parameters:  # NOTE *not* the same as <self.args.cache_parameters>
        self.pcounter = ParameterCounter(self.glfo, self.args)
        if not self.args.is_data:
            self.true_pcounter = ParameterCounter(self.glfo, self.args)
    if self.args.plot_performance:
        self.perfplotter = PerformancePlotter(self.glfo, 'sw')

    if not os.path.exists(self.args.ig_sw_dir + 'ig-sw'):
        raise Exception('ERROR ig-sw path d.n.e: ' + self.args.ig_sw_dir + 'ig-sw')
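# ----------------------------------------------------------------------------------------
# Hedged sketch (added for illustration, not part of the original code): the constructors above only
# ever read attributes off <args>, so for quick testing one can fake it with a bare namespace. The
# attribute list is inferred from the Waterer class below and is probably incomplete; the default
# values are hypothetical placeholders, not the real option defaults.
def make_fake_args(workdir, datadir):
    import argparse
    return argparse.Namespace(
        debug=0, sw_debug=None, is_data=True, plot_performance=False, plotdir=None,
        apply_choice_probs_in_sw=False, skip_unproductive=False, only_genes=None,
        outfname=None, workdir=workdir, datadir=datadir,
        n_fewer_procs=1, no_clean=False, slurm=False,
        match_mismatch=[5, 1], gap_open_penalty=1000,  # mutable list on purpose: read_output() bumps the mismatch score in place
        default_v_fuzz=5, default_d_fuzz=2,
        ighutil_dir=os.getenv('HOME', '/tmp') + '/.local')  # hypothetical install location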
# ----------------------------------------------------------------------------------------
class Waterer(object):
    """ Run smith-waterman on the query sequences in <infname> """
    def __init__(self, args, input_info, reco_info, germline_seqs, parameter_dir, write_parameters=False):
        self.parameter_dir = parameter_dir
        self.args = args
        self.debug = self.args.debug if self.args.sw_debug is None else self.args.sw_debug
        self.input_info = input_info
        self.remaining_queries = [query for query in self.input_info.keys()]  # we remove queries from this list when we're satisfied with the current output (in general we may have to rerun some queries with different match/mismatch scores)
        self.new_indels = 0  # number of new indels that were kicked up this time through
        self.reco_info = reco_info
        self.germline_seqs = germline_seqs
        self.pcounter, self.true_pcounter, self.perfplotter = None, None, None
        if write_parameters:
            self.pcounter = ParameterCounter(self.germline_seqs)
            if not self.args.is_data:
                self.true_pcounter = ParameterCounter(self.germline_seqs)
        if self.args.plot_performance:
            self.perfplotter = PerformancePlotter(self.germline_seqs, 'sw')
        self.info = {}
        self.info['queries'] = []
        self.info['all_best_matches'] = set()  # set of all the matches we found (for *all* queries)
        self.info['skipped_unproductive_queries'] = []  # list of unproductive queries
        # self.info['skipped_indel_queries'] = []  # list of queries that had indels
        self.info['skipped_unknown_queries'] = []
        self.info['indels'] = {}
        if self.args.apply_choice_probs_in_sw:
            if self.debug:
                print '  reading gene choice probs from', parameter_dir
            self.gene_choice_probs = utils.read_overall_gene_probs(parameter_dir)
        with opener('r')(self.args.datadir + '/v-meta.json') as json_file:  # get location of <begin> cysteine in each v region
            self.cyst_positions = json.load(json_file)
        with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file:  # get location of <end> tryptophan in each j region (TGG)
            tryp_reader = csv.reader(csv_file)
            self.tryp_positions = {row[0] : row[1] for row in tryp_reader}  # WARNING: this doesn't filter out the header line
        self.outfile = None
        if self.args.outfname is not None:
            self.outfile = open(self.args.outfname, 'a')
        self.n_unproductive = 0
        self.n_total = 0

        print 'smith-waterman'

    # ----------------------------------------------------------------------------------------
    def __del__(self):
        if self.args.outfname is not None:
            self.outfile.close()

    # ----------------------------------------------------------------------------------------
    def clean(self):
        if self.pcounter is not None:
            self.pcounter.clean()
        if self.true_pcounter is not None:
            self.true_pcounter.clean()

    # ----------------------------------------------------------------------------------------
    def run(self):
        # start = time.time()
        base_infname = 'query-seqs.fa'
        base_outfname = 'query-seqs.bam'
        sys.stdout.flush()

        n_tries = 0
        while len(self.remaining_queries) > 0:  # we remove queries from <self.remaining_queries> as we're satisfied with their output
            self.write_vdjalign_input(base_infname, n_procs=self.args.n_fewer_procs)
            self.execute_command(base_infname, base_outfname, self.args.n_fewer_procs)
            self.read_output(base_outfname, n_procs=self.args.n_fewer_procs)
            n_tries += 1
            if n_tries > 2:
                self.info['skipped_unknown_queries'] += self.remaining_queries
                break

        self.finalize()

    # ----------------------------------------------------------------------------------------
    def finalize(self):
        if self.perfplotter is not None:
            self.perfplotter.plot(self.args.plotdir + '/sw/performance')
        # print '    sw time: %.3f' % (time.time() - start)
        if self.n_unproductive > 0:
            print '    unproductive skipped %d / %d = %.2f' % (self.n_unproductive, self.n_total, float(self.n_unproductive) / self.n_total)
        # if len(self.info['skipped_indel_queries']) > 0:
        #     print '    indels skipped %d / %d = %.2f' % (len(self.info['skipped_indel_queries']), self.n_total, float(len(self.info['skipped_indel_queries'])) / self.n_total)
        if len(self.info['indels']) > 0:
            print '    indels: %s' % ':'.join(self.info['indels'].keys())
        if self.pcounter is not None:
            self.pcounter.write(self.parameter_dir)
            if self.args.plotdir is not None:
                self.pcounter.plot(self.args.plotdir + '/sw', subset_by_gene=True, cyst_positions=self.cyst_positions, tryp_positions=self.tryp_positions)
                if self.true_pcounter is not None:
                    self.true_pcounter.plot(self.args.plotdir + '/sw/true', subset_by_gene=True, cyst_positions=self.cyst_positions, tryp_positions=self.tryp_positions)

    # ----------------------------------------------------------------------------------------
    def execute_command(self, base_infname, base_outfname, n_procs):
        if n_procs == 1:
            cmd_str = self.get_vdjalign_cmd_str(self.args.workdir, base_infname, base_outfname)
            proc = Popen(cmd_str.split(), stdout=PIPE, stderr=PIPE)
            out, err = proc.communicate()
            utils.process_out_err(out, err)
            if not self.args.no_clean:
                os.remove(self.args.workdir + '/' + base_infname)
        else:
            procs = []
            for iproc in range(n_procs):
                cmd_str = self.get_vdjalign_cmd_str(self.args.workdir + '/sw-' + str(iproc), base_infname, base_outfname)
                procs.append(Popen(cmd_str.split(), stdout=PIPE, stderr=PIPE))
                time.sleep(0.1)
            for iproc in range(len(procs)):
                out, err = procs[iproc].communicate()
                utils.process_out_err(out, err, extra_str=str(iproc))
            if not self.args.no_clean:
                for iproc in range(n_procs):
                    os.remove(self.args.workdir + '/sw-' + str(iproc) + '/' + base_infname)

        sys.stdout.flush()

    # ----------------------------------------------------------------------------------------
    def write_vdjalign_input(self, base_infname, n_procs):
        queries_per_proc = float(len(self.remaining_queries)) / n_procs
        n_queries_per_proc = int(math.ceil(queries_per_proc))
        if n_procs == 1:  # double check for rounding problems or whatnot
            assert n_queries_per_proc == len(self.remaining_queries)
        for iproc in range(n_procs):
            workdir = self.args.workdir
            if n_procs > 1:
                workdir += '/sw-' + str(iproc)
                utils.prep_dir(workdir)
            with opener('w')(workdir + '/' + base_infname) as sub_infile:
                for iquery in range(iproc * n_queries_per_proc, (iproc + 1) * n_queries_per_proc):
                    if iquery >= len(self.remaining_queries):
                        break
                    query_name = self.remaining_queries[iquery]
                    sub_infile.write('>' + query_name + ' NUKES\n')
                    seq = self.input_info[query_name]['seq']
                    if query_name in self.info['indels']:
                        seq = self.info['indels'][query_name]['reversed_seq']  # use the query sequence with shm insertions and deletions reversed
                    sub_infile.write(seq + '\n')

    # ----------------------------------------------------------------------------------------
    def get_vdjalign_cmd_str(self, workdir, base_infname, base_outfname):
        """
        Run smith-waterman alignment (from Connor's ighutil package) on the seqs in <base_infname>,
        and toss all the top matches into <base_outfname>.
        """
        # large gap-opening penalty: we want *no* gaps in the middle of the alignments
        # match score larger than (negative) mismatch score: we want to *encourage* some level of shm.
        # If they're equal, we tend to end up with short unmutated alignments, which screws everything up
        os.environ['PATH'] = os.getenv('PWD') + '/packages/samtools:' + os.getenv('PATH')
        check_output(['which', 'samtools'])
        if not os.path.exists(self.args.ighutil_dir + '/bin/vdjalign'):
            raise Exception('ERROR ighutil path d.n.e: ' + self.args.ighutil_dir + '/bin/vdjalign')
        cmd_str = self.args.ighutil_dir + '/bin/vdjalign align-fastq -q'
        if self.args.slurm:
            cmd_str = 'srun ' + cmd_str
        cmd_str += ' --max-drop 50'
        match, mismatch = self.args.match_mismatch
        cmd_str += ' --match ' + str(match) + ' --mismatch ' + str(mismatch)
        cmd_str += ' --gap-open ' + str(self.args.gap_open_penalty)  # e.g. 1000 or 50
        cmd_str += ' --vdj-dir ' + self.args.datadir
        cmd_str += ' ' + workdir + '/' + base_infname + ' ' + workdir + '/' + base_outfname
        return cmd_str
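    # ----------------------------------------------------------------------------------------
    # Illustrative sketch (added, not from the original source): with hypothetical paths and the
    # scores set to match=5, mismatch=1, gap-open=1000, get_vdjalign_cmd_str() above assembles a
    # command along these lines:
    #
    #   /path/to/ighutil/bin/vdjalign align-fastq -q --max-drop 50 --match 5 --mismatch 1 \
    #       --gap-open 1000 --vdj-dir data/imgt /tmp/sw-work/query-seqs.fa /tmp/sw-work/query-seqs.bam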
    # ----------------------------------------------------------------------------------------
    def read_output(self, base_outfname, n_procs=1):
        n_processed = 0
        for iproc in range(n_procs):
            workdir = self.args.workdir
            if n_procs > 1:
                workdir += '/sw-' + str(iproc)
            outfname = workdir + '/' + base_outfname
            with contextlib.closing(pysam.Samfile(outfname)) as bam:
                grouped = itertools.groupby(iter(bam), operator.attrgetter('qname'))
                for _, reads in grouped:  # loop over query sequences
                    self.n_total += 1
                    self.process_query(bam, list(reads))
                    n_processed += 1

            if not self.args.no_clean:
                os.remove(outfname)
                if n_procs > 1:  # still need the top-level workdir
                    os.rmdir(workdir)

        print '    processed %d queries' % n_processed

        if len(self.remaining_queries) > 0:
            if self.new_indels > 0:  # rerun as-is: the skips were due to newly-found indels
                print '      skipped %d queries (%d indels), rerunning them' % (len(self.remaining_queries), self.new_indels)
                self.new_indels = 0
            else:  # if we skipped some queries, and none of the skips were because of indels, increase the mismatch score and rerun
                print '      skipped %d queries (%d indels), increasing mismatch score (%d --> %d) and rerunning them' % (len(self.remaining_queries), self.new_indels, self.args.match_mismatch[1], self.args.match_mismatch[1] + 1)
                self.args.match_mismatch[1] += 1
                self.new_indels = 0

    # ----------------------------------------------------------------------------------------
    def get_choice_prob(self, region, gene):
        choice_prob = 1.0
        if gene in self.gene_choice_probs[region]:
            choice_prob = self.gene_choice_probs[region][gene]
        else:
            choice_prob = 0.0  # NOTE would it make sense to use something else here?
        return choice_prob

    # ----------------------------------------------------------------------------------------
    def get_indel_info(self, query_name, cigarstr, qrseq, glseq, gene):
        cigars = re.findall('[0-9][0-9]*[A-Z]', cigarstr)  # split cigar string into its parts
        cigars = [(cstr[-1], int(cstr[:-1])) for cstr in cigars]  # split each part into the code and the length

        codestr = ''
        qpos = 0  # position within query sequence
        indelfo = {'reversed_seq' : '', 'indels' : []}  # reversed_seq: query seq with insertions removed and germline bases inserted at the position of deletions
        tmp_indices = []
        for code, length in cigars:
            codestr += length * code
            if code == 'I':  # advance qr seq but not gl seq
                indelfo['indels'].append({'type' : 'insertion', 'pos' : qpos, 'len' : length, 'seqstr' : ''})  # insertion begins at <pos>
                tmp_indices += [len(indelfo['indels']) - 1 for _ in range(length)]  # indel index corresponding to this position in the alignment
            elif code == 'D':  # advance gl seq but not qr seq
                indelfo['indels'].append({'type' : 'deletion', 'pos' : qpos, 'len' : length, 'seqstr' : ''})  # first deleted base is <pos> (well, first base which is in the position of the first deleted base)
                tmp_indices += [len(indelfo['indels']) - 1 for _ in range(length)]  # indel index corresponding to this position in the alignment
            else:
                tmp_indices += [None for _ in range(length)]  # indel index corresponding to this position in the alignment
            qpos += length

        qrprintstr, glprintstr = '', ''
        iqr, igl = 0, 0
        for icode in range(len(codestr)):
            code = codestr[icode]
            if code == 'M':
                qrbase = qrseq[iqr]
                if qrbase != glseq[igl]:
                    qrbase = utils.color('red', qrbase)
                qrprintstr += qrbase
                glprintstr += glseq[igl]
                indelfo['reversed_seq'] += qrseq[iqr]  # add the base to the overall sequence with all indels reversed
            elif code == 'S':
                continue
            elif code == 'I':
                qrprintstr += utils.color('light_blue', qrseq[iqr])
                glprintstr += utils.color('light_blue', '*')
                indelfo['indels'][tmp_indices[icode]]['seqstr'] += qrseq[iqr]  # add the base to the sequence of just this indel
                igl -= 1
            elif code == 'D':
                qrprintstr += utils.color('light_blue', '*')
                glprintstr += utils.color('light_blue', glseq[igl])
                indelfo['reversed_seq'] += glseq[igl]  # add the base to the overall sequence with all indels reversed
                indelfo['indels'][tmp_indices[icode]]['seqstr'] += glseq[igl]  # and to the sequence of just this indel
                iqr -= 1
            else:
                raise Exception('unhandled code %s' % code)

            iqr += 1
            igl += 1

        print '\n      indels in %s' % query_name
        print '          %20s %s' % (gene, glprintstr)
        print '          %20s %s' % ('query', qrprintstr)
        for idl in indelfo['indels']:
            print '          %10s: %d bases at %d (%s)' % (idl['type'], idl['len'], idl['pos'], idl['seqstr'])
        # utils.undo_indels(indelfo)
        # print '          %s' % self.input_info[query_name]['seq']

        return indelfo
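    # ----------------------------------------------------------------------------------------
    # Hedged worked example (added for illustration, not from the original source): tracing
    # get_indel_info() above on a toy alignment with cigar '3M2I4M',
    #     qrseq = 'ACGTTACGT'  (9 query bases)
    #     glseq = 'ACGACGT'    (7 germline bases)
    # gives one indel record {'type': 'insertion', 'pos': 3, 'len': 2, 'seqstr': 'TT'} and
    # indelfo['reversed_seq'] == 'ACGACGT', i.e. the two inserted bases are removed so the
    # "reversed" sequence lines up with the germline match again.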
    # ----------------------------------------------------------------------------------------
    def process_query(self, bam, reads):
        primary = next((r for r in reads if not r.is_secondary), None)
        query_seq = primary.seq
        query_name = primary.qname
        first_match_query_bounds = None  # since sw excises its favorite v match, we have to know this match's boundaries in order to calculate k_d for all the other matches
        all_match_names = {}
        warnings = {}  # ick, this is a messy way to pass stuff around
        for region in utils.regions:
            all_match_names[region] = []
        all_query_bounds, all_germline_bounds = {}, {}
        n_skipped_invalid_cpos = 0
        for read in reads:  # loop over the matches found for each query sequence
            # set this match's values
            read.seq = query_seq  # only the first one has read.seq set by default, so we need to set the rest by hand
            gene = bam.references[read.tid]
            region = utils.get_region(gene)
            raw_score = read.tags[0][1]  # raw because they don't include the gene choice probs
            score = raw_score
            if self.args.apply_choice_probs_in_sw:  # NOTE I stopped applying the gene choice probs here because the smith-waterman scores don't correspond to log-probs, so throwing on the gene choice probs was dubious (and didn't seem to work that well)
                score = self.get_choice_prob(region, gene) * raw_score  # multiply by the probability to choose this gene
            qrbounds = (read.qstart, read.qend)
            glbounds = (read.pos, read.aend)
            if region == 'v' and first_match_query_bounds is None:
                first_match_query_bounds = qrbounds

            # perform a few checks and see if we want to skip this match
            if region == 'v':  # skip matches with cpos past the end of the query seq (i.e. eroded a ton on the right side of the v)
                cpos = utils.get_conserved_codon_position(self.cyst_positions, self.tryp_positions, 'v', gene, glbounds, qrbounds, assert_on_fail=False)
                if not utils.check_conserved_cysteine(self.germline_seqs['v'][gene], self.cyst_positions[gene]['cysteine-position'], assert_on_fail=False):  # some of the cysteine positions in the json file were wrong, so now we check
                    raise Exception('bad cysteine in %s: %d %s' % (gene, self.cyst_positions[gene]['cysteine-position'], self.germline_seqs['v'][gene]))
                if cpos < 0 or cpos >= len(query_seq):
                    n_skipped_invalid_cpos += 1
                    continue
            if 'I' in read.cigarstring or 'D' in read.cigarstring:  # skip indels, and tell the HMM to skip indels (you won't see any unless you decrease <self.args.gap_open_penalty>)
                if len(all_match_names[region]) == 0:  # if this is the first (best) match for this region, allow indels (otherwise skip the match)
                    if query_name not in self.info['indels']:
                        self.info['indels'][query_name] = self.get_indel_info(query_name, read.cigarstring, query_seq[qrbounds[0] : qrbounds[1]], self.germline_seqs[region][gene][glbounds[0] : glbounds[1]], gene)
                        self.info['indels'][query_name]['reversed_seq'] = query_seq[ : qrbounds[0]] + self.info['indels'][query_name]['reversed_seq'] + query_seq[qrbounds[1] : ]
                        self.new_indels += 1
                        # print ' query seq  %s' % query_seq
                        # print 'indelfo seq %s' % self.info['indels'][query_name]['reversed_seq']
                        # self.info['skipped_indel_queries'].append(query_name)
                        # self.info[query_name] = {'indels'}
                    else:
                        print '     multiple indels for %s' % query_name
                    return
                else:
                    continue
            if qrbounds[1] - qrbounds[0] != glbounds[1] - glbounds[0]:
                raise Exception('germline match (%d %d) not same length as query match (%d %d)' % (qrbounds[0], qrbounds[1], glbounds[0], glbounds[1]))

            assert qrbounds[1] <= len(query_seq)
            if glbounds[1] > len(self.germline_seqs[region][gene]):
                print '  ', gene
                print '  ', glbounds[1], len(self.germline_seqs[region][gene])
                print '  ', self.germline_seqs[region][gene]
            assert glbounds[1] <= len(self.germline_seqs[region][gene])
            assert qrbounds[1] - qrbounds[0] == glbounds[1] - glbounds[0]

            # and finally add this match's information
            warnings[gene] = ''
            all_match_names[region].append((score, gene))  # NOTE it is important that this is ordered such that the best match is first
            all_query_bounds[gene] = qrbounds
            all_germline_bounds[gene] = glbounds

        # if n_skipped_invalid_cpos > 0:
        #     print '      skipped %d invalid cpos values for %s' % (n_skipped_invalid_cpos, query_name)
        self.summarize_query(query_name, query_seq, all_match_names, all_query_bounds, all_germline_bounds, warnings, first_match_query_bounds)

    # ----------------------------------------------------------------------------------------
    def print_match(self, region, gene, query_seq, score, glbounds, qrbounds, codon_pos, warnings, skipping=False):
        if self.debug < 2:
            return
        out_str_list = []
        buff_str = (20 - len(gene)) * ' '
        tmp_val = score
        if self.args.apply_choice_probs_in_sw and self.get_choice_prob(region, gene) != 0.0:
            tmp_val = score / self.get_choice_prob(region, gene)
        if self.args.apply_choice_probs_in_sw:
            out_str_list.append('%8s%s%s%9.1e * %3.0f = %-6.1f' % (' ', utils.color_gene(gene), buff_str, self.get_choice_prob(region, gene), tmp_val, score))
        else:
            out_str_list.append('%8s%s%s%9s%3s %6.0f ' % (' ', utils.color_gene(gene), '', '', buff_str, score))
        out_str_list.append('%4d%4d  %s\n' % (glbounds[0], glbounds[1], self.germline_seqs[region][gene][glbounds[0]:glbounds[1]]))
        out_str_list.append('%46s %4d%4d' % ('', qrbounds[0], qrbounds[1]))
        out_str_list.append('  %s ' % (utils.color_mutants(self.germline_seqs[region][gene][glbounds[0]:glbounds[1]], query_seq[qrbounds[0]:qrbounds[1]])))
        if region != 'd':
            out_str_list.append('(%s %d)' % (utils.conserved_codon_names[region], codon_pos))
        if warnings[gene] != '':
            out_str_list.append('WARNING ' + warnings[gene])
        if skipping:
            out_str_list.append('skipping!')
        if self.args.outfname is None:
            print ''.join(out_str_list)
        else:
            out_str_list.append('\n')
            self.outfile.write(''.join(out_str_list))

    # ----------------------------------------------------------------------------------------
    def shift_overlapping_boundaries(self, qrbounds, glbounds, query_name, query_seq, best):
        # NOTE this does pretty much the same thing as resolve_overlapping_matches in joinparser.py
        """ s-w allows d and j matches (and v and d matches) to overlap... which makes no sense, so apportion the disputed territory between the two regions """
        for region_pairs in ({'left':'v', 'right':'d'}, {'left':'d', 'right':'j'}):
            l_reg = region_pairs['left']
            r_reg = region_pairs['right']
            l_gene = best[l_reg]
            r_gene = best[r_reg]
            overlap = qrbounds[l_gene][1] - qrbounds[r_gene][0]
            if overlap > 0:
                l_length = qrbounds[l_gene][1] - qrbounds[l_gene][0]
                r_length = qrbounds[r_gene][1] - qrbounds[r_gene][0]
                l_portion, r_portion = 0, 0
                while l_portion + r_portion < overlap:
                    if l_length <= 1 and r_length <= 1:  # don't want to erode a match (in practice it'll be the d match) all the way to zero
                        print '      ERROR both lengths went to zero'
                        assert False
                    elif l_length > 1 and r_length > 1:  # if both have length left, alternate back and forth
                        if (l_portion + r_portion) % 2 == 0:
                            l_portion += 1  # give one base to the left
                            l_length -= 1
                        else:
                            r_portion += 1  # and one to the right
                            r_length -= 1
                    elif l_length > 1:
                        l_portion += 1
                        l_length -= 1
                    elif r_length > 1:
                        r_portion += 1
                        r_length -= 1

                if self.debug:
                    print '      WARNING %s apportioning %d bases between %s (%d) match and %s (%d) match' % (query_name, overlap, l_reg, l_portion, r_reg, r_portion)
                assert l_portion + r_portion == overlap
                qrbounds[l_gene] = (qrbounds[l_gene][0], qrbounds[l_gene][1] - l_portion)
                glbounds[l_gene] = (glbounds[l_gene][0], glbounds[l_gene][1] - l_portion)
                qrbounds[r_gene] = (qrbounds[r_gene][0] + r_portion, qrbounds[r_gene][1])
                glbounds[r_gene] = (glbounds[r_gene][0] + r_portion, glbounds[r_gene][1])

                best[l_reg + '_gl_seq'] = self.germline_seqs[l_reg][l_gene][glbounds[l_gene][0] : glbounds[l_gene][1]]
                best[l_reg + '_qr_seq'] = query_seq[qrbounds[l_gene][0] : qrbounds[l_gene][1]]
                best[r_reg + '_gl_seq'] = self.germline_seqs[r_reg][r_gene][glbounds[r_gene][0] : glbounds[r_gene][1]]
                best[r_reg + '_qr_seq'] = query_seq[qrbounds[r_gene][0] : qrbounds[r_gene][1]]
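    # ----------------------------------------------------------------------------------------
    # Hedged worked example (added for illustration, not from the original source): if the best v
    # match covers query positions (0, 296) and the best d match covers (294, 310), the overlap is
    # 296 - 294 = 2, and shift_overlapping_boundaries() above hands one disputed base to each side:
    # the v match becomes (0, 295) and the d match becomes (295, 310), so they no longer overlap.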
    # ----------------------------------------------------------------------------------------
    def add_to_info(self, query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions):
        assert query_name not in self.info
        self.info['queries'].append(query_name)
        self.info[query_name] = {}
        self.info[query_name]['unique_id'] = query_name  # redundant, but used somewhere down the line
        self.info[query_name]['k_v'] = kvals['v']
        self.info[query_name]['k_d'] = kvals['d']
        self.info[query_name]['all'] = ':'.join(match_names['v'] + match_names['d'] + match_names['j'])

        # assert codon_positions['v'] != -1
        # assert codon_positions['j'] != -1
        self.info[query_name]['cdr3_length'] = codon_positions['j'] - codon_positions['v'] + 3  # i.e. tryp position in joined seq minus cyst position, plus 3
        self.info[query_name]['cyst_position'] = codon_positions['v']
        self.info[query_name]['tryp_position'] = codon_positions['j']
        if self.info[query_name]['cyst_position'] < 0 or self.info[query_name]['cyst_position'] >= len(query_seq):
            raise Exception('cpos %d invalid for %s (%s)' % (self.info[query_name]['cyst_position'], query_name, query_seq))
        if self.info[query_name]['tryp_position'] < 0 or self.info[query_name]['tryp_position'] >= len(query_seq):
            raise Exception('tpos %d invalid for %s (%s)' % (self.info[query_name]['tryp_position'], query_name, query_seq))

        # erosion, insertion, mutation info for best match
        self.info[query_name]['v_5p_del'] = all_germline_bounds[best['v']][0]
        self.info[query_name]['v_3p_del'] = len(self.germline_seqs['v'][best['v']]) - all_germline_bounds[best['v']][1]  # len(germline v) - gl_match_end
        self.info[query_name]['d_5p_del'] = all_germline_bounds[best['d']][0]
        self.info[query_name]['d_3p_del'] = len(self.germline_seqs['d'][best['d']]) - all_germline_bounds[best['d']][1]
        self.info[query_name]['j_5p_del'] = all_germline_bounds[best['j']][0]
        self.info[query_name]['j_3p_del'] = len(self.germline_seqs['j'][best['j']]) - all_germline_bounds[best['j']][1]

        self.info[query_name]['fv_insertion'] = query_seq[ : all_query_bounds[best['v']][0]]
        self.info[query_name]['vd_insertion'] = query_seq[all_query_bounds[best['v']][1] : all_query_bounds[best['d']][0]]
        self.info[query_name]['dj_insertion'] = query_seq[all_query_bounds[best['d']][1] : all_query_bounds[best['j']][0]]
        self.info[query_name]['jf_insertion'] = query_seq[all_query_bounds[best['j']][1] : ]

        for region in utils.regions:
            self.info[query_name][region + '_gene'] = best[region]
            self.info[query_name][region + '_gl_seq'] = best[region + '_gl_seq']
            self.info[query_name][region + '_qr_seq'] = best[region + '_qr_seq']
            self.info['all_best_matches'].add(best[region])

        self.info[query_name]['seq'] = query_seq  # NOTE this is the seq output by vdjalign, i.e. if we reversed any indels it is the reversed sequence

        if self.debug:
            if not self.args.is_data:
                utils.print_reco_event(self.germline_seqs, self.reco_info[query_name], extra_str='    ', label='true:', indelfo=self.reco_info[query_name]['indels'])
            utils.print_reco_event(self.germline_seqs, self.info[query_name], extra_str='    ', label='inferred:', indelfo=self.info['indels'].get(query_name, None))

        if self.pcounter is not None:
            self.pcounter.increment_reco_params(self.info[query_name])
            self.pcounter.increment_mutation_params(self.info[query_name])
        if self.true_pcounter is not None:
            self.true_pcounter.increment_reco_params(self.reco_info[query_name])
            self.true_pcounter.increment_mutation_params(self.reco_info[query_name])
        if self.perfplotter is not None:
            self.perfplotter.evaluate(self.reco_info[query_name], self.info[query_name])  # , subtract_unphysical_erosions=True)

        self.remaining_queries.remove(query_name)

    # ----------------------------------------------------------------------------------------
    def summarize_query(self, query_name, query_seq, all_match_names, all_query_bounds, all_germline_bounds, warnings, first_match_query_bounds):
        if self.debug:
            print '%s' % query_name

        best, match_names, n_matches = {}, {}, {}
        n_used = {'v':0, 'd':0, 'j':0}
        k_v_min, k_d_min = 999, 999
        k_v_max, k_d_max = 0, 0
        for region in utils.regions:
            all_match_names[region] = sorted(all_match_names[region], reverse=True)
            match_names[region] = []
        codon_positions = {'v':-1, 'd':-1, 'j':-1}  # conserved codon positions (v: cysteine, d: dummy, j: tryptophan)
        for region in utils.regions:
            n_matches[region] = len(all_match_names[region])
            n_skipped = 0
            for score, gene in all_match_names[region]:
                glbounds = all_germline_bounds[gene]
                qrbounds = all_query_bounds[gene]
                assert qrbounds[1] <= len(query_seq)  # NOTE I'm putting these up above as well (in process_query), so in time I should remove them from here
                assert glbounds[1] <= len(self.germline_seqs[region][gene])
                assert qrbounds[0] >= 0
                assert glbounds[0] >= 0
                glmatchseq = self.germline_seqs[region][gene][glbounds[0]:glbounds[1]]

                # TODO since I'm no longer skipping the genes after the first <args.n_max_per_region>, the OR of k-space below is overly conservative

                # only use a specified set of genes
                if self.args.only_genes is not None and gene not in self.args.only_genes:
                    n_skipped += 1
                    continue

                # add match to the list
                n_used[region] += 1
                match_names[region].append(gene)

                self.print_match(region, gene, query_seq, score, glbounds, qrbounds, -1, warnings, skipping=False)

                # if the germline match and the query match aren't the same length, s-w likely added an insert, which we shouldn't get since the gap-open penalty is jacked up so high
                if len(glmatchseq) != len(query_seq[qrbounds[0]:qrbounds[1]]):  # neurotic double check (um, I think) EDIT hey this totally saved my ass
                    print 'ERROR %s not same length' % query_name
                    print glmatchseq, glbounds[0], glbounds[1]
                    print query_seq[qrbounds[0]:qrbounds[1]]
                    assert False

                if region == 'v':
                    this_k_v = all_query_bounds[gene][1]  # NOTE even if the v match doesn't start at the left hand edge of the query sequence, we still measure k_v from there (in other words, sw doesn't tell the hmm about it)
                    k_v_min = min(this_k_v, k_v_min)
                    k_v_max = max(this_k_v, k_v_max)
                if region == 'd':
                    this_k_d = all_query_bounds[gene][1] - first_match_query_bounds[1]  # end of d minus end of v
                    k_d_min = min(this_k_d, k_d_min)
                    k_d_max = max(this_k_d, k_d_max)

                # check consistency with best match (since the best match is excised in s-w code, and because ham is run with *one* k_v k_d set)
                if region not in best:
                    best[region] = gene
                    best[region + '_gl_seq'] = self.germline_seqs[region][gene][glbounds[0]:glbounds[1]]
                    best[region + '_qr_seq'] = query_seq[qrbounds[0]:qrbounds[1]]
                    best[region + '_score'] = score

            if self.debug and n_skipped > 0:
                print '%8s skipped %d %s genes' % ('', n_skipped, region)

        for region in utils.regions:
            if region not in best:
                print '    no', region, 'match found for', query_name  # NOTE if no d match is found, we should really just assume the entire d was eroded
                return

        # s-w allows d and j matches to overlap, so we need to apportion the disputed bases
        try:
            self.shift_overlapping_boundaries(all_query_bounds, all_germline_bounds, query_name, query_seq, best)
        except AssertionError:
            print '%s: apportionment failed' % query_name
            return

        # check for unproductive rearrangements
        for region in utils.regions:
            codon_positions[region] = utils.get_conserved_codon_position(self.cyst_positions, self.tryp_positions, region, best[region], all_germline_bounds[best[region]], all_query_bounds[best[region]], assert_on_fail=False)  # position in the query sequence, that is
        codons_ok = utils.check_both_conserved_codons(query_seq, codon_positions['v'], codon_positions['j'], debug=self.debug, extra_str='      ', assert_on_fail=False)
        cdr3_length = codon_positions['j'] - codon_positions['v'] + 3
        in_frame_cdr3 = (cdr3_length % 3 == 0)
        if self.debug and not in_frame_cdr3:
            print '      out of frame cdr3: %d %% 3 = %d' % (cdr3_length, cdr3_length % 3)
        no_stop_codon = utils.stop_codon_check(query_seq, codon_positions['v'], debug=self.debug)
        if not codons_ok or not in_frame_cdr3 or not no_stop_codon:
            if self.debug:
                print '      unproductive rearrangement in waterer  codons_ok: %s  in_frame_cdr3: %s  no_stop_codon: %s' % (codons_ok, in_frame_cdr3, no_stop_codon)
            if self.args.skip_unproductive:
                if self.debug:
                    print '        ...skipping'
                self.n_unproductive += 1
                self.info['skipped_unproductive_queries'].append(query_name)
                return

        # best k_v, k_d:
        k_v = all_query_bounds[best['v']][1]  # end of v match
        k_d = all_query_bounds[best['d']][1] - all_query_bounds[best['v']][1]  # end of d minus end of v

        if k_d_max < 5:  # since the s-w step matches to the longest possible j and then excises it, this sometimes gobbles up the d, resulting in a very short d alignment
            if self.debug:
                print '  expanding k_d'
            k_d_max = max(8, k_d_max)

        if 'IGHJ4*' in best['j'] and self.germline_seqs['d'][best['d']][-5:] == 'ACTAC':  # the end of some d versions is the same as the start of some j versions, so the s-w frequently kicks out the 'wrong' alignment
            if self.debug:
                print '  doubly expanding k_d'
            if k_d_max - k_d_min < 8:
                k_d_min -= 5
                k_d_max += 2

        k_v_min = max(0, k_v_min - self.args.default_v_fuzz)  # ok, so I don't *actually* want it to be zero... oh, well
        k_v_max += self.args.default_v_fuzz
        k_d_min = max(1, k_d_min - self.args.default_d_fuzz)
        k_d_max += self.args.default_d_fuzz
        assert k_v_min > 0 and k_d_min > 0 and k_v_max > 0 and k_d_max > 0

        if self.debug:
            print '         k_v: %d [%d-%d)' % (k_v, k_v_min, k_v_max)
            print '         k_d: %d [%d-%d)' % (k_d, k_d_min, k_d_max)
            print '         used',
            for region in utils.regions:
                print ' %s: %d/%d' % (region, n_used[region], n_matches[region]),
            print ''

        kvals = {}
        kvals['v'] = {'best':k_v, 'min':k_v_min, 'max':k_v_max}
        kvals['d'] = {'best':k_d, 'min':k_d_min, 'max':k_d_max}
        self.add_to_info(query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions=codon_positions)
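# ----------------------------------------------------------------------------------------
# Hedged usage sketch (added for illustration, not from the original source): roughly how a driver
# script might exercise the class above. <utils.read_germlines> is an assumed name for the germline
# fasta reader, and the paths and sequence are hypothetical placeholders.
if __name__ == '__main__':
    fake_args = make_fake_args(workdir='/tmp/sw-work', datadir='data/imgt')  # hypothetical paths (see sketch above the class)
    germline_seqs = utils.read_germlines(fake_args.datadir)  # assumed helper -- swap in the real germline reader
    input_info = {'query-1' : {'seq' : 'CAGGTGCAGCTGGTGGAGTCTGG'}}  # unique id --> sequence record (truncated toy sequence)
    waterer = Waterer(fake_args, input_info, reco_info=None, germline_seqs=germline_seqs,
                      parameter_dir='/tmp/sw-work/parameters', write_parameters=False)
    waterer.run()  # annotations for queries that pass end up in waterer.info, keyed by query name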
if we reversed any indels it is the reversed sequence if self.debug: if not self.args.is_data: utils.print_reco_event( self.germline_seqs, self.reco_info[query_name], extra_str='    ', label='true:', indelfo=self.reco_info[query_name]['indels']) utils.print_reco_event(self.germline_seqs, self.info[query_name], extra_str='    ', label='inferred:', indelfo=self.info['indels'].get( query_name, None)) if self.pcounter is not None: self.pcounter.increment_reco_params(self.info[query_name]) self.pcounter.increment_mutation_params(self.info[query_name]) if self.true_pcounter is not None: self.true_pcounter.increment_reco_params( self.reco_info[query_name]) self.true_pcounter.increment_mutation_params( self.reco_info[query_name]) if self.perfplotter is not None: self.perfplotter.evaluate( self.reco_info[query_name], self.info[query_name]) #, subtract_unphysical_erosions=True) self.remaining_queries.remove(query_name) # ---------------------------------------------------------------------------------------- def summarize_query(self, query_name, query_seq, all_match_names, all_query_bounds, all_germline_bounds, warnings, first_match_query_bounds): if self.debug: print '%s' % query_name best, match_names, n_matches = {}, {}, {} n_used = {'v': 0, 'd': 0, 'j': 0} k_v_min, k_d_min = 999, 999 k_v_max, k_d_max = 0, 0 for region in utils.regions: all_match_names[region] = sorted(all_match_names[region], reverse=True) match_names[region] = [] codon_positions = { 'v': -1, 'd': -1, 'j': -1 } # conserved codon positions (v:cysteine, d:dummy, j:tryptophan) for region in utils.regions: n_matches[region] = len(all_match_names[region]) n_skipped = 0 for score, gene in all_match_names[region]: glbounds = all_germline_bounds[gene] qrbounds = all_query_bounds[gene] assert qrbounds[1] <= len( query_seq ) # NOTE I'm putting these up above as well (in process_query), so in time I should remove them from here assert glbounds[1] <= len(self.germline_seqs[region][gene]) assert qrbounds[0] >= 0 assert glbounds[0] >= 0 glmatchseq = self.germline_seqs[region][gene][ glbounds[0]:glbounds[1]] # TODO since I'm no longer skipping the genes after the first <args.n_max_per_region>, the OR of k-space below is overly conservative # only use a specified set of genes if self.args.only_genes is not None and gene not in self.args.only_genes: n_skipped += 1 continue # add match to the list n_used[region] += 1 match_names[region].append(gene) self.print_match(region, gene, query_seq, score, glbounds, qrbounds, -1, warnings, skipping=False) # if the germline match and the query match aren't the same length, s-w likely added an insert, which we shouldn't get since the gap-open penalty is jacked up so high if len(glmatchseq) != len( query_seq[qrbounds[0]:qrbounds[1]] ): # neurotic double check (um, I think) EDIT hey this totally saved my ass print 'ERROR %s not same length' % query_name print glmatchseq, glbounds[0], glbounds[1] print query_seq[qrbounds[0]:qrbounds[1]] assert False if region == 'v': this_k_v = all_query_bounds[gene][ 1] # NOTE even if the v match doesn't start at the left hand edge of the query sequence, we still measure k_v from there.
# In other words, sw doesn't tell the hmm about it k_v_min = min(this_k_v, k_v_min) k_v_max = max(this_k_v, k_v_max) if region == 'd': this_k_d = all_query_bounds[gene][ 1] - first_match_query_bounds[ 1] # end of d minus end of v k_d_min = min(this_k_d, k_d_min) k_d_max = max(this_k_d, k_d_max) # check consistency with best match (since the best match is excised in s-w code, and because ham is run with *one* k_v k_d set) if region not in best: best[region] = gene best[region + '_gl_seq'] = self.germline_seqs[region][ gene][glbounds[0]:glbounds[1]] best[region + '_qr_seq'] = query_seq[qrbounds[0]:qrbounds[1]] best[region + '_score'] = score if self.debug and n_skipped > 0: print '%8s skipped %d %s genes' % ('', n_skipped, region) for region in utils.regions: if region not in best: print ' no', region, 'match found for', query_name # NOTE if no d match found, we should really just assume entire d was eroded return # s-w allows d and j matches to overlap, so we need to apportion the disputed bases try: self.shift_overlapping_boundaries(all_query_bounds, all_germline_bounds, query_name, query_seq, best) except AssertionError: print '%s: apportionment failed' % query_name return # check for unproductive rearrangements for region in utils.regions: codon_positions[region] = utils.get_conserved_codon_position( self.cyst_positions, self.tryp_positions, region, best[region], all_germline_bounds[best[region]], all_query_bounds[best[region]], assert_on_fail=False ) # position in the query sequence, that is codons_ok = utils.check_both_conserved_codons(query_seq, codon_positions['v'], codon_positions['j'], debug=self.debug, extra_str=' ', assert_on_fail=False) cdr3_length = codon_positions['j'] - codon_positions['v'] + 3 in_frame_cdr3 = (cdr3_length % 3 == 0) if self.debug and not in_frame_cdr3: print ' out of frame cdr3: %d %% 3 = %d' % (cdr3_length, cdr3_length % 3) no_stop_codon = utils.stop_codon_check(query_seq, codon_positions['v'], debug=self.debug) if not codons_ok or not in_frame_cdr3 or not no_stop_codon: if self.debug: print ' unproductive rearrangement in waterer codons_ok: %s in_frame_cdr3: %s no_stop_codon: %s' % ( codons_ok, in_frame_cdr3, no_stop_codon) if self.args.skip_unproductive: if self.debug: print ' ...skipping' self.n_unproductive += 1 self.info['skipped_unproductive_queries'].append(query_name) return # best k_v, k_d: k_v = all_query_bounds[best['v']][1] # end of v match k_d = all_query_bounds[best['d']][1] - all_query_bounds[best['v']][ 1] # end of d minus end of v if k_d_max < 5: # since the s-w step matches to the longest possible j and then excises it, this sometimes gobbles up the d, resulting in a very short d alignment. if self.debug: print ' expanding k_d' k_d_max = max(8, k_d_max) if 'IGHJ4*' in best['j'] and self.germline_seqs['d'][best['d']][ -5:] == 'ACTAC': # the end of some d versions is the same as the start of some j versions, so the s-w frequently kicks out the 'wrong' alignment if self.debug: print ' doubly expanding k_d' if k_d_max - k_d_min < 8: k_d_min -= 5 k_d_max += 2 k_v_min = max( 0, k_v_min - self.args.default_v_fuzz ) # ok, so I don't *actually* want it to be zero... 
oh, well k_v_max += self.args.default_v_fuzz k_d_min = max(1, k_d_min - self.args.default_d_fuzz) k_d_max += self.args.default_d_fuzz assert k_v_min > 0 and k_d_min > 0 and k_v_max > 0 and k_d_max > 0 if self.debug: print ' k_v: %d [%d-%d)' % (k_v, k_v_min, k_v_max) print ' k_d: %d [%d-%d)' % (k_d, k_d_min, k_d_max) print ' used', for region in utils.regions: print ' %s: %d/%d' % (region, n_used[region], n_matches[region]), print '' kvals = {} kvals['v'] = {'best': k_v, 'min': k_v_min, 'max': k_v_max} kvals['d'] = {'best': k_d, 'min': k_d_min, 'max': k_d_max} self.add_to_info(query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions=codon_positions)
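# ----------------------------------------------------------------------------------------
# Illustration only (not part of the class above): a minimal standalone sketch of the k-space
# bookkeeping that summarize_query() does before handing kvals to add_to_info(). Each k_v is the
# end of a v match in query coordinates, each k_d is the end of a d match minus the end of the
# *first* (best) v match (since s-w excises its favorite v match), and the min/max over all
# matches are then widened by the fuzz arguments. The name <toy_kbounds>, the example bounds and
# the default fuzz values are made up for this sketch, and it omits the k_d expansion special
# cases handled above.
def toy_kbounds(v_query_bounds, d_query_bounds, v_fuzz=5, d_fuzz=2):
    """ v_query_bounds / d_query_bounds: lists of (start, end) match bounds in query coordinates, best match first. """
    first_v_end = v_query_bounds[0][1]  # k_d for every d match is measured from the end of the best v match
    k_v_list = [qb[1] for qb in v_query_bounds]  # end of each v match
    k_d_list = [qb[1] - first_v_end for qb in d_query_bounds]  # end of each d match minus end of best v
    return {'v': {'best': k_v_list[0], 'min': max(1, min(k_v_list) - v_fuzz), 'max': max(k_v_list) + v_fuzz},
            'd': {'best': k_d_list[0], 'min': max(1, min(k_d_list) - d_fuzz), 'max': max(k_d_list) + d_fuzz}}

# e.g. toy_kbounds([(0, 296), (0, 294)], [(299, 315), (301, 312)]) gives k_v in [289, 301) with best 296,
# and k_d in [14, 21) with best 19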
utils.remove_from_arglist(sys.argv, '--fasta-output-file', has_arg=True) args = parser.parse_args() args.extra_columns = utils.get_arg_list(args.extra_columns) assert utils.getsuffix(args.outfile) in ['.csv', '.tsv', '.fa', '.fasta'] default_glfo_dir = partis_dir + '/data/germlines/human' if utils.getsuffix(args.infile) == '.csv' and args.glfo_dir is None: print ' note: reading deprecated csv format, so need to get germline info from a separate directory; --glfo-dir was not set, so using default %s. If it doesn\'t crash, it\'s probably ok.' % default_glfo_dir args.glfo_dir = default_glfo_dir glfo, annotation_list, cpath = utils.read_output(args.infile, glfo_dir=args.glfo_dir, locus=args.locus) if args.plotdir is not None: from parametercounter import ParameterCounter setattr(args, 'region_end_exclusions', {r : [0 for e in ['5p', '3p']] for r in utils.regions}) # hackity hackity hackity pcounter = ParameterCounter(glfo, args) for line in annotation_list: pcounter.increment(line) pcounter.plot(args.plotdir) #, make_per_base_plots=True) #, only_overall=True, make_per_base_plots=True sys.exit(0) if cpath is None or cpath.i_best is None: clusters_to_use = [l['unique_ids'] for l in annotation_list] print ' no cluster path in input file, so just using all %d sequences (in %d clusters) in annotations' % (sum(len(c) for c in clusters_to_use), len(clusters_to_use)) else: ipartition = cpath.i_best if args.partition_index is None else args.partition_index print ' found %d clusters in %s' % (len(cpath.partitions[ipartition]), 'best partition' if args.partition_index is None else 'partition at index %d (of %d)' % (ipartition, len(cpath.partitions))) if args.cluster_index is None: clusters_to_use = cpath.partitions[ipartition] print ' taking all %d clusters' % len(clusters_to_use) else:
def read_hmm_output(self, algorithm, hmm_csv_outfname, make_clusters=True, count_parameters=False, parameter_out_dir=None, plotdir=None): print ' read output' if count_parameters: assert parameter_out_dir is not None assert plotdir is not None pcounter = ParameterCounter(self.germline_seqs) if count_parameters else None true_pcounter = ParameterCounter(self.germline_seqs) if (count_parameters and not self.args.is_data) else None perfplotter = PerformancePlotter(self.germline_seqs, plotdir + '/hmm/performance', 'hmm') if self.args.plot_performance else None n_processed = 0 hmminfo = [] with opener('r')(hmm_csv_outfname) as hmm_csv_outfile: reader = csv.DictReader(hmm_csv_outfile) last_key = None boundary_error_queries = [] for line in reader: utils.intify(line, splitargs=('unique_ids', 'seqs')) ids = line['unique_ids'] this_key = utils.get_key(ids) same_event = from_same_event(self.args.is_data, True, self.reco_info, ids) id_str = ''.join(['%20s ' % i for i in ids]) # check for errors if last_key != this_key: # if this is the first line for this set of ids (i.e. the best viterbi path or only forward score) if line['errors'] != None and 'boundary' in line['errors'].split(':'): boundary_error_queries.append(':'.join([str(uid) for uid in ids])) else: assert len(line['errors']) == 0 if algorithm == 'viterbi': line['seq'] = line['seqs'][0] # add info for the best match as 'seq' line['unique_id'] = ids[0] utils.add_match_info(self.germline_seqs, line, self.cyst_positions, self.tryp_positions, debug=(self.args.debug > 0)) if last_key != this_key or self.args.plot_all_best_events: # if this is the first line (i.e. the best viterbi path) for this query (or query pair), print the true event n_processed += 1 if self.args.debug: print '%s %d' % (id_str, same_event) if line['cdr3_length'] != -1 or not self.args.skip_unproductive: # if it's productive, or if we're not skipping unproductive rearrangements hmminfo.append(dict([('unique_id', line['unique_ids'][0]), ] + line.items())) if pcounter is not None: # increment counters (but only for the best [first] match) pcounter.increment(line) if true_pcounter is not None: # increment true counters true_pcounter.increment(self.reco_info[ids[0]]) if perfplotter is not None: perfplotter.evaluate(self.reco_info[ids[0]], line) if self.args.debug: self.print_hmm_output(line, print_true=(last_key != this_key), perfplotter=perfplotter) line['seq'] = None line['unique_id'] = None else: # for forward, write the pair scores to file to be read by the clusterer if not make_clusters: # self.args.debug or print '%3d %10.3f %s' % (same_event, float(line['score']), id_str) if line['score'] == '-nan': print ' WARNING encountered -nan, setting to -999999.0' score = -999999.0 else: score = float(line['score']) if len(ids) == 2: hmminfo.append({'id_a':line['unique_ids'][0], 'id_b':line['unique_ids'][1], 'score':score}) n_processed += 1 last_key = utils.get_key(ids) if pcounter is not None: pcounter.write(parameter_out_dir) if not self.args.no_plot: pcounter.plot(plotdir, subset_by_gene=True, cyst_positions=self.cyst_positions, tryp_positions=self.tryp_positions) if true_pcounter is not None: true_pcounter.write(parameter_out_dir + '/true') if not self.args.no_plot: true_pcounter.plot(plotdir + '/true', subset_by_gene=True, cyst_positions=self.cyst_positions, tryp_positions=self.tryp_positions) if perfplotter is not None: perfplotter.plot() print ' processed %d queries' % n_processed if len(boundary_error_queries) > 0: print ' %d boundary errors (%s)' % 
(len(boundary_error_queries), ', '.join(boundary_error_queries)) return hmminfo
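# ----------------------------------------------------------------------------------------
# Illustration only (standalone, not part of the function above): a minimal sketch of the
# forward-score bookkeeping in read_hmm_output() -- pulling pair scores out of the hmm csv for
# the clusterer, and guarding against the '-nan' scores that occasionally show up. The column
# names ('unique_ids', 'score') follow the code above, and the ':' id separator matches the
# joining convention used elsewhere in this file; the toy csv rows are made up, and -999999.0
# stands in for a score so low that the clusterer presumably never merges the pair.
import csv

toy_csv_lines = ['unique_ids,score', 'a:b,-23.4', 'c:d,-nan']
pair_scores = []
for row in csv.DictReader(toy_csv_lines):
    ids = row['unique_ids'].split(':')
    if row['score'] == '-nan':  # bad forward score
        score = -999999.0
    else:
        score = float(row['score'])
    if len(ids) == 2:  # only pair scores go to the clusterer
        pair_scores.append({'id_a': ids[0], 'id_b': ids[1], 'score': score})
# pair_scores is now [{'id_a': 'a', 'id_b': 'b', 'score': -23.4}, {'id_a': 'c', 'id_b': 'd', 'score': -999999.0}]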
class Waterer(object): """ Run smith-waterman on the query sequences in <infname> """ def __init__(self, args, input_info, reco_info, glfo, my_datadir, parameter_dir, write_parameters=False, find_new_alleles=False): self.parameter_dir = parameter_dir.rstrip('/') self.args = args self.debug = self.args.debug if self.args.sw_debug is None else self.args.sw_debug self.max_insertion_length = 35 # if vdjalign reports an insertion longer than this, rerun the query (typically with different match/mismatch ratio) self.absolute_max_insertion_length = 200 # just ignore them if it's longer than this self.input_info = input_info self.remaining_queries = set([q for q in self.input_info.keys()]) # we remove queries from this set when we're satisfied with the current output (in general we may have to rerun some queries with different match/mismatch scores) self.new_indels = 0 # number of new indels that were kicked up this time through self.match_mismatch = copy.deepcopy(self.args.initial_match_mismatch) # don't want to modify it! self.gap_open_penalty = self.args.gap_open_penalty # not modifying it now, but just to make sure we don't in the future self.reco_info = reco_info self.glfo = glfo self.info = {} self.info['queries'] = [] # list of queries that *passed* sw, i.e. for which we have information self.info['all_best_matches'] = set() # every gene that was a best match for at least one query self.info['all_matches'] = {r : set() for r in utils.regions} # every gene that was *any* match for at least one query self.info['indels'] = {} self.nth_try = 1 self.unproductive_queries = set() # rewrite input germline sets (if needed) self.my_datadir = my_datadir self.alfinder, self.pcounter, self.true_pcounter, self.perfplotter = None, None, None, None if find_new_alleles: # NOTE *not* the same as <self.args.find_new_alleles> self.alfinder = AlleleFinder(self.glfo, self.args) if write_parameters: # NOTE *not* the same as <self.args.cache_parameters> self.pcounter = ParameterCounter(self.glfo, self.args) if not self.args.is_data: self.true_pcounter = ParameterCounter(self.glfo, self.args) if self.args.plot_performance: self.perfplotter = PerformancePlotter(self.glfo, 'sw') if not os.path.exists(self.args.ig_sw_dir + 'ig-sw'): raise Exception('ERROR ig-sw path d.n.e: ' + self.args.ig_sw_dir + 'ig-sw') # ---------------------------------------------------------------------------------------- def run(self): # start = time.time() base_infname = 'query-seqs.fa' base_outfname = 'query-seqs.sam' sys.stdout.flush() n_procs = self.args.n_fewer_procs initial_queries_per_proc = float(len(self.remaining_queries)) / n_procs while len(self.remaining_queries) > 0: # we remove queries from <self.remaining_queries> as we're satisfied with their output if self.nth_try > 1 and float(len(self.remaining_queries)) / n_procs < initial_queries_per_proc: n_procs = int(max(1., float(len(self.remaining_queries)) / initial_queries_per_proc)) self.write_vdjalign_input(base_infname, n_procs) self.execute_commands(base_infname, base_outfname, n_procs) self.read_output(base_outfname, n_procs) if self.nth_try > 3: break self.nth_try += 1 # it's set to 1 before we begin the first try, and increases to 2 just before we start the second try self.finalize() # ---------------------------------------------------------------------------------------- def finalize(self): if self.perfplotter is not None: self.perfplotter.plot(self.args.plotdir + '/sw', only_csv=self.args.only_csv_plots) # print ' sw time: %.3f' % (time.time()-start) print ' info for %d' 
% len(self.info['queries']), skipped_unproductive = len(self.unproductive_queries) n_remaining = len(self.remaining_queries) if skipped_unproductive > 0 or n_remaining > 0: print ' (skipped', print '%d / %d = %.2f unproductive' % (skipped_unproductive, len(self.input_info), float(skipped_unproductive) / len(self.input_info)), if n_remaining > 0: print ' %d / %d = %.2f other' % (n_remaining, len(self.input_info), float(n_remaining) / len(self.input_info)), print ')', print '' sys.stdout.flush() if n_remaining > 0: printstr = ' %s %d missing %s' % (utils.color('red', 'warning'), n_remaining, utils.plural_str('annotation', n_remaining)) if n_remaining < 15: printstr += ' (' + ':'.join(self.remaining_queries) + ')' print printstr if self.debug and len(self.info['indels']) > 0: print ' indels: %s' % ':'.join(self.info['indels'].keys()) assert len(self.info['queries']) + skipped_unproductive + n_remaining == len(self.input_info) if self.debug and not self.args.is_data and n_remaining > 0: print 'true annotations for remaining events:' for qry in self.remaining_queries: utils.print_reco_event(self.glfo['seqs'], self.reco_info[qry], extra_str=' ', label='true:') if self.alfinder is not None: self.alfinder.finalize(debug=self.args.debug_new_allele_finding) self.info['new-alleles'] = self.alfinder.new_allele_info if self.args.plotdir is not None: self.alfinder.plot(self.args.plotdir + '/sw', only_csv=self.args.only_csv_plots) # add padded info to self.info (returns if stuff has already been padded) self.pad_seqs_to_same_length() # NOTE this uses *all the gene matches (not just the best ones), so it has to come before we call pcounter.write(), since that fcn rewrites the germlines removing genes that weren't best matches. But NOTE also that I'm not sure what but that the padding actually *needs* all matches (rather than just all *best* matches) if self.pcounter is not None: if self.args.plotdir is not None: self.pcounter.plot(self.args.plotdir + '/sw', subset_by_gene=True, cyst_positions=self.glfo['cyst-positions'], tryp_positions=self.glfo['tryp-positions'], only_csv=self.args.only_csv_plots) if self.true_pcounter is not None: self.true_pcounter.plot(self.args.plotdir + '/sw-true', subset_by_gene=True, cyst_positions=self.glfo['cyst-positions'], tryp_positions=self.glfo['tryp-positions'], only_csv=self.args.only_csv_plots) self.pcounter.write(self.parameter_dir, self.my_datadir) if self.true_pcounter is not None: self.true_pcounter.write(self.parameter_dir + '-true') self.info['remaining_queries'] = self.remaining_queries # ---------------------------------------------------------------------------------------- def subworkdir(self, iproc, n_procs): if n_procs == 1: return self.args.workdir else: return self.args.workdir + '/sw-' + str(iproc) # ---------------------------------------------------------------------------------------- def execute_commands(self, base_infname, base_outfname, n_procs): # ---------------------------------------------------------------------------------------- def get_outfname(iproc): return self.subworkdir(iproc, n_procs) + '/' + base_outfname # ---------------------------------------------------------------------------------------- def get_cmd_str(iproc): return self.get_vdjalign_cmd_str(self.subworkdir(iproc, n_procs), base_infname, base_outfname, n_procs) # start all procs for the first time procs, n_tries = [], [] for iproc in range(n_procs): procs.append(utils.run_cmd(get_cmd_str(iproc), self.subworkdir(iproc, n_procs))) n_tries.append(1) time.sleep(0.1) # keep looping 
over the procs until they're all done while procs.count(None) != len(procs): # we set each proc to None when it finishes for iproc in range(n_procs): if procs[iproc] is None: # already finished continue if procs[iproc].poll() is not None: # it's finished utils.finish_process(iproc, procs, n_tries, self.subworkdir(iproc, n_procs), get_outfname(iproc), get_cmd_str(iproc)) sys.stdout.flush() time.sleep(1) for iproc in range(n_procs): os.remove(self.subworkdir(iproc, n_procs) + '/' + base_infname) sys.stdout.flush() # ---------------------------------------------------------------------------------------- def write_vdjalign_input(self, base_infname, n_procs): n_remaining = len(self.remaining_queries) queries_per_proc = float(n_remaining) / n_procs n_queries_per_proc = int(math.ceil(queries_per_proc)) written_queries = set() # make sure we actually write each query TODO remove this when you work out where they're disappearing to if n_procs == 1: # double check for rounding problems or whatnot assert n_queries_per_proc == n_remaining for iproc in range(n_procs): workdir = self.subworkdir(iproc, n_procs) if n_procs > 1: utils.prep_dir(workdir) with opener('w')(workdir + '/' + base_infname) as sub_infile: iquery = 0 for query_name in self.remaining_queries: # NOTE this is wasteful to loop of all the remaining queries for each process... but maybe not that wasteful if iquery >= n_remaining: break if iquery < iproc*n_queries_per_proc or iquery >= (iproc + 1)*n_queries_per_proc: # not for this process iquery += 1 continue sub_infile.write('>' + query_name + ' NUKES\n') seq = self.input_info[query_name]['seq'] if query_name in self.info['indels']: seq = self.info['indels'][query_name]['reversed_seq'] # use the query sequence with shm insertions and deletions reversed sub_infile.write(seq + '\n') written_queries.add(query_name) iquery += 1 not_written = self.remaining_queries - written_queries if len(not_written) > 0: raise Exception('didn\'t write %s to %s' % (':'.join(not_written), self.args.workdir)) # ---------------------------------------------------------------------------------------- def get_vdjalign_cmd_str(self, workdir, base_infname, base_outfname, n_procs=None): """ Run smith-waterman alignment (from Connor's ighutils package) on the seqs in <base_infname>, and toss all the top matches into <base_outfname>. """ # large gap-opening penalty: we want *no* gaps in the middle of the alignments # match score larger than (negative) mismatch score: we want to *encourage* some level of shm. If they're equal, we tend to end up with short unmutated alignments, which screws everything up cmd_str = '/partis/packages/ig-sw/src/ig_align/ig-sw' if self.args.slurm or utils.auto_slurm(n_procs): cmd_str = 'srun ' + cmd_str cmd_str += ' -l ' + 'IG' + self.args.chain.upper() cmd_str += ' -d 50' match, mismatch = self.match_mismatch cmd_str += ' -m ' + str(match) + ' -u ' + str(mismatch) cmd_str += ' -o ' + str(self.gap_open_penalty) cmd_str += ' -p ' + self.my_datadir + '/' + self.args.chain + '/' cmd_str += ' ' + workdir + '/' + base_infname + ' ' + workdir + '/' + base_outfname return cmd_str # ---------------------------------------------------------------------------------------- def read_output(self, base_outfname, n_procs=1): queries_to_rerun = OrderedDict() # This is to keep track of every query that we don't add to self.info (i.e. 
it does *not* include unproductive queries that we ignore/skip entirely because we were told to by a command line argument) # ...whereas <self.unproductive_queries> is to keep track of the queries that were definitively unproductive (i.e. we removed them from self.remaining_queries) when we were told to skip unproductives by a command line argument for reason in ['unproductive', 'no-match', 'weird-annot.', 'nonsense-bounds', 'invalid-codon']: queries_to_rerun[reason] = set() self.new_indels = 0 n_processed = 0 self.tmp_queries_read_from_file = set() # TODO remove this for iproc in range(n_procs): outfname = self.subworkdir(iproc, n_procs) + '/' + base_outfname with contextlib.closing(pysam.Samfile(outfname)) as sam: #changed bam to sam because ig-sw outputs sam files grouped = itertools.groupby(iter(sam), operator.attrgetter('qname')) for _, reads in grouped: # loop over query sequences self.process_query(sam.references, list(reads), queries_to_rerun) n_processed += 1 not_read = self.remaining_queries - self.tmp_queries_read_from_file if len(not_read) > 0: raise Exception('didn\'t read %s from %s' % (':'.join(not_read), self.args.workdir)) if self.nth_try == 1: print ' processed remaining new-indels rerun: ' + ' '.join([reason for reason in queries_to_rerun]) print ' %8d' % n_processed, if len(self.remaining_queries) > 0: printstr = ' %8d' % len(self.remaining_queries) printstr += ' %8d' % self.new_indels printstr += ' ' n_to_rerun = 0 for reason in queries_to_rerun: printstr += ' %8d' % len(queries_to_rerun[reason]) n_to_rerun += len(queries_to_rerun[reason]) print printstr, if n_to_rerun + self.new_indels != len(self.remaining_queries): print '' raise Exception('numbers don\'t add up in sw output reader (n_to_rerun + new_indels != remaining_queries): %d + %d != %d (look in %s)' % (n_to_rerun, self.new_indels, len(self.remaining_queries), self.args.workdir)) if self.nth_try < 2 or self.new_indels == 0: # increase the mismatch score if it's the first try, or if there's no new indels print ' increasing mismatch score (%d --> %d) and rerunning them' % (self.match_mismatch[1], self.match_mismatch[1] + 1) self.match_mismatch[1] += 1 elif self.new_indels > 0: # if there were some indels, rerun with the same parameters (but when the input is written the indel will be "reversed' in the sequences that's passed to ighutil) print ' rerunning for indels' self.new_indels = 0 else: # shouldn't get here assert False else: print ' all done' for iproc in range(n_procs): workdir = self.subworkdir(iproc, n_procs) os.remove(workdir + '/' + base_outfname) if n_procs > 1: # still need the top-level workdir os.rmdir(workdir) # ---------------------------------------------------------------------------------------- def get_indel_info(self, query_name, cigarstr, qrseq, glseq, gene): cigars = re.findall('[0-9][0-9]*[A-Z]', cigarstr) # split cigar string into its parts cigars = [(cstr[-1], int(cstr[:-1])) for cstr in cigars] # split each part into the code and the length codestr = '' qpos = 0 # position within query sequence indelfo = utils.get_empty_indel() # replacement_seq: query seq with insertions removed and germline bases inserted at the position of deletions tmp_indices = [] for code, length in cigars: codestr += length * code if code == 'I': # advance qr seq but not gl seq indelfo['indels'].append({'type' : 'insertion', 'pos' : qpos, 'len' : length, 'seqstr' : ''}) # insertion begins at <pos> tmp_indices += [len(indelfo['indels']) - 1 for _ in range(length)]# indel index corresponding to this position in 
the alignment elif code == 'D': # advance qr seq but not gl seq indelfo['indels'].append({'type' : 'deletion', 'pos' : qpos, 'len' : length, 'seqstr' : ''}) # first deleted base is <pos> (well, first base which is in the position of the first deleted base) tmp_indices += [len(indelfo['indels']) - 1 for _ in range(length)]# indel index corresponding to this position in the alignment else: tmp_indices += [None for _ in range(length)] # indel index corresponding to this position in the alignment qpos += length qrprintstr, glprintstr = '', '' iqr, igl = 0, 0 for icode in range(len(codestr)): code = codestr[icode] if code == 'M': qrbase = qrseq[iqr] if qrbase != glseq[igl]: qrbase = utils.color('red', qrbase) qrprintstr += qrbase glprintstr += glseq[igl] indelfo['reversed_seq'] += qrseq[iqr] # add the base to the overall sequence with all indels reversed elif code == 'S': continue elif code == 'I': qrprintstr += utils.color('light_blue', qrseq[iqr]) glprintstr += utils.color('light_blue', '*') indelfo['indels'][tmp_indices[icode]]['seqstr'] += qrseq[iqr] # and to the sequence of just this indel igl -= 1 elif code == 'D': qrprintstr += utils.color('light_blue', '*') glprintstr += utils.color('light_blue', glseq[igl]) indelfo['reversed_seq'] += glseq[igl] # add the base to the overall sequence with all indels reversed indelfo['indels'][tmp_indices[icode]]['seqstr'] += glseq[igl] # and to the sequence of just this indel iqr -= 1 else: raise Exception('unhandled code %s' % code) iqr += 1 igl += 1 if self.debug: print '\n indels in %s' % query_name print ' %20s %s' % (gene, glprintstr) print ' %20s %s' % ('query', qrprintstr) for idl in indelfo['indels']: print ' %10s: %d bases at %d (%s)' % (idl['type'], idl['len'], idl['pos'], idl['seqstr']) # utils.undo_indels(indelfo) # print ' %s' % self.input_info[query_name]['seq'] return indelfo # ---------------------------------------------------------------------------------------- def process_query(self, references, reads, queries_to_rerun): primary = next((r for r in reads if not r.is_secondary), None) query_seq = primary.seq query_name = primary.qname self.tmp_queries_read_from_file.add(query_name) first_match_query_bounds = None # since sw excises its favorite v match, we have to know this match's boundaries in order to calculate k_d for all the other matches all_match_names = {} warnings = {} # ick, this is a messy way to pass stuff around for region in utils.regions: all_match_names[region] = [] all_query_bounds, all_germline_bounds = {}, {} for read in reads: # loop over the matches found for each query sequence # set this match's values read.seq = query_seq # only the first one has read.seq set by default, so we need to set the rest by hand gene = references[read.tid] region = utils.get_region(gene) raw_score = read.tags[0][1] # raw because they don't include the gene choice probs score = raw_score qrbounds = (read.qstart, read.qend) glbounds = (read.pos, read.aend) if region == 'v' and first_match_query_bounds is None: first_match_query_bounds = qrbounds # perform a few checks and see if we want to skip this match # TODO I wish this wasn't here and I suspect I don't really need it (any more) UPDATE I dunno, this definitely eliminates some stupid (albeit rare) matches if region == 'v': # skip matches with cpos past the end of the query seq (i.e. 
eroded a ton on the right side of the v) cpos = self.glfo['cyst-positions'][gene] - glbounds[0] + qrbounds[0] # position within original germline gene, minus the position in that germline gene at which the match starts, plus the position in the query sequence at which the match starts if cpos < 0 or cpos >= len(query_seq): continue if 'I' in read.cigarstring or 'D' in read.cigarstring: # skip indels, and tell the HMM to skip indels (you won't see any unless you decrease the <self.gap_open_penalty>) if self.args.no_indels: # you can forbid indels on the command line continue if self.nth_try < 2: # we also forbid indels on the first try (we want to increase the mismatch score before we conclude it's "really" an indel) continue if len(all_match_names[region]) == 0: # if this is the first (best) match for this region, allow indels (otherwise skip the match) if query_name not in self.info['indels']: self.info['indels'][query_name] = self.get_indel_info(query_name, read.cigarstring, query_seq[qrbounds[0] : qrbounds[1]], self.glfo['seqs'][region][gene][glbounds[0] : glbounds[1]], gene) self.info['indels'][query_name]['reversed_seq'] = query_seq[ : qrbounds[0]] + self.info['indels'][query_name]['reversed_seq'] + query_seq[qrbounds[1] : ] self.new_indels += 1 # TODO this 'return' used to be after and indented from the else below, and that continue wasn't there. I should make sure this is how I want it return # don't process this query any further -- since it's now in the indel info it'll get run next time through else: if self.debug: print ' ignoring subsequent indels for %s' % query_name continue # hopefully there's a later match without indels else: continue if qrbounds[1]-qrbounds[0] != glbounds[1]-glbounds[0]: raise Exception('germline match (%d %d) not same length as query match (%d %d)' % (qrbounds[0], qrbounds[1], glbounds[0], glbounds[1])) assert qrbounds[1] <= len(query_seq) if glbounds[1] > len(self.glfo['seqs'][region][gene]): print ' ', gene print ' ', glbounds[1], len(self.glfo['seqs'][region][gene]) print ' ', self.glfo['seqs'][region][gene] assert glbounds[1] <= len(self.glfo['seqs'][region][gene]) assert qrbounds[1]-qrbounds[0] == glbounds[1]-glbounds[0] # and finally add this match's information warnings[gene] = '' all_match_names[region].append((score, gene)) # NOTE it is important that this is ordered such that the best match is first all_query_bounds[gene] = qrbounds all_germline_bounds[gene] = glbounds self.summarize_query(query_name, query_seq, all_match_names, all_query_bounds, all_germline_bounds, warnings, first_match_query_bounds, queries_to_rerun) # ---------------------------------------------------------------------------------------- def print_match(self, region, gene, query_seq, score, glbounds, qrbounds, codon_pos, warnings, skipping=False): out_str_list = [] buff_str = (20 - len(gene)) * ' ' out_str_list.append('%8s%s%s%9s%3s %6.0f ' % (' ', utils.color_gene(gene), '', '', buff_str, score)) out_str_list.append('%4d%4d %s\n' % (glbounds[0], glbounds[1], self.glfo['seqs'][region][gene][glbounds[0]:glbounds[1]])) out_str_list.append('%46s %4d%4d' % ('', qrbounds[0], qrbounds[1])) out_str_list.append(' %s ' % (utils.color_mutants(self.glfo['seqs'][region][gene][glbounds[0]:glbounds[1]], query_seq[qrbounds[0]:qrbounds[1]]))) if region != 'd': out_str_list.append('(%s %d)' % (utils.conserved_codons[region], codon_pos)) if warnings[gene] != '': out_str_list.append('WARNING ' + warnings[gene]) if skipping: out_str_list.append('skipping!') print ''.join(out_str_list) # 
---------------------------------------------------------------------------------------- def get_overlap_and_available_space(self, rpair, best, qrbounds): l_reg = rpair['left'] r_reg = rpair['right'] l_gene = best[l_reg] r_gene = best[r_reg] overlap = qrbounds[l_gene][1] - qrbounds[r_gene][0] available_space = qrbounds[r_gene][1] - qrbounds[l_gene][0] return overlap, available_space # ---------------------------------------------------------------------------------------- def check_boundaries(self, rpair, qrbounds, glbounds, query_name, query_seq, best, recursed=False, debug=False): # NOTE this duplicates code in shift_overlapping_boundaries(), which makes me cranky, but this setup avoids other things I dislike more l_reg = rpair['left'] r_reg = rpair['right'] l_gene = best[l_reg] r_gene = best[r_reg] overlap, available_space = self.get_overlap_and_available_space(rpair, best, qrbounds) if debug: print ' %s %s overlap %d available space %d' % (l_reg, r_reg, overlap, available_space) status = 'ok' if overlap > 0: # positive overlap means they actually overlap status = 'overlap' if overlap > available_space or overlap == 1 and available_space == 1: # call it nonsense if the boundaries are really whack (i.e. there isn't enough space to resolve the overlap) -- we'll presumably either toss the query or rerun with different match/mismatch status = 'nonsense' if debug: print ' overlap status: %s' % status if not recursed and status == 'nonsense' and l_reg == 'd' and self.nth_try > 2: # on rare occasions with very high mutation, vdjalign refuses to give us a j match that's at all to the right of the d match assert l_reg == 'd' and r_reg == 'j' if debug: print ' %s: synthesizing d match' % query_name leftmost_position = min(qrbounds[l_gene][0], qrbounds[r_gene][0]) qrbounds[l_gene] = (leftmost_position, leftmost_position + 1) # swap whatever crummy nonsense d match we have now for a one-base match at the left end of things (things in practice should be left end of j match) glbounds[l_gene] = (0, 1) status = self.check_boundaries(rpair, qrbounds, glbounds, query_name, query_seq, best, recursed=True, debug=debug) if status == 'overlap': if debug: print ' \'overlap\' status after synthesizing d match. Setting to \'nonsense\', I can\'t deal with this bullshit' status = 'nonsense' return status # ---------------------------------------------------------------------------------------- def shift_overlapping_boundaries(self, rpair, qrbounds, glbounds, query_name, query_seq, best, debug=False): # NOTE this does pretty much the same thing as resolve_overlapping_matches in joinparser.py """ s-w allows d and j matches (and v and d matches) to overlap... which makes no sense, so apportion the disputed territory between the two regions. Note that this still works if, say, v is the entire sequence, i.e. one match is entirely subsumed by another. 
""" l_reg = rpair['left'] r_reg = rpair['right'] l_gene = best[l_reg] r_gene = best[r_reg] overlap, available_space = self.get_overlap_and_available_space(rpair, best, qrbounds) if overlap <= 0: # nothing to do, they're already consistent print 'shouldn\'t get here any more if there\'s no overlap' return if overlap > available_space: raise Exception('overlap %d bigger than available space %d between %s and %s for %s' % (overlap, available_space, l_reg, r_reg, query_name)) if debug: print '%s%s: %d-%d overlaps with %d-%d by %d' % (l_reg, r_reg, qrbounds[l_gene][0], qrbounds[l_gene][1], qrbounds[r_gene][0], qrbounds[r_gene][1], overlap) l_length = qrbounds[l_gene][1] - qrbounds[l_gene][0] # initial length of lefthand gene match r_length = qrbounds[r_gene][1] - qrbounds[r_gene][0] # and same for the righthand one l_portion, r_portion = 0, 0 # portion of the initial overlap that we give to each side if debug: print ' lengths portions ' while l_portion + r_portion < overlap: if debug: print ' %4d %4d %4d %4d' % (l_length, r_length, l_portion, r_portion) if l_length <= 1 and r_length <= 1: # don't want to erode match (in practice it'll be the d match) all the way to zero raise Exception('both lengths went to one without resolving overlap for %s: %s %s' % (query_name, qrbounds[l_gene], qrbounds[r_gene])) elif l_length > 1 and r_length > 1: # if both have length left, alternate back and forth if (l_portion + r_portion) % 2 == 0: l_portion += 1 # give one base to the left l_length -= 1 else: r_portion += 1 # and one to the right r_length -= 1 elif l_length > 1: l_portion += 1 l_length -= 1 elif r_length > 1: r_portion += 1 r_length -= 1 if debug: print ' %4d %4d %4d %4d %s %s' % (l_length, r_length, l_portion, r_portion, '', '') print ' %s apportioning %d bases between %s (%d) match and %s (%d) match' % (query_name, overlap, l_reg, l_portion, r_reg, r_portion) assert l_portion + r_portion == overlap qrbounds[l_gene] = (qrbounds[l_gene][0], qrbounds[l_gene][1] - l_portion) glbounds[l_gene] = (glbounds[l_gene][0], glbounds[l_gene][1] - l_portion) qrbounds[r_gene] = (qrbounds[r_gene][0] + r_portion, qrbounds[r_gene][1]) glbounds[r_gene] = (glbounds[r_gene][0] + r_portion, glbounds[r_gene][1]) best[l_reg + '_gl_seq'] = self.glfo['seqs'][l_reg][l_gene][glbounds[l_gene][0] : glbounds[l_gene][1]] best[l_reg + '_qr_seq'] = query_seq[qrbounds[l_gene][0]:qrbounds[l_gene][1]] best[r_reg + '_gl_seq'] = self.glfo['seqs'][r_reg][r_gene][glbounds[r_gene][0] : glbounds[r_gene][1]] best[r_reg + '_qr_seq'] = query_seq[qrbounds[r_gene][0]:qrbounds[r_gene][1]] # ---------------------------------------------------------------------------------------- def add_to_info(self, query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions): assert query_name not in self.info self.info['queries'].append(query_name) self.info[query_name] = {} self.info[query_name]['unique_id'] = query_name # redundant, but used somewhere down the line self.info[query_name]['k_v'] = kvals['v'] self.info[query_name]['k_d'] = kvals['d'] self.info[query_name]['all'] = ':'.join(match_names['v'] + match_names['d'] + match_names['j']) # all gene matches for this query self.info[query_name]['cdr3_length'] = codon_positions['j'] - codon_positions['v'] + 3 #tryp_position_in_joined_seq - self.cyst_position + 3 self.info[query_name]['cyst_position'] = codon_positions['v'] self.info[query_name]['tryp_position'] = codon_positions['j'] # erosion, insertion, mutation info for best match 
self.info[query_name]['v_5p_del'] = all_germline_bounds[best['v']][0] self.info[query_name]['v_3p_del'] = len(self.glfo['seqs']['v'][best['v']]) - all_germline_bounds[best['v']][1] # len(germline v) - gl_match_end self.info[query_name]['d_5p_del'] = all_germline_bounds[best['d']][0] self.info[query_name]['d_3p_del'] = len(self.glfo['seqs']['d'][best['d']]) - all_germline_bounds[best['d']][1] self.info[query_name]['j_5p_del'] = all_germline_bounds[best['j']][0] self.info[query_name]['j_3p_del'] = len(self.glfo['seqs']['j'][best['j']]) - all_germline_bounds[best['j']][1] self.info[query_name]['fv_insertion'] = query_seq[ : all_query_bounds[best['v']][0]] self.info[query_name]['vd_insertion'] = query_seq[all_query_bounds[best['v']][1] : all_query_bounds[best['d']][0]] self.info[query_name]['dj_insertion'] = query_seq[all_query_bounds[best['d']][1] : all_query_bounds[best['j']][0]] self.info[query_name]['jf_insertion'] = query_seq[all_query_bounds[best['j']][1] : ] self.info[query_name]['indelfo'] = self.info['indels'].get(query_name, utils.get_empty_indel()) for region in utils.regions: self.info[query_name][region + '_gene'] = best[region] self.info['all_best_matches'].add(best[region]) self.info['all_matches'][region] |= set(match_names[region]) self.info[query_name]['seq'] = query_seq # NOTE this is the seq output by vdjalign, i.e. if we reversed any indels it is the reversed sequence existing_implicit_keys = tuple(['cdr3_length', 'cyst_position', 'tryp_position']) utils.add_implicit_info(self.glfo, self.info[query_name], multi_seq=False, existing_implicit_keys=existing_implicit_keys) if self.debug: if not self.args.is_data: utils.print_reco_event(self.glfo['seqs'], self.reco_info[query_name], extra_str=' ', label='true:') utils.print_reco_event(self.glfo['seqs'], self.info[query_name], extra_str=' ', label='inferred:') if self.alfinder is not None: self.alfinder.increment(self.info[query_name]) if self.pcounter is not None: self.pcounter.increment_all_params(self.info[query_name]) if self.true_pcounter is not None: self.true_pcounter.increment_all_params(self.reco_info[query_name]) if self.perfplotter is not None: if query_name in self.info['indels']: print ' skipping performance evaluation of %s because of indels' % query_name # I just have no idea how to handle naive hamming fraction when there's indels else: self.perfplotter.evaluate(self.reco_info[query_name], self.info[query_name]) self.remaining_queries.remove(query_name) # ---------------------------------------------------------------------------------------- def summarize_query(self, query_name, query_seq, all_match_names, all_query_bounds, all_germline_bounds, warnings, first_match_query_bounds, queries_to_rerun): best, match_names = {}, {} k_v_min, k_d_min = 999, 999 k_v_max, k_d_max = 0, 0 for region in utils.regions: all_match_names[region] = sorted(all_match_names[region], reverse=True) match_names[region] = [] if self.debug >= 2: print query_name for region in utils.regions: for score, gene in all_match_names[region]: glbounds = all_germline_bounds[gene] qrbounds = all_query_bounds[gene] assert qrbounds[1] <= len(query_seq) # NOTE I'm putting these up above as well (in process_query), so in time I should remove them from here assert glbounds[1] <= len(self.glfo['seqs'][region][gene]) assert qrbounds[0] >= 0 assert glbounds[0] >= 0 glmatchseq = self.glfo['seqs'][region][gene][glbounds[0]:glbounds[1]] match_names[region].append(gene) if self.debug >= 2: self.print_match(region, gene, query_seq, score, glbounds, qrbounds, -1, 
warnings, skipping=False) # if the germline match and the query match aren't the same length, s-w likely added an insert, which we shouldn't get since the gap-open penalty is jacked up so high if len(glmatchseq) != len(query_seq[qrbounds[0]:qrbounds[1]]): # neurotic double check (um, I think) EDIT hey this totally saved my ass print 'ERROR %s not same length' % query_name print glmatchseq, glbounds[0], glbounds[1] print query_seq[qrbounds[0]:qrbounds[1]] assert False # NOTE since I'm no longer skipping the genes after the first <args.n_max_per_region>, the OR of k-space below is overly conservative. UPDATE not sure if this is still relevant, but I'll move it down here in case I feel like thinking about it later if region == 'v': this_k_v = all_query_bounds[gene][1] # NOTE even if the v match doesn't start at the left hand edge of the query sequence, we still measure k_v from there. # In other words, sw doesn't tell the hmm about it k_v_min = min(this_k_v, k_v_min) k_v_max = max(this_k_v, k_v_max) if region == 'd': this_k_d = all_query_bounds[gene][1] - first_match_query_bounds[1] # end of d minus end of v k_d_min = min(this_k_d, k_d_min) k_d_max = max(this_k_d, k_d_max) # check consistency with best match (since the best match is excised in s-w code, and because ham is run with *one* k_v k_d set) if region not in best: best[region] = gene best[region + '_gl_seq'] = self.glfo['seqs'][region][gene][glbounds[0]:glbounds[1]] best[region + '_qr_seq'] = query_seq[qrbounds[0]:qrbounds[1]] best[region + '_score'] = score for region in utils.regions: if region not in best: if self.debug: print ' no', region, 'match found for', query_name # NOTE if no d match found, we should really just assume entire d was eroded queries_to_rerun['no-match'].add(query_name) return # s-w allows d and j matches to overlap, so we need to apportion the disputed bases region_pairs = ({'left':'v', 'right':'d'}, {'left':'d', 'right':'j'}) for rpair in region_pairs: overlap_status = self.check_boundaries(rpair, all_query_bounds, all_germline_bounds, query_name, query_seq, best) if overlap_status == 'overlap': self.shift_overlapping_boundaries(rpair, all_query_bounds, all_germline_bounds, query_name, query_seq, best) elif overlap_status == 'nonsense': queries_to_rerun['nonsense-bounds'].add(query_name) return else: assert overlap_status == 'ok' # check for suspiciously bad annotations vd_insertion = query_seq[all_query_bounds[best['v']][1] : all_query_bounds[best['d']][0]] dj_insertion = query_seq[all_query_bounds[best['d']][1] : all_query_bounds[best['j']][0]] if self.nth_try < 2: if len(vd_insertion) > self.max_insertion_length or len(dj_insertion) > self.max_insertion_length: if self.debug: print ' suspiciously long insertion in %s, rerunning' % query_name queries_to_rerun['weird-annot.'].add(query_name) return if len(vd_insertion) > self.absolute_max_insertion_length or len(dj_insertion) > self.absolute_max_insertion_length: if self.debug: print ' suspiciously long insertion in %s, rerunning' % query_name queries_to_rerun['weird-annot.'].add(query_name) return if self.debug: print query_name # set and check conserved codon positions tmp_gl_positions = {'v' : self.glfo['cyst-positions'], 'j' : self.glfo['tryp-positions']} # hack hack hack codon_positions = {} for region in ['v', 'j']: pos = tmp_gl_positions[region][best[region]] - all_germline_bounds[best[region]][0] + all_query_bounds[best[region]][0] # position within original germline gene, minus the position in that germline gene at which the match starts, plus the
position in the query sequence at which the match starts if pos < 0 or pos >= len(query_seq): if self.debug: print ' invalid %s codon position (%d in seq of length %d), rerunning' % (region, pos, len(query_seq)) queries_to_rerun['invalid-codon'].add(query_name) return codon_positions[region] = pos # check for unproductive rearrangements codons_ok = utils.check_both_conserved_codons(query_seq, codon_positions['v'], codon_positions['j'], assert_on_fail=False) cdr3_length = codon_positions['j'] - codon_positions['v'] + 3 if cdr3_length < 6: # NOTE six is also hardcoded in utils if self.debug: print ' negative cdr3 length %d' % (cdr3_length) queries_to_rerun['invalid-codon'].add(query_name) return in_frame_cdr3 = (cdr3_length % 3 == 0) no_stop_codon = utils.stop_codon_check(query_seq, codon_positions['v']) if not codons_ok or not in_frame_cdr3 or not no_stop_codon: if self.debug: print ' unproductive rearrangement:', if not codons_ok: print ' bad codons', if not in_frame_cdr3: print ' out of frame cdr3', if not no_stop_codon: print ' stop codon' print '' if self.nth_try < 2 and (not codons_ok or not in_frame_cdr3): # rerun with higher mismatch score (sometimes unproductiveness is the result of a really screwed up annotation rather than an actual unproductive sequence). Note that stop codons aren't really indicative of screwed up annotations, so they don't count. if self.debug: print ' ...rerunning' queries_to_rerun['unproductive'].add(query_name) return elif self.args.skip_unproductive: if self.debug: print ' ...skipping' self.unproductive_queries.add(query_name) self.remaining_queries.remove(query_name) return else: pass # this is here so you don't forget that if neither of the above is true, we fall through and add the query to self.info # best k_v, k_d: k_v = all_query_bounds[best['v']][1] # end of v match k_d = all_query_bounds[best['d']][1] - all_query_bounds[best['v']][1] # end of d minus end of v if k_d_max < 5: # since the s-w step matches to the longest possible j and then excises it, this sometimes gobbles up the d, resulting in a very short d alignment. if self.debug: print ' expanding k_d' k_d_max = max(8, k_d_max) if 'IGHJ4*' in best['j'] and self.glfo['seqs']['d'][best['d']][-5:] == 'ACTAC': # the end of some d versions is the same as the start of some j versions, so the s-w frequently kicks out the 'wrong' alignment if self.debug: print ' doubly expanding k_d' if k_d_max-k_d_min < 8: k_d_min -= 5 k_d_max += 2 k_v_min = max(1, k_v_min - self.args.default_v_fuzz) # ok, so I don't *actually* want it to be zero... 
oh, well k_v_max += self.args.default_v_fuzz k_d_min = max(1, k_d_min - self.args.default_d_fuzz) k_d_max += self.args.default_d_fuzz assert k_v_min > 0 and k_d_min > 0 and k_v_max > 0 and k_d_max > 0 if self.debug: print ' k_v: %d [%d-%d)' % (k_v, k_v_min, k_v_max) print ' k_d: %d [%d-%d)' % (k_d, k_d_min, k_d_max) kvals = {} kvals['v'] = {'best':k_v, 'min':k_v_min, 'max':k_v_max} kvals['d'] = {'best':k_d, 'min':k_d_min, 'max':k_d_max} self.add_to_info(query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions=codon_positions) # ---------------------------------------------------------------------------------------- def get_padding_parameters(self, debug=False): maxima = {'gl_cpos' : None, 'gl_cpos_to_j_end' : None} for query in self.info['queries']: swfo = self.info[query] fvstuff = max(0, len(swfo['fv_insertion']) - swfo['v_5p_del']) # we always want to pad out to the entire germline sequence, so don't let this go negative jfstuff = max(0, len(swfo['jf_insertion']) - swfo['j_3p_del']) for v_match in self.info['all_matches']['v']: # NOTE have to loop over all gl matches, even ones for other sequences, because we want bcrham to be able to compare any sequence to any other UPDATE but do I really need to use *all* all matches, or would it be ok to just use all *best* matches? not sure... gl_cpos = self.glfo['cyst-positions'][v_match] + fvstuff if maxima['gl_cpos'] is None or gl_cpos > maxima['gl_cpos']: maxima['gl_cpos'] = gl_cpos seq = swfo['seq'] cpos = swfo['cyst_position'] # cyst position in query sequence (as opposed to gl_cpos, which is in germline allele) for j_match in self.info['all_matches']['j']: # NOTE have to loop over all gl matches, even ones for other sequences, because we want bcrham to be able to compare any sequence to any other UPDATE but do I really need to use *all* all matches, or would it be ok to just use all *best* matches? not sure... # TODO this is totally wrong -- I'm only storing j_3p_del for the best match... but hopefully it'll give enough padding for the moment gl_cpos_to_j_end = len(seq) - cpos + swfo['j_3p_del'] + jfstuff if maxima['gl_cpos_to_j_end'] is None or gl_cpos_to_j_end > maxima['gl_cpos_to_j_end']: maxima['gl_cpos_to_j_end'] = gl_cpos_to_j_end if debug: print ' maxima:', for k, v in maxima.items(): print '%s %d ' % (k, v), print '' return maxima # ---------------------------------------------------------------------------------------- def pad_seqs_to_same_length(self, debug=False): """ Pad all sequences in <seqinfo> to the same length to the left and right of their conserved cysteine positions. Next, pads all sequences further out (if necessary) such as to eliminate all v_5p and j_3p deletions. """ maxima = self.get_padding_parameters(debug=debug) for query in self.info['queries']: swfo = self.info[query] if 'padded' in swfo: # already added padded information (we're probably partitioning, and this is not the first step) return seq = swfo['seq'] cpos = swfo['cyst_position'] if cpos < 0 or cpos >= len(seq): print 'hm now what do I want to do here?' k_v = swfo['k_v'] padleft = maxima['gl_cpos'] - cpos # left padding: biggest germline cpos minus cpos in this sequence padright = maxima['gl_cpos_to_j_end'] - (len(seq) - cpos) if padleft < 0 or padright < 0: raise Exception('bad padding %d %d for %s' % (padleft, padright, query)) padfo = {} assert len(utils.ambiguous_bases) == 1 # could allow more than one, but it's not implemented a.t.m. 
padfo['seq'] = padleft * utils.ambiguous_bases[0] + seq + padright * utils.ambiguous_bases[0] if query in self.info['indels']: if debug: print ' also padding reversed sequence' self.info['indels'][query]['reversed_seq'] = padleft * utils.ambiguous_bases[0] + self.info['indels'][query]['reversed_seq'] + padright * utils.ambiguous_bases[0] padfo['k_v'] = {'min' : k_v['min'] + padleft, 'max' : k_v['max'] + padleft} padfo['cyst_position'] = swfo['cyst_position'] + padleft padfo['padleft'] = padleft padfo['padright'] = padright if debug: print ' pad %d %d %s' % (padleft, padright, query) print ' %d --> %d (%d-%d --> %d-%d)' % (len(seq), len(padfo['seq']), k_v['min'], k_v['max'], padfo['k_v']['min'], padfo['k_v']['max']) swfo['padded'] = padfo if debug: for query in self.info['queries']: print '%20s %s' % (query, self.info[query]['padded']['seq'])
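# ----------------------------------------------------------------------------------------
# The padding above lines every query up on its conserved cysteine by prepending/appending the
# ambiguous base, so that bcrham can compare any sequence to any other. A minimal standalone sketch
# of just that idea (hypothetical helper, not part of this class -- it ignores the fv/jf insertion,
# deletion, and k_v bookkeeping that get_padding_parameters() and pad_seqs_to_same_length() do):
def illustrate_cyst_padding(seqs, cpositions, ambig='N'):
    """ <seqs>, <cpositions>: dicts keyed by query name. Pad every sequence so all cysteines sit at the same index and all padded seqs come out the same length. """
    max_cpos = max(cpositions.values())  # deepest left padding needed
    max_right_of_cpos = max(len(seq) - cpositions[query] for query, seq in seqs.items())  # longest stretch to the right of the cysteine
    padded = {}
    for query, seq in seqs.items():
        padleft = max_cpos - cpositions[query]
        padright = max_right_of_cpos - (len(seq) - cpositions[query])
        padded[query] = padleft * ambig + seq + padright * ambig
    return padded
# e.g. illustrate_cyst_padding({'a' : 'ACGTG', 'b' : 'TTACGTGGA'}, {'a' : 1, 'b' : 3}) returns
# {'a' : 'NNACGTGNN', 'b' : 'TTACGTGGA'}: both length 9, with the cysteine at index 3 in each.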
class Waterer(object): """ Run smith-waterman on the query sequences in <infname> """ def __init__(self, args, input_info, reco_info, germline_seqs, parameter_dir, write_parameters=False, plotdir=None): self.parameter_dir = parameter_dir self.plotdir = plotdir self.args = args self.input_info = input_info self.reco_info = reco_info self.germline_seqs = germline_seqs self.pcounter, self.true_pcounter = None, None if write_parameters: self.pcounter = ParameterCounter(self.germline_seqs) if not self.args.is_data: self.true_pcounter = ParameterCounter(self.germline_seqs) self.info = {} self.info['all_best_matches'] = set() # set of all the matches we found (for *all* queries) self.info['skipped_unproductive_queries'] = [] # list of unproductive queries if self.args.apply_choice_probs_in_sw: if self.args.debug: print ' reading gene choice probs from',parameter_dir self.gene_choice_probs = utils.read_overall_gene_probs(parameter_dir) with opener('r')(self.args.datadir + '/v-meta.json') as json_file: # get location of <begin> cysteine in each v region self.cyst_positions = json.load(json_file) with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file: # get location of <end> tryptophan in each j region (TGG) tryp_reader = csv.reader(csv_file) self.tryp_positions = {row[0]:row[1] for row in tryp_reader} # WARNING: this doesn't filter out the header line self.outfile = None if self.args.outfname != None: self.outfile = open(self.args.outfname, 'a') self.n_unproductive = 0 self.n_total = 0 # ---------------------------------------------------------------------------------------- def __del__(self): if self.args.outfname != None: self.outfile.close() # ---------------------------------------------------------------------------------------- def clean(self): if self.pcounter != None: self.pcounter.clean() if self.true_pcounter != None: self.true_pcounter.clean() # ---------------------------------------------------------------------------------------- def run(self): start = time.time() base_infname = 'query-seqs.fa' base_outfname = 'query-seqs.bam' sys.stdout.flush() self.write_vdjalign_input(base_infname) if self.args.n_procs == 1: cmd_str = self.get_vdjalign_cmd_str(self.args.workdir, base_infname, base_outfname) check_call(cmd_str.split()) if not self.args.no_clean: os.remove(self.args.workdir + '/' + base_infname) else: procs = [] for iproc in range(self.args.n_procs): cmd_str = self.get_vdjalign_cmd_str(self.args.workdir + '/sw-' + str(iproc), base_infname, base_outfname, iproc) procs.append(Popen(cmd_str.split())) time.sleep(0.1) for proc in procs: proc.wait() if not self.args.no_clean: for iproc in range(self.args.n_procs): os.remove(self.args.workdir + '/sw-' + str(iproc) + '/' + base_infname) sys.stdout.flush() self.read_output(base_outfname, plot_performance=self.args.plot_performance) print ' sw time: %.3f' % (time.time()-start) if self.n_unproductive > 0: print ' unproductive skipped %d / %d = %.2f' % (self.n_unproductive, self.n_total, float(self.n_unproductive) / self.n_total) if self.pcounter != None: self.pcounter.write(self.parameter_dir) # if self.true_pcounter != None: # self.true_pcounter.write(parameter_xxx_dir, plotdir=plotdir + '/true') if not self.args.no_plot and self.plotdir != '': self.pcounter.plot(self.plotdir, subset_by_gene=True, cyst_positions=self.cyst_positions, tryp_positions=self.tryp_positions) if self.true_pcounter != None: self.true_pcounter.plot(self.plotdir + '/true', subset_by_gene=True, cyst_positions=self.cyst_positions, 
tryp_positions=self.tryp_positions) # ---------------------------------------------------------------------------------------- def write_vdjalign_input(self, base_infname): # first make a list of query names so we can iterate over an ordered collection ordered_info = [] for query_name in self.input_info: ordered_info.append(query_name) queries_per_proc = float(len(self.input_info)) / self.args.n_procs n_queries_per_proc = int(math.ceil(queries_per_proc)) if self.args.n_procs == 1: # double check for rounding problems or whatnot assert n_queries_per_proc == len(self.input_info) for iproc in range(self.args.n_procs): workdir = self.args.workdir if self.args.n_procs > 1: workdir += '/sw-' + str(iproc) utils.prep_dir(workdir) infname = workdir + '/' + base_infname with opener('w')(workdir + '/' + base_infname) as sub_infile: for iquery in range(iproc*n_queries_per_proc, (iproc + 1)*n_queries_per_proc): if iquery >= len(ordered_info): break query_name = ordered_info[iquery] sub_infile.write('>' + str(query_name) + ' NUKES\n') sub_infile.write(self.input_info[query_name]['seq'] + '\n') # ---------------------------------------------------------------------------------------- def get_vdjalign_cmd_str(self, workdir, base_infname, base_outfname, iproc=-1): """ Run smith-waterman alignment (from Connor's ighutils package) on the seqs in <base_infname>, and toss all the top matches into <base_outfname>. """ # large gap-opening penalty: we want *no* gaps in the middle of the alignments # match score larger than (negative) mismatch score: we want to *encourage* some level of shm. If they're equal, we tend to end up with short unmutated alignments, which screws everything up os.environ['PATH'] = os.getenv('PWD') + '/packages/samtools:' + os.getenv('PATH') check_output(['which', 'samtools']) cmd_str = self.args.ighutil_dir + '/bin/vdjalign align-fastq -q' if self.args.slurm: cmd_str = 'srun ' + cmd_str cmd_str += ' --max-drop 50' cmd_str += ' --match 5 --mismatch 3' cmd_str += ' --gap-open 1000' cmd_str += ' --vdj-dir ' + self.args.datadir cmd_str += ' ' + workdir + '/' + base_infname + ' ' + workdir + '/' + base_outfname return cmd_str # ---------------------------------------------------------------------------------------- def read_output(self, base_outfname, plot_performance=False): perfplotter = None if plot_performance: assert self.args.plotdir != None assert not self.args.is_data from performanceplotter import PerformancePlotter perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir + '/sw/performance', 'sw') n_processed = 0 for iproc in range(self.args.n_procs): workdir = self.args.workdir if self.args.n_procs > 1: workdir += '/sw-' + str(iproc) outfname = workdir + '/' + base_outfname with contextlib.closing(pysam.Samfile(outfname)) as bam: grouped = itertools.groupby(iter(bam), operator.attrgetter('qname')) for _, reads in grouped: # loop over query sequences self.n_total += 1 self.process_query(bam, list(reads), perfplotter) n_processed += 1 if not self.args.no_clean: os.remove(outfname) if self.args.n_procs > 1: # still need the top-level workdir os.rmdir(workdir) print ' processed %d queries' % n_processed if perfplotter != None: perfplotter.plot() # ---------------------------------------------------------------------------------------- def get_choice_prob(self, region, gene): choice_prob = 1.0 if gene in self.gene_choice_probs[region]: choice_prob = self.gene_choice_probs[region][gene] else: choice_prob = 0.0 # NOTE would it make sense to use something else here? 
return choice_prob # ---------------------------------------------------------------------------------------- def process_query(self, bam, reads, perfplotter=None): primary = next((r for r in reads if not r.is_secondary), None) query_seq = primary.seq try: query_name = int(primary.qname) # if it's just one of my hashes, we want it as an int except ValueError: query_name = primary.qname # but if it's someone else's random-ass alphasymbolonumeric string we'll just leave it as-is raw_best = {} all_match_names = {} warnings = {} # ick, this is a messy way to pass stuff around for region in utils.regions: all_match_names[region] = [] all_query_bounds, all_germline_bounds = {}, {} for read in reads: # loop over the matches found for each query sequence read.seq = query_seq # only the first one has read.seq set by default, so we need to set the rest by hand gene = bam.references[read.tid] region = utils.get_region(gene) warnings[gene] = '' if region not in raw_best: # best v, d, and j before multiplying by gene choice probs. needed 'cause *these* are the v and j that get excised raw_best[region] = gene raw_score = read.tags[0][1] # raw because they don't include the gene choice probs score = raw_score if self.args.apply_choice_probs_in_sw: # NOTE I stopped applying the gene choice probs here because the smith-waterman scores don't correspond to log-probs, so throwing on the gene choice probs was dubious (and didn't seem to work that well) score = self.get_choice_prob(region, gene) * raw_score # multiply by the probability to choose this gene # set bounds qrbounds = (read.qstart, read.qend) glbounds = (read.pos, read.aend) assert qrbounds[1]-qrbounds[0] == glbounds[1]-glbounds[0] assert qrbounds[1] <= len(query_seq) if glbounds[1] > len(self.germline_seqs[region][gene]): print ' ', gene print ' ', glbounds[1], len(self.germline_seqs[region][gene]) print ' ', self.germline_seqs[region][gene] assert glbounds[1] <= len(self.germline_seqs[region][gene]) assert qrbounds[1]-qrbounds[0] == glbounds[1]-glbounds[0] all_match_names[region].append((score,gene)) # NOTE it is important that this is ordered such that the best match is first all_query_bounds[gene] = qrbounds all_germline_bounds[gene] = glbounds self.summarize_query(query_name, query_seq, raw_best, all_match_names, all_query_bounds, all_germline_bounds, perfplotter, warnings) # ---------------------------------------------------------------------------------------- def print_match(self, region, gene, query_seq, score, glbounds, qrbounds, codon_pos, warnings, skipping=False): if self.args.debug < 2: return out_str_list = [] buff_str = (20 - len(gene)) * ' ' tmp_val = score if self.args.apply_choice_probs_in_sw and self.get_choice_prob(region, gene) != 0.0: tmp_val = score / self.get_choice_prob(region, gene) if self.args.apply_choice_probs_in_sw: out_str_list.append('%8s%s%s%9.1e * %3.0f = %-6.1f' % (' ', utils.color_gene(gene), buff_str, self.get_choice_prob(region, gene), tmp_val, score)) else: out_str_list.append('%8s%s%s%9s%3s %6.0f ' % (' ', utils.color_gene(gene), '', '', buff_str, score)) out_str_list.append('%4d%4d %s\n' % (glbounds[0], glbounds[1], self.germline_seqs[region][gene][glbounds[0]:glbounds[1]])) out_str_list.append('%50s %4d%4d' % ('', qrbounds[0], qrbounds[1])) out_str_list.append(' %s ' % (utils.color_mutants(self.germline_seqs[region][gene][glbounds[0]:glbounds[1]], query_seq[qrbounds[0]:qrbounds[1]]))) if region != 'd': out_str_list.append('(%s %d)' % (utils.conserved_codon_names[region], codon_pos)) if warnings[gene] != 
'': out_str_list.append('WARNING ' + warnings[gene]) if skipping: out_str_list.append('skipping!') if self.args.outfname == None: print ''.join(out_str_list) else: out_str_list.append('\n') self.outfile.write(''.join(out_str_list)) # ---------------------------------------------------------------------------------------- def shift_overlapping_boundaries(self, qrbounds, glbounds, query_name, query_seq, best): # NOTE this does pretty much the same thing as resolve_overlapping_matches in joinparser.py """ s-w allows d and j matches (and v and d matches) to overlap... which makes no sense, so apportion the disputed territory between the two regions """ for region_pairs in ({'left':'v', 'right':'d'}, {'left':'d', 'right':'j'}): l_reg = region_pairs['left'] r_reg = region_pairs['right'] l_gene = best[l_reg] r_gene = best[r_reg] overlap = qrbounds[l_gene][1] - qrbounds[r_gene][0] if overlap > 0: l_length = qrbounds[l_gene][1] - qrbounds[l_gene][0] r_length = qrbounds[r_gene][1] - qrbounds[r_gene][0] l_portion, r_portion = 0, 0 while l_portion + r_portion < overlap: if l_length <= 1 and r_length <= 1: # don't want to erode match (in practice it'll be the d match) all the way to zero print ' ERROR both lengths went to zero' assert False elif l_length > 1 and r_length > 1: # if both have length left, alternate back and forth if (l_portion + r_portion) % 2 == 0: l_portion += 1 # give one base to the left l_length -= 1 else: r_portion += 1 # and one to the right r_length -= 1 elif l_length > 1: l_portion += 1 l_length -= 1 elif r_length > 1: r_portion += 1 r_length -= 1 if self.args.debug: print ' WARNING %s apportioning %d bases between %s (%d) match and %s (%d) match' % (str(query_name), overlap, l_reg, l_portion, r_reg, r_portion) assert l_portion + r_portion == overlap qrbounds[l_gene] = (qrbounds[l_gene][0], qrbounds[l_gene][1] - l_portion) glbounds[l_gene] = (glbounds[l_gene][0], glbounds[l_gene][1] - l_portion) qrbounds[r_gene] = (qrbounds[r_gene][0] + r_portion, qrbounds[r_gene][1]) glbounds[r_gene] = (glbounds[r_gene][0] + r_portion, glbounds[r_gene][1]) best[l_reg + '_gl_seq'] = self.germline_seqs[l_reg][l_gene][glbounds[l_gene][0] : glbounds[l_gene][1]] best[l_reg + '_qr_seq'] = query_seq[qrbounds[l_gene][0]:qrbounds[l_gene][1]] best[r_reg + '_gl_seq'] = self.germline_seqs[r_reg][r_gene][glbounds[r_gene][0] : glbounds[r_gene][1]] best[r_reg + '_qr_seq'] = query_seq[qrbounds[r_gene][0]:qrbounds[r_gene][1]] # ---------------------------------------------------------------------------------------- def add_to_info(self, query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions, perfplotter=None): assert query_name not in self.info self.info[query_name] = {} self.info[query_name]['unique_id'] = query_name # redundant, but used somewhere down the line self.info[query_name]['k_v'] = kvals['v'] self.info[query_name]['k_d'] = kvals['d'] self.info[query_name]['all'] = ':'.join(match_names['v'] + match_names['d'] + match_names['j']) assert codon_positions['v'] != -1 assert codon_positions['j'] != -1 self.info[query_name]['cdr3_length'] = codon_positions['j'] - codon_positions['v'] + 3 #tryp_position_in_joined_seq - self.cyst_position + 3 self.info[query_name]['cyst_position'] = codon_positions['v'] self.info[query_name]['tryp_position'] = codon_positions['j'] # erosion, insertion, mutation info for best match self.info[query_name]['v_5p_del'] = all_germline_bounds[best['v']][0] self.info[query_name]['v_3p_del'] = len(self.germline_seqs['v'][best['v']]) - 
all_germline_bounds[best['v']][1] # len(germline v) - gl_match_end self.info[query_name]['d_5p_del'] = all_germline_bounds[best['d']][0] self.info[query_name]['d_3p_del'] = len(self.germline_seqs['d'][best['d']]) - all_germline_bounds[best['d']][1] self.info[query_name]['j_5p_del'] = all_germline_bounds[best['j']][0] self.info[query_name]['j_3p_del'] = len(self.germline_seqs['j'][best['j']]) - all_germline_bounds[best['j']][1] self.info[query_name]['fv_insertion'] = query_seq[ : all_query_bounds[best['v']][0]] self.info[query_name]['vd_insertion'] = query_seq[all_query_bounds[best['v']][1] : all_query_bounds[best['d']][0]] self.info[query_name]['dj_insertion'] = query_seq[all_query_bounds[best['d']][1] : all_query_bounds[best['j']][0]] self.info[query_name]['jf_insertion'] = query_seq[all_query_bounds[best['j']][1] : ] for region in utils.regions: self.info[query_name][region + '_gene'] = best[region] self.info[query_name][region + '_gl_seq'] = best[region + '_gl_seq'] self.info[query_name][region + '_qr_seq'] = best[region + '_qr_seq'] self.info['all_best_matches'].add(best[region]) self.info[query_name]['seq'] = query_seq # only need to add this so I can pass it to print_reco_event if self.args.debug: if not self.args.is_data: utils.print_reco_event(self.germline_seqs, self.reco_info[query_name], extra_str=' ', label='true:') utils.print_reco_event(self.germline_seqs, self.info[query_name], extra_str=' ', label='inferred:') if self.pcounter != None: self.pcounter.increment(self.info[query_name]) if self.true_pcounter != None: self.true_pcounter.increment(self.reco_info[query_name]) if perfplotter != None: perfplotter.evaluate(self.reco_info[query_name], self.info[query_name]) #, subtract_unphysical_erosions=True) # ---------------------------------------------------------------------------------------- def summarize_query(self, query_name, query_seq, raw_best, all_match_names, all_query_bounds, all_germline_bounds, perfplotter, warnings): if self.args.debug: print '%s' % str(query_name) best, match_names, n_matches = {}, {}, {} n_used = {'v':0, 'd':0, 'j':0} k_v_min, k_d_min = 999, 999 k_v_max, k_d_max = 0, 0 for region in utils.regions: all_match_names[region] = sorted(all_match_names[region], reverse=True) match_names[region] = [] codon_positions = {'v':-1, 'd':-1, 'j':-1} # conserved codon positions (v:cysteine, d:dummy, j:tryptophan) for region in utils.regions: n_matches[region] = len(all_match_names[region]) n_skipped = 0 for score, gene in all_match_names[region]: glbounds = all_germline_bounds[gene] qrbounds = all_query_bounds[gene] assert qrbounds[1] <= len(query_seq) # NOTE I'm putting these up above as well (in process_query), so in time I should remove them from here assert glbounds[1] <= len(self.germline_seqs[region][gene]) assert qrbounds[0] >= 0 assert glbounds[0] >= 0 glmatchseq = self.germline_seqs[region][gene][glbounds[0]:glbounds[1]] # only use the best few matches if n_used[region] >= int(self.args.n_max_per_region[utils.regions.index(region)]): # only take the top few from each region break # only use a specified set of genes if self.args.only_genes != None and gene not in self.args.only_genes: n_skipped += 1 continue # add match to the list n_used[region] += 1 match_names[region].append(gene) self.print_match(region, gene, query_seq, score, glbounds, qrbounds, -1, warnings, skipping=False) # if the germline match and the query match aren't the same length, s-w likely added an insert, which we shouldn't get since the gap-open penalty is jacked up so high if 
len(glmatchseq) != len(query_seq[qrbounds[0]:qrbounds[1]]): # neurotic double check (um, I think) EDIT hey this totally saved my ass print 'ERROR %s not same length' % query_name print glmatchseq, glbounds[0], glbounds[1] print query_seq[qrbounds[0]:qrbounds[1]] assert False if region == 'v': this_k_v = all_query_bounds[gene][1] # NOTE even if the v match doesn't start at the left hand edge of the query sequence, we still measure k_v from there. # In other words, sw doesn't tell the hmm about it k_v_min = min(this_k_v, k_v_min) k_v_max = max(this_k_v, k_v_max) if region == 'd': this_k_d = all_query_bounds[gene][1] - all_query_bounds[raw_best['v']][1] # end of d minus end of v k_d_min = min(this_k_d, k_d_min) k_d_max = max(this_k_d, k_d_max) # check consistency with best match (since the best match is excised in s-w code, and because ham is run with *one* k_v k_d set) if region not in best: best[region] = gene best[region + '_gl_seq'] = self.germline_seqs[region][gene][glbounds[0]:glbounds[1]] best[region + '_qr_seq'] = query_seq[qrbounds[0]:qrbounds[1]] best[region + '_score'] = score if self.args.debug and n_skipped > 0: print '%8s skipped %d %s genes' % ('', n_skipped, region) for region in utils.regions: if region not in best: print ' no',region,'match found for',query_name # NOTE if no d match found, we should really just assume entire d was eroded if not self.args.is_data: print ' true:' utils.print_reco_event(self.germline_seqs, self.reco_info[query_name], extra_str=' ') return # s-w allows d and j matches to overlap... which makes no sense, so arbitrarily give the disputed territory to j try: self.shift_overlapping_boundaries(all_query_bounds, all_germline_bounds, query_name, query_seq, best) except AssertionError: print ' ERROR %s apportionment failed' % str(query_name) return for region in utils.regions: codon_positions[region] = utils.get_conserved_codon_position(self.cyst_positions, self.tryp_positions, region, best[region], all_germline_bounds, all_query_bounds) # position in the query sequence, that is # check for unproductive rearrangements try: # NOTE it's actually expected that this'll fail with a 'sequence too short' error, since the s-w doesn't know it's supposed to make sure the match contains the conserved codons utils.check_both_conserved_codons(query_seq, codon_positions['v'], codon_positions['j'], debug=self.args.debug, extra_str=' ') cdr3_length = codon_positions['j'] - codon_positions['v'] + 3 if cdr3_length % 3 != 0: # make sure we've stayed in frame if self.args.debug: print ' out of frame cdr3: %d %% 3 = %d' % (cdr3_length, cdr3_length % 3) assert False utils.check_for_stop_codon(query_seq, codon_positions['v'], debug=self.args.debug) except AssertionError: if self.args.debug: print ' unproductive rearrangement in waterer' if self.args.skip_unproductive: if self.args.debug: print ' ...skipping' self.n_unproductive += 1 self.info['skipped_unproductive_queries'].append(query_name) return # best k_v, k_d: k_v = all_query_bounds[best['v']][1] # end of v match k_d = all_query_bounds[best['d']][1] - all_query_bounds[best['v']][1] # end of d minus end of v if k_d_max < 5: # since the s-w step matches to the longest possible j and then excises it, this sometimes gobbles up the d, resulting in a very short d alignment. 
if self.args.debug: print ' expanding k_d' k_d_max = max(8, k_d_max) if 'IGHJ4*' in best['j'] and self.germline_seqs['d'][best['d']][-5:] == 'ACTAC': # the end of some d versions is the same as the start of some j versions, so the s-w frequently kicks out the 'wrong' alignment if self.args.debug: print ' doubly expanding k_d' if k_d_max-k_d_min < 8: k_d_min -= 5 k_d_max += 2 k_v_min = max(1, k_v_min - self.args.default_v_fuzz) # ok, so I don't *actually* want it to be zero... oh, well k_v_max += self.args.default_v_fuzz k_d_min = max(1, k_d_min - self.args.default_d_fuzz) k_d_max += self.args.default_d_fuzz assert k_v_min > 0 and k_d_min > 0 and k_v_max > 0 and k_d_max > 0 if self.args.debug: print ' k_v: %d [%d-%d)' % (k_v, k_v_min, k_v_max) print ' k_d: %d [%d-%d)' % (k_d, k_d_min, k_d_max) print ' used', for region in utils.regions: print ' %s: %d/%d' % (region, n_used[region], n_matches[region]), print '' kvals = {} kvals['v'] = {'best':k_v, 'min':k_v_min, 'max':k_v_max} kvals['d'] = {'best':k_d, 'min':k_d_min, 'max':k_d_max} self.add_to_info(query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions=codon_positions, perfplotter=perfplotter)
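# ----------------------------------------------------------------------------------------
# The k-space bookkeeping in summarize_query() boils down to: k_v is the end of the v match measured
# from the query's left edge (even if the v match starts further in), k_d is the end of the d match
# minus the end of the v match, and each gets a half-open [min, max) window that's widened by a fuzz
# factor before being handed to the hmm. A standalone sketch of that convention (hypothetical helper,
# not part of this class -- the real code takes min/max over *all* matches in each region before
# applying the fuzz, and the default fuzz values here are placeholders, not the self.args defaults):
def illustrate_k_space(v_qrbounds, d_qrbounds, v_fuzz=5, d_fuzz=2):
    """ <v_qrbounds>, <d_qrbounds>: (start, end) of the best v and d matches in query coordinates. """
    k_v = v_qrbounds[1]  # end of v match, counted from the query's left edge
    k_d = d_qrbounds[1] - v_qrbounds[1]  # end of d minus end of v
    return {'v' : {'best' : k_v, 'min' : max(1, k_v - v_fuzz), 'max' : k_v + v_fuzz},
            'd' : {'best' : k_d, 'min' : max(1, k_d - d_fuzz), 'max' : k_d + d_fuzz}}
# e.g. a v match ending at 296 and a d match ending at 317 give k_v 296 and k_d 21.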
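# ----------------------------------------------------------------------------------------
# The apportioning loop in shift_overlapping_boundaries() resolves an overlap between two adjacent
# matches by alternately handing one disputed base to each side, never letting either match shrink
# below a single base. A standalone sketch of just that loop (hypothetical helper, not part of this
# class -- the real method also updates the qr/gl bounds and the best-match sequences afterwards):
def illustrate_apportionment(l_length, r_length, overlap):
    """ Return (l_portion, r_portion): how many of the <overlap> disputed bases to trim from the left and right matches. """
    l_portion, r_portion = 0, 0
    while l_portion + r_portion < overlap:
        if l_length <= 1 and r_length <= 1:  # mirrors the 'both lengths went to zero' failure in shift_overlapping_boundaries()
            raise Exception('both matches eroded to a single base before the overlap was resolved')
        elif l_length > 1 and r_length > 1:  # both sides still have bases to give: alternate
            if (l_portion + r_portion) % 2 == 0:
                l_portion, l_length = l_portion + 1, l_length - 1
            else:
                r_portion, r_length = r_portion + 1, r_length - 1
        elif l_length > 1:
            l_portion, l_length = l_portion + 1, l_length - 1
        else:
            r_portion, r_length = r_portion + 1, r_length - 1
    return l_portion, r_portion
# e.g. illustrate_apportionment(10, 4, 3) returns (2, 1): two disputed bases come off the left match and one off the right.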