def print_hmm_output(self, line, print_true=False, perfplotter=None):
    """Print an ascii-art summary of the hmm annotation in <line>.

    If <print_true> is set (and we're running on simulation, i.e. not data), first
    print the true rearrangement event(s) for each true cluster overlapping this
    line's unique ids.  <perfplotter> is currently unused (the call that used it
    is commented out below).
    """
    out_str_list = []
    ilabel = ''
    if print_true and not self.args.is_data:  # first print true event (if this is simulation)
        for reco_id, uids in self.get_true_clusters(line['unique_ids']).items():
            for iid in range(len(uids)):
                # one_line=True suppresses the full header for all but the first sequence in each true cluster
                out_str_list.append(utils.print_reco_event(self.germline_seqs, self.reco_info[uids[iid]], extra_str=' ', return_string=True, label='true:', one_line=(iid != 0)))
        ilabel = 'inferred:'
    out_str_list.append(utils.print_reco_event(self.germline_seqs, line, extra_str=' ', return_string=True, label=ilabel))
    for iextra in range(1, len(line['unique_ids'])):
        # NOTE(review): this overwrites <line>['seq'] in place, i.e. mutates the caller's dict -- confirm callers don't depend on 'seq' afterward
        line['seq'] = line['seqs'][iextra]
        out_str_list.append(utils.print_reco_event(self.germline_seqs, line, extra_str=' ', return_string=True, one_line=True))
    # if not self.args.is_data:
    #     self.print_performance_info(line, perfplotter=perfplotter)
    print ''.join(out_str_list),
def harmonize_naive_seq_lengths(self, true_line, line):
    """Return (true_naive_seq, inferred_naive_seq) trimmed/padded to the same length.

    Strips the inferred fv/jf insertions, then pads the shorter sequence with Ns on
    the right if the j ends still disagree.  Raises if the lengths still differ
    after all that.
    """
    def tpos_to_j_end(tmpline):  # distance from the j codon position to the end of the naive sequence
        return len(tmpline['naive_seq']) - tmpline['codon_positions']['j']  # not quite sure it's best to use the naive seq, but I think it is
    true_naive_seq = true_line['naive_seq']
    inferred_naive_seq = line['naive_seq']
    if len(line['fv_insertion']) > 0:
        inferred_naive_seq = inferred_naive_seq[len(line['fv_insertion']):]
    if len(true_naive_seq) != len(inferred_naive_seq) and len(line['jf_insertion']) > 0:  # some j genes are very similar, except differ by one base in length, so shit is complicated
        inferred_naive_seq = inferred_naive_seq[:len(inferred_naive_seq) - len(line['jf_insertion'])]
    if len(true_naive_seq) != len(inferred_naive_seq) and tpos_to_j_end(true_line) != tpos_to_j_end(line):
        extra_true_bases = tpos_to_j_end(true_line) - tpos_to_j_end(line)
        if extra_true_bases > 0:  # add Ns to the inferred line if the true line is longer
            inferred_naive_seq += extra_true_bases * 'N'
        else:  # otherwise add 'em to the true line
            true_naive_seq += (-extra_true_bases) * 'N'
    if len(true_naive_seq) != len(inferred_naive_seq):
        # NOTE(review): these print_reco_event calls pass the line as first arg (no germline-seq dict), unlike other calls in this file -- presumably a different utils version; confirm
        utils.print_reco_event(true_line, label='true')
        utils.print_reco_event(line, label='inf')
        raise Exception('different length true and inferred naive seqs for %s\n %s\n %s (see above)' % (' '.join(line['unique_ids']), true_naive_seq, inferred_naive_seq))
    return true_naive_seq, inferred_naive_seq
def print_event(self): line = {} # collect some information into a form that the print fcn understands for region in utils.regions: line[region + '_gene'] = self.genes[region] for boundary in utils.boundaries: line[boundary + '_insertion'] = self.insertions[boundary] for erosion in utils.real_erosions: line[erosion + '_del'] = self.erosions[erosion] for erosion in utils.effective_erosions: line[erosion + '_del'] = self.effective_erosions[erosion] assert 'fv_insertion' not in line # well, in principle it's ok if they're there, but in that case I'll need to at least think about updating some things assert 'jf_insertion' not in line line['fv_insertion'] = '' line['jf_insertion'] = '' line['input_seqs'] = self.final_seqs line['indel_reversed_seqs'] = [] for iseq in range(len(self.indelfos)): if self.indelfos[iseq]['reversed_seq'] != '': line['indel_reversed_seqs'].append(self.indelfos[iseq]['reversed_seq']) else: line['indel_reversed_seqs'].append(line['input_seqs'][iseq]) line['seqs'] = line['indel_reversed_seqs'] line['indelfos'] = self.indelfos line['unique_ids'] = [str(i) for i in range(len(self.final_seqs))] line['cdr3_length'] = self.cdr3_length line['codon_positions'] = copy.deepcopy(self.final_codon_positions) utils.add_implicit_info(self.glfo, line) utils.print_reco_event(self.glfo['seqs'], line)
def add_to_info(self, query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions):
    """Record the best smith-waterman annotation for <query_name> in self.info.

    Fills in k_v/k_d bounds, deletion and insertion lengths derived from the best
    match's germline/query bounds, runs the debug printout, and updates the
    parameter counters and performance plotter.  Removes <query_name> from
    self.remaining_queries when done.
    """
    assert query_name not in self.info
    self.info['queries'].append(query_name)
    self.info[query_name] = {}
    self.info[query_name]['unique_id'] = query_name  # redundant, but used somewhere down the line
    self.info[query_name]['k_v'] = kvals['v']
    self.info[query_name]['k_d'] = kvals['d']
    self.info[query_name]['all'] = ':'.join(match_names['v'] + match_names['d'] + match_names['j'])  # all gene matches for this query
    self.info[query_name]['cdr3_length'] = codon_positions['j'] - codon_positions['v'] + 3  #tryp_position_in_joined_seq - self.cyst_position + 3
    self.info[query_name]['cyst_position'] = codon_positions['v']
    self.info[query_name]['tryp_position'] = codon_positions['j']

    # erosion, insertion, mutation info for best match
    self.info[query_name]['v_5p_del'] = all_germline_bounds[best['v']][0]
    self.info[query_name]['v_3p_del'] = len(self.glfo['seqs']['v'][best['v']]) - all_germline_bounds[best['v']][1]  # len(germline v) - gl_match_end
    self.info[query_name]['d_5p_del'] = all_germline_bounds[best['d']][0]
    self.info[query_name]['d_3p_del'] = len(self.glfo['seqs']['d'][best['d']]) - all_germline_bounds[best['d']][1]
    self.info[query_name]['j_5p_del'] = all_germline_bounds[best['j']][0]
    self.info[query_name]['j_3p_del'] = len(self.glfo['seqs']['j'][best['j']]) - all_germline_bounds[best['j']][1]

    # insertions are the unmatched query bases between (or outside) the region matches
    self.info[query_name]['fv_insertion'] = query_seq[ : all_query_bounds[best['v']][0]]
    self.info[query_name]['vd_insertion'] = query_seq[all_query_bounds[best['v']][1] : all_query_bounds[best['d']][0]]
    self.info[query_name]['dj_insertion'] = query_seq[all_query_bounds[best['d']][1] : all_query_bounds[best['j']][0]]
    self.info[query_name]['jf_insertion'] = query_seq[all_query_bounds[best['j']][1] : ]

    self.info[query_name]['indelfo'] = self.info['indels'].get(query_name, utils.get_empty_indel())

    for region in utils.regions:
        self.info[query_name][region + '_gene'] = best[region]
        self.info['all_best_matches'].add(best[region])
        self.info['all_matches'][region] |= set(match_names[region])

    self.info[query_name]['seq'] = query_seq  # NOTE this is the seq output by vdjalign, i.e. if we reversed any indels it is the reversed sequence

    existing_implicit_keys = tuple(['cdr3_length', 'cyst_position', 'tryp_position'])
    utils.add_implicit_info(self.glfo, self.info[query_name], multi_seq=False, existing_implicit_keys=existing_implicit_keys)

    if self.debug:
        if not self.args.is_data:
            utils.print_reco_event(self.glfo['seqs'], self.reco_info[query_name], extra_str=' ', label='true:')
        utils.print_reco_event(self.glfo['seqs'], self.info[query_name], extra_str=' ', label='inferred:')

    if self.alfinder is not None:
        self.alfinder.increment(self.info[query_name])
    if self.pcounter is not None:
        self.pcounter.increment_all_params(self.info[query_name])
    if self.true_pcounter is not None:
        self.true_pcounter.increment_all_params(self.reco_info[query_name])
    if self.perfplotter is not None:
        if query_name in self.info['indels']:
            print ' skipping performance evaluation of %s because of indels' % query_name  # I just have no idea how to handle naive hamming fraction when there's indels
        else:
            self.perfplotter.evaluate(self.reco_info[query_name], self.info[query_name])

    self.remaining_queries.remove(query_name)
def add_to_info(self, query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions):
    """Record the best smith-waterman annotation for <query_name> in self.info.

    This variant validates the cysteine/tryptophan codon positions against the
    query sequence (raising on out-of-range values), stores per-region germline
    and query match sequences, and updates reco/mutation parameter counters and
    the performance plotter.  Removes <query_name> from self.remaining_queries.
    """
    assert query_name not in self.info
    self.info['queries'].append(query_name)
    self.info[query_name] = {}
    self.info[query_name]['unique_id'] = query_name  # redundant, but used somewhere down the line
    self.info[query_name]['k_v'] = kvals['v']
    self.info[query_name]['k_d'] = kvals['d']
    self.info[query_name]['all'] = ':'.join(match_names['v'] + match_names['d'] + match_names['j'])

    # assert codon_positions['v'] != -1
    # assert codon_positions['j'] != -1
    self.info[query_name]['cdr3_length'] = codon_positions['j'] - codon_positions['v'] + 3  #tryp_position_in_joined_seq - self.cyst_position + 3
    self.info[query_name]['cyst_position'] = codon_positions['v']
    self.info[query_name]['tryp_position'] = codon_positions['j']
    # sanity-check that both conserved-codon positions fall inside the query sequence
    if self.info[query_name]['cyst_position'] < 0 or self.info[query_name]['cyst_position'] >= len(query_seq):
        raise Exception('cpos %d invalid for %s (%s)' % (self.info[query_name]['cyst_position'], query_name, query_seq))
    if self.info[query_name]['tryp_position'] < 0 or self.info[query_name]['tryp_position'] >= len(query_seq):
        raise Exception('tpos %d invalid for %s (%s)' % (self.info[query_name]['tryp_position'], query_name, query_seq))

    # erosion, insertion, mutation info for best match
    self.info[query_name]['v_5p_del'] = all_germline_bounds[best['v']][0]
    self.info[query_name]['v_3p_del'] = len(self.germline_seqs['v'][best['v']]) - all_germline_bounds[best['v']][1]  # len(germline v) - gl_match_end
    self.info[query_name]['d_5p_del'] = all_germline_bounds[best['d']][0]
    self.info[query_name]['d_3p_del'] = len(self.germline_seqs['d'][best['d']]) - all_germline_bounds[best['d']][1]
    self.info[query_name]['j_5p_del'] = all_germline_bounds[best['j']][0]
    self.info[query_name]['j_3p_del'] = len(self.germline_seqs['j'][best['j']]) - all_germline_bounds[best['j']][1]

    # insertions are the unmatched query bases between (or outside) the region matches
    self.info[query_name]['fv_insertion'] = query_seq[ : all_query_bounds[best['v']][0]]
    self.info[query_name]['vd_insertion'] = query_seq[all_query_bounds[best['v']][1] : all_query_bounds[best['d']][0]]
    self.info[query_name]['dj_insertion'] = query_seq[all_query_bounds[best['d']][1] : all_query_bounds[best['j']][0]]
    self.info[query_name]['jf_insertion'] = query_seq[all_query_bounds[best['j']][1] : ]

    for region in utils.regions:
        self.info[query_name][region + '_gene'] = best[region]
        self.info[query_name][region + '_gl_seq'] = best[region + '_gl_seq']
        self.info[query_name][region + '_qr_seq'] = best[region + '_qr_seq']
        self.info['all_best_matches'].add(best[region])

    self.info[query_name]['seq'] = query_seq  # NOTE this is the seq output by vdjalign, i.e. if we reversed any indels it is the reversed sequence

    if self.debug:
        if not self.args.is_data:
            utils.print_reco_event(self.germline_seqs, self.reco_info[query_name], extra_str=' ', label='true:', indelfo=self.reco_info[query_name]['indels'])
        utils.print_reco_event(self.germline_seqs, self.info[query_name], extra_str=' ', label='inferred:', indelfo=self.info['indels'].get(query_name, None))

    if self.pcounter is not None:
        self.pcounter.increment_reco_params(self.info[query_name])
        self.pcounter.increment_mutation_params(self.info[query_name])
    if self.true_pcounter is not None:
        self.true_pcounter.increment_reco_params(self.reco_info[query_name])
        self.true_pcounter.increment_mutation_params(self.reco_info[query_name])
    if self.perfplotter is not None:
        self.perfplotter.evaluate(self.reco_info[query_name], self.info[query_name])  #, subtract_unphysical_erosions=True)

    self.remaining_queries.remove(query_name)
def harmonize_naive_seq_lengths(self, true_line, line):
    """Return (true_naive_seq, inferred_naive_seq) trimmed/padded to the same length.

    Strips the inferred fv/jf insertions, then pads the shorter sequence with Ns
    on the right if the j ends still disagree.  Unlike the raising variant, this
    one warns on a residual mismatch and falls back to a pairwise alignment,
    replacing inferred gaps with ambiguous bases.
    """
    def tpos_to_j_end(tmpline):  # distance from the j codon position to the end of the naive sequence
        return len(tmpline['naive_seq']) - tmpline['codon_positions']['j']  # not quite sure it's best to use the naive seq, but I think it is
    true_naive_seq = true_line['naive_seq']
    inferred_naive_seq = line['naive_seq']
    if len(line['fv_insertion']) > 0:
        inferred_naive_seq = inferred_naive_seq[len(line['fv_insertion']):]
    if len(true_naive_seq) != len(inferred_naive_seq) and len(line['jf_insertion']) > 0:  # some j genes are very similar, except differ by one base in length, so shit is complicated
        inferred_naive_seq = inferred_naive_seq[:len(inferred_naive_seq) - len(line['jf_insertion'])]
    if len(true_naive_seq) != len(inferred_naive_seq) and tpos_to_j_end(true_line) != tpos_to_j_end(line):
        extra_true_bases = tpos_to_j_end(true_line) - tpos_to_j_end(line)
        if extra_true_bases > 0:  # add Ns to the inferred line if the true line is longer
            inferred_naive_seq += extra_true_bases * 'N'
        else:  # otherwise add 'em to the true line
            true_naive_seq += (-extra_true_bases) * 'N'
    if len(true_naive_seq) != len(inferred_naive_seq):  # all this stuff gets printed four times, since we're calling this fcn for each region. sigh.
        utils.print_reco_event(true_line, label='true')
        utils.print_reco_event(line, label='inf')
        print '%s different length true and inferred naive seqs for %s (see above)\n %s\n %s' % (utils.color('yellow', 'warning'), ' '.join(line['unique_ids']), true_naive_seq, inferred_naive_seq)
        # I'd rather just give up and skip it at this point, but that involves passing knowledge of the failure through too many functions so it's hard, so... align 'em, which isn't right, but oh well
        aligned_true, aligned_inferred = utils.align_seqs(true_naive_seq, inferred_naive_seq)
        true_list, inf_list = [], []
        for ctrue, cinf in zip(aligned_true, aligned_inferred):  # remove bases corresponding to gaps in true, and replace gaps in inf with Ns (the goal is to end up with aligned seqs that are the same length as the true inferred sequence, so the restrict_to_region stuff still works)
            if ctrue in utils.gap_chars:
                continue
            elif cinf in utils.gap_chars:
                true_list += [ctrue]
                inf_list += [utils.ambiguous_bases[0]]
            else:
                true_list += [ctrue]
                inf_list += [cinf]
        assert len(true_list) == len(true_naive_seq)
        true_naive_seq = ''.join(true_list)
        inferred_naive_seq = ''.join(inf_list)
        # utils.color_mutants(true_naive_seq, inferred_naive_seq, print_result=True)
    return true_naive_seq, inferred_naive_seq
def finalize(self):
    """Wrap up smith-waterman: print summary counts, finalize allele finding,
    pad sequences, plot/write parameter counters, and stash remaining queries.
    """
    if self.perfplotter is not None:
        self.perfplotter.plot(self.args.plotdir + '/sw', only_csv=self.args.only_csv_plots)
    # print ' sw time: %.3f' % (time.time()-start)
    print ' info for %d' % len(self.info['queries']),
    skipped_unproductive = len(self.unproductive_queries)
    n_remaining = len(self.remaining_queries)
    if skipped_unproductive > 0 or n_remaining > 0:
        print ' (skipped',
        print '%d / %d = %.2f unproductive' % (skipped_unproductive, len(self.input_info), float(skipped_unproductive) / len(self.input_info)),
        if n_remaining > 0:
            print ' %d / %d = %.2f other' % (n_remaining, len(self.input_info), float(n_remaining) / len(self.input_info)),
        print ')',
    print ''
    sys.stdout.flush()
    if n_remaining > 0:
        printstr = ' %s %d missing %s' % (utils.color('red', 'warning'), n_remaining, utils.plural_str('annotation', n_remaining))
        if n_remaining < 15:  # don't spam a huge list of ids
            printstr += ' (' + ':'.join(self.remaining_queries) + ')'
        print printstr
    if self.debug and len(self.info['indels']) > 0:
        print ' indels: %s' % ':'.join(self.info['indels'].keys())
    # every input query must be accounted for: annotated, skipped as unproductive, or still remaining
    assert len(self.info['queries']) + skipped_unproductive + n_remaining == len(self.input_info)
    if self.debug and not self.args.is_data and n_remaining > 0:
        print 'true annotations for remaining events:'
        for qry in self.remaining_queries:
            utils.print_reco_event(self.glfo['seqs'], self.reco_info[qry], extra_str=' ', label='true:')
    if self.alfinder is not None:
        self.alfinder.finalize(debug=self.args.debug_new_allele_finding)
        self.info['new-alleles'] = self.alfinder.new_allele_info
        if self.args.plotdir is not None:
            self.alfinder.plot(self.args.plotdir + '/sw', only_csv=self.args.only_csv_plots)

    # add padded info to self.info (returns if stuff has already been padded)
    self.pad_seqs_to_same_length()  # NOTE this uses *all the gene matches (not just the best ones), so it has to come before we call pcounter.write(), since that fcn rewrites the germlines removing genes that weren't best matches. But NOTE also that I'm not sure what but that the padding actually *needs* all matches (rather than just all *best* matches)

    if self.pcounter is not None:
        if self.args.plotdir is not None:
            self.pcounter.plot(self.args.plotdir + '/sw', subset_by_gene=True, cyst_positions=self.glfo['cyst-positions'], tryp_positions=self.glfo['tryp-positions'], only_csv=self.args.only_csv_plots)
            if self.true_pcounter is not None:
                self.true_pcounter.plot(self.args.plotdir + '/sw-true', subset_by_gene=True, cyst_positions=self.glfo['cyst-positions'], tryp_positions=self.glfo['tryp-positions'], only_csv=self.args.only_csv_plots)
        self.pcounter.write(self.parameter_dir, self.my_datadir)
        if self.true_pcounter is not None:
            self.true_pcounter.write(self.parameter_dir + '-true')

    self.info['remaining_queries'] = self.remaining_queries
def add_to_info(self, query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions, perfplotter=None):
    """Record the best smith-waterman annotation for <query_name> in self.info.

    Asserts valid conserved-codon positions, fills in deletion/insertion lengths
    derived from the best match's germline and query bounds, stores per-region
    germline and query match sequences, and updates the parameter counters and
    the optional <perfplotter>.
    """
    assert query_name not in self.info
    self.info[query_name] = {}
    self.info[query_name]['unique_id'] = query_name  # redundant, but used somewhere down the line
    self.info[query_name]['k_v'] = kvals['v']
    self.info[query_name]['k_d'] = kvals['d']
    self.info[query_name]['all'] = ':'.join(match_names['v'] + match_names['d'] + match_names['j'])
    assert codon_positions['v'] != -1
    assert codon_positions['j'] != -1
    self.info[query_name]['cdr3_length'] = codon_positions['j'] - codon_positions['v'] + 3  #tryp_position_in_joined_seq - self.cyst_position + 3
    self.info[query_name]['cyst_position'] = codon_positions['v']
    self.info[query_name]['tryp_position'] = codon_positions['j']

    # erosion, insertion, mutation info for best match
    self.info[query_name]['v_5p_del'] = all_germline_bounds[best['v']][0]
    self.info[query_name]['v_3p_del'] = len(self.germline_seqs['v'][best['v']]) - all_germline_bounds[best['v']][1]  # len(germline v) - gl_match_end
    self.info[query_name]['d_5p_del'] = all_germline_bounds[best['d']][0]
    self.info[query_name]['d_3p_del'] = len(self.germline_seqs['d'][best['d']]) - all_germline_bounds[best['d']][1]
    self.info[query_name]['j_5p_del'] = all_germline_bounds[best['j']][0]
    self.info[query_name]['j_3p_del'] = len(self.germline_seqs['j'][best['j']]) - all_germline_bounds[best['j']][1]

    # insertions are the unmatched query bases between (or outside) the region matches
    self.info[query_name]['fv_insertion'] = query_seq[ : all_query_bounds[best['v']][0]]
    self.info[query_name]['vd_insertion'] = query_seq[all_query_bounds[best['v']][1] : all_query_bounds[best['d']][0]]
    self.info[query_name]['dj_insertion'] = query_seq[all_query_bounds[best['d']][1] : all_query_bounds[best['j']][0]]
    self.info[query_name]['jf_insertion'] = query_seq[all_query_bounds[best['j']][1] : ]

    for region in utils.regions:
        self.info[query_name][region + '_gene'] = best[region]
        self.info[query_name][region + '_gl_seq'] = best[region + '_gl_seq']
        self.info[query_name][region + '_qr_seq'] = best[region + '_qr_seq']
        self.info['all_best_matches'].add(best[region])

    self.info[query_name]['seq'] = query_seq  # only need to add this so I can pass it to print_reco_event

    if self.args.debug:
        if not self.args.is_data:
            utils.print_reco_event(self.germline_seqs, self.reco_info[query_name], extra_str=' ', label='true:')
        utils.print_reco_event(self.germline_seqs, self.info[query_name], extra_str=' ', label='inferred:')

    # NOTE: these were written as "!= None"/"== None"; PEP 8 says to compare against None with "is (not)"
    if self.pcounter is not None:
        self.pcounter.increment(self.info[query_name])
    if self.true_pcounter is not None:
        self.true_pcounter.increment(self.reco_info[query_name])
    if perfplotter is not None:
        perfplotter.evaluate(self.reco_info[query_name], self.info[query_name])  #, subtract_unphysical_erosions=True)
def print_hmm_output(self, line, print_true=False, perfplotter=None):
    """Print an ascii-art summary of the hmm annotation in <line>.

    If <print_true> is set (and we're running on simulation, i.e. not data), first
    print the true rearrangement event(s) for each true cluster overlapping this
    line's unique ids.  <perfplotter> is currently unused (the call that used it
    is commented out below).
    """
    out_str_list = []
    ilabel = ''
    if print_true and not self.args.is_data:  # first print true event (if this is simulation)
        for reco_id, uids in self.get_true_clusters(line['unique_ids']).items():
            for iid in range(len(uids)):
                # one_line=True suppresses the full header for all but the first sequence in each true cluster
                out_str_list.append(utils.print_reco_event(self.germline_seqs, self.reco_info[uids[iid]], extra_str='    ', return_string=True, label='true:', one_line=(iid!=0)))
        ilabel = 'inferred:'

    out_str_list.append(utils.print_reco_event(self.germline_seqs, line, extra_str='    ', return_string=True, label=ilabel))
    for iextra in range(1, len(line['unique_ids'])):
        # NOTE(review): this overwrites <line>['seq'] in place, i.e. mutates the caller's dict -- confirm callers don't depend on 'seq' afterward
        line['seq'] = line['seqs'][iextra]
        out_str_list.append(utils.print_reco_event(self.germline_seqs, line, extra_str='    ', return_string=True, one_line=True))

    # if not self.args.is_data:
    #     self.print_performance_info(line, perfplotter=perfplotter)
    print ''.join(out_str_list),
def print_event(self): line = {} # collect some information into a form that print_reco_event understands line['cdr3_length'] = self.cdr3_length for region in utils.regions: line[region + '_gene'] = self.genes[region] for boundary in utils.boundaries: line[boundary + '_insertion'] = self.insertions[boundary] for erosion in utils.real_erosions: line[erosion + '_del'] = self.erosions[erosion] for erosion in utils.effective_erosions: line[erosion + '_del'] = self.effective_erosions[erosion] line['cyst_position'] = self.final_cyst_position line['tryp_position'] = self.final_tryp_position assert 'fv_insertion' not in line # well, in principle it's ok if they're there, but in that case I'll need to at least think about updating some things assert 'jf_insertion' not in line line['fv_insertion'] = '' line['jf_insertion'] = '' line['seqs'] = self.final_seqs line['unique_ids'] = [i for i in range(len(self.final_seqs))] utils.print_reco_event(self.germlines, line, indelfos=self.indelfo)
def print_event(self, total_length_from_right=0): line = { } # collect some information into a form that print_reco_event understands line['cdr3_length'] = self.cdr3_length for region in utils.regions: line[region + '_gene'] = self.genes[region] for boundary in utils.boundaries: line[boundary + '_insertion'] = self.insertions[boundary] for erosion in utils.real_erosions: line[erosion + '_del'] = self.erosions[erosion] for erosion in utils.effective_erosions: line[erosion + '_del'] = self.effective_erosions[erosion] line['cyst_position'] = self.final_cyst_position line['tryp_position'] = self.final_tryp_position assert 'fv_insertion' not in line # well, in principle it's ok if they're there, but in that case I'll need to at least think about updating some things assert 'jf_insertion' not in line line['fv_insertion'] = '' line['jf_insertion'] = '' line['seqs'] = self.final_seqs line['unique_ids'] = [i for i in range(len(self.final_seqs))] utils.print_reco_event(self.germlines, line, indelfos=self.indelfo)
def finalize(self):
    """Wrap up smith-waterman: print summary counts, write/plot parameter counters,
    pad sequences to the same length, and stash remaining queries.
    """
    if self.perfplotter is not None:
        self.perfplotter.plot(self.args.plotdir + '/sw', only_csv=self.args.only_csv_plots)
    # print ' sw time: %.3f' % (time.time()-start)
    print ' info for %d' % len(self.info['queries']),
    skipped_unproductive = len(self.unproductive_queries)
    n_remaining = len(self.remaining_queries)
    if skipped_unproductive > 0 or n_remaining > 0:
        print ' (skipped',
        print '%d / %d = %.2f unproductive' % (skipped_unproductive, len(self.input_info), float(skipped_unproductive) / len(self.input_info)),
        if n_remaining > 0:
            print ' %d / %d = %.2f other' % (n_remaining, len(self.input_info), float(n_remaining) / len(self.input_info)),
        print ')',
    print ''
    sys.stdout.flush()
    if n_remaining > 0:
        printstr = ' %s %d missing annotations' % (utils.color('red', 'warning'), n_remaining)
        if n_remaining < 15:  # don't spam a huge list of ids
            printstr += ' (' + ':'.join(self.remaining_queries) + ')'
        print printstr
    if self.debug and len(self.info['indels']) > 0:
        print ' indels: %s' % ':'.join(self.info['indels'].keys())
    # every input query must be accounted for: annotated, skipped as unproductive, or still remaining
    assert len(self.info['queries']) + skipped_unproductive + n_remaining == len(self.input_info)
    if self.debug and not self.args.is_data and n_remaining > 0:
        print 'true annotations for remaining events:'
        for qry in self.remaining_queries:
            utils.print_reco_event(self.glfo['seqs'], self.reco_info[qry], extra_str=' ', label='true:')
    if self.pcounter is not None:
        self.pcounter.write(self.parameter_dir)
        if self.true_pcounter is not None:
            assert self.parameter_dir[-1] != '/'  # the '-true' suffix below assumes no trailing slash
            self.true_pcounter.write(self.parameter_dir + '-true')
        if self.args.plotdir is not None:
            self.pcounter.plot(self.args.plotdir + '/sw', subset_by_gene=True, cyst_positions=self.glfo['cyst-positions'], tryp_positions=self.glfo['tryp-positions'], only_csv=self.args.only_csv_plots)
            if self.true_pcounter is not None:
                self.true_pcounter.plot(self.args.plotdir + '/sw-true', subset_by_gene=True, cyst_positions=self.glfo['cyst-positions'], tryp_positions=self.glfo['tryp-positions'], only_csv=self.args.only_csv_plots)

    utils.pad_seqs_to_same_length(self.info['queries'], self.info, self.glfo, self.info['indels'])  # adds padded info to self.info (returns if stuff has already been padded)

    self.info['remaining_queries'] = self.remaining_queries
def print_event(self, total_length_from_right=0):
    """Build a print_reco_event-style <line> dict from this event and print it.

    (total_length_from_right is accepted for interface compatibility, but not
    referenced in this body.)
    """
    # seed the dict with the per-region / per-boundary / per-erosion entries
    line = {'cdr3_length': self.cdr3_length}
    line.update({r + '_gene': self.genes[r] for r in utils.regions})
    line.update({b + '_insertion': self.insertions[b] for b in utils.boundaries})
    line.update({e + '_del': self.erosions[e] for e in utils.real_erosions})
    line.update({e + '_del': self.effective_erosions[e] for e in utils.effective_erosions})
    line['cyst_position'] = self.final_cyst_position
    line['tryp_position'] = self.final_tryp_position
    # framework insertions shouldn't have been set yet -- if they were, downstream assumptions would need revisiting
    assert 'fv_insertion' not in line
    assert 'jf_insertion' not in line
    line['fv_insertion'] = ''
    line['jf_insertion'] = ''
    line['seqs'] = self.final_seqs
    line['unique_ids'] = list(range(len(self.final_seqs)))
    utils.print_reco_event(self.germlines, line, indelfos=self.indelfo)
def print_event(self): line = {} # collect some information into a form that the print fcn understands for region in utils.regions: line[region + '_gene'] = self.genes[region] for boundary in utils.boundaries: line[boundary + '_insertion'] = self.insertions[boundary] for erosion in utils.real_erosions: line[erosion + '_del'] = self.erosions[erosion] for erosion in utils.effective_erosions: line[erosion + '_del'] = self.effective_erosions[erosion] assert 'fv_insertion' not in line # well, in principle it's ok if they're there, but in that case I'll need to at least think about updating some things assert 'jf_insertion' not in line line['fv_insertion'] = '' line['jf_insertion'] = '' line['seqs'] = self.final_seqs line['unique_ids'] = [i for i in range(len(self.final_seqs))] line['cdr3_length'] = self.cdr3_length line['cyst_position'] = self.final_cyst_position line['tryp_position'] = self.final_tryp_position line['indelfos'] = self.indelfos utils.add_implicit_info(self.glfo, line, multi_seq=True, existing_implicit_keys=('cdr3_length', 'cyst_position', 'tryp_position')) utils.print_reco_event(self.glfo['seqs'], line)
def print_event(self, total_length_from_right=0): line = {} # collect some information into a form that print_reco_event understands line['cdr3_length'] = self.cdr3_length for region in utils.regions: line[region + '_gene'] = self.genes[region] for boundary in utils.boundaries: line[boundary + '_insertion'] = self.insertions[boundary] for erosion in utils.real_erosions: line[erosion + '_del'] = self.erosions[erosion] for erosion in utils.effective_erosions: line[erosion + '_del'] = self.effective_erosions[erosion] line['cyst_position'] = self.final_cyst_position line['tryp_position'] = self.final_tryp_position assert 'fv_insertion' not in line # well, in principle it's ok if they're there, but in that case I'll need to at least think about updating some things assert 'jf_insertion' not in line line['fv_insertion'] = '' line['jf_insertion'] = '' for imute in range(len(self.final_seqs)): line['seq'] = self.final_seqs[imute] if total_length_from_right > 0: line['seq'] = line['seq'][len(line['seq'])-total_length_from_right : ] utils.print_reco_event(self.germlines, line, one_line=(imute!=0))
def print_stuff(line):
    """Print summary info for the cluster annotation <line>, but only if its naive
    CDR3 translation contains args.cdr3.

    NOTE(review): this is a closure -- it reads sorted_clusters, cluster, args,
    n_total, and getkey from the enclosing scope; confirm those are in scope at
    the definition site.
    """
    cluster_index = sorted_clusters.index(cluster)
    naive_cdr3, matureiseq0_cdr3 = utils.subset_sequences(line, iseq=0, restrict_to_region='cdr3')  # returns the CDR3 nt sequence for naive, and the first mutated sequence (iseq0); CDR3 = first base of cysteine through last base of tryptophan
    # mature_cdr3_seqs = []  # trying to translate the consensus cdr3 so I can search these with my seed seqs
    # for iseq in range(len(line['unique_ids'])):
    #     naive_cdr3_seq, mature_cdr3_seq = utils.subset_sequences(line, iseq=iseq, restrict_to_region='cdr3')
    #     mature_cdr3_seqs.append(mature_cdr3_seq)
    # mature_cdr3_seqs
    # translated_cdr3 = mature_cdr3_seqs.translate()
    cdr3_aa = '%-30s' % Seq(naive_cdr3).translate()
    # If a cluster contains one of our seed seqs, color this CDR3 red
    if any('-ig' in s for s in line['unique_ids']):
        cdr3_aa = utils.color('red', cdr3_aa, width=30)
    if args.cdr3 in cdr3_aa:  # Only print clusters with naive CDR3 that matches our specified --cdr3 argument
        print 'index genes size n muts SHM rep frac CDR3 FayWuH'
        print ' mean med len seq'
        print '%4s %s %s %s %5d %5d %5d %7.3f %8.4f %2d %s %4.2f' % (
            cluster_index,
            utils.color_gene(line['v_gene'], width=15),
            utils.color_gene(line['d_gene'], width=15),
            utils.color_gene(line['j_gene'], width=10),
            len(line['unique_ids']),
            numpy.mean(line['n_mutations']),
            numpy.median(line['n_mutations']),
            numpy.mean(line['mut_freqs']),
            float(len(cluster)) / n_total,
            (line['cdr3_length']/3),  # NOTE(review): python-2 integer division -- presumably fine since cdr3_length should be a multiple of 3; confirm
            cdr3_aa,
            utils.fay_wu_h(line, debug=False),
        )
        # print 'number of mutations per sequence in cluster', sorted(line['n_mutations'])
        print len(line['naive_seq']), 'length of naive seq'
        # utils.print_reco_event(utils.synthesize_single_seq_line(line, iseq=0))  # print ascii-art representation of the rearrangement event
        print 'unique_ids: ', getkey(line['unique_ids'])
        print
        print utils.print_reco_event(line)  # NOTE(review): this prints the *return value* of print_reco_event -- confirm it returns a string here, otherwise this prints None
def parse_file(self, infname): tree = ET.parse(infname) root = tree.getroot() for query in root: self.n_total += 1 if self.n_max_queries > 0 and self.n_total > self.n_max_queries: break unique_id = query.attrib['id'].replace('>', '').replace(' ', '') if len(self.queries) > 0 and unique_id not in self.queries: continue if self.debug: print self.n_total, unique_id line = {} line['unique_id'] = unique_id line['seq'] = self.seqinfo[unique_id]['seq'] for region in utils.regions: if self.debug: print ' ', region self.get_region_matches(region, query, line) if 'v_gene' not in line or 'd_gene' not in line or 'j_gene' not in line: print ' ERROR giving up on %s' % unique_id self.n_failed += 1 continue add_insertions(line) try: resolve_overlapping_matches(line, self.debug) except: print 'ERROR apportionment failed on %s' % unique_id self.n_failed += 1 continue self.perfplotter.evaluate(self.seqinfo[unique_id], line) if self.debug: utils.print_reco_event(self.germline_seqs, line)
def parse_detail(self, fk, unique_id):
    """Parse one "Details" block from the file-keeper <fk> into an info dict for
    <unique_id>, evaluate it against simulation truth, and store it in
    self.details.

    Returns early (after skipping to the next Details block) on any of several
    failure modes; on unrecoverable mismatches it prints and exits.
    NOTE(review): <fk> is presumed to be a line-iterator helper with .line,
    .iline, .eof, and .increment() -- defined elsewhere in the project.
    """
    assert fk.iline < len(fk.lines)

    # scan forward to the start of this query's Details block
    while fk.line[1] != "Details":
        fk.increment()
        if fk.eof:
            return
    fk.increment()

    info = {}
    info["unique_id"] = unique_id
    for begin_line, column, index, required, default in line_order:
        if fk.line[0].find(begin_line) != 0:
            if required:
                print "oop", begin_line, fk.line
                sys.exit()
            else:
                info[column] = default
                continue
        if column != "":
            info[column] = clean_value(column, fk.line[index])
            # if '[' in info[column]:
            #     print 'added', column, clean_value(column, fk.line[index])
            if column.find("_gene") == 1:
                region = column[0]
                info[region + "_5p_del"] = (
                    int(fk.line[fk.line.index("start:") + 1]) - 1
                )  # NOTE their indices are 1-based
                gl_length = int(fk.line[fk.line.index("gene:") + 1]) - 1
                match_end = int(fk.line[fk.line.index("end:") + 1]) - 1
                assert gl_length >= match_end
                info[region + "_3p_del"] = gl_length - match_end
        fk.increment()

    if unique_id not in self.sim_need:
        while not fk.eof and fk.line[1] != "Details":  # skip stuff until start of next Detail block
            fk.increment()
        return

    info["fv_insertion"] = ""
    info["jf_insertion"] = ""
    info["seq"] = (
        info["v_qr_seq"] + info["vd_insertion"] + info["d_qr_seq"] + info["dj_insertion"] + info["j_qr_seq"]
    )

    if "-" in info["seq"]:
        print "ERROR found a dash in %s, returning failure" % unique_id
        while not fk.eof and fk.line[1] != "Details":  # skip stuff until start of next Detail block
            fk.increment()
        return

    if (
        info["seq"] not in self.siminfo[unique_id]["seq"]
    ):  # arg. I can't do != because it tacks on v left and j right deletions
        print "ERROR didn't find the right sequence for %s" % unique_id
        print " ", info["seq"]
        print " ", self.siminfo[unique_id]["seq"]
        sys.exit()

    if self.args.debug:
        print unique_id
        for region in utils.regions:
            infer_gene = info[region + "_gene"]
            true_gene = self.siminfo[unique_id][region + "_gene"]
            # color the region label blue if inferred and true genes are alleles of each other, red otherwise
            if utils.are_alleles(infer_gene, true_gene):
                regionstr = utils.color("bold", utils.color("blue", region))
                truestr = ""  #'(originally %s)' % match_name
            else:
                regionstr = utils.color("bold", utils.color("red", region))
                truestr = "(true: %s)" % utils.color_gene(true_gene).replace(region, "")
            print " %s %s %s" % (regionstr, utils.color_gene(infer_gene).replace(region, ""), truestr)
        utils.print_reco_event(self.germline_seqs, self.siminfo[unique_id], label="true:", extra_str=" ")
        utils.print_reco_event(self.germline_seqs, info, label="inferred:", extra_str=" ")

    for region in utils.regions:
        if info[region + "_gene"] not in self.germline_seqs[region]:
            print "ERROR %s not in germlines" % info[region + "_gene"]
            assert False
        gl_seq = info[region + "_gl_seq"]
        if "[" in gl_seq:  # ambiguous
            # try each nucleotide in place of the ambiguity marker until the result matches the germline
            for nuke in utils.nukes:
                gl_seq = gl_seq.replace("[", nuke)
                if gl_seq in self.germline_seqs[region][info[region + "_gene"]]:
                    print " replaced [ with %s" % nuke
                    break
            info[region + "_gl_seq"] = gl_seq

        if info[region + "_gl_seq"] not in self.germline_seqs[region][info[region + "_gene"]]:
            print "ERROR gl match not found for %s in %s" % (info[region + "_gene"], unique_id)
            print " ", info[region + "_gl_seq"]
            print " ", self.germline_seqs[region][info[region + "_gene"]]
            self.perfplotter.add_partial_fail(self.siminfo[unique_id], info)
            while not fk.eof and fk.line[1] != "Details":  # skip stuff until start of next Detail block
                fk.increment()
            return

    self.perfplotter.evaluate(self.siminfo[unique_id], info)
    self.details[unique_id] = info
    self.sim_need.remove(unique_id)

    while not fk.eof and fk.line[1] != "Details":  # skip stuff until start of next Detail block
        fk.increment()
def process_query(self, qr_info, query_name, query_lines): # split query_lines up into blocks blocks = [] for line in query_lines: if line.find('Query_') == 0: blocks.append([]) if len(line) == 0: continue if len(re.findall('<a name=#_[0-9][0-9]*_IGH', line)) == 0 and line.find('Query_') != 0: continue if len(blocks) == 0: print 'wtf? %s' % query_name # it's probably kicking a reverse match self.perfplotter.add_partial_fail( self.seqinfo[query_name], qr_info) # NOTE that's really a total failure self.n_partially_failed += 1 return blocks[-1].append(line) # then process each block for block in blocks: self.process_single_block(block, query_name, qr_info) if 'fail' in qr_info: self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info) self.n_partially_failed += 1 return for region in utils.regions: if region + '_gene' not in qr_info: print ' ERROR no %s match for %d' % (region, query_name) self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info) self.n_partially_failed += 1 return # expand v match to left end and j match to right end qr_info['v_5p_del'] = 0 qr_info['fv_insertion'] = '' if qr_info['match_start'] > 0: if self.args.debug: print ' add to v left:', self.seqinfo[query_name][ 'seq'][:qr_info['match_start']] qr_info['seq'] = self.seqinfo[query_name][ 'seq'][:qr_info['match_start']] + qr_info['seq'] qr_info['j_3p_del'] = 0 qr_info['jf_insertion'] = '' if len(self.seqinfo[query_name]['seq']) > qr_info['match_end']: if self.args.debug: print ' add to j right:', self.seqinfo[query_name][ 'seq'][qr_info['match_end'] - len(self.seqinfo[query_name]['seq']):] qr_info['seq'] = qr_info['seq'] + self.seqinfo[query_name]['seq'][ qr_info['match_end'] - len(self.seqinfo[query_name]['seq']):] for boundary in utils.boundaries: start = qr_info[boundary[0] + '_qr_bounds'][1] end = qr_info[boundary[1] + '_qr_bounds'][0] qr_info[boundary + '_insertion'] = qr_info['seq'][start:end] for region in utils.regions: start = qr_info[region + '_qr_bounds'][0] end = 
qr_info[region + '_qr_bounds'][1] qr_info[region + '_qr_seq'] = qr_info['seq'][start:end] try: resolve_overlapping_matches(qr_info, self.args.debug, self.germline_seqs) except AssertionError: print 'ERROR apportionment failed on %s' % query_name self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info) self.n_partially_failed += 1 return if self.args.debug: print ' query seq:', qr_info['seq'] for region in utils.regions: print ' %s %3d %3d %s %s' % ( region, qr_info[region + '_qr_bounds'][0], qr_info[region + '_qr_bounds'][1], utils.color_gene(qr_info[region + '_gene']), qr_info[region + '_gl_seq']) for boundary in utils.boundaries: start = qr_info[boundary[0] + '_qr_bounds'][1] end = qr_info[boundary[1] + '_qr_bounds'][0] qr_info[boundary + '_insertion'] = qr_info['seq'][start:end] if self.args.debug: print ' ', boundary, qr_info[boundary + '_insertion'] self.perfplotter.evaluate(self.seqinfo[query_name], qr_info) # for key, val in qr_info.items(): # print key, val if self.args.debug: utils.print_reco_event(self.germline_seqs, self.seqinfo[query_name], label='true:', extra_str=' ') utils.print_reco_event(self.germline_seqs, qr_info, extra_str=' ')
def summarize_query(self, query_name, query_seq, raw_best, all_match_names, all_query_bounds, all_germline_bounds, perfplotter, warnings): if self.args.debug: print '%s' % str(query_name) best, match_names, n_matches = {}, {}, {} n_used = {'v': 0, 'd': 0, 'j': 0} k_v_min, k_d_min = 999, 999 k_v_max, k_d_max = 0, 0 for region in utils.regions: all_match_names[region] = sorted(all_match_names[region], reverse=True) match_names[region] = [] codon_positions = { 'v': -1, 'd': -1, 'j': -1 } # conserved codon positions (v:cysteine, d:dummy, j:tryptophan) for region in utils.regions: n_matches[region] = len(all_match_names[region]) n_skipped = 0 for score, gene in all_match_names[region]: glbounds = all_germline_bounds[gene] qrbounds = all_query_bounds[gene] assert qrbounds[1] <= len( query_seq ) # NOTE I'm putting these up avove as well (in process_query), so in time I should remove them from here assert glbounds[1] <= len(self.germline_seqs[region][gene]) assert qrbounds[0] >= 0 assert glbounds[0] >= 0 glmatchseq = self.germline_seqs[region][gene][ glbounds[0]:glbounds[1]] # only use the best few matches if n_used[region] >= int( self.args.n_max_per_region[utils.regions.index(region)] ): # only take the top few from each region break # only use a specified set of genes if self.args.only_genes != None and gene not in self.args.only_genes: n_skipped += 1 continue # add match to the list n_used[region] += 1 match_names[region].append(gene) self.print_match(region, gene, query_seq, score, glbounds, qrbounds, -1, warnings, skipping=False) # if the germline match and the query match aren't the same length, s-w likely added an insert, which we shouldn't get since the gap-open penalty is jacked up so high if len(glmatchseq) != len( query_seq[qrbounds[0]:qrbounds[1]] ): # neurotic double check (um, I think) EDIT hey this totally saved my ass print 'ERROR %d not same length' % query_name print glmatchseq, glbounds[0], glbounds[1] print query_seq[qrbounds[0]:qrbounds[1]] assert 
False if region == 'v': this_k_v = all_query_bounds[gene][ 1] # NOTE even if the v match doesn't start at the left hand edge of the query sequence, we still measure k_v from there. # In other words, sw doesn't tell the hmm about it k_v_min = min(this_k_v, k_v_min) k_v_max = max(this_k_v, k_v_max) if region == 'd': this_k_d = all_query_bounds[gene][1] - all_query_bounds[ raw_best['v']][1] # end of d minus end of v k_d_min = min(this_k_d, k_d_min) k_d_max = max(this_k_d, k_d_max) # check consistency with best match (since the best match is excised in s-w code, and because ham is run with *one* k_v k_d set) if region not in best: best[region] = gene best[region + '_gl_seq'] = self.germline_seqs[region][ gene][glbounds[0]:glbounds[1]] best[region + '_qr_seq'] = query_seq[qrbounds[0]:qrbounds[1]] best[region + '_score'] = score if self.args.debug and n_skipped > 0: print '%8s skipped %d %s genes' % ('', n_skipped, region) for region in utils.regions: if region not in best: print ' no', region, 'match found for', query_name # NOTE if no d match found, we should really should just assume entire d was eroded if not self.args.is_data: print ' true:' utils.print_reco_event(self.germline_seqs, self.reco_info[query_name], extra_str=' ') return # s-w allows d and j matches to overlap... 
which makes no sense, so arbitrarily give the disputed territory to j try: self.shift_overlapping_boundaries(all_query_bounds, all_germline_bounds, query_name, query_seq, best) except AssertionError: print ' ERROR %s apportionment failed' % str(query_name) return for region in utils.regions: codon_positions[region] = utils.get_conserved_codon_position( self.cyst_positions, self.tryp_positions, region, best[region], all_germline_bounds, all_query_bounds) # position in the query sequence, that is # check for unproductive rearrangements try: # NOTE it's actually expected that this'll fail with a 'sequence too short' error, since the s-w doesn't know it's supposed to make sure the match contains the conserved codons utils.check_both_conserved_codons(query_seq, codon_positions['v'], codon_positions['j'], debug=self.args.debug, extra_str=' ') cdr3_length = codon_positions['j'] - codon_positions['v'] + 3 if cdr3_length % 3 != 0: # make sure we've stayed in frame if self.args.debug: print ' out of frame cdr3: %d %% 3 = %d' % ( cdr3_length, cdr3_length % 3) assert False utils.check_for_stop_codon(query_seq, codon_positions['v'], debug=self.args.debug) except AssertionError: if self.args.debug: print ' unproductive rearrangement in waterer' if self.args.skip_unproductive: if self.args.debug: print ' ...skipping' self.n_unproductive += 1 self.info['skipped_unproductive_queries'].append(query_name) return # best k_v, k_d: k_v = all_query_bounds[best['v']][1] # end of v match k_d = all_query_bounds[best['d']][1] - all_query_bounds[best['v']][ 1] # end of d minus end of v if k_d_max < 5: # since the s-w step matches to the longest possible j and then excises it, this sometimes gobbles up the d, resulting in a very short d alignment. 
if self.args.debug: print ' expanding k_d' k_d_max = max(8, k_d_max) if 'IGHJ4*' in best['j'] and self.germline_seqs['d'][best['d']][ -5:] == 'ACTAC': # the end of some d versions is the same as the start of some j versions, so the s-w frequently kicks out the 'wrong' alignment if self.args.debug: print ' doubly expanding k_d' if k_d_max - k_d_min < 8: k_d_min -= 5 k_d_max += 2 k_v_min = max( 0, k_v_min - self.args.default_v_fuzz ) # ok, so I don't *actually* want it to be zero... oh, well k_v_max += self.args.default_v_fuzz k_d_min = max(1, k_d_min - self.args.default_d_fuzz) k_d_max += self.args.default_d_fuzz assert k_v_min > 0 and k_d_min > 0 and k_v_max > 0 and k_d_max > 0 if self.args.debug: print ' k_v: %d [%d-%d)' % (k_v, k_v_min, k_v_max) print ' k_d: %d [%d-%d)' % (k_d, k_d_min, k_d_max) print ' used', for region in utils.regions: print ' %s: %d/%d' % (region, n_used[region], n_matches[region]), print '' kvals = {} kvals['v'] = {'best': k_v, 'min': k_v_min, 'max': k_v_max} kvals['d'] = {'best': k_d, 'min': k_d_min, 'max': k_d_max} self.add_to_info(query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions=codon_positions, perfplotter=perfplotter)
def add_mutants(self, reco_event, irandom):
    """Simulate somatic hypermutation for <reco_event>.

    Chooses a tree from self.treeinfo, rescales it per-region, runs bppseqgen
    on each region's eroded sequence, stitches the mutated v/d/j pieces back
    together for every leaf, adds shm indels, and sets reco_event.line.
    <irandom> seeds the per-region simulation commands.
    """
    if self.args.mutation_multiplier is not None and self.args.mutation_multiplier == 0.:  # some of the stuff below fails if mut mult is actually 0.
        reco_event.final_seqs.append(reco_event.recombined_seq)  # set final sequnce in reco_event
        reco_event.indelfos = [indelutils.get_empty_indel() for _ in range(len(reco_event.final_seqs))]
        return

    # When generating trees, each tree's number of leaves and total depth are chosen from the specified distributions (a.t.m., by default n-leaves is from a geometric/zipf, and depth is from data)
    # This chosen depth corresponds to the sequence-wide mutation frequency.
    # In order to account for varying mutation rates in v, d, and j we simulate these regions separately, by appropriately rescaling the tree for each region.
    # i.e.: here we get the sequence-wide mute freq from the tree, and rescale it by the repertoire-wide ratios from data (which are stored in the tree file).
    # looks like e.g.: (t2:0.003751736951,t1:0.003751736951):0.001248262937;v:0.98,d:1.8,j:0.87, where the newick trees has branch lengths corresponding to the whole sequence (i.e. the weighted mean of v, d, and j)
    # NOTE a.t.m (and probably permanently) the mean branch lengths for each region are the same for all the trees in the file, I just don't have a better place to put them while I'm passing from TreeGenerator to here than at the end of each line in the file
    treefostr = self.treeinfo[random.randint(0, len(self.treeinfo) - 1)]  # per-region mutation info is tacked on after the tree... sigh. kind of hackey but works ok.
    assert treefostr.count(';') == 1
    isplit = treefostr.find(';') + 1
    chosen_tree = treefostr[:isplit]  # includes semi-colon
    mutefo = [rstr for rstr in treefostr[isplit:].split(',')]  # e.g. ['v:0.98', 'd:1.8', 'j:0.87']
    mean_total_height = treegenerator.get_mean_height(chosen_tree)
    regional_heights = {}  # per-region height, including <self.args.mutation_multiplier>
    for tmpstr in mutefo:
        region, ratio = tmpstr.split(':')
        assert region in utils.regions
        ratio = float(ratio)
        if self.args.mutation_multiplier is not None:  # multiply the branch lengths by some factor
            ratio *= self.args.mutation_multiplier
        regional_heights[region] = mean_total_height * ratio

    # one rescaled copy of the chosen tree per region
    scaled_trees = {r: treegenerator.rescale_tree(chosen_tree, regional_heights[r]) for r in utils.regions}

    if self.args.debug:
        print ' chose tree with total height %f' % treegenerator.get_mean_height(chosen_tree)
        print ' regional trees rescaled to heights: %s' % (' '.join(['%s %.3f (expected %.3f)' % (region, treegenerator.get_mean_height(scaled_trees[region]), regional_heights[region]) for region in utils.regions]))
        print treegenerator.get_ascii_tree(chosen_tree, extra_str=' ')

    n_leaves = treegenerator.get_n_leaves(chosen_tree)
    cmdfos = []
    for region in utils.regions:
        simstr = reco_event.eroded_seqs[region]
        if region == 'd':  # simulate the insertions along with the d, since they mutate like the d does
            simstr = reco_event.insertions['vd'] + simstr + reco_event.insertions['dj']
        cmdfos.append(self.prepare_bppseqgen(simstr, scaled_trees[region], n_leaves, reco_event.genes[region], reco_event, seed=irandom))

    utils.run_cmds([cfo for cfo in cmdfos if cfo is not None], sleep=False)  # shenanigan is to handle zero-length regional seqs

    mseqs = {}
    for ireg in range(len(utils.regions)):  # NOTE kind of sketchy just using index in <utils.regions> (although it just depends on the loop immediately above a.t.m.)
        if cmdfos[ireg] is None:
            mseqs[utils.regions[ireg]] = ['' for _ in range(n_leaves)]  # return an empty string for each leaf node
        else:
            mseqs[utils.regions[ireg]] = self.read_bppseqgen_output(cmdfos[ireg], n_leaves)

    assert len(reco_event.final_seqs) == 0
    for iseq in range(n_leaves):
        seq = mseqs['v'][iseq] + mseqs['d'][iseq] + mseqs['j'][iseq]  # reassemble the full mutated sequence for this leaf
        seq = reco_event.revert_conserved_codons(seq, debug=self.args.debug)  # if mutation screwed up the conserved codons, just switch 'em back to what they were to start with
        reco_event.final_seqs.append(seq)  # set final sequnce in reco_event
        reco_event.final_codon_positions.append(copy.deepcopy(reco_event.post_erosion_codon_positions))  # separate codon positions for each sequence, because of shm indels

    self.add_shm_indels(reco_event)
    reco_event.setline(irandom)  # set the line here because we use it when checking tree simulation, and want to make sure the uids are always set at the same point in the workflow
    self.check_tree_simulation(mean_total_height, regional_heights, scaled_trees, mseqs, reco_event)
    if self.args.debug:
        utils.print_reco_event(reco_event.line, extra_str=' ')
def parse_detail(self, fk, unique_id):
    """Parse one 'Details' block from the file cursor <fk> for <unique_id>.

    Fills an <info> dict according to the module-level <line_order> spec,
    rebuilds the query sequence from the per-region qr seqs and insertions,
    validates it against self.siminfo, and on success records the result in
    self.details and evaluates it with self.perfplotter.  On every early
    return the cursor is advanced to the start of the next 'Details' block
    so the caller can keep parsing.
    """
    assert fk.iline < len(fk.lines)

    # advance to the start of this Details block
    while fk.line[1] != 'Details':
        fk.increment()
        if fk.eof:
            return

    fk.increment()
    info = {}
    info['unique_id'] = unique_id
    for begin_line, column, index, required, default in line_order:
        if fk.line[0].find(begin_line) != 0:
            if required:
                print 'oop', begin_line, fk.line
                sys.exit()
            else:
                info[column] = default
                continue
        if column != '':
            info[column] = clean_value(column, fk.line[index])
            # if '[' in info[column]:
            #     print 'added', column, clean_value(column, fk.line[index])
            if column.find('_gene') == 1:  # e.g. 'v_gene' -- derive the deletion lengths from the match bounds on this line
                region = column[0]
                info[region + '_5p_del'] = int(fk.line[fk.line.index('start:') + 1]) - 1  # NOTE their indices are 1-based
                gl_length = int(fk.line[fk.line.index('gene:') + 1]) - 1
                match_end = int(fk.line[fk.line.index('end:') + 1]) - 1
                assert gl_length >= match_end
                info[region + '_3p_del'] = gl_length - match_end
        fk.increment()

    if unique_id not in self.sim_need:
        while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
            fk.increment()
        return

    info['fv_insertion'] = ''
    info['jf_insertion'] = ''
    info['seq'] = info['v_qr_seq'] + info['vd_insertion'] + info['d_qr_seq'] + info['dj_insertion'] + info['j_qr_seq']

    if '-' in info['seq']:
        print 'ERROR found a dash in %s, returning failure' % unique_id
        while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
            fk.increment()
        return

    if info['seq'] not in self.siminfo[unique_id]['seq']:  # arg. I can't do != because it tacks on v left and j right deletions
        print 'ERROR didn\'t find the right sequence for %s' % unique_id
        print ' ', info['seq']
        print ' ', self.siminfo[unique_id]['seq']
        sys.exit()

    if self.args.debug:
        print unique_id
        utils.print_reco_event(self.germline_seqs, self.siminfo[unique_id], label='true:', extra_str=' ')
        utils.print_reco_event(self.germline_seqs, info, label='inferred:', extra_str=' ')

    for region in utils.regions:
        if info[region + '_gene'] not in self.germline_seqs[region]:
            print 'ERROR %s not in germlines' % info[region + '_gene']
            assert False
        gl_seq = info[region + '_gl_seq']
        if '[' in gl_seq:  # ambiguous -- try substituting each nucleotide until the germline match is found
            for nuke in utils.nukes:
                gl_seq = gl_seq.replace('[', nuke)
                if gl_seq in self.germline_seqs[region][info[region + '_gene']]:
                    print ' replaced [ with %s' % nuke
                    break
            info[region + '_gl_seq'] = gl_seq

        if info[region + '_gl_seq'] not in self.germline_seqs[region][info[region + '_gene']]:
            print 'ERROR gl match not found for %s in %s' % (info[region + '_gene'], unique_id)
            print ' ', info[region + '_gl_seq']
            print ' ', self.germline_seqs[region][info[region + '_gene']]
            self.perfplotter.add_partial_fail(self.siminfo[unique_id], info)
            while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
                fk.increment()
            return

    self.perfplotter.evaluate(self.siminfo[unique_id], info)
    self.details[unique_id] = info
    self.sim_need.remove(unique_id)

    while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
        fk.increment()
import glutils from clusterpath import ClusterPath # read default germline info glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus='igh') print 'first parse an annotation csv file:' with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile: reader = csv.DictReader(csvfile) for line in reader: if line['v_gene'] == '': # failed (i.e. couldn't find an annotation) continue utils.process_input_line( line) # converts strings in the csv file to floats/ints/dicts/etc. utils.add_implicit_info( glfo, line ) # add stuff to <line> that's useful, isn't written to the csv since it's redundant utils.print_reco_event( line) # print ascii-art representation of the rearrangement event print '\navailable annotation info for each line (see manual for descriptions):' for key, val in line.items(): print '%20s %s' % (key, val) break print '\n\nthen parse a partition csv file:' cp = ClusterPath() cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv') cp.print_partitions(abbreviate=True)
# NOTE(review): this is a mid-function fragment -- the enclosing def and the loop
# that the trailing 'continue' belongs to are not visible here, so the true
# indentation/nesting cannot be reconstructed from this chunk alone.
outline[region + '_gene'] = utils.unsanitize_name(inferred_name)
true_name = utils.sanitize_name(inline[region + '_gene'])
inferred_group_str += inferred_name
true_group_str += true_name
# per-region status marker: 'none' (no inference), '-' (correct), 'x' (wrong)
if inferred_name == 'none':
    print ' none',
elif inferred_name == true_name:
    print ' - ',
else:
    print ' x ',
for region in utils.regions:
    print '%3d' % searcher.n_tries[region],
print ''
print ' true'
utils.print_reco_event(germlines, inline, -1, -1)
if searcher.all_matched():
    print ' inferred'
    try:
        searcher.build_inferred_seq(inline['seq'], germlines, outline)
        utils.print_reco_event(germlines, outline, -1, -1)
    except:
        # best-effort diagnostics when building/printing the inferred event fails
        print ' *something* is wrong!'
        print ' ', searcher.best_matches['v']
        print ' ', searcher.best_matches['d']
        print ' ', searcher.best_matches['j']
        continue
else:
    print 'no matches!'
    print ' ', searcher.best_matches['v']
    print ' ', searcher.best_matches['d']
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    """Convert bcr-phylo output in <outdir> into a single multi-seq partis line.

    Reads the mutated sequences from bcr-phylo's fasta, builds one per-sequence
    annotation from <naive_line> for each, merges them into <final_line>, and
    (for selection simulation) attaches kd/affinity values and the tree.
    Returns the merged <final_line>.
    """
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo
    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        # each mutated sequence gets a copy of the naive annotation with its own seq/uid swapped in
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['duplicates'] = [[]]
        reco_info[sfo['name']] = mline
        try:
            utils.add_implicit_info(glfo, mline)
        except:  # TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah it was just because something crashed in the middle of writing a .fa file
            print 'implicit info adding failed for ievent %d in %s' % (ievent, outdir)
            lines = traceback.format_exception(*sys.exc_info())
            print utils.pad_lines(''.join(lines))  # NOTE this will still crash on the next line if implicit info adding failed
    final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        kdfname, nwkfname = '%s/kd-vals.csv' % outdir, '%s/simu.nwk' % outdir
        if not utils.output_exists(args, kdfname, outlabel='kd/nwk conversion', offset=4):  # eh, don't really need to check for both kd an nwk file, chances of only one being missing are really small, and it'll just crash when it looks for it a couple lines later
            cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s --newick-tree-file %s' % (outdir, args.extrastr, kdfname, nwkfname)
            utils.run_ete_script(cmd, ete_path, debug=args.n_procs==1)
        nodefo = {}
        with open(kdfname) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd' : float(line['kd']),
                    'relative_kd' : float(line['relative_kd']),
                    'lambda' : line.get('lambda', None),
                    'target_index' : int(line['target_index']),
                }
        if len(set(nodefo) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print ' in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (set(nodefo) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
            print ' in final_line, but missing from kdvals: %s' % ' '.join(set(final_line['unique_ids']) - set(nodefo))
        final_line['affinities'] = [1. / nodefo[u]['kd'] for u in final_line['unique_ids']]
        final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']]
        final_line['lambdas'] = [nodefo[u]['lambda'] for u in final_line['unique_ids']]
        final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in final_line['unique_ids']]
        tree = treeutils.get_dendro_tree(treefname=nwkfname)
        tree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))  # normalize branch lengths by mean sequence length
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12)
        final_line['tree'] = tree.as_string(schema='newick')
    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    return final_line
def summarize_query(self, query_name, query_seq, raw_best, all_match_names, all_query_bounds, all_germline_bounds, perfplotter, warnings): if self.args.debug: print '%s' % str(query_name) best, match_names, n_matches = {}, {}, {} n_used = {'v':0, 'd':0, 'j':0} k_v_min, k_d_min = 999, 999 k_v_max, k_d_max = 0, 0 for region in utils.regions: all_match_names[region] = sorted(all_match_names[region], reverse=True) match_names[region] = [] codon_positions = {'v':-1, 'd':-1, 'j':-1} # conserved codon positions (v:cysteine, d:dummy, j:tryptophan) for region in utils.regions: n_matches[region] = len(all_match_names[region]) n_skipped = 0 for score, gene in all_match_names[region]: glbounds = all_germline_bounds[gene] qrbounds = all_query_bounds[gene] assert qrbounds[1] <= len(query_seq) # NOTE I'm putting these up avove as well (in process_query), so in time I should remove them from here assert glbounds[1] <= len(self.germline_seqs[region][gene]) assert qrbounds[0] >= 0 assert glbounds[0] >= 0 glmatchseq = self.germline_seqs[region][gene][glbounds[0]:glbounds[1]] # only use the best few matches if n_used[region] >= int(self.args.n_max_per_region[utils.regions.index(region)]): # only take the top few from each region break # only use a specified set of genes if self.args.only_genes != None and gene not in self.args.only_genes: n_skipped += 1 continue # add match to the list n_used[region] += 1 match_names[region].append(gene) self.print_match(region, gene, query_seq, score, glbounds, qrbounds, -1, warnings, skipping=False) # if the germline match and the query match aren't the same length, s-w likely added an insert, which we shouldn't get since the gap-open penalty is jacked up so high if len(glmatchseq) != len(query_seq[qrbounds[0]:qrbounds[1]]): # neurotic double check (um, I think) EDIT hey this totally saved my ass print 'ERROR %d not same length' % query_name print glmatchseq, glbounds[0], glbounds[1] print query_seq[qrbounds[0]:qrbounds[1]] assert False if region == 
'v': this_k_v = all_query_bounds[gene][1] # NOTE even if the v match doesn't start at the left hand edge of the query sequence, we still measure k_v from there. # In other words, sw doesn't tell the hmm about it k_v_min = min(this_k_v, k_v_min) k_v_max = max(this_k_v, k_v_max) if region == 'd': this_k_d = all_query_bounds[gene][1] - all_query_bounds[raw_best['v']][1] # end of d minus end of v k_d_min = min(this_k_d, k_d_min) k_d_max = max(this_k_d, k_d_max) # check consistency with best match (since the best match is excised in s-w code, and because ham is run with *one* k_v k_d set) if region not in best: best[region] = gene best[region + '_gl_seq'] = self.germline_seqs[region][gene][glbounds[0]:glbounds[1]] best[region + '_qr_seq'] = query_seq[qrbounds[0]:qrbounds[1]] best[region + '_score'] = score if self.args.debug and n_skipped > 0: print '%8s skipped %d %s genes' % ('', n_skipped, region) for region in utils.regions: if region not in best: print ' no',region,'match found for',query_name # NOTE if no d match found, we should really should just assume entire d was eroded if not self.args.is_data: print ' true:' utils.print_reco_event(self.germline_seqs, self.reco_info[query_name], extra_str=' ') return # s-w allows d and j matches to overlap... 
which makes no sense, so arbitrarily give the disputed territory to j try: self.shift_overlapping_boundaries(all_query_bounds, all_germline_bounds, query_name, query_seq, best) except AssertionError: print ' ERROR %s apportionment failed' % str(query_name) return for region in utils.regions: codon_positions[region] = utils.get_conserved_codon_position(self.cyst_positions, self.tryp_positions, region, best[region], all_germline_bounds, all_query_bounds) # position in the query sequence, that is # check for unproductive rearrangements try: # NOTE it's actually expected that this'll fail with a 'sequence too short' error, since the s-w doesn't know it's supposed to make sure the match contains the conserved codons utils.check_both_conserved_codons(query_seq, codon_positions['v'], codon_positions['j'], debug=self.args.debug, extra_str=' ') cdr3_length = codon_positions['j'] - codon_positions['v'] + 3 if cdr3_length % 3 != 0: # make sure we've stayed in frame if self.args.debug: print ' out of frame cdr3: %d %% 3 = %d' % (cdr3_length, cdr3_length % 3) assert False utils.check_for_stop_codon(query_seq, codon_positions['v'], debug=self.args.debug) except AssertionError: if self.args.debug: print ' unproductive rearrangement in waterer' if self.args.skip_unproductive: if self.args.debug: print ' ...skipping' self.n_unproductive += 1 self.info['skipped_unproductive_queries'].append(query_name) return # best k_v, k_d: k_v = all_query_bounds[best['v']][1] # end of v match k_d = all_query_bounds[best['d']][1] - all_query_bounds[best['v']][1] # end of d minus end of v if k_d_max < 5: # since the s-w step matches to the longest possible j and then excises it, this sometimes gobbles up the d, resulting in a very short d alignment. 
if self.args.debug: print ' expanding k_d' k_d_max = max(8, k_d_max) if 'IGHJ4*' in best['j'] and self.germline_seqs['d'][best['d']][-5:] == 'ACTAC': # the end of some d versions is the same as the start of some j versions, so the s-w frequently kicks out the 'wrong' alignment if self.args.debug: print ' doubly expanding k_d' if k_d_max-k_d_min < 8: k_d_min -= 5 k_d_max += 2 k_v_min = max(0, k_v_min - self.args.default_v_fuzz) # ok, so I don't *actually* want it to be zero... oh, well k_v_max += self.args.default_v_fuzz k_d_min = max(1, k_d_min - self.args.default_d_fuzz) k_d_max += self.args.default_d_fuzz assert k_v_min > 0 and k_d_min > 0 and k_v_max > 0 and k_d_max > 0 if self.args.debug: print ' k_v: %d [%d-%d)' % (k_v, k_v_min, k_v_max) print ' k_d: %d [%d-%d)' % (k_d, k_d_min, k_d_max) print ' used', for region in utils.regions: print ' %s: %d/%d' % (region, n_used[region], n_matches[region]), print '' kvals = {} kvals['v'] = {'best':k_v, 'min':k_v_min, 'max':k_v_max} kvals['d'] = {'best':k_d, 'min':k_d_min, 'max':k_d_max} self.add_to_info(query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions=codon_positions, perfplotter=perfplotter)
# Chimera analysis (script-level): compute per-annotation max absolute mutation-rate
# difference via utils.get_chimera_max_abs_diff, print the worst offenders, report the
# fraction above the cutoff, and fill histograms for plotting.
# NOTE(review): hmaxval/himax may be used further down the script, past this chunk.
chfo = {uid: {k: v for k, v in zip(('imax', 'max_abs_diff'), utils.get_chimera_max_abs_diff(annotations[uid], iseq=0, chunk_len=args.chunk_len))} for uid in annotations}
# sort uids by descending max_abs_diff and print the five most chimera-like annotations
biggest_adiffs = sorted(chfo, key=lambda q: chfo[q]['max_abs_diff'], reverse=True)
for uid in biggest_adiffs[:5]:
    print '%-3d %6.3f' % (chfo[uid]['imax'], chfo[uid]['max_abs_diff'])
    utils.print_reco_event(annotations[uid])

n_above_cutoff = len([_ for cfo in chfo.values() if cfo['max_abs_diff'] > args.cutoff])
chimeric_fraction = n_above_cutoff / float(len(chfo))  # float() needed under python 2 integer division
print ' %d / %d = %.3f above chimeric cutoff' % (n_above_cutoff, len(chfo), chimeric_fraction)

# histograms of the max-abs-diff values and of the positions at which they occur
hmaxval = Hist(45, 0., 0.65)
for uid in annotations:
    hmaxval.fill(chfo[uid]['max_abs_diff'])
himax = Hist(75, 0., 400)
for uid in annotations:
    himax.fill(chfo[uid]['imax'])

utils.prep_dir(args.plotdir, wildlings=['*.svg', '*.csv'])
def __init__(self, args): self.args = args self.germline_seqs = utils.read_germlines(self.args.datadir) perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'imgt') # get sequence info that was passed to imgt self.seqinfo = {} with opener('r')(self.args.simfname) as simfile: reader = csv.DictReader(simfile) iline = 0 for line in reader: if self.args.queries != None and line[ 'unique_id'] not in self.args.queries: continue if len(re.findall('_[FP]', line['j_gene'])) > 0: line['j_gene'] = line['j_gene'].replace( re.findall('_[FP]', line['j_gene'])[0], '') self.seqinfo[line['unique_id']] = line iline += 1 if self.args.n_queries > 0 and iline >= self.args.n_queries: break paragraphs, csv_info = None, None if self.args.infname != None and '.html' in self.args.infname: print 'reading', self.args.infname with opener('r')(self.args.infname) as infile: soup = BeautifulSoup(infile) paragraphs = soup.find_all('pre') summarydir = self.args.indir[:self.args.indir.rfind( '/' )] # one directoy up from <indir>, which has the detailed per-sequence files summary_fname = glob.glob(summarydir + '/1_Summary_*.txt') assert len(summary_fname) == 1 summary_fname = summary_fname[0] get_genes_to_skip(summary_fname, self.germline_seqs) n_failed, n_skipped, n_total, n_not_found, n_found = 0, 0, 0, 0, 0 for unique_id in self.seqinfo: if self.args.debug: print unique_id, imgtinfo = [] # print 'true' # utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id]) if self.args.infname != None and '.html' in self.args.infname: for pre in paragraphs: # NOTE this loops over everything an awful lot of times. 
Shouldn't really matter for now, though if unique_id in pre.text: imgtinfo.append(pre.text) else: n_total += 1 assert self.args.infname == None infnames = glob.glob(self.args.indir + '/' + unique_id + '*') assert len(infnames) <= 1 if len(infnames) != 1: if self.args.debug: print ' couldn\'t find it' n_not_found += 1 continue n_found += 1 with opener('r')(infnames[0]) as infile: full_text = infile.read() if len( re.findall('[123]. Alignment for [VDJ]-GENE', full_text)) < 3: failregions = re.findall( 'No [VDJ]-GENE has been identified', full_text) if self.args.debug and len(failregions) > 0: print ' ', failregions n_failed += 1 continue # loop over the paragraphs I want position = full_text.find(unique_id) # don't need this one for ir in range(4): position = full_text.find(unique_id, position + 1) pgraph = full_text[position:full_text. find('\n\n', position + 1)] if 'insertion(s) and/or deletion(s) which are not dealt in this release' in pgraph: ir -= 1 continue imgtinfo.append(pgraph) # query seq paragraph if len(imgtinfo) == 0: print '%s no info' % unique_id continue else: if self.args.debug: print '' line = self.parse_query_text(unique_id, imgtinfo) if 'skip_gene' in line: # assert self.args.skip_missing_genes n_skipped += 1 continue try: assert 'failed' not in line joinparser.add_insertions(line, debug=self.args.debug) joinparser.resolve_overlapping_matches( line, debug=False, germlines=self.germline_seqs) except (AssertionError, KeyError): print ' giving up' n_failed += 1 perfplotter.add_partial_fail(self.seqinfo[unique_id], line) # print ' perfplotter: not sure what to do with a fail' continue perfplotter.evaluate(self.seqinfo[unique_id], line) if self.args.debug: utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id], label='true:') utils.print_reco_event(self.germline_seqs, line, label='inferred:') perfplotter.plot() print 'failed: %d / %d = %f' % (n_failed, n_total, float(n_failed) / n_total) print 'skipped: %d / %d = %f' % (n_skipped, n_total, 
float(n_skipped) / n_total) print ' ', for g, n in genes_actually_skipped.items(): print ' %d %s' % (n, utils.color_gene(g)) print '' if n_not_found > 0: print ' not found: %d / %d = %f' % (n_not_found, n_not_found + n_found, n_not_found / float(n_not_found + n_found))
def add_to_info(self, query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions):
    """Record the best S-W annotation for <query_name> in self.info.

    Fills in k_v/k_d bounds, CDR3 length and conserved-codon positions,
    per-region erosion lengths, insertion sequences, and the best-match
    gene/germline/query sequences, then updates the parameter counters and
    performance plotter.  Raises if <query_name> was already added or if a
    codon position falls outside the query sequence.
    # NOTE(review): assumes kvals has 'v'/'d' keys and match_names/best are
    # keyed by region -- confirm against the caller's construction.
    """
    assert query_name not in self.info
    self.info['queries'].append(query_name)
    self.info[query_name] = {}
    self.info[query_name]['unique_id'] = query_name  # redundant, but used somewhere down the line
    self.info[query_name]['k_v'] = kvals['v']
    self.info[query_name]['k_d'] = kvals['d']
    self.info[query_name]['all'] = ':'.join(match_names['v'] + match_names['d'] + match_names['j'])

    # codon positions come straight from the caller; -1 would mean "not found"
    # assert codon_positions['v'] != -1
    # assert codon_positions['j'] != -1
    self.info[query_name]['cdr3_length'] = codon_positions['j'] - codon_positions['v'] + 3  #tryp_position_in_joined_seq - self.cyst_position + 3
    self.info[query_name]['cyst_position'] = codon_positions['v']
    self.info[query_name]['tryp_position'] = codon_positions['j']
    # sanity-check that both conserved codons lie within the query sequence
    if self.info[query_name]['cyst_position'] < 0 or self.info[query_name]['cyst_position'] >= len(query_seq):
        raise Exception('cpos %d invalid for %s (%s)' % (self.info[query_name]['cyst_position'], query_name, query_seq))
    if self.info[query_name]['tryp_position'] < 0 or self.info[query_name]['tryp_position'] >= len(query_seq):
        raise Exception('tpos %d invalid for %s (%s)' % (self.info[query_name]['tryp_position'], query_name, query_seq))

    # erosion, insertion, mutation info for best match
    self.info[query_name]['v_5p_del'] = all_germline_bounds[best['v']][0]
    self.info[query_name]['v_3p_del'] = len(self.germline_seqs['v'][best['v']]) - all_germline_bounds[best['v']][1]  # len(germline v) - gl_match_end
    self.info[query_name]['d_5p_del'] = all_germline_bounds[best['d']][0]
    self.info[query_name]['d_3p_del'] = len(self.germline_seqs['d'][best['d']]) - all_germline_bounds[best['d']][1]
    self.info[query_name]['j_5p_del'] = all_germline_bounds[best['j']][0]
    self.info[query_name]['j_3p_del'] = len(self.germline_seqs['j'][best['j']]) - all_germline_bounds[best['j']][1]

    # insertions are whatever query sequence lies between (or outside) the region matches
    self.info[query_name]['fv_insertion'] = query_seq[:all_query_bounds[best['v']][0]]
    self.info[query_name]['vd_insertion'] = query_seq[all_query_bounds[best['v']][1]:all_query_bounds[best['d']][0]]
    self.info[query_name]['dj_insertion'] = query_seq[all_query_bounds[best['d']][1]:all_query_bounds[best['j']][0]]
    self.info[query_name]['jf_insertion'] = query_seq[all_query_bounds[best['j']][1]:]

    for region in utils.regions:
        self.info[query_name][region + '_gene'] = best[region]
        self.info[query_name][region + '_gl_seq'] = best[region + '_gl_seq']
        self.info[query_name][region + '_qr_seq'] = best[region + '_qr_seq']
        self.info['all_best_matches'].add(best[region])

    self.info[query_name]['seq'] = query_seq  # NOTE this is the seq output by vdjalign, i.e. if we reversed any indels it is the reversed sequence

    if self.debug:
        if not self.args.is_data:  # on simulation we can also print the true event
            utils.print_reco_event(self.germline_seqs, self.reco_info[query_name], extra_str=' ', label='true:', indelfo=self.reco_info[query_name]['indels'])
        utils.print_reco_event(self.germline_seqs, self.info[query_name], extra_str=' ', label='inferred:', indelfo=self.info['indels'].get(query_name, None))

    # update parameter counters (inferred and, if present, true) and performance plots
    if self.pcounter is not None:
        self.pcounter.increment_reco_params(self.info[query_name])
        self.pcounter.increment_mutation_params(self.info[query_name])
    if self.true_pcounter is not None:
        self.true_pcounter.increment_reco_params(self.reco_info[query_name])
        self.true_pcounter.increment_mutation_params(self.reco_info[query_name])
    if self.perfplotter is not None:
        self.perfplotter.evaluate(self.reco_info[query_name], self.info[query_name])  #, subtract_unphysical_erosions=True)

    self.remaining_queries.remove(query_name)
def process_query(self, qr_info, query_name, query_lines): # split query_lines up into blocks blocks = [] for line in query_lines: if line.find('Query_') == 0: blocks.append([]) if len(line) == 0: continue if len(re.findall('<a name=#_[0-9][0-9]*_IGH', line)) == 0 and line.find('Query_') != 0: continue if len(blocks) == 0: print 'wtf? %s' % query_name # it's probably kicking a reverse match self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info) # NOTE that's really a total failure self.n_partially_failed += 1 return blocks[-1].append(line) # then process each block for block in blocks: self.process_single_block(block, query_name, qr_info) if 'skip_gene' in qr_info: self.n_skipped += 1 return if 'fail' in qr_info: self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info) self.n_partially_failed += 1 return for region in utils.regions: if region + '_gene' not in qr_info: print ' %d: no %s match' % (query_name, region) self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info) self.n_partially_failed += 1 return # expand v match to left end and j match to right end qr_info['v_5p_del'] = 0 qr_info['fv_insertion'] = '' if qr_info['match_start'] > 0: if self.args.debug: print ' add to v left:', self.seqinfo[query_name]['seq'][ : qr_info['match_start']] qr_info['seq'] = self.seqinfo[query_name]['seq'][ : qr_info['match_start']] + qr_info['seq'] qr_info['j_3p_del'] = 0 qr_info['jf_insertion'] = '' if len(self.seqinfo[query_name]['seq']) > qr_info['match_end']: if self.args.debug: print ' add to j right:', self.seqinfo[query_name]['seq'][ qr_info['match_end'] - len(self.seqinfo[query_name]['seq']) : ] qr_info['seq'] = qr_info['seq'] + self.seqinfo[query_name]['seq'][ qr_info['match_end'] - len(self.seqinfo[query_name]['seq']) : ] for boundary in utils.boundaries: start = qr_info[boundary[0] + '_qr_bounds'][1] end = qr_info[boundary[1] + '_qr_bounds'][0] qr_info[boundary + '_insertion'] = qr_info['seq'][start : end] for region in 
utils.regions: start = qr_info[region + '_qr_bounds'][0] end = qr_info[region + '_qr_bounds'][1] qr_info[region + '_qr_seq'] = qr_info['seq'][start : end] try: resolve_overlapping_matches(qr_info, self.args.debug, self.germline_seqs) except AssertionError: print ' %s: apportionment failed' % query_name self.perfplotter.add_partial_fail(self.seqinfo[query_name], qr_info) self.n_partially_failed += 1 return if self.args.debug: print ' query seq:', qr_info['seq'] for region in utils.regions: true_gene = self.seqinfo[query_name][region + '_gene'] infer_gene = qr_info[region + '_gene'] if utils.are_alleles(infer_gene, true_gene): regionstr = utils.color('bold', utils.color('blue', region)) truestr = '' #'(originally %s)' % match_name else: regionstr = utils.color('bold', utils.color('red', region)) truestr = '(true: %s)' % utils.color_gene(true_gene).replace(region, '') # print ' %s %s %s' % (regionstr, utils.color_gene(infer_gene).replace(region, ''), truestr) print ' %s %3d %3d %s %s %s' % (regionstr, qr_info[region + '_qr_bounds'][0], qr_info[region + '_qr_bounds'][1], utils.color_gene(infer_gene).replace(region, ''), truestr, qr_info[region + '_gl_seq']) for boundary in utils.boundaries: start = qr_info[boundary[0] + '_qr_bounds'][1] end = qr_info[boundary[1] + '_qr_bounds'][0] qr_info[boundary + '_insertion'] = qr_info['seq'][start : end] if self.args.debug: print ' ', boundary, qr_info[boundary + '_insertion'] self.perfplotter.evaluate(self.seqinfo[query_name], qr_info) # for key, val in qr_info.items(): # print key, val if self.args.debug: utils.print_reco_event(self.germline_seqs, self.seqinfo[query_name], label='true:', extra_str=' ') utils.print_reco_event(self.germline_seqs, qr_info, extra_str=' ')
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    """Build a multi-sequence annotation line from bcr-phylo simulation output.

    Reads the mutated sequences from bcr-phylo's fasta, makes one per-sequence
    annotation from the (single-sequence) <naive_line>, synthesizes them into
    one multi-seq line, and -- for selection-mode runs -- attaches affinities,
    lambdas, and nearest-target indices read from the kd-values csv.  Also
    attaches the (rescaled) newick tree and the target sequences.
    Uses module-level <args> and <ete_path>.
    """
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo
    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        # start from the naive event, then swap in this leaf's id and sequence
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['duplicates'] = [[]]
        reco_info[sfo['name']] = mline
        utils.add_implicit_info(glfo, mline)
    final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (outdir, args.extrastr, outdir, outdir)
        utils.run_ete_script(cmd, ete_path)
        nodefo = {}
        with open('%s/kd-vals.csv' % outdir) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd': float(line['kd']),
                    'relative_kd': float(line['relative_kd']),
                    'lambda': line.get('lambda', None),
                    'target_index': int(line['target_index']),
                }
        if len(set(nodefo) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print ' in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (set(nodefo) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
            print ' in final_line, but missing from kdvals: %s' % ' '.join(set(final_line['unique_ids']) - set(nodefo))
        # affinity is taken as 1/kd
        final_line['affinities'] = [1. / nodefo[u]['kd'] for u in final_line['unique_ids']]
        final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']]
        final_line['lambdas'] = [nodefo[u]['lambda'] for u in final_line['unique_ids']]
        final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in final_line['unique_ids']]

    tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
    # rescale branch lengths to per-base units (mean sequence length)
    tree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))
    if args.debug:
        print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12)
    final_line['tree'] = tree.as_string(schema='newick')

    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    return final_line
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    """Build a multi-sequence annotation line from bcr-phylo simulation output.

    Older variant: reads mutated sequences from '<outdir>/<extrastr>.fasta',
    synthesizes a multi-seq line from per-sequence copies of <naive_line>,
    attaches affinities (1/kd) for selection-mode runs, the newick tree, the
    target sequences, and the amino-acid-nearest target index per sequence.
    Uses module-level <args> and <ete_path>.
    """
    seqfos = utils.read_fastx('%s/%s.fasta' % (outdir, args.extrastr))  # output mutated sequences from bcr-phylo
    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        # start from the naive event, then swap in this leaf's id and sequence
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        reco_info[sfo['name']] = mline
        utils.add_implicit_info(glfo, mline)
    final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = 'export PATH=%s:$PATH && xvfb-run -a python ./bin/view-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (ete_path, outdir, args.extrastr, outdir, outdir)
        utils.simplerun(cmd, shell=True)
        kdvals = {}
        with open('%s/kd-vals.csv' % outdir) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                kdvals[line['uid']] = float(line['kd'])
        if len(set(kdvals) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print ' in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (set(kdvals) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(kdvals)) > 0:
            print ' in final_line, but missing from kdvals: %s' % ' '.join(set(final_line['unique_ids']) - set(kdvals))
        # affinity is taken as 1/kd
        final_line['affinities'] = [1. / kdvals[u] for u in final_line['unique_ids']]

    tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
    if args.debug:
        print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12)
    final_line['tree'] = tree.as_string(schema='newick')

    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    # for each mutated sequence, find the target that's nearest in amino-acid hamming distance
    from Bio.Seq import Seq
    final_line['nearest_target_indices'] = []
    aa_targets = [Seq(seq).translate() for seq in final_line['target_seqs']]
    for mseq in final_line['input_seqs']:
        aa_mseq = Seq(mseq).translate()
        aa_hdists = [utils.hamming_distance(aa_t, aa_mseq, amino_acid=True) for aa_t in aa_targets]
        imin = aa_hdists.index(min(aa_hdists))  # NOTE doesn't do anything differently if there's more than one min
        final_line['nearest_target_indices'].append(imin)

    return final_line
#!/usr/bin/env python import csv import sys partis_path = '.' # edit this if you're not running from the main partis dir sys.path.insert(1, partis_path + '/python') import utils import glutils from clusterpath import ClusterPath # read default germline info glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus='igh') print 'first parse an annotation csv file:' with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile: reader = csv.DictReader(csvfile) for line in reader: if line['v_gene'] == '': # failed (i.e. couldn't find an annotation) continue utils.process_input_line(line) utils.add_implicit_info(glfo, line) utils.print_reco_event(line) break print 'then parse a partition csv file:' cp = ClusterPath() cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv') cp.print_partitions(abbreviate=True)
def __init__(self, args):
    """Read simulation truth and IMGT output, evaluate per-query annotation performance.

    Reads the simulated-truth csv (<args.simfname>), then for each unique id
    either pulls the relevant <pre> paragraphs from a single IMGT html file
    (<args.infname>) or reads the per-sequence text file from <args.indir>,
    parses the annotation, and feeds true-vs-inferred pairs to the
    performance plotter.  Prints failure/skip/not-found summaries at the end.
    """
    self.args = args
    self.germline_seqs = utils.read_germlines(self.args.datadir)
    perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'imgt')

    # get sequence info that was passed to imgt
    self.seqinfo = {}
    with opener('r')(self.args.simfname) as simfile:
        reader = csv.DictReader(simfile)
        iline = 0
        for line in reader:
            if self.args.queries != None and line['unique_id'] not in self.args.queries:
                continue
            # strip any functionality suffix (e.g. '_F', '_P') off the j gene name
            if len(re.findall('_[FP]', line['j_gene'])) > 0:
                line['j_gene'] = line['j_gene'].replace(re.findall('_[FP]', line['j_gene'])[0], '')
            self.seqinfo[line['unique_id']] = line
            iline += 1
            if self.args.n_queries > 0 and iline >= self.args.n_queries:
                break

    paragraphs, csv_info = None, None
    if self.args.infname != None and '.html' in self.args.infname:
        print 'reading', self.args.infname
        with opener('r')(self.args.infname) as infile:
            soup = BeautifulSoup(infile)
            paragraphs = soup.find_all('pre')

    summarydir = self.args.indir[ : self.args.indir.rfind('/')]  # one directoy up from <indir>, which has the detailed per-sequence files
    summary_fname = glob.glob(summarydir + '/1_Summary_*.txt')
    assert len(summary_fname) == 1
    summary_fname = summary_fname[0]
    get_genes_to_skip(summary_fname, self.germline_seqs)

    n_failed, n_skipped, n_total, n_not_found, n_found = 0, 0, 0, 0, 0
    for unique_id in self.seqinfo:
        if self.args.debug:
            print unique_id,
        imgtinfo = []
        # print 'true'
        # utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id])
        if self.args.infname != None and '.html' in self.args.infname:
            # collect every <pre> paragraph that mentions this id
            for pre in paragraphs:  # NOTE this loops over everything an awful lot of times. Shouldn't really matter for now, though
                if unique_id in pre.text:
                    imgtinfo.append(pre.text)
        else:
            n_total += 1
            assert self.args.infname == None
            infnames = glob.glob(self.args.indir + '/' + unique_id + '*')
            assert len(infnames) <= 1
            if len(infnames) != 1:
                if self.args.debug:
                    print ' couldn\'t find it'
                n_not_found += 1
                continue
            n_found += 1
            with opener('r')(infnames[0]) as infile:
                full_text = infile.read()
                # require alignments for all three of v, d, and j
                if len(re.findall('[123]. Alignment for [VDJ]-GENE', full_text)) < 3:
                    failregions = re.findall('No [VDJ]-GENE has been identified', full_text)
                    if self.args.debug and len(failregions) > 0:
                        print ' ', failregions
                    n_failed += 1
                    continue

                # loop over the paragraphs I want
                position = full_text.find(unique_id)  # don't need this one
                for ir in range(4):
                    position = full_text.find(unique_id, position + 1)
                    pgraph = full_text[position:full_text.find('\n\n', position + 1)]
                    if 'insertion(s) and/or deletion(s) which are not dealt in this release' in pgraph:
                        # NOTE(review): decrementing the loop variable of a 'for ... in range()' loop
                        # has no effect in Python -- this does NOT get an extra iteration, it just
                        # skips this paragraph. Presumably an extra iteration was intended; confirm.
                        ir -= 1
                        continue
                    imgtinfo.append(pgraph)  # query seq paragraph

        if len(imgtinfo) == 0:
            print '%s no info' % unique_id
            continue
        else:
            if self.args.debug:
                print ''
        line = self.parse_query_text(unique_id, imgtinfo)
        if 'skip_gene' in line:
            # assert self.args.skip_missing_genes
            n_skipped += 1
            continue
        try:
            assert 'failed' not in line
            joinparser.add_insertions(line, debug=self.args.debug)
            joinparser.resolve_overlapping_matches(line, debug=False, germlines=self.germline_seqs)
        except (AssertionError, KeyError):
            print ' giving up'
            n_failed += 1
            perfplotter.add_partial_fail(self.seqinfo[unique_id], line)
            # print ' perfplotter: not sure what to do with a fail'
            continue
        perfplotter.evaluate(self.seqinfo[unique_id], line)
        if self.args.debug:
            utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id], label='true:')
            utils.print_reco_event(self.germline_seqs, line, label='inferred:')

    perfplotter.plot()
    print 'failed: %d / %d = %f' % (n_failed, n_total, float(n_failed) / n_total)
    print 'skipped: %d / %d = %f' % (n_skipped, n_total, float(n_skipped) / n_total)
    print ' ',
    for g, n in genes_actually_skipped.items():
        print ' %d %s' % (n, utils.color_gene(g))
    print ''
    if n_not_found > 0:
        print ' not found: %d / %d = %f' % (n_not_found, n_not_found + n_found, n_not_found / float(n_not_found + n_found))
default=partis_dir + '/test/reference-results/partition-ref-simu.yaml') parser.add_argument('--glfo-dir', default=partis_dir + '/data/germlines/human') parser.add_argument('--locus', default='igh') args = parser.parse_args() glfo = None if utils.getsuffix(args.fname) == '.csv': print ' reading deprecated csv format, so need to read germline info from somewhere else, using --glfo-dir %s, hopefully it works' % args.glfo_dir glfo = glutils.read_glfo(args.glfo_dir, locus=args.locus) glfo, annotation_list, cpath = utils.read_output(args.fname, glfo=glfo) if cpath is None or len(cpath.partitions) == 0: print 'no partitions read from %s, so just printing first annotation:' % args.fname utils.print_reco_event(annotation_list[0]) sys.exit(0) print utils.color('green', 'list of partitions:') cpath.print_partitions( abbreviate=True ) # 'abbreviate' print little 'o's instead of the full sequence ids # print annotations for the biggest cluster in the most likely partition annotations = { ':'.join(adict['unique_ids']): adict for adict in annotation_list } # collect the annotations in a dictionary so they're easier to access most_likely_partition = cpath.partitions[ cpath. i_best] # a partition is represented as a list of lists of strings, with each string a sequence id
def get_mature_line(sfos, naive_line, glfo, nodefo, dtree, target_sfos, locus=None): assert len( naive_line['unique_ids'] ) == 1 # enforces that we ran naive-only, 1-leaf partis simulation above assert not indelutils.has_indels( naive_line['indelfos'][0]) # would have to handle this below if args.debug: utils.print_reco_event(naive_line) reco_info = collections.OrderedDict() for sfo in sfos: mline = utils.get_non_implicit_copy(naive_line) del mline['tree'] mline['unique_ids'] = [sfo['name']] mline['seqs'] = [sfo['seq']] mline['input_seqs'] = [ sfo['seq'] ] # it's really important to set both the seqs (since they're both already in there from the naive line) mline['duplicates'] = [[]] reco_info[sfo['name']] = mline try: utils.add_implicit_info(glfo, mline) except: # TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah it was just because something crashed in the middle of writing a .fa file print 'implicit info adding failed for ievent %d in %s' % ( ievent, outdir) lines = traceback.format_exception(*sys.exc_info()) print utils.pad_lines( ''.join(lines) ) # NOTE this will still crash on the next line if implicit info adding failed final_line = utils.synthesize_multi_seq_line_from_reco_info( [sfo['name'] for sfo in sfos], reco_info) ftree = copy.deepcopy(dtree) if locus is not None: def ltr(u): return u + '-' + locus new_nodefo = {} for u_old in nodefo: new_nodefo[ltr(u_old)] = nodefo[u_old] nodefo = new_nodefo treeutils.translate_labels(ftree, [(u, ltr(u)) for u in final_line['unique_ids']]) final_line['unique_ids'] = [ ltr(u) for u in final_line['unique_ids'] ] assert len(sfos) == len(final_line['unique_ids']) for iseq, sfo in enumerate(sfos): naive_id = naive_line['unique_ids'][0] assert naive_id.count('-') == 1 bstr = naive_id.replace('-' + locus, '') pids = final_line['paired-uids'][iseq] assert len(pids) == 
1 and pids[0].find( bstr ) == 0 and pids[0].count('-') == 1 and pids[0].split( '-' )[1] in utils.loci # if uid is xxx-igh, paired id shoud be e.g. xxx-igk final_line['paired-uids'][iseq] = [ p.replace(bstr, sfo['name']) for p in pids ] if args.debug: utils.print_reco_event(final_line) # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read) if len( set(nodefo) - set(final_line['unique_ids']) ) > 0: # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes print ' in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % ( set(nodefo) - set(final_line['unique_ids'])) if len(set(final_line['unique_ids']) - set(nodefo)) > 0: print ' in final_line, but missing from kdvals: %s' % ' '.join( set(final_line['unique_ids']) - set(nodefo)) final_line['affinities'] = [ 1. / nodefo[u]['kd'] for u in final_line['unique_ids'] ] final_line['relative_affinities'] = [ 1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids'] ] final_line['lambdas'] = [ nodefo[u]['lambda'] for u in final_line['unique_ids'] ] final_line['nearest_target_indices'] = [ nodefo[u]['target_index'] for u in final_line['unique_ids'] ] ftree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']])) if args.debug: print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=ftree), padwidth=12) final_line['tree'] = ftree.as_string(schema='newick') tmp_event = RecombinationEvent( glfo ) # I don't want to move the function out of event.py right now tmp_event.set_reco_id( final_line, irandom=ievent ) # not sure that setting <irandom> here actually does anything final_line['target_seqs'] = [tfo['seq'] for tfo in target_sfos] return final_line
def clean_pair_info(cpaths, antn_lists, n_max_clusters=3, debug=False):
    """Clean up pairing info across loci: group uids into droplet/pid groups,
    remove duplicate and unproductive chains from over-full groups, then
    rewrite each annotation's 'paired-uids' and print a summary of which
    families are paired with which.

    <cpaths>/<antn_lists> are per-locus cluster paths and annotation lists;
    only the first <n_max_clusters> (largest) clusters per locus get the
    detailed printout.
    """
    # ----------------------------------------------------------------------------------------
    def check_droplet_id_groups(tdbg=False):
        # check against the droplet id method (we could just do it this way, but it would only work for 10x, and only until they change their naming convention)
        pgroup_strs = set(':'.join(sorted(pg)) for pg in pid_groups)
        all_uids = list(set([su for l in cpaths for c in cpaths[l].best() for u in c for su in [u] + utils.per_seq_val(all_antns[u], 'paired-uids', u)]))
        n_not_found = 0
        for dropid, drop_queries in itertools.groupby(sorted(all_uids, key=utils.get_droplet_id), key=utils.get_droplet_id):
            dqlist = list(drop_queries)
            found = ':'.join(sorted(dqlist)) in pgroup_strs
            if not found:
                overlaps = [g for g in pgroup_strs if dropid in g]
                overlaps = utils.get_single_entry(overlaps)
                n_not_found += 1
            if tdbg or not found:
                print ' %25s %s %s %s' % (utils.color('green', '-') if found else utils.color('red', 'x'), dropid, ' '.join(sorted(utils.get_contig_id(q) for q in dqlist)), utils.color('red', ' '.join(sorted(utils.get_contig_id(q) for q in overlaps.split(':'))) if not found else ''))
        if n_not_found > 0:
            print ' %s droplet id group check failed for %d groups' % (utils.color('red', 'error'), n_not_found)
    # ----------------------------------------------------------------------------------------
    def getloc(uid):
        # locus of <uid>, or '?' if we have no annotation for it
        if uid not in all_antns:
            return '?'
        return utils.per_seq_val(all_antns[uid], 'loci', uid)
    # ----------------------------------------------------------------------------------------
    def gval(uid, key):  # get per-seq val for <uid>
        if uid not in all_antns:
            return None
        return utils.per_seq_val(all_antns[uid], key, uid)
    # ----------------------------------------------------------------------------------------
    def lgstr(lgroup, sort=True):
        # colored locus-string summary for a group of uids
        return ' '.join(utils.locstr(l) for l in (sorted if sort else utils.pass_fcn)([getloc(u) for u in lgroup]))
    # ----------------------------------------------------------------------------------------
    def choose_seqs_to_remove(chain_ids, max_hdist=4, tdbg=False):  # choose one of <chain_ids> to eliminate
        # look for pairs with the same locus that
        ids_to_remove = set(u for u in chain_ids if getloc(u) == '?')
        if tdbg and len(ids_to_remove) > 0:  # i think this actually can't happen a.t.m. TODO maybe remove it
            print ' removed %d with missing annotations' % len(ids_to_remove)

        # same-locus, same-length pairs within <max_hdist> are treated as equivalent: drop the more-ambiguous one
        dbgstr = []
        n_equivalent = 0
        for tpair in itertools.combinations(chain_ids, 2):
            if len(set(getloc(u) for u in tpair)) > 1:
                continue
            if len(set(len(gval(u, 'seqs')) for u in tpair)) > 1:
                continue
            hdist = utils.hamming_distance(*[gval(u, 'seqs') for u in tpair])
            if tdbg:
                dbgstr.append(utils.color('blue' if hdist == 0 else 'yellow', '%d' % hdist))
            if hdist <= max_hdist:  # TODO would be nice to be able to combine their sequences, but I think propagating the resulting annotation modifications would be hard
                # print ' identical sequence overlap, choosing longer one'
                better_id, worse_id = sorted(tpair, key=lambda q: utils.ambig_frac(gval(q, 'seqs')))  # TODO if we're tossing one with hdist > 0, maybe should take the lower-shm one if they're the same length?
                ids_to_remove.add(worse_id)
                n_equivalent += 1
        if tdbg and len(dbgstr) > 0:
            print ' %d pair%s equivalent with hdists %s' % (n_equivalent, utils.plural(n_equivalent), ' '.join(dbgstr))

        # remove unproductive
        dbgstr = []
        unproductive_ids = []
        for uid in chain_ids:
            if not utils.is_functional(all_antns[uid], all_antns[uid]['unique_ids'].index(uid)):
                unproductive_ids.append(uid)
                if tdbg:
                    dbgstr.append(utils.is_functional_dbg_str(all_antns[uid], all_antns[uid]['unique_ids'].index(uid), sep='+'))
        # unproductive_ids = [u for u in chain_ids if not utils.is_functional(all_antns[u], all_antns[u]['unique_ids'].index(u))]  # this way is only one line, which may or may not be nicer
        if tdbg and len(unproductive_ids) > 0:
            print ' %d unproductive %s' % (len(unproductive_ids), ', '.join(dbgstr))
        ids_to_remove |= set(unproductive_ids)

        return ids_to_remove
    # ----------------------------------------------------------------------------------------
    antn_dicts = {l: utils.get_annotation_dict(antn_lists[l]) for l in antn_lists}

    # first make a map from each uid (for all loci) to its annotation
    pid_groups = []  # list of pid groups, i.e. each element is the uids from a single droplet (for 10x)
    pid_ids = {}  # map from each uid to the index of its pid group
    all_antns = {}
    if debug:
        print ' %s consolidating info for %d loci with cluster/sequence counts: %s' % (utils.color('blue', '+'.join(cpaths)), len(cpaths), ' '.join('%s: %d/%d' % (l, len(cpaths[l].best()), sum(len(c) for c in cpaths[l].best())) for l in sorted(cpaths)))
    for ltmp in sorted(cpaths):
        for cluster in cpaths[ltmp].best():
            cline = antn_dicts[ltmp][':'.join(cluster)]
            if 'paired-uids' not in cline:
                print ' %s no paired-uids in line' % utils.color('yellow', 'warning')
                continue  # maybe should still add to all_antns?
            for uid, pids in zip(cline['unique_ids'], cline['paired-uids']):
                # merge this uid + its pids into an existing pid group, or start a new one
                pset = set([uid] + pids)
                found = False
                for ipg, pgroup in enumerate(pid_groups):
                    if any(p in pgroup for p in pset):  # TODO should maybe check for consistency if some of them are already in there (i.e. from reciprocal info in another chain)?
                        found = True
                        pgroup |= pset
                        break
                if not found:
                    pid_groups.append(pset)
                    ipg = len(pid_groups) - 1
                assert ipg is not None
                for pid in pset:
                    pid_ids[pid] = ipg

            cline['loci'] = [ltmp for _ in cline['unique_ids']]  # TODO maybe should add this somewhere else, like in partitiondriver? (eh, maybe not? the locus is always available in each file from the germline info anyway)
            for uid in cline['unique_ids']:
                all_antns[uid] = cline
    # for ipg, pg in enumerate(pid_groups):
    #     print ' %3d %s' % (ipg, ' '.join(pg))

    check_droplet_id_groups()
    # TODO handle/keep better track of failures

    # then go through each group and try to figure out which seqs are real
    print ' cleaning %d pid groups:' % len(pid_groups)
    n_ok = {}
    for ipg, pgroup in enumerate(pid_groups):
        pgroup = [u for u in pgroup if getloc(u) != '?']  # TODO figure out what to do with missing ones
        # print ' %s' % lgstr(pgroup),
        hids = [u for u in pgroup if utils.has_d_gene(getloc(u))]  # heavy-chain ids
        lids = [u for u in pgroup if u not in hids]  # light-chain ids
        if len(hids) < 2 and len(lids) < 2:
            # at most one of each chain: nothing to clean
            # print ' both ok'
            if lgstr(pgroup) not in n_ok:
                n_ok[lgstr(pgroup)] = 0
            n_ok[lgstr(pgroup)] += 1
            pid_groups[ipg] = pgroup
            continue
        if debug:
            print ' %s' % lgstr(pgroup),
        for chain, idlist in zip(utils.chains, [hids, lids]):
            if len(idlist) < 2:
                continue
            if debug:
                print '\n too many %s chains: %s' % (chain, lgstr(idlist))
            ids_to_remove = choose_seqs_to_remove(idlist)
            for rid in ids_to_remove:
                pgroup.remove(rid)
                idlist.remove(rid)
            if debug:
                print ' %s: removed %d, leaving %d' % (utils.color('green', 'fixed') if len(idlist) == 1 else utils.color('red', 'nope'), len(ids_to_remove), len(idlist))
                if len(idlist) > 1:
                    for uid in idlist:
                        prutils.print_seq_in_reco_event(all_antns[uid], all_antns[uid]['unique_ids'].index(uid), one_line=True, extra_str=' ', uid_extra_str=utils.locstr(getloc(uid)))
        pid_groups[ipg] = pgroup

    print ' N ok:'
    for lstr, count in sorted(n_ok.items(), key=operator.itemgetter(1), reverse=True):
        print ' %3d %s' % (count, lstr)

    for ltmp in sorted(cpaths):
        print '%s' % utils.color('green', ltmp)
        cpaths[ltmp].print_partitions()
        for iclust, cluster in enumerate(sorted(cpaths[ltmp].best(), key=len, reverse=True)):
            cline = antn_dicts[ltmp][':'.join(cluster)]
            # before_strs = [lgstr(pids) for pids in cline['paired-uids']]
            # rewrite paired-uids from the cleaned pid groups (excluding the uid itself)
            cline['paired-uids'] = [[p for p in pid_groups[pid_ids[u]] if p != u] for u in cline['unique_ids']]

            # see what others in its family are paired with
            pfamilies = {}  # TODO rewrite comment: map, for each locus, of the families that are paired with each uid in <cluster> (family name str : family annotation)
            for uid, pids in zip(cline['unique_ids'], cline['paired-uids']):
                for pid in pids:
                    fline = all_antns[pid]
                    fkey = ':'.join(fline['unique_ids'])
                    floc = gval(pid, 'loci')
                    if fkey not in pfamilies:
                        pfamilies[fkey] = {'locus': floc, 'count': 0}
                    pfamilies[fkey]['count'] += 1
            print ' N size cdr3'
            for fkey, fdict in sorted(pfamilies.items(), key=lambda x: x[1]['count'], reverse=True):
                print ' %s %3d %3d %3d' % (utils.locstr(fdict['locus']), fdict['count'], len(antn_dicts[fdict['locus']][fkey]['unique_ids']), antn_dicts[fdict['locus']][fkey]['cdr3_length'])

            def pfkey(p):
                # family key (joined uids) for the annotation containing <p>
                return ':'.join(all_antns[p]['unique_ids'])
            pfcounts = [[pfamilies[pfkey(p)]['count'] for p in pids] for pids in cline['paired-uids']]

            def lcstr(pids, pfcs):
                # per-seq summary string: paired loci plus their family counts, sorted by count
                if len(pids) == 0:
                    return ''
                spids, spfcs = zip(*sorted(zip(pids, pfcs), key=operator.itemgetter(1), reverse=True))
                return '%s %s' % (lgstr(spids, sort=False), ' '.join(str(c) for c in spfcs))
            uid_extra_strs = [lcstr(pids, pfs) for pids, pfs in zip(cline['paired-uids'], pfcounts)]
            utils.print_reco_event(cline, uid_extra_strs=uid_extra_strs, extra_str=' ')

            if iclust >= n_max_clusters:
                break
def parse_detail(self, fk, unique_id):
    """Parse one 'Details' block from tool output into an annotation dict for <unique_id>.

    <fk> looks like a line-keeper/tokenizer over the output file: .line is the
    tokenized current line, .increment() advances, .eof / .iline / .lines track
    position (semantics assumed from usage -- TODO confirm against FileKeeper).
    On success, stores the parsed info in self.details, evaluates it against
    simulation truth via self.perfplotter, and removes <unique_id> from
    self.sim_need.  On any failure (dash in seq, germline match not found, ...)
    it skips ahead to the start of the next 'Details' block and returns None.
    """
    assert fk.iline < len(fk.lines)

    # scan forward to the start of this query's 'Details' block
    while fk.line[1] != 'Details':
        fk.increment()
        if fk.eof:
            return

    fk.increment()
    info = {}
    info['unique_id'] = unique_id
    # <line_order> (module-level, not visible here) presumably lists the expected
    # per-line layout: (line prefix, output column, token index, required?, default)
    for begin_line, column, index, required, default in line_order:
        if fk.line[0].find(begin_line) != 0:  # current line doesn't start with the expected prefix
            if required:
                print 'oop', begin_line, fk.line
                sys.exit()
            else:
                info[column] = default
                continue  # NOTE: does not advance fk -- retry this same line against the next expected prefix
        if column != '':
            info[column] = clean_value(column, fk.line[index])
            # if '[' in info[column]:
            #     print 'added', column, clean_value(column, fk.line[index])
            if column.find('_gene') == 1:  # e.g. 'v_gene', 'd_gene', 'j_gene': also pull deletion info off this line
                region = column[0]
                info[region + '_5p_del'] = int(fk.line[fk.line.index('start:') + 1]) - 1  # NOTE their indices are 1-based
                gl_length = int(fk.line[fk.line.index('gene:') + 1]) - 1
                match_end = int(fk.line[fk.line.index('end:') + 1]) - 1
                assert gl_length >= match_end
                info[region + '_3p_del'] = gl_length - match_end
        fk.increment()

    if unique_id not in self.sim_need:  # we already have (or don't want) this query
        while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
            fk.increment()
        return

    info['fv_insertion'] = ''
    info['jf_insertion'] = ''
    # reassemble the query sequence from the per-region matches plus insertions
    info['seq'] = info['v_qr_seq'] + info['vd_insertion'] + info['d_qr_seq'] + info['dj_insertion'] + info['j_qr_seq']

    if '-' in info['seq']:  # alignment gap characters mean we can't use this annotation
        print 'ERROR found a dash in %s, returning failure' % unique_id
        while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
            fk.increment()
        return

    if info['seq'] not in self.siminfo[unique_id]['seq']:  # arg. I can't do != because it tacks on v left and j right deletions
        print 'ERROR didn\'t find the right sequence for %s' % unique_id
        print ' ', info['seq']
        print ' ', self.siminfo[unique_id]['seq']
        sys.exit()

    if self.args.debug:
        print unique_id
        utils.print_reco_event(self.germline_seqs, self.siminfo[unique_id], label='true:', extra_str=' ')
        utils.print_reco_event(self.germline_seqs, info, label='inferred:', extra_str=' ')

    # sanity-check each region's germline match against our own germline set
    for region in utils.regions:
        if info[region + '_gene'] not in self.germline_seqs[region]:
            print 'ERROR %s not in germlines' % info[region + '_gene']
            assert False
        gl_seq = info[region + '_gl_seq']
        if '[' in gl_seq:  # ambiguous
            # try substituting each nucleotide for '[' until the result matches our germline
            for nuke in utils.nukes:
                gl_seq = gl_seq.replace('[', nuke)
                if gl_seq in self.germline_seqs[region][info[region + '_gene']]:
                    print ' replaced [ with %s' % nuke
                    break
            info[region + '_gl_seq'] = gl_seq

        if info[region + '_gl_seq'] not in self.germline_seqs[region][info[region + '_gene']]:
            print 'ERROR gl match not found for %s in %s' % (info[region + '_gene'], unique_id)
            print ' ', info[region + '_gl_seq']
            print ' ', self.germline_seqs[region][info[region + '_gene']]
            self.perfplotter.add_partial_fail(self.siminfo[unique_id], info)  # record as a partial failure, then bail
            while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
                fk.increment()
            return

    # success: score against truth, store, and mark this query as done
    self.perfplotter.evaluate(self.siminfo[unique_id], info)
    self.details[unique_id] = info
    self.sim_need.remove(unique_id)

    while not fk.eof and fk.line[1] != 'Details':  # skip stuff until start of next Detail block
        fk.increment()
import sys partis_path = '.' # edit this if you're not running from the main partis dir sys.path.insert(1, partis_path + '/python') import utils import glutils from clusterpath import ClusterPath # read default germline info glfo = glutils.read_glfo(partis_path + '/data/germlines/human', chain='h') print 'first parse an annotation csv file:' with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile: reader = csv.DictReader(csvfile) for line in reader: utils.process_input_line(line) utils.add_implicit_info(glfo, line) utils.print_reco_event(glfo['seqs'], line) cdr3_bounds = (line['codon_positions']['v'], line['codon_positions']['j'] + 3) print '' print ' should match the above:' print ' %s naive cdr3' % line['naive_seq'][cdr3_bounds[0] : cdr3_bounds[1]] print ' %s mature' % line['indel_reversed_seqs'][0][cdr3_bounds[0] : cdr3_bounds[1]] print '' break print 'then parse a partition csv file:' cp = ClusterPath() cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv') cp.print_partitions(abbreviate=True)
(args.basedir, args.locus)) lh_info = read_linearham_output() # print annotations for the biggest cluster in the most likely partition annotations = { ':'.join(adict['unique_ids']): adict for adict in annotation_list } # collect the annotations in a dictionary so they're easier to access most_likely_partition = cpath.partitions[ cpath. i_best] # a partition is represented as a list of lists of strings, with each string a sequence id sorted_clusters = sorted(most_likely_partition, key=len, reverse=True) for cluster in sorted_clusters: line = annotations[':'.join(cluster)] print ':'.join(line['unique_ids']) utils.print_reco_event(line, extra_str=' ') print '' lh_clusters = [(uidstr, cfo) for uidstr, cfo in lh_info.items() if set(uidstr.split(':')) & set(line['unique_ids'])] lh_naive_seqs = [] if len(lh_clusters) == 0: print ' %s zero linearham clusters with any of these uids' % utils.color( 'red', 'error') elif len(lh_clusters) != 1: raise Exception('expected 1 linearham cluster but found %d' % len(lh_clusters)) else: lh_uidstr, lh_naive_seqs = lh_clusters[0] if set(lh_uidstr.split(':')) != set(line['unique_ids']): print ' %s different uids\n extra in linearham: %s\n missing from linearham: %s' % (
def add_to_info(self, query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions, perfplotter=None):
    """Assemble the final annotation for <query_name> and store it in self.info.

    Derives deletion lengths from the best match's germline bounds, insertions
    from the gaps between the query bounds of adjacent regions, and records the
    best gene/germline/query sequence for each region.  Also updates the
    parameter counters and, if given, scores the annotation with <perfplotter>.
    Note: bounds are assumed half-open [start, end) -- TODO confirm.
    """
    assert query_name not in self.info
    self.info[query_name] = {}
    self.info[query_name]['unique_id'] = query_name  # redundant, but used somewhere down the line
    self.info[query_name]['k_v'] = kvals['v']
    self.info[query_name]['k_d'] = kvals['d']
    self.info[query_name]['all'] = ':'.join(match_names['v'] + match_names['d'] + match_names['j'])

    assert codon_positions['v'] != -1
    assert codon_positions['j'] != -1
    self.info[query_name]['cdr3_length'] = codon_positions['j'] - codon_positions['v'] + 3  #tryp_position_in_joined_seq - self.cyst_position + 3
    self.info[query_name]['cyst_position'] = codon_positions['v']
    self.info[query_name]['tryp_position'] = codon_positions['j']

    # erosion, insertion, mutation info for best match
    self.info[query_name]['v_5p_del'] = all_germline_bounds[best['v']][0]
    self.info[query_name]['v_3p_del'] = len(self.germline_seqs['v'][best['v']]) - all_germline_bounds[best['v']][1]  # len(germline v) - gl_match_end
    self.info[query_name]['d_5p_del'] = all_germline_bounds[best['d']][0]
    self.info[query_name]['d_3p_del'] = len(self.germline_seqs['d'][best['d']]) - all_germline_bounds[best['d']][1]
    self.info[query_name]['j_5p_del'] = all_germline_bounds[best['j']][0]
    self.info[query_name]['j_3p_del'] = len(self.germline_seqs['j'][best['j']]) - all_germline_bounds[best['j']][1]

    # insertions: everything in the query seq that falls between adjacent region matches
    self.info[query_name]['fv_insertion'] = query_seq[:all_query_bounds[best['v']][0]]
    self.info[query_name]['vd_insertion'] = query_seq[all_query_bounds[best['v']][1]:all_query_bounds[best['d']][0]]
    self.info[query_name]['dj_insertion'] = query_seq[all_query_bounds[best['d']][1]:all_query_bounds[best['j']][0]]
    self.info[query_name]['jf_insertion'] = query_seq[all_query_bounds[best['j']][1]:]

    for region in utils.regions:
        self.info[query_name][region + '_gene'] = best[region]
        self.info[query_name][region + '_gl_seq'] = best[region + '_gl_seq']
        self.info[query_name][region + '_qr_seq'] = best[region + '_qr_seq']
        self.info['all_best_matches'].add(best[region])

    self.info[query_name]['seq'] = query_seq  # only need to add this so I can pass it to print_reco_event
    if self.args.debug:
        if not self.args.is_data:  # simulation: also print the true event for comparison
            utils.print_reco_event(self.germline_seqs, self.reco_info[query_name], extra_str=' ', label='true:')
        utils.print_reco_event(self.germline_seqs, self.info[query_name], extra_str=' ', label='inferred:')

    # PEP 8 fix: compare against None with 'is not', not '!='
    if self.pcounter is not None:
        self.pcounter.increment(self.info[query_name])
    if self.true_pcounter is not None:
        self.true_pcounter.increment(self.reco_info[query_name])
    if perfplotter is not None:
        perfplotter.evaluate(self.reco_info[query_name], self.info[query_name])  #, subtract_unphysical_erosions=True)