def write_cyst_file(known_cyst_positions):
    """
    Find where the conserved cysteine of one already-annotated v gene falls in the gapped alignment read
    from <args.dirname>, print how far the gaps displace it, and build one cysteine record per unaligned v gene.

    <known_cyst_positions>: keyed by gene name; each value is indexed here with 'cysteine-position'.
    Raises an Exception if none of the known genes shows up among the aligned genes.

    NOTE(review): in the code visible here every record gets the known gene's <cpos>, and the record list is
    neither returned nor written anywhere -- confirm whether the function is unfinished.
    """
    v_seqs = utils.read_germlines(args.dirname, only_region='v')['v']
    v_alignments = utils.read_germlines(args.dirname, only_region='v', aligned=True)['v']

    # we need at least one gene that's in both the old and the new sets, so we know how to convert cyst positions
    shared_gene = next((name for name in known_cyst_positions if name in v_alignments), None)
    if shared_gene is None:
        raise Exception('couldn\'t find any genes in common between %s and %s, so can\'t write new cyst position file' % (args.reference, args.dirname + '/' + aligned_fname))

    gapped_seq = v_alignments[shared_gene]
    ungapped_seq = v_seqs[shared_gene]
    cpos = known_cyst_positions[shared_gene]['cysteine-position']
    utils.check_conserved_cysteine(ungapped_seq, cpos)

    # walk along the alignment until <cpos> real (non-gap) positions have been passed, counting gaps on the way
    n_real = 0  # position in unaligned sequence
    n_gaps = 0  # number of gapped positions passed in the aligned sequence so far
    while n_real < cpos:
        if gapped_seq[n_real + n_gaps] in utils.gap_chars:
            n_gaps += 1
        else:
            n_real += 1
    cpos_in_alignment = cpos + n_gaps
    utils.check_conserved_cysteine(gapped_seq, cpos_in_alignment)
    print(' cpos displacement: %d' % (cpos_in_alignment - cpos))

    cyst_positions = [{'gene' : name, 'cyst_start' : cpos} for name in v_seqs]
def generate_snpd_gene(gene, cpos, seq, positions):
    """
    Return a new allele made by substituting one base of <seq> at each entry of <positions>.

    <positions>: iterable of indices into <seq>; a None entry means "pick a position at random".
    <cpos>: cysteine position, handed to utils.check_conserved_cysteine() for the mutated sequence.
    Returns a dict with keys 'template-gene' (the input <gene>), 'gene' (the new allele's name) and 'seq'.
    """
    assert utils.get_region(gene) == 'v'  # others not yet handled

    def choose_position():
        """ draw random positions until one is unused and leaves the cysteine check passing """
        while True:
            candidate = random.randint(10, len(seq) - 15)  # note that randint() is inclusive
            if candidate in snpd_positions:
                continue
            masked = seq[:candidate] + 'X' + seq[candidate + 1:]  # for checking cyst position
            if utils.check_conserved_cysteine(masked, cpos, debug=True, assert_on_fail=False):
                return candidate

    snpd_positions = set()  # only consulted when a position wasn't specified (i.e. was None)
    mutfo = OrderedDict()
    for requested in positions:
        snp_pos = choose_position() if requested is None else requested
        snpd_positions.add(snp_pos)

        # keep drawing until we get a base that differs from the current one
        while True:
            new_base = utils.nukes[random.randint(0, len(utils.nukes) - 1)]
            if new_base != seq[snp_pos]:
                break

        old_base = seq[snp_pos]
        print(' %3d %s --> %s' % (snp_pos, old_base, new_base))
        mutfo[snp_pos] = {'original' : old_base, 'new' : new_base}
        seq = seq[:snp_pos] + new_base + seq[snp_pos + 1:]  # choose_position() sees this updated sequence

    utils.check_conserved_cysteine(seq, cpos)
    snpd_name, mutfo = get_new_allele_name_and_change_mutfo(gene, mutfo)
    return {'template-gene' : gene, 'gene' : snpd_name, 'seq' : seq}
def write_cyst_file(known_cyst_positions):
    """
    Work out a cysteine position for every v gene in <args.dirname> and write them to <cyst_fname>.

    One gene that appears both in <known_cyst_positions> and in the aligned germline set is used to find the
    cysteine's column in the alignment; every other gene's position is that column minus the number of gaps
    utils.count_gaps() reports for its aligned sequence up to that column.

    <known_cyst_positions>: keyed by gene name; the value for the shared gene is passed as the position to
    get_cpos_in_alignment().
    Genes for which utils.check_conserved_cysteine() raises are left out of the csv and listed, one per line,
    in <error_fname>.
    Raises an Exception if no known gene is present among the aligned genes.
    """
    unaligned_genes = utils.read_germline_seqs(args.dirname, only_region='v')['v']
    aligned_genes = utils.read_germline_seqs(args.dirname, only_region='v', aligned=True)['v']

    known_gene = None  # we need to find at least one gene that's in the old and the new sets, so we know how to convert cyst positions
    for gene in known_cyst_positions:
        if gene in aligned_genes:
            known_gene = gene
            break
    if known_gene is None:
        raise Exception('couldn\'t find any genes in common between %s and %s, so can\'t write new cyst position file' % (args.reference, args.dirname + '/' + aligned_fname))

    known_cpos = known_cyst_positions[known_gene]
    cpos_in_alignment = get_cpos_in_alignment(aligned_genes[known_gene], unaligned_genes[known_gene], known_cpos)

    cyst_positions = {}
    errors = []
    for gene, seq in unaligned_genes.items():
        unaligned_cpos = cpos_in_alignment - utils.count_gaps(aligned_genes[gene], istop=cpos_in_alignment)
        try:
            utils.check_conserved_cysteine(seq, unaligned_cpos, debug=True)
        except Exception:  # not a bare except: ctrl-c and sys.exit() must still get through
            print(' %s cysteine not found in %s, skipping' % (utils.color('red', 'warning'), gene))
            errors.append(gene)
            continue
        cyst_positions[gene] = unaligned_cpos

    with open(args.dirname + '/' + cyst_fname, 'w') as cystfile:
        writer = csv.DictWriter(cystfile, ('gene', 'istart'))
        writer.writeheader()
        for gene, cpos in cyst_positions.items():
            writer.writerow({'gene' : gene, 'istart' : cpos})

    with open(args.dirname + '/' + error_fname, 'w') as errorfile:
        for gene in errors:
            errorfile.write('%s\n' % gene)
def process_query(self, bam, reads):
    """
    Go through <reads> (the matches found for a single query sequence), keep the ones that pass the checks
    below, and pass what's left to self.summarize_query().

    <bam>: only <bam.references> is used, to turn <read.tid> into a gene name.
    Returns None. Returns early, without calling summarize_query(), if the first match kept for a region
    has an indel in its cigar string: the indel info is stored in self.info['indels'] (or, if the query
    already has an entry there, a 'multiple indels' line is printed).
    Raises an Exception if a v gene's stored cysteine position fails utils.check_conserved_cysteine(), or if
    the query and germline match lengths differ.
    """
    primary = next((r for r in reads if not r.is_secondary), None)
    query_seq, query_name = primary.seq, primary.qname

    first_match_query_bounds = None  # since sw excises its favorite v match, we have to know this match's boundaries in order to calculate k_d for all the other matches
    all_match_names = {region : [] for region in utils.regions}
    warnings = {}  # ick, this is a messy way to pass stuff around
    all_query_bounds = {}
    all_germline_bounds = {}
    n_skipped_invalid_cpos = 0  # NOTE only counted, never reported

    for read in reads:  # loop over the matches found for each query sequence
        # set this match's values
        read.seq = query_seq  # only the first one has read.seq set by default, so we need to set the rest by hand
        gene = bam.references[read.tid]
        region = utils.get_region(gene)
        raw_score = read.tags[0][1]  # raw because it doesn't include the gene choice probs
        # NOTE applying the gene choice probs is off by default: smith-waterman scores don't correspond to log-probs, so multiplying them in was dubious
        score = self.get_choice_prob(region, gene) * raw_score if self.args.apply_choice_probs_in_sw else raw_score
        qrbounds = (read.qstart, read.qend)
        glbounds = (read.pos, read.aend)
        if first_match_query_bounds is None and region == 'v':
            first_match_query_bounds = qrbounds

        # perform a few checks and see if we want to skip this match
        if region == 'v':
            cpos = utils.get_conserved_codon_position(self.cyst_positions, self.tryp_positions, 'v', gene, glbounds, qrbounds, assert_on_fail=False)
            v_germline = self.germline_seqs['v'][gene]
            germline_cpos = self.cyst_positions[gene]['cysteine-position']
            if not utils.check_conserved_cysteine(v_germline, germline_cpos, assert_on_fail=False):  # some of the cysteine positions in the json file were wrong, so now we check
                raise Exception('bad cysteine in %s: %d %s' % (gene, germline_cpos, v_germline))
            if cpos < 0 or cpos >= len(query_seq):  # cpos outside the query seq (i.e. eroded a ton on the right side of the v)
                n_skipped_invalid_cpos += 1
                continue

        if any(op in read.cigarstring for op in 'ID'):  # you won't see any indels unless you decrease the <self.args.gap_open_penalty>
            if all_match_names[region]:  # indels are only allowed in the first (best) match for a region
                continue
            indels = self.info['indels']
            if query_name in indels:
                print(' multiple indels for %s' % query_name)
            else:
                qr_start, qr_end = qrbounds
                gl_start, gl_end = glbounds
                indels[query_name] = self.get_indel_info(query_name, read.cigarstring, query_seq[qr_start:qr_end], self.germline_seqs[region][gene][gl_start:gl_end], gene)
                # put back the parts of the query that lie outside the match
                indels[query_name]['reversed_seq'] = query_seq[:qr_start] + indels[query_name]['reversed_seq'] + query_seq[qr_end:]
                self.new_indels += 1
            # NOTE(review): the collapsed source doesn't show the indentation of this return; it is taken to
            # apply to both branches above -- confirm against the original
            return

        qr_length = qrbounds[1] - qrbounds[0]
        gl_length = glbounds[1] - glbounds[0]
        if qr_length != gl_length:
            raise Exception('germline match (%d %d) not same length as query match (%d %d)' % (qrbounds[0], qrbounds[1], glbounds[0], glbounds[1]))

        assert qrbounds[1] <= len(query_seq)
        germline_length = len(self.germline_seqs[region][gene])
        if glbounds[1] > germline_length:  # print some context before the assertion below fails
            print('%s %s' % (' ', gene))  # same output as the multi-argument print statement
            print('%s %s %s' % (' ', glbounds[1], germline_length))
            print('%s %s' % (' ', self.germline_seqs[region][gene]))
        assert glbounds[1] <= germline_length
        assert qr_length == gl_length

        # and finally add this match's information
        warnings[gene] = ''
        all_match_names[region].append((score, gene))  # NOTE it is important that this is ordered such that the best match is first
        all_query_bounds[gene] = qrbounds
        all_germline_bounds[gene] = glbounds

    self.summarize_query(query_name, query_seq, all_match_names, all_query_bounds, all_germline_bounds, warnings, first_match_query_bounds)
def process_query(self, bam, reads):
    """
    Filter the matches in <reads> (all of them belong to one query sequence) and hand the survivors to
    self.summarize_query().

    <bam>: only <bam.references> is used, to look up the gene name for <read.tid>.
    Returns None. If the first match kept for a region has an indel in its cigar string, the function stores
    the indel info in self.info['indels'] (or prints a 'multiple indels' line if the query is already in
    there) and returns without calling summarize_query().
    Raises an Exception if a v gene's stored cysteine position fails utils.check_conserved_cysteine(), or if
    the query and germline match lengths differ.
    """
    primary = next((r for r in reads if not r.is_secondary), None)
    query_seq, query_name = primary.seq, primary.qname

    first_match_query_bounds = None  # since sw excises its favorite v match, we have to know this match's boundaries in order to calculate k_d for all the other matches
    all_match_names = {region : [] for region in utils.regions}
    warnings = {}  # ick, this is a messy way to pass stuff around
    all_query_bounds = {}
    all_germline_bounds = {}
    n_skipped_invalid_cpos = 0  # NOTE only counted, never reported

    for read in reads:  # loop over the matches found for each query sequence
        # set this match's values
        read.seq = query_seq  # only the first one has read.seq set by default, so we need to set the rest by hand
        gene = bam.references[read.tid]
        region = utils.get_region(gene)
        raw_score = read.tags[0][1]  # raw because it doesn't include the gene choice probs
        # NOTE applying the gene choice probs is off by default: smith-waterman scores don't correspond to log-probs, so multiplying them in was dubious
        score = self.get_choice_prob(region, gene) * raw_score if self.args.apply_choice_probs_in_sw else raw_score
        qrbounds = (read.qstart, read.qend)
        glbounds = (read.pos, read.aend)
        if first_match_query_bounds is None and region == 'v':
            first_match_query_bounds = qrbounds

        # perform a few checks and see if we want to skip this match
        if region == 'v':
            cpos = utils.get_conserved_codon_position(self.cyst_positions, self.tryp_positions, 'v', gene, glbounds, qrbounds, assert_on_fail=False)
            v_germline = self.germline_seqs['v'][gene]
            germline_cpos = self.cyst_positions[gene]['cysteine-position']
            if not utils.check_conserved_cysteine(v_germline, germline_cpos, assert_on_fail=False):  # some of the cysteine positions in the json file were wrong, so now we check
                raise Exception('bad cysteine in %s: %d %s' % (gene, germline_cpos, v_germline))
            if cpos < 0 or cpos >= len(query_seq):  # cpos outside the query seq (i.e. eroded a ton on the right side of the v)
                n_skipped_invalid_cpos += 1
                continue

        if any(op in read.cigarstring for op in 'ID'):  # you won't see any indels unless you decrease the <self.args.gap_open_penalty>
            if all_match_names[region]:  # indels are only allowed in the first (best) match for a region
                continue
            indels = self.info['indels']
            if query_name in indels:
                print(' multiple indels for %s' % query_name)
            else:
                qr_start, qr_end = qrbounds
                gl_start, gl_end = glbounds
                indels[query_name] = self.get_indel_info(query_name, read.cigarstring, query_seq[qr_start:qr_end], self.germline_seqs[region][gene][gl_start:gl_end], gene)
                # put back the parts of the query that lie outside the match
                indels[query_name]['reversed_seq'] = query_seq[:qr_start] + indels[query_name]['reversed_seq'] + query_seq[qr_end:]
                self.new_indels += 1
            # NOTE(review): the collapsed source doesn't show the indentation of this return; it is taken to
            # apply to both branches above -- confirm against the original
            return

        qr_length = qrbounds[1] - qrbounds[0]
        gl_length = glbounds[1] - glbounds[0]
        if qr_length != gl_length:
            raise Exception('germline match (%d %d) not same length as query match (%d %d)' % (qrbounds[0], qrbounds[1], glbounds[0], glbounds[1]))

        assert qrbounds[1] <= len(query_seq)
        germline_length = len(self.germline_seqs[region][gene])
        if glbounds[1] > germline_length:  # print some context before the assertion below fails
            print('%s %s' % (' ', gene))  # same output as the multi-argument print statement
            print('%s %s %s' % (' ', glbounds[1], germline_length))
            print('%s %s' % (' ', self.germline_seqs[region][gene]))
        assert glbounds[1] <= germline_length
        assert qr_length == gl_length

        # and finally add this match's information
        warnings[gene] = ''
        all_match_names[region].append((score, gene))  # NOTE it is important that this is ordered such that the best match is first
        all_query_bounds[gene] = qrbounds
        all_germline_bounds[gene] = glbounds

    self.summarize_query(query_name, query_seq, all_match_names, all_query_bounds, all_germline_bounds, warnings, first_match_query_bounds)
if 'N' in align_seq: print '\n WARNING replacing N with A' align_seq = align_seq.replace('N', 'A') for pos in align_seq: if pos not in 'ACGT.': print 'ERROR unexpected character %s in %s from %s' % (pos, name, align_fname) sys.exit() # see if it's too short (WTF?!?!) if align_cpos >= len(align_seq): print 'too short!' bad_genes.append(name) continue try: utils.check_conserved_cysteine(align_seq, align_cpos, debug=True) except: bad_genes.append(name) continue # remove dots n_dots = align_seq.count('.') real_cpos = align_cpos - n_dots utils.check_conserved_cysteine(align_seq.replace('.', ''), real_cpos, debug=True) if name in cyst_positions and real_cpos == cyst_positions[name][ 'cysteine-position']: print 'ok' else: if name in cyst_positions and real_cpos != cyst_positions[name][
# check for unexpected characters if 'N' in align_seq: print '\n WARNING replacing N with A' align_seq = align_seq.replace('N', 'A') for pos in align_seq: if pos not in 'ACGT.': print 'ERROR unexpected character %s in %s from %s' % (pos, name, align_fname) sys.exit() # see if it's too short (WTF?!?!) if align_cpos >= len(align_seq): print 'too short!' bad_genes.append(name) continue try: utils.check_conserved_cysteine(align_seq, align_cpos, debug=True) except: bad_genes.append(name) continue # remove dots n_dots = align_seq.count('.') real_cpos = align_cpos - n_dots utils.check_conserved_cysteine(align_seq.replace('.', ''), real_cpos, debug=True) if name in cyst_positions and real_cpos == cyst_positions[name]['cysteine-position']: print 'ok' else: if name in cyst_positions and real_cpos != cyst_positions[name]['cysteine-position']: print 'not the same, new: %d old: %s' % (real_cpos, cyst_positions[name]['cysteine-position']), print ' switching to the new one' else:
def get_cpos_in_alignment(aligned_seq, seq, cpos):
    """
    Convert <cpos>, the cysteine's position in the unaligned <seq>, to its position in <aligned_seq>.
    The shift is whatever get_n_gaps_up_to_cpos() returns for <aligned_seq> and <cpos>; the cysteine is
    checked with utils.check_conserved_cysteine() both before and after the conversion.
    """
    utils.check_conserved_cysteine(seq, cpos)
    n_gaps = get_n_gaps_up_to_cpos(aligned_seq, cpos)
    aligned_cpos = cpos + n_gaps
    utils.check_conserved_cysteine(aligned_seq, aligned_cpos)
    return aligned_cpos
def choose_position():
    """
    Draw random positions in <seq> until one turns up that is not already in <snpd_positions> and for which
    utils.check_conserved_cysteine() still passes at <cpos> after that position is replaced with 'X'.
    <seq>, <cpos> and <snpd_positions> are read from the enclosing scope.
    """
    while True:
        candidate = random.randint(10, len(seq) - 15)  # note that randint() is inclusive
        if candidate in snpd_positions:
            continue
        masked = seq[:candidate] + 'X' + seq[candidate + 1:]  # for checking cyst position
        if utils.check_conserved_cysteine(masked, cpos, debug=True, assert_on_fail=False):
            return candidate