def read_glfo(gldir, chain, only_genes=None, skip_pseudogenes=True, debug=False):
    """
    Read the germline info ("glfo") for <chain> from the directory <gldir>/<chain>.

    Steps, in order:
      1. read the germline sequences with read_germline_seqs()
      2. let read_extra_info() add whatever else is stored in <gldir> (presumably the
         "<codon>-positions" entries that are read below -- confirm against read_extra_info())
      3. fill in codon positions for genes that lack them (get_missing_codon_info())
      4. apply the <only_genes> restriction via restrict_to_genes()
      5. check the conserved codon of every remaining gene with utils.check_a_bunch_of_codons()

    Arguments:
      gldir: directory containing one sub-directory per chain
      chain: name of the chain; also the sub-directory name and the key into utils.conserved_codons
      only_genes: passed through unchanged to restrict_to_genes()
      skip_pseudogenes: passed through unchanged to read_germline_seqs()
      debug: if set, print progress information

    Returns:
      the glfo dict, which has at least the keys "chain" and "seqs"

    Raises:
      Exception: if <gldir>/<chain> does not exist
    """
    chain_dir = gldir + "/" + chain
    if not os.path.exists(chain_dir):
        raise Exception(
            "germline set directory '%s' does not exist (maybe --parameter-dir is corrupted, maybe crashed while writing parameters?)"
            % chain_dir
        )

    # NOTE print is always called with one parenthesized argument, so the output is the same whether this runs as a python 2 print statement or a python 3 function call
    if debug:
        print(" reading %s chain glfo from %s" % (chain, gldir))

    glfo = {"chain": chain}
    glfo["seqs"] = read_germline_seqs(gldir, chain, skip_pseudogenes)
    read_extra_info(glfo, gldir)
    get_missing_codon_info(glfo, debug=debug)
    restrict_to_genes(glfo, only_genes, debug=debug)

    # sanity check: every gene that's left should have a valid conserved codon at its recorded position
    for region, codon in utils.conserved_codons[glfo["chain"]].items():
        seqons = [
            (seq, glfo[codon + "-positions"][gene]) for gene, seq in glfo["seqs"][region].items()
        ]  # (seq, pos) pairs
        utils.check_a_bunch_of_codons(codon, seqons, extra_str=" ", debug=debug)

    if debug:
        print(" read %s" % " ".join(["%s: %d" % (r, len(glfo["seqs"][r])) for r in utils.regions]))

    return glfo
def get_missing_codon_info(glfo, debug=False):
    """
    Add an entry to glfo[<codon> + "-positions"] for every gene in glfo["seqs"] that doesn't have one.

    For each (region, codon) pair in utils.conserved_codons[glfo["chain"]]:
      - find the genes in glfo["seqs"][region] with no recorded codon position
      - get alignments for the region from get_new_alignments()
      - take one gene whose position is already known (and passes utils.codon_ok()), and convert
        its position to alignment coordinates
      - convert that alignment position back to unaligned coordinates for each missing gene

    This assumes that, once aligned, all genes have the codon at the same alignment position
    (which, per the original author's note, is only really true for imgt-gapped alignments).

    Modifies <glfo> in place; returns None.

    Raises:
      Exception: if a codon has missing genes but no usable known position
      AssertionError: if the known gene's codon can't be verified before/after alignment
    """

    # ----------------------------------------------------------------------------------------
    def get_n_gaps_up_to_pos(aligned_seq, pos):  # NOTE I think this duplicates the functionality of count_gaps()
        """ return number of gapped positions in <aligned_seq> before the residue at unaligned position <pos> """
        ipos = 0  # position in unaligned sequence
        n_gaps_passed = 0  # number of gapped positions in the aligned sequence that we pass before getting to <pos>
        while ipos < pos:
            if aligned_seq[ipos + n_gaps_passed] in utils.gap_chars:
                n_gaps_passed += 1
            else:
                ipos += 1
        # Gaps sitting directly in front of the target residue are also before <pos>. Without skipping
        # them, <pos> + <n_gaps_passed> points at a gap (e.g. 'AA---TGT' with pos 2) instead of the residue.
        while ipos + n_gaps_passed < len(aligned_seq) and aligned_seq[ipos + n_gaps_passed] in utils.gap_chars:
            n_gaps_passed += 1
        return n_gaps_passed

    # ----------------------------------------------------------------------------------------
    def get_pos_in_alignment(codon, aligned_seq, seq, pos):
        """ given <pos> in <seq>, find the codon's position in <aligned_seq> """
        # this only gets called on the gene with the *known* position, so it shouldn't fail
        assert utils.codon_ok(codon, seq, pos, debug=debug)
        pos_in_alignment = pos + get_n_gaps_up_to_pos(aligned_seq, pos)
        assert utils.codon_ok(codon, aligned_seq, pos_in_alignment, debug=debug)
        return pos_in_alignment

    # NOTE print is always called with one parenthesized argument, so the output is the same under python 2 and python 3
    for region, codon in utils.conserved_codons[glfo["chain"]].items():
        missing_genes = set(glfo["seqs"][region]) - set(glfo[codon + "-positions"])
        if not missing_genes:
            if debug:
                print(" no missing %s info" % codon)
            continue

        if debug:
            print(" missing %d %s positions" % (len(missing_genes), codon))

        aligned_seqs = get_new_alignments(glfo, region, debug=debug)

        # NOTE a disabled check in the original refused to infer j positions, on the grounds that tryp positions don't reliably align to the same position

        # existing codon position (this assumes that once aligned, all genes have the same codon position -- which is only really true for the imgt-gapped alignment)
        known_positions = glfo[codon + "-positions"]
        if len(known_positions) > 0:
            known_gene, known_pos = None, None
            # take the first one for which we have the sequence (NOTE it would be safer to check that they're all the same)
            for gene, pos in known_positions.items():
                if (
                    gene in glfo["seqs"][region]
                    and gene in aligned_seqs
                    and utils.codon_ok(codon, glfo["seqs"][region][gene], pos)
                ):
                    known_gene, known_pos = gene, pos
                    break
            if known_gene is None:
                raise Exception("couldn't find a known %s position" % codon)
            # NOTE for cyst, should be 309 if alignments are imgt [which they used to usually be, but now probably aren't] (imgt says 104th codon --> subtract 1 to get zero-indexing, then multiply by three: 3 * (104 - 1) = 309)
            known_pos_in_alignment = get_pos_in_alignment(
                codon, aligned_seqs[known_gene], glfo["seqs"][region][known_gene], known_pos
            )
            if debug:
                print(" using known position %d (aligned %d) in %s" % (known_pos, known_pos_in_alignment, known_gene))
        elif codon == "cyst":
            # the imgt fallback is deliberately disabled: report what would have been assumed, then bail out
            known_pos_in_alignment = 309
            print(
                " assuming aligned %s position is %d (this will %s work if you're using imgt alignments)"
                % (codon, known_pos_in_alignment, utils.color("red", "only"))
            )
            raise Exception("not really using imgt alignments much any more, so this isn't really going to work")
        else:
            raise Exception("no existing %s info, and couldn't guess it, either" % codon)

        # convert the shared alignment position back to each missing gene's unaligned coordinates
        seqons = []  # (seq, pos) pairs
        for gene in missing_genes:
            unaligned_pos = known_pos_in_alignment - utils.count_gaps(
                aligned_seqs[gene], istop=known_pos_in_alignment
            )
            seqons.append((glfo["seqs"][region][gene], unaligned_pos))
            glfo[codon + "-positions"][gene] = unaligned_pos

        utils.check_a_bunch_of_codons(codon, seqons, extra_str=" ", debug=debug)
        if debug:
            print(" added %d %s positions" % (len(seqons), codon))