Example #1
0
 def get_pos_in_alignment(codon, aligned_seq, seq, pos):
     """ translate <pos> (an index into the ungapped <seq>) into the matching index in the gapped <aligned_seq> """
     # NOTE <debug> and get_n_gaps_up_to_pos() are not defined here, they come from the enclosing scope
     # the caller only passes a gene whose position is already *known*, so the codon has to be there
     assert utils.codon_ok(codon, seq, pos, debug=debug)

     # every gap in front of the codon shifts it one place to the right in the alignment
     n_gaps_before = get_n_gaps_up_to_pos(aligned_seq, pos)
     aligned_pos = pos + n_gaps_before

     # make sure the codon is also found at the shifted position in the gapped sequence
     assert utils.codon_ok(codon, aligned_seq, aligned_pos, debug=debug)
     return aligned_pos
Example #2
0
def generate_snpd_gene(gene, cpos, seq, positions):
    """
    Make a new allele from <gene> by applying one point mutation to <seq> for each entry in <positions>.

    An entry that is None is replaced by a randomly chosen position (see choose_position() below); any other
    entry is used as-is. Each mutation swaps the base for a different, randomly chosen one from utils.nukes.
    Returns a dict with keys 'template-gene' (the original <gene>), 'gene' (the new allele's name), and 'seq'
    (the mutated sequence).

    Raises ValueError if a random position is requested but none is left to choose from.
    """
    assert utils.get_region(gene) == "v"  # others not yet handled

    def choose_position():
        """ pick a random, not-yet-snpd position at which a mutation still passes the cysteine check at <cpos> """
        first, last = 10, len(seq) - 15  # note that randint() is inclusive at both ends
        # Bail out if the window is empty (sequence shorter than 25) or already used up: otherwise randint()
        # raises a cryptic error in the first case, and the loop below never terminates in the second.
        if all(pos in snpd_positions for pos in range(first, last + 1)):
            raise ValueError(
                "no unused snp position available in [%d, %d] (sequence length %d, %d positions already snpd)"
                % (first, last, len(seq), len(snpd_positions))
            )
        # NOTE(review): this can still loop for a long time if the only unused positions are ones at which
        # the cysteine check fails
        while True:
            snp_pos = random.randint(first, last)
            if snp_pos in snpd_positions:
                continue
            tmpseq = seq[:snp_pos] + "X" + seq[snp_pos + 1 :]  # for checking cyst position
            if utils.codon_ok("cyst", tmpseq, cpos, debug=True):
                return snp_pos

    snpd_positions = set()  # every position mutated so far; consulted only when choosing a random position
    mutfo = OrderedDict()
    for snp_pos in positions:
        if snp_pos is None:
            snp_pos = choose_position()
        snpd_positions.add(snp_pos)

        # keep drawing until we get a base that differs from the current one (randint() rather than
        # random.choice() so that seeded runs consume the same random numbers as before)
        new_base = None
        while new_base is None or new_base == seq[snp_pos]:
            new_base = utils.nukes[random.randint(0, len(utils.nukes) - 1)]
        print("        %3d   %s --> %s" % (snp_pos, seq[snp_pos], new_base))
        mutfo[snp_pos] = {"original": seq[snp_pos], "new": new_base}

        seq = seq[:snp_pos] + new_base + seq[snp_pos + 1 :]

    assert utils.codon_ok("cyst", seq, cpos, debug=True)  # this is probably unnecessary
    snpd_name, mutfo = get_new_allele_name_and_change_mutfo(gene, mutfo)
    return {"template-gene": gene, "gene": snpd_name, "seq": seq}
Example #3
0
 def revert_conserved_codons(self, seq):
     """ put the unmutated conserved codons back into <seq> wherever they differ (eg if s.h.m. messed them up), and return the result """
     for region, pos in self.final_codon_positions.items():
         current_codon = seq[pos : pos + 3]
         original_codon = self.unmutated_codons[region]
         if current_codon != original_codon:
             assert len(original_codon) == 3
             # splice the original three bases back in over whatever is there now
             seq = seq[:pos] + original_codon + seq[pos + 3 :]
         # whether or not we had to revert, the codon at <pos> has to pass the check now
         expected_codon = utils.conserved_codons[self.glfo['chain']][region]
         assert utils.codon_ok(expected_codon, seq, pos)
     return seq
Example #4
0
def remove_v_genes_with_bad_cysteines(glfo, debug=False):
    """
    Call remove_gene() on every v gene in <glfo> whose sequence fails utils.codon_ok() for the cysteine at
    its position in glfo['cyst-positions'], then print how many were removed out of the original total.

    <debug> is only passed on to remove_gene(); the summary line is always printed.
    """
    v_seqs = glfo["seqs"]["v"]
    prelength = len(v_seqs)
    # Iterate over a snapshot of the gene names: remove_gene() presumably deletes entries from this dict,
    # and iterating over a live dict view while it shrinks raises RuntimeError in python 3.
    for gene in list(v_seqs):
        if not utils.codon_ok("cyst", glfo["seqs"]["v"][gene], glfo["cyst-positions"][gene]):
            remove_gene(glfo, gene, debug=debug)

    n_removed = prelength - len(glfo["seqs"]["v"])
    # Report against the number of genes we started with (the number left over was printed before,
    # which made the fraction misleading). Printed unconditionally, as before.
    print("  removed %d / %d v genes with bad cysteines" % (n_removed, prelength))
Example #5
0
 def choose_position():
     """ pick a random, not-yet-snpd position at which a mutation still passes the cysteine check at <cpos> """
     # NOTE <seq>, <snpd_positions> and <cpos> are not defined here, they come from the enclosing scope
     first, last = 10, len(seq) - 15  # note that randint() is inclusive at both ends
     # Bail out if the window is empty (sequence shorter than 25) or already used up: otherwise randint()
     # raises a cryptic error in the first case, and the loop below never terminates in the second.
     if all(pos in snpd_positions for pos in range(first, last + 1)):
         raise ValueError(
             "no unused snp position available in [%d, %d] (sequence length %d, %d positions already snpd)"
             % (first, last, len(seq), len(snpd_positions))
         )
     # NOTE(review): this can still loop for a long time if the only unused positions are ones at which
     # the cysteine check fails
     while True:
         snp_pos = random.randint(first, last)
         if snp_pos in snpd_positions:
             continue
         tmpseq = seq[:snp_pos] + "X" + seq[snp_pos + 1 :]  # for checking cyst position
         if utils.codon_ok("cyst", tmpseq, cpos, debug=True):
             return snp_pos
Example #6
0
def get_missing_codon_info(glfo, debug=False):
    """
    For each conserved codon of glfo['chain'], fill in glfo['<codon>-positions'] for every gene that has a
    sequence in glfo['seqs'][<region>] but no position yet.

    The position is transferred, via the alignments from get_new_alignments(), from a gene whose position
    is already known. Raises Exception if no usable known position can be found.
    """
    # ----------------------------------------------------------------------------------------
    def get_n_gaps_up_to_pos(aligned_seq, pos):
        """ return number of gapped positions in <aligned_seq> before <pos> """
        # NOTE I think this duplicates the functionality of count_gaps()
        n_gaps = 0  # gap characters passed so far
        n_bases = 0  # non-gap characters passed so far, i.e. position in the unaligned sequence
        i_aligned = 0  # index into <aligned_seq>, always equal to n_gaps + n_bases
        while n_bases < pos:
            if aligned_seq[i_aligned] in utils.gap_chars:
                n_gaps += 1
            else:
                n_bases += 1
            i_aligned += 1
        return n_gaps

    # ----------------------------------------------------------------------------------------
    def get_pos_in_alignment(codon, aligned_seq, seq, pos):
        """ translate <pos> (an index into the ungapped <seq>) into the matching index in the gapped <aligned_seq> """
        # this only gets called on the gene with the *known* position, so it shouldn't fail
        assert utils.codon_ok(codon, seq, pos, debug=debug)
        aligned_pos = pos + get_n_gaps_up_to_pos(aligned_seq, pos)
        assert utils.codon_ok(codon, aligned_seq, aligned_pos, debug=debug)
        return aligned_pos

    # ----------------------------------------------------------------------------------------
    for region, codon in utils.conserved_codons[glfo["chain"]].items():
        positions_key = codon + "-positions"
        missing_genes = set(glfo["seqs"][region]) - set(glfo[positions_key])
        if not missing_genes:
            if debug:
                print("      no missing %s info" % codon)
            continue

        if debug:
            print("      missing %d %s positions" % (len(missing_genes), codon))

        aligned_seqs = get_new_alignments(glfo, region, debug=debug)

        # only bind these once the alignment call has returned, in case it touches <glfo>
        region_seqs = glfo["seqs"][region]
        codon_positions = glfo[positions_key]

        # NOTE for region 'j' this used to be considered unreliable, since tryp positions don't reliably align to the same position

        if not codon_positions:  # nothing at all to transfer a position from
            if codon != "cyst":
                raise Exception("no existing %s info, and couldn't guess it, either" % codon)
            known_pos_in_alignment = 309
            print(
                "      assuming aligned %s position is %d (this will %s work if you're using imgt alignments)"
                % (codon, known_pos_in_alignment, utils.color("red", "only"))
            )
            raise Exception("not really using imgt alignments much any more, so this isn't really going to work")

        # existing codon position (this assumes that once aligned, all genes have the same codon position -- which is only really true for the imgt-gapped alignment)
        # take the first one for which we have the sequence (NOTE it would be safer to check that they're all the same)
        for known_gene, known_pos in codon_positions.items():
            if (
                known_gene in region_seqs
                and known_gene in aligned_seqs
                and utils.codon_ok(codon, region_seqs[known_gene], known_pos)
            ):
                break
        else:
            raise Exception("couldn't find a known %s position" % codon)

        # NOTE for cyst, should be 309 if alignments are imgt [which they used to usually be, but now probably aren't] (imgt says 104th codon --> subtract 1 to get zero-indexing, then multiply by three 3 * (104 - 1) = 309
        known_pos_in_alignment = get_pos_in_alignment(
            codon, aligned_seqs[known_gene], region_seqs[known_gene], known_pos
        )
        if debug:
            print(
                "        using known position %d (aligned %d) in %s"
                % (known_pos, known_pos_in_alignment, known_gene)
            )

        # transfer the aligned position to each gene that lacks one, by taking off the gaps in front of it
        seqons = []  # (seq, pos) pairs
        for gene in missing_genes:
            n_gaps_before = utils.count_gaps(aligned_seqs[gene], istop=known_pos_in_alignment)
            unaligned_pos = known_pos_in_alignment - n_gaps_before
            seqons.append((region_seqs[gene], unaligned_pos))
            codon_positions[gene] = unaligned_pos

        utils.check_a_bunch_of_codons(codon, seqons, extra_str="          ", debug=debug)
        if debug:
            print("      added %d %s positions" % (len(missing_genes), codon))