Exemple #1
0
def read_glfo(gldir, chain, only_genes=None, skip_pseudogenes=True, debug=False):
    """
    Read the germline info for <chain> from <gldir> and return it as a dict (the "glfo").

    The dict starts out with the keys "chain" and "seqs" (the latter from read_germline_seqs()),
    and is then passed, in order, through read_extra_info(), get_missing_codon_info() and
    restrict_to_genes() (the last one with <only_genes>). Finally, the conserved codon of every
    region listed in utils.conserved_codons for this chain is checked for each gene.

    Raises an Exception if the directory <gldir>/<chain> doesn't exist.
    """
    chain_dir = gldir + "/" + chain
    if not os.path.exists(chain_dir):
        raise Exception(
            "germline set directory '%s' does not exist (maybe --parameter-dir is corrupted, maybe crashed while writing parameters?)"
            % chain_dir
        )

    if debug:
        print("  reading %s chain glfo from %s" % (chain, gldir))

    glfo = {"chain": chain}
    glfo["seqs"] = read_germline_seqs(gldir, chain, skip_pseudogenes)
    read_extra_info(glfo, gldir)
    get_missing_codon_info(glfo, debug=debug)
    restrict_to_genes(glfo, only_genes, debug=debug)

    # sanity check the conserved codon of every gene that's left
    for region, codon in utils.conserved_codons[glfo["chain"]].items():
        pos_key = codon + "-positions"
        seqons = []  # (seq, pos) pairs
        for gene, seq in glfo["seqs"][region].items():
            seqons.append((seq, glfo[pos_key][gene]))
        utils.check_a_bunch_of_codons(codon, seqons, extra_str="      ", debug=debug)

    if debug:
        region_counts = ["%s: %d" % (r, len(glfo["seqs"][r])) for r in utils.regions]
        print("  read %s" % "  ".join(region_counts))

    return glfo
Exemple #2
0
def get_missing_codon_info(glfo, debug=False):
    """
    Fill in conserved codon positions for genes that have a sequence but no position.

    For each (region, codon) pair in utils.conserved_codons for glfo's chain, any gene that is in
    glfo["seqs"][region] but not in glfo[<codon>-positions] gets a position inferred from a gene
    whose position is already known: the known position is converted to a position in the
    alignment from get_new_alignments(), and that aligned position is then converted back to an
    unaligned position for each missing gene. <glfo> is modified in place; nothing is returned.

    Raises an Exception if there's no usable known position to start from.
    """
    # ----------------------------------------------------------------------------------------
    def count_gaps_before(aligned_seq, pos):
        # NOTE I think this duplicates the functionality of count_gaps()
        """ return number of gapped positions in <aligned_seq> before <pos> """
        n_gaps = 0  # gapped positions in the aligned sequence that we've passed so far
        n_bases = 0  # non-gap positions passed so far, i.e. position in the unaligned sequence
        ialigned = 0  # current index in the aligned sequence (always n_bases + n_gaps)
        while n_bases < pos:
            if aligned_seq[ialigned] in utils.gap_chars:
                n_gaps += 1
            else:
                n_bases += 1
            ialigned += 1
        return n_gaps

    # ----------------------------------------------------------------------------------------
    def locate_in_alignment(codon, aligned_seq, seq, pos):
        """ given <pos> in <seq>, find the codon's position in <aligned_seq> """
        # this only gets called on the gene with the *known* position, so it shouldn't fail
        assert utils.codon_ok(codon, seq, pos, debug=debug)
        aligned_pos = pos + count_gaps_before(aligned_seq, pos)
        assert utils.codon_ok(codon, aligned_seq, aligned_pos, debug=debug)
        return aligned_pos

    for region, codon in utils.conserved_codons[glfo["chain"]].items():
        pos_key = codon + "-positions"
        missing_genes = set(glfo["seqs"][region]) - set(glfo[pos_key])
        if not missing_genes:
            if debug:
                print("      no missing %s info" % codon)
            continue

        if debug:
            print("      missing %d %s positions" % (len(missing_genes), codon))

        aligned_seqs = get_new_alignments(glfo, region, debug=debug)
        # for g, s in aligned_seqs.items():
        #     print s, utils.color_gene(g)

        # if region == 'j':
        #     raise Exception('missing tryp position for %s, and we can\'t infer it because tryp positions don\'t reliably align to the same position' % ' '.join(missing_genes))

        # no existing positions at all for this codon: nothing to anchor on, so we have to give up
        if not glfo[pos_key]:
            if codon != "cyst":
                raise Exception("no existing %s info, and couldn't guess it, either" % codon)
            known_pos_in_alignment = 309
            print(
                "      assuming aligned %s position is %d (this will %s work if you're using imgt alignments)"
                % (codon, known_pos_in_alignment, utils.color("red", "only"))
            )
            raise Exception("not really using imgt alignments much any more, so this isn't really going to work")

        # existing codon position (this assumes that once aligned, all genes have the same codon position -- which is only really true for the imgt-gapped alignment)
        # take the first one for which we have the sequence (NOTE it would be safer to check that they're all the same)
        known_gene, known_pos = None, None
        for gene, pos in glfo[pos_key].items():
            if gene not in glfo["seqs"][region]:
                continue
            if gene not in aligned_seqs:
                continue
            if not utils.codon_ok(codon, glfo["seqs"][region][gene], pos):
                continue
            known_gene, known_pos = gene, pos
            break
        if known_gene is None:
            raise Exception("couldn't find a known %s position" % codon)

        # NOTE for cyst, should be 309 if alignments are imgt [which they used to usually be, but now probably aren't] (imgt says 104th codon --> subtract 1 to get zero-indexing, then multiply by three 3 * (104 - 1) = 309
        known_pos_in_alignment = locate_in_alignment(
            codon, aligned_seqs[known_gene], glfo["seqs"][region][known_gene], known_pos
        )
        if debug:
            print(
                "        using known position %d (aligned %d) in %s"
                % (known_pos, known_pos_in_alignment, known_gene)
            )

        seqons = []  # (seq, pos) pairs, one for each gene that we add a position for
        for gene in missing_genes:
            n_gaps = utils.count_gaps(aligned_seqs[gene], istop=known_pos_in_alignment)
            unaligned_pos = known_pos_in_alignment - n_gaps
            seqons.append((glfo["seqs"][region][gene], unaligned_pos))
            glfo[pos_key][gene] = unaligned_pos
            # if debug:
            #     tmpseq = aligned_seqs[gene]
            #     tmppos = known_pos_in_alignment
            #     print '    %s%s%s   %s    (new)' % (tmpseq[:tmppos], utils.color('reverse_video', tmpseq[tmppos : tmppos + 3]), tmpseq[tmppos + 3:], utils.color_gene(gene))

        utils.check_a_bunch_of_codons(codon, seqons, extra_str="          ", debug=debug)
        if debug:
            print("      added %d %s positions" % (len(seqons), codon))