Beispiel #1
0
    def __init__(self, hdp, tx_ac, alt_ac, alt_aln_method):
        """Initialise mapping state for one transcript/alignment combination.

        When ``alt_aln_method`` is ``"transcript"`` only the CDS bounds and
        the total length are read from the transcript identity info.  For
        any other method, transcript info and exons are fetched from
        ``hdp``, consecutive exons (ordered by ``ord``) are checked for
        adjacency in transcript coordinates, and the CIGAR string plus the
        position tables derived from it are stored on the instance.

        :param hdp: data provider offering ``get_tx_info``,
            ``get_tx_exons`` and ``get_tx_identity_info``
        :param tx_ac: transcript accession
        :param alt_ac: accession of the sequence the transcript is aligned to
        :param alt_aln_method: alignment method name, or ``"transcript"``
        :raises HGVSDataNotAvailableError: if the provider returns ``None``
            for a required record, or two consecutive exons are not adjacent
        """
        self.tx_ac = tx_ac
        self.alt_ac = alt_ac
        self.alt_aln_method = alt_aln_method

        def unavailable(detail):
            # All data-availability failures share one prefix identifying
            # this mapper; it is only formatted when an error is raised.
            prefix = (
                "AlignmentMapper(tx_ac={self.tx_ac}, "
                "alt_ac={self.alt_ac}, alt_aln_method={self.alt_aln_method}): "
            ).format(self=self)
            return HGVSDataNotAvailableError(prefix + detail)

        if self.alt_aln_method == "transcript":
            # this covers the identity cases n <-> c
            identity = hdp.get_tx_identity_info(self.tx_ac)
            if identity is None:
                raise unavailable("No transcript identity info")
            self.cds_start_i = identity["cds_start_i"]
            self.cds_end_i = identity["cds_end_i"]
            self.tgt_len = sum(identity["lengths"])
        else:
            info = hdp.get_tx_info(self.tx_ac, self.alt_ac,
                                   self.alt_aln_method)
            if info is None:
                raise unavailable("No transcript info")

            exons = hdp.get_tx_exons(self.tx_ac, self.alt_ac,
                                     self.alt_aln_method)
            if exons is None:
                raise unavailable("No transcript exons")

            # hgvs-386: building the cigar string assumes that exons are
            # adjacent, so verify each neighbouring pair (1-based numbering
            # in the error message).
            by_ord = sorted(exons, key=lambda e: e["ord"])
            neighbours = zip(by_ord, by_ord[1:])
            for number, (prev, cur) in enumerate(neighbours, start=1):
                if prev["tx_end_i"] != cur["tx_start_i"]:
                    raise unavailable(
                        "Exons {a} and {b} are not adjacent".format(
                            a=number, b=number + 1))

            # Strand and offset come from the first exon as returned by the
            # provider (not from the ord-sorted copy).
            first_exon = exons[0]
            self.strand = first_exon["alt_strand"]
            self.gc_offset = first_exon["alt_start_i"]
            self.cds_start_i = info["cds_start_i"]
            self.cds_end_i = info["cds_end_i"]
            self.cigar = build_tx_cigar(exons, self.strand)
            parsed = self._parse_cigar(self.cigar)
            self.ref_pos, self.tgt_pos, self.cigar_op = parsed
            self.tgt_len = self.tgt_pos[-1]

        assert (self.cds_start_i is None) == (
            self.cds_end_i is None
        ), "CDS start and end must both be defined or neither defined"
Beispiel #2
0
            def setup_transcript_data(cls, hdp, tx_ac, pro_ac):
                """Build the RefTranscriptData that c_to_p needs for a transcript.

                :param hdp: data provider offering ``get_tx_identity_info``,
                    ``get_seq``, ``get_pro_ac_for_tx_ac`` and
                    ``get_acs_for_protein_seq``
                :param tx_ac: transcript accession
                :param pro_ac: protein accession, or ``None`` to have it
                    looked up from ``hdp``
                :returns: ``RefTranscriptData(tx_seq, protein_seq, cds_start,
                    cds_stop, pro_ac)`` where ``cds_start`` is 1-based
                :raises HGVSDataNotAvailableError: if the identity info or
                    the sequence for ``tx_ac`` is unavailable
                """
                # Look up by the tx_ac argument, like every other call in
                # this function, rather than by a variable from an outer scope.
                tx_info = hdp.get_tx_identity_info(tx_ac)
                tx_seq = hdp.get_seq(tx_ac)

                if tx_info is None or tx_seq is None:
                    raise HGVSDataNotAvailableError("Missing transcript data for accession: {}".format(tx_ac))

                # use 1-based hgvs coords
                cds_start = tx_info["cds_start_i"] + 1
                cds_stop = tx_info["cds_end_i"]

                # Pad the CDS with "N" up to the next multiple of three so
                # that translation sees whole codons only.  The previous
                # code called "".join() on the None returned by
                # list.extend(), which raised TypeError and never padded.
                tx_seq_to_translate = tx_seq[cds_start - 1:cds_stop]
                missing = -len(tx_seq_to_translate) % 3
                if missing:
                    tx_seq_to_translate += "N" * missing

                tx_seq_cds = Seq(tx_seq_to_translate)
                protein_seq = str(tx_seq_cds.translate())

                if pro_ac is None:
                    # get_acs... will always return at least the MD5_ accession
                    pro_ac = (hdp.get_pro_ac_for_tx_ac(tx_ac) or hdp.get_acs_for_protein_seq(protein_seq)[0])

                return RefTranscriptData(tx_seq, protein_seq, cds_start, cds_stop, pro_ac)
Beispiel #3
0
    def _alt_ac_for_tx_ac(self, tx_ac):
        """Return the single chromosomal accession that `tx_ac` aligns to.

        Candidates are the mapping options reported by ``self.hdp`` whose
        alignment method equals ``self.alt_aln_method`` and whose accession
        is one of ``self._assembly_accessions``.

        If several candidates remain, they are accepted only when their
        names in ``self._assembly_map`` are exactly X and Y (treated as a
        pseudoautosomal region); ``self.in_par_assume`` then decides which
        of them is returned.

        :param tx_ac: transcript accession
        :raises HGVSDataNotAvailableError: if there is no candidate alignment
        :raises HGVSError: if several candidates cannot be narrowed to one
        """

        def describe(template, **extra):
            # Fill in the fields shared by all error messages; evaluated
            # only when an error is actually being raised.
            return template.format(tx_ac=tx_ac,
                                   an=self.assembly_name,
                                   am=self.alt_aln_method,
                                   **extra)

        candidates = []
        for option in self.hdp.get_tx_mapping_options(tx_ac):
            if (option["alt_aln_method"] == self.alt_aln_method
                    and option["alt_ac"] in self._assembly_accessions):
                candidates.append(option["alt_ac"])

        if not candidates:
            raise HGVSDataNotAvailableError(
                describe("No alignments for {tx_ac} in {an} using {am}"))

        # TODO: conditional is unnecessary; remove
        if len(candidates) > 1:
            chrom_names = {self._assembly_map[ac] for ac in candidates}
            if chrom_names != {"X", "Y"}:
                listing = ", ".join(
                    "{ac} ({n})".format(ac=ac, n=self._assembly_map[ac])
                    for ac in candidates)
                raise HGVSError(
                    describe(
                        "Multiple chromosomal alignments for {tx_ac} in {an}"
                        " using {am} (non-pseudoautosomal region) [{alts}]",
                        alts=listing))

            # From here on the candidates are assumed to be a PAR pair.
            if self.in_par_assume is None:
                raise HGVSError(
                    describe(
                        "Multiple chromosomal alignments for {tx_ac} in {an}"
                        " using {am} (likely pseudoautosomal region)"))

            candidates = [
                ac for ac in candidates
                if self._assembly_map[ac] == self.in_par_assume
            ]
            if len(candidates) != 1:
                raise HGVSError(
                    describe(
                        "Multiple chromosomal alignments for {tx_ac} in {an}"
                        " using {am}; in_par_assume={ipa} selected {n} of them",
                        ipa=self.in_par_assume,
                        n=len(candidates)))

        assert len(
            candidates) == 1, "Should have exactly one alignment at this point"
        return candidates[0]
Beispiel #4
0
    def __init__(self, hdp, tx_ac, pro_ac):
        """Collect the reference transcript data used by c_to_p.

        Stores the transcript sequence, its translated protein sequence,
        the CDS bounds (1-based start, end as given by the provider) and
        the protein accession on the instance.

        :param hdp: data provider offering ``get_tx_identity_info``,
            ``get_seq``, ``get_pro_ac_for_tx_ac`` and
            ``get_acs_for_protein_seq``
        :param tx_ac: transcript accession
        :param pro_ac: protein accession, or ``None`` to look it up via ``hdp``
        :raises HGVSDataNotAvailableError: if identity info or sequence for
            ``tx_ac`` is unavailable
        :raises NotImplementedError: if the CDS length is not a multiple of 3
        """
        identity = hdp.get_tx_identity_info(tx_ac)
        transcript = hdp.get_seq(tx_ac)

        if identity is None or transcript is None:
            raise HGVSDataNotAvailableError(
                "Missing transcript data for accession: {}".format(tx_ac))

        # use 1-based hgvs coords
        cds_start = identity["cds_start_i"] + 1
        cds_stop = identity["cds_end_i"]

        # coding sequences whose length is not divisible by 3 are not yet
        # supported
        coding = transcript[cds_start - 1:cds_stop]
        coding_len = len(coding)
        if coding_len % 3 != 0:
            raise NotImplementedError(
                "Transcript {} is not supported because its sequence length of {} is not divisible by 3."
                .format(tx_ac, coding_len))

        protein = translate_cds(coding)

        if pro_ac is None:
            # get_acs... will always return at least the MD5_ accession
            # TODO: drop get_acs_for_protein_seq; use known mapping or digest (wo/pro ac inference)
            pro_ac = hdp.get_pro_ac_for_tx_ac(tx_ac)
            if not pro_ac:
                pro_ac = hdp.get_acs_for_protein_seq(protein)[0]

        self.transcript_sequence = transcript
        self.aa_sequence = protein
        self.cds_start = cds_start
        self.cds_stop = cds_stop
        self.protein_accession = pro_ac
Beispiel #5
0
 def _ensure_schema_exists(self):
     """Check that the schema named by ``self.url.schema`` exists.

     Returns ``None`` when the existence query reports a true value.

     :raises HGVSDataNotAvailableError: if the query reports that the
         schema does not exist
     """
     # N.B. On AWS RDS, information_schema.schemata always returns zero rows,
     # so the lookup goes through pg_namespace.
     query = "select exists(SELECT 1 FROM pg_namespace WHERE nspname = %s)"
     row = self._fetchone(query, [self.url.schema])
     if not row[0]:
         raise HGVSDataNotAvailableError(
             "specified schema ({}) does not exist (url={})".format(
                 self.url.schema, self.url))
Beispiel #6
0
 def _get_tgt_length(self, var):
     """Return the total length of the reference sequence for `var`.

     Variants of type ``g`` or ``m`` are treated as unbounded and yield
     ``float("inf")``.  For every other type the length is the sum of the
     ``lengths`` entry of the identity info that ``self.hdp`` reports for
     ``var.ac``.

     :raises HGVSDataNotAvailableError: if no identity info is returned
     """
     if var.type in ("g", "m"):
         return float("inf")

     # Look up the identity info for this accession
     info = self.hdp.get_tx_identity_info(var.ac)
     if not info:
         raise HGVSDataNotAvailableError("No identity info available for {ac}".format(ac=var.ac))
     return sum(info["lengths"])
Beispiel #7
0
    def _get_boundary(self, var):
        """Return the ``(left, right)`` bounds limiting where `var` may move.

        For variant types other than ``r`` and ``n``, or when
        ``self.cross_boundaries`` is set, the bounds are ``(0, inf)``.
        Otherwise the bounds are the transcript-coordinate limits of the
        exon containing the variant, narrowed further so that they do not
        cross the CDS start or CDS end.  Variant positions are converted by
        subtracting 1 before being compared with the ``tx_start_i`` /
        ``tx_end_i`` / ``cds_start_i`` / ``cds_end_i`` values from the data
        provider.

        :param var: variant exposing ``type``, ``ac`` and
            ``posedit.pos.start.base`` / ``posedit.pos.end.base``
        :returns: tuple ``(left, right)``
        :raises HGVSDataNotAvailableError: if mapping options, transcript
            info or exons are unavailable for ``var.ac`` with
            ``self.alt_aln_method``
        :raises HGVSUnsupportedOperationError: if the variant spans an
            exon boundary or a UTR/CDS boundary
        """
        if var.type not in ("r", "n") or self.cross_boundaries:
            # g, m, etc., or boundaries explicitly ignored: no constraint.
            return 0, float("inf")

        # Pick the alignment target for this transcript and method.
        map_info = self.hdp.get_tx_mapping_options(var.ac)
        if not map_info:
            raise HGVSDataNotAvailableError(
                "No mapping info available for {ac}".format(ac=var.ac))
        map_info = [
            item for item in map_info
            if item["alt_aln_method"] == self.alt_aln_method
        ]
        if not map_info:
            # Previously this fell through to map_info[0] and surfaced as a
            # bare IndexError.
            raise HGVSDataNotAvailableError(
                "No mapping info available for {ac} using {am}".format(
                    ac=var.ac, am=self.alt_aln_method))
        alt_ac = map_info[0]["alt_ac"]

        # Get tx info
        tx_info = self.hdp.get_tx_info(var.ac, alt_ac, self.alt_aln_method)
        if tx_info is None:
            raise HGVSDataNotAvailableError(
                "No transcript info available for {ac} on {alt_ac} using {am}"
                .format(ac=var.ac, alt_ac=alt_ac, am=self.alt_aln_method))
        cds_start = tx_info["cds_start_i"]
        cds_end = tx_info["cds_end_i"]

        # Get exon info
        exon_info = self.hdp.get_tx_exons(var.ac, alt_ac, self.alt_aln_method)
        if not exon_info:
            raise HGVSDataNotAvailableError(
                "No exon info available for {ac} on {alt_ac} using {am}"
                .format(ac=var.ac, alt_ac=alt_ac, am=self.alt_aln_method))
        exon_starts = sorted(exon["tx_start_i"] for exon in exon_info)
        exon_ends = sorted(exon["tx_end_i"] for exon in exon_info)
        # Sentinel interval covering everything past the last exon end.
        exon_starts.append(exon_ends[-1])
        exon_ends.append(float("inf"))

        start_i = var.posedit.pos.start.base - 1
        end_i = var.posedit.pos.end.base - 1

        # TODO: #242: implement methods to find tx regions
        def interval_index(pos_i):
            # Index of the first interval containing pos_i; when none does,
            # the last index is used, as the original search loop did.
            for idx, (lo, hi) in enumerate(zip(exon_starts, exon_ends)):
                if lo <= pos_i < hi:
                    return idx
            return len(exon_starts) - 1

        i = interval_index(start_i)
        j = interval_index(end_i)
        if i != j:
            raise HGVSUnsupportedOperationError(
                "Unsupported normalization of variants spanning the exon-intron boundary ({var})"
                .format(var=var))

        left = exon_starts[i]
        right = exon_ends[i]

        # Do not let the bounds cross the CDS start ...
        if cds_start is None:
            pass
        elif end_i < cds_start:
            right = min(right, cds_start)
        elif start_i >= cds_start:
            left = max(left, cds_start)
        else:
            raise HGVSUnsupportedOperationError(
                "Unsupported normalization of variants spanning the UTR-exon boundary ({var})"
                .format(var=var))

        # ... nor the CDS end.
        if cds_end is None:
            pass
        elif start_i >= cds_end:
            left = max(left, cds_end)
        elif end_i < cds_end:
            right = min(right, cds_end)
        else:
            raise HGVSUnsupportedOperationError(
                "Unsupported normalization of variants spanning the exon-UTR boundary ({var})"
                .format(var=var))

        return left, right