예제 #1
0
    def _get_altered_sequence(self, strand, interval, var):
        """Return the sequence of `interval` with the edit of `var` applied.

        The reference bases are read with ``self.hdp.get_seq`` for ``var.ac``
        over ``[interval.start.base - 1, interval.end.base)``; the edit is
        applied to that window and the result is reverse-complemented when
        ``strand`` is -1.

        :param strand: -1 requests the reverse complement of the result
        :param interval: object exposing ``start.base`` and ``end.base``
        :param var: variant exposing ``ac``, ``posedit.pos`` and ``posedit.edit``
        :returns: the altered sequence as a string
        :raises HGVSUnsupportedOperationError: when the edit type is not one
            of sub, del, ins, delins, dup, inv or identity
        """
        window_start = interval.start.base
        bases = list(
            self.hdp.get_seq(var.ac, window_start - 1, interval.end.base))

        # Offsets of the variant inside `bases`: 0-based and half-open.
        location = var.posedit.pos
        lo = location.start.base - window_start
        hi = location.end.base - window_start + 1

        change = var.posedit.edit
        kind = change.type

        if kind == 'identity':
            pass  # nothing to apply
        elif kind == 'sub':
            bases[lo] = change.alt
        elif kind == 'ins':
            # the inserted bases go right after the first flanking position
            bases.insert(lo + 1, change.alt)
        elif kind == 'del':
            del bases[lo:hi]
        elif kind == 'delins':
            del bases[lo:hi]
            bases.insert(lo, change.alt)
        elif kind == 'dup':
            # repeat the affected bases immediately after themselves
            bases.insert(hi, ''.join(bases[lo:hi]))
        elif kind == 'inv':
            bases[lo:hi] = list(reverse_complement(''.join(bases[lo:hi])))
        else:
            raise HGVSUnsupportedOperationError(
                "Getting altered sequence for {type} is unsupported".format(type=kind))

        altered = ''.join(bases)
        return reverse_complement(altered) if strand == -1 else altered
예제 #2
0
    def _get_ref_alt(self, var, boundary):
        """Return the ``(ref, alt)`` allele pair described by `var`.

        The reference allele is empty for ins and dup edits.  For every other
        edit type it is the edit's own ``ref`` when one is given, and
        otherwise it is fetched with ``self._fetch_bounded_seq`` over the
        variant's span.  The alternative allele is derived from the edit type.

        :param var: variant exposing ``posedit.pos`` and ``posedit.edit``
        :param boundary: passed through unchanged to ``self._fetch_bounded_seq``
        :returns: tuple ``(ref, alt)``
        """
        edit = var.posedit.edit
        kind = edit.type
        span = var.posedit.pos

        def fetch_span():
            # sequence covered by the variant, fetched with 0-based start
            return self._fetch_bounded_seq(var, span.start.base - 1,
                                           span.end.base, 0, boundary)

        # Reference allele
        if kind in ("ins", "dup"):
            ref = ""
        elif edit.ref_s is None or edit.ref == "":
            # NARefAlt / Inv without an explicit reference
            ref = fetch_span()
        else:
            ref = edit.ref

        # Alternative allele (edit types not listed here leave `alt` unset)
        if kind in ("sub", "delins", "ins"):
            alt = edit.alt
        elif kind == "del":
            alt = ""
        elif kind == "dup":
            alt = edit.ref or fetch_span()
        elif kind == "inv":
            alt = reverse_complement(ref)
        elif kind == "identity":
            alt = ref

        return ref, alt
예제 #3
0
    def from_hgvs_obj(hgvs_var,
                      seq_fetcher=seq_utils.SeqRepoWrapper.get_instance()):
        """Build a VCFVariant from a parsed HGVS variant.

        The chromosome number is parsed from the accession (the integer
        between the first ``_`` and the ``.``).  A missing reference allele
        is read through `seq_fetcher`.  del, ins, dup and inv edits that would
        otherwise leave an empty allele are rewritten around a padding base.

        :param hgvs_var: variant exposing ``ac``, ``posedit.pos`` and ``posedit.edit``
        :param seq_fetcher: object providing ``get_seq`` and ``get_seq_at``
        :returns: ``VCFVariant(chromosome, position, ref, alt)``
        """
        chrom = int(hgvs_var.ac.split("_")[1].split('.')[0])
        contig = str(chrom)
        edit = hgvs_var.posedit.edit
        location = hgvs_var.posedit.pos

        # edits without an `alt` attribute, or with a falsy one, get ''
        alt = edit.alt if hasattr(edit, 'alt') else ''
        alt = alt or ''

        # the edit's string form is what the prefix tests below inspect
        edit_type = str(edit)
        is_ins = edit_type.startswith('ins')

        pos = location.start.base
        ref = edit.ref
        if not ref:
            # insertions read one position from start; everything else reads
            # through the end position
            if is_ins:
                stop = location.start.base + 1
            else:
                stop = location.end.base + 1
            ref = str(seq_fetcher.get_seq(contig, location.start.base, stop))

        if len(ref) >= 1 and len(alt) >= 1 and not is_ins:
            return VCFVariant(int(chrom), int(pos), ref, alt)

        # require padding, i.e. inserting previous base to avoid empty alt
        # e.g. instead of 'C'>'' do 'AC'>'A'
        if edit_type.startswith(('del', 'ins', 'dup', 'inv')):
            if edit_type.startswith(('del', 'dup')):
                pos -= 1

            # fetched for every one of the four edit kinds, though the inv
            # branch below does not use it
            padding = str(seq_fetcher.get_seq_at(contig, pos, 1))

            if is_ins:
                alt = padding + alt
            elif edit_type.startswith('dup'):
                alt = padding + ref
                ref = padding
            elif edit_type.startswith('del'):
                # transforming 'del' to a delins
                ref = padding + ref
                alt = padding + alt
            else:
                # inv
                alt = reverse_complement(ref)

        return VCFVariant(int(chrom), int(pos), ref, alt)
예제 #4
0
 def _convert_edit_check_strand(strand, edit_in):
     """
     Return a copy of `edit_in` expressed for the given strand.

     For strand 1 the edit is deep-copied unchanged.  For any other strand
     value a new edit of the same kind is built from reverse-complemented
     sequences.  Only NARefAlt, Dup and Inv edits are handled; anything else
     raises NotImplementedError.
     """

     def oriented_ref():
         # A ref that converts to an int is kept unchanged; anything else is
         # reverse-complemented.
         # TODO: should use ref_n, right?
         ref = edit_in.ref
         try:
             int(ref)
         except (ValueError, TypeError):
             return reverse_complement(ref)
         return ref

     forward = strand == 1

     if isinstance(edit_in, hgvs.edit.NARefAlt):
         if forward:
             return copy.deepcopy(edit_in)
         return hgvs.edit.NARefAlt(
             ref=oriented_ref(),
             alt=reverse_complement(edit_in.alt),
         )

     if isinstance(edit_in, hgvs.edit.Dup):
         if forward:
             return copy.deepcopy(edit_in)
         return hgvs.edit.Dup(ref=reverse_complement(edit_in.ref))

     if isinstance(edit_in, hgvs.edit.Inv):
         if forward:
             return copy.deepcopy(edit_in)
         return hgvs.edit.Inv(ref=oriented_ref())

     raise NotImplementedError(
         "Only NARefAlt/Dup/Inv types are currently implemented")
예제 #5
0
 def _convert_edit_check_strand(strand, edit_in):
     """
     Convert an edit from one type to another, based on the strand and type

     For strand 1 the edit is deep-copied unchanged.  For any other strand
     value a new edit is built from reverse-complemented sequences.

     :param strand: 1 keeps the edit as-is; any other value converts it
     :param edit_in: an hgvs.edit.NARefAlt or hgvs.edit.Dup instance
     :returns: a new edit object of the same kind as `edit_in`
     :raises NotImplementedError: if `edit_in` is neither NARefAlt nor Dup
     """
     if isinstance(edit_in, hgvs.edit.NARefAlt):
         if strand == 1:
             edit_out = copy.deepcopy(edit_in)
         else:
             try:
                 # if smells like an int, do nothing
                 # TODO: should use ref_n, right?
                 int(edit_in.ref)
                 ref = edit_in.ref
             except (ValueError, TypeError):
                 ref = reverse_complement(edit_in.ref)
             edit_out = hgvs.edit.NARefAlt(ref=ref, alt=reverse_complement(edit_in.alt))
     elif isinstance(edit_in, hgvs.edit.Dup):
         if strand == 1:
             edit_out = copy.deepcopy(edit_in)
         else:
             edit_out = hgvs.edit.Dup(seq=reverse_complement(edit_in.seq))
     else:
         # NotImplemented is a non-callable constant; the exception class is
         # NotImplementedError.
         raise NotImplementedError("Only NARefAlt/Dup types are currently implemented")
     return edit_out
예제 #6
0
파일: hgvs.py 프로젝트: p7k/tempus
 def simple_variant_from_hgvs(self,
                              variant: SequenceVariant) -> SimpleVariant:
     """
     Build a SimpleVariant from an hgvs variant.

     The contig is looked up in ``self.accession__contig`` by the variant's
     accession; the position is the start base of the variant.

     :param variant: hgvs variant.
     :return: simple variant.
     """
     change = variant.posedit.edit

     if isinstance(change, Dup):
         # a duplication is spelled out as the reference written twice
         alt_allele = change.ref_s * 2
     elif isinstance(change, Inv):
         alt_allele = reverse_complement(change.ref_s)
     else:
         alt_allele = change.alt

     return SimpleVariant(
         contig=self.accession__contig[variant.ac],
         pos=variant.posedit.pos.start.base,
         ref=change.ref,
         alt=alt_allele,
     )
예제 #7
0
    def _incorporate_inv(self):
        """Apply the inversion to the transcript sequence.

        Replaces ``seq[start:end]`` (as supplied by
        ``self._setup_incorporate``) with its reverse complement and wraps
        the result in an ``AltTranscriptData`` that is never flagged as a
        frameshift.
        """
        seq, cds_start, cds_stop, start, end = self._setup_incorporate()

        inverted = reverse_complement(''.join(seq[start:end]))
        seq[start:end] = list(inverted)

        # start base divided by three, rounded up, and never below 1
        c_start = self._var_c.posedit.pos.start.base
        variant_start_aa = max(int(math.ceil(c_start / 3.0)), 1)

        return AltTranscriptData(seq,
                                 cds_start,
                                 cds_stop,
                                 False,
                                 variant_start_aa,
                                 self._transcript_data.protein_accession,
                                 is_ambiguous=self._ref_has_multiple_stops)
예제 #8
0
    def normalize(self, var):
        """Perform sequence variants normalization for single variant

        The variant is validated when ``self.validator`` is set, its
        reference is filled in with ``var.fill_ref(self.hdp)``, and the
        alleles returned by ``self._normalize_alleles`` are re-expressed as an
        ``Inv``, ``Dup`` or ``NARefAlt`` edit.  c. variants are converted with
        ``self.hm.c_to_n`` before normalization and converted back with
        ``self.hm.n_to_c`` afterwards.

        :param var: a parsed ``hgvs.sequencevariant.SequenceVariant``
        :returns: `var` itself when it has no posedit, an uncertain posedit or
            no position; a deep copy of `var` for identity edits; otherwise a
            deep copy carrying the normalized position and edit
        :raises HGVSUnsupportedOperationError: for protein (p.) variants,
            conversion ("con") edits, and n./r. variants whose start or end
            offset is non-zero
        """
        assert isinstance(
            var, hgvs.sequencevariant.SequenceVariant
        ), "variant must be a parsed HGVS sequence variant object"

        if self.validator:
            self.validator.validate(var)

        if var.posedit is None or var.posedit.uncertain or var.posedit.pos is None:
            return var

        # remembered because `var` may be replaced by its n. equivalent below
        var_type = var.type

        if var_type == "p":
            raise HGVSUnsupportedOperationError(
                "Unsupported normalization of protein level variants: {0}".
                format(var))
        if var.posedit.edit.type == "con":
            # the message must be formatted with `.format`; passing `var` as a
            # second positional argument leaves the placeholder unfilled
            raise HGVSUnsupportedOperationError(
                "Unsupported normalization of conversion variants: {0}".
                format(var))

        var.fill_ref(self.hdp)

        if var.posedit.edit.type == "identity":
            var_norm = copy.deepcopy(var)
            return var_norm

        # For c. variants normalization, first convert to n. variant
        # and perform normalization at the n. level, then convert the
        # normalized n. variant back to c. variant.
        if var_type == "c":
            var = self.hm.c_to_n(var)

        if var.type in "nr":
            if var.posedit.pos.start.offset != 0 or var.posedit.pos.end.offset != 0:
                raise HGVSUnsupportedOperationError(
                    "Normalization of intronic variants is not supported")

        # g, m, n, r sequences all use sequence start as the datum
        # That's an essential assumption herein
        # (this is why we may have converted from c to n above)
        assert var.type in "gmnr", "Internal Error: variant must be of type g, m, n, r"

        bound_s, bound_e = self._get_boundary(var)
        boundary = (bound_s, bound_e)
        start, end, (ref, alt) = self._normalize_alleles(var, boundary)

        ref_len = len(ref)
        alt_len = len(alt)

        # Generate normalized variant; the three length cases are mutually
        # exclusive, so exactly one branch assigns ref_start/ref_end/edit.
        if alt_len == ref_len:
            ref_start = start
            ref_end = end - 1
            # inversion
            if ref_len > 1 and ref == reverse_complement(alt):
                edit = hgvs.edit.Inv(ref=ref)
            # ident
            elif ref_len == 0 and alt_len == 0:
                ref_start = ref_end
                edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
            # substitution or delins
            else:
                edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
        elif alt_len < ref_len:
            # del or delins
            ref_start = start
            ref_end = end - 1
            edit = hgvs.edit.NARefAlt(ref=ref,
                                      alt=None if alt_len == 0 else alt)
        else:
            # alt_len > ref_len: ins or dup
            if ref_len == 0:
                # fetch the adjacent sequence of the same length as alt, on
                # the side selected by self.shuffle_direction
                if self.shuffle_direction == 3:
                    adj_seq = self._fetch_bounded_seq(var, start - alt_len - 1,
                                                      end - 1, 0, boundary)
                else:
                    adj_seq = self._fetch_bounded_seq(var, start - 1,
                                                      start + alt_len - 1, 0,
                                                      boundary)
                # ins
                if alt != adj_seq:
                    ref_start = start - 1
                    ref_end = end
                    edit = hgvs.edit.NARefAlt(ref=None, alt=alt)
                # dup: the inserted bases repeat the adjacent sequence
                else:
                    if self.shuffle_direction == 3:
                        ref_start = start - alt_len
                        ref_end = end - 1
                        edit = hgvs.edit.Dup(ref=alt)
                    else:
                        ref_start = start
                        ref_end = start + alt_len - 1
                        edit = hgvs.edit.Dup(ref=alt)
            # delins
            else:
                ref_start = start
                ref_end = end - 1
                edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)

        # ensure the start is not 0
        if ref_start == 0:
            ref = self._fetch_bounded_seq(var, 0, 1, 0, boundary)
            alt = alt + ref
            edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
            ref_start = 1
            ref_end = 1

        # ensure the end is not outside of reference sequence
        tgt_len = self._get_tgt_length(var)
        if ref_end == tgt_len + 1:
            ref = self._fetch_bounded_seq(var, tgt_len - 1, tgt_len, 0,
                                          boundary)
            alt = ref + alt
            edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
            ref_start = tgt_len
            ref_end = tgt_len

        var_norm = copy.deepcopy(var)
        var_norm.posedit.edit = edit
        var_norm.posedit.pos.start.base = ref_start
        var_norm.posedit.pos.end.base = ref_end

        if var_type == "c":
            var_norm = self.hm.n_to_c(var_norm)

        return var_norm
예제 #9
0
파일: loading.py 프로젝트: pythseq/uta
def align_exons(session, opts, cf):
    """Align every not-yet-aligned exon pair and store its CIGAR string.

    Rows are selected from ``tx_alt_exon_pairs_v`` where ``exon_aln_id`` is
    NULL.  For each row both exon sequences are fetched, the alternate
    sequence is reverse-complemented when ``alt_strand`` equals
    ``MINUS_STRAND``, the pair is aligned, and the CIGAR string is inserted
    into ``exon_aln``.  Work is committed every ``update_period`` rows and
    once more after the last row.  The cursor and connection are always
    closed, including on the early return and on errors.

    :param session: object whose ``bind.pool.connect()`` yields the connection
    :param opts: not used by this function
    :param cf: configuration; ``cf.get("uta", "admin_role")`` names the role
        to assume, and `cf` is also passed to ``_get_seqfetcher``
    :returns: None
    """
    # N.B. setup.py declares dependencies for using uta as a client.  The
    # modules this loader relies on are loading dependencies only and are not
    # in setup.py.

    update_period = 1000

    def _get_cursor(con):
        cur = con.cursor(cursor_factory=psycopg2.extras.NamedTupleCursor)
        cur.execute("set role {admin_role};".format(
            admin_role=cf.get("uta", "admin_role")))
        cur.execute("set search_path = " + usam.schema_name)
        return cur

    def align(s1, s2):
        # Work only on the arguments so the helper does not depend on the
        # loop's local variables.
        s1, s2 = str(s1), str(s2)
        _score, cigar = utaaa.needleman_wunsch_gotoh_align(s1,
                                                           s2,
                                                           extended_cigar=True)
        tx_aseq, alt_aseq = utaaa.cigar_alignment(
            s1, s2, cigar, hide_match=False)
        return tx_aseq, alt_aseq, cigar.to_string()

    aln_sel_sql = """
    SELECT * FROM tx_alt_exon_pairs_v TAEP
    WHERE exon_aln_id is NULL
    ORDER BY tx_ac, alt_ac
    """

    aln_ins_sql = """
    INSERT INTO exon_aln (tx_exon_id,alt_exon_id,cigar,added)
    VALUES (%s,%s,%s,%s)
    """

    con = session.bind.pool.connect()
    cur = None
    try:
        cur = _get_cursor(con)
        cur.execute(aln_sel_sql)
        n_rows = cur.rowcount

        if n_rows == 0:
            return

        logger.info("{} exon pairs to align".format(n_rows))

        sf = _get_seqfetcher(cf)

        def _fetch_seq(ac, s, e):
            logger.debug("fetching sequence {ac}[{s}:{e}]".format(ac=ac,s=s,e=e))
            seq = sf.fetch(ac,s,e)
            assert seq is not None, "sequence {ac}[{s}:{e}] should never be None (coordinates bogus?)".format(ac=ac,s=s,e=e)
            if isinstance(seq, six.binary_type):
                seq = seq.decode("ascii")  # force into unicode
            assert isinstance(seq, six.text_type)
            return seq

        rows = cur.fetchall()
        ac_warning = set()      # accessions whose sequence could not be fetched
        tx_acs = set()          # transcripts aligned in the current update period
        aln_rate_s = None
        decay_rate = 0.25
        n0, t0 = 0, time.time()

        for i_r, r in enumerate(rows):
            # Commit here as well: rows that `continue` below never reach the
            # commit at the bottom of the loop.
            if i_r > 0 and (i_r % update_period == 0 or (i_r + 1) == n_rows):
                con.commit()

            if r.tx_ac in ac_warning or r.alt_ac in ac_warning:
                continue

            try:
                tx_seq = _fetch_seq(r.tx_ac, r.tx_start_i, r.tx_end_i)
            except KeyError:
                logger.warning(
                    "{r.tx_ac}: Not in sequence sources; can't align".format(r=r))
                ac_warning.add(r.tx_ac)
                continue

            try:
                alt_seq = _fetch_seq(r.alt_ac, r.alt_start_i, r.alt_end_i)
            except KeyError:
                logger.warning(
                    "{r.alt_ac}: Not in sequence sources; can't align".format(r=r))
                # remember the accession that is actually missing, so other
                # pairs of this transcript are still attempted
                ac_warning.add(r.alt_ac)
                continue

            if r.alt_strand == MINUS_STRAND:
                alt_seq = reverse_complement(alt_seq)
            tx_seq = tx_seq.upper()
            alt_seq = alt_seq.upper()

            tx_aseq, alt_aseq, cigar_str = align(tx_seq, alt_seq)

            added = datetime.datetime.now()
            cur.execute(aln_ins_sql, [r.tx_exon_id, r.alt_exon_id, cigar_str, added])
            tx_acs.add(r.tx_ac)

            if i_r > 0 and (i_r % update_period == 0 or (i_r + 1) == n_rows):
                con.commit()
                n1, t1 = i_r, time.time()
                nd, td = n1 - n0, t1 - t0
                aln_rate = nd / td      # aln rate on this update period
                if aln_rate_s is None:  # aln_rate_s is EWMA smoothed average
                    aln_rate_s = aln_rate
                else:
                    aln_rate_s = decay_rate * aln_rate + (1.0 - decay_rate) * aln_rate_s
                etr = (n_rows - i_r - 1) / aln_rate_s        # etr in secs
                etr_s = str(datetime.timedelta(seconds=round(etr)))  # etr as H:M:S
                logger.info("{i_r}/{n_rows} {p_r:.1f}%; committed; speed={speed:.1f}/{speed_s:.1f} aln/sec (inst/emwa); etr={etr:.0f}s ({etr_s}); {n_tx} tx".format(
                    i_r=i_r, n_rows=n_rows, p_r=i_r / n_rows * 100, speed=aln_rate, speed_s=aln_rate_s, etr=etr,
                    etr_s=etr_s, n_tx=len(tx_acs)))
                tx_acs = set()
                n0, t0 = n1, t1

        # Both in-loop commits require i_r > 0, so a single-row result set
        # would otherwise never be committed.
        con.commit()
    finally:
        if cur is not None:
            cur.close()
        con.close()

    logger.info("{} distinct sequence accessions not found".format(len(ac_warning)))
예제 #10
0
    def normalize(self, var):
        """Perform sequence variants normalization for single variant

        The variant is validated when ``self.validator`` is set, its
        reference is filled in with ``var.fill_ref(self.hdp)``, and the
        alleles returned by ``self._normalize_alleles`` are re-expressed as an
        ``Inv``, ``Dup`` or ``NARefAlt`` edit.  c. variants are converted with
        ``self.vm.c_to_n`` before normalization and converted back with
        ``self.vm.n_to_c`` afterwards.

        :param var: a parsed ``hgvs.sequencevariant.SequenceVariant``
        :returns: `var` itself when it has no posedit, an uncertain posedit,
            ``init_met`` set, or no position; a deep copy of `var` for
            identity edits; the variant as passed in when its coordinates are
            out of bounds and strict bounds are disabled; otherwise a deep
            copy carrying the normalized position and edit
        :raises HGVSUnsupportedOperationError: for protein (p.) variants,
            conversion ("con") edits, and n./r. variants whose start or end
            offset is non-zero
        :raises HGVSInvalidVariantError: when the coordinates are out of
            bounds and ``hgvs.global_config.mapping.strict_bounds`` is set
        """
        assert isinstance(
            var, hgvs.sequencevariant.SequenceVariant
        ), "variant must be a parsed HGVS sequence variant object"

        # keep a shallow reference to the original variant, to be returned
        # as-is under certain circumstances
        orig_var = var

        if self.validator:
            self.validator.validate(var)

        # NOTE(review): this tests var.posedit itself (not var.posedit.edit)
        # against AARefAlt; confirm that this is the intended object.
        init_met = False
        if var.posedit is not None and isinstance(var.posedit,
                                                  hgvs.edit.AARefAlt):
            init_met = var.posedit.init_met

        if var.posedit is None or var.posedit.uncertain or init_met or var.posedit.pos is None:
            return var

        # remembered because `var` may be replaced by its n. equivalent below
        var_type = var.type

        if var_type == "p":
            raise HGVSUnsupportedOperationError(
                "Unsupported normalization of protein level variants: {0}".
                format(var))
        if var.posedit.edit.type == "con":
            # the message must be formatted with `.format`; passing `var` as a
            # second positional argument leaves the placeholder unfilled
            raise HGVSUnsupportedOperationError(
                "Unsupported normalization of conversion variants: {0}".
                format(var))

        var.fill_ref(self.hdp)

        if var.posedit.edit.type == "identity":
            var_norm = copy.deepcopy(var)
            return var_norm

        # For c. variants normalization, first convert to n. variant
        # and perform normalization at the n. level, then convert the
        # normalized n. variant back to c. variant.
        if var_type == "c":
            var = self.vm.c_to_n(var)

        if var.type in "nr":
            if var.posedit.pos.start.offset != 0 or var.posedit.pos.end.offset != 0:
                raise HGVSUnsupportedOperationError(
                    "Normalization of intronic variants is not supported")

        def is_valid_pos(ac, pos):
            # tests whether the sequence position actually exists
            # This is *way* janky.
            # TODO: push functionality to hdp which can implement differently
            # based on capabilities of sequence backend
            try:
                s = self.hdp.get_seq(ac, pos - 1, pos)  # 0-based!
                return s != ""
            except HGVSDataNotAvailableError as e:
                # Bad Request indicates that we got to NCBI, but the request
                # was invalid.
                return "Bad Request" not in str(e)

        if var.posedit.pos.start.base < 0 or not is_valid_pos(
                var.ac, var.posedit.pos.end.base):
            if hgvs.global_config.mapping.strict_bounds:
                raise HGVSInvalidVariantError(
                    f"{var}: coordinates are out-of-bounds")
            _logger.warning(
                f"{var}: coordinates are out-of-bounds; returning as-is")
            return orig_var

        # restrict var types to those that use sequence start (i.e., not c.)
        assert var.type in "gmnr", "Internal Error: variant must be of type g, m, n, r"

        bound_s, bound_e = self._get_boundary(var)
        boundary = (bound_s, bound_e)
        start, end, (ref, alt) = self._normalize_alleles(var, boundary)

        ref_len = len(ref)
        alt_len = len(alt)

        # Generate normalized variant; the three length cases are mutually
        # exclusive, so exactly one branch assigns ref_start/ref_end/edit.
        if alt_len == ref_len:
            ref_start = start
            ref_end = end - 1
            # inversion
            if ref_len > 1 and ref == reverse_complement(alt):
                edit = hgvs.edit.Inv(ref=ref)
            # ident
            elif ref_len == 0 and alt_len == 0:
                ref_start = ref_end
                edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
            # substitution or delins
            else:
                edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
        elif alt_len < ref_len:
            # del or delins
            ref_start = start
            ref_end = end - 1
            edit = hgvs.edit.NARefAlt(ref=ref,
                                      alt=None if alt_len == 0 else alt)
        else:
            # alt_len > ref_len: ins or dup
            if ref_len == 0:
                # fetch the adjacent sequence of the same length as alt, on
                # the side selected by self.shuffle_direction
                if self.shuffle_direction == 3:
                    adj_seq = self._fetch_bounded_seq(var, start - alt_len - 1,
                                                      end - 1, 0, boundary)
                else:
                    adj_seq = self._fetch_bounded_seq(var, start - 1,
                                                      start + alt_len - 1, 0,
                                                      boundary)
                # ins
                if alt != adj_seq:
                    ref_start = start - 1
                    ref_end = end
                    edit = hgvs.edit.NARefAlt(ref=None, alt=alt)
                # dup: the inserted bases repeat the adjacent sequence
                else:
                    if self.shuffle_direction == 3:
                        ref_start = start - alt_len
                        ref_end = end - 1
                        edit = hgvs.edit.Dup(ref=alt)
                    else:
                        ref_start = start
                        ref_end = start + alt_len - 1
                        edit = hgvs.edit.Dup(ref=alt)
            # delins
            else:
                ref_start = start
                ref_end = end - 1
                edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)

        # ensure the start is not 0
        if ref_start == 0:
            ref = self._fetch_bounded_seq(var, 0, 1, 0, boundary)
            alt = alt + ref
            edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
            ref_start = 1
            ref_end = 1

        # ensure the end is not outside of reference sequence
        tgt_len = self._get_tgt_length(var)
        if ref_end == tgt_len + 1:
            ref = self._fetch_bounded_seq(var, tgt_len - 1, tgt_len, 0,
                                          boundary)
            alt = ref + alt
            edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
            ref_start = tgt_len
            ref_end = tgt_len

        var_norm = copy.deepcopy(var)
        var_norm.posedit.edit = edit
        var_norm.posedit.pos.start.base = ref_start
        var_norm.posedit.pos.end.base = ref_end

        if var_type == "c":
            var_norm = self.vm.n_to_c(var_norm)

        return var_norm