def _get_altered_sequence(self, strand, interval, var): seq = list(self.hdp.get_seq(var.ac, interval.start.base - 1, interval.end.base)) # positions are 0-based and half-open pos_start = var.posedit.pos.start.base - interval.start.base pos_end = var.posedit.pos.end.base - interval.start.base + 1 edit = var.posedit.edit if edit.type == 'sub': seq[pos_start] = edit.alt elif edit.type == 'del': del seq[pos_start:pos_end] elif edit.type == 'ins': seq.insert(pos_start + 1, edit.alt) elif edit.type == 'delins': del seq[pos_start:pos_end] seq.insert(pos_start, edit.alt) elif edit.type == 'dup': seq.insert(pos_end, ''.join(seq[pos_start:pos_end])) elif edit.type == 'inv': seq[pos_start:pos_end] = list(reverse_complement(''.join(seq[pos_start:pos_end]))) elif edit.type == 'identity': pass else: raise HGVSUnsupportedOperationError( "Getting altered sequence for {type} is unsupported".format(type=edit.type)) seq = ''.join(seq) if strand == -1: seq = reverse_complement(seq) return seq
def _get_ref_alt(self, var, boundary): """Get reference allele and alternative allele of the variant """ # Get reference allele if var.posedit.edit.type == "ins" or var.posedit.edit.type == "dup": ref = "" else: # For NARefAlt and Inv if var.posedit.edit.ref_s is None or var.posedit.edit.ref == "": ref = self._fetch_bounded_seq(var, var.posedit.pos.start.base - 1, var.posedit.pos.end.base, 0, boundary) else: ref = var.posedit.edit.ref # Get alternative allele if var.posedit.edit.type == "sub" or var.posedit.edit.type == "delins" or var.posedit.edit.type == "ins": alt = var.posedit.edit.alt elif var.posedit.edit.type == "del": alt = "" elif var.posedit.edit.type == "dup": alt = var.posedit.edit.ref or self._fetch_bounded_seq( var, var.posedit.pos.start.base - 1, var.posedit.pos.end.base, 0, boundary) elif var.posedit.edit.type == "inv": alt = reverse_complement(ref) elif var.posedit.edit.type == "identity": alt = ref return ref, alt
def from_hgvs_obj(hgvs_var, seq_fetcher=seq_utils.SeqRepoWrapper.get_instance()):
    """Build a ``VCFVariant`` from a parsed HGVS variant object.

    The chromosome number is taken from the accession (the integer between
    the first ``_`` and the version dot). Missing reference bases are fetched
    through ``seq_fetcher``. Edits whose string form starts with ``del``,
    ``ins``, ``dup`` or ``inv`` and that cannot be returned directly are
    rewritten (padded with a fetched base where needed) so that neither
    allele is empty.

    :param hgvs_var: variant providing ``ac`` and ``posedit``
    :param seq_fetcher: object offering ``get_seq`` and ``get_seq_at``
    :returns: a ``VCFVariant(chromosome, position, ref, alt)``
    """
    chrom = int(hgvs_var.ac.split("_")[1].split('.')[0])
    contig = str(chrom)

    edit = hgvs_var.posedit.edit
    location = hgvs_var.posedit.pos

    # Edits without an ``alt`` attribute, or with an empty/None one, get ''.
    alt = (edit.alt if hasattr(edit, 'alt') else '') or ''

    edit_str = str(edit)
    is_ins = edit_str.startswith('ins')
    is_inv = edit_str.startswith('inv')

    pos = location.start.base
    ref = edit.ref
    if not ref:
        # For insertions only the start position is used for the fetch;
        # otherwise the fetch runs up to the end position.
        last = location.start.base if is_ins else location.end.base
        ref = str(seq_fetcher.get_seq(contig, location.start.base, last + 1))

    if len(ref) >= 1 and len(alt) >= 1 and not is_ins:
        return VCFVariant(int(chrom), int(pos), ref, alt)

    # require padding, i.e. inserting previous base to avoid empty alt
    # e.g. instead of 'C'>'' do 'AC'>'A'
    if edit_str.startswith(('del', 'ins', 'dup', 'inv')):
        if not (is_ins or is_inv):
            pos -= 1

        # transforming 'del' to a delins
        padding = str(seq_fetcher.get_seq_at(contig, pos, 1))

        if is_ins:
            alt = padding + alt
        elif edit_str.startswith('dup'):
            alt = padding + ref
            ref = padding
        elif edit_str.startswith('del'):
            ref = padding + ref
            alt = padding + alt
        elif is_inv:
            alt = reverse_complement(ref)

    return VCFVariant(int(chrom), int(pos), ref, alt)
def _convert_edit_check_strand(strand, edit_in):
    """Return a copy of ``edit_in`` expressed for the given strand.

    For ``strand == 1`` the edit is deep-copied unchanged. Otherwise a new
    edit of the same kind is built from reverse-complemented sequences.

    :param strand: strand indicator; 1 means no conversion is needed
    :param edit_in: an ``hgvs.edit.NARefAlt``, ``Dup`` or ``Inv`` instance
    :returns: the converted edit
    :raises NotImplementedError: for any other edit class
    """

    def _flip_ref(ref):
        # if smells like an int, do nothing
        # TODO: should use ref_n, right?
        try:
            int(ref)
        except (ValueError, TypeError):
            return reverse_complement(ref)
        return ref

    same_strand = strand == 1

    if isinstance(edit_in, hgvs.edit.NARefAlt):
        if same_strand:
            return copy.deepcopy(edit_in)
        return hgvs.edit.NARefAlt(
            ref=_flip_ref(edit_in.ref),
            alt=reverse_complement(edit_in.alt),
        )

    if isinstance(edit_in, hgvs.edit.Dup):
        if same_strand:
            return copy.deepcopy(edit_in)
        return hgvs.edit.Dup(ref=reverse_complement(edit_in.ref))

    if isinstance(edit_in, hgvs.edit.Inv):
        if same_strand:
            return copy.deepcopy(edit_in)
        return hgvs.edit.Inv(ref=_flip_ref(edit_in.ref))

    raise NotImplementedError(
        "Only NARefAlt/Dup/Inv types are currently implemented")
def _convert_edit_check_strand(strand, edit_in):
    """Convert an edit from one type to another, based on the strand and type.

    For ``strand == 1`` the edit is deep-copied unchanged. Otherwise a new
    edit of the same kind is built from reverse-complemented sequences.

    :param strand: strand indicator; 1 means no conversion is needed
    :param edit_in: an ``hgvs.edit.NARefAlt`` or ``hgvs.edit.Dup`` instance
    :returns: the converted edit
    :raises NotImplementedError: for any other edit class
    """
    if isinstance(edit_in, hgvs.edit.NARefAlt):
        if strand == 1:
            edit_out = copy.deepcopy(edit_in)
        else:
            try:
                # if smells like an int, do nothing
                # TODO: should use ref_n, right?
                int(edit_in.ref)
            except (ValueError, TypeError):
                ref = reverse_complement(edit_in.ref)
            else:
                ref = edit_in.ref
            edit_out = hgvs.edit.NARefAlt(ref=ref, alt=reverse_complement(edit_in.alt))
    elif isinstance(edit_in, hgvs.edit.Dup):
        if strand == 1:
            edit_out = copy.deepcopy(edit_in)
        else:
            edit_out = hgvs.edit.Dup(seq=reverse_complement(edit_in.seq))
    else:
        # NotImplemented is a non-callable constant; the exception class is
        # NotImplementedError.
        raise NotImplementedError("Only NARefAlt/Dup types are currently implemented")
    return edit_out
def simple_variant_from_hgvs(self, variant: SequenceVariant) -> SimpleVariant:
    """
    Convert an hgvs variant into a ``SimpleVariant``.

    The alternative allele depends on the edit class: for ``Dup`` it is the
    reference string repeated twice, for ``Inv`` the reverse complement of
    the reference string, and for anything else the edit's own ``alt``.

    :param variant: hgvs variant.
    :return: simple variant.
    """
    edit = variant.posedit.edit

    if isinstance(edit, Dup):
        alt_allele = edit.ref_s * 2
    elif isinstance(edit, Inv):
        alt_allele = reverse_complement(edit.ref_s)
    else:
        alt_allele = edit.alt

    # the contig is looked up from the accession via self.accession__contig
    contig = self.accession__contig[variant.ac]
    position = variant.posedit.pos.start.base
    ref_allele = variant.posedit.edit.ref

    return SimpleVariant(contig=contig, pos=position, ref=ref_allele, alt=alt_allele)
def _incorporate_inv(self):
    """Incorporate inv into sequence.

    Replaces the ``start:end`` slice of the sequence returned by
    ``self._setup_incorporate()`` with its reverse complement and wraps the
    result in an ``AltTranscriptData``. The result is never flagged as a
    frameshift.

    :returns: the ``AltTranscriptData`` describing the altered transcript
    """
    seq, cds_start, cds_stop, start, end = self._setup_incorporate()

    original_segment = ''.join(seq[start:end])
    seq[start:end] = list(reverse_complement(original_segment))

    # 1-based index of the codon holding the variant start, never below 1
    codon_index = int(math.ceil(self._var_c.posedit.pos.start.base / 3.0))
    variant_start_aa = max(codon_index, 1)

    return AltTranscriptData(
        seq,
        cds_start,
        cds_stop,
        False,  # is_frameshift
        variant_start_aa,
        self._transcript_data.protein_accession,
        is_ambiguous=self._ref_has_multiple_stops,
    )
def normalize(self, var):
    """Perform sequence variants normalization for single variant.

    :param var: a parsed ``hgvs.sequencevariant.SequenceVariant``; note that
        ``var.fill_ref(self.hdp)`` is called on it, so it may be modified
    :returns: the normalized variant; ``var`` itself is returned unchanged
        when it has no posedit, an uncertain posedit or no position
    :raises HGVSUnsupportedOperationError: for protein-level variants,
        conversion edits, and n./r. variants with intronic offsets
    """
    assert isinstance(
        var, hgvs.sequencevariant.SequenceVariant
    ), "variant must be a parsed HGVS sequence variant object"

    if self.validator:
        self.validator.validate(var)

    if var.posedit is None or var.posedit.uncertain or var.posedit.pos is None:
        return var

    # remember the input type: ``var`` may be replaced by its n. equivalent
    var_type = var.type

    if var_type == "p":
        raise HGVSUnsupportedOperationError(
            "Unsupported normalization of protein level variants: {0}".format(var))
    if var.posedit.edit.type == "con":
        raise HGVSUnsupportedOperationError(
            "Unsupported normalization of conversion variants: {0}".format(var))

    var.fill_ref(self.hdp)

    if var.posedit.edit.type == "identity":
        return copy.deepcopy(var)

    # For c. variants normalization, first convert to n. variant
    # and perform normalization at the n. level, then convert the
    # normalized n. variant back to c. variant.
    if var_type == "c":
        var = self.hm.c_to_n(var)

    if var.type in "nr":
        if var.posedit.pos.start.offset != 0 or var.posedit.pos.end.offset != 0:
            raise HGVSUnsupportedOperationError(
                "Normalization of intronic variants is not supported")

    # g, m, n, r sequences all use sequence start as the datum
    # That's an essential assumption herein
    # (this is why we may have converted from c to n above)
    assert var.type in "gmnr", "Internal Error: variant must be of type g, m, n, r"

    bound_s, bound_e = self._get_boundary(var)
    boundary = (bound_s, bound_e)
    start, end, (ref, alt) = self._normalize_alleles(var, boundary)

    ref_len = len(ref)
    alt_len = len(alt)

    # Generate normalized variant
    if alt_len == ref_len:
        ref_start = start
        ref_end = end - 1
        # inversion
        if ref_len > 1 and ref == reverse_complement(alt):
            edit = hgvs.edit.Inv(ref=ref)
        # ident
        elif ref_len == 0 and alt_len == 0:
            ref_start = ref_end
            edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
        # substitution or delins
        else:
            edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
    elif alt_len < ref_len:
        # del or delins
        ref_start = start
        ref_end = end - 1
        edit = hgvs.edit.NARefAlt(ref=ref, alt=None if alt_len == 0 else alt)
    elif alt_len > ref_len:
        # ins or dup
        if ref_len == 0:
            # Fetch the reference bases next to the insertion site, on the
            # side selected by the shuffle direction; an insertion equal to
            # them is reported as a dup.
            if self.shuffle_direction == 3:
                adj_seq = self._fetch_bounded_seq(var, start - alt_len - 1, end - 1, 0,
                                                  boundary)
            else:
                adj_seq = self._fetch_bounded_seq(var, start - 1, start + alt_len - 1, 0,
                                                  boundary)
            # ins
            if alt != adj_seq:
                ref_start = start - 1
                ref_end = end
                edit = hgvs.edit.NARefAlt(ref=None, alt=alt)
            # dup
            else:
                if self.shuffle_direction == 3:
                    ref_start = start - alt_len
                    ref_end = end - 1
                    edit = hgvs.edit.Dup(ref=alt)
                else:
                    ref_start = start
                    ref_end = start + alt_len - 1
                    edit = hgvs.edit.Dup(ref=alt)
        # delins
        else:
            ref_start = start
            ref_end = end - 1
            edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)

    # ensure the start is not 0
    if ref_start == 0:
        ref = self._fetch_bounded_seq(var, 0, 1, 0, boundary)
        alt = alt + ref
        edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
        ref_start = 1
        ref_end = 1

    # ensure the end is not outside of reference sequence
    tgt_len = self._get_tgt_length(var)
    if ref_end == tgt_len + 1:
        ref = self._fetch_bounded_seq(var, tgt_len - 1, tgt_len, 0, boundary)
        alt = ref + alt
        edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
        ref_start = tgt_len
        ref_end = tgt_len

    var_norm = copy.deepcopy(var)
    var_norm.posedit.edit = edit
    var_norm.posedit.pos.start.base = ref_start
    var_norm.posedit.pos.end.base = ref_end

    if var_type == "c":
        var_norm = self.hm.n_to_c(var_norm)

    return var_norm
def align_exons(session, opts, cf):
    """Align every transcript/alternate exon pair that has no alignment yet.

    Rows are read from ``tx_alt_exon_pairs_v`` where ``exon_aln_id`` is NULL.
    For each row both sequences are fetched, the alternate sequence is
    reverse-complemented when it is on the minus strand, the two are aligned,
    and the resulting CIGAR string is inserted into ``exon_aln``. Work is
    committed every ``update_period`` rows and once more at the end.

    :param session: database session; the raw connection is obtained from
        ``session.bind.pool``
    :param opts: unused in this function
    :param cf: configuration object; ``cf.get("uta", "admin_role")`` is read
        here and ``cf`` is also passed to ``_get_seqfetcher``
    :returns: None
    """
    # N.B. setup.py declares dependencies for using uta as a client. The
    # loading-only dependencies used in this function are not in setup.py.
    from contextlib import closing

    update_period = 1000

    def _get_cursor(con):
        cur = con.cursor(cursor_factory=psycopg2.extras.NamedTupleCursor)
        cur.execute("set role {admin_role};".format(
            admin_role=cf.get("uta", "admin_role")))
        cur.execute("set search_path = " + usam.schema_name)
        return cur

    def align(s1, s2):
        # Align the two sequences that were passed in (not whatever happens
        # to be bound in the enclosing scope).
        _score, cigar = utaaa.needleman_wunsch_gotoh_align(str(s1),
                                                           str(s2),
                                                           extended_cigar=True)
        tx_aseq, alt_aseq = utaaa.cigar_alignment(
            s1, s2, cigar, hide_match=False)
        return tx_aseq, alt_aseq, cigar.to_string()

    def _fetch_seq(ac, s, e):
        logger.debug("fetching sequence {ac}[{s}:{e}]".format(ac=ac, s=s, e=e))
        seq = sf.fetch(ac, s, e)
        assert seq is not None, "sequence {ac}[{s}:{e}] should never be None (coordinates bogus?)".format(ac=ac, s=s, e=e)
        if isinstance(seq, six.binary_type):
            seq = seq.decode("ascii")  # force into unicode
        assert isinstance(seq, six.text_type)
        return seq

    aln_sel_sql = """
    SELECT * FROM tx_alt_exon_pairs_v TAEP
    WHERE exon_aln_id is NULL
    ORDER BY tx_ac, alt_ac
    """

    aln_ins_sql = """
    INSERT INTO exon_aln (tx_exon_id,alt_exon_id,cigar,added) VALUES (%s,%s,%s,%s)
    """

    # closing() guarantees the cursor and the connection are released on the
    # early return and on errors, not only on the normal path.
    with closing(session.bind.pool.connect()) as con, closing(_get_cursor(con)) as cur:
        cur.execute(aln_sel_sql)
        n_rows = cur.rowcount

        if n_rows == 0:
            return

        logger.info("{} exon pairs to align".format(n_rows))

        sf = _get_seqfetcher(cf)

        rows = cur.fetchall()
        ac_warning = set()      # accessions already reported as missing
        tx_acs = set()          # transcripts aligned in the current update period
        aln_rate_s = None
        decay_rate = 0.25
        n0, t0 = 0, time.time()

        for i_r, r in enumerate(rows):
            at_checkpoint = i_r > 0 and (i_r % update_period == 0 or (i_r + 1) == n_rows)

            # Commit here as well: rows skipped via `continue` below never
            # reach the commit at the bottom of the loop body.
            if at_checkpoint:
                con.commit()

            if r.tx_ac in ac_warning or r.alt_ac in ac_warning:
                continue

            try:
                tx_seq = _fetch_seq(r.tx_ac, r.tx_start_i, r.tx_end_i)
            except KeyError:
                logger.warning(
                    "{r.tx_ac}: Not in sequence sources; can't align".format(r=r))
                ac_warning.add(r.tx_ac)
                continue

            try:
                alt_seq = _fetch_seq(r.alt_ac, r.alt_start_i, r.alt_end_i)
            except KeyError:
                logger.warning(
                    "{r.alt_ac}: Not in sequence sources; can't align".format(r=r))
                # it is the alternate accession that is missing, so that is
                # the one to skip from now on
                ac_warning.add(r.alt_ac)
                continue

            if r.alt_strand == MINUS_STRAND:
                alt_seq = reverse_complement(alt_seq)
            tx_seq = tx_seq.upper()
            alt_seq = alt_seq.upper()

            tx_aseq, alt_aseq, cigar_str = align(tx_seq, alt_seq)

            added = datetime.datetime.now()
            cur.execute(aln_ins_sql, [r.tx_exon_id, r.alt_exon_id, cigar_str, added])
            tx_acs.add(r.tx_ac)

            if at_checkpoint:
                con.commit()
                n1, t1 = i_r, time.time()
                nd, td = n1 - n0, t1 - t0
                aln_rate = nd / td      # aln rate on this update period
                if aln_rate_s is None:  # aln_rate_s is EWMA smoothed average
                    aln_rate_s = aln_rate
                else:
                    aln_rate_s = decay_rate * aln_rate + (1.0 - decay_rate) * aln_rate_s
                etr = (n_rows - i_r - 1) / aln_rate_s                # etr in secs
                etr_s = str(datetime.timedelta(seconds=round(etr)))  # etr as H:M:S
                logger.info("{i_r}/{n_rows} {p_r:.1f}%; committed; speed={speed:.1f}/{speed_s:.1f} aln/sec (inst/emwa); etr={etr:.0f}s ({etr_s}); {n_tx} tx".format(
                    i_r=i_r, n_rows=n_rows, p_r=i_r / n_rows * 100, speed=aln_rate, speed_s=aln_rate_s, etr=etr,
                    etr_s=etr_s, n_tx=len(tx_acs)))
                tx_acs = set()
                n0, t0 = n1, t1

        # Flush anything still pending; in particular a single-row result
        # set never satisfies the `i_r > 0` checkpoint condition above.
        con.commit()

    logger.info("{} distinct sequence accessions not found".format(len(ac_warning)))
def normalize(self, var):
    """Perform sequence variants normalization for single variant.

    :param var: a parsed ``hgvs.sequencevariant.SequenceVariant``; note that
        ``var.fill_ref(self.hdp)`` is called on it, so it may be modified
    :returns: the normalized variant. The input is returned as-is when it has
        no posedit, an uncertain posedit, no position, or (with
        ``strict_bounds`` disabled) out-of-bounds coordinates
    :raises HGVSUnsupportedOperationError: for protein-level variants,
        conversion edits, and n./r. variants with intronic offsets
    :raises HGVSInvalidVariantError: for out-of-bounds coordinates when
        ``hgvs.global_config.mapping.strict_bounds`` is set
    """
    assert isinstance(
        var, hgvs.sequencevariant.SequenceVariant
    ), "variant must be a parsed HGVS sequence variant object"

    # keep a shallow reference to the original variant, to be returned
    # as-is under certain circumstances
    orig_var = var

    if self.validator:
        self.validator.validate(var)

    init_met = False
    # NOTE(review): this tests var.posedit itself against AARefAlt; if the
    # intent was to test var.posedit.edit, this branch may never be taken --
    # confirm against the hgvs object model.
    if var.posedit is not None and isinstance(var.posedit, hgvs.edit.AARefAlt):
        init_met = var.posedit.init_met

    if var.posedit is None or var.posedit.uncertain or init_met or var.posedit.pos is None:
        return var

    # remember the input type: ``var`` may be replaced by its n. equivalent
    var_type = var.type

    if var_type == "p":
        raise HGVSUnsupportedOperationError(
            "Unsupported normalization of protein level variants: {0}".format(var))
    if var.posedit.edit.type == "con":
        raise HGVSUnsupportedOperationError(
            "Unsupported normalization of conversion variants: {0}".format(var))

    var.fill_ref(self.hdp)

    if var.posedit.edit.type == "identity":
        return copy.deepcopy(var)

    # For c. variants normalization, first convert to n. variant
    # and perform normalization at the n. level, then convert the
    # normalized n. variant back to c. variant.
    if var_type == "c":
        var = self.vm.c_to_n(var)

    if var.type in "nr":
        if var.posedit.pos.start.offset != 0 or var.posedit.pos.end.offset != 0:
            raise HGVSUnsupportedOperationError(
                "Normalization of intronic variants is not supported")

    def is_valid_pos(ac, pos):
        # tests whether the sequence position actually exists
        # This is *way* janky.
        # TODO: push functionality to hdp which can implement differently
        # based on capabilities of sequence backend
        try:
            s = self.hdp.get_seq(ac, pos - 1, pos)  # 0-based!
            return s != ""
        except HGVSDataNotAvailableError as e:
            # Bad Request indicates that we got to NCBI, but the request
            # was invalid.
            return "Bad Request" not in str(e)

    if var.posedit.pos.start.base < 0 or not is_valid_pos(
            var.ac, var.posedit.pos.end.base):
        if hgvs.global_config.mapping.strict_bounds:
            raise HGVSInvalidVariantError(
                f"{var}: coordinates are out-of-bounds")
        _logger.warning(
            f"{var}: coordinates are out-of-bounds; returning as-is")
        return orig_var

    # restrict var types to those that use sequence start (i.e., not c.)
    assert var.type in "gmnr", "Internal Error: variant must be of type g, m, n, r"

    bound_s, bound_e = self._get_boundary(var)
    boundary = (bound_s, bound_e)
    start, end, (ref, alt) = self._normalize_alleles(var, boundary)

    ref_len = len(ref)
    alt_len = len(alt)

    # Generate normalized variant
    if alt_len == ref_len:
        ref_start = start
        ref_end = end - 1
        # inversion
        if ref_len > 1 and ref == reverse_complement(alt):
            edit = hgvs.edit.Inv(ref=ref)
        # ident
        elif ref_len == 0 and alt_len == 0:
            ref_start = ref_end
            edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
        # substitution or delins
        else:
            edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
    elif alt_len < ref_len:
        # del or delins
        ref_start = start
        ref_end = end - 1
        edit = hgvs.edit.NARefAlt(ref=ref, alt=None if alt_len == 0 else alt)
    elif alt_len > ref_len:
        # ins or dup
        if ref_len == 0:
            # Fetch the reference bases next to the insertion site, on the
            # side selected by the shuffle direction; an insertion equal to
            # them is reported as a dup.
            if self.shuffle_direction == 3:
                adj_seq = self._fetch_bounded_seq(var, start - alt_len - 1, end - 1, 0,
                                                  boundary)
            else:
                adj_seq = self._fetch_bounded_seq(var, start - 1, start + alt_len - 1, 0,
                                                  boundary)
            # ins
            if alt != adj_seq:
                ref_start = start - 1
                ref_end = end
                edit = hgvs.edit.NARefAlt(ref=None, alt=alt)
            # dup
            else:
                if self.shuffle_direction == 3:
                    ref_start = start - alt_len
                    ref_end = end - 1
                    edit = hgvs.edit.Dup(ref=alt)
                else:
                    ref_start = start
                    ref_end = start + alt_len - 1
                    edit = hgvs.edit.Dup(ref=alt)
        # delins
        else:
            ref_start = start
            ref_end = end - 1
            edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)

    # ensure the start is not 0
    if ref_start == 0:
        ref = self._fetch_bounded_seq(var, 0, 1, 0, boundary)
        alt = alt + ref
        edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
        ref_start = 1
        ref_end = 1

    # ensure the end is not outside of reference sequence
    tgt_len = self._get_tgt_length(var)
    if ref_end == tgt_len + 1:
        ref = self._fetch_bounded_seq(var, tgt_len - 1, tgt_len, 0, boundary)
        alt = ref + alt
        edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
        ref_start = tgt_len
        ref_end = tgt_len

    var_norm = copy.deepcopy(var)
    var_norm.posedit.edit = edit
    var_norm.posedit.pos.start.base = ref_start
    var_norm.posedit.pos.end.base = ref_end

    if var_type == "c":
        var_norm = self.vm.n_to_c(var_norm)

    return var_norm