def __lt__(lhs, rhs):
    """Return True when ``lhs`` sorts strictly before ``rhs``.

    Both operands must be of the same type and must not be flagged
    ``uncertain``.  Ordering rules, as implemented below:

    * different datums: ordered by datum, unless either datum is
      ``Datum.SEQ_START`` (not comparable with the other datums);
    * same datum, same base: ordered by offset;
    * same datum, different base: ordered by base, except when the two
      positions sit on adjacent bases with offsets pointing towards each
      other (positive offset on the lower base, negative offset on the
      higher base), in which case the order cannot be decided here.

    :raises HGVSUnsupportedOperationError: for uncertain positions,
        SEQ_START mixed with another datum, or the ambiguous adjacent-base
        offset case described above
    """
    assert type(lhs) == type(
        rhs), "Cannot compare coordinates of different representations"

    if lhs.uncertain or rhs.uncertain:
        raise HGVSUnsupportedOperationError(
            "Cannot compare coordinates of uncertain positions")

    # Positions measured from different datums.
    if lhs.datum != rhs.datum:
        if lhs.datum == Datum.SEQ_START or rhs.datum == Datum.SEQ_START:
            raise HGVSUnsupportedOperationError(
                "Cannot compare coordinates of datum SEQ_START with CDS_START or CDS_END"
            )
        return lhs.datum < rhs.datum

    # Same datum and same base: only the offsets can differ.
    if lhs.base == rhs.base:
        return lhs.offset < rhs.offset

    # Adjacent bases whose offsets point at each other cannot be ordered
    # without knowing how far apart the two bases really are.
    offsets_face_each_other = (
        (rhs.base - lhs.base == 1 and lhs.offset > 0 and rhs.offset < 0)
        or (lhs.base - rhs.base == 1 and rhs.offset > 0 and lhs.offset < 0))
    if offsets_face_each_other:
        raise HGVSUnsupportedOperationError(
            "Cannot compare coordinates in the same intron with one based on end of exon and the other based on start of next exon"
        )

    return lhs.base < rhs.base
def __sub__(lhs, rhs):
    """Return the signed distance ``lhs - rhs`` between two coordinates.

    Both operands must be of the same type and share the same datum.

    * Same base: the result is the difference of the offsets.
    * Different bases: both offsets must be 0 and the result is the
      difference of the bases, corrected for the fact that there is no
      position 0 between -1 and 1.  The correction is applied in both
      directions so that ``a - b == -(b - a)``.

    :raises HGVSUnsupportedOperationError: if the datums differ, or if the
        bases differ and either position carries a non-zero offset
    """
    assert type(lhs) == type(
        rhs), "Cannot substract coordinates of different representations"

    if lhs.datum != rhs.datum:
        raise HGVSUnsupportedOperationError(
            "Interval length measured from different datums is ill-defined"
        )

    if lhs.base == rhs.base:
        return lhs.offset - rhs.offset

    if lhs.offset != 0 or rhs.offset != 0:
        raise HGVSUnsupportedOperationError(
            "Interval length with intronic offsets is ill-defined")

    distance = lhs.base - rhs.base
    # Base numbering skips 0, so a span that crosses from negative to
    # positive bases (in either direction) is one step shorter than the
    # plain arithmetic difference.
    if lhs.base > 0 and rhs.base < 0:
        return distance - 1
    if lhs.base < 0 and rhs.base > 0:
        return distance + 1
    return distance
def __lt__(lhs, rhs):
    """Return True when ``lhs.base`` is strictly smaller than ``rhs.base``.

    Both operands must be of the same type; positions flagged as
    ``uncertain`` cannot be ordered.

    :raises HGVSUnsupportedOperationError: if either position is uncertain
    """
    assert type(lhs) == type(
        rhs), "Cannot compare coordinates of different representations"

    either_uncertain = lhs.uncertain or rhs.uncertain
    if either_uncertain:
        raise HGVSUnsupportedOperationError(
            "Cannot compare coordinates of uncertain positions")

    return lhs.base < rhs.base
def _get_altered_sequence(self, strand, interval, var):
    """Return the sequence of ``interval`` with the edit of ``var`` applied.

    The reference bases for ``interval`` are fetched through
    ``self.hdp.get_seq`` on accession ``var.ac``; the edit is then applied
    at the variant position expressed relative to the start of the
    interval.  When ``strand`` is -1 the altered sequence is
    reverse-complemented before being returned.

    :param strand: -1 triggers reverse-complementing of the result
    :param interval: object exposing ``start.base`` and ``end.base``
    :param var: variant exposing ``ac``, ``posedit.pos`` and ``posedit.edit``
    :returns: the altered sequence as a string
    :raises HGVSUnsupportedOperationError: for edit types other than
        sub, del, ins, delins, dup, inv and identity
    """
    window_start = interval.start.base
    bases = list(
        self.hdp.get_seq(var.ac, window_start - 1, interval.end.base))

    # Variant position as a 0-based, half-open slice into ``bases``.
    location = var.posedit.pos
    lo = location.start.base - window_start
    hi = location.end.base - window_start + 1

    change = var.posedit.edit
    kind = change.type

    if kind == 'identity':
        pass
    elif kind == 'sub':
        bases[lo] = change.alt
    elif kind == 'del':
        bases[lo:hi] = []
    elif kind == 'ins':
        # the inserted string goes right after the first flanking base
        bases.insert(lo + 1, change.alt)
    elif kind == 'delins':
        del bases[lo:hi]
        bases.insert(lo, change.alt)
    elif kind == 'dup':
        duplicated = ''.join(bases[lo:hi])
        bases.insert(hi, duplicated)
    elif kind == 'inv':
        inverted = reverse_complement(''.join(bases[lo:hi]))
        bases[lo:hi] = list(inverted)
    else:
        raise HGVSUnsupportedOperationError(
            "Getting altered sequence for {type} is unsupported".format(
                type=change.type))

    altered = ''.join(bases)
    return reverse_complement(altered) if strand == -1 else altered
def n_to_c(self, var_n):
    """Convert a parsed n. variant into the equivalent c. variant.

    The variant is validated when a validator is configured, its
    reference is filled in through ``var_n.fill_ref(self.hdp)``, and its
    position is translated by an alignment mapper built for the variant's
    own accession with the "transcript" alignment method.  The edit is
    deep-copied unchanged onto the new variant.

    :param hgvs.sequencevariant.SequenceVariant var_n: a variant object
    :returns: variant object (:class:`vvhgvs.sequencevariant.SequenceVariant`)
    :raises HGVSInvalidVariantError: if var_n is not of type "n"
    :raises HGVSUnsupportedOperationError: if the edit is not a
        NARefAlt, Dup or Inv
    """
    if var_n.type != "n":
        raise HGVSInvalidVariantError("Expected n. variant; got " +
                                      str(var_n))

    validator = self._validator
    if validator:
        validator.validate(var_n)

    var_n.fill_ref(self.hdp)

    mapper = self._fetch_AlignmentMapper(tx_ac=var_n.ac,
                                         alt_ac=var_n.ac,
                                         alt_aln_method="transcript")
    pos_c = mapper.n_to_c(var_n.posedit.pos)

    # Only these edit classes can be carried over as-is.
    supported_edits = (vvhgvs.edit.NARefAlt, vvhgvs.edit.Dup,
                       vvhgvs.edit.Inv)
    if not isinstance(var_n.posedit.edit, supported_edits):
        raise HGVSUnsupportedOperationError(
            "Only NARefAlt/Dup/Inv types are currently implemented")
    edit_c = copy.deepcopy(var_n.posedit.edit)

    var_c = vvhgvs.sequencevariant.SequenceVariant(
        ac=var_n.ac, type="c", posedit=vvhgvs.posedit.PosEdit(pos_c, edit_c))

    if self.replace_reference:
        self._replace_reference(var_c)
    return var_c
def _replace_reference(self, var):
    """Refresh, in place, the reference sequence stored on ``var``'s edit.

    The variant is returned untouched when its edit type is "ins" or
    "con", or when either end of its position is a base/offset position
    with a non-zero offset.  Otherwise the sequence for the variant's
    span is fetched through ``self.hdp.get_seq`` (for c. variants the
    position is first mapped with ``c_to_n``) and written to
    ``edit.ref`` if it differs from the current value.

    :param var: variant to update
    :returns: the same ``var`` object
    :raises HGVSUnsupportedOperationError: if ``var.type`` is not found
        in "cgmnr"
    """
    if var.type not in "cgmnr":
        raise HGVSUnsupportedOperationError(
            "Can only update references for type c, g, m, n, r")

    # Insertions and conversions are returned as-is: there is no
    # reference sequence to refresh for them.
    if var.posedit.edit.type in ("ins", "con"):
        return var

    def _has_offset(endpoint):
        # True for a base/offset position that carries a non-zero offset
        return (isinstance(endpoint, vvhgvs.location.BaseOffsetPosition)
                and endpoint.offset != 0)

    location = var.posedit.pos
    if _has_offset(location.start) or _has_offset(location.end):
        _logger.info(
            "Can't update reference sequence for intronic variant {}".
            format(var))
        return var

    # For c. variants, we need coords on underlying sequences
    if var.type == "c":
        mapper = self._fetch_AlignmentMapper(tx_ac=var.ac,
                                             alt_ac=var.ac,
                                             alt_aln_method="transcript")
        location = mapper.c_to_n(var.posedit.pos)

    fetched = self.hdp.get_seq(var.ac, location.start.base - 1,
                               location.end.base)

    change = var.posedit.edit
    if change.ref != fetched:
        _logger.debug(
            "Replaced reference sequence in {var} with {seq}".format(
                var=var, seq=fetched))
        change.ref = fetched

    return var
def _del_ins_lengths(self, ilen):
    """Placeholder that always raises.

    :param ilen: unused by this implementation
    :raises HGVSUnsupportedOperationError: unconditionally
    """
    message = "internal function _del_ins_lengths not implemented for this variant type"
    raise HGVSUnsupportedOperationError(message)
def normalize(self, var):
    """Perform sequence variant normalization for a single variant.

    Processing steps, in order:

    1. Validate ``var`` when ``self.validator`` is set.
    2. Return ``var`` unchanged when it has no posedit, an uncertain
       posedit, or no position.
    3. Reject protein (p.) variants and conversion ("con") edits.
    4. Call ``var.fill_ref(self.hdp)`` on the input; identity edits are
       then returned as a deep copy.
    5. Convert c. variants to n. with ``self.hm.c_to_n`` so that all
       further work uses the sequence start as the datum; the result is
       converted back with ``self.hm.n_to_c`` at the end.
    6. Normalize the alleles within the boundary returned by
       ``self._get_boundary`` and rebuild the edit (Inv, Dup or NARefAlt)
       and the start/end bases from the normalized alleles.

    :param var: a parsed HGVS sequence variant object
    :returns: the normalized variant (a new object, except for the early
        return in step 2 which hands back ``var`` itself)
    :raises HGVSUnsupportedOperationError: for protein variants,
        conversion edits, and n./r. variants with intronic offsets
    """
    assert isinstance(
        var, vvhgvs.sequencevariant.SequenceVariant
    ), "variant must be a parsed HGVS sequence variant object"

    if self.validator:
        self.validator.validate(var)

    if var.posedit is None or var.posedit.uncertain or var.posedit.pos is None:
        return var

    # Remember the type of the variant as it was passed in; ``var`` may be
    # rebound to its n. equivalent below.
    var_type = var.type

    if var_type == "p":
        raise HGVSUnsupportedOperationError(
            "Unsupported normalization of protein level variants: {0}".
            format(var))
    if var.posedit.edit.type == "con":
        # str.format (not the format() builtin) so the variant actually
        # ends up in the message
        raise HGVSUnsupportedOperationError(
            "Unsupported normalization of conversion variants: {0}".
            format(var))

    var.fill_ref(self.hdp)

    if var.posedit.edit.type == "identity":
        var_norm = copy.deepcopy(var)
        return var_norm

    # For c. variants normalization, first convert to n. variant
    # and perform normalization at the n. level, then convert the
    # normalized n. variant back to c. variant.
    if var_type == "c":
        var = self.hm.c_to_n(var)

    if var.type in "nr":
        if var.posedit.pos.start.offset != 0 or var.posedit.pos.end.offset != 0:
            raise HGVSUnsupportedOperationError(
                "Normalization of intronic variants is not supported")

    # g, m, n, r sequences all use sequence start as the datum.
    # That's an essential assumption herein
    # (this is why we may have converted from c to n above)
    assert var.type in "gmnr", "Internal Error: variant must be of type g, m, n, r"

    bound_s, bound_e = self._get_boundary(var)
    boundary = (bound_s, bound_e)
    start, end, (ref, alt) = self._normalize_alleles(var, boundary)

    ref_len = len(ref)
    alt_len = len(alt)

    # Generate normalized variant
    if alt_len == ref_len:
        ref_start = start
        ref_end = end - 1
        if ref_len > 1 and ref == reverse_complement(alt):
            # inversion
            edit = vvhgvs.edit.Inv(ref=ref)
        elif ref_len == 0 and alt_len == 0:
            # ident
            ref_start = ref_end
            edit = vvhgvs.edit.NARefAlt(ref=ref, alt=alt)
        else:
            # substitution or delins
            edit = vvhgvs.edit.NARefAlt(ref=ref, alt=alt)
    elif alt_len < ref_len:
        # del or delins
        ref_start = start
        ref_end = end - 1
        edit = vvhgvs.edit.NARefAlt(ref=ref,
                                    alt=None if alt_len == 0 else alt)
    else:
        # alt_len > ref_len: ins, dup or delins
        if ref_len == 0:
            # Fetch the sequence adjacent to the insertion point; an
            # insertion equal to that sequence is reported as a dup.
            # NOTE(review): shuffle_direction == 3 presumably means
            # 3' shuffling - confirm against the class definition.
            if self.shuffle_direction == 3:
                adj_seq = self._fetch_bounded_seq(var, start - alt_len - 1,
                                                  end - 1, 0, boundary)
            else:
                adj_seq = self._fetch_bounded_seq(var, start - 1,
                                                  start + alt_len - 1, 0,
                                                  boundary)
            if alt != adj_seq:
                # ins
                ref_start = start - 1
                ref_end = end
                edit = vvhgvs.edit.NARefAlt(ref=None, alt=alt)
            elif self.shuffle_direction == 3:
                # dup
                ref_start = start - alt_len
                ref_end = end - 1
                edit = vvhgvs.edit.Dup(ref=alt)
            else:
                # dup
                ref_start = start
                ref_end = start + alt_len - 1
                edit = vvhgvs.edit.Dup(ref=alt)
        else:
            # delins
            ref_start = start
            ref_end = end - 1
            edit = vvhgvs.edit.NARefAlt(ref=ref, alt=alt)

    # ensure the start is not 0
    if ref_start == 0:
        ref = self._fetch_bounded_seq(var, 0, 1, 0, boundary)
        alt = alt + ref
        edit = vvhgvs.edit.NARefAlt(ref=ref, alt=alt)
        ref_start = 1
        ref_end = 1

    # ensure the end is not outside of reference sequence
    tgt_len = self._get_tgt_length(var)
    if ref_end == tgt_len + 1:
        ref = self._fetch_bounded_seq(var, tgt_len - 1, tgt_len, 0, boundary)
        alt = ref + alt
        edit = vvhgvs.edit.NARefAlt(ref=ref, alt=alt)
        ref_start = tgt_len
        ref_end = tgt_len

    var_norm = copy.deepcopy(var)
    var_norm.posedit.edit = edit
    var_norm.posedit.pos.start.base = ref_start
    var_norm.posedit.pos.end.base = ref_end

    if var_type == "c":
        var_norm = self.hm.n_to_c(var_norm)

    return var_norm
def _get_boundary(self, var):
    """Get the region within which the current variant may be shifted.

    For variant types other than "r" and "n", or when
    ``self.cross_boundaries`` is set, the region is unbounded:
    ``(0, inf)``.

    Otherwise the transcript's exon structure and CDS limits are looked
    up through ``self.hdp`` for ``self.alt_aln_method`` and the returned
    ``(left, right)`` pair is the exon containing the variant, further
    clipped at the CDS start/end.  Bounds are in the same coordinates as
    the ``tx_start_i`` / ``tx_end_i`` / ``cds_start_i`` / ``cds_end_i``
    values; variant bases are shifted by -1 before being compared with
    them.

    :param var: variant exposing ``type``, ``ac`` and ``posedit.pos``
    :returns: tuple ``(left, right)``
    :raises HGVSDataNotAvailableError: if no mapping options exist for
        the transcript, none match ``self.alt_aln_method``, or no exon
        records are returned
    :raises HGVSUnsupportedOperationError: if the variant spans an
        exon-intron boundary or a UTR/CDS boundary
    """
    unbounded = (0, float("inf"))

    # For variant type of g and m etc.
    if var.type != "r" and var.type != "n":
        return unbounded
    if self.cross_boundaries:
        return unbounded

    # Get genomic sequence access number for this transcript
    map_info = self.hdp.get_tx_mapping_options(var.ac)
    if not map_info:
        raise HGVSDataNotAvailableError(
            "No mapping info available for {ac}".format(ac=var.ac))
    map_info = [
        item for item in map_info
        if item["alt_aln_method"] == self.alt_aln_method
    ]
    if not map_info:
        # Previously surfaced as a bare IndexError on map_info[0]
        raise HGVSDataNotAvailableError(
            "No mapping info available for {ac} with alignment method {method}"
            .format(ac=var.ac, method=self.alt_aln_method))
    alt_ac = map_info[0]["alt_ac"]

    # Get tx info
    tx_info = self.hdp.get_tx_info(var.ac, alt_ac, self.alt_aln_method)
    cds_start = tx_info["cds_start_i"]
    cds_end = tx_info["cds_end_i"]

    # Get exon info
    exon_info = self.hdp.get_tx_exons(var.ac, alt_ac, self.alt_aln_method)
    if not exon_info:
        # Previously surfaced as a bare IndexError on exon_ends[-1]
        raise HGVSDataNotAvailableError(
            "No exon info available for {ac}".format(ac=var.ac))
    exon_starts = sorted(exon["tx_start_i"] for exon in exon_info)
    exon_ends = sorted(exon["tx_end_i"] for exon in exon_info)
    # Sentinel region covering everything past the last exon end
    exon_starts.append(exon_ends[-1])
    exon_ends.append(float("inf"))

    # Variant endpoints shifted by -1 for comparison with the *_i values
    start_i = var.posedit.pos.start.base - 1
    end_i = var.posedit.pos.end.base - 1

    # TODO: #242: implement methods to find tx regions
    def region_index(pos_i):
        # Index of the first region whose [start, end) contains pos_i;
        # falls back to the last region when none matches.
        for idx, (region_start, region_end) in enumerate(
                zip(exon_starts, exon_ends)):
            if region_start <= pos_i < region_end:
                return idx
        return len(exon_starts) - 1

    i = region_index(start_i)
    j = region_index(end_i)
    if i != j:
        raise HGVSUnsupportedOperationError(
            "Unsupported normalization of variants spanning the exon-intron boundary ({var})"
            .format(var=var))

    left = exon_starts[i]
    right = exon_ends[i]

    # Clip at the CDS start: the variant must lie entirely on one side.
    if cds_start is None:
        pass
    elif end_i < cds_start:
        right = min(right, cds_start)
    elif start_i >= cds_start:
        left = max(left, cds_start)
    else:
        raise HGVSUnsupportedOperationError(
            "Unsupported normalization of variants spanning the UTR-exon boundary ({var})"
            .format(var=var))

    # Clip at the CDS end in the same way.
    if cds_end is None:
        pass
    elif start_i >= cds_end:
        left = max(left, cds_end)
    elif end_i < cds_end:
        right = min(right, cds_end)
    else:
        raise HGVSUnsupportedOperationError(
            "Unsupported normalization of variants spanning the exon-UTR boundary ({var})"
            .format(var=var))

    return left, right