Esempio n. 1
0
    def add_hgvs_allele(self, hgvs_allele):
        """Parse an HGVS expression and add the resulting allele to the bundle.

        The variant's accession is recorded in ``self.identifiers`` under
        its VMC sequence identifier. A Location and an Allele are then
        built, assigned ids via ``self._id_function``, and stored in
        ``self.locations`` and ``self.alleles`` respectively.

        :param hgvs_allele: HGVS variant string handed to the HGVS parser
        :returns: the newly created ``models.Allele``
        :raises ValueError: if the position is a ``BaseOffsetInterval``
            with an intronic start or end, or if the edit type is not one
            of ``ins``, ``sub``, ``del``, ``delins`` or ``identity``
        """
        hp = _get_hgvs_parser()
        sv = hp.parse_hgvs_variant(hgvs_allele)

        sequence_id = get_vmc_sequence_identifier(sv.ac)
        self.identifiers[sequence_id].add(sv.ac)

        pos = sv.posedit.pos
        edit = sv.posedit.edit

        if isinstance(pos, hgvs.location.BaseOffsetInterval):
            if pos.start.is_intronic or pos.end.is_intronic:
                # The message has no placeholders, so it is raised as-is.
                raise ValueError("Intronic HGVS variants are not supported")

        if edit.type == 'ins':
            # Zero-width interval at the start base.
            # NOTE(review): presumably the interbase position between the
            # two flanking residues of the insertion -- confirm.
            interval = models.Interval(start=pos.start.base, end=pos.start.base)
            state = edit.alt
        elif edit.type in ('sub', 'del', 'delins', 'identity'):
            # NOTE(review): start is shifted by one while end is kept,
            # presumably converting 1-based inclusive HGVS positions to
            # 0-based interbase coordinates -- confirm.
            interval = models.Interval(start=pos.start.base - 1, end=pos.end.base)
            if edit.type == 'identity':
                # No alternate sequence: the state is the reference itself.
                state = get_reference_sequence(sv.ac, pos.start.base - 1, pos.end.base)
            else:
                # A missing/empty alt (e.g. for a deletion) becomes ''.
                state = edit.alt or ''
        else:
            raise ValueError("HGVS variant type {} is unsupported".format(edit.type))

        location = models.Location(sequence_id=sequence_id, interval=interval)
        location.id = self._id_function(location)
        self.locations[location.id] = location

        allele = models.Allele(location_id=location.id, state=state)
        allele.id = self._id_function(allele)
        self.alleles[allele.id] = allele

        return allele
Esempio n. 2
0
    def add_hgvs_haplotype(self, hgvs_alleles, completeness="UNKNOWN"):
        """Add a haplotype built from several HGVS alleles to the bundle.

        Each HGVS string is added through ``add_hgvs_allele``. The
        haplotype's location is the bounding box around the alleles'
        intervals (smallest start, largest end). The new Location and
        Haplotype are given ids via ``self._id_function`` and stored in
        ``self.locations`` and ``self.haplotypes``.

        :param hgvs_alleles: iterable of HGVS variant strings
        :param completeness: value passed through to ``models.Haplotype``
        :returns: the newly created ``models.Haplotype``
        :raises ValueError: if ``hgvs_alleles`` is empty, or if the
            alleles are not all defined on the same sequence
        """
        alleles = [self.add_hgvs_allele(hgvs_allele) for hgvs_allele in hgvs_alleles]
        if not alleles:
            # Without this guard an empty input would surface as a bare
            # StopIteration from next() below.
            raise ValueError("Haplotypes must contain at least one allele")

        allele_locations = [self.locations[a.location_id] for a in alleles]

        # create location from bounding box around alleles
        sequence_ids = {loc.sequence_id for loc in allele_locations}
        if len(sequence_ids) > 1:
            # ValueError is still an Exception, so existing handlers work.
            raise ValueError("Haplotypes must be defined on a single sequence")
        sequence_id = next(iter(sequence_ids))

        intervals = [loc.interval for loc in allele_locations]
        interval_min = min(int(i.start) for i in intervals)
        interval_max = max(int(i.end) for i in intervals)
        interval = models.Interval(start=interval_min, end=interval_max)
        location = models.Location(sequence_id=sequence_id, interval=interval)
        location.id = self._id_function(location)
        self.locations[location.id] = location

        haplotype = models.Haplotype(
            completeness=completeness, location_id=location.id, allele_ids=[a.id for a in alleles])
        haplotype.id = self._id_function(haplotype)
        self.haplotypes[haplotype.id] = haplotype
        return haplotype
Esempio n. 3
0
        def _make_vmc_allele(a):
            """Build the VMC Location and Allele for one CAR allele record.

            ``a`` is a dict (from CAR json) for a single genomicAllele or
            transcriptAllele. The reference sequence is resolved through
            ``self._refseqmapper`` and only the first entry of
            ``a["coordinates"]`` is used. This function does not itself
            store the objects it builds; it hands them back to the caller.

            Returns a tuple ``(ir, sequence_id, location, allele)``.
            """

            # The reference sequence id is the last path component.
            car_rsid = a["referenceSequence"].split("/")[-1]
            ir = self._refseqmapper[car_rsid]
            sequence_id = get_vmc_sequence_identifier(ir)

            # N.B. Double check CA coordinate semantics
            # If HGVS like re: insertions, then end -= 1 below
            if len(a["coordinates"]) > 1:
                # `warn` is a deprecated alias of `warning` on stdlib loggers
                # (assumes _logger is a logging.Logger -- confirm).
                # NOTE(review): the message contains the literal text
                # "resp[@id]"; presumably a record id was meant to be
                # interpolated here -- confirm against the enclosing scope.
                _logger.warning("More than one coordinate set for resp[@id]; using only first")
            coords = a["coordinates"][0]
            interval = models.Interval(start=coords["start"] - 1, end=coords["end"])
            location = models.Location(sequence_id=sequence_id, interval=interval)
            location.id = computed_id(location)
            allele = models.Allele(location_id=location.id, state=coords["allele"])
            allele.id = computed_id(allele)
            return (ir, sequence_id, location, allele)
Esempio n. 4
0
def from_hgvs(hgvs_string):
    """Build a single-allele VMC bundle from an HGVS expression.

    The HGVS string is parsed, turned into a Location and an Allele
    (each given an id with ``computed_id``), and wrapped in a
    ``models.Vmcbundle`` together with an NCBI identifier for the
    variant's accession.

    :param hgvs_string: HGVS variant string handed to the HGVS parser
    :returns: the result of ``ppj(bundle)``
        (presumably pretty-printed JSON -- confirm)
    :raises ValueError: if the position is a ``BaseOffsetInterval`` with
        an intronic start or end, or if the edit type is not one of
        ``ins``, ``sub``, ``del`` or ``delins``
    """
    hp = _get_hgvs_parser()
    sv = hp.parse_hgvs_variant(hgvs_string)

    ir = models.Identifier(namespace="NCBI", accession=sv.ac)
    # NOTE(review): the sequence id is hard-coded and therefore the same
    # for every accession; the intended lookup is left commented out.
    sequence_id = "VMC:GS_Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO"  #get_vmc_sequence_id(ir)

    pos = sv.posedit.pos
    edit = sv.posedit.edit

    if isinstance(pos, hgvs.location.BaseOffsetInterval):
        if pos.start.is_intronic or pos.end.is_intronic:
            # The message has no placeholders, so it is raised as-is.
            raise ValueError("Intronic HGVS variants are not supported")

    if edit.type == 'ins':
        # Zero-width interval at the start base.
        interval = models.Interval(start=pos.start.base,
                                   end=pos.start.base)
    elif edit.type in ('sub', 'del', 'delins'):
        # NOTE(review): start is shifted by one while end is kept,
        # presumably converting 1-based inclusive HGVS positions to
        # 0-based interbase coordinates -- confirm.
        interval = models.Interval(start=pos.start.base - 1,
                                   end=pos.end.base)
    else:
        raise ValueError("HGVS variant type {} is unsupported".format(
            edit.type))

    location = models.Location(sequence_id=sequence_id, interval=interval)
    location.id = computed_id(location)

    # A missing/empty alt (e.g. for a deletion) becomes ''.
    state = edit.alt or ''
    allele = models.Allele(location_id=location.id, state=state)
    allele.id = computed_id(allele)

    bundle = models.Vmcbundle(
        alleles={allele.id: allele.as_dict()},
        genotypes={},
        haplotypes={},
        identifiers={sequence_id: [ir.as_dict()]},
        locations={location.id: location.as_dict()},
        meta={"version": "0.1"},
    )

    return ppj(bundle)
Esempio n. 5
0
import datetime
import json

from vmc import models, computed_id, serialize

# Interval: build one with equal start and end, then check both its
# serialized text form and its dict form.
i = models.Interval(start=42, end=42)
assert "<Interval|42|42>" == serialize(i)
assert {"end": 42, "start": 42} == i.as_dict()

# Location: wraps the interval above together with a sequence id. Its
# serialization embeds the interval's serialization.
l = models.Location(sequence_id="VMC:GS_01234", interval=i)
assert "<Location|VMC:GS_01234|<Interval|42|42>>" == serialize(l)
# The id is assigned only after the serialization check; the dict form
# asserted below is expected to include it.
l.id = computed_id(l)
assert "VMC:GL_OUqODzxryILUEDmv7uF8R8NwREJAx7gN" == l.id
assert {
    "id": "VMC:GL_OUqODzxryILUEDmv7uF8R8NwREJAx7gN",
    "interval": {
        "end": 42,
        "start": 42
    },
    "sequence_id": "VMC:GS_01234"
} == l.as_dict()

# Mapping of location id -> location dict.
locations = {l.id: l.as_dict()}

# Allele: refers to the location by its computed id; the serialization
# embeds that id followed by the state.
a = models.Allele(location_id=l.id, state="A")
assert "<Allele|VMC:GL_OUqODzxryILUEDmv7uF8R8NwREJAx7gN|A>" == serialize(a)
a.id = computed_id(a)
assert "VMC:GA_xTR0mmMviMLoAI9SwmDMFYr_AZczkjyU" == a.id
Esempio n. 6
0
def build_loc(seq_id, start):
    """Return the computed id of a Location spanning ``start`` to ``start + 1``.

    A ``models.Location`` on sequence ``seq_id`` is created with an
    interval of ``[start, start + 1]``, its id is set from
    ``computed_id``, and that id is returned.
    """
    loc = models.Location(
        sequence_id=seq_id,
        interval=models.Interval(start=start, end=start + 1),
    )
    loc.id = computed_id(loc)
    return loc.id