Esempio n. 1
0
    def test_base_ref_pers_ref_same_results(self):
        """Bisecting gives the same index for every target when both coordinate systems coincide."""
        regions = [
            SeqRegion(base_ref_start=1, pers_ref_start=1, length=1),
            SeqRegion(
                base_ref_start=2,
                pers_ref_start=2,
                length=3,
                vcf_record_ref="TAT",
                vcf_record_alt="GCC",
            ),
            SeqRegion(base_ref_start=5, pers_ref_start=5, length=3),
        ]

        record_in_var = _MockVcfRecord(pos=2, ref="GC", alts=["GA"])
        record_in_nonvar = _MockVcfRecord(pos=1, ref="A", alts=["T"])
        searcher = SearchableSeqRegionsMap({"JAC": regions})

        # (record, index of the region expected to contain its position)
        cases = ((record_in_var, 1), (record_in_nonvar, 0))
        for record, expected_index in cases:
            for target in BisectTarget:
                found_index = searcher.bisect("JAC", record.pos, target)
                self.assertEqual(expected_index, found_index)
Esempio n. 2
0
    def test_base_ref_further_than_pers_ref(self):
        """The same position lands in different regions depending on the bisect target."""
        first = SeqRegion(base_ref_start=1, pers_ref_start=1, length=1)
        second = SeqRegion(
            base_ref_start=2,
            pers_ref_start=2,
            length=5,
            vcf_record_ref="TAT",
            vcf_record_alt="GCCAC",
        )
        third = SeqRegion(
            base_ref_start=5,
            pers_ref_start=7,
            length=3,
            vcf_record_ref="G",
            vcf_record_alt="TTT",
        )

        record = _MockVcfRecord(pos=6, ref="T", alts=["A"])
        searcher = SearchableSeqRegionsMap({"JAC": [first, second, third]})

        # Position 6 is still inside the second region in personalised coordinates...
        index_in_pers_ref = searcher.bisect("JAC", record.pos,
                                            BisectTarget.PERS_REF)
        self.assertEqual(1, index_in_pers_ref)

        # ...but already inside the third region in base coordinates.
        index_in_base_ref = searcher.bisect("JAC", record.pos,
                                            BisectTarget.BASE_REF)
        self.assertEqual(2, index_in_base_ref)
Esempio n. 3
0
 def test_dump_and_load_recapitulates_map(self):
     """A map dumped to a file and loaded back compares equal to the original."""
     searcher = SearchableSeqRegionsMap({"JAC": self.mapped_regions})
     tmpdir = Path(mkdtemp())
     try:
         tmpfile = tmpdir / "map.json"
         searcher.dump_to(tmpfile)
         loaded_searcher = SearchableSeqRegionsMap.load_from(tmpfile)
         self.assertEqual(searcher, loaded_searcher)
     finally:
         # mkdtemp() leaves deletion to the caller: clean up even when
         # dumping, loading or the assertion fails.
         rmtree(tmpdir)
Esempio n. 4
0
    def test_SNP_OnTopOfIndel(self):
        """
        A SNP is discovered on top of an insertion in the inferred reference.

        The rebased alt must include the flanking alt bases, which are implied
        to be present in the discovered record.
        """
        # base sequ: T TAT CGG T     A
        # secondary: T G   CGG TCTGC A
        base_records = [
            _MockVcfRecord(pos=2, ref="TAT", alts=["G"]),
            _MockVcfRecord(pos=8, ref="T", alts=["TCTGC"]),
        ]
        region_map = SeqRegionMapper(base_records, {"JAC": 9}).get_map()
        searcher = SearchableSeqRegionsMap(region_map)

        discovered = _MockVcfRecord(pos=9, ref="G", alts=["A"])
        rebased = discover._rebase_vcf_record(discovered, "JAC", searcher)

        # Compare through the mock type so only pos/ref/alts are taken from the result.
        actual = _MockVcfRecord(rebased.pos, rebased.ref, rebased.alts)
        self.assertEqual(_MockVcfRecord(8, "T", ["TCTAC"]), actual)
Esempio n. 5
0
    def test_multiple_deletions(self):
        """
        A deletion is discovered on top of a deletion in a variant site,
        together with an extra deletion in a non-variant site.

        The original deletion also carries a SNP, to make it plausible that
        quasimap/infer picks this variant.

        The discovered variation is additionally reported inside a variant site,
        so the rebased alt is expected to be elongated, and the rebased ref is
        expected to include all deleted bases.
        """
        # base reference:     CAA C GCTA CAA
        # inferred reference: C   C GAT  CAA
        base_records = [
            _MockVcfRecord(pos=1, ref="CAA", alts=["C"]),
            _MockVcfRecord(pos=5, ref="GCTA", alts=["GAT"]),
        ]
        region_map = SeqRegionMapper(base_records, {"JAC": 11}).get_map()
        searcher = SearchableSeqRegionsMap(region_map)

        discovered = _MockVcfRecord(pos=4, ref="ATC", alts=["A"])
        rebased = discover._rebase_vcf_record(discovered, "JAC", searcher)

        actual = _MockVcfRecord(rebased.pos, rebased.ref, rebased.alts)
        self.assertEqual(_MockVcfRecord(pos=5, ref="GCTAC", alts=["GA"]),
                         actual)
Esempio n. 6
0
    def test_retrieve_searched_region(self):
        """The index returned by `bisect` retrieves the matching region of the right chromosome."""
        chr1_regions = [
            SeqRegion(base_ref_start=1, pers_ref_start=1, length=1),
            SeqRegion(
                base_ref_start=2,
                pers_ref_start=2,
                length=5,
                vcf_record_ref="TAT",
                vcf_record_alt="GCCAC",
            ),
        ]
        chr2_regions = [
            SeqRegion(base_ref_start=1, pers_ref_start=1, length=200)
        ]
        searcher = SearchableSeqRegionsMap({
            "chr1": chr1_regions,
            "chr2": chr2_regions
        })

        # (record to look up, region expected to contain it)
        cases = (
            (_MockVcfRecord(pos=100, ref="T", alts=["A"], chrom="chr2"),
             chr2_regions[0]),
            (_MockVcfRecord(pos=4, ref="C", alts=["A"], chrom="chr1"),
             chr1_regions[1]),
        )
        for record, expected_region in cases:
            index = searcher.bisect(record.chrom, record.pos,
                                    BisectTarget.PERS_REF)
            self.assertEqual(searcher.get_region(record.chrom, index),
                             expected_region)
Esempio n. 7
0
def _make_rebasing_map(geno_paths: GenotypePaths):
    """
    Build and write to disk a coordinate-translation map between the original
    reference (the REF of the genotyped vcf) and the gramtools-induced
    personalised reference.

    The map allows translating positions from either reference coordinate space
    to the other; `discover` uses it to rebase newly found variants against the
    original reference.

    The map is written to `geno_paths.rebasing_map` with `dump_sequences=False`.
    """
    pers_ref_sizes: ChromSizes = common.load_fasta(
        geno_paths.pers_ref, sizes_only=True
    )
    genotyped_records = VariantFile(geno_paths.geno_vcf).fetch()

    mapper = SeqRegionMapper(genotyped_records, pers_ref_sizes)
    searchable_map = SearchableSeqRegionsMap(mapper.get_map())
    searchable_map.dump_to(geno_paths.rebasing_map, dump_sequences=False)
Esempio n. 8
0
def _rebase_vcf(disco_paths: DiscoverPaths, check_records=True):
    """Rebase the discovery vcf so that it uses the same reference as the base vcf.

    (* marks files that are neither input nor output, shown for illustration)
    Input:
     discovery.vcf                   personalised_ref.vcf
      |                               |
     personalised_ref.fasta          *base_ref.fasta

    Output:
     discovery.vcf
      |
     *base_ref.fasta

    When `check_records` is true, each discovered record is first passed to
    `check_ref_consistent` together with its chromosome's personalised
    reference sequence; records failing that check are skipped and reported
    in the log.

    Returns the list of rebased records.
    """
    if check_records:
        skipped_records = []
        pers_ref_seqs = load_fasta(disco_paths.pers_ref)

    _add_contig_lines(disco_paths)
    genotyped_records = VariantFile(disco_paths.geno_vcf).fetch()
    discovered_records = VariantFile(disco_paths.discov_vcf_cortex).fetch()

    # The rebasing map produced at genotyping time is not reused here: it lacks the sequences
    pers_ref_sizes: ChromSizes = load_fasta(disco_paths.pers_ref, sizes_only=True)
    mapper = SeqRegionMapper(genotyped_records, pers_ref_sizes)
    searcher = SearchableSeqRegionsMap(mapper.get_map())

    rebased_records = []
    for record in discovered_records:
        chrom = record.chrom

        # Inconsistent records are not processed
        if check_records and not check_ref_consistent(
            record, pers_ref_seqs[chrom], skipped_records
        ):
            continue

        rebased = _rebase_vcf_record(record, chrom, searcher)
        rebased_records.append(rebased)

    if check_records and skipped_records:
        log.warning(
            f"{len(skipped_records)} new variant records were skipped, "
            f"because record pos and ref do not coincide with personalised reference"
        )
        log.debug("Skipped records: {}".format("\n".join(skipped_records)))

    return rebased_records
Esempio n. 9
0
    def test_StartsAtNonSite_EndsAtSite(self):
        """The discovered record starts in a non-variant region and ends in a variant site."""
        # base sequence:      T TAT CGG
        # derived sequence:   T G   CGG
        base_records = [_MockVcfRecord(pos=2, ref="TAT", alts=["G"])]
        region_map = SeqRegionMapper(base_records, {"JAC": 7}).get_map()
        searcher = SearchableSeqRegionsMap(region_map)

        discovered = _MockVcfRecord(pos=1, ref="TG", alts=["TAA"])
        rebased = discover._rebase_vcf_record(discovered, "JAC", searcher)

        actual = _MockVcfRecord(rebased.pos, rebased.ref, rebased.alts)
        self.assertEqual(_MockVcfRecord(1, "TTAT", ["TAA"]), actual)
Esempio n. 10
0
    def test_SingleSNPInNonSite(self):
        """A SNP in a non-variant region only has its position shifted."""
        # base sequence:      T TAT CGG
        # derived sequence:   T G   CGG
        base_records = [_MockVcfRecord(pos=2, ref="TAT", alts=["G"])]
        searcher = SearchableSeqRegionsMap(
            SeqRegionMapper(base_records, {"JAC": 5}).get_map())

        discovered = _MockVcfRecord(pos=3, ref="C", alts=["G"])
        rebased = discover._rebase_vcf_record(discovered, "JAC", searcher)

        actual = _MockVcfRecord(rebased.pos, rebased.ref, rebased.alts)
        self.assertEqual(_MockVcfRecord(pos=5, ref="C", alts=["G"]), actual)
Esempio n. 11
0
    def test_variant_in_chromo_with_no_prg_variants(self):
        """A record on a chromosome without any base variants is returned unchanged."""
        # chr1 base:    T TAT CGG
        # chr1 derived: T G   CGG
        # chr2 base:    TTTTT
        # chr2 derived: TTTTT
        sizes = {"chr1": 7, "chr2": 5}
        chr1_variant = _MockVcfRecord(pos=2, ref="TAT", alts=["G"],
                                      chrom="chr1")
        region_map = SeqRegionMapper([chr1_variant], sizes).get_map()
        searcher = SearchableSeqRegionsMap(region_map)

        discovered = _MockVcfRecord(pos=1, ref="TT", alts=["GA"],
                                    chrom="chr2")
        rebased = discover._rebase_vcf_record(discovered, "chr2", searcher)
        self.assertEqual(discovered, rebased)
Esempio n. 12
0
    def test_dump_and_load_without_sequences(self):
        """
        Serialisation without the REF and ALT SeqRegion sequences
        """
        searcher = SearchableSeqRegionsMap({"JAC": self.mapped_regions})
        tmpdir = Path(mkdtemp())
        try:
            tmpfile = tmpdir / "map.json"
            searcher.dump_to(tmpfile, dump_sequences=False)
            loaded_searcher = SearchableSeqRegionsMap.load_from(tmpfile)

            self.assertEqual(searcher.get_region("JAC", 0),
                             loaded_searcher.get_region("JAC", 0))
            # The second region is expected back without its REF/ALT sequences.
            self.assertEqual(SeqRegion(2, 2, 2),
                             loaded_searcher.get_region("JAC", 1))
        finally:
            # mkdtemp() leaves deletion to the caller: clean up even when
            # dumping, loading or an assertion fails.
            rmtree(tmpdir)
Esempio n. 13
0
    def test_SiteInBetweenNonSites(self):
        """
        The variation on top of the inferred reference overlaps, in the prg,
        a non-variant site, then a variant site, then another non-variant site.

        The rebased ref must include all three sites.
        """
        # base sequ: T TAT CGG
        # secondary: T G   CGG
        base_records = [_MockVcfRecord(pos=2, ref="TAT", alts=["G"])]
        region_map = SeqRegionMapper(base_records, {"JAC": 7}).get_map()
        searcher = SearchableSeqRegionsMap(region_map)

        discovered = _MockVcfRecord(pos=1, ref="TGCG", alts=["GGCT"])
        rebased = discover._rebase_vcf_record(discovered, "JAC", searcher)

        actual = _MockVcfRecord(rebased.pos, rebased.ref, rebased.alts)
        self.assertEqual(
            _MockVcfRecord(pos=1, ref="TTATCG", alts=["GGCT"]), actual)
Esempio n. 14
0
def _rebase_vcf_record(
    vcf_record: VariantRecord, chrom: Chrom, region_searcher: SearchableSeqRegionsMap
):
    """Change `vcf_record` to be expressed relative to a different reference.

    `vcf_record` is positioned on the personalised (inferred) reference; the
    regions of `region_searcher` for `chrom` are walked from the one containing
    `vcf_record.pos` until all of `vcf_record.ref` has been accounted for,
    building the ref sequence, the alt sequence and the position in base
    reference coordinates along the way.

    Only the first alt allele (`vcf_record.alts[0]`) is used.

    Returns the result of `_modify_vcf_record` called with the rebased pos,
    ref and single alt.
    """

    # Get index of personalised ref region containing the start of the `vcf_record`.
    region_index = region_searcher.bisect(chrom, vcf_record.pos, BisectTarget.PERS_REF)

    consumed_reference = 0  # Number of bases of `vcf_record.ref` (inferred ref. sequence) accounted for so far
    reference_length = len(vcf_record.ref)
    ref_seq_left = True  # Loop flag: cleared once a region can absorb the rest of `vcf_record.ref`

    # Build the rebased_ref as we traverse the regions,
    # starting from the vcf_record alt and pre-pending and/or post-pending to it if necessary.
    rebased_ref, rebased_alt = "", str(vcf_record.alts[0])

    # Let's rebase the position straight away
    first_region = region_searcher.get_region(chrom, region_index)

    # Case: hitting variant region. We rebase at the beginning of the variant region.
    if first_region.is_variant_region:
        rebased_pos = first_region.base_ref_start

        # We also straight away pre-pend any preceding variation relative to the base ref:
        # the part of the region's alt that lies before the start of the vcf_record.
        if vcf_record.pos > first_region.pers_ref_start:
            record_inset = vcf_record.pos - first_region.pers_ref_start
            rebased_alt = first_region.vcf_record_alt[:record_inset] + rebased_alt

    # Case: hitting non-variant region. We rebase at where the vcf_record starts, in base ref coordinates.
    else:
        rebased_pos = first_region.base_ref_start + (
            vcf_record.pos - first_region.pers_ref_start
        )

    while ref_seq_left:
        region = region_searcher.get_region(chrom, region_index)
        # Check how much of the vcf_record ref (inferred reference) can be consumed by the current region.
        # If the current region can consume at least what is left of the vcf_record ref, loop ends.
        # NOTE that region.length is 'overloaded': if a non-var region, it is the fixed interval between var regions.
        # If a var region, it is the inferred_vcf record's alt length (ref and alt lengths can differ).
        consumable = region.length - (
            vcf_record.pos + consumed_reference - region.pers_ref_start
        )

        if consumable >= (reference_length - consumed_reference):
            ref_seq_left = False
            to_consume = reference_length - consumed_reference
        else:
            to_consume = consumable

        if region.is_variant_region:
            # The whole base ref allele of the site is taken, however much of the site the record overlaps.
            rebased_ref += region.vcf_record_ref

        else:
            # We can use the vcf_record's ref, as that is also the base ref sequence- because we are in a non-variant site.
            rebased_ref += vcf_record.ref[
                consumed_reference : consumed_reference + to_consume
            ]

        consumed_reference += to_consume
        region_index += 1

    # Sanity check: the loop must have accounted for exactly the whole vcf_record ref.
    assert consumed_reference == len(vcf_record.ref)

    # Deal with the last region: post-pend any sequence in alt record if we finish in a variant site.
    # (`region` is still the last region visited by the loop above.)
    if region.is_variant_region:
        cur_pos = vcf_record.pos + consumed_reference
        # The inset is < 0 when the vcf_record's ref ends before the end of the region, i.e. there is
        # a tail of the region's (inferred vcf record's) alt lying after the vcf_record; that tail
        # (the last `-inset` bases of the alt) is appended to the rebased alt.
        inset = cur_pos - (region.pers_ref_start + region.length)
        if inset < 0:
            rebased_alt += region.vcf_record_alt[inset:]

    vcf_record = _modify_vcf_record(
        vcf_record, pos=rebased_pos, ref=rebased_ref, alts=[rebased_alt]
    )

    return vcf_record
def make_map(base_records: MockVcfRecords,
             chrom_sizes: List[int]) -> SearchableSeqRegionsMap:
    """Build a searchable region map, naming chromosomes "chr0", "chr1", ... in the order of `chrom_sizes`."""
    sizes_by_name = {
        f"chr{index}": size
        for index, size in enumerate(chrom_sizes)
    }
    mapper = SeqRegionMapper(base_records, sizes_by_name)
    return SearchableSeqRegionsMap(mapper.get_map())
Esempio n. 16
0
def _rebase_vcf_record(vcf_record: VariantRecord, chrom: Chrom,
                       region_searcher: SearchableSeqRegionsMap):
    """
    Changes `vcf_record` to be expressed relative to a different reference.

    The algorithm is not trivial to understand- refer to the tests to understand
    expected inputs/outputs and to figures/text in the gramtools PhD thesis for details.

    Brief explanation:
       Notation:
          - base reference = reference on which to rebase
          - personalised reference = reference on which `vcf_record` variation lies
       Goal: get base reference sequence and position and new alt sequence for `vcf_record`
       Functioning:
          - We use a map (`region_searcher`) storing coordinates/sequences in both reference spaces
          - We initially bisect into the map at first position <= `vcf_record`'s pos in
            personalised reference space. The algorithm then goes through the map until
            it reaches the end of `vcf_record`'s ref, constructing new sequence along
            the way.
          - If we start or end in a variant site in base reference space, we use all of
            the ref/alt sequences in the new ref/alt sequences, because we need to carry
            over variation that exists in the personalised reference
          - If we start or end in an invariant site, we only use sequence from/up to the
            `vcf_record`'s ref sequence to paste in to the new base reference sequence.

    Only the first alt allele (`vcf_record.alts[0]`) is used.

    Returns the result of `_modify_vcf_record` called with the new pos, ref and single alt.
    """
    cur_region_index = region_searcher.bisect(chrom, vcf_record.pos,
                                              BisectTarget.PERS_REF)
    cur_region = region_searcher.get_region(chrom, cur_region_index)

    new_ref_seq = ""
    new_alt_seq = str(vcf_record.alts[0])
    cur_pers_ref_pos = vcf_record.pos

    # Last personalised reference position covered by the record's ref (inclusive)
    pers_ref_end_pos = cur_pers_ref_pos + len(vcf_record.ref) - 1
    # Default: the record is rebased at the start of the first region
    new_pos = cur_region.base_ref_start

    # How far into the first region the record starts
    num_bases_past_first_region = cur_pers_ref_pos - cur_region.pers_ref_start
    if num_bases_past_first_region > 0:
        if cur_region.is_variant_region:
            # Variant site: keep the site's start as position and pre-pend the part
            # of the site's alt that precedes the record
            new_alt_seq = (
                cur_region.vcf_record_alt[:num_bases_past_first_region] +
                new_alt_seq)
        else:
            # Invariant site: shift the position by the same offset in base coordinates
            new_pos += num_bases_past_first_region

    while cur_pers_ref_pos <= pers_ref_end_pos:
        cur_region = region_searcher.get_region(chrom, cur_region_index)
        cur_region_end = cur_region.pers_ref_start + cur_region.length - 1
        # Number of bases of the current region lying after the end of the record
        # (0 whenever the record extends to or beyond the end of this region)
        num_bases_past_last_region = max(cur_region_end - pers_ref_end_pos, 0)
        if cur_region.is_variant_region:
            # The whole base ref allele of the site is used
            new_ref_seq += cur_region.vcf_record_ref
        else:
            # Slice of the record's ref overlapping this region, as offsets into `vcf_record.ref`
            start_offset = cur_pers_ref_pos - vcf_record.pos
            end_offset = cur_region_end - vcf_record.pos - num_bases_past_last_region
            new_ref_seq += vcf_record.ref[start_offset:end_offset + 1]
        if num_bases_past_last_region > 0 and cur_region.is_variant_region:
            # The record ends inside a variant site: post-pend the tail of the site's alt
            offset = cur_region.length - num_bases_past_last_region
            new_alt_seq = new_alt_seq + cur_region.vcf_record_alt[offset:]
        # Continue from the first position of the next region
        cur_pers_ref_pos = cur_region_end + 1
        cur_region_index += 1

    vcf_record = _modify_vcf_record(vcf_record,
                                    pos=new_pos,
                                    ref=new_ref_seq,
                                    alts=[new_alt_seq])
    return vcf_record