Beispiel #1
0
    def test_SNP_OnTopOfIndel(self):
        """
        Rebase a SNP that was discovered inside an insertion carried by the inferred reference.

        The discovered record only spells out the changed base, so the rebased alt
        has to carry the flanking bases of the inserted allele as well.
        """
        # base sequ: T TAT CGG T     A
        # secondary: T G   CGG TCTGC A
        site_records = [
            _MockVcfRecord(pos=2, ref="TAT", alts=["G"]),
            _MockVcfRecord(pos=8, ref="T", alts=["TCTGC"]),
        ]
        region_map = SeqRegionMapper(site_records, {"JAC": 9}).get_map()
        searcher = SearchableSeqRegionsMap(region_map)

        # Position 9 of the secondary sequence is the 'G' inside the inserted 'TCTGC'.
        snp = _MockVcfRecord(pos=9, ref="G", alts=["A"])
        rebased = discover._rebase_vcf_record(snp, "JAC", searcher)

        # Compare through the mock type, using only pos/ref/alts of the rebased record.
        result = _MockVcfRecord(pos=rebased.pos, ref=rebased.ref, alts=rebased.alts)
        expected = _MockVcfRecord(pos=8, ref="T", alts=["TCTAC"])

        self.assertEqual(expected, result)
    def test_TwoRecords_CorrectRegions(self):
        """
        Two variant records separated by one invariant base yield five regions:
        invariant, variant, invariant, variant, invariant.
        """
        # base sequence:      T TAT    C G   G
        # derived sequence:   T GCCAC  C TTT G
        first_site = _MockVcfRecord(pos=2, ref="TAT", alts=["GCCAC"])
        second_site = _MockVcfRecord(pos=6, ref="G", alts=["TTT"])

        result = SeqRegionMapper([first_site, second_site], {"JAC": 7}).get_map()

        expected = [
            SeqRegion(base_ref_start=1, pers_ref_start=1, length=1),
            SeqRegion(base_ref_start=2, pers_ref_start=2, length=5,
                      vcf_record_ref="TAT", vcf_record_alt="GCCAC"),
            SeqRegion(base_ref_start=5, pers_ref_start=7, length=1),
            SeqRegion(base_ref_start=6, pers_ref_start=8, length=3,
                      vcf_record_ref="G", vcf_record_alt="TTT"),
            SeqRegion(base_ref_start=7, pers_ref_start=11, length=1),
        ]

        self.assertEqual(expected, result["JAC"])
    def test_chrom_with_no_records(self):
        """
        Chromosomes without any initial variation must be mapped as well:
        they are expected to consist of a single invariant region.
        """
        only_record = _MockVcfRecord(pos=2, ref="T", alts=["A"], chrom="Chrom_2")
        sizes = {"Chrom_1": 4, "Chrom_2": 5}

        result = SeqRegionMapper([only_record], sizes).get_map()

        expectations = {
            # No record on Chrom_1: one invariant region covering all 4 bases.
            "Chrom_1": [
                SeqRegion(base_ref_start=1, pers_ref_start=1, length=4),
            ],
            "Chrom_2": [
                SeqRegion(base_ref_start=1, pers_ref_start=1, length=1),
                SeqRegion(base_ref_start=2, pers_ref_start=2, length=1,
                          vcf_record_ref="T", vcf_record_alt="A"),
                SeqRegion(base_ref_start=3, pers_ref_start=3, length=3),
            ],
        }
        for chrom, expected_regions in expectations.items():
            self.assertEqual(expected_regions, result[chrom])
Beispiel #4
0
    def test_multiple_deletions(self):
        """
        Rebase a deletion discovered on top of a deletion inside a variant site,
        which also removes a base from the following non-variant site.

        The original deletion also contains a SNP, to make it plausible that
        quasimap/infer picks this variant.

        Because the discovered variation starts inside a variant site, the rebased
        alt is expected to be elongated, and the rebased ref is expected to include
        all deleted bases.
        """
        # base reference:     CAA C GCTA CAA
        # inferred reference: C   C GAT  CAA
        site_records = [
            _MockVcfRecord(pos=1, ref="CAA", alts=["C"]),
            _MockVcfRecord(pos=5, ref="GCTA", alts=["GAT"]),
        ]
        region_map = SeqRegionMapper(site_records, {"JAC": 11}).get_map()
        searcher = SearchableSeqRegionsMap(region_map)

        # 'ATC' at inferred position 4: the last two bases of 'GAT' plus the next 'C'.
        deletion = _MockVcfRecord(pos=4, ref="ATC", alts=["A"])
        rebased = discover._rebase_vcf_record(deletion, "JAC", searcher)

        result = _MockVcfRecord(pos=rebased.pos, ref=rebased.ref, alts=rebased.alts)
        expected = _MockVcfRecord(pos=5, ref="GCTAC", alts=["GA"])

        self.assertEqual(expected, result)
 def test_NoRecords(self):
     """
     Building a map from no records at all must raise ValueError.

     Rationale: if the vcf from `infer` had no records, `build` would not have
     succeeded in the first place, as no prg is built when there are no variants.
     """
     # base sequence:      TTATCGG
     # derived sequence:   TTATCGG
     no_records = []
     no_chroms = {}
     with self.assertRaises(ValueError):
         SeqRegionMapper(no_records, no_chroms).get_map()
 def test_ref_call_produces_invariant_region_only(self):
     """
     A record whose sample genotype is [0] is expected to leave the chromosome
     as one single invariant region covering its whole length.
     """
     # base sequence:      T TAT CGG
     # derived sequence:   ^^^^^^^^^
     ref_call = _MockVcfRecord(
         pos=2, ref="TAT", alts=["G"], samples=[{"GT": [0]}]
     )

     result = SeqRegionMapper([ref_call], {"JAC": 7}).get_map()

     whole_chrom = SeqRegion(base_ref_start=1, pers_ref_start=1, length=7)
     self.assertEqual([whole_chrom], result["JAC"])
    def test_ThreeAdjacentRecords_CorrectRegions(self):
        """
        Three back-to-back variant records yield three consecutive variant regions,
        with no invariant region in between them.
        """
        # base sequence:      T TAT    C   G  G
        # derived sequence:   T GCCAC  TCT AA G
        adjacent_sites = [
            _MockVcfRecord(pos=2, ref="TAT", alts=["GCCAC"]),
            _MockVcfRecord(pos=5, ref="C", alts=["TCT"]),
            _MockVcfRecord(pos=6, ref="G", alts=["AA"]),
        ]
        result = SeqRegionMapper(adjacent_sites, {"JAC": 7}).get_map()

        expected = [
            SeqRegion(base_ref_start=1, pers_ref_start=1, length=1),
            SeqRegion(base_ref_start=2, pers_ref_start=2, length=5,
                      vcf_record_ref="TAT", vcf_record_alt="GCCAC"),
            SeqRegion(base_ref_start=5, pers_ref_start=7, length=3,
                      vcf_record_ref="C", vcf_record_alt="TCT"),
            SeqRegion(base_ref_start=6, pers_ref_start=10, length=2,
                      vcf_record_ref="G", vcf_record_alt="AA"),
            SeqRegion(base_ref_start=7, pers_ref_start=12, length=1),
        ]

        # Only one chromosome was given, so take the first region list in the map.
        regions_per_chrom = list(result.values())
        self.assertEqual(expected, regions_per_chrom[0])
Beispiel #8
0
def _make_rebasing_map(geno_paths: GenotypePaths):
    """
    Builds and writes to disk a mapping object supporting coordinate translation between
    the original reference (that the genotyped vcf uses as REF) and the gramtools-induced
    personalised reference.

    The map can be used to translate points in either reference coordinate space to the other.
    Used in `discover` for rebasing newly found variants against the original reference.

    Reads `geno_paths.pers_ref` (chromosome sizes) and `geno_paths.geno_vcf` (records),
    and dumps the searchable map to `geno_paths.rebasing_map` without sequences.
    Returns None.
    """
    chrom_sizes: ChromSizes = common.load_fasta(geno_paths.pers_ref, sizes_only=True)

    # Keep the vcf open only while its records are consumed by the mapper, so that
    # the file handle is released deterministically, also when mapping raises.
    with VariantFile(geno_paths.geno_vcf) as geno_vcf:
        region_map: SeqRegionsMap = SeqRegionMapper(
            geno_vcf.fetch(), chrom_sizes
        ).get_map()

    SearchableSeqRegionsMap(region_map).dump_to(
        geno_paths.rebasing_map, dump_sequences=False
    )
Beispiel #9
0
def _rebase_vcf(disco_paths: DiscoverPaths, check_records=True):
    """Rebase a vcf so that it uses same reference as base_vcf.
    (* for not an input/output, just for illustration)
    Input:
     discovery.vcf                   personalised_ref.vcf
      |                               |
     personalised_ref.fasta          *base_ref.fasta

    Output:
     discovery.vcf
      |
     *base_ref.fasta

    :param disco_paths: provides `pers_ref`, `geno_vcf` and `discov_vcf_cortex`.
    :param check_records: when True, each discovered record is first checked against
        the personalised reference sequence of its chromosome, and records failing
        the check are skipped (and reported in the log).
    :return: list of rebased records, one per processed discovered record.
    """
    if check_records:
        var_unplaced_records = []
        inferred_refs = load_fasta(disco_paths.pers_ref)

    _add_contig_lines(disco_paths)

    # Not loading genotype-produced rebasing map here, because it lacks the sequences
    chrom_sizes: ChromSizes = load_fasta(disco_paths.pers_ref, sizes_only=True)
    # Context managers close each vcf deterministically, also on error.
    with VariantFile(disco_paths.geno_vcf) as base_vcf:
        region_map: SeqRegionsMap = SeqRegionMapper(
            base_vcf.fetch(), chrom_sizes
        ).get_map()
    region_searcher = SearchableSeqRegionsMap(region_map)

    new_vcf_records = []
    with VariantFile(disco_paths.discov_vcf_cortex) as derived_vcf:
        for vcf_record in derived_vcf.fetch():
            chrom_key = vcf_record.chrom

            # Presumably check_ref_consistent stores rejected records in
            # var_unplaced_records - verify against its definition.
            if check_records and not check_ref_consistent(
                vcf_record, inferred_refs[chrom_key], var_unplaced_records
            ):
                continue  # Do not process inconsistent records

            new_vcf_records.append(
                _rebase_vcf_record(vcf_record, chrom_key, region_searcher)
            )

    if check_records and var_unplaced_records:
        log.warning(
            f"{len(var_unplaced_records)} new variant records were skipped, "
            "because record pos and ref do not coincide with personalised reference"
        )
        log.debug("Skipped records: {}".format("\n".join(var_unplaced_records)))

    return new_vcf_records
Beispiel #10
0
    def test_StartsAtNonSite_EndsAtSite(self):
        """
        Rebase a record that begins in a non-variant region and ends inside a variant site:
        the rebased ref is expected to contain the full base ref of that site.
        """
        # base sequence:      T TAT CGG
        # derived sequence:   T G   CGG
        site_records = [_MockVcfRecord(pos=2, ref="TAT", alts=["G"])]
        region_map = SeqRegionMapper(site_records, {"JAC": 7}).get_map()
        searcher = SearchableSeqRegionsMap(region_map)

        # 'TG' covers the leading invariant 'T' and the site's alt 'G'.
        discovered = _MockVcfRecord(pos=1, ref="TG", alts=["TAA"])
        rebased = discover._rebase_vcf_record(discovered, "JAC", searcher)

        result = _MockVcfRecord(pos=rebased.pos, ref=rebased.ref, alts=rebased.alts)
        expected = _MockVcfRecord(pos=1, ref="TTAT", alts=["TAA"])

        self.assertEqual(expected, result)
Beispiel #11
0
    def test_SingleSNPInNonSite(self):
        """
        Rebase a SNP lying in a non-variant region downstream of a variant site:
        only its position is expected to change, ref and alt stay the same.
        """
        # base sequence:      T TAT CGG
        # derived sequence:   T G   CGG
        # NOTE(review): the base sequence drawn above has 7 bases, while 5 matches the
        # three-base window 'T G C' plus nothing else obvious - confirm the intended size.
        sizes = {"JAC": 5}
        site_records = [_MockVcfRecord(pos=2, ref="TAT", alts=["G"])]
        searcher = SearchableSeqRegionsMap(
            SeqRegionMapper(site_records, sizes).get_map()
        )

        # 'C' is at position 3 of the derived sequence and position 5 of the base sequence.
        snp = _MockVcfRecord(pos=3, ref="C", alts=["G"])
        rebased = discover._rebase_vcf_record(snp, "JAC", searcher)

        result = _MockVcfRecord(pos=rebased.pos, ref=rebased.ref, alts=rebased.alts)
        expected = _MockVcfRecord(pos=5, ref="C", alts=["G"])

        self.assertEqual(expected, result)
Beispiel #12
0
    def test_variant_in_chromo_with_no_prg_variants(self):
        """
        A record discovered on a chromosome that has no base variants is expected
        to come out of rebasing equal to the record that went in.
        """
        # chr1 base:    T TAT CGG
        # chr1 derived: T G   CGG
        # chr2 base:    TTTTT
        # chr2 derived: TTTTT
        sizes = {"chr1": 7, "chr2": 5}
        chr1_site = _MockVcfRecord(pos=2, ref="TAT", alts=["G"], chrom="chr1")
        region_map = SeqRegionMapper([chr1_site], sizes).get_map()
        searcher = SearchableSeqRegionsMap(region_map)

        discov_record = _MockVcfRecord(pos=1, ref="TT", alts=["GA"], chrom="chr2")
        rebased = discover._rebase_vcf_record(discov_record, "chr2", searcher)

        self.assertEqual(discov_record, rebased)
    def test_SingleBaseAlt_CorrectRegion(self):
        """
        A single record whose alt is one base long yields a variant region of length 1,
        flanked by an invariant region on each side.
        """
        # base sequence:      T TAT CGG
        # derived sequence:   T G   CGG
        site = _MockVcfRecord(pos=2, ref="TAT", alts=["G"])

        result = SeqRegionMapper([site], {"JAC": 7}).get_map()

        leading = SeqRegion(base_ref_start=1, pers_ref_start=1, length=1)
        variant = SeqRegion(base_ref_start=2, pers_ref_start=2, length=1,
                            vcf_record_ref="TAT", vcf_record_alt="G")
        # The 3-base ref was replaced by 1 base: base coordinates run 2 ahead from here on.
        trailing = SeqRegion(base_ref_start=5, pers_ref_start=3, length=3)

        self.assertEqual([leading, variant, trailing], result["JAC"])
    def test_TwoRecords_TwoDifferentChroms(self):
        """
        Records located on different chromosomes each end up in the region list
        of their own chromosome, with coordinates starting again at 1.
        """
        records = [
            _MockVcfRecord(pos=4, ref="ATTC", alts=["A"], chrom="Chrom_1"),
            _MockVcfRecord(pos=6, ref="A", alts=["AAC"], chrom="Chrom_2"),
        ]
        sizes = {"Chrom_1": 10, "Chrom_2": 8}

        result = SeqRegionMapper(records, sizes).get_map()

        expectations = {
            # Deletion of 3 bases on Chrom_1.
            "Chrom_1": [
                SeqRegion(base_ref_start=1, pers_ref_start=1, length=3),
                SeqRegion(base_ref_start=4, pers_ref_start=4, length=1,
                          vcf_record_ref="ATTC", vcf_record_alt="A"),
                SeqRegion(base_ref_start=8, pers_ref_start=5, length=3),
            ],
            # Insertion of 2 bases on Chrom_2.
            "Chrom_2": [
                SeqRegion(base_ref_start=1, pers_ref_start=1, length=5),
                SeqRegion(base_ref_start=6, pers_ref_start=6, length=3,
                          vcf_record_ref="A", vcf_record_alt="AAC"),
                SeqRegion(base_ref_start=7, pers_ref_start=9, length=2),
            ],
        }
        for chrom, expected_regions in expectations.items():
            self.assertEqual(expected_regions, result[chrom])
Beispiel #15
0
    def test_SiteInBetweenNonSites(self):
        """
        Rebase a record, found on top of the inferred reference, which spans a non-variant
        region, a variant site and another non-variant region of the prg.

        The rebased ref is expected to include the bases of all three.
        """
        # base sequ: T TAT CGG
        # secondary: T G   CGG
        site_records = [_MockVcfRecord(pos=2, ref="TAT", alts=["G"])]
        region_map = SeqRegionMapper(site_records, {"JAC": 7}).get_map()
        searcher = SearchableSeqRegionsMap(region_map)

        # 'TGCG' = leading 'T' + site alt 'G' + first two bases of the trailing 'CGG'.
        spanning = _MockVcfRecord(pos=1, ref="TGCG", alts=["GGCT"])
        rebased = discover._rebase_vcf_record(spanning, "JAC", searcher)

        result = _MockVcfRecord(pos=rebased.pos, ref=rebased.ref, alts=rebased.alts)
        expected = _MockVcfRecord(pos=1, ref="TTATCG", alts=["GGCT"])

        self.assertEqual(expected, result)
def make_map(base_records: MockVcfRecords,
             chrom_sizes: List[int]) -> SearchableSeqRegionsMap:
    """
    Build a searchable region map from records and a plain list of chromosome sizes.

    The i-th size (counting from 0) is registered under the chromosome name "chr<i>".
    """
    sizes_by_name = {
        f"chr{index}": size for index, size in enumerate(chrom_sizes)
    }
    mapper = SeqRegionMapper(base_records, sizes_by_name)
    return SearchableSeqRegionsMap(mapper.get_map())