def _build_region_bins(self):
    """Populate ``self.region_bins`` with one ``Region`` per region bin.

    Every chromosome in ``self.chromosome_lengths`` is cut into consecutive
    windows of ``self.region_length`` positions (1-based, inclusive; the
    last window is clipped to the chromosome length).  A chromosome that is
    not listed in ``self.chromosomes`` is accounted under the shared
    ``"other"`` name, so the bin ``other_<i>`` ends at the largest window
    end seen for index ``i`` among all such chromosomes.

    Returns:
        An empty list when no chromosomes are configured or the region
        length is zero; otherwise ``None`` (the bins are stored in
        ``self.region_bins``).
    """
    if not self.chromosomes or self.region_length == 0:
        return []

    for chrom in self.chromosome_lengths:
        chrom_length = self.chromosome_lengths[chrom]
        target_chrom = chrom if chrom in self.chromosomes else "other"

        bins_count = math.ceil(chrom_length / self.region_length)
        for region_index in range(bins_count):
            region_bin = f"{target_chrom}_{region_index}"
            region = Region(
                region_bin,
                region_index * self.region_length + 1,
                min((region_index + 1) * self.region_length, chrom_length),
            )

            if region_bin in self.region_bins:
                # Several chromosomes can map to the same bin (the "other"
                # bucket).  Windows with the same index always start at the
                # same position, so only the end needs to be widened.
                prev_region = self.region_bins[region_bin]
                assert prev_region.chrom == region_bin
                assert prev_region.begin == region.begin
                # Keep the window's own begin: a literal 1 here would break
                # the assertion above the next time a bin with index > 0 is
                # merged.
                region = Region(
                    region_bin,
                    region.begin,
                    max(region.end, prev_region.end),
                )

            self.region_bins[region_bin] = region
def _do_annotate_cnv(self, variant):
    """Build the list of effects for a CNV variant.

    The effect type is ``"CNV+"`` or ``"CNV-"`` depending on which CNV flag
    is set in ``variant.variant_type``.  For every ``(start, stop)`` key of
    ``self.gene_models.utr_models[variant.chromosome]`` whose interval
    intersects the variant span, one effect is created per entry stored
    under that key (presumably transcript models - confirm against the gene
    models class).  When nothing intersects, a single effect carrying only
    the effect type is returned.

    Raises:
        ValueError: if neither CNV flag is set in the variant type.
    """
    assert VariantType.is_cnv(variant.variant_type)

    if variant.variant_type & VariantType.cnv_p:
        effect_type = "CNV+"
    elif variant.variant_type & VariantType.cnv_m:
        effect_type = "CNV-"
    else:
        raise ValueError(
            f"unexpected variant type: {variant.variant_type}")

    chrom = variant.chromosome
    cnv_region = Region(
        chrom, variant.position, variant.position + variant.length)

    effects = []
    utr_models = self.gene_models.utr_models[chrom]
    for (start, stop), transcripts in utr_models.items():
        if not cnv_region.intersection(Region(chrom, start, stop)):
            continue
        effects.extend(
            EffectFactory.create_effect_with_tm(effect_type, tm)
            for tm in transcripts
        )

    if not effects:
        # Nothing overlapped: report the bare CNV effect.
        effects.append(EffectFactory.create_effect(effect_type))
    return effects
def test_cnv_impala_region_query(cnv_impala):
    """Check the number of denovo CNV variants returned per query region."""
    expected_counts = [
        (Region("1", 1600000, 1620000), 1),
        (Region("1", 1600000, 1630000), 2),
        (Region("1", 1000000, 2000000), 2),
    ]

    for region, expected in expected_counts:
        vs = cnv_impala.query_variants(
            regions=[region],
            effect_types=["CNV+", "CNV-"],
            variant_type="cnv+ or cnv-",
            inheritance="denovo",
        )
        assert len(list(vs)) == expected
def test_trios_multi_multi3_full(variants_vcf):
    """Positions 11505 and 11506 each yield one variant with a 4x3 best state."""
    for position in (11505, 11506):
        # The fixture is loaded anew for every position, as in the
        # original two-step version of this test.
        fvars = variants_vcf("backends/trios_multi")
        found = list(
            fvars.query_variants(
                regions=[Region("1", position, position)],
                return_reference=True,
                return_unknown=True,
            )
        )
        assert len(found) == 1
        for variant in found:
            print(mat2str(variant.best_state))
            assert variant.best_state.shape == (4, 3)
def load_config(genome_config, section_id):
    """Create a ``Genome`` for ``section_id`` from a genome configuration.

    Copies the genomic sequence file name, registers one
    ``Genome.GeneModelsConfig`` per entry of ``genome_config.gene_models``,
    records the default gene models id and file, and - when
    ``genome_config.pars`` is set - parses the X and Y region strings into
    ``genome.pars`` keyed by the chromosome of the first region of each list.

    Returns:
        The populated ``Genome`` instance.
    """
    genome = Genome(section_id)
    genome.genomic_sequence_filename = genome_config.chr_all_file

    for models_id, models_config in genome_config.gene_models.items():
        gene_models = Genome.GeneModelsConfig(
            models_id,
            models_config.file,
            models_config.fileformat,
            None,
        )
        genome._gene_models[gene_models.id] = gene_models

    assert genome_config.default_gene_models in genome._gene_models
    genome.default_gene_models_id = genome_config.default_gene_models
    default_models = genome._gene_models[genome.default_gene_models_id]
    genome.default_gene_models_filename = default_models.file

    if genome_config.pars:
        assert genome_config.pars.X is not None
        regions_x = list(map(Region.from_str, genome_config.pars.X))
        chrom_x = regions_x[0].chrom

        regions_y = list(map(Region.from_str, genome_config.pars.Y))
        chrom_y = regions_y[0].chrom

        genome.pars = {chrom_x: regions_x, chrom_y: regions_y}

    return genome
def test_collapse_no_chrom_simple(regions, expected):
    """Collapsing the comma-separated ``regions`` must give ``expected``."""
    parsed = list(map(Region.from_str, regions.split(",")))
    collapsed = collapse_no_chrom(parsed)

    assert len(collapsed) == len(expected)
    for actual, wanted in zip(collapsed, expected):
        assert actual == wanted
def v_vcf(variants_impl):
    """Return the single variant at 1:11548 from the ``backends/a`` VCF data."""
    loader = variants_impl("variants_vcf")
    vvars = loader("backends/a")
    found = list(vvars.query_variants(regions=[Region("1", 11548, 11548)]))
    assert len(found) == 1
    return found[0]
def test_f1_requested_effects(
    variants_impl,
    variants,
    position,
    inheritance,
    effect_types,
    return_reference,
    matched_alleles_effects,
):
    """One variant is found and its matched alleles count is as expected."""
    vvars = variants_impl(variants)("backends/f1_test")
    assert vvars is not None

    found = list(
        vvars.query_variants(
            regions=[Region("1", position, position)],
            inheritance=inheritance,
            effect_types=effect_types,
            return_reference=return_reference,
            return_unknown=True,
        )
    )
    assert len(found) == 1

    variant = found[0]
    print(variant, variant.effects, variant.matched_alleles)
    assert len(variant.matched_alleles) == len(matched_alleles_effects)
def test_df_query_multiallelic3_families(variants_impl, variants, fixture_name):
    """Family ``f1`` at 1:11606 has two alt alleles: mom1's and dad1's."""
    dfvars = variants_impl(variants)(fixture_name)
    assert dfvars is not None

    found = list(
        dfvars.query_variants(
            regions=[Region("1", 11606, 11606)],
            family_ids=["f1"],
            return_reference=True,
            return_unknown=True,
        )
    )
    assert len(found) == 1

    variant = found[0]
    print(variant, mat2str(variant.best_state))

    # Both alleles are indexed before the length check, as in the original.
    fa1 = variant.alt_alleles[0]
    fa2 = variant.alt_alleles[1]
    assert len(variant.alt_alleles) == 2

    assert "mom1" in fa1.variant_in_members
    assert "dad1" in fa2.variant_in_members
    assert "ch1" not in fa1.variant_in_members
    assert "ch1" not in fa2.variant_in_members
def _build_gene_regions_heuristic(self, genes, regions):
    """Narrow ``regions`` to the neighbourhood of the requested genes.

    The heuristic applies only when ``genes`` is non-empty and has at most
    ``GENE_REGIONS_HEURISTIC_CUTOFF`` entries; otherwise ``regions`` is
    returned untouched.  For each gene with known gene models, the ``tx``
    interval of every model is widened by
    ``GENE_REGIONS_HEURISTIC_EXTEND`` on both sides and the resulting
    regions are collapsed.

    Returns:
        The collapsed gene regions when ``regions`` is empty or ``None``;
        otherwise the collapsed pairwise intersections of the gene regions
        with ``regions``.
    """
    assert genes is not None
    if not 0 < len(genes) <= self.GENE_REGIONS_HEURISTIC_CUTOFF:
        return regions

    gene_regions = []
    for gene_symbol in genes:
        models = self.gene_models.gene_models_by_gene_name(gene_symbol)
        if models is None:
            logger.warning(f"gene model for {gene_symbol} not found")
            continue
        gene_regions.extend(
            Region(
                gm.chrom,
                gm.tx[0] - self.GENE_REGIONS_HEURISTIC_EXTEND,
                gm.tx[1] + self.GENE_REGIONS_HEURISTIC_EXTEND,
            )
            for gm in models
        )

    gene_regions = dae.utils.regions.collapse(gene_regions)
    logger.info(f"gene regions for {genes}: {gene_regions}")
    logger.info(f"input regions: {regions}")

    if not regions:
        # No explicit regions requested: the gene regions are the query.
        return gene_regions

    result = []
    for gene_region in gene_regions:
        overlaps = (gene_region.intersection(r) for r in regions)
        result.extend(overlap for overlap in overlaps if overlap)
    result = dae.utils.regions.collapse(result)
    logger.info(f"original regions: {regions}; result: {result}")
    return result
def reset_regions(self, regions):
    """Reset the loader regions and parse them into ``Region`` objects.

    Delegates to the parent implementation, then replaces every entry of
    ``self.regions`` that is not ``None`` with ``Region.from_str(entry)``;
    ``None`` entries are kept as they are.

    Args:
        regions: passed unchanged to the parent ``reset_regions``.
    """
    # Imported locally so the method does not depend on a module-level
    # logger being defined in this file.
    import logging

    super(CNVLoader, self).reset_regions(regions)
    self.regions = [
        region if region is None else Region.from_str(region)
        for region in self.regions
    ]
    # Diagnostics go to the logging system rather than stdout.
    logging.getLogger(__name__).debug(
        "CNV reset regions: %s", self.regions)
def reset_regions(self, regions):
    """Reset the loader regions and parse them into ``Region`` objects.

    Delegates to the parent implementation, then replaces every entry of
    ``self.regions`` that is not ``None`` with ``Region.from_str(entry)``;
    ``None`` entries are kept as they are.

    Args:
        regions: passed unchanged to the parent ``reset_regions``.
    """
    super(DenovoLoader, self).reset_regions(regions)
    self.regions = [
        entry if entry is None else Region.from_str(entry)
        for entry in self.regions
    ]
    logger.debug(f"denovo reset regions: {self.regions}")
def test_trios_multi_all_reference(variants_vcf):
    """The variant at 1:11502 has a 3x3 best state printed in 11 characters."""
    fvars = variants_vcf("backends/trios_multi")
    query = fvars.query_variants(
        regions=[Region("1", 11502, 11502)],
        return_reference=True,
        return_unknown=True,
    )
    found = list(query)

    assert len(found) == 1
    for variant in found:
        assert variant.best_state.shape == (3, 3)
        assert len(mat2str(variant.best_state)) == 11
def test_trios_multi_single_allele2_full(variants_vcf):
    """The variant at 1:11501 has a 3x3 best state."""
    fvars = variants_vcf("backends/trios_multi")
    query = fvars.query_variants(
        regions=[Region("1", 11501, 11501)],
        return_reference=True,
        return_unknown=True,
    )
    found = list(query)

    assert len(found) == 1
    for variant in found:
        assert variant.best_state.shape == (3, 3)
def test_11540_family_alleles(variants_impl):
    """Check allele indexing of the first variant found in 1:11539-11542."""
    fvars = variants_impl("variants_vcf")("backends/a")
    query = fvars.query_variants(regions=[Region("1", 11539, 11542)])

    variant = next(query)
    assert variant.position == 11540
    assert len(variant.alt_alleles) == 1

    alt_allele = variant.alt_alleles[0]
    assert alt_allele.allele_index == 2
    assert alt_allele.cshl_variant == "sub(T->A)"

    # Expected value stays on the left-hand side, as in the original.
    assert [0, 2] == variant.allele_indexes
    assert [0, 1] == variant.family_allele_indexes
""" Created on Jul 2, 2018 @author: lubo """ import pytest from dae.utils.regions import Region @pytest.mark.parametrize("variants", ["variants_impala", "variants_vcf"]) @pytest.mark.parametrize( "fixture_name,regions,family_ids,count", [ ("backends/trios2", [Region("1", 11539, 11539)], ["f1"], 1), ("backends/trios2", [Region("1", 11539, 11539)], ["f2"], 1), ("backends/trios2", [Region("1", 11539, 11539)], ["f1", "f2"], 2), ("backends/trios2", [Region("1", 11539, 11539)], [], 0), ("backends/trios2", [Region("1", 11539, 11539)], None, 2), ( "backends/trios2", [Region("1", 11539, 11539), Region("1", 11551, 11551)], ["f1"], 2, ), ( "backends/trios2", [Region("1", 11539, 11539), Region("1", 11551, 11551)], ["f2"], 2, ),
import pytest

from dae.utils.regions import Region
from dae.variants.attributes import Role
from dae.backends.attributes_query import role_query


@pytest.mark.parametrize("variants", ["variants_impala", "variants_vcf"])
@pytest.mark.parametrize(
    "fixture_name,regions,roles,count",
    [
        ("backends/effects_trio_dad", None, "dad", 1),
        ("backends/effects_trio", None, "dad", 1),
        ("backends/trios2", [Region("1", 11539, 11552)], "prb", 2),
    ],
)
def test_fixture_query_by_roles(
    variants_impl, variants, fixture_name, regions, roles, count
):
    """Querying a fixture by ``roles`` returns ``count`` variants."""
    vvars = variants_impl(variants)(fixture_name)
    assert vvars is not None

    found = list(vvars.query_variants(regions=regions, roles=roles))
    print(found)
    assert len(found) == count


def test_roles_matcher():
    # NOTE(review): only this assignment is visible in this view; the rest
    # of the test appears to be cut off - confirm against the full file.
    roles = "dad"
"""
Created on Mar 30, 2018

@author: lubo
"""
import pytest

from dae.utils.regions import Region


@pytest.mark.parametrize(
    "region,count,ref_freq,alt_freq",
    [
        (Region("1", 11501, 11501), 1, 75.0, 25.0),
        (Region("1", 11503, 11503), 1, 75.0, 25.0),
        (Region("1", 11511, 11511), 1, 50.0, 50.0),
        (Region("1", 11515, 11515), 1, 75.0, 25.0),
    ],
)
def test_variant_attributes(variants_vcf, region, count, ref_freq, alt_freq):
    """Each variant carries single-valued allele count/frequency attributes."""
    fvars = variants_vcf("backends/inheritance_trio")
    found = list(fvars.query_variants(regions=[region]))
    assert len(found) == count

    for variant in found:
        for attribute in ("af_allele_count", "af_allele_freq"):
            assert len(variant.get_attribute(attribute)) == 1

        rfreq = variant["af_ref_allele_freq"]
        # Looked up but not asserted on in the visible part of this test.
        afreq = variant["af_allele_freq"]
        assert ref_freq == pytest.approx(rfreq[0], rel=1e-2)
""" Created on Mar 29, 2018 @author: lubo """ import pytest from dae.utils.regions import Region from dae.variants.attributes import Inheritance @pytest.mark.parametrize( "region,count,inheritance", [ (Region("1", 11500, 11500), 1, Inheritance.unknown), (Region("1", 11501, 11501), 1, Inheritance.unknown), (Region("1", 11502, 11502), 1, Inheritance.unknown), (Region("1", 11503, 11503), 1, Inheritance.unknown), (Region("1", 11504, 11504), 1, Inheritance.unknown), (Region("1", 11505, 11505), 1, Inheritance.unknown), ], ) def test_inheritance_nontrio(variants_vcf, region, count, inheritance): fvars = variants_vcf("backends/inheritance_nontrio") vs = list( fvars.query_variants( regions=[region], family_ids=["f1"], return_reference=True, return_unknown=True, )
import pytest from dae.utils.regions import Region @pytest.mark.parametrize("variants", [ "iossifov2014_raw_denovo", "iossifov2014_impala", ]) @pytest.mark.parametrize( "region,cshl_location,effect_type", [ (Region("15", 80137553, 80137553), "15:80137554", "noEnd"), (Region("12", 116418553, 116418553), "12:116418554", "splice-site"), (Region("3", 56627767, 56627767), "3:56627768", "splice-site"), (Region("3", 195475903, 195475903), "3:195475904", "splice-site"), (Region("21", 38877891, 38877891), "21:38877892", "splice-site"), (Region("15", 43694048, 43694048), "15:43694049", "splice-site"), (Region("12", 93792632, 93792632), "12:93792633", "splice-site"), (Region("4", 83276456, 83276456), "4:83276456", "splice-site"), (Region("3", 195966607, 195966607), "3:195966608", "splice-site"), (Region("3", 97611837, 97611837), "3:97611838", "splice-site"), (Region("15", 31776803, 31776803), "15:31776803", "no-frame-shift"), (Region("3", 151176416, 151176416), "3:151176416", "no-frame-shift"), ], ) def test_iossifov2014_variant_coordinates( variants, iossifov2014_impala, iossifov2014_raw_denovo, region, cshl_location,
""" Created on Mar 5, 2018 @author: lubo """ import pytest from dae.utils.regions import Region from dae.utils.variant_utils import mat2str @pytest.mark.parametrize("variants", ["variants_impala", "variants_vcf"]) @pytest.mark.parametrize( "region,count,freq0,freq1", [ (Region("1", 11539, 11539), 2, 75.0, 25.0), (Region("1", 11540, 11540), 2, 75.0, 25.0), (Region("1", 11541, 11541), 1, 87.5, 12.5), (Region("1", 11542, 11542), 1, 87.5, 12.5), (Region("1", 11550, 11550), 0, 100.0, 0.0), (Region("1", 11553, 11553), 2, 100.0, 0.0), (Region("1", 11551, 11551), 2, 0.0, 100.0), (Region("1", 11552, 11552), 2, 0.0, 100.0), ], ) def test_variant_frequency_single(variants_impl, variants, region, count, freq0, freq1): fvars = variants_impl(variants)("backends/trios2") vs = list( fvars.query_variants(regions=[region], return_reference=False,
"""
Created on Apr 16, 2018

@author: lubo
"""
import pytest

from dae.utils.regions import Region
from dae.utils.variant_utils import mat2str


@pytest.mark.parametrize(
    "region,worst_effect,count",
    [
        # (Region('1', 878109, 878109), ("missense", "missense")),
        (Region("1", 901921, 901921), ("synonymous", "missense"), 1),
        (Region("1", 905956, 905956), ("frame-shift", "missense"), 1),
    ],
)
def test_multi_alt_allele_effects(variants_vcf, region, worst_effect, count):
    """Each variant found has two effects with the expected worst effect."""
    fvars = variants_vcf("backends/effects_trio_multi")
    # NOTE(review): other tests pass ``effect_types=``; confirm that
    # ``effects=`` is the keyword intended here.
    found = list(fvars.query_variants(regions=[region], effects=["missense"]))

    for variant in found:
        print("------------------")
        print(mat2str(variant.best_state))
        print(mat2str(variant.gt))

        assert len(variant.effects) == 2
        for index in (0, 1):
            assert variant.effects[index].worst == worst_effect[index]

    assert len(found) == count
import pytest from dae.utils.regions import Region, collapse, collapse_no_chrom @pytest.mark.parametrize( "region,expected", [ ("1:1-2", Region("1", 1, 2)), ("chr1:1-2", Region("chr1", 1, 2)), ("X:1-2", Region("X", 1, 2)), ("chrX:1-2", Region("chrX", 1, 2)), ("GL000192.1:1-2", Region("GL000192.1", 1, 2)), ("chrUn_GL000218v1:1-2", Region("chrUn_GL000218v1", 1, 2)), ("chr4_KI270790v1_alt:1-2", Region("chr4_KI270790v1_alt", 1, 2)), ("chr1:1", Region("chr1", 1, 1)), ("chr1:1,000,000-2,000,000", Region("chr1", 1_000_000, 2_000_000)), ("chr1_KI270706v1_random", Region("chr1_KI270706v1_random")), ], ) def test_parse_regions(region, expected): result = Region.from_str(region) # assert result is not None assert result == expected @pytest.mark.parametrize( "regions,expected", [ ("1:1-2,1:1-3", [Region("1", 1, 3)]), ("1:1-2,1:2-3", [Region("1", 1, 3)]), ("1:1-2,2:2-3", [Region("1", 1, 2),
"""
Created on Mar 20, 2018

@author: lubo
"""
import pytest

from dae.utils.regions import Region
from dae.utils.variant_utils import mat2str


@pytest.mark.parametrize(
    "region,count,inheritance",
    [
        (Region("1", 11501, 11510), 4, "mendelian"),
        (Region("1", 11511, 11520), 5, "omission"),
        (Region("1", 11521, 11530), 4, "denovo"),
        (Region("1", 11531, 11540), 1, "unknown"),
    ],
)
def test_inheritance_trio_full(variants_vcf, region, count, inheritance):
    """Every variant matching ``inheritance`` prints a 7-character best state."""
    fvars = variants_vcf("backends/inheritance_trio")
    query = fvars.query_variants(
        inheritance=inheritance,
        regions=[region],
        return_reference=True,
    )
    found = list(query)

    for variant in found:
        # assert Inheritance.from_name(inheritance) in v.inheritance_in_members
        assert len(mat2str(variant.best_state)) == 7
""" Created on Jul 3, 2018 @author: lubo """ import pytest from dae.utils.regions import Region @pytest.mark.parametrize("variants", ["variants_impala", "variants_vcf"]) @pytest.mark.parametrize( "regions,effect,count", [ ([Region("1", 865582, 865691)], "synonymous", 3), ([Region("1", 865582, 865691)], "missense", 3), ([Region("1", 878109, 905956)], "missense", 1), ([Region("1", 878109, 905956)], "synonymous", 1), ([Region("1", 878109, 905956)], "frame-shift", 1), ], ) def test_single_alt_allele_effects(variants_impl, variants, regions, effect, count): fvars = variants_impl(variants)("backends/effects_trio") vs = list(fvars.query_variants(regions=regions, effect_types=[effect])) for v in vs: print(v.effects) assert len(vs) == count @pytest.mark.parametrize("variants", ["variants_impala", "variants_vcf"])
def test_parse_regions(region, expected):
    """``Region.from_str`` must parse ``region`` into ``expected``."""
    parsed = Region.from_str(region)
    assert parsed == expected
assert vvars is not None vs = vvars.query_variants() vs = list(vs) print(vs) assert len(vs) == 15 @pytest.mark.parametrize("variants", [ "variants_impala", ]) # "variants_vcf"]) @pytest.mark.parametrize( "regions,inheritance,sv_count,fv_count", [ ([Region("1", 865581, 865581)], None, 1, 5), ([Region("1", 865582, 865582) ], None, 2, 5), # one denovo, one transmitted ([Region("1", 865581, 865582)], None, 3, 10), ([Region("1", 865582, 865582)], "denovo", 1, 1), ([Region("1", 865581, 865582)], "denovo", 1, 1), ([Region("1", 865581, 865583)], "denovo", 1, 1), ([Region("1", 865583, 865583)], None, 3, 8), # FIXME: 1, 5 ([Region("1", 865581, 865583)], None, 6, 18), # FIXME: 3, 15 ]) def test_summary_stats_summary(variants_impl, variants, regions, inheritance, sv_count, fv_count): vvars = variants_impl(variants)("backends/summary_stats") assert vvars is not None
def test_11548_gt(variants_impl):
    """Check genotype, best state and family genotype of the 1:11548 variant."""
    fvars = variants_impl("variants_vcf")("backends/a")
    query = fvars.query_variants(regions=[Region("1", 11548, 11548)])
    variant = next(query)
    assert variant.position == 11548

    print(variant.gt)
    expected_gt = np.array([
        [2, 2, 2, 2, 2, 2, 2],
        [2, 2, 3, 2, 2, 2, 2],
    ])
    assert np.all(expected_gt == variant.gt)

    print(variant.best_state)
    expected_best_state = np.array([
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [2, 2, 1, 2, 2, 2, 2],
        [0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
    ])
    assert np.all(expected_best_state == variant.best_state)

    expected_genotype = [
        [2, 2], [2, 2], [2, 3], [2, 2], [2, 2], [2, 2], [2, 2],
    ]
    genotype_matches = [
        wanted == actual
        for wanted, actual in zip(expected_genotype, variant.genotype)
    ]
    assert all(genotype_matches)

    expected_family_genotype = [
        [1, 1], [1, 1], [1, 2], [1, 1], [1, 1], [1, 1], [1, 1],
    ]
    family_genotype_matches = [
        wanted == actual
        for wanted, actual in zip(
            expected_family_genotype, variant.family_genotype)
    ]
    assert all(family_genotype_matches)
""" Created on Mar 30, 2018 @author: lubo """ import pytest from dae.utils.regions import Region @pytest.mark.skip @pytest.mark.parametrize( "region,worst_effect", [ (Region("1", 878109, 878109), ("missense", "missense")), (Region("1", 901921, 901921), ("synonymous", "missense")), (Region("1", 905956, 905956), ("frame-shift", "missense")), ], ) def test_serialize_deserialize_worst_effect( variants_vcf, region, worst_effect, effect_annotator ): fvars = variants_vcf("fixtures/effects_trio_multi") vs = fvars.query_variants(regions=[region]) for v in vs: print(v, v.alternative) effects1 = effect_annotator.do_annotate_variant( v.chromosome, v.position, v.reference, v.alt_alleles[0].alternative ) effects2 = effect_annotator.do_annotate_variant( v.chromosome, v.position, v.reference, v.alt_alleles[1].alternative
"""
Created on Mar 29, 2018

@author: lubo
"""
import pytest

from dae.utils.regions import Region
from dae.utils.variant_utils import mat2str


@pytest.mark.parametrize("variants", ["variants_impala", "variants_vcf"])
@pytest.mark.parametrize(
    "region,count,members",
    [
        (Region("1", 11500, 11500), 1, ["mom1", None, None]),
        (Region("1", 11501, 11501), 1, ["mom1", None, "ch1"]),
        (Region("1", 11502, 11502), 1, [None, None, "ch1"]),
        (Region("1", 11503, 11503), 1, ["mom1", "dad1", "ch1"]),
    ],
)
def test_variant_in_members(variants_impl, variants, region, count, members):
    """Every alt allele reports the expected ``variant_in_members`` list."""
    fvars = variants_impl(variants)("backends/unknown_trio")
    found = list(fvars.query_variants(regions=[region]))
    assert len(found) == count

    for variant in found:
        print(variant, mat2str(variant.best_state))
        for alt_allele in variant.alt_alleles:
            carriers = alt_allele.variant_in_members
            print(alt_allele, carriers)
            assert list(carriers) == members