def test_wild_vcf_loader_simple(fixture_dirname, gpf_instance_2013):
    """Check summary indexes produced from two wildcard VCF patterns.

    Builds a ``VcfLoader`` over two ``[vc]`` filename patterns with the
    ``vcf_chromosomes`` parameter set to ``"1;2"`` and asserts that the
    summary indexes yielded by ``full_variants_iterator`` are exactly
    ``0, 1, 2, ...`` in iteration order.
    """
    vcf_patterns = [
        fixture_dirname("multi_vcf/multivcf_missing1_chr[vc].vcf.gz"),
        fixture_dirname("multi_vcf/multivcf_missing2_chr[vc].vcf.gz"),
    ]
    pedigree_path = fixture_dirname("multi_vcf/multivcf.ped")

    families = FamiliesLoader(pedigree_path).load()

    variants_loader = VcfLoader(
        families,
        vcf_patterns,
        gpf_instance_2013.genomes_db.get_genome(),
        params={"vcf_chromosomes": "1;2"},
    )
    assert variants_loader is not None
    assert len(variants_loader.vcf_loaders) == 2

    summary_indexes = [
        summary_variant.summary_index
        for summary_variant, _ in variants_loader.full_variants_iterator()
    ]
    assert summary_indexes == list(range(len(summary_indexes)))
def test_families_loader_no_role(pedigree, fixture_dirname):
    """Load a pedigree with ``ped_no_role=True`` and check f1's prb/sib.

    Family ``f1`` must contain exactly one member with role ``prb``
    (``f1.prb``) and exactly one with role ``sib`` (``f1.sib``).
    """
    pedigree_path = fixture_dirname(f"pedigrees/{pedigree}")
    assert os.path.exists(pedigree_path)

    families = FamiliesLoader(pedigree_path, ped_no_role=True).load()
    assert families is not None
    assert isinstance(families, FamiliesData)

    family = families["f1"]
    assert family is not None

    # (role name, expected id of the single member holding that role)
    for role_name, expected_person_id in (("prb", "f1.prb"), ("sib", "f1.sib")):
        members = family.get_members_with_roles([role_name])
        assert len(members) == 1
        assert members[0].person_id == expected_person_id
def fam2():
    """Return family ``f2`` parsed from the comma-separated ``PED2`` text.

    Asserts that the loaded family has exactly one trio before returning.
    """
    pedigree_stream = StringIO(PED2)
    loaded = FamiliesLoader(pedigree_stream, ped_sep=",").load()

    result = loaded["f2"]
    assert len(result.trios) == 1
    return result
def test_combine_families_sex_mismatch():
    """Combining pedigree A with pedigree D unforced raises AssertionError."""
    path_a = relative_to_this_test_folder("fixtures/pedigree_A.ped")
    families_a = FamiliesLoader.load_pedigree_file(path_a)

    path_d = relative_to_this_test_folder("fixtures/pedigree_D.ped")
    families_d = FamiliesLoader.load_pedigree_file(path_d)

    with pytest.raises(AssertionError):
        FamiliesData.combine(families_a, families_d, forced=False)
def test_famlies_loader_simple(pedigree, fixture_dirname):
    """Load a pedigree fixture with default options; result is not None."""
    pedigree_path = fixture_dirname(f"pedigrees/{pedigree}")
    assert os.path.exists(pedigree_path)

    loaded = FamiliesLoader(pedigree_path).load()
    assert loaded is not None
def test_extra_attributes_loading_with_person_id(
        fixtures_gpf_instance, fixture_dirname):
    """Load de novo variants keyed by person id and check ``StudyName``.

    Expects 17 variants in total; the first family variant of each of the
    first four entries must carry the expected ``StudyName`` attribute.
    """
    families_data = FamiliesLoader(
        fixture_dirname("backends/denovo-db-person-id.ped")).load()

    column_params = {
        "denovo_chrom": "Chr",
        "denovo_pos": "Position",
        "denovo_ref": "Ref",
        "denovo_alt": "Alt",
        "denovo_person_id": "SampleID"
    }
    loader = DenovoLoader(
        families_data,
        fixture_dirname("backends/denovo-db-person-id.tsv"),
        fixtures_gpf_instance.get_genome(),
        params=column_params,
    )

    variants = list(loader.full_variants_iterator())
    assert len(variants) == 17

    # Each entry is (summary variant, family variants); keep the first
    # family variant of every entry.
    family_variants = [entry[1][0] for entry in variants]

    expected_studies = [
        "Turner_2017", "Turner_2017", "Turner_2017", "Lelieveld2016",
    ]
    for index, study_name in enumerate(expected_studies):
        attribute = family_variants[index].get_attribute("StudyName")
        assert attribute[0] == study_name

    for family_variant in family_variants:
        print(family_variant)
def test_families_ped_df(pedigree, temp_filename, fixture_dirname):
    """``_ped_df`` starts as None; reading ``ped_df`` returns a value."""
    pedigree_path = fixture_dirname(f"pedigrees/{pedigree}")
    assert os.path.exists(pedigree_path)

    loaded = FamiliesLoader(pedigree_path).load()

    # The private attribute is None right after loading.
    assert loaded._ped_df is None

    built_df = loaded.ped_df
    assert built_df is not None
def test_vcf_loader_multi(fixture_dirname, multivcf_files, genomes_db_2013):
    """Compare a multi-file VCF load against the single original VCF.

    The same pedigree is loaded twice (once per loader). For every pair of
    entries yielded in lockstep by the two loaders the summary variants
    must be equal, each side must have five family variants, and the
    ``gt`` arrays of corresponding family variants must match.
    """
    ped_file = fixture_dirname("backends/multivcf.ped")

    backends_dir = fixture_dirname("backends")
    multivcf_files = [
        os.path.join(backends_dir, name) for name in multivcf_files
    ]

    families = FamiliesLoader(ped_file).load()
    families_multi = FamiliesLoader(ped_file).load()

    multi_vcf_loader = VcfLoader(
        families_multi,
        multivcf_files,
        genomes_db_2013.get_genome(),
        fill_missing_ref=False,
    )
    assert multi_vcf_loader is not None

    single_vcf = fixture_dirname("backends/multivcf_original.vcf")
    single_loader = VcfLoader(
        families, [single_vcf], genomes_db_2013.get_genome())
    assert single_loader is not None

    single_it = single_loader.full_variants_iterator()
    multi_it = multi_vcf_loader.full_variants_iterator()

    for s, m in zip(single_it, multi_it):
        assert s[0] == m[0]
        assert len(s[1]) == 5
        assert len(m[1]) == 5

        # Compare every family variant. The previous version indexed
        # position 0 five times, so families 2-5 were never checked.
        for single_fv, multi_fv in zip(s[1], m[1]):
            assert all((single_fv.gt == multi_fv.gt).flatten())
def test_load_family_simple(fixture_name, temp_filename, fixture_dirname):
    """Round-trip a simple families file through ``save_pedigree``.

    The family ids after saving to ``temp_filename`` and reloading it as a
    pedigree file must equal the family ids of the original load.
    """
    source_path = fixture_dirname(fixture_name)
    assert os.path.exists(source_path)

    original = FamiliesLoader.load_simple_families_file(source_path)
    assert original is not None

    FamiliesLoader.save_pedigree(original, temp_filename)
    reloaded = FamiliesLoader.load_pedigree_file(temp_filename)

    original_ids = set(original.keys())
    reloaded_ids = set(reloaded.keys())
    assert original_ids == reloaded_ids
def test_wild_vcf_loader_pedigree_union(fixture_dirname, gpf_instance_2013):
    """Check the families seen by each sub-loader in ``union`` mode.

    Pedigree layout noted by the original author:
        f1: f1.mom f1.dad f1.p1 f1.s1
        f2: f2.mom f2.dad f2.p1 f2.s1
        f3: f3.mom f3.dad f3.p1 f3.s1
        f4: f4.mom f4.dad f4.p1 f4.s1
        f5: f5.mom f5.dad f5.p1 f5.s1

    Both sub-loaders must expose equal persons and families, all three
    family collections must hold 20 persons, and no person in the combined
    loader's families may be flagged ``missing``.
    """
    vcf_patterns = [
        fixture_dirname("multi_vcf/multivcf_pedigree1_chr[vc].vcf.gz"),
        fixture_dirname("multi_vcf/multivcf_pedigree2_chr[vc].vcf.gz"),
    ]
    pedigree_path = fixture_dirname("multi_vcf/multivcf.ped")

    loaded_families = FamiliesLoader(pedigree_path).load()

    variants_loader = VcfLoader(
        loaded_families,
        vcf_patterns,
        gpf_instance_2013.genomes_db.get_genome(),
        params={
            "vcf_chromosomes": "1;2",
            "vcf_pedigree_mode": "union",
            "vcf_include_unknown_person_genotypes": True,
            "vcf_include_unknown_family_genotypes": True,
        },
    )
    assert variants_loader is not None
    assert len(variants_loader.vcf_loaders) == 2

    for sub_loader in variants_loader.vcf_loaders:
        print(sub_loader.families.persons)

    combined = variants_loader.families
    first = variants_loader.vcf_loaders[0].families
    second = variants_loader.vcf_loaders[1].families

    person_pairs = zip(first.persons.values(), second.persons.values())
    for person_a, person_b in person_pairs:
        assert person_a == person_b

    for family_id in first.keys():
        assert first[family_id] == second[family_id]

    for collection in (combined, first, second):
        assert len(collection.persons) == 20

    for person in combined.persons.values():
        assert not person.missing, person
def test_pedigree_keep_family_order_local():
    """The last member in order of each checked family has role ``prb``.

    Families are parsed from the comma-separated ``PED_FILE1`` text.
    """
    families = FamiliesLoader(StringIO(PED_FILE1), ped_sep=",").load()

    for family_id in ("SF0043014", "SF0033119", "SF0014912"):
        family = families[family_id]
        print(family.members_in_order)
        assert family.members_in_order[-1].role == Role.prb
def dae_denovo(dae_denovo_config, genome_2013, annotation_pipeline_internal):
    """Build ``RawMemoryVariants`` over an annotated de novo loader.

    Families are read from ``dae_denovo_config.family_filename`` with
    ``ped_file_format="simple"``; variants come from
    ``dae_denovo_config.denovo_filename`` and are wrapped in an
    ``AnnotationPipelineDecorator``.
    """
    families = FamiliesLoader(
        dae_denovo_config.family_filename,
        ped_file_format="simple",
    ).load()

    denovo_loader = DenovoLoader(
        families, dae_denovo_config.denovo_filename, genome_2013)
    annotated_loader = AnnotationPipelineDecorator(
        denovo_loader, annotation_pipeline_internal)

    return RawMemoryVariants([annotated_loader])
def test_families_loader_roles_testing(fixture_dirname):
    """Grandparent roles are set when loading with ``ped_no_role=True``."""
    pedigree_path = fixture_dirname("pedigrees/pedigree_no_role_C.ped")
    assert os.path.exists(pedigree_path)

    families = FamiliesLoader(pedigree_path, ped_no_role=True).load()

    expected_roles = [
        ("f1.mg_dad", Role.maternal_grandfather),
        ("f1.mg_mom", Role.maternal_grandmother),
        ("f1.pg_dad", Role.paternal_grandfather),
        ("f1.pg_mom", Role.paternal_grandmother),
    ]
    for person_id, expected_role in expected_roles:
        assert families.persons[person_id].role == expected_role
def iossifov2014_loader(dae_iossifov2014_config, genome_2013,
                        annotation_pipeline_internal):
    """Return ``(annotated variants loader, families loader)``.

    The families loader reads ``family_filename`` from the config; the
    variants loader is a ``DenovoLoader`` over ``denovo_filename`` wrapped
    in an ``AnnotationPipelineDecorator``.
    """
    families_loader = FamiliesLoader(dae_iossifov2014_config.family_filename)
    families = families_loader.load()

    denovo_loader = DenovoLoader(
        families, dae_iossifov2014_config.denovo_filename, genome_2013)
    annotated_loader = AnnotationPipelineDecorator(
        denovo_loader, annotation_pipeline_internal)

    return annotated_loader, families_loader
def main(argv):
    """Convert a simple families file into a pedigree file.

    Args:
        argv: Full argument vector; ``argv[0]`` is skipped and the rest is
            handed to ``parse_cli_arguments``.

    The study id is ``args.id`` when given, otherwise the base name of
    ``args.family_filename`` without its extension. The output path is
    ``args.output`` when given, otherwise ``"<study_id>.ped"``.
    """
    args = parse_cli_arguments(argv[1:])

    if args.id is not None:
        study_id = args.id
    else:
        study_id, _ = os.path.splitext(
            os.path.basename(args.family_filename))

    output = args.output if args.output is not None else f"{study_id}.ped"

    # Every other call site in this file uses `load_simple_families_file`
    # and passes its result to `save_pedigree`; the former
    # `load_simple_family_file` spelling was inconsistent with them.
    families = FamiliesLoader.load_simple_families_file(args.family_filename)
    FamiliesLoader.save_pedigree(families, output)
def test_wild_vcf_loader_pedigree(fixture_dirname, gpf_instance_2013):
    """Check wildcard VCF loading with ``vcf_pedigree_mode="fixed"``.

    Every sub-loader must report ``fixed_pedigree``, summary indexes must
    be ``0, 1, 2, ...`` in iteration order, and both sub-loaders must
    expose equal persons and families.
    """
    vcf_patterns = [
        fixture_dirname("multi_vcf/multivcf_pedigree1_chr[vc].vcf.gz"),
        fixture_dirname("multi_vcf/multivcf_pedigree2_chr[vc].vcf.gz"),
    ]
    pedigree_path = fixture_dirname("multi_vcf/multivcf.ped")

    families = FamiliesLoader(pedigree_path).load()

    variants_loader = VcfLoader(
        families,
        vcf_patterns,
        gpf_instance_2013.genomes_db.get_genome(),
        params={
            "vcf_chromosomes": "1;2",
            "vcf_pedigree_mode": "fixed",
            "vcf_include_unknown_person_genotypes": True,
            "vcf_include_unknown_family_genotypes": True,
        },
    )
    assert variants_loader is not None
    assert len(variants_loader.vcf_loaders) == 2

    for sub_loader in variants_loader.vcf_loaders:
        assert sub_loader.fixed_pedigree

    summary_indexes = []
    for summary_variant, family_variants in \
            variants_loader.full_variants_iterator():
        summary_indexes.append(summary_variant.summary_index)
        for family_variant in family_variants:
            print(family_variant)
    assert summary_indexes == list(range(len(summary_indexes)))

    for sub_loader in variants_loader.vcf_loaders:
        print(sub_loader.families.persons)

    first = variants_loader.vcf_loaders[0].families
    second = variants_loader.vcf_loaders[1].families

    person_pairs = zip(first.persons.values(), second.persons.values())
    for person_a, person_b in person_pairs:
        assert person_a == person_b

    for family_id in first.keys():
        assert first[family_id] == second[family_id]
def test_vcf_info_annotator(fixture_dirname, genomes_db_2013):
    """Annotate trio VCF summary variants with a ``VcfInfoAnnotator``.

    After ``annotate_summary_variant`` every alt allele must have a
    non-None ``genome_gnomad_af_percent`` attribute.
    """
    score_filename = fixture_dirname(
        "vcf_scores/gnomad.genomes.r2.1.1.sites.21.1_622.vcf.gz")

    section = {
        "options": {
            "vcf": True,
            "c": "chrom",
            "p": "position",
            "r": "reference",
            "a": "alternative",
            "scores_file": score_filename,
        },
        "columns": {
            "AC": "genome_gnomad_ac",
            "AF": "genome_gnomad_af",
            "AF_percent": "genome_gnomad_af_percent",
        },
        "annotator": "vcf_info_annotator.VcfInfoAnnotator",
        "virtual_columns": [],
    }
    config = AnnotationConfigParser.parse_section(section)

    annotator = VcfInfoAnnotator(config, genomes_db_2013)
    assert annotator is not None

    vcf_filename = fixture_dirname(
        "vcf_scores/gnomad.genomes.r2.1.1.sites.21.trio.vcf.gz")
    pedigree_filename = fixture_dirname("vcf_scores/trio.ped")
    for required_path in (vcf_filename, pedigree_filename):
        assert os.path.exists(required_path)

    families = FamiliesLoader(pedigree_filename).load()

    loader = VcfLoader(
        families, [vcf_filename], genomes_db_2013.get_genome())
    assert loader is not None

    for summary_variant, _ in loader.full_variants_iterator():
        # A fresh liftover mapping is handed to the annotator per variant.
        annotator.annotate_summary_variant(summary_variant, {})

        for alt_allele in summary_variant.alt_alleles:
            af = alt_allele.get_attribute("genome_gnomad_af_percent")
            logger.debug(
                f"summary variant: {alt_allele}; gnomad AF {af}%")
            assert af is not None
def test_families_genotypes_decorator_broken_x(fixture_dirname, genome_2013):
    """Every family variant from the fixture has model ``X_broken``.

    Families come from ``denovo_families.txt`` (simple format) and
    variants from ``denovo_X_broken.txt``.
    """
    families = FamiliesLoader(
        fixture_dirname("backends/denovo_families.txt"),
        ped_file_format="simple",
    ).load()

    variants_loader = DenovoLoader(
        families,
        fixture_dirname("backends/denovo_X_broken.txt"),
        genome_2013,
    )

    for _, family_variants in variants_loader.full_variants_iterator():
        for family_variant in family_variants:
            print(family_variant, family_variant.genetic_model)
            assert family_variant.genetic_model == GeneticModel.X_broken
def test_combine_families():
    """Combining pedigrees A and B gives ``f1`` the expected five persons."""
    path_a = relative_to_this_test_folder("fixtures/pedigree_A.ped")
    families_a = FamiliesLoader.load_pedigree_file(path_a)

    path_b = relative_to_this_test_folder("fixtures/pedigree_B.ped")
    families_b = FamiliesLoader.load_pedigree_file(path_b)

    combined = FamiliesData.combine(families_a, families_b, forced=False)

    expected_person_ids = {"f1.mom", "f1.dad", "f1.p1", "f1.s1", "f1.s2"}
    assert set(combined["f1"].persons.keys()) == expected_person_ids
def test_flexible_pedigree_read_from_filesystem(filepath, fixture_dirname):
    """``flexible_pedigree_read`` output equals the expected frame.

    The expected frame is a copy of ``expected_pedigree_df`` with
    ``sample_id`` set from ``person_id``.
    """
    expected = expected_pedigree_df.copy()
    expected["sample_id"] = expected["person_id"]

    pedigree_path = fixture_dirname(f"pedigrees/{filepath}")
    actual = FamiliesLoader.flexible_pedigree_read(pedigree_path)

    assert actual.equals(expected)
def test_extra_attributes_serialization_deserialization(
        fixtures_gpf_instance, fixture_dirname):
    """Round-trip one family variant through ``AlleleParquetSerializer``.

    After serializing and deserializing the first family variant of the
    first loader entry, ``someAttr`` must read back as ``"asdf"``.
    """
    families_data = FamiliesLoader.load_simple_families_file(
        fixture_dirname("backends/iossifov_extra_attrs.ped"))

    loader = DenovoLoader(
        families_data,
        fixture_dirname("backends/iossifov_extra_attrs.tsv"),
        fixtures_gpf_instance.get_genome(),
    )

    serializer = AlleleParquetSerializer(
        loader.get_attribute("annotation_schema"),
        loader.get_attribute("extra_attributes"),
    )

    # First entry -> its family variants -> the first of those.
    _, family_variants = next(loader.full_variants_iterator())
    variant = family_variants[0]
    print(variant.gt)

    alleles = variant.alleles
    summary_blobs = serializer.serialize_summary_data(alleles)
    scores_blob = serializer.serialize_scores_data(alleles)
    variant_blob = serializer.serialize_family_variant(
        alleles, summary_blobs, scores_blob)
    extra_blob = serializer.serialize_extra_attributes(variant)

    restored = serializer.deserialize_family_variant(
        variant_blob, variant.family, extra_blob)

    assert restored.get_attribute("someAttr")[0] == "asdf"
def test_vcf_omission_mode(
    omission_mode,
    total,
    unexpected_inheritance,
    fixture_dirname,
    genomes_db_2013,
):
    """Check variant count and inheritance for a ``vcf_omission_mode``.

    The loader must yield ``total`` family variants, and no allele's
    ``inheritance_in_members`` may intersect ``unexpected_inheritance``.
    """
    prefix = fixture_dirname("backends/inheritance_trio_denovo_omission")
    families = FamiliesLoader(f"{prefix}.ped").load()

    vcf_loader = VcfLoader(
        families,
        [f"{prefix}.vcf"],
        genomes_db_2013.get_genome(),
        params={
            "vcf_include_reference_genotypes": True,
            "vcf_include_unknown_family_genotypes": True,
            "vcf_include_unknown_person_genotypes": True,
            "vcf_omission_mode": omission_mode,
        },
    )
    assert vcf_loader is not None

    family_variants = list(vcf_loader.family_variants_iterator())
    assert len(family_variants) == total

    separator = 20 * "-"
    for family_variant in family_variants:
        for allele in family_variant.alleles:
            print(separator)
            print(allele, allele.inheritance_in_members)
            overlap = (
                set(allele.inheritance_in_members) & unexpected_inheritance
            )
            assert overlap == set()
def test_cnv_loader_alt_best_state(fixture_dirname, genomes_db_2013):
    """Load the ``alt_1_best_state`` CNV fixture with custom columns.

    Expects one summary variant and four family variants overall.
    """
    families = FamiliesLoader.load_simple_families_file(
        fixture_dirname("backends/cnv_ped.txt"))
    assert families is not None

    variants_file = fixture_dirname(
        "backends/cnv_variants_alt_1_best_state.txt")

    cnv_params = {
        "cnv_chrom": "Chr",
        "cnv_start": "Start",
        "cnv_end": "Stop",
        "cnv_variant_type": "Del/Dup",
        "cnv_plus_values": ["Dup", "Dup_Germline"],
        "cnv_minus_values": ["Del", "Del_Germline"],
        "cnv_person_id": "personId"
    }
    loader = CNVLoader(
        families,
        variants_file,
        genomes_db_2013.get_genome(),
        params=cnv_params,
    )
    assert loader is not None

    summary_variants = []
    all_family_variants = []
    for summary_variant, current_fvs in loader.full_variants_iterator():
        # Prints the family variants accumulated so far, not the current
        # batch.
        print(summary_variant, all_family_variants)
        summary_variants.append(summary_variant)
        all_family_variants.extend(current_fvs)

    assert len(summary_variants) == 1
    assert len(all_family_variants) == 4
    print(all_family_variants[0].best_state)
def test_cnv_loader_alt_2(fixture_dirname, genomes_db_2013):
    """Load the ``alt_2`` CNV fixture addressed by a location column.

    Expects 29 summary variants and 30 family variants overall.
    """
    families = FamiliesLoader.load_simple_families_file(
        fixture_dirname("backends/cnv_ped.txt"))
    assert families is not None

    variants_file = fixture_dirname("backends/cnv_variants_alt_2.txt")

    cnv_params = {
        "cnv_location": "location",
        "cnv_variant_type": "variant",
        "cnv_plus_values": ["duplication"],
        "cnv_minus_values": ["deletion"],
        "cnv_person_id": "personId"
    }
    loader = CNVLoader(
        families,
        variants_file,
        genomes_db_2013.get_genome(),
        params=cnv_params,
    )
    assert loader is not None

    summary_variants = []
    all_family_variants = []
    for summary_variant, current_fvs in loader.full_variants_iterator():
        # Prints the family variants accumulated so far, not the current
        # batch.
        print(summary_variant, all_family_variants)
        summary_variants.append(summary_variant)
        all_family_variants.extend(current_fvs)

    assert len(summary_variants) == 29
    assert len(all_family_variants) == 30
def test_families_loader_phenotype(fixture_dirname):
    """Every person loaded from ``pedigree_D.ped`` has ``phenotype``."""
    pedigree_path = fixture_dirname("pedigrees/pedigree_D.ped")
    assert os.path.exists(pedigree_path)

    families = FamiliesLoader(pedigree_path).load()
    assert families is not None
    assert isinstance(families, FamiliesData)

    for family_id, family in families.items():
        print(family_id, family, family.persons)
        for person in family.persons.values():
            print(person)
            print(person.has_attr("phenotype"))
            assert person.has_attr("phenotype")
def cnv_loader(fixture_dirname, genome_2013, annotation_pipeline_internal):
    """Return ``(families loader, annotated CNV variants loader)``.

    Families are read from ``backends/cnv_ped.txt`` in simple format;
    variants come from ``backends/cnv_variants.txt`` and are wrapped in an
    ``AnnotationPipelineDecorator``.
    """
    families_filename = fixture_dirname("backends/cnv_ped.txt")
    variants_filename = fixture_dirname("backends/cnv_variants.txt")

    families_loader = FamiliesLoader(
        families_filename, ped_file_format="simple")
    families = families_loader.load()

    raw_loader = CNVLoader(families, variants_filename, genome_2013)
    annotated_loader = AnnotationPipelineDecorator(
        raw_loader, annotation_pipeline_internal)

    return families_loader, annotated_loader
def test_ped_prepare_simple(test_config, fake_ped_file):
    """Run ``PreparePersons`` prepare and save on a pedigree frame.

    The role mapping in the config is set to ``"INTERNAL"`` before the
    preparer is constructed.
    """
    test_config.person.role.mapping = "INTERNAL"
    preparer = PreparePersons(test_config)

    raw_df = FamiliesLoader.flexible_pedigree_read(fake_ped_file)
    assert raw_df is not None

    prepared_df = preparer.prepare_pedigree(raw_df)
    preparer.save_pedigree(prepared_df)
def test_multivcf_loader_fill_missing(fixture_dirname, fill_mode, fill_value,
                                      genomes_db_2013):
    """Check genotype fill-in when loading two VCFs with missing samples.

    The first entry's genotypes for the families at index 0 and 4 must
    match fixed arrays. In the second entry the families at index 0-2 must
    be filled entirely with ``fill_value`` while index 4 keeps its real
    genotype. The first six entries must sit at the expected positions.
    """
    ped_file = fixture_dirname("backends/multivcf.ped")
    multivcf_files = [
        fixture_dirname("backends/multivcf_missing1.vcf"),
        fixture_dirname("backends/multivcf_missing2.vcf"),
    ]
    families = FamiliesLoader(ped_file).load()

    multi_vcf_loader = VcfLoader(
        families,
        multivcf_files,
        genomes_db_2013.get_genome(),
        params={
            "vcf_include_reference_genotypes": True,
            "vcf_include_unknown_family_genotypes": True,
            "vcf_include_unknown_person_genotypes": True,
            "vcf_multi_loader_fill_in_mode": fill_mode,
        },
    )
    assert multi_vcf_loader is not None

    multi_it = multi_vcf_loader.full_variants_iterator()
    svs_fvs = list(multi_it)
    print(svs_fvs)

    first_fvs = svs_fvs[0][1]
    second_fvs = svs_fvs[1][1]

    # The iterator was fully consumed above.
    assert next(multi_it, None) is None

    first_expected = (
        (0, np.array([[1, 1], [0, 0], [0, 1], [0, 1]], dtype=np.int8)),
        (4, np.array([[1, 1], [0, 0], [1, 0], [0, 1]], dtype=np.int8)),
    )
    for family_index, expected_gt in first_expected:
        actual_gt = first_fvs[family_index].genotype
        assert all((actual_gt == expected_gt).flatten())

    for family_index in (0, 1):
        print(second_fvs[family_index], " ",
              second_fvs[family_index].genotype)

    filled_expected = np.array([[fill_value] * 2] * 4, dtype=np.int8)
    for family_index in (0, 1, 2):
        actual_gt = second_fvs[family_index].genotype
        assert all((actual_gt == filled_expected).flatten())

    present_expected = np.array(
        [[0, 0], [1, 1], [1, 0], [0, 1]], dtype=np.int8)
    assert all((second_fvs[4].genotype == present_expected).flatten())

    expected_positions = [865582, 865583, 865624, 865627, 865664, 865691]
    for entry_index, position in enumerate(expected_positions):
        assert svs_fvs[entry_index][0].ref_allele.position == position
def builder(path, params=None):
    """Build annotated variant loaders for the study data at ``path``.

    Args:
        path: Passed to ``vcf_loader_data`` to resolve the pedigree, VCF
            and optional de novo file.
        params: Parameters for the ``VcfLoader``. When omitted (``None``)
            a fresh default dict is built on every call, so a dict that
            the loader mutates can never leak between calls.

    Returns:
        A list of ``AnnotationPipelineDecorator`` loaders: the de novo
        loader first (only when ``config.denovo`` is truthy), then the
        VCF loader.
    """
    if params is None:
        params = {
            "vcf_include_reference_genotypes": True,
            "vcf_include_unknown_family_genotypes": True,
            "vcf_include_unknown_person_genotypes": True,
            "vcf_denovo_mode": "denovo",
            "vcf_omission_mode": "omission",
        }

    config = vcf_loader_data(path)

    families_loader = FamiliesLoader(config.pedigree)
    families = families_loader.load()

    loaders = []

    if config.denovo:
        denovo_loader = DenovoLoader(
            families,
            config.denovo,
            genomes_db_2013.get_genome(),
            params={
                "denovo_genotype": "genotype",
                "denovo_family_id": "family",
                "denovo_chrom": "chrom",
                "denovo_pos": "pos",
                "denovo_ref": "ref",
                "denovo_alt": "alt",
            },
        )
        loaders.append(
            AnnotationPipelineDecorator(
                denovo_loader, default_annotation_pipeline))

    vcf_loader = VcfLoader(
        families,
        [config.vcf],
        genomes_db_2013.get_genome(),
        params=params,
    )
    loaders.append(
        AnnotationPipelineDecorator(vcf_loader, default_annotation_pipeline))

    return loaders
def test_mom_dad_child_sibling_roles(fixture_dirname, ped_file):
    """``FamilyRoleBuilder`` gives f1 the roles dad, mom, prb, sib.

    Roles are checked positionally on ``family.full_members`` after
    ``build_roles`` has run.
    """
    families = FamiliesLoader.load_pedigree_file(fixture_dirname(ped_file))
    family = families.get("f1")

    FamilyRoleBuilder(family).build_roles()

    members = family.full_members
    expected_roles = (Role.dad, Role.mom, Role.prb, Role.sib)
    for position, expected_role in enumerate(expected_roles):
        assert members[position].role == expected_role