def test_proband_column(fixture_dirname):
    """Check ``proband`` attribute presence and role assignment without roles.

    Two fixture pedigrees are loaded with ``ped_no_role=True``:

    * ``pedigree_no_role_F.ped`` -- no person may have a ``proband``
      attribute;
    * ``pedigree_no_role_H.ped`` -- every person must have a ``proband``
      attribute, and the full members of family ``f1`` must carry the
      roles listed below, in that order.
    """
    no_role = {"ped_no_role": True}

    path_without_proband = fixture_dirname("pedigrees/pedigree_no_role_F.ped")
    families = FamiliesLoader(path_without_proband, **no_role).load()
    for member in families.persons.values():
        assert not member.has_attr("proband")

    path_with_proband = fixture_dirname("pedigrees/pedigree_no_role_H.ped")
    families = FamiliesLoader(path_with_proband, **no_role).load()
    for member in families.persons.values():
        assert member.has_attr("proband")

    family = families.get("f1")
    assert family is not None

    members = family.full_members
    expected_roles = (
        Role.maternal_grandfather,
        Role.maternal_grandmother,
        Role.paternal_grandfather,
        Role.paternal_grandmother,
        Role.dad,
        Role.mom,
        Role.maternal_aunt,
        Role.unknown,
        Role.unknown,
        Role.paternal_uncle,
        Role.prb,
        Role.sib,
        Role.maternal_cousin,
        Role.paternal_cousin,
    )
    # Index explicitly so that a too-short member list still fails loudly.
    for position, role in enumerate(expected_roles):
        assert members[position].role == role
def fam2():
    """Return family ``f2`` parsed from the comma-separated ``PED2`` text.

    The family is asserted to contain exactly one trio before it is
    returned.
    """
    pedigree_stream = StringIO(PED2)
    loaded = FamiliesLoader(pedigree_stream, ped_sep=",").load()

    result = loaded["f2"]
    assert len(result.trios) == 1
    return result
def test_famlies_loader_simple(pedigree, fixture_dirname):
    """Load the ``pedigree`` fixture file and check a result is returned."""
    ped_path = fixture_dirname(f"pedigrees/{pedigree}")
    assert os.path.exists(ped_path)

    loaded = FamiliesLoader(ped_path).load()
    assert loaded is not None
def test_families_loader_no_role(pedigree, fixture_dirname):
    """Load a pedigree with ``ped_no_role=True`` and check prb/sib lookup.

    Family ``f1`` must have exactly one member for role ``prb``
    (``f1.prb``) and exactly one for role ``sib`` (``f1.sib``).
    """
    ped_path = fixture_dirname(f"pedigrees/{pedigree}")
    assert os.path.exists(ped_path)

    families = FamiliesLoader(ped_path, ped_no_role=True).load()
    assert families is not None
    assert isinstance(families, FamiliesData)

    family = families["f1"]
    assert family is not None

    for role_name, expected_person_id in (("prb", "f1.prb"),
                                          ("sib", "f1.sib")):
        matched = family.get_members_with_roles([role_name])
        assert len(matched) == 1
        assert matched[0].person_id == expected_person_id
def test_wild_vcf_loader_simple(fixture_dirname, gpf_instance_2013):
    """Load two wildcard VCF filenames and check summary index numbering.

    The loader must expose two underlying VCF loaders, and the summary
    indexes produced by the full variants iterator must be exactly
    ``0, 1, 2, ...`` in iteration order.
    """
    # NOTE(review): the "[vc]" part of the names is presumably expanded
    # using the "vcf_chromosomes" parameter -- confirm in VcfLoader.
    vcf_filenames = [
        fixture_dirname("multi_vcf/multivcf_missing1_chr[vc].vcf.gz"),
        fixture_dirname("multi_vcf/multivcf_missing2_chr[vc].vcf.gz"),
    ]
    ped_path = fixture_dirname("multi_vcf/multivcf.ped")
    families = FamiliesLoader(ped_path).load()

    variants_loader = VcfLoader(
        families,
        vcf_filenames,
        gpf_instance_2013.genomes_db.get_genome(),
        params={"vcf_chromosomes": "1;2"},
    )
    assert variants_loader is not None
    assert len(variants_loader.vcf_loaders) == 2

    summary_indexes = [
        summary.summary_index
        for summary, _ in variants_loader.full_variants_iterator()
    ]
    assert summary_indexes == list(range(len(summary_indexes)))
def test_extra_attributes_loading_with_person_id(
        fixtures_gpf_instance, fixture_dirname):
    """Check extra ``StudyName`` attributes on de novo variants.

    The de novo file is loaded with custom column names (person id taken
    from ``SampleID``). Seventeen variants are expected, and the first
    family variant of each of the first four entries must report the
    expected ``StudyName``.
    """
    families_data = FamiliesLoader(
        fixture_dirname("backends/denovo-db-person-id.ped")).load()

    column_params = {
        "denovo_chrom": "Chr",
        "denovo_pos": "Position",
        "denovo_ref": "Ref",
        "denovo_alt": "Alt",
        "denovo_person_id": "SampleID"
    }
    loader = DenovoLoader(
        families_data,
        fixture_dirname("backends/denovo-db-person-id.tsv"),
        fixtures_gpf_instance.get_genome(),
        params=column_params,
    )

    variants = list(loader.full_variants_iterator())
    assert len(variants) == 17

    # Each entry holds the family variants at index 1; keep the first one.
    family_variants = [entry[1][0] for entry in variants]

    expected_studies = (
        "Turner_2017",
        "Turner_2017",
        "Turner_2017",
        "Lelieveld2016",
    )
    for position, study_name in enumerate(expected_studies):
        attribute = family_variants[position].get_attribute("StudyName")
        assert attribute[0] == study_name

    for family_variant in family_variants:
        print(family_variant)
def main(argv=None):
    """Draw the families of a pedigree file into a PDF document.

    Args:
        argv: command line arguments without the program name. When
            ``None`` (the default) ``argparse`` reads ``sys.argv[1:]`` at
            call time. The previous default, ``sys.argv[1:]``, was
            evaluated once when the function was defined and therefore
            ignored any later change to ``sys.argv``.

    Raises:
        AssertionError: if ``--mode`` is neither ``families`` nor
            ``report``.
    """
    parser = argparse.ArgumentParser(
        description="Produce a pedigree drawing in PDF format "
        "from a pedigree file with layout coordinates.",
        conflict_handler="resolve",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--verbose', '-V', action='count', default=0)

    FamiliesLoader.cli_arguments(parser)

    parser.add_argument(
        "--output",
        "-o",
        metavar="o",
        help="the output filename file",
        default="output.pdf",
    )
    parser.add_argument(
        "--mode",
        type=str,
        default="report",
        dest="mode",
        help="mode of drawing; supported modes are `families` and `report`; "
        "defaults: `report`",
    )

    args = parser.parse_args(argv)

    # -VV selects INFO, -VVV (or more) DEBUG; everything else is WARNING.
    if args.verbose >= 3:
        log_level = logging.DEBUG
    elif args.verbose == 2:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING
    logging.basicConfig(level=log_level)
    # Keep matplotlib quiet regardless of the requested verbosity.
    logging.getLogger("matplotlib").setLevel(logging.WARNING)

    filename, params = FamiliesLoader.parse_cli_arguments(args)
    families = FamiliesLoader(filename, **params).load()

    mode = args.mode
    assert mode in ("families", "report")
    print("mode:", mode)

    if mode == "report":
        generator = draw_families_report(families)
    else:
        generator = draw_families(families)

    with PDFLayoutDrawer(args.output) as pdf_drawer:
        for fig in generator:
            pdf_drawer.savefig(fig)
            # Close each figure once it has been saved.
            plt.close(fig)
def test_families_ped_df(pedigree, temp_filename, fixture_dirname):
    """Check ``ped_df`` is available after loading a pedigree.

    Right after loading, the private ``_ped_df`` attribute must still be
    ``None``; reading the ``ped_df`` attribute must then give a non-``None``
    value.
    """
    ped_path = fixture_dirname(f"pedigrees/{pedigree}")
    assert os.path.exists(ped_path)

    families = FamiliesLoader(ped_path).load()
    assert families._ped_df is None

    frame = families.ped_df
    assert frame is not None
def test_wild_vcf_loader_pedigree_union(fixture_dirname, gpf_instance_2013):
    """Check families of a multi-VCF loader in ``union`` pedigree mode.

    Both underlying VCF loaders must end up with equal persons and equal
    families, each view must hold 20 persons, and no person of the
    combined families may be flagged as missing.
    """
    # f1: f1.mom f1.dad f1.p1 f1.s1
    # f2: f2.mom f2.dad f2.p1 f2.s1
    # f3: f3.mom f3.dad f3.p1 f3.s1
    # f4: f4.mom f4.dad f4.p1 f4.s1
    # f5: f5.mom f5.dad f5.p1 f5.s1
    vcf_filenames = [
        fixture_dirname("multi_vcf/multivcf_pedigree1_chr[vc].vcf.gz"),
        fixture_dirname("multi_vcf/multivcf_pedigree2_chr[vc].vcf.gz"),
    ]
    ped_path = fixture_dirname("multi_vcf/multivcf.ped")
    families = FamiliesLoader(ped_path).load()

    loader_params = {
        "vcf_chromosomes": "1;2",
        "vcf_pedigree_mode": "union",
        "vcf_include_unknown_person_genotypes": True,
        "vcf_include_unknown_family_genotypes": True,
    }
    variants_loader = VcfLoader(
        families,
        vcf_filenames,
        gpf_instance_2013.genomes_db.get_genome(),
        params=loader_params,
    )
    assert variants_loader is not None
    assert len(variants_loader.vcf_loaders) == 2

    for inner_loader in variants_loader.vcf_loaders:
        print(inner_loader.families.persons)

    # From here on work with the families as exposed by the loaders.
    families = variants_loader.families
    families1 = variants_loader.vcf_loaders[0].families
    families2 = variants_loader.vcf_loaders[1].families

    person_pairs = zip(families1.persons.values(), families2.persons.values())
    for left, right in person_pairs:
        assert left == right

    for family_id in families1.keys():
        assert families1[family_id] == families2[family_id]

    for view in (families, families1, families2):
        assert len(view.persons) == 20

    for person in families.persons.values():
        assert not person.missing, person
def test_pedigree_keep_family_order_local():
    """Check the last in-order member of three families is the proband.

    Families are parsed from the comma-separated ``PED_FILE1`` text.
    """
    families = FamiliesLoader(StringIO(PED_FILE1), ped_sep=",").load()

    for family_id in ("SF0043014", "SF0033119", "SF0014912"):
        family = families[family_id]
        print(family.members_in_order)
        assert family.members_in_order[-1].role == Role.prb
def dae_denovo(dae_denovo_config, genome_2013, annotation_pipeline_internal):
    """Build in-memory variants from the configured de novo study files.

    The families file is read using the ``simple`` pedigree file format,
    the de novo loader is wrapped with the annotation pipeline decorator
    and the result is returned as ``RawMemoryVariants``.
    """
    families = FamiliesLoader(
        dae_denovo_config.family_filename,
        ped_file_format="simple",
    ).load()

    denovo_loader = DenovoLoader(
        families, dae_denovo_config.denovo_filename, genome_2013)
    annotated_loader = AnnotationPipelineDecorator(
        denovo_loader, annotation_pipeline_internal)

    return RawMemoryVariants([annotated_loader])
def test_families_loader_roles_testing(fixture_dirname):
    """Check grandparent roles of a pedigree loaded with ``ped_no_role``."""
    ped_path = fixture_dirname("pedigrees/pedigree_no_role_C.ped")
    assert os.path.exists(ped_path)

    families = FamiliesLoader(ped_path, ped_no_role=True).load()

    expected_roles = (
        ("f1.mg_dad", Role.maternal_grandfather),
        ("f1.mg_mom", Role.maternal_grandmother),
        ("f1.pg_dad", Role.paternal_grandfather),
        ("f1.pg_mom", Role.paternal_grandmother),
    )
    for person_id, role in expected_roles:
        assert families.persons[person_id].role == role
def iossifov2014_loader(dae_iossifov2014_config,
                        genome_2013,
                        annotation_pipeline_internal):
    """Create loaders for the configured iossifov2014 study files.

    Returns:
        A ``(variants_loader, families_loader)`` tuple, where the variants
        loader is a de novo loader wrapped with the annotation pipeline
        decorator.
    """
    families_loader = FamiliesLoader(dae_iossifov2014_config.family_filename)
    families = families_loader.load()

    denovo_loader = DenovoLoader(
        families, dae_iossifov2014_config.denovo_filename, genome_2013)
    annotated_loader = AnnotationPipelineDecorator(
        denovo_loader, annotation_pipeline_internal)

    return annotated_loader, families_loader
def test_vcf_info_annotator(fixture_dirname, genomes_db_2013):
    """Annotate VCF summary variants with scores from a VCF score file.

    After annotation, every alternative allele of every summary variant
    must have a non-``None`` ``genome_gnomad_af_percent`` attribute.
    """
    score_filename = fixture_dirname(
        "vcf_scores/gnomad.genomes.r2.1.1.sites.21.1_622.vcf.gz")

    section = {
        "options": {
            "vcf": True,
            "c": "chrom",
            "p": "position",
            "r": "reference",
            "a": "alternative",
            "scores_file": score_filename,
        },
        "columns": {
            "AC": "genome_gnomad_ac",
            "AF": "genome_gnomad_af",
            "AF_percent": "genome_gnomad_af_percent",
        },
        "annotator": "vcf_info_annotator.VcfInfoAnnotator",
        "virtual_columns": [],
    }
    config = AnnotationConfigParser.parse_section(section)

    annotator = VcfInfoAnnotator(config, genomes_db_2013)
    assert annotator is not None

    vcf_filename = fixture_dirname(
        "vcf_scores/gnomad.genomes.r2.1.1.sites.21.trio.vcf.gz")
    pedigree_filename = fixture_dirname("vcf_scores/trio.ped")
    assert os.path.exists(vcf_filename)
    assert os.path.exists(pedigree_filename)

    families = FamiliesLoader(pedigree_filename).load()
    loader = VcfLoader(families, [vcf_filename], genomes_db_2013.get_genome())
    assert loader is not None

    for summary_variant, _ in loader.full_variants_iterator():
        # A fresh, empty liftover mapping is handed in for each variant.
        annotator.annotate_summary_variant(summary_variant, {})

        for aa in summary_variant.alt_alleles:
            af = aa.get_attribute("genome_gnomad_af_percent")
            logger.debug(f"summary variant: {aa}; gnomad AF {af}%")
            assert af is not None
def test_wild_vcf_loader_pedigree(fixture_dirname, gpf_instance_2013):
    """Check a multi-VCF loader in ``fixed`` pedigree mode.

    Every underlying loader must report ``fixed_pedigree``, summary
    indexes must be numbered ``0, 1, 2, ...`` in iteration order, and both
    underlying loaders must hold equal persons and equal families.
    """
    vcf_filenames = [
        fixture_dirname("multi_vcf/multivcf_pedigree1_chr[vc].vcf.gz"),
        fixture_dirname("multi_vcf/multivcf_pedigree2_chr[vc].vcf.gz"),
    ]
    ped_path = fixture_dirname("multi_vcf/multivcf.ped")
    families = FamiliesLoader(ped_path).load()

    loader_params = {
        "vcf_chromosomes": "1;2",
        "vcf_pedigree_mode": "fixed",
        "vcf_include_unknown_person_genotypes": True,
        "vcf_include_unknown_family_genotypes": True,
    }
    variants_loader = VcfLoader(
        families,
        vcf_filenames,
        gpf_instance_2013.genomes_db.get_genome(),
        params=loader_params,
    )
    assert variants_loader is not None
    assert len(variants_loader.vcf_loaders) == 2

    for inner_loader in variants_loader.vcf_loaders:
        assert inner_loader.fixed_pedigree

    summary_indexes = []
    for summary, family_variants in variants_loader.full_variants_iterator():
        summary_indexes.append(summary.summary_index)
        for family_variant in family_variants:
            print(family_variant)
    assert summary_indexes == list(range(len(summary_indexes)))

    for inner_loader in variants_loader.vcf_loaders:
        print(inner_loader.families.persons)

    families1 = variants_loader.vcf_loaders[0].families
    families2 = variants_loader.vcf_loaders[1].families

    person_pairs = zip(families1.persons.values(), families2.persons.values())
    for left, right in person_pairs:
        assert left == right

    for family_id in families1.keys():
        assert families1[family_id] == families2[family_id]
def test_families_genotypes_decorator_broken_x(fixture_dirname, genome_2013):
    """Check all de novo family variants have the ``X_broken`` model.

    Families are read using the ``simple`` pedigree file format.
    """
    families = FamiliesLoader(
        fixture_dirname("backends/denovo_families.txt"),
        ped_file_format="simple",
    ).load()

    variants_loader = DenovoLoader(
        families,
        fixture_dirname("backends/denovo_X_broken.txt"),
        genome_2013,
    )

    # Lazily flatten (summary, family variants) pairs into family variants.
    all_family_variants = (
        family_variant
        for _, family_variants in variants_loader.full_variants_iterator()
        for family_variant in family_variants
    )
    for family_variant in all_family_variants:
        print(family_variant, family_variant.genetic_model)
        assert family_variant.genetic_model == GeneticModel.X_broken
def test_families_loader_phenotype(fixture_dirname):
    """Check every person of ``pedigree_D.ped`` has ``phenotype``."""
    ped_path = fixture_dirname("pedigrees/pedigree_D.ped")
    assert os.path.exists(ped_path)

    families = FamiliesLoader(ped_path).load()
    assert families is not None
    assert isinstance(families, FamiliesData)

    for family_id, family in families.items():
        print(family_id, family, family.persons)
        for _, member in family.persons.items():
            print(member)
            print(member.has_attr("phenotype"))
            assert member.has_attr("phenotype")
def cnv_loader(fixture_dirname, genome_2013, annotation_pipeline_internal):
    """Create loaders for the CNV fixture files.

    Families are read using the ``simple`` pedigree file format.

    Returns:
        A ``(families_loader, variants_loader)`` tuple, where the variants
        loader is a CNV loader wrapped with the annotation pipeline
        decorator.
    """
    families_filename = fixture_dirname("backends/cnv_ped.txt")
    variants_filename = fixture_dirname("backends/cnv_variants.txt")

    families_loader = FamiliesLoader(
        families_filename, ped_file_format="simple")
    families = families_loader.load()

    raw_loader = CNVLoader(families, variants_filename, genome_2013)
    annotated_loader = AnnotationPipelineDecorator(
        raw_loader, annotation_pipeline_internal)

    return families_loader, annotated_loader
def builder(path, params=None):
    """Build annotated variant loaders for the study found at ``path``.

    Args:
        path: passed to ``vcf_loader_data`` to locate the study's pedigree,
            VCF and (optional) de novo files.
        params: parameters for the ``VcfLoader``. When ``None`` a fresh
            copy of the default parameters is used. The defaults used to
            be a dict literal in the signature, which is created once and
            shared by every call, so a mutation by one call would leak
            into the next.

    Returns:
        A list of loaders wrapped with the annotation pipeline decorator:
        the de novo loader first (only when the study has a de novo file),
        followed by the VCF loader.
    """
    if params is None:
        params = {
            "vcf_include_reference_genotypes": True,
            "vcf_include_unknown_family_genotypes": True,
            "vcf_include_unknown_person_genotypes": True,
            "vcf_denovo_mode": "denovo",
            "vcf_omission_mode": "omission",
        }

    config = vcf_loader_data(path)

    families_loader = FamiliesLoader(config.pedigree)
    families = families_loader.load()

    loaders = []

    if config.denovo:
        denovo_loader = DenovoLoader(
            families,
            config.denovo,
            genomes_db_2013.get_genome(),
            params={
                "denovo_genotype": "genotype",
                "denovo_family_id": "family",
                "denovo_chrom": "chrom",
                "denovo_pos": "pos",
                "denovo_ref": "ref",
                "denovo_alt": "alt",
            },
        )
        loaders.append(
            AnnotationPipelineDecorator(
                denovo_loader, default_annotation_pipeline))

    vcf_loader = VcfLoader(
        families,
        [config.vcf],
        genomes_db_2013.get_genome(),
        params=params,
    )
    loaders.append(
        AnnotationPipelineDecorator(vcf_loader, default_annotation_pipeline))

    return loaders
def test_families_loader_phenos(fixture_dirname):
    """Check the phenotype columns of ``pedigree_phenos.ped``.

    Every person must have the ``phenotype``, ``pheno2`` and ``pheno3``
    attributes, and the matching ``ped_df`` columns must have a string
    dtype.
    """
    ped_path = fixture_dirname("pedigrees/pedigree_phenos.ped")
    assert os.path.exists(ped_path)

    families = FamiliesLoader(ped_path).load()
    assert families is not None
    assert isinstance(families, FamiliesData)

    for _, family in families.items():
        for _, member in family.persons.items():
            for attribute in ("phenotype", "pheno2", "pheno3"):
                assert member.has_attr(attribute)

    ped_df = families.ped_df
    for column in ("pheno3", "pheno2", "phenotype"):
        assert is_string_dtype(ped_df[column])
def test_vcf_loader(vcf_loader_data, variants_vcf, fixture_data,
                    genomes_db_2013):
    """Load a VCF study and print every family variant it produces.

    Reference genotypes and unknown family/person genotypes are included.
    """
    conf = vcf_loader_data(fixture_data)
    print(conf)

    families = FamiliesLoader(conf.pedigree).load()

    include_everything = {
        "vcf_include_reference_genotypes": True,
        "vcf_include_unknown_family_genotypes": True,
        "vcf_include_unknown_person_genotypes": True,
    }
    loader = VcfLoader(
        families,
        [conf.vcf],
        genomes_db_2013.get_genome(),
        params=include_everything,
    )
    assert loader is not None

    # Read all variants first, then print them.
    loaded_variants = list(loader.family_variants_iterator())
    for family_variant in loaded_variants:
        print(family_variant)
def pedigree_test(fixture_dirname):
    """Return the families loaded from the ``pedigrees/test.ped`` fixture."""
    ped_path = fixture_dirname("pedigrees/test.ped")
    return FamiliesLoader(ped_path).load()
def main(argv):
    """Convert a pedigree file into a families parquet file.

    Args:
        argv: command line arguments, parsed with ``argparse``.

    When a partition description is given and its ``family_bin_size`` is
    positive, family bins are added to the families before saving. The
    output filename defaults to the pedigree basename with a ``.parquet``
    extension.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--verbose', '-V', action='count', default=0)

    FamiliesLoader.cli_arguments(parser)

    parser.add_argument(
        "-o",
        "--output",
        dest="output_filename",
        help="output families parquet filename "
        "(default is [basename(families_filename).parquet])",
    )
    parser.add_argument(
        "--partition-description",
        "--pd",
        help="input partition description filename",
    )
    parser.add_argument(
        "--study-id",
        type=str,
        default=None,
        dest="study_id",
        metavar="<study id>",
        help="Study ID. "
        "If none specified, the basename of families filename is used to "
        "construct study id [default: basename(families filename)]",
    )

    args = parser.parse_args(argv)

    # -V -> WARNING, -VV -> INFO, -VVV or more -> DEBUG, none -> ERROR.
    exact_levels = {1: logging.WARNING, 2: logging.INFO}
    if args.verbose >= 3:
        log_level = logging.DEBUG
    else:
        log_level = exact_levels.get(args.verbose, logging.ERROR)
    logging.basicConfig(level=log_level)

    filename, params = FamiliesLoader.parse_cli_arguments(args)

    # The study id is resolved here but not used further in this function.
    study_id = args.study_id
    if study_id is None:
        study_id, _ = os.path.splitext(os.path.basename(filename))

    families = FamiliesLoader(filename, **params).load()

    if args.partition_description:
        descriptor = ParquetPartitionDescriptor.from_config(
            args.partition_description
        )
        if descriptor.family_bin_size > 0:
            families = descriptor.add_family_bins_to_families(families)

    output_filename = args.output_filename
    if not output_filename:
        stem, _ = os.path.splitext(os.path.basename(filename))
        output_filename = f"{stem}.parquet"

    ParquetManager.families_to_parquet(families, output_filename)
def build(dirname):
    """Import every VCF study under ``fixtures/<dirname>`` into Impala.

    The test database is created when it does not exist. A study whose
    variant and pedigree tables both already exist is skipped unless
    ``reimport`` is set. Each imported study gets an optional de novo
    loader and a VCF loader, both wrapped with the annotation pipeline
    decorator.
    """
    if not impala_helpers.check_database(impala_test_dbname()):
        impala_helpers.create_database(impala_test_dbname())

    vcfdirname = relative_to_this_test_folder(
        os.path.join("fixtures", dirname))

    for config in collect_vcf(vcfdirname):
        logger.debug(f"importing: {config}")

        # Table names are derived from the pedigree file's base name.
        pedigree_basename = os.path.basename(config.pedigree)
        study_id = os.path.splitext(pedigree_basename)[0]
        variant_table, pedigree_table = \
            impala_genotype_storage.study_tables(FrozenBox({"id": study_id}))

        if not reimport:
            if impala_helpers.check_table(
                    impala_test_dbname(), variant_table) and \
                    impala_helpers.check_table(
                        impala_test_dbname(), pedigree_table):
                continue

        # The id used for the actual import comes from study_id_from_path.
        study_id = study_id_from_path(config.pedigree)
        study_temp_dirname = os.path.join(temp_dirname, study_id)

        families_loader = FamiliesLoader(config.pedigree)
        families = families_loader.load()
        genome = gpf_instance_2013.genomes_db.get_genome()

        loaders = []
        if config.denovo:
            denovo_params = {
                "denovo_genotype": "genotype",
                "denovo_family_id": "family",
                "denovo_chrom": "chrom",
                "denovo_pos": "pos",
                "denovo_ref": "ref",
                "denovo_alt": "alt",
            }
            denovo_loader = DenovoLoader(
                families, config.denovo, genome, params=denovo_params)
            loaders.append(
                AnnotationPipelineDecorator(
                    denovo_loader, annotation_pipeline))

        vcf_params = {
            "vcf_include_reference_genotypes": True,
            "vcf_include_unknown_family_genotypes": True,
            "vcf_include_unknown_person_genotypes": True,
            "vcf_multi_loader_fill_in_mode": "reference",
            "vcf_denovo_mode": "denovo",
            "vcf_omission_mode": "omission",
        }
        vcf_loader = VcfLoader(
            families,
            [config.vcf],
            genome,
            regions=None,
            params=vcf_params,
        )
        loaders.append(
            AnnotationPipelineDecorator(vcf_loader, annotation_pipeline))

        impala_genotype_storage.simple_study_import(
            study_id,
            families_loader=families_loader,
            variant_loaders=loaders,
            output=study_temp_dirname,
            include_reference=True)
def main(argv, gpf_instance=None):
    """Rewrite a pedigree file, optionally adjusted by VCF files.

    The pedigree is loaded, optionally extended with family bins from a
    partition description and, when VCF files are given, replaced by the
    families exposed by the ``VcfLoader``. Broken families that have no
    members are removed, and the result is saved as a pedigree file.

    Args:
        argv: command line arguments, parsed with ``argparse``.
        gpf_instance: GPF instance providing the genome; a new
            ``GPFInstance`` is created when ``None``.
    """
    if gpf_instance is None:
        gpf_instance = GPFInstance()

    parser = argparse.ArgumentParser()
    parser.add_argument('--verbose', '-V', action='count', default=0)

    FamiliesLoader.cli_arguments(parser)
    VcfLoader.cli_arguments(parser, options_only=True)

    parser.add_argument(
        "-o",
        "--output",
        dest="output_filename",
        # The tool writes a pedigree (see save_pedigree below), not parquet.
        help="output pedigree filename "
        "(default is [basename(families_filename).ped])",
    )
    parser.add_argument(
        "--partition-description",
        "--pd",
        help="input partition description filename",
    )
    parser.add_argument(
        "--vcf-files",
        type=str,
        nargs="+",
        metavar="<VCF filename>",
        help="VCF file to import",
    )

    args = parser.parse_args(argv)

    # -VV selects INFO, -VVV (or more) DEBUG; everything else is WARNING.
    if args.verbose >= 3:
        log_level = logging.DEBUG
    elif args.verbose == 2:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING
    logging.basicConfig(level=log_level)

    filename, params = FamiliesLoader.parse_cli_arguments(args)
    logger.info(F"PED PARAMS: {params}")

    loader = FamiliesLoader(filename, **params)
    families = loader.load()

    if args.partition_description:
        partition_description = ParquetPartitionDescriptor.from_config(
            args.partition_description)
        families = partition_description.add_family_bins_to_families(families)

    variants_filenames, variants_params = \
        VcfLoader.parse_cli_arguments(args)

    if variants_filenames:
        variants_loader = VcfLoader(
            families,
            variants_filenames,
            params=variants_params,
            genome=gpf_instance.genomes_db.get_genome(),
        )
        families = variants_loader.families

    if families.broken_families:
        # Iterate over a snapshot: families are deleted inside the loop.
        # NOTE(review): whether deleting from `families` also changes
        # `broken_families` is not visible here; the copy is safe either way.
        for family_id, family in list(families.broken_families.items()):
            if not family.has_members():
                del families[family_id]
                logger.warning(
                    f"family {family_id} does not contain sequenced members "
                    f"and is removed from the pedigree: {family}")

    if not args.output_filename:
        output_filename, _ = os.path.splitext(os.path.basename(filename))
        output_filename = f"{output_filename}.ped"
    else:
        output_filename = args.output_filename

    FamiliesLoader.save_pedigree(families, output_filename)
def build_backend(self, study_config, genomes_db):
    """Build in-memory variants for a study from its configured files.

    When the study has no ``genotype_storage.files`` section, the pedigree
    and VCF are read from ``<data dir>/<study id>.ped`` and
    ``<data dir>/<study id>.vcf``. Otherwise the configured pedigree and
    each configured variants file are loaded with their own parameters.
    Every variants loader is wrapped with the stored annotation decorator.

    Args:
        study_config: study configuration; ``id`` and ``genotype_storage``
            are read from it.
        genomes_db: provides the genome through ``get_genome()``.

    Returns:
        ``RawMemoryVariants`` built from the loaders and the families.

    Raises:
        ValueError: if a variants file has a format other than ``vcf``,
            ``denovo``, ``dae`` or ``cnv``. Previously such an entry
            raised ``UnboundLocalError`` or silently re-used the loader of
            the preceding file.
    """
    if not study_config.genotype_storage.files:
        data_dir = self.get_data_dir(study_config.id, "data")
        vcf_filename = os.path.join(data_dir, f"{study_config.id}.vcf")
        ped_filename = os.path.join(data_dir, f"{study_config.id}.ped")

        families_loader = FamiliesLoader(ped_filename)
        families = families_loader.load()
        variants_loader = VcfLoader(
            families, [vcf_filename], genomes_db.get_genome())
        variants_loader = StoredAnnotationDecorator.decorate(
            variants_loader, vcf_filename)

        return RawMemoryVariants([variants_loader], families)

    start = time.time()
    ped_params = \
        study_config.genotype_storage.files.pedigree.params.to_dict()
    ped_filename = study_config.genotype_storage.files.pedigree.path
    logger.debug(f"pedigree params: {ped_filename}; {ped_params}")

    families_loader = FamiliesLoader(ped_filename, **ped_params)
    families = families_loader.load()
    elapsed = time.time() - start
    logger.info(f"families loaded in in {elapsed:.2f} sec")
    logger.debug(f"{families.ped_df.head()}")

    loaders = []
    for file_conf in study_config.genotype_storage.files.variants:
        variants_filename = file_conf.path
        variants_params = file_conf.params.to_dict()
        logger.debug(
            f"variant params: {variants_filename}; {variants_params}")

        # By default the annotation is looked up by the variants filename.
        annotation_filename = variants_filename

        if file_conf.format == "vcf":
            # A VCF entry may list several space-separated filenames; the
            # first one is used to look up the annotation.
            variants_filenames = [
                fn.strip() for fn in variants_filename.split(" ")
            ]
            variants_loader = VcfLoader(
                families,
                variants_filenames,
                genomes_db.get_genome(),
                params=variants_params,
            )
            annotation_filename = variants_filenames[0]
        elif file_conf.format == "denovo":
            variants_loader = DenovoLoader(
                families,
                variants_filename,
                genomes_db.get_genome(),
                params=variants_params,
            )
        elif file_conf.format == "dae":
            variants_loader = DaeTransmittedLoader(
                families,
                variants_filename,
                genomes_db.get_genome(),
                params=variants_params,
            )
        elif file_conf.format == "cnv":
            variants_loader = CNVLoader(
                families,
                variants_filename,
                genomes_db.get_genome(),
                params=variants_params,
            )
        else:
            raise ValueError(
                f"unsupported variants file format {file_conf.format!r} "
                f"for {variants_filename}")

        variants_loader = StoredAnnotationDecorator.decorate(
            variants_loader, annotation_filename)
        loaders.append(variants_loader)

    return RawMemoryVariants(loaders, families)
def sample_family():
    """Return family ``f1`` parsed from the comma-separated ``PED1`` text.

    The family is asserted to contain exactly one trio before it is
    returned.
    """
    pedigree_stream = StringIO(PED1)
    loaded = FamiliesLoader(pedigree_stream, ped_sep=",").load()

    result = loaded["f1"]
    assert len(result.trios) == 1
    return result
def builder(relpath):
    """Load a comma-separated pedigree from this module's fixtures.

    ``relpath`` is resolved against the ``fixtures`` directory located
    next to this file (after resolving symlinks).
    """
    module_dir = os.path.dirname(os.path.realpath(__file__))
    ped_path = os.path.join(module_dir, "fixtures", relpath)
    return FamiliesLoader(ped_path, ped_sep=",").load()