def test_config_parser_env_interpolation_missing(
        conf_schema_basic, fixtures_dir):
    """Load a config that references an unset environment variable."""
    conf_path = os.path.join(fixtures_dir, "env_interpolation_conf.toml")
    GPFConfigParser.load_config(conf_path, conf_schema_basic)
def default_dae_config(request, cleanup):
    """Fixture: a DAE configuration pointed at a temporary studies dir.

    Creates a throwaway studies directory, loads DAE.conf from the
    DAE_DB_DIR environment variable, redirects the studies dir, injects
    a TEST_REMOTE remote entry, and returns the processed config.  When
    *cleanup* is true the temp directory is removed on teardown.
    """
    studies_dirname = tempfile.mkdtemp(prefix="studies_", suffix="_test")

    def fin():
        shutil.rmtree(studies_dirname)

    if cleanup:
        request.addfinalizer(fin)

    # Bug fix: os.environ.get("DAE_DB_DIR", None) fed None into
    # os.path.join, producing a confusing TypeError when the variable
    # was unset; indexing raises a clear KeyError naming the variable.
    dae_conf_path = os.path.join(os.environ["DAE_DB_DIR"], "DAE.conf")
    dae_config = GPFConfigParser.parse_config(dae_conf_path)

    dae_config["studies_db"]["dir"] = studies_dirname

    remote_config = {
        "id": "TEST_REMOTE",
        "host": "gpfremote",
        "base_url": "api/v3",
        "port": 21010,
        "user": "******",
        "password": "******",
    }

    if "remotes" not in dae_config:
        dae_config["remotes"] = list()
        dae_config["remotes"].append(remote_config)
    else:
        # Replace the first remote so the test remote is deterministic.
        dae_config["remotes"][0] = remote_config

    dae_config = GPFConfigParser.process_config(
        dae_config, dae_conf_schema, config_filename=dae_conf_path)
    return dae_config
def test_pheno_regressions_from_conf_path(regressions_conf):
    """Regression definitions loaded from a conf file match expectations."""
    regs = GPFConfigParser.load_config(
        regressions_conf, regression_conf_schema)

    expected_regs = {
        "reg1": {
            "instrument_name": "i1",
            "measure_name": "regressor1",
            "jitter": 0.1,
        },
        "reg2": {
            "instrument_name": "i1",
            "measure_name": "regressor2",
            "jitter": 0.2,
        },
        "reg3": {
            "instrument_name": "",
            "measure_name": "common_regressor",
            "jitter": 0.3,
        },
        "reg4": {
            "instrument_name": "i2",
            "measure_name": "regressor1",
            "jitter": 0.4,
        },
    }

    assert len(regs.regression) == len(expected_regs)
    for name in expected_regs:
        assert regs.regression[name] == expected_regs[name]
def __init__(
        self, dae_config=None, config_file="DAE.conf",
        work_dir=None, defaults=None, load_eagerly=False):
    """Initialize the GPF instance from a DAE configuration.

    When *dae_config* is not supplied, loads it from *config_file*
    inside *work_dir* (falling back to the DAE_DB_DIR environment
    variable).  When *load_eagerly* is true, all lazy sub-systems are
    instantiated up-front instead of on first access.
    """
    if dae_config is None:
        # FIXME Merge defaults with newly-loaded config
        assert not defaults, defaults
        if work_dir is None:
            work_dir = os.environ["DAE_DB_DIR"]
        config_file = os.path.join(work_dir, config_file)
        dae_config = GPFConfigParser.load_config(
            config_file, dae_conf_schema
        )
    self.dae_config = dae_config
    self.dae_db_dir = work_dir
    self.__autism_gene_profile_config = None
    self.load_eagerly = load_eagerly
    if load_eagerly:
        # Touch each lazy property so that everything is loaded now.
        # (The original touched _gene_info_config twice; once suffices.)
        self.genomes_db
        self.gene_sets_db
        self._gene_info_config
        self._pheno_db
        self._variants_db
        self.denovo_gene_sets_db
        self._score_config
        self._scores_factory
        self.genotype_storage_db
        self._common_report_facade
        self._background_facade
def read_and_parse_file_configuration(cls, options, config_file):
    """Load an annotation config file and parse each annotator section.

    Sections without an annotator are skipped; each remaining section is
    merged with the CLI/runtime *options* and parsed individually.
    """
    raw = GPFConfigParser.load_config(
        config_file, annotation_conf_schema
    ).to_dict()
    raw["options"] = options
    raw["columns"] = {}
    raw["native_columns"] = []
    raw["virtual_columns"] = []
    raw["output_columns"] = []

    config = cls._setup_defaults(DefaultBox(raw))

    parsed_sections = [
        cls.parse_section(
            recursive_dict_update({"options": options}, section.to_dict())
        )
        for section in config.sections
        if section.annotator is not None
    ]

    config["sections"] = parsed_sections
    return FrozenBox(config)
def main(argv=sys.argv[1:], gpf_instance=None):
    """Load a dataset into an impala genotype storage and save its config."""
    if gpf_instance is None:
        gpf_instance = GPFInstance()
    argv = parse_cli_arguments(argv, gpf_instance)

    storage_db = gpf_instance.genotype_storage_db
    storage = storage_db.get_genotype_storage(argv.genotype_storage)
    # Bail out unless an impala-backed storage was found.
    if not storage or not storage.is_impala():
        print("missing or non-impala genotype storage")
        return

    assert os.path.exists(argv.variants)

    study_config = storage.impala_load_dataset(
        argv.study_id, argv.variants, argv.pedigree)

    if argv.study_config:
        input_config = GPFConfigParser.load_config_raw(argv.study_config)
        study_config = recursive_dict_update(study_config, input_config)

    study_config = StudyConfigBuilder(study_config).build_config()
    assert study_config is not None
    save_study_config(
        gpf_instance.dae_config, argv.study_id, study_config,
        force=argv.force)
def __init__(self, config, genomes_db):
    """Choose a variant builder and install default virtual columns."""
    super(VariantAnnotatorBase, self).__init__(config, genomes_db)

    # VCF input gets a VCF builder; everything else uses the DAE builder.
    builder_cls = VCFBuilder if self.config.options.vcf else DAEBuilder
    self.variant_builder = builder_cls(self.config, self.genomic_sequence)

    if not self.config.virtual_columns:
        default_virtual_columns = [
            "CSHL_location",
            "CSHL_chr",
            "CSHL_position",
            "CSHL_variant",
            "VCF_chr",
            "VCF_position",
            "VCF_ref",
            "VCF_alt",
        ]
        self.config = GPFConfigParser.modify_tuple(
            self.config, {"virtual_columns": default_virtual_columns})
def test_handle_regressions_default_jitter(
        mocker, fake_phenotype_data, output_dir,
        fake_phenotype_data_desc_conf):
    """Configured jitter values are forwarded to build_regression."""
    def fake_build_regression(*args):
        return {"pvalue_regression_male": 0, "pvalue_regression_female": 0}

    mocked = mocker.patch(
        "dae.pheno_browser.prepare_data."
        "PreparePhenoBrowserBase.build_regression",
        side_effect=fake_build_regression,
    )
    reg = GPFConfigParser.load_config(
        fake_phenotype_data_desc_conf, pheno_conf_schema)
    prep = PreparePhenoBrowserBase(
        "fake", fake_phenotype_data, output_dir, reg)
    regressand = fake_phenotype_data.get_measure("i1.m1")

    # Drain the generator; only the recorded mock calls matter here.
    list(prep.handle_regressions(regressand))

    mocked.assert_called()
    jitters = [call[0][2] for call in mocked.call_args_list[:2]]
    assert jitters == [0.12, 0.13]
def _gene_info_config(self):
    """Load and return the gene info configuration."""
    conf_file = self.dae_config.gene_info_db.conf_file
    logger.debug(f"loading gene info config file: {conf_file}")
    return GPFConfigParser.load_config(conf_file, gene_info_conf)
def _autism_gene_profile_config(self):
    """Return the AGP config, or None when unset or its file is missing."""
    agp_config = self.dae_config.autism_gene_tool_config
    if agp_config is None:
        return None
    if not os.path.exists(agp_config.conf_file):
        return None
    return GPFConfigParser.load_config(
        agp_config.conf_file, autism_gene_tool_config
    )
def test_config_parser_load_single(conf_schema_basic, fixtures_dir):
    """A single TOML config loads with the expected values."""
    conf_path = os.path.join(fixtures_dir, "basic_conf.toml")
    config = GPFConfigParser.load_config(conf_path, conf_schema_basic)
    print(config)

    assert config.id == "152135"
    assert config.name == "Basic test config"
    section = config.section1
    assert section.someval1 == "beep"
    assert section.someval2 == 1.23
    assert section.someval3 == 52345
def test_handle_regressions_regressand_is_regressor(
        fake_phenotype_data, output_dir, fake_phenotype_data_desc_conf):
    """A measure that is itself a regressor yields no regressions."""
    reg = GPFConfigParser.load_config(
        fake_phenotype_data_desc_conf, pheno_conf_schema)
    prep = PreparePhenoBrowserBase(
        "fake", fake_phenotype_data, output_dir, reg)
    regressand = fake_phenotype_data.get_measure("i1.age")

    with pytest.raises(StopIteration):
        next(prep.handle_regressions(regressand))
def test_config_parser_string_interpolation(conf_schema_strings, fixtures_dir):
    """String variables interpolate and the vars section is cleared."""
    conf_path = os.path.join(fixtures_dir, "vars_conf.toml")
    config = GPFConfigParser.load_config(conf_path, conf_schema_strings)
    print(config)

    assert config.id == "152135"
    assert config.name == "Vars test config"
    assert config.vars is None
    section = config.section1
    assert section.someval1 == "asdf"
    assert section.someval2 == "ghjkl"
    assert section.someval3 == "qwertyasdfghjk"
def test_config_parser_set_config(conf_schema_set, fixtures_dir):
    """Schema set-coercion yields a real set with the expected members."""
    conf_path = os.path.join(fixtures_dir, "set_conf.toml")
    config = GPFConfigParser.load_config(conf_path, conf_schema_set)
    print(config)

    assert config.id == "152135"
    assert config.name == "Set test config"
    assert config.section1.someval1 == "ala"
    someval2 = config.section1.someval2
    assert isinstance(someval2, set)
    # An empty symmetric difference means exact membership equality.
    assert (someval2 ^ {"a", "b", "c", "d"}) == set()
    assert config.section1.someval3 == 123
def test_config_parser_load_paths(conf_schema_path, fixtures_dir, mocker):
    """Relative paths resolve against the config dir; absolute stay as-is."""
    # Pretend every path exists so validation passes without fixtures.
    patch = mocker.patch("os.path.exists")
    patch.return_value = True

    config = GPFConfigParser.load_config(
        os.path.join(fixtures_dir, "path_conf.toml"), conf_schema_path)
    print(config)

    assert config.id == "152135"
    assert config.name == "Path test config"
    assert config.some_abs_path == "/tmp/maybesomeconf.toml"
    expected_rel_path = os.path.join(fixtures_dir, "environ_conf.toml")
    assert config.some_rel_path == expected_rel_path
def simple_study_import(
        self, study_id,
        families_loader=None,
        variant_loaders=None,
        study_config=None,
        **kwargs,
):
    """Import a study's families and variant files and build its config.

    Imports the pedigree and variant files into this storage, derives
    has_denovo/has_cnv flags from the loaders' source types, overlays an
    optional *study_config* file, and returns the built study config.
    """
    families_config = self._import_families_file(study_id, families_loader)
    variants_config = self._import_variants_files(study_id, variant_loaders)

    config_dict = {
        "id": study_id,
        "conf_dir": ".",
        "has_denovo": False,
        "has_cnv": False,
        "genotype_storage": {
            "id": self.id,
            "files": {
                "variants": variants_config,
                "pedigree": families_config,
            },
        },
        "genotype_browser": {"enabled": True},
    }

    if not variant_loaders:
        # No variants: nothing for the genotype browser to show.
        config_dict["genotype_browser"]["enabled"] = False
    else:
        # Removed a dead statement that fetched source_type of the first
        # loader and discarded the result.
        source_types = [
            loader.get_attribute("source_type")
            for loader in variant_loaders
        ]
        if "denovo" in source_types:
            config_dict["has_denovo"] = True
        if "cnv" in source_types:
            # CNV variants are treated as denovo as well.
            config_dict["has_denovo"] = True
            config_dict["has_cnv"] = True

    if study_config is not None:
        study_config_dict = GPFConfigParser.load_config_raw(study_config)
        config_dict = recursive_dict_update(config_dict, study_config_dict)

    config_builder = StudyConfigBuilder(config_dict)
    return config_builder.build_config()
def test_handle_regressions_non_continuous_or_ordinal_measure(
        fake_phenotype_data, output_dir, fake_phenotype_data_desc_conf):
    """Categorical and raw measures produce no regressions."""
    reg = GPFConfigParser.load_config(
        fake_phenotype_data_desc_conf, pheno_conf_schema)
    prep = PreparePhenoBrowserBase(
        "fake", fake_phenotype_data, output_dir, reg)

    # i1.m5 is categorical, i1.m6 is raw; neither should regress.
    for measure_id in ("i1.m5", "i1.m6"):
        regressand = fake_phenotype_data.get_measure(measure_id)
        with pytest.raises(StopIteration):
            next(prep.handle_regressions(regressand))
def __init__(self, dae_config):
    """Discover enabled phenotype data configs under the configured dir."""
    super(PhenoDb, self).__init__()
    assert dae_config

    configs = GPFConfigParser.load_directory_configs(
        dae_config.phenotype_data.dir, pheno_conf_schema
    )

    # Keep only sections that exist and are enabled, keyed by name.
    self.config = {}
    for config in configs:
        pheno = config.phenotype_data
        if pheno and pheno.enabled:
            self.config[pheno.name] = pheno

    self.pheno_cache = {}
def test_config_parser_env_interpolation(
        conf_schema_basic, fixtures_dir, mocker
):
    """Values referencing environment variables are interpolated."""
    mocker.patch.dict(os.environ, {"test_env_var": "bop"})

    conf_path = os.path.join(fixtures_dir, "env_interpolation_conf.toml")
    config = GPFConfigParser.load_config(conf_path, conf_schema_basic)
    print(config)

    assert config.id == "152135"
    assert config.name == "Environment interpolation test config"
    section = config.section1
    assert section.someval1 == "bop"
    assert section.someval2 == 1.23
    assert section.someval3 == 52345
def test_config_parser_load_directory(conf_schema_basic, fixtures_dir):
    """All configs in a directory are discovered and parsed."""
    configs = GPFConfigParser.load_directory_configs(
        os.path.join(fixtures_dir, "sample_conf_directory"),
        conf_schema_basic,
    )
    print(configs)
    assert len(configs) == 4

    configs = sorted(configs, key=lambda c: c.id)
    expected = [("1", "conf1"), ("2", "conf2"), ("3", "conf3"), ("4", "conf4")]
    for config, (conf_id, conf_name) in zip(configs, expected):
        assert config.id == conf_id
        assert config.name == conf_name
def __init__(self, dae_dir, conf_file=None):
    """Load genomesDB.conf and index the configured genomes by id."""
    self.dae_dir = dae_dir
    if not conf_file:
        # Default location: genomesDB.conf inside the DAE directory.
        conf_file = f"{dae_dir}/genomesDB.conf"
    self.config = GPFConfigParser.load_config(conf_file, genomes_db_conf)

    self._genomes = {}
    for section_id, genome_config in self.config.genome.items():
        genome = Genome.load_config(genome_config, section_id)
        assert genome is not None
        self._genomes[genome.genome_id] = genome

    default_id = self.config.genomes.default_genome
    assert default_id in self._genomes
    self.default_genome = self._genomes[default_id]
def test_has_regression_measure(
        fake_phenotype_data, output_dir, regressions_conf):
    """Every (measure, instrument) pair from the config is recognized."""
    reg = GPFConfigParser.load_config(regressions_conf, regression_conf_schema)
    prep = PreparePhenoBrowserBase(
        "fake", fake_phenotype_data, output_dir, reg)

    expected_reg_measures = [
        ("regressor1", "i1"),
        ("regressor2", "i1"),
        ("common_regressor", ""),
        ("common_regressor", "i1"),
        ("common_regressor", "i2"),
        ("regressor1", "i2"),
    ]
    for measure_name, instrument_name in expected_reg_measures:
        assert prep._has_regression_measure(measure_name, instrument_name)
def _load_group_configs(self):
    """Load all enabled dataset (group) configs keyed by their id."""
    default_config_filename = None
    default_study = self.dae_config.default_study_config
    if default_study and default_study.conf_file:
        default_config_filename = default_study.conf_file

    group_configs = GPFConfigParser.load_directory_configs(
        self.dae_config.datasets_db.dir,
        study_config_schema,
        default_config_filename=default_config_filename,
    )

    result = {}
    for group_config in group_configs:
        assert group_config.id is not None, group_config
        # Skip configs explicitly disabled; enabled-by-default otherwise.
        if group_config.enabled is False:
            continue
        result[group_config.id] = group_config
    return result
def _load_study_configs(self):
    """Load all enabled study configs keyed by their id."""
    default_config_filename = None
    default_study = self.dae_config.default_study_config
    if default_study and default_study.conf_file:
        default_config_filename = default_study.conf_file

    study_configs = GPFConfigParser.load_directory_configs(
        self.dae_config.studies_db.dir,
        study_config_schema,
        default_config_filename=default_config_filename,
    )

    result = {}
    for study_config in study_configs:
        assert study_config.id is not None, study_config
        # Skip configs explicitly disabled; enabled-by-default otherwise.
        if study_config.enabled is False:
            continue
        result[study_config.id] = study_config
    return result
def __init__(self, score_filename, config_filename=None):
    """Open a score file and prepare column/schema indexes for access.

    Loads the score file's companion config (``<score_filename>.conf``
    unless *config_filename* is given), validates header and score
    columns, and caches the column indexes used by lookups.

    NOTE(review): relies on ``self.chr_name`` / ``self.pos_begin_name`` /
    ``self.pos_end_name`` being defined elsewhere on the class —
    presumably class-level constants; confirm in the full class body.
    """
    self.score_filename = score_filename
    assert os.path.exists(self.score_filename), self.score_filename
    if config_filename is None:
        # Default config lives next to the score file with a .conf suffix.
        config_filename = "{}.conf".format(self.score_filename)
    self.config = GPFConfigParser.load_config(config_filename, score_file_conf_schema)
    assert self.config.general.header is not None
    assert self.config.columns.score is not None
    self.header = self.config.general.header
    logger.debug(f"score file {os.path.basename(self.score_filename)} "
                 f"header {self.header}")
    self.score_names = self.config.columns.score
    # Order the configured schema to match the physical column order.
    self.schema = Schema.from_dict(self.config.score_schema).order_as(
        self.header)
    logger.debug(f"score file {os.path.basename(self.score_filename)} "
                 f"schema {self.schema.col_names}")
    # Every configured score column must exist in the schema.
    assert all([sn in self.schema for sn in self.score_names]), [
        self.score_filename,
        self.score_names,
        self.schema.col_names,
    ]
    # Cache the positional indexes of the coordinate columns.
    self.chr_index = self.schema.col_names.index(self.chr_name)
    self.pos_begin_index = self.schema.col_names.index(self.pos_begin_name)
    self.pos_end_index = self.schema.col_names.index(self.pos_end_name)
    self.chr_prefix = getattr(self.config.general, "chr_prefix", False)
    # "na"/"none" sentinels (case-insensitive) are normalized to None.
    self.no_score_value = self.config.general.no_score_value or "na"
    if self.no_score_value.lower() in ("na", "none"):
        self.no_score_value = None
    self._init_access()
def _build_annotator_for(self, score_name):
    """Construct a position-score annotator for a single score column."""
    scores_directory = self.config.options.scores_directory
    assert os.path.exists(scores_directory), scores_directory

    score_filename = self._get_score_file(score_name)
    options = GPFConfigParser.modify_tuple(
        self.config.options, {"scores_file": score_filename}
    )
    columns = {score_name: getattr(self.config.columns, score_name)}

    section = {
        "options": options,
        "columns": columns,
        "annotator": "score_annotator.VariantScoreAnnotator",
        "virtual_columns": [],
    }
    variant_config = AnnotationConfigParser.parse_section(section)
    return PositionScoreAnnotator(variant_config, self.genomes_db)
def test_handle_regressions(
        mocker, fake_phenotype_data, output_dir,
        fake_phenotype_data_desc_conf):
    """handle_regressions yields one result per configured regression."""
    def fake_build_regression(dependent_measure, independent_measure, jitter):
        return {
            "regressand": dependent_measure,
            "regressor": independent_measure,
            "jitter": jitter,
            "pvalue_regression_male": 0,
            "pvalue_regression_female": 0,
        }

    mocked = mocker.patch(
        "dae.pheno_browser.prepare_data."
        "PreparePhenoBrowserBase.build_regression",
        side_effect=fake_build_regression,
    )
    reg = GPFConfigParser.load_config(
        fake_phenotype_data_desc_conf, pheno_conf_schema)
    prep = PreparePhenoBrowserBase(
        "fake", fake_phenotype_data, output_dir, reg)
    regressand = fake_phenotype_data.get_measure("i1.m1")

    results = [
        result
        for result in prep.handle_regressions(regressand)
        if result is not None
    ]
    assert len(results) == 2
    regression_ids = sorted(result["regression_id"] for result in results)
    assert regression_ids == sorted(["age", "nviq"])

    mocked.assert_called()
    measure, reg_measure, jitter = mocked.call_args_list[0][0]
    assert measure.measure_id == "i1.m1"
    assert reg_measure.measure_id == "i1.age"
    assert jitter == 0.12
    measure, reg_measure, jitter = mocked.call_args_list[1][0]
    assert measure.measure_id == "i1.m1"
    assert reg_measure.measure_id == "i1.iq"
    assert jitter == 0.13
def simple_study_import(self, study_id,
                        families_loader=None,
                        variant_loaders=None,
                        study_config=None,
                        output=".",
                        include_reference=False):
    """Import a study into impala via parquet files and build its config.

    Writes each variant loader's data to parquet under *output*/variants
    (denovo loaders get bucket indexes < 100, transmitted >= 100), writes
    the pedigree to *output*/pedigree/pedigree.parquet, loads both into
    the storage, overlays an optional *study_config* file, and returns
    the built study config.
    """
    variants_dir = None
    has_denovo = False
    has_cnv = False
    bucket_index = 0
    if variant_loaders:
        for index, variant_loader in enumerate(variant_loaders):
            assert isinstance(variant_loader, VariantsLoader), \
                type(variant_loader)
            if variant_loader.get_attribute("source_type") == "denovo":
                has_denovo = True
            if variant_loader.get_attribute("source_type") == "cnv":
                # CNV variants are treated as denovo as well.
                has_denovo = True
                has_cnv = True
            if variant_loader.transmission_type == \
                    TransmissionType.denovo:
                assert index < 100
                bucket_index = index  # denovo buckets < 100
            elif variant_loader.transmission_type == \
                    TransmissionType.transmitted:
                bucket_index = index + 100  # transmitted buckets >=100
            variants_dir = os.path.join(output, "variants")
            partition_description = NoPartitionDescriptor(variants_dir)
            ParquetManager.variants_to_parquet(
                variant_loader,
                partition_description,
                bucket_index=bucket_index,
                include_reference=include_reference)
    pedigree_filename = os.path.join(output, "pedigree",
                                     "pedigree.parquet")
    families = families_loader.load()
    ParquetManager.families_to_parquet(families, pedigree_filename)
    # variants_dir stays None when there were no variant loaders.
    config_dict = self.impala_load_dataset(study_id,
                                           variants_dir=variants_dir,
                                           pedigree_file=pedigree_filename)
    config_dict["has_denovo"] = has_denovo
    config_dict["has_cnv"] = has_cnv
    if study_config is not None:
        # An explicit study config file overrides the generated values.
        study_config_dict = GPFConfigParser.load_config_raw(study_config)
        config_dict = recursive_dict_update(config_dict, study_config_dict)
    config_builder = StudyConfigBuilder(config_dict)
    return config_builder.build_config()
def get_person_set_collections_config(config_path):
    """Load a config file and return its person_set_collections section."""
    schema = {"person_set_collections": person_set_collections_schema}
    config = GPFConfigParser.load_config(config_path, schema)
    return config.person_set_collections
def main(argv):
    """CLI entry point: import phenotype data and build its browser cache.

    Returns 0 on success or keyboard interrupt, 2 on any other error.
    """
    try:
        # Setup argument parser
        gpf_instance = GPFInstance()
        dae_conf = gpf_instance.dae_config

        parser = pheno_cli_parser()
        args = parser.parse_args(argv)
        # Bug fix: these messages previously passed sys.stderr as a
        # positional print() argument, sending the text to stdout with
        # the stream object appended; file=sys.stderr is the intent.
        if args.instruments is None:
            print("missing instruments directory parameter", file=sys.stderr)
            raise ValueError()
        if args.pedigree is None:
            print("missing pedigree filename", file=sys.stderr)
            raise ValueError()
        if args.pheno_name is None:
            print("missing pheno db name", file=sys.stderr)
            raise ValueError()

        args.pheno_name = verify_phenotype_data_name(args.pheno_name)

        pheno_db_dir = os.path.join(
            dae_conf.phenotype_data.dir, args.pheno_name)
        if not os.path.exists(pheno_db_dir):
            os.makedirs(pheno_db_dir)

        args.pheno_db_filename = os.path.join(
            pheno_db_dir, "{}.db".format(args.pheno_name))
        if os.path.exists(args.pheno_db_filename):
            if not args.force:
                print("pheno db filename already exists:",
                      args.pheno_db_filename)
                raise ValueError()
            else:
                os.remove(args.pheno_db_filename)

        args.browser_dir = os.path.join(pheno_db_dir, "browser")
        if not os.path.exists(args.browser_dir):
            os.makedirs(args.browser_dir)

        config = parse_phenotype_data_config(args)
        if args.regression:
            regressions = GPFConfigParser.load_config(
                args.regression, regression_conf_schema)
        else:
            regressions = None

        prep = PrepareVariables(config)
        prep.build_pedigree(args.pedigree)
        prep.build_variables(args.instruments, args.data_dictionary)

        build_pheno_browser(
            args.pheno_db_filename,
            args.pheno_name,
            args.browser_dir,
            regressions,
        )

        pheno_conf_path = os.path.join(
            pheno_db_dir, "{}.conf".format(args.pheno_name))
        with open(pheno_conf_path, "w") as pheno_conf_file:
            pheno_conf_file.write(
                toml.dumps(generate_phenotype_data_config(args, regressions)))
        return 0
    except KeyboardInterrupt:
        return 0
    except Exception as e:
        # Top-level CLI boundary: report and exit non-zero.
        traceback.print_exc()
        program_name = "simple_pheno_import.py"
        indent = len(program_name) * " "
        sys.stderr.write(program_name + ": " + repr(e) + "\n")
        sys.stderr.write(indent + " for help use --help")
        return 2