def test_parse_reports(self):
    reports = {"r1": {"SequenceLengthDistribution": {}}}
    symbol_table = SymbolTable()
    symbol_table, specs = ReportParser.parse_reports(reports, symbol_table)

    self.assertTrue(symbol_table.contains("r1"))
    self.assertTrue(isinstance(symbol_table.get("r1"), SequenceLengthDistribution))
def _prepare_optional_params(self, analysis: dict, symbol_table: SymbolTable, yaml_location: str) -> dict:
    params = {}
    dataset = symbol_table.get(analysis["dataset"])

    if "encoding" in analysis:
        params["encoder"] = symbol_table.get(analysis["encoding"]) \
            .build_object(dataset, **symbol_table.get_config(analysis["encoding"])["encoder_params"])

    if "labels" in analysis:
        params["label_config"] = LabelHelper.create_label_config(analysis["labels"], dataset,
                                                                 ExploratoryAnalysisParser.__name__, yaml_location)
    else:
        params["label_config"] = LabelConfiguration()

    if "preprocessing_sequence" in analysis:
        params["preprocessing_sequence"] = symbol_table.get(analysis["preprocessing_sequence"])

    return params
def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: Path = None) -> SimulationInstruction:
    ParameterValidator.assert_keys(instruction.keys(), ["dataset", "simulation", "type", "export_formats"],
                                   "SimulationParser", key)

    signals = [signal.item for signal in symbol_table.get_by_type(SymbolType.SIGNAL)]
    simulation = symbol_table.get(instruction["simulation"])
    dataset = symbol_table.get(instruction["dataset"])
    exporters = self.parse_exporters(instruction)

    process = SimulationInstruction(signals=signals, simulation=simulation, dataset=dataset,
                                    name=key, exporters=exporters)
    return process
def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: Path) -> MLApplicationInstruction:
    location = MLApplicationParser.__name__
    ParameterValidator.assert_keys(instruction.keys(),
                                   ['type', 'dataset', 'number_of_processes', 'config_path', 'store_encoded_data'],
                                   location, key)
    ParameterValidator.assert_in_valid_list(instruction['dataset'], symbol_table.get_keys_by_type(SymbolType.DATASET),
                                            location, f"{key}: dataset")
    ParameterValidator.assert_type_and_value(instruction['number_of_processes'], int, location,
                                             f"{key}: number_of_processes", min_inclusive=1)
    ParameterValidator.assert_type_and_value(instruction['config_path'], str, location, f'{key}: config_path')
    ParameterValidator.assert_type_and_value(instruction['store_encoded_data'], bool, location,
                                             f'{key}: store_encoded_data')

    hp_setting, label = self._parse_hp_setting(instruction, path, key)

    instruction = MLApplicationInstruction(dataset=symbol_table.get(instruction['dataset']), name=key,
                                           number_of_processes=instruction['number_of_processes'],
                                           label_configuration=LabelConfiguration([label]),
                                           hp_setting=hp_setting,
                                           store_encoded_data=instruction['store_encoded_data'])
    return instruction
def _parse_dataset(key: str, dataset_specs: dict, symbol_table: SymbolTable, result_path: Path) -> SymbolTable:
    location = "ImportParser"

    ParameterValidator.assert_keys(list(dataset_specs.keys()), ImportParser.valid_keys, location,
                                   f"datasets:{key}", False)

    valid_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(DataImport, "Import", "IO/dataset_import/")
    ParameterValidator.assert_in_valid_list(dataset_specs["format"], valid_formats, location, "format")

    import_cls = ReflectionHandler.get_class_by_name("{}Import".format(dataset_specs["format"]))
    params = ImportParser._prepare_params(dataset_specs, result_path, key)

    if "is_repertoire" in params:
        ParameterValidator.assert_type_and_value(params["is_repertoire"], bool, location, "is_repertoire")

        if params["is_repertoire"]:
            # repertoire datasets require a metadata file, except when imported through iReceptor
            if import_cls != IReceptorImport:
                assert "metadata_file" in params, f"{location}: Missing parameter: metadata_file under {key}/params/"
                ParameterValidator.assert_type_and_value(params["metadata_file"], Path, location, "metadata_file")
        else:
            # sequence/receptor datasets must declare whether the data is paired
            assert "paired" in params, f"{location}: Missing parameter: paired under {key}/params/"
            ParameterValidator.assert_type_and_value(params["paired"], bool, location, "paired")

            if params["paired"]:
                assert "receptor_chains" in params, f"{location}: Missing parameter: receptor_chains under {key}/params/"
                ParameterValidator.assert_in_valid_list(params["receptor_chains"],
                                                        ["_".join(cp.value) for cp in ChainPair],
                                                        location, "receptor_chains")

    try:
        dataset = import_cls.import_dataset(params, key)
        dataset.name = key
        symbol_table.add(key, SymbolType.DATASET, dataset)
    except KeyError as key_error:
        raise KeyError(f"{key_error}\n\nAn error occurred during parsing of dataset {key}. "
                       f"The keyword {key_error.args[0]} was missing. This either means this argument was "
                       f"not defined under definitions/datasets/{key}/params, or this column was missing from "
                       f"an input data file.")
    except Exception as ex:
        raise Exception(f"{ex}\n\nAn error occurred while parsing the dataset {key}. "
                        f"See the log above for more details.")

    return symbol_table
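# Illustration (not part of ImportParser): a minimal dataset specification that would satisfy the
# checks in _parse_dataset above. The required keys follow directly from the assertions in the
# function; the concrete path is a hypothetical placeholder.
example_dataset_specs = {
    "format": "VDJdb",                 # must name a non-abstract DataImport subclass ("{format}Import")
    "params": {
        "is_repertoire": False,        # if True, a metadata_file Path would be required instead
        "paired": True,                # required whenever is_repertoire is False
        "receptor_chains": "TRA_TRB",  # required when paired is True; must match a ChainPair value
        "path": "receptor_data/",      # hypothetical location of the input files
    },
}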
def _parse_report(key: str, params: dict, symbol_table: SymbolTable):
    valid_values = ReflectionHandler.all_nonabstract_subclass_basic_names(Report, "", "reports/")
    report_object, params = ObjectParser.parse_object(params, valid_values, "", "reports/", "ReportParser", key,
                                                      builder=True, return_params_dict=True)
    symbol_table.add(key, SymbolType.REPORT, report_object)
    return symbol_table, params
def parse(encodings: dict, symbol_table: SymbolTable):
    for key in encodings.keys():
        encoder, params = EncodingParser.parse_encoder(key, encodings[key])
        symbol_table.add(key, SymbolType.ENCODING, encoder, {"encoder_params": params})
    return symbol_table, encodings
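# Sketch (hedged): an encodings specification as consumed by EncodingParser.parse above. Each key
# becomes an ENCODING entry in the symbol table, with the resolved parameters stored under
# "encoder_params" and later retrieved via symbol_table.get_config(key)["encoder_params"].
# The encoder name and its parameter below are assumptions for illustration only.
example_encodings = {
    "e1": {"KmerFrequency": {"k": 3}},  # hypothetical encoder spec: type name -> parameters
}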
def parse(specs: dict, symbol_table: SymbolTable) -> dict:
    if "output" in specs:
        ParameterValidator.assert_keys(specs["output"], ["format"], "OutputParser", "output")
        ParameterValidator.assert_in_valid_list(specs["output"]["format"], ["HTML"], "OutputParser", "format")
    else:
        specs["output"] = {"format": "HTML"}

    symbol_table.add("output", SymbolType.OUTPUT, specs["output"])
    return specs["output"]
def parse(specification: dict, symbol_table: SymbolTable):
    for ml_method_id in specification.keys():
        ml_method, config = MLParser._parse_ml_method(ml_method_id, specification[ml_method_id])
        specification[ml_method_id] = config
        symbol_table.add(ml_method_id, SymbolType.ML_METHOD, ml_method, config)
    return symbol_table, specification
def _parse_settings(self, instruction: dict, symbol_table: SymbolTable) -> list:
    try:
        settings = []
        for index, setting in enumerate(instruction["settings"]):
            if "preprocessing" in setting and setting["preprocessing"] is not None:
                ParameterValidator.assert_type_and_value(setting["preprocessing"], str, TrainMLModelParser.__name__,
                                                         f'settings: {index + 1}. element: preprocessing')
                if symbol_table.contains(setting["preprocessing"]):
                    preprocessing_sequence = symbol_table.get(setting["preprocessing"])
                    preproc_name = setting["preprocessing"]
                    # preprocessing that changes the number of examples cannot be used with nested cross-validation
                    if not all(preproc.keeps_example_count() for preproc in preprocessing_sequence):
                        raise ValueError(f"{TrainMLModelParser.__name__}: preprocessing sequence {preproc_name} "
                                         f"includes preprocessing steps that change the number of examples at runtime "
                                         f"and as such cannot be used with this instruction. See the documentation "
                                         f"for the preprocessing, or alternatively use it with other instructions.")
                else:
                    raise KeyError(f"{TrainMLModelParser.__name__}: preprocessing was set in the TrainMLModel "
                                   f"instruction to value {setting['preprocessing']}, but no such preprocessing was "
                                   f"defined in the specification under definitions: {PreprocessingParser.keyword}.")
            else:
                setting["preprocessing"] = None
                preprocessing_sequence = []
                preproc_name = None

            ParameterValidator.assert_keys(setting.keys(), ["preprocessing", "ml_method", "encoding"],
                                           TrainMLModelParser.__name__, f"settings, {index + 1}. entry")

            encoder = symbol_table.get(setting["encoding"]) \
                .build_object(symbol_table.get(instruction["dataset"]),
                              **symbol_table.get_config(setting["encoding"])["encoder_params"]) \
                .set_context({"dataset": symbol_table.get(instruction["dataset"])})

            ml_method = symbol_table.get(setting["ml_method"])
            ml_method.check_encoder_compatibility(encoder)

            s = HPSetting(encoder=encoder, encoder_name=setting["encoding"],
                          encoder_params=symbol_table.get_config(setting["encoding"])["encoder_params"],
                          ml_method=ml_method, ml_method_name=setting["ml_method"],
                          ml_params=symbol_table.get_config(setting["ml_method"]),
                          preproc_sequence=preprocessing_sequence, preproc_sequence_name=preproc_name)
            settings.append(s)
        return settings
    except KeyError as key_error:
        raise KeyError(f"{TrainMLModelParser.__name__}: parameter {key_error.args[0]} was not defined under settings "
                       f"in the TrainMLModel instruction.")
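# Illustration (hypothetical ids): the structure of instruction["settings"] expected by
# _parse_settings above — each entry pairs an encoding with an ML method and, optionally, a
# preprocessing sequence; all referenced ids must already be defined under definitions.
example_settings = [
    {"encoding": "e1", "ml_method": "LR1"},                            # preprocessing defaults to None
    {"encoding": "e1", "ml_method": "SVM1", "preprocessing": "seq1"},  # sequence must keep the example count
]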
def parse_motifs(motifs: dict, symbol_table: SymbolTable):
    valid_motif_keys = ["seed", "instantiation", "seed_chain1", "seed_chain2", "name_chain1", "name_chain2"]
    for key in motifs.keys():
        ParameterValidator.assert_keys(motifs[key].keys(), valid_motif_keys, "MotifParser", key, exclusive=False)
        motif = MotifParser._parse_motif(key, motifs[key])
        symbol_table.add(key, SymbolType.MOTIF, motif)
    return symbol_table, motifs
def parse_instruction(key: str, instruction: dict, symbol_table: SymbolTable, path) -> tuple:
    ParameterValidator.assert_keys_present(list(instruction.keys()), ["type"], InstructionParser.__name__, key)
    valid_instructions = [cls[:-6] for cls in
                          ReflectionHandler.discover_classes_by_partial_name("Parser", "dsl/instruction_parsers/")]
    ParameterValidator.assert_in_valid_list(instruction["type"], valid_instructions, "InstructionParser", "type")

    default_params = DefaultParamsLoader.load("instructions/", instruction["type"])
    instruction = {**default_params, **instruction}
    parser = ReflectionHandler.get_class_by_name("{}Parser".format(instruction["type"]), "instruction_parsers/")()
    instruction_object = parser.parse(key, instruction, symbol_table, path)

    symbol_table.add(key, SymbolType.INSTRUCTION, instruction_object)
    return instruction, symbol_table
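# Illustration of the default merging above: user-supplied keys override the loaded defaults,
# while missing keys fall back to them. The keys and values here are hypothetical.
defaults = {"number_of_processes": 1, "store_encoded_data": False}
user_spec = {"type": "MLApplication", "store_encoded_data": True}
merged = {**defaults, **user_spec}
assert merged["number_of_processes"] == 1 and merged["store_encoded_data"] is True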
def test_parse_receptor_dataset(self):
    # VDJdb export: a header line followed by four rows forming two paired TRA/TRB receptors
    file_content = """complex.id Gene CDR3 V J Species MHC A MHC B MHC class Epitope Epitope gene Epitope species Reference Method Meta CDR3fix Score
3050 TRB CASSPPRVYSNGAGLAGVGWRNEQFF TRBV5-4*01 TRBJ2-1*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"} 0
15760 TRB CASSWTWDAATLWGQGALGGANVLTF TRBV5-5*01 TRBJ2-6*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"} 0
3050 TRA CAAIYESRGSTLGRLYF TRAV13-1*01 TRAJ18*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"} 0
15760 TRA CALRLNNQGGKLIF TRAV9-2*01 TRAJ23*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"} 0
"""
    path = EnvironmentSettings.root_path / "test/tmp/dslimportparservdj/"
    data_path = EnvironmentSettings.root_path / "test/tmp/dslimportparservdj/receptor_data/"
    PathBuilder.build(data_path)

    with open(data_path / "receptors.tsv", "w") as file:
        file.writelines(file_content)

    st, desc = ImportParser.parse({"datasets": {"d1": {"format": "VDJdb",
                                                       "params": {"is_repertoire": False,
                                                                  "paired": True,
                                                                  "receptor_chains": "TRA_TRB",
                                                                  "path": data_path}}}},
                                  SymbolTable(), path)

    dataset = st.get("d1")
    self.assertTrue(isinstance(dataset, ReceptorDataset))
    self.assertEqual(2, dataset.get_example_count())

    shutil.rmtree(path)
def test_parse(self):
    workflow_specs = {
        "seq1": [{"filter_chain_B": {"ChainRepertoireFilter": {"keep_chain": "A"}}}],
        "seq2": [{"filter_chain_A": {"ChainRepertoireFilter": {"keep_chain": "B"}}}]
    }
    symbol_table = SymbolTable()
    table, specs = PreprocessingParser.parse(workflow_specs, symbol_table)

    self.assertTrue(table.contains("seq1"))
    self.assertTrue(table.contains("seq2"))
    self.assertTrue(isinstance(table.get("seq1"), list) and len(table.get("seq1")) == 1)
    self.assertEqual(list(workflow_specs.keys()), list(specs.keys()))
def test_parse(self):
    specs = {"type": "DatasetExport", "export_formats": ["Pickle", "AIRR"], "datasets": ["d1"]}
    symbol_table = SymbolTable()
    symbol_table.add("d1", SymbolType.DATASET, RepertoireDataset())

    instruction = DatasetExportParser().parse("instr1", specs, symbol_table)

    self.assertTrue(isinstance(instruction, DatasetExportInstruction))
    self.assertEqual(2, len(instruction.exporters))
    self.assertEqual(1, len(instruction.datasets))
def test_parse(self):
    path = PathBuilder.build(f'{EnvironmentSettings.tmp_test_path}subsampling_parser/')
    dataset = RandomDatasetGenerator.generate_receptor_dataset(30, {3: 1}, {2: 1}, {}, path)
    symbol_table = SymbolTable()
    symbol_table.add("d1", SymbolType.DATASET, dataset)

    # valid specification: requested sizes do not exceed the dataset size (30 receptors)
    SubsamplingParser().parse('inst1', {'dataset': 'd1', 'type': 'Subsampling',
                                        'subsampled_dataset_sizes': [10, 20],
                                        'dataset_export_formats': ['Pickle']}, symbol_table)

    # requested size 50 exceeds the dataset size
    with self.assertRaises(AssertionError):
        SubsamplingParser().parse('inst1', {'dataset': 'd1', 'type': 'Subsampling',
                                            'subsampled_dataset_sizes': [10, 50],
                                            'dataset_export_formats': ['Pickle']}, symbol_table)

    # dataset 'd2' is not defined in the symbol table
    with self.assertRaises(AssertionError):
        SubsamplingParser().parse('inst1', {'dataset': 'd2', 'type': 'Subsampling',
                                            'subsampled_dataset_sizes': [10, 20],
                                            'dataset_export_formats': ['Pickle']}, symbol_table)

    # 'Random' is not a valid export format
    with self.assertRaises(AssertionError):
        SubsamplingParser().parse('inst1', {'dataset': 'd2', 'type': 'Subsampling',
                                            'subsampled_dataset_sizes': [10, 20],
                                            'dataset_export_formats': ['Random']}, symbol_table)

    shutil.rmtree(path)
def _prepare_reports(self, reports: list, symbol_table: SymbolTable) -> dict:
    if reports is not None:
        ParameterValidator.assert_type_and_value(reports, list, TrainMLModelParser.__name__, "reports")
        report_objects = {report_id: symbol_table.get(report_id) for report_id in reports}
        ParameterValidator.assert_all_type_and_value(report_objects.values(), TrainMLModelReport,
                                                     TrainMLModelParser.__name__, 'reports')
        return report_objects
    else:
        return {}
def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: Path = None) -> SubsamplingInstruction:
    valid_keys = ["type", "dataset", "subsampled_dataset_sizes", "dataset_export_formats"]
    ParameterValidator.assert_keys(instruction.keys(), valid_keys, SubsamplingParser.__name__, key)

    dataset_keys = symbol_table.get_keys_by_type(SymbolType.DATASET)
    ParameterValidator.assert_in_valid_list(instruction['dataset'], dataset_keys, SubsamplingParser.__name__,
                                            f'{key}/dataset')
    dataset = symbol_table.get(instruction['dataset'])

    ParameterValidator.assert_type_and_value(instruction['subsampled_dataset_sizes'], list,
                                             SubsamplingParser.__name__, f'{key}/subsampled_dataset_sizes')
    ParameterValidator.assert_all_type_and_value(instruction['subsampled_dataset_sizes'], int,
                                                 SubsamplingParser.__name__, f'{key}/subsampled_dataset_sizes',
                                                 1, dataset.get_example_count())

    valid_export_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(DataExporter, 'Exporter',
                                                                                  "dataset_export/")
    ParameterValidator.assert_type_and_value(instruction['dataset_export_formats'], list,
                                             SubsamplingParser.__name__, f"{key}/dataset_export_formats")
    ParameterValidator.assert_all_in_valid_list(instruction['dataset_export_formats'], valid_export_formats,
                                                SubsamplingParser.__name__, f"{key}/dataset_export_formats")

    return SubsamplingInstruction(dataset=dataset,
                                  subsampled_dataset_sizes=instruction['subsampled_dataset_sizes'],
                                  dataset_export_formats=[ReflectionHandler.get_class_by_name(
                                      export_format + "Exporter", "dataset_export/")
                                      for export_format in instruction['dataset_export_formats']],
                                  name=key)
def test_parse_ml_methods(self):
    params = {
        "LR1": {"LogisticRegression": {"max_iter": 1000, "penalty": "l1"}},
        "LR2": "LogisticRegression",
        "SVM1": {"SVM": {"max_iter": [1000, 2000], "penalty": ["l1", "l2"]},
                 "model_selection_cv": True, "model_selection_n_folds": 5},
        "SVM2": {"SVM": {}, "model_selection_cv": False, "model_selection_n_folds": -1}
    }
    symbol_table = SymbolTable()
    symbol_table, desc = MLParser.parse(params, symbol_table)

    self.assertTrue(symbol_table.get("SVM1")._parameter_grid is not None
                    and len(symbol_table.get("SVM1")._parameter_grid["max_iter"]) == 2)
    self.assertTrue(symbol_table.get("LR1")._parameters is not None
                    and symbol_table.get("LR1")._parameters["penalty"] == "l1")
    self.assertTrue(isinstance(symbol_table.get("LR2"), LogisticRegression))
    self.assertTrue("SVM" in desc["SVM1"].keys())
def _parse_sequence(key: str, preproc_sequence: list, symbol_table: SymbolTable) -> SymbolTable:
    sequence = []
    valid_preprocessing_classes = ReflectionHandler.all_nonabstract_subclass_basic_names(Preprocessor, "",
                                                                                         "preprocessing/")

    for item in preproc_sequence:
        for step_key, step in item.items():
            obj, params = ObjectParser.parse_object(step, valid_preprocessing_classes, "", "preprocessing/",
                                                    "PreprocessingParser", step_key, True, True)
            # write the fully resolved parameters back into the specification; assigning to the
            # loop variable (`step = params`) would only rebind it locally and have no effect
            item[step_key] = params
            sequence.append(obj)

    symbol_table.add(key, SymbolType.PREPROCESSING, sequence)
    return symbol_table
def _parse_settings(self, instruction: dict, symbol_table: SymbolTable) -> list:
    try:
        settings = []
        for index, setting in enumerate(instruction["settings"]):
            if "preprocessing" in setting:
                ParameterValidator.assert_type_and_value(setting["preprocessing"], str, TrainMLModelParser.__name__,
                                                         f'settings: {index + 1}. element: preprocessing')
                if symbol_table.contains(setting["preprocessing"]):
                    preprocessing_sequence = symbol_table.get(setting["preprocessing"])
                    preproc_name = setting["preprocessing"]
                else:
                    raise KeyError(f"{TrainMLModelParser.__name__}: preprocessing was set in the TrainMLModel "
                                   f"instruction to value {setting['preprocessing']}, but no such preprocessing was "
                                   f"defined in the specification under definitions: {PreprocessingParser.keyword}.")
            else:
                setting["preprocessing"] = None
                preprocessing_sequence = []
                preproc_name = None

            ParameterValidator.assert_keys(setting.keys(), ["preprocessing", "ml_method", "encoding"],
                                           TrainMLModelParser.__name__, f"settings, {index + 1}. entry")

            encoder = symbol_table.get(setting["encoding"]) \
                .build_object(symbol_table.get(instruction["dataset"]),
                              **symbol_table.get_config(setting["encoding"])["encoder_params"]) \
                .set_context({"dataset": symbol_table.get(instruction["dataset"])})

            s = HPSetting(encoder=encoder, encoder_name=setting["encoding"],
                          encoder_params=symbol_table.get_config(setting["encoding"])["encoder_params"],
                          ml_method=symbol_table.get(setting["ml_method"]), ml_method_name=setting["ml_method"],
                          ml_params=symbol_table.get_config(setting["ml_method"]),
                          preproc_sequence=preprocessing_sequence, preproc_sequence_name=preproc_name)
            settings.append(s)
        return settings
    except KeyError as key_error:
        raise KeyError(f"{TrainMLModelParser.__name__}: parameter {key_error.args[0]} was not defined under settings "
                       f"in the TrainMLModel instruction.")
def _prepare_params(self, analysis: dict, symbol_table: SymbolTable) -> dict:
    valid_keys = ["dataset", "report", "preprocessing_sequence", "labels", "encoding", "number_of_processes"]
    ParameterValidator.assert_keys(list(analysis.keys()), valid_keys, "ExploratoryAnalysisParser", "analysis", False)

    params = {"dataset": symbol_table.get(analysis["dataset"]),
              "report": copy.deepcopy(symbol_table.get(analysis["report"]))}
    optional_params = self._prepare_optional_params(analysis, symbol_table)
    params = {**params, **optional_params}
    return params
def _prepare_optional_params(self, analysis: dict, symbol_table: SymbolTable) -> dict:
    params = {}
    dataset = symbol_table.get(analysis["dataset"])

    if "encoding" in analysis:
        params["encoder"] = symbol_table.get(analysis["encoding"]) \
            .build_object(dataset, **symbol_table.get_config(analysis["encoding"])["encoder_params"])

    params["label_config"] = LabelConfiguration()
    if "labels" in analysis:
        for label in analysis["labels"]:
            label_values = self._get_label_values(label, dataset)
            params["label_config"].add_label(label, label_values)

    if "preprocessing_sequence" in analysis:
        params["preprocessing_sequence"] = symbol_table.get(analysis["preprocessing_sequence"])

    if "number_of_processes" in analysis:
        params["number_of_processes"] = analysis["number_of_processes"]

    return params
def _parse_simulation(key: str, simulation: dict, symbol_table: SymbolTable) -> SymbolTable:
    location = "SimulationParser"
    valid_implanting_keys = ["dataset_implanting_rate", "repertoire_implanting_rate", "signals", "is_noise"]
    implantings = []

    for impl_key, implanting in simulation.items():
        ParameterValidator.assert_keys(implanting.keys(), valid_implanting_keys, location, impl_key, exclusive=False)
        ParameterValidator.assert_keys(implanting["signals"], symbol_table.get_keys_by_type(SymbolType.SIGNAL),
                                       location, impl_key, False)

        implanting_params = copy.deepcopy(implanting)
        implanting_params["signals"] = [symbol_table.get(signal) for signal in implanting["signals"]]
        implanting_params["name"] = impl_key
        implantings.append(Implanting(**implanting_params))

    assert sum(settings["dataset_implanting_rate"] for settings in simulation.values()) <= 1, \
        "The total dataset implanting rate cannot exceed 1."

    symbol_table.add(key, SymbolType.SIMULATION, Simulation(implantings))
    return symbol_table
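# Illustration (hypothetical names and rates): a simulation specification accepted by
# _parse_simulation above; the dataset_implanting_rate values across implantings must sum to at most 1.
example_simulation = {
    "var1": {"signals": ["signal1"], "dataset_implanting_rate": 0.5, "repertoire_implanting_rate": 0.1},
    "var2": {"signals": ["signal2"], "dataset_implanting_rate": 0.4, "repertoire_implanting_rate": 0.2},
}
assert sum(v["dataset_implanting_rate"] for v in example_simulation.values()) <= 1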
def parse_signals(signals: dict, symbol_table: SymbolTable):
    for key, signal_spec in signals.items():
        ParameterValidator.assert_keys_present(signal_spec.keys(), SignalParser.VALID_KEYS, "SignalParser", key)

        implanting_strategy = SignalParser._get_implanting_strategy(key, signal_spec)

        ParameterValidator.assert_keys(signal_spec["motifs"], symbol_table.get_keys_by_type(SymbolType.MOTIF),
                                       "SignalParser", f"motifs in signal {key}", False)
        signal_motifs = [symbol_table.get(motif_id) for motif_id in signal_spec["motifs"]]

        signal = Signal(key, signal_motifs, implanting_strategy)
        symbol_table.add(key, SymbolType.SIGNAL, signal)

    return symbol_table, signals
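# Sketch (hedged): the part of a signal specification that parse_signals above visibly relies on —
# a "motifs" list referencing motif ids already registered in the symbol table. Any further keys
# demanded by SignalParser.VALID_KEYS (e.g. the implanting configuration consumed by
# _get_implanting_strategy) are not shown in this excerpt, so they are omitted here.
example_signals = {
    "signal1": {"motifs": ["motif1"]},  # "signal1" and "motif1" are hypothetical ids
}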
def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: Path = None) -> DatasetExportInstruction:
    location = "DatasetExportParser"
    ParameterValidator.assert_keys(list(instruction.keys()),
                                   DatasetExportParser.REQUIRED_KEYS + DatasetExportParser.OPTIONAL_KEYS,
                                   location, key, False)
    ParameterValidator.assert_keys_present(list(instruction.keys()), DatasetExportParser.REQUIRED_KEYS, location, key)

    valid_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(DataExporter, "Exporter", 'dataset_export/')
    ParameterValidator.assert_all_in_valid_list(instruction["export_formats"], valid_formats, location,
                                                "export_formats")
    ParameterValidator.assert_all_in_valid_list(instruction["datasets"],
                                                symbol_table.get_keys_by_type(SymbolType.DATASET), location, "datasets")

    return DatasetExportInstruction(datasets=[symbol_table.get(dataset_key)
                                              for dataset_key in instruction["datasets"]],
                                    exporters=[ReflectionHandler.get_class_by_name(f"{export_format}Exporter",
                                                                                   "dataset_export/")
                                               for export_format in instruction["export_formats"]],
                                    preprocessing_sequence=symbol_table.get(instruction["preprocessing_sequence"])
                                    if "preprocessing_sequence" in instruction else None,
                                    name=key)
def test_parse_simulation(self):
    simulation = {
        "sim1": {
            "var1": {
                "signals": ["signal1"],
                "dataset_implanting_rate": 0.5,
                "repertoire_implanting_rate": 0.1
            }
        }
    }

    symbol_table = SymbolTable()
    symbol_table.add("motif1", SymbolType.MOTIF,
                     Motif("motif1", GappedKmerInstantiation(position_weights={0: 1}), seed="CAS"))
    symbol_table.add("signal1", SymbolType.SIGNAL,
                     Signal("signal1", [symbol_table.get("motif1")],
                            HealthySequenceImplanting(GappedMotifImplanting(),
                                                      implanting_computation=ImplantingComputation.ROUND)))

    symbol_table, specs = SimulationParser.parse_simulations(simulation, symbol_table)

    self.assertTrue(symbol_table.contains("sim1"))
    sim1 = symbol_table.get("sim1")
    self.assertEqual(1, len(sim1.implantings))
def parse(workflow_specification: dict, file_path, result_path):
    symbol_table = SymbolTable()
    def_parser_output, specs_defs = DefinitionParser.parse(workflow_specification, symbol_table, result_path)
    symbol_table, specs_instructions = InstructionParser.parse(def_parser_output, result_path)
    app_output = OutputParser.parse(workflow_specification, symbol_table)

    path = ImmuneMLParser._output_specs(file_path=file_path, result_path=result_path, definitions=specs_defs,
                                        instructions=specs_instructions, output=app_output)
    return symbol_table, path
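# Minimal usage sketch (hedged): the three stages above run in order — definitions populate the
# symbol table, instructions consume it, and the output section is parsed last. The file names
# below are hypothetical placeholders.
#
#   with Path("specs.yaml").open("r") as file:
#       workflow_specification = yaml.safe_load(file)
#   symbol_table, specs_path = ImmuneMLParser.parse(workflow_specification, Path("specs.yaml"), Path("results/"))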
def _extract_reports(self):
    with self.specification_path.open("r") as file:
        workflow_specification = yaml.safe_load(file)

    report_keys = list(workflow_specification['instructions'].values())[0]['benchmark_reports']
    ParameterValidator.assert_all_in_valid_list(report_keys,
                                                list(workflow_specification['definitions']['reports'].keys()),
                                                MultiDatasetBenchmarkTool.__name__, "benchmark_reports")

    reports = {key: value for key, value in workflow_specification['definitions']['reports'].items()
               if key in report_keys}
    symbol_table, _ = ReportParser.parse_reports(reports, SymbolTable())
    self.reports = [entry.item for entry in symbol_table.get_by_type(SymbolType.REPORT)]
def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: Path = None) -> TrainMLModelInstruction:
    valid_keys = ["assessment", "selection", "dataset", "strategy", "labels", "metrics", "settings",
                  "number_of_processes", "type", "reports", "optimization_metric", "refit_optimal_model",
                  "store_encoded_data"]

    ParameterValidator.assert_type_and_value(instruction['settings'], list, TrainMLModelParser.__name__, 'settings')
    ParameterValidator.assert_keys(list(instruction.keys()), valid_keys, TrainMLModelParser.__name__, "TrainMLModel")
    ParameterValidator.assert_type_and_value(instruction['refit_optimal_model'], bool, TrainMLModelParser.__name__,
                                             'refit_optimal_model')
    ParameterValidator.assert_type_and_value(instruction['metrics'], list, TrainMLModelParser.__name__, 'metrics')
    ParameterValidator.assert_type_and_value(instruction['optimization_metric'], str, TrainMLModelParser.__name__,
                                             'optimization_metric')
    ParameterValidator.assert_type_and_value(instruction['number_of_processes'], int, TrainMLModelParser.__name__,
                                             'number_of_processes')
    ParameterValidator.assert_type_and_value(instruction['strategy'], str, TrainMLModelParser.__name__, 'strategy')
    ParameterValidator.assert_type_and_value(instruction['store_encoded_data'], bool, TrainMLModelParser.__name__,
                                             'store_encoded_data')
    if instruction["reports"] is not None:
        ParameterValidator.assert_type_and_value(instruction['reports'], list, TrainMLModelParser.__name__, 'reports')

    settings = self._parse_settings(instruction, symbol_table)
    dataset = symbol_table.get(instruction["dataset"])
    assessment = self._parse_split_config(key, instruction, "assessment", symbol_table, len(settings))
    selection = self._parse_split_config(key, instruction, "selection", symbol_table, len(settings))
    assessment, selection = self._update_split_configs(assessment, selection, dataset)
    label_config = self._create_label_config(instruction, dataset, key)
    strategy = ReflectionHandler.get_class_by_name(instruction["strategy"], "hyperparameter_optimization/")
    metrics = {Metric[metric.upper()] for metric in instruction["metrics"]}
    optimization_metric = Metric[instruction["optimization_metric"].upper()]
    metric_search_criterion = Metric.get_search_criterion(optimization_metric)
    path = self._prepare_path(instruction)
    context = self._prepare_context(instruction, symbol_table)
    reports = self._prepare_reports(instruction["reports"], symbol_table)

    hp_instruction = TrainMLModelInstruction(dataset=dataset,
                                             hp_strategy=strategy(settings, metric_search_criterion),
                                             hp_settings=settings, assessment=assessment, selection=selection,
                                             metrics=metrics, optimization_metric=optimization_metric,
                                             refit_optimal_model=instruction['refit_optimal_model'],
                                             label_configuration=label_config, path=path, context=context,
                                             store_encoded_data=instruction['store_encoded_data'],
                                             number_of_processes=instruction["number_of_processes"],
                                             reports=reports, name=key)
    return hp_instruction
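# Illustration (hypothetical values): a TrainMLModel instruction specification consistent with the
# validations above. The referenced ids ("d1", "e1", "LR1") would have to be defined under
# definitions; the assessment/selection sub-structure is an assumption, since _parse_split_config
# is not shown in this excerpt.
example_train_ml_model = {
    "type": "TrainMLModel",
    "dataset": "d1",
    "labels": ["disease"],
    "settings": [{"encoding": "e1", "ml_method": "LR1"}],
    "assessment": {"split_strategy": "random", "split_count": 1, "training_percentage": 0.7},  # assumed keys
    "selection": {"split_strategy": "random", "split_count": 1, "training_percentage": 0.7},   # assumed keys
    "strategy": "GridSearch",
    "metrics": ["accuracy"],
    "optimization_metric": "balanced_accuracy",
    "number_of_processes": 4,
    "reports": None,
    "refit_optimal_model": False,
    "store_encoded_data": False,
}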