def _prepare_specs(self):
    """Load the YAML specification, validate its top-level structure, and normalize paths.

    Ensures exactly one instruction of type TrainMLModel is present, records its
    name in self.instruction_name, and rewrites result paths for this tool run.
    """
    tool = GalaxyTrainMLModel.__name__
    with open(self.yaml_path, "r") as file:
        specs = yaml.safe_load(file)

    ParameterValidator.assert_keys_present(specs.keys(), ["definitions", "instructions"], tool, "YAML specification")
    ParameterValidator.assert_all_in_valid_list(specs.keys(), ["definitions", "instructions", "output"], tool,
                                                "YAML specification")
    ParameterValidator.assert_type_and_value(specs["instructions"], dict, tool, "instructions")

    instruction_names = list(specs["instructions"].keys())
    assert len(instruction_names) == 1, f"{tool}: one instruction has to be specified under " \
                                        f"`instructions`, got the following instead: {instruction_names}."
    self.instruction_name = instruction_names[0]

    instruction_body = specs['instructions'][self.instruction_name]
    ParameterValidator.assert_type_and_value(instruction_body, dict, tool, self.instruction_name)
    ParameterValidator.assert_keys_present(instruction_body.keys(), ['type'], tool, self.instruction_name)

    expected_type = TrainMLModelInstruction.__name__[:-11]  # class name with the "Instruction" suffix stripped
    assert instruction_body['type'] == expected_type, \
        f"{tool}: instruction `type` under {self.instruction_name} has to be {expected_type} " \
        f"for this tool."

    Util.check_paths(specs, tool)
    Util.update_result_paths(specs, self.result_path, self.yaml_path)
def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: str = None) -> DatasetExportInstruction:
    """Validate a DatasetExport instruction and build it from symbol-table entries.

    Checks that the requested export formats and dataset keys are known, then
    resolves datasets and exporter classes into a DatasetExportInstruction.
    """
    location = "DatasetExportParser"
    ParameterValidator.assert_keys(list(instruction.keys()), DatasetExportParser.VALID_KEYS, location, key)

    valid_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(DataExporter, "Exporter", 'dataset_export/')
    ParameterValidator.assert_all_in_valid_list(instruction["export_formats"], valid_formats, location, "export_formats")
    ParameterValidator.assert_all_in_valid_list(instruction["datasets"], symbol_table.get_keys_by_type(SymbolType.DATASET),
                                                location, "datasets")

    datasets = [symbol_table.get(dataset_key) for dataset_key in instruction["datasets"]]
    # use a distinct loop name (fmt) to avoid shadowing the instruction key
    exporters = [ReflectionHandler.get_class_by_name(f"{fmt}Exporter", "dataset_export/")
                 for fmt in instruction["export_formats"]]

    return DatasetExportInstruction(datasets=datasets, exporters=exporters, name=key)
def _parse_ml_method(ml_method_id: str, ml_specification) -> tuple:
    """Parse one ML method specification into an instantiated method plus its resolved spec.

    Accepts either a bare class name (string shorthand for default parameters) or a dict
    containing the class name and optional model-selection settings; default settings are
    merged in before validation.

    Returns:
        tuple: (method instance with .name set to ml_method_id, fully-resolved spec dict)
    """
    valid_class_values = ReflectionHandler.all_nonabstract_subclass_basic_names(MLMethod, "", "ml_methods/")

    # a bare string is shorthand for an ML method with all-default parameters
    if type(ml_specification) is str:
        ml_specification = {ml_specification: {}}

    default_keys = ["model_selection_cv", "model_selection_n_folds"]
    ml_specification = {**DefaultParamsLoader.load("ml_methods/", "MLMethod"), **ml_specification}

    ml_specification_keys = list(ml_specification.keys())
    ParameterValidator.assert_all_in_valid_list(list(ml_specification_keys), default_keys + valid_class_values,
                                                "MLParser", ml_method_id)

    non_default_keys = [key for key in ml_specification.keys() if key not in default_keys]

    # exactly one ML method class must remain once the default settings are excluded;
    # asserting on the filtered list directly (instead of the fragile `len(keys) == 3`)
    # stays correct if the set of default keys ever changes, and the message now matches
    # the condition actually enforced
    assert len(non_default_keys) == 1, f"MLParser: ML method {ml_method_id} was not correctly specified. Expected exactly 1 key " \
                                       f"(ML method name), got {len(non_default_keys)} instead: " \
                                       f"{str([key for key in non_default_keys])[1:-1]}."

    ml_method_class_name = non_default_keys[0]
    ml_method_class = ReflectionHandler.get_class_by_name(ml_method_class_name, "ml_methods/")

    # merge per-class defaults under the user's explicit settings
    ml_specification[ml_method_class_name] = {**DefaultParamsLoader.load("ml_methods/", ml_method_class_name, log_if_missing=False),
                                              **ml_specification[ml_method_class_name]}

    method, params = MLParser.create_method_instance(ml_specification, ml_method_class, ml_method_id)
    ml_specification[ml_method_class_name] = params
    method.name = ml_method_id

    return method, ml_specification
def _prepare_report_config(self, instruction_key, instruction, split_key, symbol_table):
    """Resolve the optional reports section under a split config into symbol-table objects.

    Returns a dict mapping report type -> {report_id: report object}, or an empty
    dict when no reports are configured for this split.
    """
    if "reports" not in instruction[split_key]:
        return {}

    reports = instruction[split_key]["reports"]
    location = f"{instruction_key}/{split_key}/reports"

    # the valid report types are exactly the ReportConfig constructor parameters
    report_types = list(signature(ReportConfig).parameters.keys())
    ParameterValidator.assert_all_in_valid_list(reports.keys(), report_types, location, "reports")

    # validate every entry first, then build — so no lookups happen on a bad spec
    for report_type in reports:
        ParameterValidator.assert_type_and_value(reports[report_type], list, f"{location}/{report_type}", report_type)

    return {report_type: {report_id: symbol_table.get(report_id) for report_id in reports[report_type]}
            for report_type in reports}
def build_object(cls, **kwargs):
    """Validate keyword arguments and construct a Coefficients report.

    Expects `coefs_to_plot` (names of CoefficientPlottingSetting members,
    case-insensitive); `cutoff` and `n_largest` lists are required only when
    the corresponding settings are selected.
    """
    location = "Coefficients"
    coefs_to_plot = [setting.upper() for setting in kwargs["coefs_to_plot"]]
    name = kwargs.get("name")

    valid_settings = [item.name.upper() for item in CoefficientPlottingSetting]
    ParameterValidator.assert_all_in_valid_list(coefs_to_plot, valid_settings, location, "coefs_to_plot")

    cutoff = []
    if CoefficientPlottingSetting.CUTOFF.name in coefs_to_plot:
        cutoff = kwargs["cutoff"]
        ParameterValidator.assert_type_and_value(cutoff, list, location, "cutoff")
        ParameterValidator.assert_all_type_and_value(cutoff, Number, location, "cutoff", min_inclusive=1e-15)

    n_largest = []
    if CoefficientPlottingSetting.N_LARGEST.name in coefs_to_plot:
        n_largest = kwargs["n_largest"]
        ParameterValidator.assert_type_and_value(n_largest, list, location, "n_largest")
        ParameterValidator.assert_all_type_and_value(n_largest, int, location, "n_largest", min_inclusive=1)

    coefs = CoefficientPlottingSettingList()
    for keyword in coefs_to_plot:
        # entries are already uppercased above, so they index the enum directly
        coefs.append(CoefficientPlottingSetting[keyword])

    return Coefficients(coefs, cutoff, n_largest, name)
def import_dataset(params, name: str) -> SequenceDataset:
    """
    Returns a randomly generated sequence dataset according to the parameters;

    YAML specification:

        result_path: path/where/to/store/results/
        sequence_count: 100 # number of random sequences to generate
        length_probabilities:
            14: 0.8 # 80% of all generated sequences will have length 14
            15: 0.2 # 20% of all generated sequences will have length 15
        labels:
            epitope1: # label name
                True: 0.5 # 50% of the sequences will have class True
                False: 0.5 # 50% of the sequences will have class False
            epitope2: # next label with classes that will be assigned to sequences independently of the previous label or other parameters
                1: 0.3 # 30% of the generated sequences will have class 1
                0: 0.7 # 70% of the generated sequences will have class 0
    """
    # note: docstring previously described a "receptor dataset" and a
    # `chain_1_length_probabilities` key; the accepted key is `length_probabilities`
    # and the return type is SequenceDataset, as validated below
    valid_keys = ["sequence_count", "length_probabilities", "labels", "result_path"]
    ParameterValidator.assert_all_in_valid_list(list(params.keys()), valid_keys, "RandomSequenceDatasetImport", "params")

    return RandomDatasetGenerator.generate_sequence_dataset(sequence_count=params["sequence_count"],
                                                            length_probabilities=params["length_probabilities"],
                                                            labels=params["labels"],
                                                            path=params["result_path"])
def import_dataset(params: dict, dataset_name: str) -> RepertoireDataset:
    """Validate the parameters and generate a random repertoire dataset from them."""
    valid_keys = ["result_path", "repertoire_count", "sequence_count_probabilities", "sequence_length_probabilities", "labels"]
    ParameterValidator.assert_all_in_valid_list(list(params.keys()), valid_keys, "RandomRepertoireDatasetImport", "params")

    generator_args = dict(repertoire_count=params["repertoire_count"],
                          sequence_count_probabilities=params["sequence_count_probabilities"],
                          sequence_length_probabilities=params["sequence_length_probabilities"],
                          labels=params["labels"],
                          path=params["result_path"])
    return RandomDatasetGenerator.generate_repertoire_dataset(**generator_args)
def update_specs(self):
    """Validate the top-level YAML structure and rewrite result paths for this tool run."""
    tool = DatasetGenerationTool.__name__
    with open(self.yaml_path, 'r') as file:
        specs = yaml.safe_load(file)

    ParameterValidator.assert_keys_present(specs.keys(), ["definitions", "instructions"], tool, "YAML specification")
    ParameterValidator.assert_all_in_valid_list(specs.keys(), ["definitions", "instructions", "output"], tool,
                                                "YAML specification")

    # dataset- and instruction-level checks are delegated to dedicated helpers
    self._check_dataset(specs)
    self._check_instruction(specs)

    Util.check_paths(specs, tool)
    Util.update_result_paths(specs, self.result_path, self.yaml_path)
def parse_encoder(key: str, specs: dict):
    """Resolve the encoder class named in `specs` and validate its parameters.

    Returns a tuple of (encoder class, params dict) for the given definition key.
    """
    class_path = "encodings"
    valid_encoders = ReflectionHandler.all_nonabstract_subclass_basic_names(DatasetEncoder, "Encoder", class_path)

    encoder = ObjectParser.get_class(specs, valid_encoders, "Encoder", class_path, "EncodingParser", key)
    params = ObjectParser.get_all_params(specs, class_path, encoder.__name__[:-7], key)

    # every supplied parameter must be accepted by the encoder's constructor
    init_params = [p for p in inspect.signature(encoder.__init__).parameters if p != "self"]
    ParameterValidator.assert_all_in_valid_list(params.keys(), init_params, "EncoderParser",
                                                f"{key}/{encoder.__name__.replace('Encoder', '')}")

    return encoder, params
def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: str = None) -> SubsamplingInstruction:
    """Validate a subsampling instruction and build the corresponding instruction object."""
    valid_keys = ["type", "dataset", "subsampled_dataset_sizes", "dataset_export_formats"]
    ParameterValidator.assert_keys(instruction.keys(), valid_keys, SubsamplingParser.__name__, key)

    dataset_keys = symbol_table.get_keys_by_type(SymbolType.DATASET)
    ParameterValidator.assert_in_valid_list(instruction['dataset'], dataset_keys, SubsamplingParser.__name__, f'{key}/dataset')
    dataset = symbol_table.get(instruction['dataset'])

    sizes = instruction['subsampled_dataset_sizes']
    ParameterValidator.assert_type_and_value(sizes, list, SubsamplingParser.__name__, f'{key}/subsampled_dataset_sizes')
    # every subsample size must be a positive int no larger than the dataset itself
    ParameterValidator.assert_all_type_and_value(sizes, int, SubsamplingParser.__name__, f'{key}/subsampled_dataset_sizes',
                                                 1, dataset.get_example_count())

    export_formats = instruction['dataset_export_formats']
    valid_export_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(DataExporter, 'Exporter', "dataset_export/")
    ParameterValidator.assert_type_and_value(export_formats, list, SubsamplingParser.__name__, f"{key}/dataset_export_formats")
    ParameterValidator.assert_all_in_valid_list(export_formats, valid_export_formats, SubsamplingParser.__name__,
                                                f"{key}/dataset_export_formats")

    exporters = [ReflectionHandler.get_class_by_name(export_format + "Exporter", "dataset_export/")
                 for export_format in export_formats]
    return SubsamplingInstruction(dataset=dataset, subsampled_dataset_sizes=sizes,
                                  dataset_export_formats=exporters, name=key)
def _extract_reports(self):
    """Read the workflow spec and instantiate the benchmark reports listed in it.

    Stores the resulting report objects on self.reports.
    """
    with open(self.specification_path, "r") as file:
        workflow_specification = yaml.safe_load(file)

    # the report keys come from the first instruction's `benchmark_reports` entry
    first_instruction = list(workflow_specification['instructions'].values())[0]
    report_keys = first_instruction['benchmark_reports']

    defined_reports = workflow_specification['definitions']['reports']
    ParameterValidator.assert_all_in_valid_list(report_keys, list(defined_reports.keys()),
                                                MultiDatasetBenchmarkTool.__name__, "benchmark_reports")

    selected_reports = {key: value for key, value in defined_reports.items() if key in report_keys}
    symbol_table, _ = ReportParser.parse_reports(selected_reports, SymbolTable())
    self.reports = [entry.item for entry in symbol_table.get_by_type(SymbolType.REPORT)]