def _prepare_specs(self):
    """Read the YAML specification, check that it defines exactly one TrainMLModel
    instruction, and rewrite its result paths for the Galaxy environment."""
    with open(self.yaml_path, "r") as file:
        specs = yaml.safe_load(file)

    location = GalaxyTrainMLModel.__name__

    # top-level structure: both required sections present, no unknown sections
    ParameterValidator.assert_keys_present(specs.keys(), ["definitions", "instructions"], location, "YAML specification")
    ParameterValidator.assert_all_in_valid_list(specs.keys(), ["definitions", "instructions", "output"], location, "YAML specification")
    ParameterValidator.assert_type_and_value(specs["instructions"], dict, location, "instructions")

    instruction_names = list(specs["instructions"].keys())
    assert len(instruction_names) == 1, \
        f"{location}: one instruction has to be specified under " \
        f"`instructions`, got the following instead: {instruction_names}."

    self.instruction_name = instruction_names[0]
    instruction = specs['instructions'][self.instruction_name]

    ParameterValidator.assert_type_and_value(instruction, dict, location, self.instruction_name)
    ParameterValidator.assert_keys_present(instruction.keys(), ['type'], location, self.instruction_name)

    # strip the "Instruction" suffix to get the type name used in YAML
    expected_type = TrainMLModelInstruction.__name__[:-11]
    assert instruction['type'] == expected_type, \
        f"{location}: instruction `type` under {self.instruction_name} has to be {expected_type} " \
        f"for this tool."

    Util.check_paths(specs, location)
    Util.update_result_paths(specs, self.result_path, self.yaml_path)
def import_repertoire_dataset(import_class, params: DatasetImportParams, dataset_name: str) -> RepertoireDataset:
    """
    Create a RepertoireDataset from a metadata file and the repertoire files it lists,
    and export the resulting dataset as a pickle file.

    Arguments:
        import_class: class to use for importing individual repertoire files
        params: instance of DatasetImportParams class which includes information on path, columns, result path etc.
        dataset_name: user-defined name of the dataset

    Returns:
        RepertoireDataset object that was created
    """
    # pass sep by keyword: positional use was deprecated and removed in pandas 2.0
    metadata = pd.read_csv(params.metadata_file, sep=",")
    ParameterValidator.assert_keys_present(metadata.columns.tolist(), ["filename"], ImportHelper.__name__,
                                           f'{dataset_name}: params: metadata_file')

    PathBuilder.build(params.result_path + "repertoires/")

    # import repertoires in parallel, one worker task per metadata row
    arguments = [(import_class, row, params) for index, row in metadata.iterrows()]
    with Pool(params.number_of_processes) as pool:
        repertoires = pool.starmap(ImportHelper.load_repertoire_as_object, arguments)

    new_metadata_file = ImportHelper.make_new_metadata_file(repertoires, metadata, params.result_path, dataset_name)

    # every metadata column except 'filename' is a potential label; collect its unique values
    potential_labels = list(set(metadata.columns.tolist()) - {"filename"})
    dataset = RepertoireDataset(params={key: list(set(metadata[key].values.tolist())) for key in potential_labels},
                                repertoires=repertoires, metadata_file=new_metadata_file, name=dataset_name)
    PickleExporter.export(dataset, params.result_path)
    return dataset
def _check_dataset(self, specs):
    """Ensure the specification defines exactly one dataset and exactly one instruction."""
    tool = DatasetGenerationTool.__name__
    ParameterValidator.assert_keys_present(specs["definitions"].keys(), ['datasets'], tool, 'definitions')

    dataset_names = list(specs['definitions']['datasets'].keys())
    assert len(dataset_names) == 1, \
        f"{tool}: only one dataset can be defined with this Galaxy tool, got these " \
        f"instead: {dataset_names}."

    instruction_names = list(specs['instructions'].keys())
    assert len(instruction_names) == 1, \
        f"{tool}: only one instruction of type DatasetExport can be defined with this Galaxy tool, got these " \
        f"instructions instead: {instruction_names}."
def _check_instruction(self, specs):
    """Verify the single DatasetExport instruction lists exactly one dataset and one export format."""
    tool = DatasetGenerationTool.__name__
    instruction_name = Util.check_instruction_type(specs, tool, DatasetExportInstruction.__name__[:-11])
    instruction = specs['instructions'][instruction_name]

    for key in ['datasets', 'export_formats']:
        ParameterValidator.assert_keys_present(list(instruction.keys()), [key], tool, instruction_name)
        ParameterValidator.assert_type_and_value(instruction[key], list, tool, f"{instruction_name}/{key}")
        assert len(instruction[key]) == 1, \
            f"{tool}: this tool accepts only one item under {key}, got {instruction[key]} " \
            f"instead."
def update_specs(self):
    """Load the YAML specification, validate its structure, and rewrite result paths for Galaxy."""
    with open(self.yaml_path, 'r') as file:
        specs = yaml.safe_load(file)

    tool = DatasetGenerationTool.__name__
    ParameterValidator.assert_keys_present(specs.keys(), ["definitions", "instructions"], tool, "YAML specification")
    ParameterValidator.assert_all_in_valid_list(specs.keys(), ["definitions", "instructions", "output"], tool, "YAML specification")

    self._check_dataset(specs)
    self._check_instruction(specs)

    Util.check_paths(specs, tool)
    Util.update_result_paths(specs, self.result_path, self.yaml_path)
def check_instruction_type(specs: dict, tool_name, expected_instruction) -> str:
    """
    Check that exactly one instruction of the expected type is specified and return its name.

    Arguments:
        specs: full parsed YAML specification
        tool_name: name of the calling tool, used in error messages
        expected_instruction: instruction type the single instruction must have

    Returns:
        the name of the single instruction
    """
    ParameterValidator.assert_keys_present(list(specs.keys()), ['definitions', 'instructions'], tool_name, "YAML specification")

    assert len(list(specs['instructions'].keys())) == 1, f"{tool_name}: multiple instructions were given " \
                                                         f"({str(list(specs['instructions'].keys()))[1:-1]}), but only one instruction of type " \
                                                         f"{expected_instruction} should be specified."

    instruction_name = list(specs['instructions'].keys())[0]

    # validate the 'type' key explicitly (as other tools in this file do) so a missing key
    # produces a clear validation error instead of a raw KeyError
    ParameterValidator.assert_keys_present(list(specs['instructions'][instruction_name].keys()), ['type'], tool_name, instruction_name)

    instruction_type = specs['instructions'][instruction_name]['type']
    assert instruction_type == expected_instruction, \
        f"{tool_name}: instruction type has to be '{expected_instruction}', got {instruction_type} instead."

    return instruction_name
def parse_signals(signals: dict, symbol_table: SymbolTable):
    """Validate each signal specification, build a Signal object for it, and register it in the symbol table."""
    for signal_id, spec in signals.items():
        ParameterValidator.assert_keys_present(spec.keys(), SignalParser.VALID_KEYS, "SignalParser", signal_id)

        strategy = SignalParser._get_implanting_strategy(signal_id, spec)

        # referenced motifs must already exist in the symbol table
        ParameterValidator.assert_keys(spec["motifs"], symbol_table.get_keys_by_type(SymbolType.MOTIF),
                                       "SignalParser", f"motifs in signal {signal_id}", False)
        motifs = [symbol_table.get(motif_id) for motif_id in spec["motifs"]]

        symbol_table.add(signal_id, SymbolType.SIGNAL, Signal(signal_id, motifs, strategy))

    return symbol_table, signals
def check_export_format(specs: dict, tool_name: str, instruction_name: str):
    """Check that exactly one export format is listed under the instruction and return it."""
    instruction = specs['instructions'][instruction_name]

    ParameterValidator.assert_keys_present(list(instruction.keys()), ["export_formats"], tool_name,
                                           f"{instruction_name}/export_formats")
    ParameterValidator.assert_type_and_value(instruction["export_formats"], list, tool_name,
                                             f"{instruction_name}/export_formats")

    export_formats = instruction["export_formats"]
    assert len(export_formats) == 1, \
        f"{tool_name}: only one format can be specified under export_formats parameter under " \
        f"{instruction_name}/export_formats, got {export_formats} instead."

    return export_formats[0]
def _check_dataset_specs(self, workflow_specification, location):
    """
    Validate that 'definitions/datasets' is a dict naming more than one dataset,
    since this tool benchmarks across multiple datasets.

    Arguments:
        workflow_specification: full parsed YAML specification
        location: tool name used in error messages
    """
    ParameterValidator.assert_type_and_value(workflow_specification['definitions'], dict, location, 'definitions')
    ParameterValidator.assert_keys_present(workflow_specification['definitions'].keys(), ['datasets'], location, 'definitions')
    ParameterValidator.assert_type_and_value(workflow_specification['definitions']['datasets'], dict, location, 'datasets')

    dataset_names = list(workflow_specification['definitions']['datasets'].keys())
    # guard against an empty datasets dict: indexing dataset_names[0] inside the message
    # would raise IndexError instead of the intended assertion error
    first_name = dataset_names[0] if dataset_names else None
    # use the location parameter instead of a hard-coded tool name so messages stay consistent
    assert len(dataset_names) > 1, \
        f"{location}: there is only one dataset specified ({first_name}), while this tool operates on multiple datasets. " \
        f"If only one dataset is needed, consider using the training instruction directly."
def parse_instruction(key: str, instruction: dict, symbol_table: SymbolTable, path) -> tuple:
    """Parse one instruction entry, dispatch to the matching parser class, and register the parsed object."""
    ParameterValidator.assert_keys_present(list(instruction.keys()), ["type"], InstructionParser.__name__, key)

    # discover available "<Type>Parser" classes and strip the "Parser" suffix to get valid types
    discovered = ReflectionHandler.discover_classes_by_partial_name("Parser", "dsl/instruction_parsers/")
    valid_instructions = [class_name[:-6] for class_name in discovered]
    ParameterValidator.assert_in_valid_list(instruction["type"], valid_instructions, "InstructionParser", "type")

    parser_class = ReflectionHandler.get_class_by_name("{}Parser".format(instruction["type"]), "instruction_parsers/")
    instruction_object = parser_class().parse(key, instruction, symbol_table, path)

    symbol_table.add(key, SymbolType.INSTRUCTION, instruction_object)
    return instruction, symbol_table
def build_object(cls, **kwargs):
    """Validate the report parameters and the reference file, then construct the report object."""
    report_name = kwargs['name'] if 'name' in kwargs else ''
    ParameterValidator.assert_keys(kwargs.keys(), ['reference_path', 'comparison_attributes', 'name', 'label'],
                                   ReferenceSequenceOverlap.__name__, f"reports: {report_name}")

    assert os.path.isfile(kwargs['reference_path']), \
        f"{ReferenceSequenceOverlap.__name__}: 'reference_path' for report {kwargs['name']} is not " \
        f"a valid file path."

    # every comparison attribute must be a column of the reference file
    columns = pd.read_csv(kwargs['reference_path']).columns.tolist()
    ParameterValidator.assert_keys_present(expected_values=kwargs['comparison_attributes'], values=columns,
                                           location=ReferenceSequenceOverlap.__name__,
                                           parameter_name='columns in file under reference_path')

    return ReferenceSequenceOverlap(**kwargs)
def _get_implanting_strategy(key: str, signal: dict) -> SignalImplantingStrategy:
    """Build the implanting strategy for one signal from its specification.

    Arguments:
        key: name of the signal (used in error messages)
        signal: the signal's specification dict; must name a valid 'implanting' strategy

    Returns:
        a SignalImplantingStrategy instance configured from the (defaults-merged) specification
    """
    # valid strategy names are "<Name>Implanting" classes with the suffix stripped
    valid_strategies = [cls[:-10] for cls in ReflectionHandler.discover_classes_by_partial_name("Implanting", "simulation/signal_implanting_strategy/")]
    ParameterValidator.assert_in_valid_list(signal["implanting"], valid_strategies, "SignalParser", key)
    defaults = DefaultParamsLoader.load("signal_implanting_strategy/", f"{signal['implanting']}Implanting")
    # user-provided values override the loaded defaults; note this rebinds the `signal` parameter
    signal = {**defaults, **signal}
    ParameterValidator.assert_keys_present(list(signal.keys()), ["motifs", "implanting", "sequence_position_weights"], SignalParser.__name__, key)
    # implanting_computation is optional; when given it is matched case-insensitively to the enum
    implanting_comp = None
    if 'implanting_computation' in signal:
        implanting_comp = signal['implanting_computation'].lower()
        ParameterValidator.assert_in_valid_list(implanting_comp, [el.name.lower() for el in ImplantingComputation], SignalParser.__name__, 'implanting_computation')
        implanting_comp = ImplantingComputation[implanting_comp.upper()]
    # instantiate the strategy class by name; implanting_comp stays None when not specified
    implanting_strategy = ReflectionHandler.get_class_by_name(f"{signal['implanting']}Implanting")(GappedMotifImplanting(), signal["sequence_position_weights"], implanting_comp)
    return implanting_strategy
def _check_instruction_specs(self, workflow_specification, location):
    """
    Validate that exactly one instruction of type 'TrainMLModel' is specified and
    that it references more than one dataset.

    Arguments:
        workflow_specification: full parsed YAML specification
        location: tool name used in error messages
    """
    ParameterValidator.assert_type_and_value(workflow_specification['instructions'], dict, location, 'instructions')

    instruction_names = list(workflow_specification['instructions'].keys())
    # use the location parameter instead of a hard-coded tool name so messages stay consistent
    # (the last assertion below already did this)
    assert len(instruction_names) == 1, \
        f"{location}: there can be only one instruction specified for this tool. " \
        f"Currently the following instructions are specified: {instruction_names}."

    instruction = workflow_specification['instructions'][instruction_names[0]]
    ParameterValidator.assert_keys_present(instruction.keys(), ['type', 'datasets'], location, instruction_names[0])

    instruction_type = instruction['type']
    assert instruction_type == 'TrainMLModel', \
        f"{location}: this tool works only with instruction of type 'TrainMLModel', got {instruction_type} instead."

    datasets_in_instruction = instruction['datasets']
    assert len(datasets_in_instruction) > 1, \
        f'{location}: this tool takes a multiple dataset names as input, but only {len(datasets_in_instruction)} were provided: ' \
        f'{datasets_in_instruction}.'