def _prepare_report_config(self, instruction_key, instruction, split_key,
                               symbol_table):
        if "reports" in instruction[split_key]:
            location = f"{instruction_key}/{split_key}/reports"
            report_types = list(signature(ReportConfig).parameters.keys())
            ParameterValidator.assert_all_in_valid_list(
                instruction[split_key]["reports"].keys(), report_types,
                location, "reports")

            for report_type in instruction[split_key]["reports"]:
                ParameterValidator.assert_type_and_value(
                    instruction[split_key]["reports"][report_type], list,
                    f"{location}/{report_type}", report_type)

            report_config_input = {
                report_type: {
                    report_id: symbol_table.get(report_id)
                    for report_id in instruction[split_key]["reports"]
                    [report_type]
                }
                for report_type in instruction[split_key]["reports"]
            }
        else:
            report_config_input = {}

        return report_config_input
Example #2
0
    def build_object(cls, **kwargs):
        """Validate keyword arguments and construct a Coefficients report.

        'coefs_to_plot' entries are uppercased and checked against
        CoefficientPlottingSetting; 'cutoff' and 'n_largest' are only read
        (and validated) when the corresponding setting is requested.
        """
        location = "Coefficients"
        requested = [coef.upper() for coef in kwargs["coefs_to_plot"]]
        name = kwargs.get("name")

        ParameterValidator.assert_all_in_valid_list(requested, [item.name.upper() for item in CoefficientPlottingSetting], location,
                                                    "coefs_to_plot")

        cutoff = []
        if CoefficientPlottingSetting.CUTOFF.name in requested:
            cutoff = kwargs["cutoff"]
            ParameterValidator.assert_type_and_value(cutoff, list, location, "cutoff")
            # cutoffs must be strictly positive numbers (tiny epsilon lower bound)
            ParameterValidator.assert_all_type_and_value(cutoff, Number, location, "cutoff", min_inclusive=1e-15)

        n_largest = []
        if CoefficientPlottingSetting.N_LARGEST.name in requested:
            n_largest = kwargs["n_largest"]
            ParameterValidator.assert_type_and_value(n_largest, list, location, "n_largest")
            ParameterValidator.assert_all_type_and_value(n_largest, int, location, "n_largest", min_inclusive=1)

        coefs = CoefficientPlottingSettingList()
        for keyword in requested:
            # entries were uppercased above, so direct enum lookup is safe
            coefs.append(CoefficientPlottingSetting[keyword])

        return Coefficients(coefs, cutoff, n_largest, name)
    def _prepare_specs(self):
        """Load the YAML specification and validate its top-level structure.

        Requires 'definitions' and 'instructions' (plus optionally 'output'),
        exactly one instruction of type TrainMLModel, then lets Util check and
        rewrite paths for this tool. Stores the instruction name on self.
        """
        with open(self.yaml_path, "r") as file:
            specs = yaml.safe_load(file)

        ParameterValidator.assert_keys_present(specs.keys(), ["definitions", "instructions"],
                                               GalaxyTrainMLModel.__name__, "YAML specification")
        ParameterValidator.assert_all_in_valid_list(specs.keys(), ["definitions", "instructions", "output"],
                                                    GalaxyTrainMLModel.__name__, "YAML specification")

        instructions = specs["instructions"]
        ParameterValidator.assert_type_and_value(instructions, dict, GalaxyTrainMLModel.__name__, "instructions")

        # this tool supports exactly one instruction per specification
        assert len(list(instructions.keys())) == 1, f"{GalaxyTrainMLModel.__name__}: one instruction has to be specified under " \
                                                    f"`instructions`, got the following instead: {list(specs['instructions'].keys())}."

        self.instruction_name = list(instructions.keys())[0]
        instruction_body = instructions[self.instruction_name]

        ParameterValidator.assert_type_and_value(instruction_body, dict, GalaxyTrainMLModel.__name__, self.instruction_name)
        ParameterValidator.assert_keys_present(instruction_body.keys(), ['type'], GalaxyTrainMLModel.__name__, self.instruction_name)

        # the instruction type must be the class name with the "Instruction" suffix stripped
        assert instruction_body['type'] == TrainMLModelInstruction.__name__[:-11], \
            f"{GalaxyTrainMLModel.__name__}: instruction `type` under {self.instruction_name} has to be {TrainMLModelInstruction.__name__[:-11]} " \
            f"for this tool."

        Util.check_paths(specs, GalaxyTrainMLModel.__name__)
        Util.update_result_paths(specs, self.result_path, self.yaml_path)
Example #4
0
    def _prepare_parameters(reference: dict,
                            max_edit_distances: dict,
                            name: str = None):
        """Validate and normalize MatchedReceptorsEncoder parameters.

        An int 'max_edit_distances' is expanded into a per-chain dict over all
        chains of the supported receptor types; a dict is checked against the
        legal chain names. Returns the keyword arguments for the encoder.
        """
        location = "MatchedReceptorsEncoder"

        # every chain name across the supported receptor types
        legal_chains = [chain
                        for receptor in (TCABReceptor(), TCGDReceptor(), BCReceptor())
                        for chain in receptor.get_chains()]

        # NOTE: exact type checks (not isinstance) kept deliberately,
        # matching the original behavior (e.g. bool is not accepted as int)
        if type(max_edit_distances) is int:
            max_edit_distances = {chain: max_edit_distances for chain in legal_chains}
        elif type(max_edit_distances) is dict:
            ParameterValidator.assert_keys(max_edit_distances.keys(), legal_chains, location,
                                           "max_edit_distances", exclusive=False)
        else:
            # neither int nor dict: let the validator raise a descriptive error
            ParameterValidator.assert_type_and_value(max_edit_distances, dict, location,
                                                     'max_edit_distances')

        reference_receptors = MatchedReferenceUtil.prepare_reference(reference, location=location, paired=True)

        return {"reference_receptors": reference_receptors,
                "max_edit_distances": max_edit_distances,
                "name": name}
    def _check_label_format(self, labels: list, instruction_key: str):
        """Check that every label is either a plain name (str) or a one-key
        dict whose single value is a dict containing only 'positive_class'."""
        ParameterValidator.assert_type_and_value(labels, list,
                                                 TrainMLModelParser.__name__,
                                                 f'{instruction_key}/labels')

        assert all(isinstance(label, str) or isinstance(label, dict) for label in labels), \
            f"{TrainMLModelParser.__name__}: labels under {instruction_key} were not defined properly. The list of labels has to either be a list of " \
            f"label names, or there can be a parameter 'positive_class' defined under the label name."

        # dict-form labels: {label_name: {'positive_class': ...}} and nothing else
        dict_labels = [label for label in labels if isinstance(label, dict)]
        for label in dict_labels:
            values = list(label.values())
            assert len(list(label.keys())) == 1 and isinstance(values[0], dict) and 'positive_class' in values[0] \
                   and len(list(values[0].keys())) == 1, \
                f"{TrainMLModelParser.__name__}: labels that are specified by more than label name, can include only one parameter called 'positive_class'."
    def _prepare_parameters(max_edit_distance: int, reference: dict, name: str = None):
        """Validate MatchedSequencesEncoder parameters and resolve the
        reference sequences from the reference specification."""
        location = "MatchedSequencesEncoder"

        # edit distance must be a non-negative integer
        ParameterValidator.assert_type_and_value(max_edit_distance, int, location, "max_edit_distance", min_inclusive=0)

        reference_sequences = MatchedReferenceUtil.prepare_reference(reference_params=reference, location=location, paired=False)

        return dict(max_edit_distance=max_edit_distance,
                    reference_sequences=reference_sequences,
                    name=name)
Example #7
0
    def _check_instruction(self, specs):
        """Check that the DatasetExport instruction specifies exactly one
        dataset and exactly one export format."""
        instruction_name = Util.check_instruction_type(specs, DatasetGenerationTool.__name__, DatasetExportInstruction.__name__[:-11])
        instruction = specs['instructions'][instruction_name]

        for key in ['datasets', 'export_formats']:
            ParameterValidator.assert_keys_present(list(instruction.keys()), [key],
                                                   DatasetGenerationTool.__name__, instruction_name)
            ParameterValidator.assert_type_and_value(instruction[key], list,
                                                     DatasetGenerationTool.__name__, f"{instruction_name}/{key}")

            # this tool only handles a single item per list
            assert len(instruction[key]) == 1, \
                f"{DatasetGenerationTool.__name__}: this tool accepts only one item under {key}, got {specs['instructions'][instruction_name][key]} " \
                f"instead."
    def __init__(self, percentage: float, show_warnings: bool = True):
        """Initialize a TCRdist-based classifier.

        :param percentage: float in [0, 1] (validated below)
        :param show_warnings: whether warnings should be shown
        """
        super().__init__()

        ParameterValidator.assert_type_and_value(percentage, float, "TCRdistClassifier", "percentage",
                                                 min_inclusive=0., max_inclusive=1.)

        self.percentage = percentage
        # k and label are not known yet; they are assigned later
        self.k = None
        self.label = None
        self.show_warnings = show_warnings
    def _parse_settings(self, instruction: dict,
                        symbol_table: SymbolTable) -> list:
        """Build one HPSetting per entry under instruction['settings'].

        Each setting names an encoding and an ml_method (and optionally a
        preprocessing sequence); all are resolved through the symbol table.

        Returns:
            list: the constructed HPSetting objects, in specification order.

        Raises:
            KeyError: when a required key is missing from a setting, or when a
                referenced preprocessing was not defined in the specification.
        """
        try:
            settings = []
            for index, setting in enumerate(instruction["settings"]):
                if "preprocessing" in setting:
                    ParameterValidator.assert_type_and_value(
                        setting["preprocessing"], str,
                        TrainMLModelParser.__name__, f'settings: {index+1}. '
                        f'element: preprocessing')
                    if symbol_table.contains(setting["preprocessing"]):
                        preprocessing_sequence = symbol_table.get(
                            setting["preprocessing"])
                        preproc_name = setting["preprocessing"]
                    else:
                        # NOTE: this KeyError is caught by the handler below and
                        # its message is embedded into the generic "parameter
                        # ... was not defined" message; kept as KeyError so
                        # callers' except clauses still match.
                        raise KeyError(
                            f"{TrainMLModelParser.__name__}: preprocessing was set in the TrainMLModel instruction to value "
                            f"{setting['preprocessing']}, but no such preprocessing was defined in the specification under "
                            f"definitions: {PreprocessingParser.keyword}.")
                else:
                    # no preprocessing requested: normalize the entry so the
                    # key check below still passes
                    setting["preprocessing"] = None
                    preprocessing_sequence = []
                    preproc_name = None

                ParameterValidator.assert_keys(
                    setting.keys(), ["preprocessing", "ml_method", "encoding"],
                    TrainMLModelParser.__name__,
                    f"settings, {index + 1}. entry")

                # build the encoder for this setting and give it the dataset as context
                encoder = symbol_table.get(setting["encoding"]).build_object(symbol_table.get(instruction["dataset"]),
                                                                             **symbol_table.get_config(setting["encoding"])["encoder_params"])\
                    .set_context({"dataset": symbol_table.get(instruction['dataset'])})

                s = HPSetting(encoder=encoder,
                              encoder_name=setting["encoding"],
                              encoder_params=symbol_table.get_config(
                                  setting["encoding"])["encoder_params"],
                              ml_method=symbol_table.get(setting["ml_method"]),
                              ml_method_name=setting["ml_method"],
                              ml_params=symbol_table.get_config(
                                  setting["ml_method"]),
                              preproc_sequence=preprocessing_sequence,
                              preproc_sequence_name=preproc_name)
                settings.append(s)
            return settings
        except KeyError as key_error:
            # chain the original error so the true origin is not lost
            raise KeyError(
                f"{TrainMLModelParser.__name__}: parameter {key_error.args[0]} was not defined under settings in TrainMLModel instruction."
            ) from key_error
Example #10
0
    def check_export_format(specs: dict, tool_name: str,
                            instruction_name: str):
        """Return the single export format configured for the instruction.

        Asserts that 'export_formats' is present, is a list, and contains
        exactly one entry.
        """
        instruction = specs['instructions'][instruction_name]

        ParameterValidator.assert_keys_present(list(instruction.keys()), ["export_formats"],
                                               tool_name, f"{instruction_name}/export_formats")
        ParameterValidator.assert_type_and_value(instruction["export_formats"], list,
                                                 tool_name, f"{instruction_name}/export_formats")

        export_formats = instruction["export_formats"]
        assert len(export_formats) == 1, \
            f"{tool_name}: only one format can be specified under export_formats parameter under " \
            f"{instruction_name}/export_formats, got {specs['instructions'][instruction_name]['export_formats']} instead."

        return export_formats[0]
Example #11
0
    def build_object(cls, **kwargs):
        """Create a CytoscapeNetworkExporter.

        The optional node/edge attribute lists default to empty lists when
        passed as None, and both must be lists.
        """
        for attr_key in ("additional_node_attributes", "additional_edge_attributes"):
            if kwargs[attr_key] is None:
                kwargs[attr_key] = []
            ParameterValidator.assert_type_and_value(kwargs[attr_key], list,
                                                     "CytoscapeNetworkExporter", attr_key)

        return CytoscapeNetworkExporter(**kwargs)
    def _check_dataset_specs(self, workflow_specification, location):
        """Validate definitions/datasets and require more than one dataset,
        since this tool benchmarks across multiple datasets."""
        definitions = workflow_specification['definitions']

        ParameterValidator.assert_type_and_value(definitions, dict, location, 'definitions')
        ParameterValidator.assert_keys_present(definitions.keys(), ['datasets'], location, 'definitions')
        ParameterValidator.assert_type_and_value(definitions['datasets'], dict, location, 'datasets')

        dataset_names = list(definitions['datasets'].keys())

        assert len(dataset_names) > 1, \
            f"MultiDatasetBenchmarkTool: there is only one dataset specified ({dataset_names[0]}), while this tool operates on multiple datasets. " \
            f"If only one dataset is needed, consider using the training instruction directly."
    def build_object(cls, **kwargs):
        """Validate n_steps and threshold and build a DeepRCMotifDiscovery report."""
        location = "DeepRCMotifDiscovery"
        name = kwargs.get("name")

        # n_steps: positive integer; threshold: float in [0, 1]
        ParameterValidator.assert_type_and_value(kwargs["n_steps"], int, location,
                                                 "n_steps", min_inclusive=1)
        ParameterValidator.assert_type_and_value(kwargs["threshold"], float, location,
                                                 "threshold", min_inclusive=0, max_inclusive=1)

        return DeepRCMotifDiscovery(n_steps=kwargs["n_steps"],
                                    threshold=kwargs["threshold"],
                                    name=name)
Example #14
0
 def build_object(cls, **kwargs):
     """Validate arguments and build a DuplicateSequenceFilter."""
     location = cls.__name__
     ParameterValidator.assert_keys(kwargs.keys(),
                                    ["filter_sequence_type", "batch_size", "count_agg"],
                                    location, "DuplicateSequenceFilter")

     sequence_type_name = kwargs["filter_sequence_type"].upper()
     ParameterValidator.assert_in_valid_list(sequence_type_name,
                                             [item.name for item in SequenceType],
                                             location, "filter_sequence_type")

     count_agg_name = kwargs["count_agg"].upper()
     ParameterValidator.assert_in_valid_list(count_agg_name,
                                             [item.name for item in CountAggregationFunction],
                                             location, "count_agg")

     ParameterValidator.assert_type_and_value(kwargs["batch_size"], int,
                                              location, "batch_size", 1)

     return DuplicateSequenceFilter(filter_sequence_type=SequenceType[sequence_type_name],
                                    batch_size=kwargs["batch_size"],
                                    count_agg=CountAggregationFunction[count_agg_name])
Example #15
0
    def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: str = None) -> SubsamplingInstruction:
        """Parse a Subsampling instruction: validate keys, dataset reference,
        subsample sizes and export formats, and build the instruction object."""
        location = SubsamplingParser.__name__

        valid_keys = ["type", "dataset", "subsampled_dataset_sizes", "dataset_export_formats"]
        ParameterValidator.assert_keys(instruction.keys(), valid_keys, location, key)

        dataset_keys = symbol_table.get_keys_by_type(SymbolType.DATASET)
        ParameterValidator.assert_in_valid_list(instruction['dataset'], dataset_keys, location, f'{key}/dataset')

        dataset = symbol_table.get(instruction['dataset'])
        sizes = instruction['subsampled_dataset_sizes']
        ParameterValidator.assert_type_and_value(sizes, list, location, f'{key}/subsampled_dataset_sizes')
        # each subsample size must be an int between 1 and the dataset size
        ParameterValidator.assert_all_type_and_value(sizes, int, location,
                                                     f'{key}/subsampled_dataset_sizes', 1, dataset.get_example_count())

        valid_export_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(DataExporter, 'Exporter', "dataset_export/")
        export_formats = instruction['dataset_export_formats']
        ParameterValidator.assert_type_and_value(export_formats, list, location, f"{key}/dataset_export_formats")
        ParameterValidator.assert_all_in_valid_list(export_formats, valid_export_formats, location, f"{key}/dataset_export_formats")

        exporters = [ReflectionHandler.get_class_by_name(export_format + "Exporter", "dataset_export/")
                     for export_format in export_formats]

        return SubsamplingInstruction(dataset=dataset, subsampled_dataset_sizes=sizes,
                                      dataset_export_formats=exporters, name=key)
 def __init__(self, k: int, skip_first_n_aa: int, skip_last_n_aa: int, abundance: str, normalize_all_features: bool, name: str = None):
     """Configure the Atchley k-mer encoder.

     :param k: k-mer length (>= 1)
     :param skip_first_n_aa: number of amino acids to skip at the start (>= 0)
     :param skip_last_n_aa: number of amino acids to skip at the end (>= 0)
     :param abundance: name of a RelativeAbundanceType member (case-insensitive)
     :param normalize_all_features: whether all features should be normalized
     :param name: optional encoder name
     """
     location = "AtchleyKmerEncoder"
     ParameterValidator.assert_type_and_value(k, int, location, "k", 1)
     ParameterValidator.assert_type_and_value(skip_first_n_aa, int, location, "skip_first_n_aa", 0)
     ParameterValidator.assert_type_and_value(skip_last_n_aa, int, location, "skip_last_n_aa", 0)
     ParameterValidator.assert_in_valid_list(abundance.upper(), [ab.name for ab in RelativeAbundanceType], location, "abundance")
     ParameterValidator.assert_type_and_value(normalize_all_features, bool, location, "normalize_all_features")

     self.k = k
     self.skip_first_n_aa = skip_first_n_aa
     self.skip_last_n_aa = skip_last_n_aa
     self.abundance = RelativeAbundanceType[abundance.upper()]
     self.normalize_all_features = normalize_all_features
     self.name = name
     # paths for fitted scaler/vectorizer are assigned later
     self.scaler_path = None
     self.vectorizer_path = None
    def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: str) -> MLApplicationInstruction:
        """Parse an MLApplication instruction: validate all fields, resolve the
        trained model (hp_setting) and build the instruction object."""
        location = MLApplicationParser.__name__

        ParameterValidator.assert_keys(instruction.keys(), ['type', 'dataset', 'label', 'pool_size', 'config_path', 'store_encoded_data'], location, key)
        ParameterValidator.assert_in_valid_list(instruction['dataset'], symbol_table.get_keys_by_type(SymbolType.DATASET), location, f"{key}: dataset")
        ParameterValidator.assert_type_and_value(instruction['pool_size'], int, location, f"{key}: pool_size", min_inclusive=1)
        ParameterValidator.assert_type_and_value(instruction['label'], str, location, f'{key}: label')
        ParameterValidator.assert_type_and_value(instruction['config_path'], str, location, f'{key}: config_path')
        ParameterValidator.assert_type_and_value(instruction['store_encoded_data'], bool, location, f'{key}: store_encoded_data')

        hp_setting, label = self._parse_hp_setting(instruction, path, key)

        ml_instruction = MLApplicationInstruction(dataset=symbol_table.get(instruction['dataset']), name=key,
                                                  pool_size=instruction['pool_size'],
                                                  label_configuration=LabelConfiguration([label]),
                                                  hp_setting=hp_setting,
                                                  store_encoded_data=instruction['store_encoded_data'])

        return ml_instruction
Example #18
0
    def create_method_instance(ml_specification: dict, ml_method_class, key: str) -> tuple:
        """Instantiate an ML method from its specification.

        When no parameters are given, the class defaults are used. When any
        parameter lists several values and the class supports 'parameter_grid',
        a grid is built for model selection via cross-validation. Otherwise the
        parameters are passed to the constructor directly.

        Returns:
            tuple: (ml_method instance, dict of the parameters that were used).
        """
        ml_params = {}
        raw_params = ml_specification[ml_method_class.__name__]

        if raw_params is None or len(raw_params.keys()) == 0:
            # no parameters specified: rely on the method's defaults
            ml_method = ml_method_class()
        else:
            ml_params = raw_params
            init_method_keys = inspect.signature(ml_method_class.__init__).parameters.keys()

            # generator (not a list) inside any(); `value` avoids the original
            # comprehension variable shadowing the `key` function argument
            if any(isinstance(value, list) for value in ml_params.values()) and "parameter_grid" in init_method_keys:

                ParameterValidator.assert_type_and_value(ml_specification['model_selection_cv'], bool, MLParser.__name__, f'{key}: model_selection_cv', exact_value=True)
                ParameterValidator.assert_type_and_value(ml_specification['model_selection_n_folds'], int, MLParser.__name__, f'{key}: model_selection_n_folds', 2)

                # wrap scalar values in a list so the grid is uniform
                ml_method = ml_method_class(parameter_grid={param: value if isinstance(value, list) else [value]
                                                            for param, value in ml_params.items()})

            elif len(init_method_keys) == 3 and all(arg in init_method_keys for arg in ["parameters", "parameter_grid"]):
                ml_method = ml_method_class(parameters=ml_params)
            else:
                ml_method = ml_method_class(**ml_params)

        return ml_method, ml_params
Example #19
0
 def _prepare_parameters(vector_size: int,
                         k: int,
                         model_type: str,
                         name: str = None):
     """Validate Word2VecEncoder parameters and convert model_type to its
     ModelType enum member."""
     location = "Word2VecEncoder"

     # vector_size and k must both be positive integers
     for param_name, value in (("vector_size", vector_size), ("k", k)):
         ParameterValidator.assert_type_and_value(value, int, location, param_name,
                                                  min_inclusive=1)

     ParameterValidator.assert_in_valid_list(model_type.upper(),
                                             [item.name for item in ModelType],
                                             location, "model_type")

     return {"vector_size": vector_size,
             "k": k,
             "model_type": ModelType[model_type.upper()],
             "name": name}
    def _check_instruction_specs(self, workflow_specification, location):
        """Validate that exactly one TrainMLModel instruction with more than
        one dataset is specified."""
        ParameterValidator.assert_type_and_value(workflow_specification['instructions'], dict,
                                                 location, 'instructions')

        instruction_names = list(workflow_specification['instructions'].keys())
        assert len(instruction_names) == 1, f"MultiDatasetBenchmarkTool: there can be only one instruction specified for this tool. " \
                                            f"Currently the following instructions are specified: {instruction_names}."

        instruction = workflow_specification['instructions'][instruction_names[0]]
        ParameterValidator.assert_keys_present(instruction.keys(), ['type', 'datasets'],
                                               location, instruction_names[0])

        instruction_type = instruction['type']
        assert instruction_type == 'TrainMLModel', \
            f"MultiDatasetBenchmarkTool: this tool works only with instruction of type 'TrainMLModel', got {instruction_type} instead."

        datasets_in_instruction = instruction['datasets']
        assert len(datasets_in_instruction) > 1, \
            f'{location}: this tool takes a multiple dataset names as input, but only {len(datasets_in_instruction)} were provided: ' \
            f'{datasets_in_instruction}.'
Example #21
0
    def _parse_dataset(key: str, dataset_specs: dict, symbol_table: SymbolTable, result_path: str) -> SymbolTable:
        """Parse one dataset specification, import the dataset and register it
        in the symbol table.

        Args:
            key: name of the dataset in the specification.
            dataset_specs: specification body under datasets/<key>.
            symbol_table: symbol table the imported dataset is added to.
            result_path: path passed on when preparing import parameters.

        Returns:
            SymbolTable: the same symbol table, with the dataset registered.

        Raises:
            KeyError: when a required argument or input-file column is missing
                during import (chained to the original error).
            Exception: for any other import failure, with added context.
        """
        location = "ImportParser"

        ParameterValidator.assert_keys(list(dataset_specs.keys()), ImportParser.valid_keys, location, f"datasets:{key}", False)

        valid_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(DataImport, "Import", "IO/dataset_import/")
        ParameterValidator.assert_in_valid_list(dataset_specs["format"], valid_formats, location, "format")

        import_cls = ReflectionHandler.get_class_by_name("{}Import".format(dataset_specs["format"]))
        params = ImportParser._prepare_params(dataset_specs, result_path, key)

        if "is_repertoire" in params:
            ParameterValidator.assert_type_and_value(params["is_repertoire"], bool, location, "is_repertoire")

            if params["is_repertoire"]:
                # repertoire datasets need a metadata file, except for iReceptor imports
                if import_cls != IReceptorImport:
                    assert "metadata_file" in params, f"{location}: Missing parameter: metadata_file under {key}/params/"
                    ParameterValidator.assert_type_and_value(params["metadata_file"], str, location, "metadata_file")
            else:
                # non-repertoire datasets must state whether sequences are paired
                assert "paired" in params, f"{location}: Missing parameter: paired under {key}/params/"
                ParameterValidator.assert_type_and_value(params["paired"], bool, location, "paired")

                if params["paired"]:
                    assert "receptor_chains" in params, f"{location}: Missing parameter: receptor_chains under {key}/params/"
                    ParameterValidator.assert_in_valid_list(params["receptor_chains"], ["_".join(cp.value) for cp in ChainPair], location, "receptor_chains")

        try:
            dataset = import_cls.import_dataset(params, key)
            dataset.name = key
            symbol_table.add(key, SymbolType.DATASET, dataset)
        except KeyError as key_error:
            # chain the original error so the true origin is not lost
            raise KeyError(f"{key_error}\n\nAn error occurred during parsing of dataset {key}. "
                           f"The keyword {key_error.args[0]} was missing. This either means this argument was "
                           f"not defined under definitions/datasets/{key}/params, or this column was missing from "
                           f"an input data file. ") from key_error
        except Exception as ex:
            raise Exception(f"{ex}\n\nAn error occurred while parsing the dataset {key}. See the log above for more details.") from ex

        return symbol_table
Example #22
0
    def build_object(cls, **kwargs):
        """Build an MLSettingsPerformance report.

        The x/y label positions are only read (and validated) when
        single_axis_labels is enabled; otherwise they stay None.
        """
        location = "MLSettingsPerformance"

        single_axis_labels = kwargs["single_axis_labels"]
        ParameterValidator.assert_type_and_value(single_axis_labels, bool,
                                                 location, "single_axis_labels")

        x_label_position = None
        y_label_position = None
        if single_axis_labels:
            x_label_position = kwargs["x_label_position"]
            ParameterValidator.assert_type_and_value(x_label_position, float,
                                                     location, "x_label_position")
            y_label_position = kwargs["y_label_position"]
            ParameterValidator.assert_type_and_value(y_label_position, float,
                                                     location, "y_label_position")

        name = kwargs.get("name")
        return MLSettingsPerformance(single_axis_labels, x_label_position,
                                     y_label_position, name)
Example #23
0
 def check(self, v):
     """Reject any new item that is not a Cell."""
     ParameterValidator.assert_type_and_value(v, Cell, "CellList", "new item")
 def check(self, v):
     """Reject any new item that is not a CoefficientPlottingSetting."""
     ParameterValidator.assert_type_and_value(v, CoefficientPlottingSetting,
                                              "CoefficientPlottingSettingList",
                                              "new item")
Example #25
0
 def check(self, v):
     """Reject any new item that is not a ReceptorSequence."""
     ParameterValidator.assert_type_and_value(
         v, ReceptorSequence, "ReceptorSequenceList", "new item")
    def parse(self,
              key: str,
              instruction: dict,
              symbol_table: SymbolTable,
              path: str = None) -> TrainMLModelInstruction:
        """Parse and build a TrainMLModel instruction.

        Validates the instruction's scalar fields, resolves settings,
        assessment/selection split configs, labels, metrics, strategy and
        reports through the symbol table, and constructs the
        TrainMLModelInstruction named after `key`.
        """
        valid_keys = [
            "assessment", "selection", "dataset", "strategy", "labels",
            "metrics", "settings", "number_of_processes", "type", "reports",
            "optimization_metric", 'refit_optimal_model', 'store_encoded_data'
        ]
        # scalar/flat fields are validated before the nested configs are parsed
        ParameterValidator.assert_type_and_value(instruction['settings'], list,
                                                 TrainMLModelParser.__name__,
                                                 'settings')
        ParameterValidator.assert_keys(list(instruction.keys()), valid_keys,
                                       TrainMLModelParser.__name__,
                                       "TrainMLModel")
        ParameterValidator.assert_type_and_value(
            instruction['refit_optimal_model'], bool,
            TrainMLModelParser.__name__, 'refit_optimal_model')
        ParameterValidator.assert_type_and_value(instruction['metrics'], list,
                                                 TrainMLModelParser.__name__,
                                                 'metrics')
        ParameterValidator.assert_type_and_value(
            instruction['optimization_metric'], str,
            TrainMLModelParser.__name__, 'optimization_metric')
        ParameterValidator.assert_type_and_value(
            instruction['number_of_processes'], int,
            TrainMLModelParser.__name__, 'number_of_processes')
        ParameterValidator.assert_type_and_value(instruction['strategy'], str,
                                                 TrainMLModelParser.__name__,
                                                 'strategy')
        ParameterValidator.assert_type_and_value(
            instruction['store_encoded_data'], bool,
            TrainMLModelParser.__name__, 'store_encoded_data')

        # resolve settings and both split configurations via the symbol table
        settings = self._parse_settings(instruction, symbol_table)
        dataset = symbol_table.get(instruction["dataset"])
        assessment = self._parse_split_config(key, instruction, "assessment",
                                              symbol_table, len(settings))
        selection = self._parse_split_config(key, instruction, "selection",
                                             symbol_table, len(settings))
        assessment, selection = self._update_split_configs(
            assessment, selection, dataset)
        label_config = self._create_label_config(instruction, dataset, key)
        # strategy name is resolved to a class under hyperparameter_optimization/
        strategy = ReflectionHandler.get_class_by_name(
            instruction["strategy"], "hyperparameter_optimization/")
        # metric names are uppercased and mapped to Metric enum members
        metrics = {Metric[metric.upper()] for metric in instruction["metrics"]}
        optimization_metric = Metric[
            instruction["optimization_metric"].upper()]
        metric_search_criterion = Metric.get_search_criterion(
            optimization_metric)
        path = self._prepare_path(instruction)
        context = self._prepare_context(instruction, symbol_table)
        reports = self._prepare_reports(instruction["reports"], symbol_table)

        hp_instruction = TrainMLModelInstruction(
            dataset=dataset,
            hp_strategy=strategy(settings, metric_search_criterion),
            hp_settings=settings,
            assessment=assessment,
            selection=selection,
            metrics=metrics,
            optimization_metric=optimization_metric,
            refit_optimal_model=instruction['refit_optimal_model'],
            label_configuration=label_config,
            path=path,
            context=context,
            store_encoded_data=instruction['store_encoded_data'],
            number_of_processes=instruction["number_of_processes"],
            reports=reports,
            name=key)

        return hp_instruction
    def _prepare_parameters(normalization_type: str,
                            reads: str,
                            sequence_encoding: str,
                            k: int = 0,
                            k_left: int = 0,
                            k_right: int = 0,
                            min_gap: int = 0,
                            max_gap: int = 0,
                            metadata_fields_to_include: list = None,
                            name: str = None,
                            scale_to_unit_variance: bool = False,
                            scale_to_zero_mean: bool = False):
        """Validate KmerFrequencyEncoder parameters and convert the string
        options to their enum members."""
        location = KmerFrequencyEncoder.__name__

        ParameterValidator.assert_in_valid_list(normalization_type.upper(),
                                                [item.name for item in NormalizationType],
                                                location, "normalization_type")
        ParameterValidator.assert_in_valid_list(reads.upper(),
                                                [item.name for item in ReadsType],
                                                location, "reads")
        ParameterValidator.assert_in_valid_list(sequence_encoding.upper(),
                                                [item.name for item in SequenceEncodingType],
                                                location, "sequence_encoding")
        ParameterValidator.assert_type_and_value(scale_to_zero_mean, bool,
                                                 location, "scale_to_zero_mean")
        ParameterValidator.assert_type_and_value(scale_to_unit_variance, bool,
                                                 location, "scale_to_unit_variance")

        # every k-mer size/gap parameter must be a non-negative int
        vars_to_check = {"k": k, "k_left": k_left, "k_right": k_right,
                         "min_gap": min_gap, "max_gap": max_gap}
        for param, value in vars_to_check.items():
            ParameterValidator.assert_type_and_value(value, int, location, param,
                                                     min_inclusive=0)

        if "gap" in sequence_encoding.lower():
            # gapped encodings need both flank lengths to be set explicitly
            assert k_left != 0 and k_right != 0, f"KmerFrequencyEncoder: sequence encoding {sequence_encoding} was chosen, but k_left " \
                                                 f"({k_left}) or k_right ({k_right}) have to be set and larger than 0."

        return {"normalization_type": NormalizationType[normalization_type.upper()],
                "reads": ReadsType[reads.upper()],
                "sequence_encoding": SequenceEncodingType[sequence_encoding.upper()],
                "name": name,
                "scale_to_zero_mean": scale_to_zero_mean,
                "scale_to_unit_variance": scale_to_unit_variance,
                **vars_to_check}