Exemple #1
0
    def test_parse_ml_methods(self):
        """MLParser.parse should register each ML method spec and describe it.

        The specification mixes several ways of declaring a method: a dict
        with fixed parameters (LR1), a bare class name (LR2), lists of
        parameter values with model selection enabled (SVM1) and an empty
        parameter dict with model selection disabled (SVM2).
        """
        fixed_lr_params = {"max_iter": 1000, "penalty": "l1"}
        svm_param_lists = {"max_iter": [1000, 2000], "penalty": ["l1", "l2"]}

        params = {
            "LR1": {"LogisticRegression": fixed_lr_params},
            "LR2": "LogisticRegression",
            "SVM1": {
                "SVM": svm_param_lists,
                "model_selection_cv": True,
                "model_selection_n_folds": 5,
            },
            "SVM2": {
                "SVM": {},
                "model_selection_cv": False,
                "model_selection_n_folds": -1,
            },
        }

        symbol_table, desc = MLParser.parse(params, SymbolTable())

        # Lists of values end up in the parameter grid of the parsed method.
        svm1 = symbol_table.get("SVM1")
        self.assertIsNotNone(svm1._parameter_grid)
        self.assertEqual(2, len(svm1._parameter_grid["max_iter"]))

        # Single values are kept as plain parameters.
        lr1 = symbol_table.get("LR1")
        self.assertIsNotNone(lr1._parameters)
        self.assertEqual("l1", lr1._parameters["penalty"])

        self.assertIsInstance(symbol_table.get("LR2"), LogisticRegression)

        self.assertIn("SVM", desc["SVM1"].keys())
Exemple #2
0
    def _parse_settings(self, instruction: dict,
                        symbol_table: SymbolTable) -> list:
        """Turn every entry under ``settings`` in the instruction into an HPSetting.

        For each entry the optional preprocessing sequence is looked up in the
        symbol table, the keys of the entry are validated, the encoder is
        built for the instruction's dataset and the ML method is checked for
        compatibility with that encoder.

        Args:
            instruction: the TrainMLModel instruction specification. Entries
                under ``settings`` get ``preprocessing`` set to None in place
                when the key is missing or its value is None.
            symbol_table: used to resolve the preprocessing, encoding,
                ML method and dataset keys.

        Returns:
            list of HPSetting objects, one per entry, in the original order.

        Raises:
            KeyError: with a descriptive message if the preprocessing key is
                not in the symbol table; with the generic "was not defined
                under settings" message if any other key lookup fails.
            ValueError: if a preprocessing in the sequence reports that it
                does not keep the example count.
        """
        # Remembers the KeyError raised on purpose below, so that the handler
        # can tell it apart from a plain failed key lookup.
        undefined_preprocessing_error = None

        try:
            settings = []
            for index, setting in enumerate(instruction["settings"]):
                if "preprocessing" in setting and setting[
                        "preprocessing"] is not None:
                    ParameterValidator.assert_type_and_value(
                        setting["preprocessing"], str,
                        TrainMLModelParser.__name__, f'settings: {index+1}. '
                        f'element: preprocessing')
                    if symbol_table.contains(setting["preprocessing"]):
                        preprocessing_sequence = symbol_table.get(
                            setting["preprocessing"])
                        preproc_name = setting["preprocessing"]
                        if not all(preproc.keeps_example_count()
                                   for preproc in preprocessing_sequence):
                            raise ValueError(
                                f"{TrainMLModelParser.__name__}: preprocessing sequence {preproc_name} includes preprocessing that "
                                f"change the number of examples at runtime and as such cannot be used with this instruction. See the "
                                f"documentation for the preprocessing or alternatively use them with other instructions."
                            )
                    else:
                        undefined_preprocessing_error = KeyError(
                            f"{TrainMLModelParser.__name__}: preprocessing was set in the TrainMLModel instruction to value "
                            f"{setting['preprocessing']}, but no such preprocessing was defined in the specification under "
                            f"definitions: {PreprocessingParser.keyword}.")
                        raise undefined_preprocessing_error
                else:
                    setting["preprocessing"] = None
                    preprocessing_sequence = []
                    preproc_name = None

                ParameterValidator.assert_keys(
                    setting.keys(), ["preprocessing", "ml_method", "encoding"],
                    TrainMLModelParser.__name__,
                    f"settings, {index + 1}. entry")

                encoder = symbol_table.get(setting["encoding"]).build_object(symbol_table.get(instruction["dataset"]),
                                                                             **symbol_table.get_config(setting["encoding"])["encoder_params"])\
                    .set_context({"dataset": symbol_table.get(instruction['dataset'])})

                ml_method = symbol_table.get(setting["ml_method"])
                ml_method.check_encoder_compatibility(encoder)

                s = HPSetting(encoder=encoder,
                              encoder_name=setting["encoding"],
                              encoder_params=symbol_table.get_config(
                                  setting["encoding"])["encoder_params"],
                              ml_method=ml_method,
                              ml_method_name=setting["ml_method"],
                              ml_params=symbol_table.get_config(
                                  setting["ml_method"]),
                              preproc_sequence=preprocessing_sequence,
                              preproc_sequence_name=preproc_name)
                settings.append(s)
            return settings
        except KeyError as key_error:
            if key_error is undefined_preprocessing_error:
                # Already a complete user-facing message; wrapping it would
                # present the whole sentence as if it were a parameter name.
                raise
            raise KeyError(
                f"{TrainMLModelParser.__name__}: parameter {key_error.args[0]} was not defined under settings in TrainMLModel instruction."
            ) from key_error
    def _prepare_optional_params(self, analysis: dict,
                                 symbol_table: SymbolTable,
                                 yaml_location: str) -> dict:

        params = {}
        dataset = symbol_table.get(analysis["dataset"])

        if "encoding" in analysis:
            params["encoder"] = symbol_table.get(
                analysis["encoding"]).build_object(
                    dataset,
                    **symbol_table.get_config(
                        analysis["encoding"])["encoder_params"])

            if "labels" in analysis:
                params["label_config"] = LabelHelper.create_label_config(
                    analysis["labels"], dataset,
                    ExploratoryAnalysisParser.__name__, yaml_location)
            else:
                params["label_config"] = LabelConfiguration()

        if "preprocessing_sequence" in analysis:
            params["preprocessing_sequence"] = symbol_table.get(
                analysis["preprocessing_sequence"])

        return params
Exemple #4
0
    def parse(self,
              key: str,
              instruction: dict,
              symbol_table: SymbolTable,
              path: Path = None) -> SimulationInstruction:
        """Create a SimulationInstruction from its specification.

        Args:
            key: name of the instruction; also used as the validation location.
            instruction: specification with the keys ``dataset``,
                ``simulation``, ``type`` and ``export_formats``.
            symbol_table: used to resolve the signals, the simulation and the
                dataset.
            path: not used by this method.

        Returns:
            the SimulationInstruction named ``key``.
        """
        allowed_keys = ["dataset", "simulation", "type", "export_formats"]
        ParameterValidator.assert_keys(instruction.keys(), allowed_keys,
                                       "SimulationParser", key)

        # All signals in the symbol table are handed to the instruction.
        signal_symbols = symbol_table.get_by_type(SymbolType.SIGNAL)

        return SimulationInstruction(
            signals=[symbol.item for symbol in signal_symbols],
            simulation=symbol_table.get(instruction["simulation"]),
            dataset=symbol_table.get(instruction["dataset"]),
            name=key,
            exporters=self.parse_exporters(instruction))
Exemple #5
0
    def _parse_settings(self, instruction: dict,
                        symbol_table: SymbolTable) -> list:
        """Turn every entry under ``settings`` in the instruction into an HPSetting.

        For each entry the optional preprocessing sequence is looked up in the
        symbol table, the keys of the entry are validated and the encoder is
        built for the instruction's dataset.

        Args:
            instruction: the TrainMLModel instruction specification. Entries
                under ``settings`` get ``preprocessing`` set to None in place
                when the key is missing or its value is None.
            symbol_table: used to resolve the preprocessing, encoding,
                ML method and dataset keys.

        Returns:
            list of HPSetting objects, one per entry, in the original order.

        Raises:
            KeyError: with a descriptive message if the preprocessing key is
                not in the symbol table; with the generic "was not defined
                under settings" message if any other key lookup fails.
        """
        # Remembers the KeyError raised on purpose below, so that the handler
        # can tell it apart from a plain failed key lookup.
        undefined_preprocessing_error = None

        try:
            settings = []
            for index, setting in enumerate(instruction["settings"]):
                # An explicit None is treated the same as a missing key.
                if "preprocessing" in setting and setting[
                        "preprocessing"] is not None:
                    ParameterValidator.assert_type_and_value(
                        setting["preprocessing"], str,
                        TrainMLModelParser.__name__, f'settings: {index+1}. '
                        f'element: preprocessing')
                    if symbol_table.contains(setting["preprocessing"]):
                        preprocessing_sequence = symbol_table.get(
                            setting["preprocessing"])
                        preproc_name = setting["preprocessing"]
                    else:
                        undefined_preprocessing_error = KeyError(
                            f"{TrainMLModelParser.__name__}: preprocessing was set in the TrainMLModel instruction to value "
                            f"{setting['preprocessing']}, but no such preprocessing was defined in the specification under "
                            f"definitions: {PreprocessingParser.keyword}.")
                        raise undefined_preprocessing_error
                else:
                    setting["preprocessing"] = None
                    preprocessing_sequence = []
                    preproc_name = None

                ParameterValidator.assert_keys(
                    setting.keys(), ["preprocessing", "ml_method", "encoding"],
                    TrainMLModelParser.__name__,
                    f"settings, {index + 1}. entry")

                encoder = symbol_table.get(setting["encoding"]).build_object(symbol_table.get(instruction["dataset"]),
                                                                             **symbol_table.get_config(setting["encoding"])["encoder_params"])\
                    .set_context({"dataset": symbol_table.get(instruction['dataset'])})

                s = HPSetting(encoder=encoder,
                              encoder_name=setting["encoding"],
                              encoder_params=symbol_table.get_config(
                                  setting["encoding"])["encoder_params"],
                              ml_method=symbol_table.get(setting["ml_method"]),
                              ml_method_name=setting["ml_method"],
                              ml_params=symbol_table.get_config(
                                  setting["ml_method"]),
                              preproc_sequence=preprocessing_sequence,
                              preproc_sequence_name=preproc_name)
                settings.append(s)
            return settings
        except KeyError as key_error:
            if key_error is undefined_preprocessing_error:
                # Already a complete user-facing message; wrapping it would
                # present the whole sentence as if it were a parameter name.
                raise
            raise KeyError(
                f"{TrainMLModelParser.__name__}: parameter {key_error.args[0]} was not defined under settings in TrainMLModel instruction."
            ) from key_error
Exemple #6
0
    def parse(self, key: str, instruction: dict, symbol_table: SymbolTable,
              path: Path) -> MLApplicationInstruction:
        """Validate an MLApplication specification and build the instruction.

        Args:
            key: name of the instruction, used in validation locations.
            instruction: specification with the keys ``type``, ``dataset``,
                ``number_of_processes``, ``config_path`` and
                ``store_encoded_data``.
            symbol_table: used to list the known datasets and to resolve the
                chosen one.
            path: passed on to ``_parse_hp_setting``.

        Returns:
            the MLApplicationInstruction named ``key``.
        """
        location = MLApplicationParser.__name__
        expected_keys = [
            'type', 'dataset', 'number_of_processes', 'config_path',
            'store_encoded_data'
        ]
        ParameterValidator.assert_keys(instruction.keys(), expected_keys,
                                       location, key)

        dataset_key = instruction['dataset']
        known_datasets = symbol_table.get_keys_by_type(SymbolType.DATASET)
        ParameterValidator.assert_in_valid_list(dataset_key, known_datasets,
                                                location, f"{key}: dataset")

        ParameterValidator.assert_type_and_value(
            instruction['number_of_processes'], int, location,
            f"{key}: number_of_processes", min_inclusive=1)
        ParameterValidator.assert_type_and_value(
            instruction['config_path'], str, location, f'{key}: config_path')
        ParameterValidator.assert_type_and_value(
            instruction['store_encoded_data'], bool, location,
            f'{key}: store_encoded_data')

        hp_setting, label = self._parse_hp_setting(instruction, path, key)

        ml_application = MLApplicationInstruction(
            dataset=symbol_table.get(instruction['dataset']),
            name=key,
            number_of_processes=instruction['number_of_processes'],
            label_configuration=LabelConfiguration([label]),
            hp_setting=hp_setting,
            store_encoded_data=instruction['store_encoded_data'])
        return ml_application
Exemple #7
0
 def test_parse_reports(self):
     """ReportParser.parse_reports should add the parsed report to the symbol table."""
     report_specs = {"r1": {"SequenceLengthDistribution": {}}}

     symbol_table, specs = ReportParser.parse_reports(report_specs, SymbolTable())

     self.assertTrue(symbol_table.contains("r1"))
     self.assertIsInstance(symbol_table.get("r1"), SequenceLengthDistribution)
 def _prepare_reports(self, reports: list, symbol_table: SymbolTable) -> dict:
     if reports is not None:
         ParameterValidator.assert_type_and_value(reports, list, TrainMLModelParser.__name__, "reports")
         report_objects = {report_id: symbol_table.get(report_id) for report_id in reports}
         ParameterValidator.assert_all_type_and_value(report_objects.values(), TrainMLModelReport, TrainMLModelParser.__name__, 'reports')
         return report_objects
     else:
         return {}
    def _prepare_params(self, analysis: dict, symbol_table: SymbolTable) -> dict:
        """Assemble the parameters of one exploratory analysis.

        Args:
            analysis: specification of the analysis; ``dataset`` and
                ``report`` are read directly, the remaining keys are handled
                by ``_prepare_optional_params``.
            symbol_table: used to resolve the dataset and report keys.

        Returns:
            dict with ``dataset``, a deep copy of the report under ``report``,
            and whatever ``_prepare_optional_params`` returns.
        """
        allowed_keys = ["dataset", "report", "preprocessing_sequence", "labels", "encoding", "number_of_processes"]
        ParameterValidator.assert_keys(list(analysis.keys()), allowed_keys, "ExploratoryAnalysisParser", "analysis", False)

        dataset = symbol_table.get(analysis["dataset"])
        # The report is copied so the object in the symbol table is not shared.
        report = copy.deepcopy(symbol_table.get(analysis["report"]))

        params = {"dataset": dataset, "report": report}
        params.update(self._prepare_optional_params(analysis, symbol_table))
        return params
    def _prepare_optional_params(self, analysis: dict, symbol_table: SymbolTable) -> dict:

        params = {}
        dataset = symbol_table.get(analysis["dataset"])

        if "encoding" in analysis:
            params["encoder"] = symbol_table.get(analysis["encoding"]).build_object(dataset, **symbol_table.get_config(analysis["encoding"])["encoder_params"])
            params["label_config"] = LabelConfiguration()

            if "labels" in analysis:
                for label in analysis["labels"]:
                    label_values = self._get_label_values(label, dataset)
                    params["label_config"].add_label(label, label_values)

        if "preprocessing_sequence" in analysis:
            params["preprocessing_sequence"] = symbol_table.get(analysis["preprocessing_sequence"])

        if "number_of_processes" in analysis:
            params["number_of_processes"] = analysis["number_of_processes"]

        return params
    def test_parse_simulation(self):
        """SimulationParser.parse_simulations should register a simulation with one implanting."""
        implanting_spec = {
            "signals": ["signal1"],
            "dataset_implanting_rate": 0.5,
            "repertoire_implanting_rate": 0.1,
        }
        simulation = {"sim1": {"var1": implanting_spec}}

        # The signal named in the specification must exist in the symbol table.
        symbol_table = SymbolTable()
        motif = Motif("motif1", GappedKmerInstantiation(position_weights={0: 1}), seed="CAS")
        symbol_table.add("motif1", SymbolType.MOTIF, motif)

        implanting_strategy = HealthySequenceImplanting(GappedMotifImplanting(),
                                                        implanting_computation=ImplantingComputation.ROUND)
        signal = Signal("signal1", [symbol_table.get("motif1")], implanting_strategy)
        symbol_table.add("signal1", SymbolType.SIGNAL, signal)

        symbol_table, specs = SimulationParser.parse_simulations(simulation, symbol_table)

        self.assertTrue(symbol_table.contains("sim1"))
        parsed_simulation = symbol_table.get("sim1")
        self.assertEqual(1, len(parsed_simulation.implantings))
    def parse(self,
              key: str,
              instruction: dict,
              symbol_table: SymbolTable,
              path: Path = None) -> DatasetExportInstruction:
        """Validate a DatasetExport specification and build the instruction.

        Args:
            key: name of the instruction, used as the validation location.
            instruction: specification; keys are checked against
                DatasetExportParser.REQUIRED_KEYS and OPTIONAL_KEYS.
            symbol_table: used to list the known datasets and to resolve the
                datasets and the optional preprocessing sequence.
            path: not used by this method.

        Returns:
            the DatasetExportInstruction named ``key``.
        """
        location = "DatasetExportParser"
        given_keys = list(instruction.keys())
        allowed_keys = DatasetExportParser.REQUIRED_KEYS + DatasetExportParser.OPTIONAL_KEYS
        ParameterValidator.assert_keys(given_keys, allowed_keys, location, key, False)
        ParameterValidator.assert_keys_present(
            list(instruction.keys()), DatasetExportParser.REQUIRED_KEYS,
            location, key)

        supported_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(
            DataExporter, "Exporter", 'dataset_export/')
        ParameterValidator.assert_all_in_valid_list(
            instruction["export_formats"], supported_formats, location,
            "export_formats")

        known_datasets = symbol_table.get_keys_by_type(SymbolType.DATASET)
        ParameterValidator.assert_all_in_valid_list(
            instruction["datasets"], known_datasets, location, "datasets")

        datasets = [symbol_table.get(dataset_key) for dataset_key in instruction["datasets"]]

        # A dedicated loop variable keeps the instruction's own `key` unshadowed.
        exporters = []
        for export_format in instruction["export_formats"]:
            exporters.append(ReflectionHandler.get_class_by_name(f"{export_format}Exporter", "dataset_export/"))

        if "preprocessing_sequence" in instruction:
            preprocessing_sequence = symbol_table.get(instruction["preprocessing_sequence"])
        else:
            preprocessing_sequence = None

        return DatasetExportInstruction(datasets=datasets,
                                        exporters=exporters,
                                        preprocessing_sequence=preprocessing_sequence,
                                        name=key)
Exemple #13
0
    def parse(self,
              key: str,
              instruction: dict,
              symbol_table: SymbolTable,
              path: Path = None) -> SubsamplingInstruction:
        """Validate a Subsampling specification and build the instruction.

        Args:
            key: name of the instruction, used as prefix of validation locations.
            instruction: specification with the keys ``type``, ``dataset``,
                ``subsampled_dataset_sizes`` and ``dataset_export_formats``.
            symbol_table: used to list the known datasets and to resolve the
                chosen one.
            path: not used by this method.

        Returns:
            the SubsamplingInstruction named ``key``.
        """
        location = SubsamplingParser.__name__
        ParameterValidator.assert_keys(
            instruction.keys(),
            ["type", "dataset", "subsampled_dataset_sizes", "dataset_export_formats"],
            location, key)

        known_datasets = symbol_table.get_keys_by_type(SymbolType.DATASET)
        ParameterValidator.assert_in_valid_list(instruction['dataset'], known_datasets,
                                                location, f'{key}/dataset')
        dataset = symbol_table.get(instruction['dataset'])

        # Sizes are checked against the bounds 1 and the dataset's example count.
        sizes = instruction['subsampled_dataset_sizes']
        sizes_location = f'{key}/subsampled_dataset_sizes'
        ParameterValidator.assert_type_and_value(sizes, list, location, sizes_location)
        ParameterValidator.assert_all_type_and_value(sizes, int, location, sizes_location,
                                                     1, dataset.get_example_count())

        supported_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(
            DataExporter, 'Exporter', "dataset_export/")
        export_formats = instruction['dataset_export_formats']
        formats_location = f"{key}/dataset_export_formats"
        ParameterValidator.assert_type_and_value(export_formats, list, location, formats_location)
        ParameterValidator.assert_all_in_valid_list(export_formats, supported_formats,
                                                    location, formats_location)

        exporter_classes = []
        for export_format in export_formats:
            exporter_classes.append(
                ReflectionHandler.get_class_by_name(export_format + "Exporter", "dataset_export/"))

        return SubsamplingInstruction(dataset=dataset,
                                      subsampled_dataset_sizes=sizes,
                                      dataset_export_formats=exporter_classes,
                                      name=key)
Exemple #14
0
    def _parse_simulation(key: str, simulation: dict, symbol_table: SymbolTable) -> SymbolTable:
        """Parse one simulation specification and add it to the symbol table.

        Args:
            key: name under which the Simulation is added to the symbol table.
            simulation: dict mapping implanting names to their settings; each
                settings dict is passed to Implanting as keyword arguments
                after its ``signals`` keys are replaced by the objects from
                the symbol table and ``name`` is set to the implanting name.
            symbol_table: used to resolve signal keys; receives the new
                Simulation.

        Returns:
            the symbol table with the Simulation added under ``key``.

        Raises:
            AssertionError: if the dataset implanting rates of all implantings
                sum to more than 1.
        """

        location = "SimulationParser"
        valid_implanting_keys = ["dataset_implanting_rate", "repertoire_implanting_rate", "signals", "is_noise"]
        implantings = []

        for impl_key, implanting in simulation.items():

            ParameterValidator.assert_keys(implanting.keys(), valid_implanting_keys, location, impl_key, exclusive=False)
            ParameterValidator.assert_keys(implanting["signals"], symbol_table.get_keys_by_type(SymbolType.SIGNAL), location, impl_key, False)

            # Work on a copy so the specification passed in is left unchanged.
            implanting_params = copy.deepcopy(implanting)
            implanting_params["signals"] = [symbol_table.get(signal) for signal in implanting["signals"]]
            implanting_params["name"] = impl_key

            implantings.append(Implanting(**implanting_params))

        total_dataset_implanting_rate = sum(settings["dataset_implanting_rate"] for settings in simulation.values())
        # Raised explicitly rather than via `assert`, so that the check is not
        # removed when Python runs with -O; type and message stay the same.
        if not total_dataset_implanting_rate <= 1:
            raise AssertionError("The total dataset implanting rate can not exceed 1.")

        symbol_table.add(key, SymbolType.SIMULATION, Simulation(implantings))

        return symbol_table
Exemple #15
0
    def parse_signals(signals: dict, symbol_table: SymbolTable):
        """Create a Signal for every specification and add it to the symbol table.

        Args:
            signals: dict mapping signal names to their specifications; each
                specification lists motif keys under ``motifs``.
            symbol_table: used to resolve motif keys; receives the new signals.

        Returns:
            tuple of the updated symbol table and the ``signals`` dict as given.
        """
        for signal_id, spec in signals.items():
            ParameterValidator.assert_keys_present(spec.keys(), SignalParser.VALID_KEYS,
                                                   "SignalParser", signal_id)

            strategy = SignalParser._get_implanting_strategy(signal_id, spec)

            # The motifs named by the signal are checked against the symbol table.
            motif_ids = spec["motifs"]
            known_motifs = symbol_table.get_keys_by_type(SymbolType.MOTIF)
            ParameterValidator.assert_keys(motif_ids, known_motifs, "SignalParser",
                                           f"motifs in signal {signal_id}", False)

            motifs = []
            for motif_id in spec["motifs"]:
                motifs.append(symbol_table.get(motif_id))

            symbol_table.add(signal_id, SymbolType.SIGNAL, Signal(signal_id, motifs, strategy))

        return symbol_table, signals
    def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: Path = None) -> TrainMLModelInstruction:
        """Validate a TrainMLModel specification and build the instruction.

        Args:
            key: name of the instruction.
            instruction: the TrainMLModel specification.
            symbol_table: used to resolve the dataset, settings and reports.
            path: ignored; the path handed to the instruction comes from
                ``_prepare_path``.

        Returns:
            the TrainMLModelInstruction named ``key``.
        """
        location = TrainMLModelParser.__name__
        valid_keys = ["assessment", "selection", "dataset", "strategy", "labels", "metrics", "settings", "number_of_processes", "type", "reports",
                      "optimization_metric", 'refit_optimal_model', 'store_encoded_data']

        # `settings` is type-checked before the key check, as in the original order.
        ParameterValidator.assert_type_and_value(instruction['settings'], list, location, 'settings')
        ParameterValidator.assert_keys(list(instruction.keys()), valid_keys, location, "TrainMLModel")

        expected_types = (
            ('refit_optimal_model', bool),
            ('metrics', list),
            ('optimization_metric', str),
            ('number_of_processes', int),
            ('strategy', str),
            ('store_encoded_data', bool),
        )
        for param_name, param_type in expected_types:
            ParameterValidator.assert_type_and_value(instruction[param_name], param_type, location, param_name)

        if instruction["reports"] is not None:
            ParameterValidator.assert_type_and_value(instruction['reports'], list, location, 'reports')

        hp_settings = self._parse_settings(instruction, symbol_table)
        dataset = symbol_table.get(instruction["dataset"])

        setting_count = len(hp_settings)
        assessment = self._parse_split_config(key, instruction, "assessment", symbol_table, setting_count)
        selection = self._parse_split_config(key, instruction, "selection", symbol_table, setting_count)
        assessment, selection = self._update_split_configs(assessment, selection, dataset)

        label_config = self._create_label_config(instruction, dataset, key)
        strategy_class = ReflectionHandler.get_class_by_name(instruction["strategy"], "hyperparameter_optimization/")

        metrics = set(Metric[metric_name.upper()] for metric_name in instruction["metrics"])
        optimization_metric = Metric[instruction["optimization_metric"].upper()]
        search_criterion = Metric.get_search_criterion(optimization_metric)

        path = self._prepare_path(instruction)
        context = self._prepare_context(instruction, symbol_table)
        reports = self._prepare_reports(instruction["reports"], symbol_table)

        return TrainMLModelInstruction(dataset=dataset,
                                       hp_strategy=strategy_class(hp_settings, search_criterion),
                                       hp_settings=hp_settings,
                                       assessment=assessment,
                                       selection=selection,
                                       metrics=metrics,
                                       optimization_metric=optimization_metric,
                                       refit_optimal_model=instruction['refit_optimal_model'],
                                       label_configuration=label_config,
                                       path=path,
                                       context=context,
                                       store_encoded_data=instruction['store_encoded_data'],
                                       number_of_processes=instruction["number_of_processes"],
                                       reports=reports,
                                       name=key)
 def _prepare_context(self, instruction: dict, symbol_table: SymbolTable):
     return {"dataset": symbol_table.get(instruction["dataset"])}