Beispiel #1
0
    def test_get_optimal_hps(self):
        """After reporting 0.7 for the first setting and 0.8 for the second, the second one is expected as optimal."""
        kmer_setting = HPSetting(encoder=KmerFrequencyEncoder, encoder_params={}, encoder_name="e1",
                                 ml_method=LogisticRegression(),
                                 ml_params={"model_selection_cv": False, "model_selection_n_fold": -1},
                                 ml_method_name="ml1", preproc_sequence=[])
        word2vec_setting = HPSetting(encoder=Word2VecEncoder, encoder_params={}, encoder_name='e2',
                                     ml_method=LogisticRegression(),
                                     ml_params={"model_selection_cv": False, "model_selection_n_fold": -1},
                                     ml_method_name="ml2", preproc_sequence=[])
        hp_settings = [kmer_setting, word2vec_setting]

        grid_search = GridSearch(hp_settings)

        # Walk through the grid: every call reports the metric of the previously returned setting.
        first = grid_search.generate_next_setting()
        second = grid_search.generate_next_setting(first, 0.7)
        grid_search.generate_next_setting(second, 0.8)

        self.assertEqual(word2vec_setting, grid_search.get_optimal_hps())
Beispiel #2
0
    def test_generate_next_setting(self):
        """The grid is expected to yield the k-mer setting, then the Word2Vec setting, then None."""
        kmer_setting = HPSetting(encoder=KmerFrequencyEncoder, encoder_params={}, encoder_name="enc1",
                                 ml_method=LogisticRegression(),
                                 ml_params={"model_selection_cv": False, "model_selection_n_fold": -1},
                                 ml_method_name="ml1", preproc_sequence=[])
        word2vec_setting = HPSetting(encoder=Word2VecEncoder, encoder_params={}, encoder_name="enc2",
                                     ml_method=LogisticRegression(),
                                     ml_params={"model_selection_cv": False, "model_selection_n_fold": -1},
                                     ml_method_name="ml2", preproc_sequence=[])

        grid_search = GridSearch([kmer_setting, word2vec_setting])

        # Every call after the first one reports the metric of the previously returned setting.
        first = grid_search.generate_next_setting()
        second = grid_search.generate_next_setting(first, 0.7)
        third = grid_search.generate_next_setting(second, 0.8)

        self.assertIsNone(third)
        self.assertEqual(KmerFrequencyEncoder, first.encoder)
        self.assertEqual(Word2VecEncoder, second.encoder)
    def test_run(self):
        """Run MLApplicationInstruction with a fitted logistic regression and check that predictions.csv has 50 rows."""
        example_count = 50
        path = EnvironmentSettings.tmp_test_path / "mlapplicationtest/"
        PathBuilder.build(path)

        dataset = RandomDatasetGenerator.generate_repertoire_dataset(example_count, {5: 1}, {5: 1},
                                                                     {"l1": {1: 0.5, 2: 0.5}}, path / 'dataset/')
        ml_method = LogisticRegression()
        encoder = KmerFreqRepertoireEncoder(NormalizationType.RELATIVE_FREQUENCY, ReadsType.UNIQUE,
                                            SequenceEncodingType.CONTINUOUS_KMER, 3,
                                            scale_to_zero_mean=True, scale_to_unit_variance=True)
        label_config = LabelConfiguration([Label("l1", [1, 2])])

        # Encode the dataset and fit the classifier on label l1 before handing both to the instruction.
        encoding_run_params = EncoderParams(result_path=path, label_config=label_config,
                                            filename="tmp_enc_dataset.pickle", pool_size=4)
        encoded_dataset = encoder.encode(dataset, encoding_run_params)
        ml_method.fit(encoded_dataset.encoded_data, 'l1')

        encoder_description = {"normalization_type": "relative_frequency", "reads": "unique",
                               "sequence_encoding": "continuous_kmer", "k": 3,
                               "scale_to_zero_mean": True, "scale_to_unit_variance": True}
        hp_setting = HPSetting(encoder, encoder_description, ml_method, {}, [], 'enc1', 'ml1')

        # Copy the vectorizer and scaler pickles from the test root into the instruction's result directory.
        PathBuilder.build(path / 'result/instr1/')
        shutil.copy(path / 'dict_vectorizer.pickle', path / 'result/instr1/dict_vectorizer.pickle')
        shutil.copy(path / 'scaler.pickle', path / 'result/instr1/scaler.pickle')

        instruction = MLApplicationInstruction(dataset, label_config, hp_setting, 4, "instr1", False)
        instruction.run(path / 'result/')

        predictions_path = path / "result/instr1/predictions.csv"
        self.assertTrue(os.path.isfile(predictions_path))

        predictions = pd.read_csv(predictions_path)
        self.assertEqual(example_count, predictions.shape[0])

        shutil.rmtree(path)
Beispiel #4
0
    def import_hp_setting(config_dir: Path) -> Tuple[HPSetting, Label]:
        """Rebuild a hyperparameter setting and its label from an exported configuration directory.

        Loads ``ml_config.yaml`` from ``config_dir``, instantiates the ML method class named in it and lets it
        load its state from the same directory, then imports the encoder and the preprocessing sequence.

        Args:
            config_dir: directory containing ``ml_config.yaml`` and whatever the ML method, encoder and
                preprocessing importers read from it.

        Returns:
            A tuple of the assembled HPSetting (with empty ``ml_params``) and the single Label described in
            the configuration.

        Raises:
            AssertionError: if the configuration does not describe exactly one label.
        """
        config = MLMethodConfiguration()
        config.load(config_dir / 'ml_config.yaml')

        ml_method_class = ReflectionHandler.get_class_by_name(config.ml_method, 'ml_methods/')
        ml_method = ml_method_class()
        ml_method.load(config_dir)

        encoder = MLImport.import_encoder(config, config_dir)
        preprocessing_sequence = MLImport.import_preprocessing_sequence(config, config_dir)

        labels = list(config.labels_with_values.keys())
        # Raised explicitly (instead of using an `assert` statement) so that the check also runs under
        # `python -O`; the exception type is kept for callers that catch AssertionError.
        if len(labels) != 1:
            raise AssertionError(f"MLImport: expected exactly one label in a single ml_config file, "
                                 f"but found {len(labels)}: {labels}.")

        label_name = labels[0]
        label = Label(label_name, config.labels_with_values[label_name])

        hp_setting = HPSetting(encoder=encoder,
                               encoder_params=config.encoding_parameters,
                               encoder_name=config.encoding_name,
                               ml_method=ml_method,
                               ml_method_name=config.ml_method_name,
                               ml_params={},
                               preproc_sequence=preprocessing_sequence,
                               preproc_sequence_name=config.preprocessing_sequence_name)

        return hp_setting, label
Beispiel #5
0
    def test(self):
        """Run TrainMLModelInstruction on a random sequence dataset with a k-mer frequency encoding (no assertions)."""
        path = EnvironmentSettings.tmp_test_path / "integration_sequence_classification/"
        dataset = RandomDatasetGenerator.generate_sequence_dataset(50, {4: 1}, {'l1': {1: 0.5, 2: 0.5}}, path / 'data')

        os.environ["cache_type"] = "test"

        encoder_params = {"normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
                          "reads": ReadsType.UNIQUE.name,
                          "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
                          "sequence_type": SequenceType.AMINO_ACID.name,
                          "k": 3}
        encoder = KmerFrequencyEncoder.build_object(dataset, **encoder_params)
        hp_setting = HPSetting(encoder=encoder,
                               encoder_params=encoder_params,
                               ml_method=LogisticRegression(),
                               ml_params={"model_selection_cv": False, "model_selection_n_folds": -1},
                               preproc_sequence=[])

        label_config = LabelConfiguration()
        label_config.add_label("l1", [1, 2])

        # The same single setting serves as the search space and as the list of settings.
        grid_search = GridSearch([hp_setting])
        assessment_split = SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig())
        selection_split = SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig())

        instruction = TrainMLModelInstruction(dataset, grid_search, [hp_setting], assessment_split, selection_split,
                                              {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, label_config, path)
        instruction.run(result_path=path)

        shutil.rmtree(path)
Beispiel #6
0
    def _parse_settings(self, instruction: dict,
                        symbol_table: SymbolTable) -> list:
        """Build one HPSetting for every entry listed under "settings" in the instruction.

        For each entry, the optional preprocessing sequence is looked up in the symbol table and rejected if
        any of its steps reports that it does not keep the example count. The entry's keys are then
        validated, the encoder is built for the instruction's dataset, and the ML method is asked to check
        its compatibility with that encoder.

        Args:
            instruction: instruction specification; the keys "settings" and "dataset" are read.
            symbol_table: used to resolve the preprocessing, encoding, ML method and dataset names.

        Returns:
            The list of HPSetting objects in the order of ``instruction["settings"]``.

        Raises:
            KeyError: if a referenced preprocessing is not in the symbol table (raised with its own detailed
                message), or if any other key lookup fails while parsing (wrapped in a generic message,
                chained to the original error).
            ValueError: if the preprocessing sequence contains a step for which ``keeps_example_count()``
                is falsy.
        """
        parser_name = TrainMLModelParser.__name__
        try:
            settings = []
            for index, setting in enumerate(instruction["settings"]):
                if "preprocessing" in setting and setting["preprocessing"] is not None:
                    preproc_name = setting["preprocessing"]
                    ParameterValidator.assert_type_and_value(
                        preproc_name, str, parser_name,
                        f'settings: {index+1}. element: preprocessing')
                    if not symbol_table.contains(preproc_name):
                        raise KeyError(
                            f"{parser_name}: preprocessing was set in the TrainMLModel instruction to value "
                            f"{preproc_name}, but no such preprocessing was defined in the specification under "
                            f"definitions: {PreprocessingParser.keyword}.")
                    preprocessing_sequence = symbol_table.get(preproc_name)
                    if not all(preproc.keeps_example_count() for preproc in preprocessing_sequence):
                        raise ValueError(
                            f"{parser_name}: preprocessing sequence {preproc_name} includes preprocessing that "
                            f"change the number of examples at runtime and as such cannot be used with this instruction. See the "
                            f"documentation for the preprocessing or alternatively use them with other instructions."
                        )
                else:
                    # Normalise a missing or null preprocessing entry so that the key check below passes.
                    setting["preprocessing"] = None
                    preprocessing_sequence = []
                    preproc_name = None

                ParameterValidator.assert_keys(
                    setting.keys(), ["preprocessing", "ml_method", "encoding"],
                    parser_name, f"settings, {index + 1}. entry")

                # Resolve each symbol once; the same values feed both the encoder and the HPSetting.
                encoder_class = symbol_table.get(setting["encoding"])
                dataset = symbol_table.get(instruction["dataset"])
                encoder_params = symbol_table.get_config(setting["encoding"])["encoder_params"]
                encoder = encoder_class.build_object(dataset, **encoder_params).set_context({"dataset": dataset})

                ml_method = symbol_table.get(setting["ml_method"])
                ml_method.check_encoder_compatibility(encoder)

                settings.append(HPSetting(encoder=encoder,
                                          encoder_name=setting["encoding"],
                                          encoder_params=encoder_params,
                                          ml_method=ml_method,
                                          ml_method_name=setting["ml_method"],
                                          ml_params=symbol_table.get_config(setting["ml_method"]),
                                          preproc_sequence=preprocessing_sequence,
                                          preproc_sequence_name=preproc_name))
            return settings
        except KeyError as key_error:
            missing = key_error.args[0] if key_error.args else ""
            # A KeyError raised above already carries a complete, prefixed message; wrapping it again
            # would bury that message inside the generic "parameter ... was not defined" text.
            if isinstance(missing, str) and missing.startswith(f"{parser_name}:"):
                raise
            raise KeyError(
                f"{parser_name}: parameter {missing} was not defined under settings in TrainMLModel instruction."
            ) from key_error
Beispiel #7
0
    def test_run(self):
        """Run a SemanticModel holding one TrainMLModelInstruction on 32 alternating repertoires (no assertions)."""
        path = EnvironmentSettings.root_path / "test/tmp/smmodel/"
        PathBuilder.build(path)

        # 32 repertoires alternating between ["AAA", "CCC"] and ["TTTT"], labelled 1, 2, 1, 2, ...
        repertoire_count = 32
        sequences = [["AAA", "CCC"] if index % 2 == 0 else ["TTTT"] for index in range(repertoire_count)]
        label_values = [1, 2] * (repertoire_count // 2)
        repertoires, metadata = RepertoireBuilder.build(sequences, path, {"default": label_values})

        dataset = RepertoireDataset(repertoires=repertoires,
                                    labels={"default": [1, 2]},
                                    metadata_file=metadata)

        label_config = LabelConfiguration()
        label_config.add_label("default", [1, 2])

        word2vec_params = {"vector_size": 8, "model_type": ModelType.SEQUENCE.name, "k": 3}
        hp_settings = [
            HPSetting(Word2VecEncoder.build_object(dataset, **word2vec_params), word2vec_params,
                      LogisticRegression(),
                      {"model_selection_cv": False, "model_selection_n_folds": -1},
                      [])
        ]

        split_config_assessment = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())
        split_config_selection = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())

        instruction = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                              split_config_assessment, split_config_selection,
                                              {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                              label_config, path)

        SemanticModel([instruction], path).run()

        shutil.rmtree(path)
Beispiel #8
0
    def generate_next_setting(self, hp_setting: HPSetting = None, metric: float = None) -> HPSetting:
        """Record the metric of a finished setting and hand out the next setting without a metric.

        Args:
            hp_setting: setting whose metric is being reported; nothing is recorded when it is None.
            metric: value stored in ``search_space_metric`` under the key of ``hp_setting``.

        Returns:
            A deep copy of the first setting (in ``search_space_metric`` order) whose stored metric is
            still None, or None when every setting has a metric.
        """
        if hp_setting is not None:
            self.search_space_metric[hp_setting.get_key()] = metric

        for key, recorded_metric in self.search_space_metric.items():
            if recorded_metric is None:
                # Copy so that the caller cannot modify the setting stored in the search space.
                return copy.deepcopy(self.hp_settings[key])

        return None
Beispiel #9
0
    def _parse_settings(self, instruction: dict,
                        symbol_table: SymbolTable) -> list:
        """Build one HPSetting for every entry listed under "settings" in the instruction.

        For each entry, the optional preprocessing sequence is looked up in the symbol table, the entry's
        keys are validated, and the encoder is built for the instruction's dataset.

        Args:
            instruction: instruction specification; the keys "settings" and "dataset" are read.
            symbol_table: used to resolve the preprocessing, encoding, ML method and dataset names.

        Returns:
            The list of HPSetting objects in the order of ``instruction["settings"]``.

        Raises:
            KeyError: if a referenced preprocessing is not in the symbol table (raised with its own detailed
                message), or if any other key lookup fails while parsing (wrapped in a generic message,
                chained to the original error).
        """
        parser_name = TrainMLModelParser.__name__
        try:
            settings = []
            for index, setting in enumerate(instruction["settings"]):
                if "preprocessing" in setting:
                    preproc_name = setting["preprocessing"]
                    ParameterValidator.assert_type_and_value(
                        preproc_name, str, parser_name,
                        f'settings: {index+1}. element: preprocessing')
                    if not symbol_table.contains(preproc_name):
                        raise KeyError(
                            f"{parser_name}: preprocessing was set in the TrainMLModel instruction to value "
                            f"{preproc_name}, but no such preprocessing was defined in the specification under "
                            f"definitions: {PreprocessingParser.keyword}.")
                    preprocessing_sequence = symbol_table.get(preproc_name)
                else:
                    # Normalise a missing preprocessing entry so that the key check below passes.
                    setting["preprocessing"] = None
                    preprocessing_sequence = []
                    preproc_name = None

                ParameterValidator.assert_keys(
                    setting.keys(), ["preprocessing", "ml_method", "encoding"],
                    parser_name, f"settings, {index + 1}. entry")

                # Resolve each symbol once; the same values feed both the encoder and the HPSetting.
                encoder_class = symbol_table.get(setting["encoding"])
                dataset = symbol_table.get(instruction["dataset"])
                encoder_params = symbol_table.get_config(setting["encoding"])["encoder_params"]
                encoder = encoder_class.build_object(dataset, **encoder_params).set_context({"dataset": dataset})

                settings.append(HPSetting(encoder=encoder,
                                          encoder_name=setting["encoding"],
                                          encoder_params=encoder_params,
                                          ml_method=symbol_table.get(setting["ml_method"]),
                                          ml_method_name=setting["ml_method"],
                                          ml_params=symbol_table.get_config(setting["ml_method"]),
                                          preproc_sequence=preprocessing_sequence,
                                          preproc_sequence_name=preproc_name))
            return settings
        except KeyError as key_error:
            missing = key_error.args[0] if key_error.args else ""
            # A KeyError raised above already carries a complete, prefixed message; wrapping it again
            # would bury that message inside the generic "parameter ... was not defined" text.
            if isinstance(missing, str) and missing.startswith(f"{parser_name}:"):
                raise
            raise KeyError(
                f"{parser_name}: parameter {missing} was not defined under settings in TrainMLModel instruction."
            ) from key_error
    def _create_state_object(self, path):
        """Run TrainMLModelInstruction on 34 identical repertoires with labels l1 and l2 and return its result.

        Args:
            path: directory passed to the repertoire builder, to the instruction and as its result path.

        Returns:
            Whatever ``TrainMLModelInstruction.run`` returns for this setup.
        """
        repertoire_count = 34
        sequences = [["AAA", "CCC", "DDD"] for _ in range(repertoire_count)]
        label_values = {
            # l1 alternates 1, 2 over all repertoires
            "l1": [1, 2] * 17,
            # l2 starts with four blocks of 0, 0, 1, 1 and continues alternating 0, 1
            "l2": [0, 0, 1, 1] * 4 + [0, 1] * 9,
        }
        repertoires, metadata = RepertoireBuilder.build(sequences=sequences, path=path, labels=label_values)

        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                    labels={"l1": [1, 2], "l2": [0, 1]})

        enc_params = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
        word2vec_setting = HPSetting(Word2VecEncoder.build_object(dataset, **enc_params), enc_params,
                                     LogisticRegression(),
                                     {"model_selection_cv": False, "model_selection_n_folds": -1},
                                     [])
        hp_settings = [word2vec_setting]

        label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])

        grid_search = GridSearch(hp_settings)
        assessment_split = SplitConfig(SplitType.RANDOM, 1, 0.7)
        selection_split = SplitConfig(SplitType.RANDOM, 1, 0.7)
        process = TrainMLModelInstruction(dataset, grid_search, hp_settings, assessment_split, selection_split,
                                          {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, label_config, path)

        return process.run(result_path=path)
    def test(self):
        """Train on the receptor dataset with a k-mer frequency encoding and expect a performance of 1.0 for l1."""
        path = EnvironmentSettings.tmp_test_path / "integration_receptor_classification/"
        dataset = self.create_dataset(path)

        os.environ["cache_type"] = "test"

        encoder_params = {"normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
                          "reads": ReadsType.UNIQUE.name,
                          "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
                          "sequence_type": SequenceType.AMINO_ACID.name,
                          "k": 3}
        encoder = KmerFrequencyEncoder.build_object(dataset, **encoder_params)
        hp_setting = HPSetting(encoder=encoder,
                               encoder_params=encoder_params,
                               ml_method=LogisticRegression(),
                               ml_params={"model_selection_cv": False, "model_selection_n_folds": -1},
                               preproc_sequence=[])

        label_config = LabelConfiguration()
        label_config.add_label("l1", [1, 2])

        # The same single setting serves as the search space and as the list of settings.
        grid_search = GridSearch([hp_setting])
        assessment_split = SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig())
        selection_split = SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig())
        instruction = TrainMLModelInstruction(dataset, grid_search, [hp_setting], assessment_split, selection_split,
                                              {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, label_config, path)

        state = instruction.run(result_path=path)
        print(vars(state))

        # Performance of the optimal assessment item for l1, looked up under the lower-cased metric name.
        optimal_item = state.assessment_states[0].label_states["l1"].optimal_assessment_item
        metric_key = state.optimization_metric.name.lower()
        self.assertEqual(1.0, optimal_item.performance[metric_key])

        shutil.rmtree(path)
Beispiel #12
0
 def get_performance(self, hp_setting: HPSetting):
     """Return the metric stored for the given setting, or None if its key is not in ``search_space_metric``."""
     return self.search_space_metric.get(hp_setting.get_key())
    def test_generate(self):
        """Generate the CVFeaturePerformance report for "p_value_threshold" on a hand-built state.

        The first call is made with three settings and no assessment states and is expected to emit a
        RuntimeWarning. After the settings are cut down to the first two and random accuracies are filled
        in, the report is expected to produce two tables and one figure that exist on disk.
        """
        path = EnvironmentSettings.tmp_test_path / "cv_feature_performance/"

        def random_item(setting):
            # HPItem whose accuracy is drawn from random.uniform(0.5, 1)
            return HPItem(performance={'accuracy': random.uniform(0.5, 1)}, hp_setting=setting)

        assessment_split = SplitConfig(split_count=5, split_strategy=SplitType.K_FOLD)
        selection_split = SplitConfig(split_count=10, split_strategy=SplitType.K_FOLD)
        label_configuration = LabelConfiguration(labels=[Label(name="CMV", values=[True, False])])
        hp_settings = [
            HPSetting(encoder_params={"p_value_threshold": 0.001}, encoder_name="e1",
                      encoder=SequenceAbundanceEncoder([], 0, 0, 0), preproc_sequence=[],
                      ml_method_name="ml1", ml_method=ProbabilisticBinaryClassifier(10, 0.1), ml_params={}),
            HPSetting(encoder_params={"p_value_threshold": 0.01}, encoder_name="e2",
                      encoder=SequenceAbundanceEncoder([], 0, 0, 0), preproc_sequence=[],
                      ml_method_name="ml1", ml_method=ProbabilisticBinaryClassifier(10, 0.1), ml_params={}),
            HPSetting(encoder_params={"p_value_threshold": 0.01},
                      encoder=SequenceAbundanceEncoder([], 0, 0, 0), preproc_sequence=[],
                      ml_method=ProbabilisticBinaryClassifier(10, 0.01), ml_params={}),
        ]

        state = TrainMLModelState(assessment=assessment_split,
                                  selection=selection_split,
                                  optimization_metric=Metric.ACCURACY,
                                  label_configuration=label_configuration,
                                  hp_settings=hp_settings,
                                  dataset=None,
                                  hp_strategy=None,
                                  metrics=None)

        report = CVFeaturePerformance("p_value_threshold", state, path,
                                      is_feature_axis_categorical=True, name="report1")
        with self.assertWarns(RuntimeWarning):
            report.generate_report()

        # Continue with the first two settings only and give every split a randomly filled result.
        state.hp_settings = state.hp_settings[:2]
        state.assessment_states = [HPAssessmentState(split_index, None, None, None, state.label_configuration)
                                   for split_index in range(state.assessment.split_count)]

        for assessment_state in state.assessment_states:
            label_state = HPLabelState("CMV", [])
            assessment_state.label_states["CMV"] = label_state
            label_state.assessment_items = {setting.get_key(): random_item(setting)
                                            for setting in state.hp_settings}
            label_state.selection_state = HPSelectionState([], [], "", GridSearch(state.hp_settings))
            label_state.selection_state.hp_items = {
                str(setting): [random_item(setting) for _ in range(state.selection.split_count)]
                for setting in state.hp_settings
            }

        report.state = state

        report_result = report.generate_report()

        self.assertTrue(isinstance(report_result, ReportResult))
        self.assertEqual(2, len(report_result.output_tables))
        self.assertEqual(1, len(report_result.output_figures))
        for output in (report_result.output_figures[0],
                       report_result.output_tables[0],
                       report_result.output_tables[1]):
            self.assertTrue(os.path.isfile(output.path))

        shutil.rmtree(path)
Beispiel #14
0
    def test_run(self):
        """Run TrainMLModelInstruction with two settings and two labels and inspect the returned state.

        The settings are a Word2Vec encoding with LogisticRegression and a Word2Vec encoding with SVM that is
        preceded by a ClonesPerRepertoireFilter. The state is expected to be a TrainMLModelState with one
        assessment state that holds label states for both l1 and l2.
        """
        path = EnvironmentSettings.tmp_test_path / "hpoptimproc/"
        PathBuilder.build(path)

        repertoire_count = 34
        sequences = [["AAA", "CCC", "DDD"] for _ in range(repertoire_count)]
        label_values = {
            # l1 alternates 1, 2 over all repertoires
            "l1": [1, 2] * 17,
            # l2 starts with four blocks of 0, 0, 1, 1 and continues alternating 0, 1
            "l2": [0, 0, 1, 1] * 4 + [0, 1] * 9,
        }
        repertoires, metadata = RepertoireBuilder.build(sequences=sequences, path=path, labels=label_values)

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    labels={"l1": [1, 2], "l2": [0, 1]})

        enc1 = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
        enc2 = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 6}
        logistic_setting = HPSetting(Word2VecEncoder.build_object(dataset, **enc1), enc1,
                                     LogisticRegression(),
                                     {"model_selection_cv": False, "model_selection_n_folds": -1},
                                     [])
        svm_setting = HPSetting(Word2VecEncoder.build_object(dataset, **enc2), enc2,
                                SVM(),
                                {"model_selection_cv": False, "model_selection_n_folds": -1},
                                [ClonesPerRepertoireFilter(lower_limit=-1, upper_limit=1000)])
        hp_settings = [logistic_setting, svm_setting]

        report = SequenceLengthDistribution()
        label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])

        # Both split configurations run the same sequence length report on their data splits.
        grid_search = GridSearch(hp_settings)
        assessment_split = SplitConfig(SplitType.RANDOM, 1, 0.5,
                                       reports=ReportConfig(data_splits={"seqlen": report}))
        selection_split = SplitConfig(SplitType.RANDOM, 1, 0.5,
                                      reports=ReportConfig(data_splits={"seqlen": report}))
        process = TrainMLModelInstruction(dataset, grid_search, hp_settings, assessment_split, selection_split,
                                          {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, label_config, path)

        state = process.run(result_path=path)

        self.assertTrue(isinstance(state, TrainMLModelState))
        self.assertEqual(1, len(state.assessment_states))
        label_states = state.assessment_states[0].label_states
        self.assertTrue("l1" in label_states)
        self.assertTrue("l2" in label_states)

        shutil.rmtree(path)