def _create_dummy_lr_model(self, path, encoded_data, label):
    """Fit a throwaway logistic regression on the given encoded data via 2-fold CV and return it."""
    # NOTE(review): `path` is accepted but never used here — presumably kept for
    # signature parity with sibling helpers; confirm before removing.
    model = LogisticRegression()
    model.fit_by_cross_validation(encoded_data, number_of_splits=2, label_name=label)
    return model
def test_generate_next_setting(self):
    """GridSearch yields each HPSetting once, in order, then None when exhausted."""
    shared_ml_params = {"model_selection_cv": False, "model_selection_n_fold": -1}
    hp_settings = [
        HPSetting(encoder=KmerFrequencyEncoder, encoder_params={}, encoder_name="enc1",
                  ml_method=LogisticRegression(), ml_params=dict(shared_ml_params),
                  ml_method_name="ml1", preproc_sequence=[]),
        HPSetting(encoder=Word2VecEncoder, encoder_params={}, encoder_name="enc2",
                  ml_method=LogisticRegression(), ml_params=dict(shared_ml_params),
                  ml_method_name="ml2", preproc_sequence=[]),
    ]
    search = GridSearch(hp_settings)

    first = search.generate_next_setting()
    second = search.generate_next_setting(first, 0.7)
    third = search.generate_next_setting(second, 0.8)

    self.assertIsNone(third)
    self.assertEqual(KmerFrequencyEncoder, first.encoder)
    self.assertEqual(Word2VecEncoder, second.encoder)
def test_get_optimal_hps(self):
    """After exhausting all settings, the setting with the highest reported score is optimal."""
    shared_ml_params = {"model_selection_cv": False, "model_selection_n_fold": -1}
    hp_settings = [
        HPSetting(encoder=KmerFrequencyEncoder, encoder_params={}, encoder_name="e1",
                  ml_method=LogisticRegression(), ml_params=dict(shared_ml_params),
                  ml_method_name="ml1", preproc_sequence=[]),
        HPSetting(encoder=Word2VecEncoder, encoder_params={}, encoder_name='e2',
                  ml_method=LogisticRegression(), ml_params=dict(shared_ml_params),
                  ml_method_name="ml2", preproc_sequence=[]),
    ]
    search = GridSearch(hp_settings)

    # report 0.7 for the first setting and 0.8 for the second
    first = search.generate_next_setting()
    second = search.generate_next_setting(first, 0.7)
    search.generate_next_setting(second, 0.8)

    self.assertEqual(hp_settings[1], search.get_optimal_hps())
def test_run(self):
    """MLApplication end-to-end: train LR on k-mer features, apply the instruction,
    and verify one prediction per repertoire is written."""
    path = EnvironmentSettings.tmp_test_path / "mlapplicationtest/"
    PathBuilder.build(path)

    dataset = RandomDatasetGenerator.generate_repertoire_dataset(
        50, {5: 1}, {5: 1}, {"l1": {1: 0.5, 2: 0.5}}, path / 'dataset/')

    label_config = LabelConfiguration([Label("l1", [1, 2])])
    encoder = KmerFreqRepertoireEncoder(NormalizationType.RELATIVE_FREQUENCY, ReadsType.UNIQUE,
                                        SequenceEncodingType.CONTINUOUS_KMER, 3,
                                        scale_to_zero_mean=True, scale_to_unit_variance=True)
    enc_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config,
                                                        filename="tmp_enc_dataset.pickle", pool_size=4))

    ml_method = LogisticRegression()
    ml_method.fit(enc_dataset.encoded_data, 'l1')

    hp_setting = HPSetting(encoder,
                           {"normalization_type": "relative_frequency", "reads": "unique",
                            "sequence_encoding": "continuous_kmer", "k": 3,
                            "scale_to_zero_mean": True, "scale_to_unit_variance": True},
                           ml_method, {}, [], 'enc1', 'ml1')

    # the instruction expects the fitted vectorizer/scaler inside its own result folder
    PathBuilder.build(path / 'result/instr1/')
    shutil.copy(path / 'dict_vectorizer.pickle', path / 'result/instr1/dict_vectorizer.pickle')
    shutil.copy(path / 'scaler.pickle', path / 'result/instr1/scaler.pickle')

    ml_app = MLApplicationInstruction(dataset, label_config, hp_setting, 4, "instr1", False)
    ml_app.run(path / 'result/')

    predictions_path = path / "result/instr1/predictions.csv"
    self.assertTrue(os.path.isfile(predictions_path))
    self.assertEqual(50, pd.read_csv(predictions_path).shape[0])

    shutil.rmtree(path)
def _create_dummy_lr_model(self):
    """Train a throwaway LR via 2-fold CV on 100 random 20-feature examples with
    alternating binary labels; return both the model and the encoded data."""
    encoded_tr = EncodedData(np.random.rand(100, 20), {"l1": [i % 2 for i in range(0, 100)]})
    model = LogisticRegression()
    model.fit_by_cross_validation(encoded_tr, number_of_splits=2, label=Label("l1", values=[0, 1]))
    return model, encoded_tr
def test_run(self):
    """Assess a fitted LogisticRegression on label l1 and check the returned metrics
    plus the ml_score and per-example prediction files."""
    path = EnvironmentSettings.root_path / "test/tmp/mlmethodassessment/"
    PathBuilder.build(path)

    dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(
        [["AA"], ["CC"]] * 6, path)[0])
    # 12 examples in 2 classes (labelled 1 and 3 for l1)
    dataset.encoded_data = EncodedData(
        examples=np.array([[1, 1], [1, 1], [3, 3]] * 4),
        labels={"l1": [1, 1, 3] * 4,
                "l2": [1, 2, 3] * 4})

    label_config = LabelConfiguration()
    label_config.add_label("l1", [1, 3])

    # BUGFIX: label values must match the classes actually present in the data
    # (1 and 3, as declared in label_config above); the original used values=[1, 2].
    label = Label(name='l1', values=[1, 3])

    method1 = LogisticRegression()
    method1.fit(dataset.encoded_data, label=label)

    res = MLMethodAssessment.run(MLMethodAssessmentParams(
        dataset=dataset,
        method=method1,
        metrics={Metric.ACCURACY, Metric.BALANCED_ACCURACY, Metric.F1_MACRO},
        optimization_metric=Metric.LOG_LOSS,
        predictions_path=path / "predictions.csv",
        label=label,
        ml_score_path=path / "ml_score.csv",
        split_index=1,
        path=path))

    self.assertTrue(isinstance(res, dict))
    self.assertTrue(res[Metric.LOG_LOSS.name.lower()] <= 0.1)

    self.assertTrue(os.path.isfile(path / "ml_score.csv"))
    self.assertEqual(1, pd.read_csv(path / "ml_score.csv").shape[0])
    self.assertEqual(12, pd.read_csv(path / "predictions.csv").shape[0])

    shutil.rmtree(path)
def test_fit_by_cross_validation(self):
    """fit_by_cross_validation should run without error on a small two-label dataset."""
    features = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1],
                         [1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]])
    encoded = EncodedData(features, {"test1": [1, 0, 2, 0, 1, 0, 2, 0],
                                     "test2": [1, 0, 2, 0, 1, 0, 2, 0]})
    LogisticRegression().fit_by_cross_validation(encoded, number_of_splits=2, label=Label("test2"))
def test_predict(self):
    """predict() should return one prediction per test example, each one of the
    class values seen during fitting."""
    x = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]])
    y = {"test1": [1, 0, 2, 0], "test2": [1, 0, 2, 0]}

    lr = LogisticRegression()
    lr.fit(EncodedData(x, y), Label("test2"))

    test_x = np.array([[0, 1, 0], [1, 0, 0]])
    predictions = lr.predict(EncodedData(test_x), Label("test2"))

    self.assertEqual(2, len(predictions["test2"]))
    # BUGFIX: the original only validated predictions[1]; check every prediction.
    for prediction in predictions["test2"]:
        self.assertIn(prediction, [0, 1, 2])
def _create_dummy_lr_model(self, path):
    # Dummy logistic regression fitted via 2-fold CV on 100 random observations
    # with 5 features, alternating binary labels (0/1) for "l1".
    dummy_lr = LogisticRegression()
    dummy_lr.fit_by_cross_validation(EncodedData(np.random.rand(100, 5), {"l1": [i % 2 for i in range(0, 100)]}),
                                     number_of_splits=2, label=Label("l1"))
    # Overwrite the fitted coefficients with the deterministic values 0-4 so that
    # downstream checks have a known, fixed input.
    dummy_lr.model.coef_ = np.array(list(range(0, 5))).reshape(1, -1)
    with open(path / "ml_details.yaml", "w") as file:
        yaml.dump({"l1": {"feature_names": ["AAA", "AAC", "CKJ", "KSA", "AKJ"]}}, file)
    return dummy_lr
def test(self):
    """Integration: k-mer frequency encoding + logistic regression inside a
    TrainMLModel instruction on a random sequence dataset."""
    path = EnvironmentSettings.tmp_test_path / "integration_sequence_classification/"
    dataset = RandomDatasetGenerator.generate_sequence_dataset(50, {4: 1}, {'l1': {1: 0.5, 2: 0.5}},
                                                               path / 'data')

    os.environ["cache_type"] = "test"

    encoder_params = {
        "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
        "reads": ReadsType.UNIQUE.name,
        "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
        "sequence_type": SequenceType.AMINO_ACID.name,
        "k": 3,
    }
    hp_setting = HPSetting(encoder=KmerFrequencyEncoder.build_object(dataset, **encoder_params),
                           encoder_params=encoder_params,
                           ml_method=LogisticRegression(),
                           ml_params={"model_selection_cv": False, "model_selection_n_folds": -1},
                           preproc_sequence=[])

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])

    instruction = TrainMLModelInstruction(dataset, GridSearch([hp_setting]), [hp_setting],
                                          SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
                                          SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
                                          {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                          lc, path)
    instruction.run(result_path=path)

    shutil.rmtree(path)
def test_run(self):
    """SemanticModel should execute a TrainMLModel instruction end-to-end on a
    small Word2Vec + logistic regression setup."""
    path = EnvironmentSettings.root_path / "test/tmp/smmodel/"
    PathBuilder.build(path)

    # 32 repertoires alternating between two sequence sets, with alternating labels 1/2
    repertoires, metadata = RepertoireBuilder.build([["AAA", "CCC"], ["TTTT"]] * 16, path,
                                                    {"default": [1, 2] * 16})
    dataset = RepertoireDataset(repertoires=repertoires, labels={"default": [1, 2]},
                                metadata_file=metadata)

    label_config = LabelConfiguration()
    label_config.add_label("default", [1, 2])

    enc_params = {"vector_size": 8, "model_type": ModelType.SEQUENCE.name, "k": 3}
    hp_settings = [HPSetting(Word2VecEncoder.build_object(dataset, **enc_params), enc_params,
                             LogisticRegression(),
                             {"model_selection_cv": False, "model_selection_n_folds": -1}, [])]

    assessment_split = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())
    selection_split = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())
    instruction = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                          assessment_split, selection_split,
                                          {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                          label_config, path)

    SemanticModel([instruction], path).run()

    shutil.rmtree(path)
def test_load(self):
    """A pickled sklearn model on disk should be restorable via LogisticRegression.load."""
    features = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]])
    lr = LogisticRegression()
    lr.fit(EncodedData(features, {"default": np.array([1, 0, 2, 0])}), 'default')

    path = EnvironmentSettings.root_path / "test/tmp/lr2/"
    PathBuilder.build(path)
    with open(path / "logistic_regression.pickle", "wb") as file:
        pickle.dump(lr.get_model(), file)

    restored = LogisticRegression()
    restored.load(path)
    self.assertTrue(isinstance(restored.get_model(), SklearnLogisticRegression))

    shutil.rmtree(path)
def _create_dummy_lr_model(self, path):
    """Fit a throwaway LR (2-fold CV, 100x20 random data, alternating binary labels),
    force deterministic coefficients, and dump a matching ml_details.yaml."""
    dummy_lr = LogisticRegression()
    encoded = EncodedData(np.random.rand(100, 20), {"l1": [i % 2 for i in range(0, 100)]})
    dummy_lr.fit_by_cross_validation(encoded, number_of_splits=2, label_name="l1")

    # deterministic coefficients 0..19 so downstream checks have a known input
    dummy_lr.models["l1"].coef_ = np.array(list(range(0, 20))).reshape(1, -1)

    with (path / "ml_details.yaml").open("w") as file:
        yaml.dump({"l1": {"feature_names": [f"feature{i}" for i in range(20)]}}, file)

    return dummy_lr
def test_run(self):
    """MLMethodTrainer should fit with CV, write predictions and details files,
    and return a model that can predict."""
    dataset = RepertoireDataset()
    dataset.encoded_data = EncodedData(examples=np.array([[1, 2, 3], [2, 3, 4]] * 3),
                                       labels={"l1": [1, 0] * 3, "l2": [0, 1] * 3},
                                       feature_names=["f1", "f2", "f3"])

    path = EnvironmentSettings.root_path / "test/tmp/mlmethodtrainer/"
    trained = MLMethodTrainer.run(MLMethodTrainerParams(
        result_path=path,
        dataset=dataset,
        label=Label(name="l1", values=[0, 1]),
        method=LogisticRegression(),
        model_selection_n_folds=2,
        model_selection_cv=True,
        cores_for_training=1,
        train_predictions_path=path / "predictions.csv",
        ml_details_path=path / "details.yaml",
        optimization_metric="balanced_accuracy"))

    trained.predict(EncodedData(np.array([1, 2, 3]).reshape(1, -1)), Label("l1"))
    self.assertTrue(os.path.isfile(path / "predictions.csv"))
    self.assertTrue(os.path.isfile(path / "details.yaml"))

    shutil.rmtree(path)
def test(self):
    """Integration: receptor dataset + k-mer LR; the optimal assessment item's
    performance on the optimization metric must be 1.0."""
    path = EnvironmentSettings.tmp_test_path / "integration_receptor_classification/"
    dataset = self.create_dataset(path)

    os.environ["cache_type"] = "test"

    encoder_params = {
        "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
        "reads": ReadsType.UNIQUE.name,
        "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
        "sequence_type": SequenceType.AMINO_ACID.name,
        "k": 3,
    }
    hp_setting = HPSetting(encoder=KmerFrequencyEncoder.build_object(dataset, **encoder_params),
                           encoder_params=encoder_params,
                           ml_method=LogisticRegression(),
                           ml_params={"model_selection_cv": False, "model_selection_n_folds": -1},
                           preproc_sequence=[])

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])

    instruction = TrainMLModelInstruction(
        dataset, GridSearch([hp_setting]), [hp_setting],
        SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
        SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
        {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, lc, path)
    state = instruction.run(result_path=path)
    print(vars(state))

    metric_key = state.optimization_metric.name.lower()
    optimal_item = state.assessment_states[0].label_states["l1"].optimal_assessment_item
    self.assertEqual(1.0, optimal_item.performance[metric_key])

    shutil.rmtree(path)
def _create_state_object(self, path):
    """Build a small two-label repertoire dataset, run a TrainMLModel instruction
    (Word2Vec + logistic regression), and return the resulting state."""
    # 34 identical repertoires; l1 alternates 1/2, l2 follows a mixed 0/1 pattern
    repertoires, metadata = RepertoireBuilder.build(
        sequences=[["AAA", "CCC", "DDD"]] * 34,
        path=path,
        labels={"l1": [1, 2] * 17,
                "l2": [0, 0, 1, 1] * 4 + [0, 1] * 9})
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                labels={"l1": [1, 2], "l2": [0, 1]})

    enc_params = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
    hp_settings = [HPSetting(Word2VecEncoder.build_object(dataset, **enc_params), enc_params,
                             LogisticRegression(),
                             {"model_selection_cv": False, "model_selection_n_folds": -1}, [])]

    label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])
    process = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                      SplitConfig(SplitType.RANDOM, 1, 0.7),
                                      SplitConfig(SplitType.RANDOM, 1, 0.7),
                                      {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                      label_config, path)
    return process.run(result_path=path)
def test_store(self):
    """store() should pickle the underlying sklearn model to disk."""
    features = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]])
    lr = LogisticRegression()
    lr.fit(EncodedData(features, {"default": np.array([1, 0, 2, 0])}), Label("default"))

    path = EnvironmentSettings.root_path / "test/tmp/lr/"
    lr.store(path, ["f1", "f2", "f3"])
    self.assertTrue(os.path.isfile(path / "logistic_regression.pickle"))

    with open(path / "logistic_regression.pickle", "rb") as file:
        persisted = pickle.load(file)
    self.assertTrue(isinstance(persisted, SklearnLogisticRegression))

    shutil.rmtree(path)
def test_fit(self):
    """fit() should train without error on a tiny three-class dataset."""
    encoded = EncodedData(np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]]),
                          {"test": np.array([1, 0, 2, 0])})
    LogisticRegression().fit(encoded, Label("test"))
def test_run(self):
    """TrainMLModel over two HP settings (Word2Vec+LR and Word2Vec+SVM with a
    preprocessing filter) and two labels; the state must cover both labels."""
    path = EnvironmentSettings.tmp_test_path / "hpoptimproc/"
    PathBuilder.build(path)

    # 34 identical repertoires; l1 alternates 1/2, l2 follows a mixed 0/1 pattern
    repertoires, metadata = RepertoireBuilder.build(
        sequences=[["AAA", "CCC", "DDD"]] * 34,
        path=path,
        labels={"l1": [1, 2] * 17,
                "l2": [0, 0, 1, 1] * 4 + [0, 1] * 9})
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                labels={"l1": [1, 2], "l2": [0, 1]})

    enc1 = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
    enc2 = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 6}
    hp_settings = [
        HPSetting(Word2VecEncoder.build_object(dataset, **enc1), enc1,
                  LogisticRegression(),
                  {"model_selection_cv": False, "model_selection_n_folds": -1}, []),
        HPSetting(Word2VecEncoder.build_object(dataset, **enc2), enc2,
                  SVM(),
                  {"model_selection_cv": False, "model_selection_n_folds": -1},
                  [ClonesPerRepertoireFilter(lower_limit=-1, upper_limit=1000)]),
    ]

    report = SequenceLengthDistribution()
    label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])

    assessment_split = SplitConfig(SplitType.RANDOM, 1, 0.5,
                                   reports=ReportConfig(data_splits={"seqlen": report}))
    selection_split = SplitConfig(SplitType.RANDOM, 1, 0.5,
                                  reports=ReportConfig(data_splits={"seqlen": report}))
    process = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                      assessment_split, selection_split,
                                      {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                      label_config, path)
    state = process.run(result_path=path)

    self.assertTrue(isinstance(state, TrainMLModelState))
    self.assertEqual(1, len(state.assessment_states))
    self.assertTrue("l1" in state.assessment_states[0].label_states)
    self.assertTrue("l2" in state.assessment_states[0].label_states)

    shutil.rmtree(path)