Example #1
    def test_encode(self):
        path = EnvironmentSettings.tmp_test_path + "abundance_encoder/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build([["GGG", "III", "LLL", "MMM"],
                                                         ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
                                                         ["CCC", "FFF", "MMM"],
                                                         ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"]],
                                                        labels={"l1": [True, True, False, False]}, path=path)

        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, identifier="1")

        encoder = SequenceAbundanceEncoder.build_object(dataset, **{
            "comparison_attributes": ["sequence_aas"],
            "p_value_threshold": 0.4, "sequence_batch_size": 4, "repertoire_batch_size": 8
        })

        label_config = LabelConfiguration([Label("l1", [True, False], positive_class=True)])

        encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config))

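        # rows are (label-associated sequence count, total sequence count) per repertoire; the totals match the repertoire sizes built above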
        self.assertTrue(np.array_equal(np.array([[1, 4], [1, 6], [0, 3], [0, 6]]), encoded_dataset.encoded_data.examples))

        encoder.p_value_threshold = 0.05

        encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config))

        self.assertTrue(np.array_equal(np.array([[0, 4], [0, 6], [0, 3], [0, 6]]), encoded_dataset.encoded_data.examples))

        shutil.rmtree(path)
Example #2
    def test_process(self):
        path = EnvironmentSettings.root_path + "test/tmp/clones_per_repertoire_filter/"
        PathBuilder.build(path)
        dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(
            [["ACF", "ACF", "ACF"], ["ACF", "ACF"],
             ["ACF", "ACF", "ACF", "ACF"]], path)[0])

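        # the three repertoires above have 3, 2, and 4 clones; the filter keeps repertoires whose clone count lies within the given limits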
        dataset1 = ClonesPerRepertoireFilter.process(dataset, {
            "lower_limit": 3,
            "result_path": path
        })
        self.assertEqual(2, dataset1.get_example_count())

        dataset2 = ClonesPerRepertoireFilter.process(dataset, {
            "upper_limit": 2,
            "result_path": path
        })
        self.assertEqual(1, dataset2.get_example_count())

        self.assertRaises(AssertionError, ClonesPerRepertoireFilter.process,
                          dataset, {
                              "lower_limit": 10,
                              "result_path": path
                          })

        shutil.rmtree(path)
Example #3
    def test_build(self):
        path = EnvironmentSettings.root_path + "test/tmp/repbuilder/"
        repertoires, metadata = RepertoireBuilder.build(
            [["AAA", "CCC"], ["TTTT"]], path, {"default": [1, 2]})

        self.assertEqual(2, len(repertoires))
        self.assertEqual((2, 4), pd.read_csv(metadata).shape)

        self.assertEqual(2, len(repertoires[0].sequences))
        self.assertTrue(all(isinstance(seq, ReceptorSequence) for seq in repertoires[0].sequences))
        self.assertEqual(1, repertoires[0].metadata["default"])

        self.assertEqual(1, len(repertoires[1].sequences))
        self.assertTrue(all(isinstance(seq, ReceptorSequence) for seq in repertoires[1].sequences))
        self.assertEqual(2, repertoires[1].metadata["default"])
        self.assertEqual("rep_1", repertoires[1].metadata["subject_id"])

        # Testing with custom metadata
        repertoires, metadata = RepertoireBuilder.build([["AAA", "CCC"]],
                                                        path,
                                                        seq_metadata=[[{"v_gene": "v5", "j_gene": "j5"},
                                                                       {"v_gene": "v2", "j_gene": "j2"}]])

        self.assertEqual(repertoires[0].sequences[0].metadata.v_gene, "v5")
        self.assertEqual(repertoires[0].sequences[0].metadata.j_gene, "j5")
        self.assertEqual(repertoires[0].sequences[1].metadata.v_gene, "v2")
        self.assertEqual(repertoires[0].sequences[1].metadata.j_gene, "j2")

        shutil.rmtree(path)
Example #4
    def create_dataset(self, path: str) -> RepertoireDataset:
        repertoires, metadata = RepertoireBuilder.build(
            [["A", "B"], ["B", "C"], ["D"], ["E", "F"], ["A", "B"], ["B", "C"],
             ["D"], ["E", "F"]], path, {
                 "l1": [1, 0, 1, 0, 1, 0, 1, 0],
                 "l2": [2, 3, 2, 3, 2, 3, 3, 3]
             })
        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata)
        return dataset
Example #5
    def test_get_metadata_fields(self):

        path = EnvironmentSettings.tmp_test_path + "repertoire_dataset/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build([["AA"], ["BB"]], path, {"l1": [1, 2], "hla": ["A", "B"]}, subject_ids=["d1", "d2"])
        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)

        self.assertTrue("l1" in dataset.get_metadata_fields())
        self.assertTrue("hla" in dataset.get_metadata_fields())
        self.assertTrue("subject_id" in dataset.get_metadata_fields())

        shutil.rmtree(path)
Example #6
    def create_datasets(self, path: str):
        repertoires, metadata = RepertoireBuilder.build(
            [["A", "B"], ["B", "C"], ["D"], ["E", "F"]], path, {
                "l1": [1, 0, 1, 0],
                "l2": [2, 3, 2, 3]
            })

        main_dataset = RepertoireDataset(repertoires=repertoires,
                                         metadata_file=metadata)
        sub_dataset = main_dataset.make_subset([0, 1],
                                               path=path,
                                               dataset_type="subset")
        return main_dataset, sub_dataset
Example #7
    def create_dummy_data(self, path):
        # Setting up dummy data
        labels = {
            "subject_id": ["subject_1", "subject_2", "subject_3"],
            "label": ["yes", "yes", "no"]
        }

        base_metadata = {
            "v_gene": "TRBV1",
            "j_gene": "TRBJ1",
            "chain": Chain.BETA.value
        }

        repertoires, metadata = RepertoireBuilder.build(
            sequences=[["AAAA"], ["SSSS"], ["SSSS", "CCCC"]],
            path=path,
            labels=labels,
            seq_metadata=[[{**base_metadata, "count": 10}],
                          [{**base_metadata, "count": 10}],
                          [{**base_metadata, "count": 5}, {**base_metadata, "count": 5}]],
            subject_ids=labels["subject_id"])

        dataset = RepertoireDataset(repertoires=repertoires)

        label_config = LabelConfiguration()
        label_config.add_label("subject_id", labels["subject_id"])
        label_config.add_label("label", labels["label"])

        file_content = """complex.id	Gene	CDR3	V	J	Species	MHC A	MHC B	MHC class	Epitope	Epitope gene	Epitope species	Reference	Method	Meta	CDR3fix	Score
100	TRB	AAAA	TRBV1	TRBJ1	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"}	0
200	TRB	SSSS	TRBV1	TRBJ1	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"}	0"""

        with open(path + "refs.tsv", "w") as file:
            file.write(file_content)

        reference_sequences = {
            "params": {
                "path": path + "refs.tsv",
                "region_type": "FULL_SEQUENCE"
            },
            "format": "VDJdb"
        }

        return dataset, label_config, reference_sequences, labels
Example #8
    def create_dataset(self, path):
        repertoires, metadata = RepertoireBuilder.build(
            [["AAA"], ["AAAC"], ["ACA"], ["CAAA"], ["AAAC"], ["AAA"]], path, {
                "l1": [1, 1, 1, 0, 0, 0],
                "l2": [2, 3, 2, 3, 2, 3]
            })

        dataset = RepertoireDataset(repertoires=repertoires,
                                    params={
                                        "l1": [0, 1],
                                        "l2": [2, 3]
                                    },
                                    metadata_file=metadata)
        return dataset
Example #9
    def create_dataset(self):
        path = os.path.relpath(EnvironmentSettings.root_path + "test/tmp/immunemlapp/initial_dataset/") + "/"
        PathBuilder.build(path)

        repertoire_count = 30
        repertoires, metadata = RepertoireBuilder.build([["AA", "AAAA", "AAAA", "AAA"] for i in range(repertoire_count)], path,
                                                        {"CD": ['yes' if i % 2 == 0 else 'no' for i in range(repertoire_count)],
                                                         "CMV": [True if i % 2 == 1 else False for i in range(repertoire_count)]},
                                                        [[{"chain": "A" if i % 2 == 0 else "B", "count": random.randint(2, 5)}
                                                          for i in range(4)]
                                                         for j in range(repertoire_count)])

        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, params={"CD": [True, False], "CMV": [True, False]}, name="d1")
        PickleExporter.export(dataset, path)

        return path + "d1.iml_dataset"
Example #10
    def create_dummy_dataset(self, path):
        repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]],
                                                        path,
                                                        labels={
                                                            "label1": ["val1", "val2"],
                                                            "label2": ["val1", "val2"]
                                                        })

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata)
        dataset.name = "my_dataset"
        PickleExporter.export(dataset, path)

        return f"{dataset.name}.iml_dataset"
Example #11
    def generate_repertoire_dataset(repertoire_count: int, sequence_count_probabilities: dict, sequence_length_probabilities: dict,
                                    labels: dict, path: str) -> RepertoireDataset:
        """
        Creates repertoire_count repertoires where the number of sequences per repertoire is sampled from the probability distribution given
        in sequence_count_probabilities. The length of each sequence is sampled independently from the sequence_length_probabilities
        distribution. Labels are randomly assigned to repertoires from the distributions given in labels. Labels are multi-class, so each
        repertoire gets exactly one class per label. This means that negative classes for the labels should be included in the specification
        as well.

        An example of input parameters is given below:
        repertoire_count: 100 # generate 100 repertoires
        sequence_count_probabilities:
            100: 0.5 # half of the generated repertoires will have 100 sequences
            200: 0.5 # the other half of the generated repertoires will have 200 sequences
        sequence_length_probabilities:
            14: 0.8 # 80% of all generated sequences across all repertoires will have length 14
            15: 0.2 # 20% of all generated sequences across all repertoires will have length 15
        labels:
            cmv: # label name
                True: 0.5 # 50% of the repertoires will have class True
                False: 0.5 # 50% of the repertoires will have class False
            coeliac: # next label with classes that will be assigned to repertoires independently of the previous label or any other parameter
                1: 0.3 # 30% of the generated repertoires will have class 1
                0: 0.7 # 70% of the generated repertoires will have class 0
        """
        RandomDatasetGenerator._check_rep_dataset_generation_params(repertoire_count, sequence_count_probabilities, sequence_length_probabilities,
                                                                    labels, path)

        alphabet = EnvironmentSettings.get_sequence_alphabet()
        PathBuilder.build(path)

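        # sample a sequence count per repertoire, then sample each sequence's length and residues independently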
        sequences = [["".join(random.choices(alphabet,
                                             k=random.choices(list(sequence_length_probabilities.keys()), sequence_length_probabilities.values())[0]))
                      for seq_count in range(random.choices(list(sequence_count_probabilities.keys()), sequence_count_probabilities.values())[0])]
                     for rep in range(repertoire_count)]

        if labels is not None:
            processed_labels = {label: random.choices(list(labels[label].keys()), labels[label].values(), k=repertoire_count) for label in labels}
            dataset_params = {label: list(labels[label].keys()) for label in labels}
        else:
            processed_labels = None
            dataset_params = None

        repertoires, metadata = RepertoireBuilder.build(sequences=sequences, path=path, labels=processed_labels)
        dataset = RepertoireDataset(params=dataset_params, repertoires=repertoires, metadata_file=metadata)

        return dataset
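
A minimal usage sketch for the generator above (assuming the method is a static member of RandomDatasetGenerator, as the internal call in its body suggests; the counts, probabilities, and output path are illustrative):

    dataset = RandomDatasetGenerator.generate_repertoire_dataset(
        repertoire_count=10,
        sequence_count_probabilities={5: 0.5, 10: 0.5},
        sequence_length_probabilities={14: 0.8, 15: 0.2},
        labels={"cmv": {True: 0.5, False: 0.5}},
        path="test/tmp/random_dataset_sketch/")
    assert dataset.get_example_count() == 10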
Example #12
    def test_load(self):
        path = EnvironmentSettings.root_path + "test/tmp/pathbuilder/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path)
        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata)

        with open(path + "dataset.pkl", "wb") as file:
            pickle.dump(dataset, file)

        dataset2 = PickleImport.import_dataset({"path": path + "dataset.pkl"},
                                               "dataset_name")

        shutil.rmtree(path)

        self.assertEqual(2, len(dataset2.get_data()))
        self.assertEqual("rep_1",
                         dataset2.get_data()[1].metadata["subject_id"])
Example #13
    def _create_state_object(self, path):
        repertoires, metadata = RepertoireBuilder.build(sequences=[["AAA", "CCC", "DDD"] for _ in range(34)],
                                                        path=path,
                                                        labels={
                                                            "l1": [1, 2] * 17,
                                                            "l2": [0, 0, 1, 1] * 4 + [0, 1] * 9
                                                        })

        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                    params={"l1": [1, 2], "l2": [0, 1]})
        enc_params = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
        hp_settings = [HPSetting(Word2VecEncoder.build_object(dataset, **enc_params), enc_params,
                                 LogisticRegression(),
                                 {"model_selection_cv": False, "model_selection_n_folds": -1},
                                 [])]

        label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])

        process = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                          SplitConfig(SplitType.RANDOM, 1, 0.5),
                                          SplitConfig(SplitType.RANDOM, 1, 0.5),
                                          {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, label_config, path)

        state = process.run(result_path=path)

        return state
Example #14
    def test_process(self):
        path = EnvironmentSettings.root_path + "test/tmp/metadata_filter/"
        PathBuilder.build(path)
        dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(
            [["ACF", "ACF", "ACF"], ["ACF", "ACF"],
             ["ACF", "ACF", "ACF", "ACF"]], path)[0])

        df = pd.DataFrame(data={"key1": [0, 1, 2], "key2": [0, 1, 2]})
        df.to_csv(path + "metadata.csv")

        dataset.metadata_file = path + "metadata.csv"

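        # keep repertoires whose metadata column "key2" is greater than 1; only the third row (key2 == 2) qualifies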
        dataset1 = MetadataRepertoireFilter.process(
            dataset, {
                "criteria": {
                    "type": OperationType.GREATER_THAN,
                    "value": {
                        "type": DataType.COLUMN,
                        "name": "key2"
                    },
                    "threshold": 1
                },
                "result_path": path
            })

        self.assertEqual(1, dataset1.get_example_count())

        self.assertRaises(
            AssertionError, MetadataRepertoireFilter.process, dataset, {
                "criteria": {
                    "type": OperationType.GREATER_THAN,
                    "value": {
                        "type": DataType.COLUMN,
                        "name": "key2"
                    },
                    "threshold": 10
                },
                "result_path": path
            })

        shutil.rmtree(path)
Example #15
    def test_export(self):
        path = EnvironmentSettings.tmp_test_path + "pickleexporter/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path)
        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata)
        PickleExporter.export(dataset, path)

        with open(path + f"{dataset.name}.iml_dataset", "rb") as file:
            dataset2 = pickle.load(file)

        shutil.rmtree(path)

        self.assertTrue(isinstance(dataset2, RepertoireDataset))
        self.assertEqual(2, len(dataset2.get_data()))
        self.assertEqual("rep_0",
                         dataset2.get_data()[0].metadata["subject_id"])
Example #16
    def prepare_dataset(self, path):
        PathBuilder.build(path)
        repertoires, metadata = RepertoireBuilder.build(
            sequences=[["AAA", "CCC", "DDD"] for _ in range(34)],
            path=path,
            labels={
                "l1": [1, 2] * 17,
                "l2": [0, 0, 1, 1] * 4 + [0, 1] * 9
            })

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    params={
                                        "l1": [1, 2],
                                        "l2": [0, 1]
                                    },
                                    name="dataset1")
        PickleExporter.export(dataset, path)
Example #17
    def test_encode(self):

        path = EnvironmentSettings.tmp_test_path + "count_encoder/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build(
            [["GGG", "III", "LLL", "MMM"],
             ["DDD", "EEE", "FFF", "III", "LLL", "MMM"], ["CCC", "FFF", "MMM"],
             ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"]],
            labels={"l1": [True, True, False, False]},
            path=path)

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    identifier="1")

        encoder = SequenceCountEncoder.build_object(
            dataset, **{
                "comparison_attributes": ["sequence_aas"],
                "p_value_threshold": 0.4,
                "sequence_batch_size": 4
            })

        label_config = LabelConfiguration(
            [Label("l1", [True, False], positive_class=True)])

        encoded_dataset = encoder.encode(
            dataset, EncoderParams(result_path=path,
                                   label_config=label_config))

        test = encoded_dataset.encoded_data.examples

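        # the encoder keeps a single label-associated sequence ("III"): counted once in each positive repertoire, absent from the negatives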
        self.assertTrue(test[0] == 1)
        self.assertTrue(test[1] == 1)
        self.assertTrue(test[2] == 0)
        self.assertTrue(test[3] == 0)

        self.assertTrue("III" in encoded_dataset.encoded_data.feature_names)

        shutil.rmtree(path)
Example #18
    def test_run(self):

        path = EnvironmentSettings.root_path + "test/tmp/smmodel/"
        PathBuilder.build(path)
        repertoires, metadata = RepertoireBuilder.build([["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
                                                       ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
                                                       ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
                                                       ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
                                                       ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
                                                       ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
                                                       ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
                                                       ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"]], path,
                                                      {"default": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
                                                                   1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2]})
        dataset = RepertoireDataset(repertoires=repertoires,
                                    params={"default": [1, 2]},
                                    metadata_file=metadata)

        label_config = LabelConfiguration()
        label_config.add_label("default", [1, 2])

        hp_settings = [HPSetting(Word2VecEncoder.build_object(dataset, **{"vector_size": 8, "model_type": ModelType.SEQUENCE.name, "k": 3}),
                                 {"vector_size": 8, "model_type": ModelType.SEQUENCE.name, "k": 3},
                                 LogisticRegression(),
                                 {"model_selection_cv": False, "model_selection_n_folds": -1}, [])]

        split_config_assessment = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())
        split_config_selection = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())

        instruction = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                              split_config_assessment,
                                              split_config_selection,
                                              {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                              label_config, path)
        semantic_model = SemanticModel([instruction], path)

        semantic_model.run()

        shutil.rmtree(path)
Example #19
    def test_run(self):
        path = EnvironmentSettings.root_path + "test/tmp/mlmethodassessment/"
        PathBuilder.build(path)
        dataset = RepertoireDataset(repertoires=RepertoireBuilder.build([["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"]], path)[0])
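        # attach pre-encoded features directly: the value 3 in both features marks the l1 == 3 examples, so the label is trivially separable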
        dataset.encoded_data = EncodedData(
            examples=np.array([[1, 1], [1, 1], [3, 3], [1, 1], [1, 1], [3, 3], [1, 1], [1, 1], [3, 3], [1, 1], [1, 1], [3, 3]]),
            labels={"l1": [1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3], "l2": [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]}
        )

        label_config = LabelConfiguration()
        label_config.add_label("l1", [1, 3])

        method1 = LogisticRegression()
        method1.fit(dataset.encoded_data, label_name='l1')

        res = MLMethodAssessment.run(MLMethodAssessmentParams(
            dataset=dataset,
            method=method1,
            metrics={Metric.ACCURACY, Metric.BALANCED_ACCURACY, Metric.F1_MACRO},
            optimization_metric=Metric.LOG_LOSS,
            predictions_path=path + "predictions.csv",
            label="l1",
            ml_score_path=path + "ml_score.csv",
            split_index=1,
            path=path
        ))

        self.assertTrue(isinstance(res, dict))
        self.assertTrue(res[Metric.LOG_LOSS.name.lower()] <= 0.1)

        self.assertTrue(os.path.isfile(path + "ml_score.csv"))

        df = pd.read_csv(path + "ml_score.csv")
        self.assertTrue(df.shape[0] == 1)

        df = pd.read_csv(path + "predictions.csv")
        self.assertEqual(12, df.shape[0])

        shutil.rmtree(path)
Example #20
    def test_process(self):
        path = EnvironmentSettings.root_path + "test/tmp/count_per_seq_filter/"
        PathBuilder.build(path)
        dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(
            [["ACF", "ACF", "ACF"], ["ACF", "ACF"],
             ["ACF", "ACF", "ACF", "ACF"]],
            path,
            seq_metadata=[[{"count": 1}, {"count": 2}, {"count": 3}],
                          [{"count": 4}, {"count": 1}],
                          [{"count": 5}, {"count": 6}, {"count": None}, {"count": 1}]])[0])

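        # low_count_limit 2 with remove_without_count drops sequences counted below 2; the first repertoire keeps the clones with counts 2 and 3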
        dataset1 = CountPerSequenceFilter.process(
            dataset, {
                "low_count_limit": 2,
                "remove_without_count": True,
                "remove_empty_repertoires": False,
                "result_path": path,
                "batch_size": 4
            })
        self.assertEqual(2,
                         dataset1.repertoires[0].get_sequence_aas().shape[0])

        dataset2 = CountPerSequenceFilter.process(
            dataset, {
                "low_count_limit": 5,
                "remove_without_count": True,
                "remove_empty_repertoires": False,
                "result_path": path,
                "batch_size": 4
            })
        self.assertEqual(0,
                         dataset2.repertoires[0].get_sequence_aas().shape[0])

        dataset3 = CountPerSequenceFilter.process(
            dataset, {
                "low_count_limit": 0,
                "remove_without_count": True,
                "remove_empty_repertoires": False,
                "result_path": path,
                "batch_size": 4
            })
        self.assertEqual(3,
                         dataset3.repertoires[2].get_sequence_aas().shape[0])

        dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(
            [["ACF", "ACF", "ACF"], ["ACF", "ACF"],
             ["ACF", "ACF", "ACF", "ACF"]],
            path,
            seq_metadata=[[{"count": None}, {"count": None}, {"count": None}],
                          [{"count": None}, {"count": None}],
                          [{"count": None}, {"count": None}, {"count": None}, {"count": None}]])[0])

        dataset4 = CountPerSequenceFilter.process(
            dataset, {
                "low_count_limit": 0,
                "remove_without_count": True,
                "remove_empty_repertoires": False,
                "result_path": path,
                "batch_size": 4
            })
        self.assertEqual(0,
                         dataset4.repertoires[0].get_sequence_aas().shape[0])
        self.assertEqual(0,
                         dataset4.repertoires[1].get_sequence_aas().shape[0])
        self.assertEqual(0,
                         dataset4.repertoires[2].get_sequence_aas().shape[0])

        self.assertRaises(
            AssertionError, CountPerSequenceFilter.process, dataset, {
                "low_count_limit": 10,
                "remove_without_count": True,
                "remove_empty_repertoires": True,
                "result_path": path,
                "batch_size": 4
            })

        shutil.rmtree(path)
Example #21
    def test_generate(self):

        path = EnvironmentSettings.tmp_test_path + "disease_assoc_seq_cv/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build(
            [["GGG", "III", "LLL", "MMM"], ["DDD", "EEE", "FFF"],
             ["GGG", "III", "LLL", "MMM"], ["DDD", "EEE", "FFF"],
             ["GGG", "III", "LLL", "MMM"], ["DDD", "EEE", "FFF"],
             ["GGG", "III", "LLL", "MMM"], ["DDD", "EEE", "FFF"],
             ["GGG", "III", "LLL", "MMM"], ["DDD", "EEE", "FFF"],
             ["GGG", "III", "LLL", "MMM"], ["DDD", "EEE", "FFF"],
             ["GGG", "III", "LLL", "MMM"], ["DDD", "EEE", "FFF"]],
            labels={
                "l1": [
                    True, False, True, False, True, False, True, False, True,
                    False, True, False, True, False
                ]
            },
            path=path)

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    params={"l1": [True, False]})
        PickleExporter.export(dataset, path)

        specs = {
            "definitions": {
                "datasets": {
                    "d1": {
                        "format": "Pickle",
                        "params": {
                            "path": path + f"{dataset.name}.iml_dataset",
                        }
                    }
                },
                "encodings": {
                    "e1": {
                        "SequenceAbundance": {
                            'p_value_threshold': 0.5
                        }
                    }
                },
                "ml_methods": {
                    "knn": {
                        "KNN": {
                            "n_neighbors": 1
                        },
                    }
                },
                "reports": {
                    "r1": {
                        "DiseaseAssociatedSequenceCVOverlap": {
                            "compare_in_selection": True,
                            "compare_in_assessment": True
                        }
                    }
                }
            },
            "instructions": {
                "inst1": {
                    "type": "TrainMLModel",
                    "settings": [{
                        "encoding": "e1",
                        "ml_method": "knn"
                    }],
                    "assessment": {
                        "split_strategy": "random",
                        "split_count": 1,
                        "training_percentage": 0.5,
                        "reports": {}
                    },
                    "selection": {
                        "split_strategy": "random",
                        "split_count": 1,
                        "training_percentage": 0.5,
                    },
                    "labels": [{
                        "l1": {
                            "positive_class": True
                        }
                    }],
                    "dataset": "d1",
                    "strategy": "GridSearch",
                    "metrics": ["accuracy"],
                    "number_of_processes": 2,
                    "reports": ["r1"],
                    "optimization_metric": "balanced_accuracy",
                    "refit_optimal_model": True,
                    "store_encoded_data": False
                }
            }
        }

        specs_file = path + "specs.yaml"
        with open(specs_file, "w") as file:
            yaml.dump(specs, file)

        app = ImmuneMLApp(specs_file, path + "result/")
        state = app.run()[0]

        self.assertEqual(1, len(state.report_results))
        self.assertTrue(len(state.report_results[0].output_figures) > 0)
        self.assertTrue(len(state.report_results[0].output_tables) > 0)

        for fig in state.report_results[0].output_figures:
            self.assertTrue(os.path.isfile(fig.path))
        for table in state.report_results[0].output_tables:
            self.assertTrue(os.path.isfile(table.path))

        shutil.rmtree(path)
Example #22
    def create_encoded_matchedregex(self, path):
        # Setting up dummy data
        labels = {
            "subject_id": ["subject_1", "subject_2", "subject_3"],
            "label": ["yes", "no", "no"]
        }

        metadata_alpha = {
            "v_gene": "V1",
            "j_gene": "J1",
            "chain": Chain.ALPHA.value
        }
        metadata_beta = {
            "v_gene": "V1",
            "j_gene": "J1",
            "chain": Chain.BETA.value
        }

        repertoires, metadata = RepertoireBuilder.build(
            sequences=[[
                "XXAGQXGSSNTGKLIXX", "XXAGQXGSSNTGKLIYY", "XXSAGQGETQYXX"
            ], ["ASSXRXX"], ["XXIXXNDYKLSXX", "CCCC", "SSSS", "TTTT"]],
            path=path,
            labels=labels,
            seq_metadata=[[{**metadata_alpha, "count": 10, "v_gene": "TRAV35"},
                           {**metadata_alpha, "count": 10},
                           {**metadata_beta, "count": 10, "v_gene": "TRBV29-1"}],
                          [{**metadata_beta, "count": 10, "v_gene": "TRBV7-3"}],
                          [{**metadata_alpha, "count": 5, "v_gene": "TRAV26-2"},
                           {**metadata_alpha, "count": 2},
                           {**metadata_beta, "count": 1},
                           {**metadata_beta, "count": 2}]],
            subject_ids=labels["subject_id"])

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata)

        label_config = LabelConfiguration()
        label_config.add_label("subject_id", labels["subject_id"])
        label_config.add_label("label", labels["label"])

        file_content = """id	TRAV	TRBV	TRA_regex	TRB_regex
1	TRAV35	TRBV29-1	AGQ.GSSNTGKLI	S[APGFTVML]GQGETQY
2		TRBV7-3		ASS.R.*
3	TRAV26-1		I..NDYKLS	
4	TRAV26-2		I..NDYKLS	
        """

        filepath = path + "reference_motifs.tsv"
        with open(filepath, "w") as file:
            file.write(file_content)

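        # presumably: with match_v_genes=False only the regex columns are matched, and sum_counts=True sums the counts of matching sequences per repertoire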
        encoder = MatchedRegexEncoder.build_object(
            dataset, **{
                "motif_filepath": filepath,
                "match_v_genes": False,
                "sum_counts": True
            })

        encoded = encoder.encode(
            dataset,
            EncoderParams(result_path=path,
                          label_config=label_config,
                          filename="dataset.csv"))

        return encoded
Example #23
    def test_parse_yaml_file(self):
        path = EnvironmentSettings.root_path + "test/tmp/parser/"
        dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(
            [["AAA", "CCC"], ["TTTT"]], path, {"default": [1, 2]})[0],
                                    params={"default": [1, 2]})
        PickleExporter.export(dataset, path)

        spec = {
            "definitions": {
                "datasets": {
                    "d1": {
                        "format": "Pickle",
                        "params": {
                            "path": path + f"{dataset.name}.iml_dataset",
                        }
                    }
                },
                "encodings": {
                    "a1": {
                        "Word2Vec": {
                            "k": 3,
                            "model_type": "sequence",
                            "vector_size": 8,
                        }
                    },
                    "a2": "Word2Vec"
                },
                "ml_methods": {
                    "simpleLR": {
                        "LogisticRegression": {
                            "penalty": "l1"
                        },
                        "model_selection_cv": False,
                        "model_selection_n_folds": -1,
                    },
                    "simpleLR2": "LogisticRegression"
                },
                "reports": {
                    "rep1": "SequenceLengthDistribution"
                }
            },
            "instructions": {}
        }

        PathBuilder.build(path)

        specs_filename = path + "tmp_yaml_spec.yaml"

        with open(specs_filename, "w") as file:
            yaml.dump(spec, file, default_flow_style=False)

        symbol_table, _ = ImmuneMLParser.parse_yaml_file(specs_filename)

        self.assertTrue(all(symbol_table.contains(key) for key in ["simpleLR", "rep1", "a1", "d1"]))
        self.assertTrue(isinstance(symbol_table.get("d1"), RepertoireDataset))

        with self.assertRaises(YAMLError):
            with open(specs_filename, "r") as file:
                specs_text = file.readlines()
            specs_text[0] = "        definitions:"
            with open(specs_filename, "w") as file:
                file.writelines(specs_text)

            ImmuneMLParser.parse_yaml_file(specs_filename)

        shutil.rmtree(path)
Example #24
    def test_encoding(self):

        path = EnvironmentSettings.tmp_test_path + "integration_test_emerson_encoding/"
        PathBuilder.build(path)

        ref_path = path + "reference.csv"
        pd.DataFrame({
            "sequence_aas": ["GGG", "III", "TTT", "EFEF"],
            "v_alleles": ["TRBV6-1*01", "TRBV6-1*01", "TRBV6-1*01", "TRBV6-1*01"],
            "j_alleles": ["TRBJ2-7", "TRBJ2-7", "TRBJ2-7", "TRBJ2-7"]
        }).to_csv(ref_path, index=False)

        repertoires, metadata = RepertoireBuilder.build(
            [["GGG", "III", "LLL", "MMM"],
             ["DDD", "EEE", "FFF", "III", "LLL", "MMM"], ["CCC", "FFF", "MMM"],
             ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"],
             ["GGG", "III", "LLL", "MMM"],
             ["DDD", "EEE", "FFF", "III", "LLL", "MMM"], ["CCC", "FFF", "MMM"],
             ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"],
             ["GGG", "III", "LLL", "MMM"],
             ["DDD", "EEE", "FFF", "III", "LLL", "MMM"], ["CCC", "FFF", "MMM"],
             ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"],
             ["GGG", "III", "LLL", "MMM"],
             ["DDD", "EEE", "FFF", "III", "LLL", "MMM"], ["CCC", "FFF", "MMM"],
             ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"]],
            labels={
                "l1": [
                    True, True, False, False, True, True, False, False, True,
                    True, False, False, True, True, False, False
                ]
            },
            path=path)

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    params={"l1": [True, False]})
        PickleExporter.export(dataset, path)

        specs = {
            "definitions": {
                "datasets": {
                    "d1": {
                        "format": "Pickle",
                        "params": {
                            "path": path + f"{dataset.name}.iml_dataset",
                        }
                    }
                },
                "encodings": {
                    "e1": {
                        "SequenceAbundance": {
                            "comparison_attributes": ["sequence_aas", "v_alleles", "j_alleles"]
                        }
                    }
                },
                "ml_methods": {
                    "knn": {
                        "KNN": {
                            "n_neighbors": 1
                        },
                    }
                },
                "reports": {
                    "r1": {
                        "ReferenceSequenceOverlap": {
                            "reference_path":
                            ref_path,
                            'comparison_attributes':
                            ["sequence_aas", "v_alleles", "j_alleles"]
                        }
                    }
                }
            },
            "instructions": {
                "inst1": {
                    "type": "TrainMLModel",
                    "settings": [{
                        "encoding": "e1",
                        "ml_method": "knn"
                    }],
                    "assessment": {
                        "split_strategy": "random",
                        "split_count": 1,
                        "training_percentage": 0.7,
                        "reports": {}
                    },
                    "selection": {
                        "split_strategy": "random",
                        "split_count": 1,
                        "training_percentage": 0.7,
                    },
                    "labels": [{
                        "l1": {
                            "positive_class": True
                        }
                    }],
                    "dataset": "d1",
                    "strategy": "GridSearch",
                    "metrics": ["accuracy"],
                    "number_of_processes": 2,
                    "reports": ["r1"],
                    "optimization_metric": "balanced_accuracy",
                    "refit_optimal_model": True,
                    "store_encoded_data": False
                }
            }
        }

        specs_file = path + "specs.yaml"
        with open(specs_file, "w") as file:
            yaml.dump(specs, file)

        app = ImmuneMLApp(specs_file, path + "result/")
        app.run()

        shutil.rmtree(path)
Example #25
    def create_dummy_data(self, path):

        # Setting up dummy data
        labels = {
            "subject_id": ["subject_1", "subject_2", "subject_3"],
            "label": ["yes", "no", "no"]
        }

        metadata_light = {
            "v_gene": "V1",
            "j_gene": "J1",
            "chain": Chain.LIGHT.value
        }
        metadata_heavy = {
            "v_gene": "V1",
            "j_gene": "J1",
            "chain": Chain.HEAVY.value
        }

        repertoires, metadata = RepertoireBuilder.build(
            sequences=[[
                "XXAGQXGSSNTGKLIXX", "XXAGQXGSSNTGKLIYY", "XXSAGQGETQYXX"
            ], ["ASSXRXX"], ["XXIXXNDYKLSXX", "CCCC", "SSSS", "TTTT"]],
            path=path,
            labels=labels,
            seq_metadata=[[{**metadata_light, "count": 10, "v_gene": "IGLV35"},
                           {**metadata_light, "count": 10},
                           {**metadata_heavy, "count": 10, "v_gene": "IGHV29-1"}],
                          [{**metadata_heavy, "count": 10, "v_gene": "IGHV7-3"}],
                          [{**metadata_light, "count": 5, "v_gene": "IGLV26-2"},
                           {**metadata_light, "count": 2},
                           {**metadata_heavy, "count": 1},
                           {**metadata_heavy, "count": 2}]],
            subject_ids=labels["subject_id"])

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata)

        label_config = LabelConfiguration()
        label_config.add_label("subject_id", labels["subject_id"])
        label_config.add_label("label", labels["label"])

        file_content = """id	IGLV	IGHV	IGL_regex	IGH_regex
1	IGLV35	IGHV29-1	AGQ.GSSNTGKLI	S[APGFTVML]GQGETQY
2		IGHV7-3		ASS.R.*
3	IGLV26-1		I..NDYKLS	
4	IGLV26-2		I..NDYKLS	
"""

        filepath = path + "reference_motifs.tsv"
        with open(filepath, "w") as file:
            file.write(file_content)

        return dataset, label_config, filepath, labels
Example #26
    def create_dataset(self, path: str) -> RepertoireDataset:
        repertoires, metadata = RepertoireBuilder.build(
            [["A", "B"], ["D"], ["E", "F"], ["B", "C"], ["A", "D"]], path)
        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata)
        return dataset
Example #27
    def test_run(self):

        path = EnvironmentSettings.tmp_test_path + "hpoptimproc/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build(
            sequences=[["AAA", "CCC", "DDD"] for _ in range(34)],
            path=path,
            labels={
                "l1": [1, 2] * 17,
                "l2": [0, 0, 1, 1] * 4 + [0, 1] * 9
            })

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    params={
                                        "l1": [1, 2],
                                        "l2": [0, 1]
                                    })
        enc1 = {
            "k": 3,
            "model_type": ModelType.SEQUENCE.name,
            "vector_size": 4
        }
        enc2 = {
            "k": 3,
            "model_type": ModelType.SEQUENCE.name,
            "vector_size": 6
        }
        hp_settings = [
            HPSetting(Word2VecEncoder.build_object(dataset, **enc1), enc1,
                      LogisticRegression(), {
                          "model_selection_cv": False,
                          "model_selection_n_folds": -1
                      }, []),
            HPSetting(
                Word2VecEncoder.build_object(dataset, **enc2), enc2, SVM(), {
                    "model_selection_cv": False,
                    "model_selection_n_folds": -1
                },
                [ClonesPerRepertoireFilter(lower_limit=-1, upper_limit=1000)])
        ]

        report = SequenceLengthDistribution()
        label_config = LabelConfiguration(
            [Label("l1", [1, 2]), Label("l2", [0, 1])])

        process = TrainMLModelInstruction(
            dataset, GridSearch(hp_settings), hp_settings,
            SplitConfig(SplitType.RANDOM,
                        1,
                        0.5,
                        reports=ReportConfig(data_splits={"seqlen": report})),
            SplitConfig(SplitType.RANDOM,
                        1,
                        0.5,
                        reports=ReportConfig(data_splits={"seqlen": report})),
            {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, label_config,
            path)

        state = process.run(result_path=path)

        self.assertTrue(isinstance(state, TrainMLModelState))
        self.assertEqual(1, len(state.assessment_states))
        self.assertTrue("l1" in state.assessment_states[0].label_states)
        self.assertTrue("l2" in state.assessment_states[0].label_states)

        shutil.rmtree(path)