Example #1
    def import_dataset(params, name: str) -> SequenceDataset:
        """
        Returns a randomly generated sequence dataset according to the parameters.

        YAML specification:

            result_path: path/where/to/store/results/
            sequence_count: 100 # number of random sequences to generate
            length_probabilities:
                14: 0.8 # 80% of all generated sequences will have length 14
                15: 0.2 # 20% of all generated sequences will have length 15
            labels:
                epitope1: # label name
                    True: 0.5 # 50% of the sequences will have class True
                    False: 0.5 # 50% of the sequences will have class False
                epitope2: # next label; its classes are assigned to sequences independently of the previous label
                    1: 0.3 # 30% of the generated sequences will have class 1
                    0: 0.7 # 70% of the generated sequences will have class 0

        """
        valid_keys = [
            "sequence_count", "length_probabilities", "labels", "result_path"
        ]
        ParameterValidator.assert_all_in_valid_list(
            list(params.keys()), valid_keys, "RandomSequenceDatasetImport",
            "params")

        return RandomDatasetGenerator.generate_sequence_dataset(
            sequence_count=params["sequence_count"],
            length_probabilities=params["length_probabilities"],
            labels=params["labels"],
            path=params["result_path"])
Example #2
    def test_generate(self):
        path = EnvironmentSettings.tmp_test_path + "relevant_sequence_exporter/"
        PathBuilder.build(path)

        df = pd.DataFrame({
            "v_genes": ["TRBV1-1", "TRBV1-1"],
            'j_genes': ["TRBJ1-1", "TRBJ1-2"],
            "sequence_aas": ['ACCF', "EEFG"]
        })
        df.to_csv(path + 'sequences.csv', index=False)

        dataset = RandomDatasetGenerator.generate_repertoire_dataset(
            2, {2: 1}, {4: 1}, {}, path + "data/")
        dataset.encoded_data = EncodedData(
            examples=None,
            info={'relevant_sequence_path': path + 'sequences.csv'},
            encoding="SequenceAbundanceEncoder")

        report_result = RelevantSequenceExporter(dataset, path + "result/",
                                                 'somename').generate_report()

        self.assertEqual(1, len(report_result.output_tables))
        self.assertTrue(os.path.isfile(report_result.output_tables[0].path))

        self.assertTrue(
            all(col in ["v_call", "j_call", "cdr3_aa"] for col in pd.read_csv(
                report_result.output_tables[0].path).columns))

        shutil.rmtree(path)
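The column assertion above implies a rename from the input columns to AIRR-style names; a sketch of that mapping, assuming the exporter performs a plain column rename (this is inferred from the assertions, not taken from the exporter's source):

    # Assumed mapping implied by the assertions above.
    AIRR_COLUMN_MAP = {"v_genes": "v_call", "j_genes": "j_call", "sequence_aas": "cdr3_aa"}
    renamed_df = df.rename(columns=AIRR_COLUMN_MAP)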
Example #3
    def test_run(self):
        path = PathBuilder.build(
            f"{EnvironmentSettings.tmp_test_path}subsampling/")
        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            200,
            labels={"epitope": {
                "A": 0.5,
                "B": 0.5
            }},
            path=path,
            chain_1_length_probabilities={3: 1},
            chain_2_length_probabilities={4: 1})
        dataset.name = "d1"

        inst = SubsamplingInstruction(dataset=dataset,
                                      subsampled_dataset_sizes=[100, 50],
                                      dataset_export_formats=[PickleExporter],
                                      name="subsampling_inst")

        state = inst.run(path + "result/")

        self.assertEqual(2, len(state.subsampled_datasets))
        self.assertEqual(100, state.subsampled_datasets[0].get_example_count())
        self.assertEqual(50, state.subsampled_datasets[1].get_example_count())

        self.assertTrue(
            all(
                os.path.isfile(state.subsampled_dataset_paths[name]['pickle'])
                for name in
                [dataset.name for dataset in state.subsampled_datasets]))

        shutil.rmtree(path)
Example #4
    def test_generate(self):
        path = PathBuilder.build(f"{EnvironmentSettings.tmp_test_path}kernel_sequence_logo/")
        dataset = RandomDatasetGenerator.generate_receptor_dataset(receptor_count=500, chain_1_length_probabilities={4: 1},
                                                                   chain_2_length_probabilities={4: 1},
                                                                   labels={"CMV": {True: 0.5, False: 0.5}}, path=path + "dataset/")
        enc_dataset = OneHotReceptorEncoder(True, 1, False, "enc1").encode(dataset, EncoderParams(path + "result/",
                                                                                                  LabelConfiguration([Label("CMV", [True, False])])))
        cnn = ReceptorCNN(kernel_count=2, kernel_size=[3], positional_channels=3, sequence_type="amino_acid", device="cpu",
                          number_of_threads=4, random_seed=1, learning_rate=0.01, iteration_count=10, l1_weight_decay=0.1, evaluate_at=5,
                          batch_size=100, training_percentage=0.8, l2_weight_decay=0.0)
        cnn.fit(enc_dataset.encoded_data, "CMV")

        report = KernelSequenceLogo(method=cnn, result_path=path + "logos/")
        report.generate_report()

        self.assertTrue(os.path.isfile(f"{path}logos/alpha_kernel_3_1.png"))
        self.assertTrue(os.path.isfile(f"{path}logos/alpha_kernel_3_2.png"))
        self.assertTrue(os.path.isfile(f"{path}logos/beta_kernel_3_1.png"))
        self.assertTrue(os.path.isfile(f"{path}logos/beta_kernel_3_2.png"))
        self.assertTrue(os.path.isfile(f"{path}logos/alpha_kernel_3_1.csv"))
        self.assertTrue(os.path.isfile(f"{path}logos/alpha_kernel_3_2.csv"))
        self.assertTrue(os.path.isfile(f"{path}logos/beta_kernel_3_1.csv"))
        self.assertTrue(os.path.isfile(f"{path}logos/beta_kernel_3_2.csv"))
        self.assertTrue(os.path.isfile(f"{path}logos/fully_connected_layer_weights.csv"))
        self.assertTrue(os.path.isfile(f"{path}logos/fully_connected_layer_weights.html"))

        shutil.rmtree(path)
Example #5
    def test_run(self):
        path = PathBuilder.build(
            f"{EnvironmentSettings.tmp_test_path}dataset_export_instruction/")
        dataset = RandomDatasetGenerator.generate_repertoire_dataset(
            10, {10: 1}, {12: 1}, {}, path)
        dataset.name = "d1"
        instruction = DatasetExportInstruction(datasets=[dataset],
                                               exporters=[AIRRExporter])

        result_path = f"{path}generated/"
        state = instruction.run(result_path=result_path)

        self.assertTrue(isinstance(state, DatasetExportState))
        self.assertEqual(1, len(state.datasets))
        self.assertEqual(1, len(state.formats))
        self.assertEqual("AIRR", state.formats[0])

        self.assertTrue(os.path.isdir(result_path))
        self.assertEqual(1, len(list(glob(f"{state.result_path}*/"))))
        self.assertEqual(
            1, len(list(glob(f"{state.result_path}{dataset.name}/*/"))))
        self.assertTrue(
            os.path.isdir(f"{state.result_path}{dataset.name}/AIRR/"))
        self.assertTrue(
            os.path.isfile(
                f"{state.result_path}{dataset.name}/AIRR/metadata.csv"))
        self.assertEqual(
            10,
            len(
                list(
                    glob(
                        f"{state.result_path}{dataset.name}/AIRR/repertoires/*"
                    ))))

        shutil.rmtree(path)
Example #6
    def test__split_repertoire_dataset(self):
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path + "manual_splitter/")
        dataset = RandomDatasetGenerator.generate_repertoire_dataset(10, {4: 1}, {3: 1}, {}, path)

        train_metadata = pd.DataFrame({"subject_id": ["rep_1", "rep_2", "rep_4", "rep_5", "rep_9", "rep_7"]})
        train_metadata.to_csv(path + "train.csv")

        test_metadata = pd.DataFrame({"subject_id": ["rep_0", "rep_3", "rep_6", "rep_8"]})
        test_metadata.to_csv(path + "test.csv")

        train_datasets, test_datasets = ManualSplitter._split_repertoire_dataset(
            DataSplitterParams(dataset, SplitType.MANUAL, split_count=1, paths=[path + 'result/'],
                               split_config=SplitConfig(manual_config=ManualSplitConfig(path + "train.csv",
                                                                                        path + "test.csv"),
                                                        split_count=1, split_strategy=SplitType.MANUAL)))

        self.assertEqual(1, len(train_datasets))
        self.assertEqual(1, len(test_datasets))
        self.assertEqual(6, train_datasets[0].get_example_count())
        self.assertEqual(4, test_datasets[0].get_example_count())
        self.assertTrue(all(subject_id in ["rep_1", "rep_2", "rep_4", "rep_5", "rep_9", "rep_7"]
                            for subject_id in train_datasets[0].get_metadata(["subject_id"])["subject_id"]))
        self.assertTrue(all(subject_id in ["rep_0", "rep_3", "rep_6", "rep_8"]
                            for subject_id in test_datasets[0].get_metadata(["subject_id"])["subject_id"]))
        self.assertTrue(os.path.isfile(train_datasets[0].metadata_file))
        self.assertTrue(os.path.isfile(test_datasets[0].metadata_file))

        shutil.rmtree(path)
Example #7
    def import_dataset(params: dict, dataset_name: str) -> RepertoireDataset:
        valid_keys = ["result_path", "repertoire_count", "sequence_count_probabilities", "sequence_length_probabilities", "labels"]
        ParameterValidator.assert_all_in_valid_list(list(params.keys()), valid_keys, "RandomRepertoireDatasetImport", "params")

        return RandomDatasetGenerator.generate_repertoire_dataset(repertoire_count=params["repertoire_count"],
                                                                  sequence_count_probabilities=params["sequence_count_probabilities"],
                                                                  sequence_length_probabilities=params["sequence_length_probabilities"],
                                                                  labels=params["labels"],
                                                                  path=params["result_path"])
Example #8
    def test_run(self):

        path = EnvironmentSettings.tmp_test_path + "mlapplicationtest/"
        PathBuilder.build(path)

        dataset = RandomDatasetGenerator.generate_repertoire_dataset(
            50, {5: 1}, {5: 1}, {"l1": {
                1: 0.5,
                2: 0.5
            }}, path + 'dataset/')
        ml_method = LogisticRegression()
        encoder = KmerFreqRepertoireEncoder(
            NormalizationType.RELATIVE_FREQUENCY,
            ReadsType.UNIQUE,
            SequenceEncodingType.CONTINUOUS_KMER,
            3,
            scale_to_zero_mean=True,
            scale_to_unit_variance=True)
        label_config = LabelConfiguration([Label("l1", [1, 2])])

        enc_dataset = encoder.encode(
            dataset,
            EncoderParams(result_path=path,
                          label_config=label_config,
                          filename="tmp_enc_dataset.pickle",
                          pool_size=4))
        ml_method.fit(enc_dataset.encoded_data, 'l1')

        hp_setting = HPSetting(
            encoder, {
                "normalization_type": "relative_frequency",
                "reads": "unique",
                "sequence_encoding": "continuous_kmer",
                "k": 3,
                "scale_to_zero_mean": True,
                "scale_to_unit_variance": True
            }, ml_method, {}, [], 'enc1', 'ml1')

        PathBuilder.build(path + 'result/instr1/')
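        # Copy the artefacts the encoder wrote to `path` (the fitted vectorizer
        # and scaler) into the instruction's folder; the instruction presumably
        # reloads them from there when re-encoding the dataset.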
        shutil.copy(path + 'dict_vectorizer.pickle',
                    path + 'result/instr1/dict_vectorizer.pickle')
        shutil.copy(path + 'scaler.pickle',
                    path + 'result/instr1/scaler.pickle')

        ml_app = MLApplicationInstruction(dataset, label_config, hp_setting, 4,
                                          "instr1", False)
        ml_app.run(path + 'result/')

        predictions_path = path + "result/instr1/predictions.csv"
        self.assertTrue(os.path.isfile(predictions_path))

        df = pd.read_csv(predictions_path)
        self.assertEqual(50, df.shape[0])

        shutil.rmtree(path)
Example #9
    def test_encode(self):
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path + "atchley_kmer_encoding/")
        dataset = RandomDatasetGenerator.generate_repertoire_dataset(3, {1: 1}, {4: 1}, {"l1": {True: 0.4, False: 0.6}}, path + "dataset/")

        encoder = AtchleyKmerEncoder.build_object(dataset, **{"k": 2, "skip_first_n_aa": 1, "skip_last_n_aa": 1, "abundance": "RELATIVE_ABUNDANCE",
                                                              "normalize_all_features": False})
        encoded_dataset = encoder.encode(dataset, EncoderParams(path + "result/", LabelConfiguration(labels=[Label("l1")])))

        self.assertEqual((3, 11, 3), encoded_dataset.encoded_data.examples.shape)
        self.assertEqual(0., encoded_dataset.encoded_data.examples[0, -1, 0])

        shutil.rmtree(path)
Example #10
    def test_export_receptor_dataset(self):
        path = EnvironmentSettings.tmp_test_path + "pickleexporter_receptor/"
        PathBuilder.build(path)

        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            10, {2: 1}, {3: 1}, {}, path)
        dataset.name = "d1"
        PickleExporter.export(dataset, path)

        with open(f"{path}/{dataset.name}.iml_dataset", "rb") as file:
            dataset2 = pickle.load(file)

        self.assertTrue(isinstance(dataset2, ReceptorDataset))
        self.assertEqual(10, dataset2.get_example_count())

        shutil.rmtree(path)
Example #11
    def test_generate_sequence_dataset(self):

        path = f"{EnvironmentSettings.tmp_test_path}random_sequence_dataset_generation/"

        dataset = RandomDatasetGenerator.generate_sequence_dataset(sequence_count=100,
                                                                   length_probabilities={4: 0.5, 5: 0.5},
                                                                   labels={"HLA": {"A": 0.5, "B": 0.5}},
                                                                   path=path)

        self.assertEqual(SequenceDataset, type(dataset))
        self.assertEqual(100, dataset.get_example_count())

        for sequence in dataset.get_data():
            self.assertTrue(len(sequence.amino_acid_sequence) in [4, 5])
            self.assertTrue(sequence.get_attribute("HLA") in ["A", "B"])

        shutil.rmtree(path)
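Beyond the per-sequence checks above, a sanity check on the label balance could look like this sketch, reusing get_data and get_attribute from the test:

    from collections import Counter

    label_counts = Counter(seq.get_attribute("HLA") for seq in dataset.get_data())
    # with 100 sequences drawn at 0.5/0.5, both classes should be present
    assert set(label_counts) == {"A", "B"}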
Example #12
    def test_parse(self):

        path = PathBuilder.build(
            f'{EnvironmentSettings.tmp_test_path}subsampling_parser/')
        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            30, {3: 1}, {2: 1}, {}, path)

        symbol_table = SymbolTable()
        symbol_table.add("d1", SymbolType.DATASET, dataset)

        SubsamplingParser().parse(
            'inst1', {
                'dataset': 'd1',
                'type': 'Subsampling',
                'subsampled_dataset_sizes': [10, 20],
                'dataset_export_formats': ['Pickle']
            }, symbol_table)

        with self.assertRaises(AssertionError):
            SubsamplingParser().parse(
                'inst1', {
                    'dataset': 'd1',
                    'type': 'Subsampling',
                    'subsampled_dataset_sizes': [10, 50],
                    'dataset_export_formats': ['Pickle']
                }, symbol_table)

        with self.assertRaises(AssertionError):
            SubsamplingParser().parse(
                'inst1', {
                    'dataset': 'd2',
                    'type': 'Subsampling',
                    'subsampled_dataset_sizes': [10, 20],
                    'dataset_export_formats': ['Pickle']
                }, symbol_table)

        with self.assertRaises(AssertionError):
            SubsamplingParser().parse(
                'inst1', {
                    'dataset': 'd2',
                    'type': 'Subsampling',
                    'subsampled_dataset_sizes': [10, 20],
                    'dataset_export_formats': ['Random']
                }, symbol_table)

        shutil.rmtree(path)
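The first, passing parse call above corresponds to an instruction block like the following in an immuneML YAML specification (a sketch; it assumes d1 is a dataset key defined under definitions):

    instructions:
        inst1:
            type: Subsampling
            dataset: d1
            subsampled_dataset_sizes: [10, 20]
            dataset_export_formats: [Pickle]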
Example #13
    def test_generate_receptor_dataset(self):

        path = f"{EnvironmentSettings.tmp_test_path}random_receptor_dataset_generation/"

        dataset = RandomDatasetGenerator.generate_receptor_dataset(receptor_count=100,
                                                                   chain_1_length_probabilities={4: 0.5, 5: 0.5},
                                                                   chain_2_length_probabilities={4: 0.5, 5: 0.5},
                                                                   labels={"HLA": {"A": 0.5, "B": 0.5}},
                                                                   path=path)

        self.assertEqual(ReceptorDataset, type(dataset))

        self.assertEqual(100, dataset.get_example_count())
        for receptor in dataset.get_data():
            self.assertTrue(all(len(sequence_aa) in [4, 5]
                                for sequence_aa in [receptor.alpha.amino_acid_sequence, receptor.beta.amino_acid_sequence]))
            self.assertTrue(receptor.metadata["HLA"] in ["A", "B"])

        shutil.rmtree(path)
Example #14
    def test_generate_repertoire_dataset(self):

        path = f"{EnvironmentSettings.tmp_test_path}random_repertoire_dataset_generation/"

        dataset = RandomDatasetGenerator.generate_repertoire_dataset(repertoire_count=100,
                                                                     sequence_count_probabilities={5: 0.5, 6: 0.5},
                                                                     sequence_length_probabilities={4: 0.5, 5: 0.5},
                                                                     labels={"HLA": {"A": 0.5, "B": 0.5}},
                                                                     path=path)

        self.assertEqual(RepertoireDataset, type(dataset))

        self.assertEqual(100, dataset.get_example_count())
        for repertoire in dataset.repertoires:
            self.assertTrue(repertoire.get_element_count() in [5, 6])
            self.assertTrue(all(len(sequence_aa) in [4, 5] for sequence_aa in repertoire.get_sequence_aas().tolist()))
            self.assertTrue(repertoire.metadata["HLA"] in ["A", "B"])

        shutil.rmtree(path)
Example #15
    def test_run_with_receptors(self):

        path = PathBuilder.build(EnvironmentSettings.root_path + "test/tmp/signalImplanter_receptor/")

        dataset = RandomDatasetGenerator.generate_receptor_dataset(100, {10: 1}, {12: 1}, {}, path + "dataset/")
        motif1 = Motif(identifier="motif1", instantiation=GappedKmerInstantiation(), seed_chain1="AAA", name_chain1=Chain.ALPHA, seed_chain2="CCC",
                       name_chain2=Chain.BETA)
        signal1 = Signal(identifier="signal1", motifs=[motif1], implanting_strategy=ReceptorImplanting(GappedMotifImplanting()))

        simulation = Simulation([Implanting(dataset_implanting_rate=0.5, signals=[signal1])])

        sim_state = SimulationState(dataset=dataset, result_path=path, simulation=simulation, signals=[signal1])

        new_dataset = SignalImplanter.run(sim_state)

        self.assertEqual(100, new_dataset.get_example_count())
        self.assertEqual(50, len([receptor for receptor in new_dataset.get_data(40) if receptor.metadata["signal_signal1"] is True]))

        shutil.rmtree(path)
Example #16
    def test_generate(self):
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path +
                                 "gliph2_export/")
        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            10, {3: 1}, {2: 1}, {"epitope": {
                "ep1": 0.4,
                "ep2": 0.6
            }}, path)
        report_result = GLIPH2Exporter(dataset, path + "result/", "somename",
                                       "epitope").generate_report()

        self.assertEqual(1, len(report_result.output_tables))
        self.assertTrue(os.path.isfile(report_result.output_tables[0].path))

        df = pd.read_csv(report_result.output_tables[0].path, sep="\t")
        self.assertTrue(
            all(col in [
                "CDR3b", "TRBV", "TRBJ", "CDR3a", "subject:condition", "count"
            ] for col in df.columns))
        self.assertEqual(10, df.shape[0])

        shutil.rmtree(path)
Example #17
    def test_fit(self):
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path +
                                 "kmermil/")

        repertoire_count = 10
        dataset = RandomDatasetGenerator.generate_repertoire_dataset(
            repertoire_count=repertoire_count,
            sequence_count_probabilities={2: 1},
            sequence_length_probabilities={4: 1},
            labels={"l1": {
                True: 0.5,
                False: 0.5
            }},
            path=path + "dataset/")
        enc_dataset = AtchleyKmerEncoder(
            2, 1, 1, 'relative_abundance', False).encode(
                dataset,
                EncoderParams(path + "result/",
                              LabelConfiguration([Label("l1",
                                                        [True, False])])))
        cls = AtchleyKmerMILClassifier(iteration_count=10,
                                       threshold=-0.0001,
                                       evaluate_at=2,
                                       use_early_stopping=False,
                                       random_seed=1,
                                       learning_rate=0.01,
                                       zero_abundance_weight_init=True,
                                       number_of_threads=8)
        cls.fit(enc_dataset.encoded_data, "l1")

        predictions = cls.predict(enc_dataset.encoded_data, "l1")
        self.assertEqual(repertoire_count, len(predictions["l1"]))
        self.assertEqual(
            repertoire_count,
            len([pred for pred in predictions["l1"]
                 if isinstance(pred, bool)]))

        predictions_proba = cls.predict_proba(enc_dataset.encoded_data, "l1")
        self.assertEqual(repertoire_count,
                         np.rint(np.sum(predictions_proba["l1"])))
        self.assertEqual(repertoire_count, predictions_proba["l1"].shape[0])

        cls.store(path + "model_storage/",
                  feature_names=enc_dataset.encoded_data.feature_names)

        cls2 = AtchleyKmerMILClassifier(iteration_count=10,
                                        threshold=-0.0001,
                                        evaluate_at=2,
                                        use_early_stopping=False,
                                        random_seed=1,
                                        learning_rate=0.01,
                                        zero_abundance_weight_init=True,
                                        number_of_threads=8)
        cls2.load(path + "model_storage/")

        cls2_vars = vars(cls2)
        del cls2_vars["logistic_regression"]
        cls_vars = vars(cls)
        del cls_vars["logistic_regression"]

        for item, value in cls_vars.items():
            if not isinstance(value, np.ndarray):
                loaded_value = cls2_vars[item]
                self.assertEqual(value, loaded_value)

        model = cls.get_model("l1")
        self.assertEqual(vars(cls), model)

        shutil.rmtree(path)
Example #18
    def test_fit(self):
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path + "cnn/")

        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            receptor_count=500,
            chain_1_length_probabilities={4: 1},
            chain_2_length_probabilities={4: 1},
            labels={"CMV": {
                True: 0.5,
                False: 0.5
            }},
            path=path + "dataset/")
        enc_dataset = OneHotReceptorEncoder(True, 1, False, "enc1").encode(
            dataset,
            EncoderParams(path + "result/",
                          LabelConfiguration([Label("CMV", [True, False])])))
        cnn = ReceptorCNN(kernel_count=2,
                          kernel_size=[3],
                          positional_channels=3,
                          sequence_type="amino_acid",
                          device="cpu",
                          number_of_threads=4,
                          random_seed=1,
                          learning_rate=0.01,
                          iteration_count=10,
                          l1_weight_decay=0.1,
                          evaluate_at=5,
                          batch_size=100,
                          training_percentage=0.8,
                          l2_weight_decay=0.0)
        cnn.fit(encoded_data=enc_dataset.encoded_data, label_name="CMV")

        predictions = cnn.predict(enc_dataset.encoded_data, "CMV")
        self.assertEqual(500, len(predictions["CMV"]))
        self.assertEqual(
            500,
            len([
                pred for pred in predictions["CMV"] if isinstance(pred, bool)
            ]))

        predictions_proba = cnn.predict_proba(enc_dataset.encoded_data, "CMV")
        self.assertEqual(500, np.rint(np.sum(predictions_proba["CMV"])))
        self.assertEqual(500, predictions_proba["CMV"].shape[0])

        cnn.store(path + "model_storage/")

        cnn2 = ReceptorCNN(sequence_type="amino_acid")
        cnn2.load(path + "model_storage/")

        cnn2_vars = vars(cnn2)
        del cnn2_vars["CNN"]
        cnn_vars = vars(cnn)
        del cnn_vars["CNN"]

        for item, value in cnn_vars.items():
            if not isinstance(value, np.ndarray):
                self.assertEqual(value, cnn2_vars[item])

        model = cnn.get_model(["CMV"])
        self.assertEqual(vars(cnn), model)

        shutil.rmtree(path)
Example #19
    def test_run(self):

        path = PathBuilder.build(f"{EnvironmentSettings.tmp_test_path}api_galaxy_yaml_tool/")
        result_path = f"{path}result/"

        dataset = RandomDatasetGenerator.generate_repertoire_dataset(10, {10: 1}, {12: 1}, {}, result_path)
        dataset.name = "d1"
        PickleExporter.export(dataset, result_path)

        specs = {
            "definitions": {
                "datasets": {
                    "new_d1": {
                        "format": "Pickle",
                        "params": {
                            "metadata_file": f"{result_path}d1_metadata.csv"
                        }
                    },
                    "d2": {
                        "format": "RandomRepertoireDataset",
                        "params": {
                            "repertoire_count": 50,
                            "sequence_length_probabilities": {10: 1},
                            'sequence_count_probabilities': {10: 1},
                            'labels': {
                                "CD": {
                                    True: 0.5,
                                    False: 0.5
                                }
                            }
                        }
                    }
                },
                "encodings": {
                    "e1": {
                        "Word2Vec": {
                            "k": 3,
                            "model_type": "sequence",
                            "vector_size": 8,
                        }
                    },
                    "e2": {
                        "Word2Vec": {
                            "k": 3,
                            "model_type": "sequence",
                            "vector_size": 10,
                        }
                    },
                },
                "ml_methods": {
                    "simpleLR": {
                        "LogisticRegression": {
                            "penalty": "l1"
                        },
                        "model_selection_cv": False,
                        "model_selection_n_folds": -1,
                    }
                },
            },
            "instructions": {
                "inst1": {
                    "type": "DatasetExport",
                    "datasets": ["new_d1", 'd2'],
                    "export_formats": ["AIRR"]
                },
                "inst2": {
                    "type": "TrainMLModel",
                    "settings": [
                        {
                            "encoding": "e1",
                            "ml_method": "simpleLR"
                        },
                        {
                            "encoding": "e2",
                            "ml_method": "simpleLR"
                        }
                    ],
                    "assessment": {
                        "split_strategy": "random",
                        "split_count": 1,
                        "training_percentage": 0.7
                    },
                    "selection": {
                        "split_strategy": "random",
                        "split_count": 2,
                        "training_percentage": 0.7
                    },
                    "labels": ["CD"],
                    "dataset": "d2",
                    "strategy": "GridSearch",
                    "metrics": ["accuracy", "auc"],
                    "reports": [],
                    "number_of_processes": 10,
                    "optimization_metric": "accuracy",
                    'refit_optimal_model': False,
                    "store_encoded_data": False
                }
            }
        }

        specs_path = f"{path}specs.yaml"
        with open(specs_path, "w") as file:
            yaml.dump(specs, file)

        run_immuneML(Namespace(**{"specification_path": specs_path, "result_path": result_path + 'result/', 'tool': "GalaxyYamlTool"}))

        self.assertTrue(os.path.exists(f"{result_path}result/inst1/new_d1/AIRR"))
        self.assertTrue(os.path.exists(f"{result_path}result/inst1/d2/AIRR"))
        self.assertTrue(os.path.exists(f"{result_path}result/d2"))

        shutil.rmtree(path)