Example #1
    def create_dataset(self):
        path = Path(
            os.path.relpath(EnvironmentSettings.root_path /
                            "test/tmp/immunemlapp/initial_dataset"))
        PathBuilder.build(path)

        repertoire_count = 30
        repertoires, metadata = RepertoireBuilder.build(
            [["AA", "AAAA", "AAAA", "AAA"] for _ in range(repertoire_count)],
            path,
            {"CD": ["yes" if i % 2 == 0 else "no" for i in range(repertoire_count)],
             "CMV": [i % 2 == 1 for i in range(repertoire_count)]},
            [[{"chain": "A" if i % 2 == 0 else "B", "count": random.randint(2, 5)} for i in range(4)]
             for _ in range(repertoire_count)])

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    labels={
                                        "CD": ["yes", "no"],
                                        "CMV": [True, False]
                                    },
                                    name="d1")
        PickleExporter.export(dataset, path)

        return path / "d1.iml_dataset"
Example #2
    def make_random_dataset(self, path):
        alphabet = EnvironmentSettings.get_sequence_alphabet()
        sequences = [["".join(rn.choice(alphabet) for _ in range(20)) for _ in range(100)] for _ in range(40)]

        repertoires, metadata = RepertoireBuilder.build(sequences, path, subject_ids=[i % 2 for i in range(len(sequences))])
        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
        PickleExporter.export(dataset, path)
Example #3
    def create_dummy_dataset(self, path):
        repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path, labels={"label1": ["val1", "val2"], "label2": ["val1", "val2"]})

        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
        dataset.name = "my_dataset"
        PickleExporter.export(dataset, path)

        return f"{dataset.name}.iml_dataset"
Example #4
    def test_build(self):
        path = EnvironmentSettings.root_path / "test/tmp/repbuilder/"
        repertoires, metadata = RepertoireBuilder.build(
            [["AAA", "CCC"], ["TTTT"]], path, {"default": [1, 2]})

        self.assertEqual(2, len(repertoires))
        self.assertEqual((2, 4), pd.read_csv(metadata).shape)

        self.assertEqual(2, len(repertoires[0].sequences))
        self.assertTrue(
            all([
                isinstance(seq, ReceptorSequence)
                for seq in repertoires[0].sequences
            ]))
        self.assertEqual(1, repertoires[0].metadata["default"])

        self.assertEqual(1, len(repertoires[1].sequences))
        self.assertTrue(
            all([
                isinstance(seq, ReceptorSequence)
                for seq in repertoires[1].sequences
            ]))
        self.assertEqual(2, repertoires[1].metadata["default"])
        self.assertEqual("rep_1", repertoires[1].metadata["subject_id"])

        # Testing with custom metadata
        repertoires, metadata = RepertoireBuilder.build([["AAA", "CCC"]],
                                                        path,
                                                        seq_metadata=[[{
                                                            "v_gene":
                                                            "v5",
                                                            "j_gene":
                                                            "j5"
                                                        }, {
                                                            "v_gene":
                                                            "v2",
                                                            "j_gene":
                                                            "j2"
                                                        }]])

        self.assertEqual(repertoires[0].sequences[0].metadata.v_gene, "v5")
        self.assertEqual(repertoires[0].sequences[0].metadata.j_gene, "j5")
        self.assertEqual(repertoires[0].sequences[1].metadata.v_gene, "v2")
        self.assertEqual(repertoires[0].sequences[1].metadata.j_gene, "j2")

        shutil.rmtree(path)
Example #5
    def test_run(self):
        path = EnvironmentSettings.root_path / "test/tmp/mlmethodassessment/"
        PathBuilder.build(path)
        dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(
            [["AA"], ["CC"]] * 6, path)[0])
        dataset.encoded_data = EncodedData(
            examples=np.array([[1, 1], [1, 1], [3, 3]] * 4),
            labels={
                "l1": [1, 1, 3] * 4,
                "l2": [1, 2, 3] * 4
            })

        label_config = LabelConfiguration()
        label_config.add_label("l1", [1, 3])

        label = Label(name='l1', values=[1, 3])

        method1 = LogisticRegression()
        method1.fit(dataset.encoded_data, label=label)

        res = MLMethodAssessment.run(
            MLMethodAssessmentParams(
                dataset=dataset,
                method=method1,
                metrics={
                    Metric.ACCURACY, Metric.BALANCED_ACCURACY, Metric.F1_MACRO
                },
                optimization_metric=Metric.LOG_LOSS,
                predictions_path=path / "predictions.csv",
                label=label,
                ml_score_path=path / "ml_score.csv",
                split_index=1,
                path=path))

        self.assertTrue(isinstance(res, dict))
        self.assertTrue(res[Metric.LOG_LOSS.name.lower()] <= 0.1)

        self.assertTrue(os.path.isfile(path / "ml_score.csv"))

        df = pd.read_csv(path / "ml_score.csv")
        self.assertEqual(1, df.shape[0])

        df = pd.read_csv(path / "predictions.csv")
        self.assertEqual(12, df.shape[0])

        shutil.rmtree(path)
Example #6
    def test_run(self):

        path = EnvironmentSettings.root_path / "test/tmp/smmodel/"
        PathBuilder.build(path)
        repertoires, metadata = RepertoireBuilder.build(
            [["AAA", "CCC"] if i % 2 == 0 else ["TTTT"] for i in range(32)],
            path, {"default": [1, 2] * 16})
        dataset = RepertoireDataset(repertoires=repertoires,
                                    labels={"default": [1, 2]},
                                    metadata_file=metadata)

        label_config = LabelConfiguration()
        label_config.add_label("default", [1, 2])

        enc_params = {
            "vector_size": 8,
            "model_type": ModelType.SEQUENCE.name,
            "k": 3
        }
        hp_settings = [
            HPSetting(Word2VecEncoder.build_object(dataset, **enc_params),
                      enc_params, LogisticRegression(), {
                          "model_selection_cv": False,
                          "model_selection_n_folds": -1
                      }, [])
        ]

        split_config_assessment = SplitConfig(SplitType.RANDOM, 1, 0.5,
                                              ReportConfig())
        split_config_selection = SplitConfig(SplitType.RANDOM, 1, 0.5,
                                             ReportConfig())

        instruction = TrainMLModelInstruction(
            dataset, GridSearch(hp_settings), hp_settings,
            split_config_assessment, split_config_selection,
            {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, label_config,
            path)
        semantic_model = SemanticModel([instruction], path)

        semantic_model.run()

        shutil.rmtree(path)
Example #7
    def create_dataset(self, path: str) -> RepertoireDataset:
        repertoires, metadata = RepertoireBuilder.build(
            [["A", "B"], ["B", "C"], ["D"], ["E", "F"], ["A", "B"], ["B", "C"],
             ["D"], ["E", "F"]], path, {
                "l1": [1, 0, 1, 0, 1, 0, 1, 0],
                "l2": [2, 3, 2, 3, 2, 3, 3, 3]
            })
        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata)
        return dataset
Example #8
    def test_get_metadata_fields(self):

        path = EnvironmentSettings.tmp_test_path / "repertoire_dataset/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build([["AA"], ["BB"]], path, {"l1": [1, 2], "hla": ["A", "B"]}, subject_ids=["d1", "d2"])
        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)

        self.assertTrue("l1" in dataset.get_metadata_fields())
        self.assertTrue("hla" in dataset.get_metadata_fields())
        self.assertTrue("subject_id" in dataset.get_metadata_fields())

        shutil.rmtree(path)
Example #9
    def _build_test_dataset(self, path):
        repertoires, metadata = RepertoireBuilder.build(
            [["GGG", "III", "LLL", "MMM"],
             ["DDD", "EEE", "FFF", "III", "LLL", "MMM"], ["CCC", "FFF", "MMM"],
             ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"]],
            labels={"l1": [True, True, False, False]},
            path=path)

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    identifier="1")

        return dataset
Example #10
    def create_datasets(self, path: Path):
        repertoires, metadata = RepertoireBuilder.build(
            [["A", "B"], ["B", "C"], ["D"], ["E", "F"]], path, {
                "l1": [1, 0, 1, 0],
                "l2": [2, 3, 2, 3]
            })

        main_dataset = RepertoireDataset(repertoires=repertoires,
                                         metadata_file=metadata)
        sub_dataset = main_dataset.make_subset([0, 1],
                                               path=path,
                                               dataset_type="subset")
        return main_dataset, sub_dataset
Example #11
    def create_dummy_data(self, path):
        # Setting up dummy data
        labels = {
            "subject_id": ["subject_1", "subject_2", "subject_3"],
            "label": ["yes", "yes", "no"]
        }

        base_metadata = {
            "v_gene": "TRBV1",
            "j_gene": "TRBJ1",
            "chain": Chain.BETA.value
        }

        repertoires, metadata = RepertoireBuilder.build(
            sequences=[["AAAA"], ["SSSS"], ["SSSS", "CCCC"]],
            path=path,
            labels=labels,
            seq_metadata=[[{**base_metadata, "count": 10}],
                          [{**base_metadata, "count": 10}],
                          [{**base_metadata, "count": 5},
                           {**base_metadata, "count": 5}]],
            subject_ids=labels["subject_id"])

        dataset = RepertoireDataset(repertoires=repertoires)

        label_config = LabelConfiguration()
        label_config.add_label("subject_id", labels["subject_id"])
        label_config.add_label("label", labels["label"])

        file_content = """complex.id	Gene	CDR3	V	J	Species	MHC A	MHC B	MHC class	Epitope	Epitope gene	Epitope species	Reference	Method	Meta	CDR3fix	Score
100	TRB	AAAA	TRBV1	TRBJ1	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"}	0
200	TRB	SSSS	TRBV1	TRBJ1	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"}	0"""

        with open(path / "refs.tsv", "w") as file:
            file.write(file_content)

        reference_sequences = {
            "params": {
                "path": path / "refs.tsv",
                "region_type": "FULL_SEQUENCE"
            },
            "format": "VDJdb"
        }

        return dataset, label_config, reference_sequences, labels
Example #12
    def create_dataset(self, path):
        repertoires, metadata = RepertoireBuilder.build(
            [["AAA"], ["AAAC"], ["ACA"], ["CAAA"], ["AAAC"], ["AAA"]], path, {
                "l1": [1, 1, 1, 0, 0, 0],
                "l2": [2, 3, 2, 3, 2, 3]
            })

        dataset = RepertoireDataset(repertoires=repertoires,
                                    labels={
                                        "l1": [0, 1],
                                        "l2": [2, 3]
                                    },
                                    metadata_file=metadata)
        return dataset
Example #13
    def test_export(self):
        path = EnvironmentSettings.tmp_test_path / "imlexporter/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path)
        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
        ImmuneMLExporter.export(dataset, EnvironmentSettings.tmp_test_path / "imlexporter/")

        with open(EnvironmentSettings.tmp_test_path / f"imlexporter/{dataset.name}.iml_dataset", "r") as file:
            dataset2 = yaml.safe_load(file)

        shutil.rmtree(EnvironmentSettings.tmp_test_path / "imlexporter/")

        self.assertTrue(isinstance(dataset2, dict))
        self.assertEqual('RepertoireDataset', dataset2['dataset_class'])
        self.assertEqual(dataset.identifier, dataset2['identifier'])
Example #14
    def test_process(self):
        path = EnvironmentSettings.root_path / "test/tmp/clones_per_repertoire_filter/"
        PathBuilder.build(path)
        dataset = RepertoireDataset(repertoires=RepertoireBuilder.build([["ACF", "ACF", "ACF"],
                                                                         ["ACF", "ACF"],
                                                                         ["ACF", "ACF", "ACF", "ACF"]], path)[0])

        dataset1 = ClonesPerRepertoireFilter(**{"lower_limit": 3, "result_path": path}).process_dataset(dataset, path)
        self.assertEqual(2, dataset1.get_example_count())

        dataset2 = ClonesPerRepertoireFilter(**{"upper_limit": 2, "result_path": path}).process_dataset(dataset, path)
        self.assertEqual(1, dataset2.get_example_count())

        self.assertRaises(AssertionError, ClonesPerRepertoireFilter(**{"lower_limit": 10, "result_path": path}).process_dataset, dataset, path)

        shutil.rmtree(path)
Example #15
    def test_import(self):
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "iml_import/")

        repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path)
        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)

        with open(path / "dataset.iml_dataset", "w") as file:
            dataset_dict = {key: item if not isinstance(item, Path) else str(item) for key, item in vars(dataset).items()
                            if key not in ['repertoires', 'encoded_data']}
            yaml.dump({**dataset_dict, **{"dataset_class": "RepertoireDataset"}}, file)

        dataset2 = ImmuneMLImport.import_dataset({"path": path / "dataset.iml_dataset"}, "dataset_name")

        shutil.rmtree(path)

        self.assertEqual(2, len(dataset2.get_data()))
        self.assertEqual("rep_1", dataset2.get_data()[1].metadata["subject_id"])
Example #16
    def create_dummy_data(self, path):

        # Setting up dummy data
        labels = {"subject_id": ["subject_1", "subject_1", "subject_2", "subject_2", "subject_3"],
                  "label": ["yes", "yes", "no", "no", "no"]}

        metadata_alpha = {"v_gene": "V1", "j_gene": "J1", "chain": Chain.ALPHA.value}
        metadata_beta = {"v_gene": "V1", "j_gene": "J1", "chain": Chain.BETA.value}

        repertoires, metadata = RepertoireBuilder.build(sequences=[["AAAA"],
                                                                   ["SSSS"],
                                                                   ["AAAA", "CCCC"],
                                                                   ["SSSS", "TTTT"],
                                                                   ["AAAA", "CCCC", "SSSS", "TTTT"]],
                                                        path=path, labels=labels,
                                                        seq_metadata=[[{**metadata_alpha, "count": 10}],
                                                                      [{**metadata_beta, "count": 10}],
                                                                      [{**metadata_alpha, "count": 5},
                                                                       {**metadata_alpha, "count": 5}],
                                                                      [{**metadata_beta, "count": 5},
                                                                       {**metadata_beta, "count": 5}],
                                                                      [{**metadata_alpha, "count": 1},
                                                                       {**metadata_alpha, "count": 2},
                                                                       {**metadata_beta, "count": 1},
                                                                       {**metadata_beta, "count": 2}]],
                                                        subject_ids=labels["subject_id"])

        dataset = RepertoireDataset(repertoires=repertoires)

        label_config = LabelConfiguration()
        label_config.add_label("subject_id", labels["subject_id"])
        label_config.add_label("label", labels["label"])

        # clonotype 100 with TRA=AAAA, TRB = SSSS; clonotype 200 with TRA=CCCC, TRB = TTTT
        file_content = """Cell type	Clonotype ID	Chain: TRA (1)	TRA - V gene (1)	TRA - D gene (1)	TRA - J gene (1)	Chain: TRA (2)	TRA - V gene (2)	TRA - D gene (2)	TRA - J gene (2)	Chain: TRB (1)	TRB - V gene (1)	TRB - D gene (1)	TRB - J gene (1)	Chain: TRB (2)	TRB - V gene (2)	TRB - D gene (2)	TRB - J gene (2)	Cells pr. clonotype	Clonotype (Id)	Clonotype (Name)
TCR_AB	100	AAAA	TRAV1		TRAJ1	null	null	null	null	SSSS	TRBV1		TRBJ1	null	null	null	null	1	1941533	3ca0cd7f-02fd-40bb-b295-7cd5d419e474(101, 102, 103, 104, 105, 108, 109, 127, 128, 130, 131, 132, 133, 134, 174)Size:1
TCR_AB	200	CCCC	TRAV1		TRAJ1	null	null	null	null	TTTT	TRBV1		TRBJ1	null	null	null	null	1	1941532	1df22bbc-8113-46b9-8913-da95fcf9a568(101, 102, 103, 104, 105, 108, 109, 127, 128, 130, 131, 132, 133, 134, 174)Size:1
"""

        with open(path / "refs.tsv", "w") as file:
            file.write(file_content)

        reference_receptors = {"params": {"path": path / "refs.tsv"}, "format": "IRIS"}

        return dataset, label_config, reference_receptors, labels
Example #17
    def test_encode(self):
        path = EnvironmentSettings.tmp_test_path / "abundance_encoder/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build(
            [["GGG", "III", "LLL", "MMM"],
             ["DDD", "EEE", "FFF", "III", "LLL", "MMM"], ["CCC", "FFF", "MMM"],
             ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"]],
            labels={"l1": [True, True, False, False]},
            path=path)

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    identifier="1")

        encoder = SequenceAbundanceEncoder.build_object(
            dataset, **{
                "comparison_attributes": ["sequence_aas"],
                "p_value_threshold": 0.4,
                "sequence_batch_size": 4,
                "repertoire_batch_size": 8
            })

        label_config = LabelConfiguration(
            [Label("l1", [True, False], positive_class=True)])

        encoded_dataset = encoder.encode(
            dataset, EncoderParams(result_path=path,
                                   label_config=label_config))

        self.assertTrue(
            np.array_equal(np.array([[1, 4], [1, 6], [0, 3], [0, 6]]),
                           encoded_dataset.encoded_data.examples))

        encoder.p_value_threshold = 0.05

        encoded_dataset = encoder.encode(
            dataset, EncoderParams(result_path=path,
                                   label_config=label_config))

        self.assertTrue(
            np.array_equal(np.array([[0, 4], [0, 6], [0, 3], [0, 6]]),
                           encoded_dataset.encoded_data.examples))

        shutil.rmtree(path)
Example #18
    def _create_state_object(self, path):
        repertoires, metadata = RepertoireBuilder.build(
            sequences=[["AAA", "CCC", "DDD"] for _ in range(34)],
            path=path,
            labels={"l1": [1, 2] * 17,
                    "l2": [0, 0, 1, 1] * 4 + [0, 1] * 9})

        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                    labels={"l1": [1, 2], "l2": [0, 1]})
        enc_params = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
        hp_settings = [HPSetting(Word2VecEncoder.build_object(dataset, **enc_params), enc_params,
                                 LogisticRegression(),
                                 {"model_selection_cv": False, "model_selection_n_folds": -1},
                                 [])]

        label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])

        process = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                          SplitConfig(SplitType.RANDOM, 1, 0.7),
                                          SplitConfig(SplitType.RANDOM, 1, 0.7),
                                          {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, label_config, path)

        state = process.run(result_path=path)

        return state
Example #19
    def test_load(self):
        path = EnvironmentSettings.root_path / "test/tmp/pathbuilder/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path)
        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata)

        with open(path / "dataset.pkl", "wb") as file:
            pickle.dump(dataset, file)

        dataset2 = PickleImport.import_dataset({"path": path / "dataset.pkl"},
                                               "dataset_name")

        shutil.rmtree(path)

        self.assertEqual(2, len(dataset2.get_data()))
        self.assertEqual("rep_1",
                         dataset2.get_data()[1].metadata["subject_id"])
Example #20
    def test_process(self):
        path = EnvironmentSettings.root_path / "test/tmp/metadata_filter/"
        PathBuilder.build(path)
        dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(
            [["ACF", "ACF", "ACF"], ["ACF", "ACF"],
             ["ACF", "ACF", "ACF", "ACF"]], path)[0])

        df = pd.DataFrame(data={"key1": [0, 1, 2], "key2": [0, 1, 2]})
        df.to_csv(path / "metadata.csv")

        dataset.metadata_file = path / "metadata.csv"

        dataset1 = MetadataRepertoireFilter(
            **{
                "criteria": {
                    "type": OperationType.GREATER_THAN.name,
                    "value": {
                        "type": DataType.COLUMN.name,
                        "name": "key2"
                    },
                    "threshold": 1
                },
                "result_path": path
            }).process_dataset(dataset, path)

        self.assertEqual(1, dataset1.get_example_count())

        self.assertRaises(
            AssertionError,
            MetadataRepertoireFilter(
                **{
                    "criteria": {
                        "type": OperationType.GREATER_THAN.name,
                        "value": {
                            "type": DataType.COLUMN.name,
                            "name": "key2"
                        },
                        "threshold": 10
                    }
                }).process_dataset, dataset, path)

        shutil.rmtree(path)
Example #21
    def create_dummy_data(self, path):

        # Setting up dummy data
        labels = {"subject_id": ["subject_1", "subject_2", "subject_3"],
                  "label": ["yes", "no", "no"]}

        metadata_alpha = {"v_gene": "V1", "j_gene": "J1", "chain": Chain.LIGHT.value}
        metadata_beta = {"v_gene": "V1", "j_gene": "J1", "chain": Chain.HEAVY.value}

        repertoires, metadata = RepertoireBuilder.build(sequences=[["XXAGQXGSSNTGKLIXX", "XXAGQXGSSNTGKLIYY", "XXSAGQGETQYXX"],
                                                                   ["ASSXRXX"],
                                                                   ["XXIXXNDYKLSXX", "CCCC", "SSSS", "TTTT"]],
                                                        path=path, labels=labels,
                                                        seq_metadata=[[{**metadata_alpha, "count": 10, "v_gene": "IGLV35"},
                                                                       {**metadata_alpha, "count": 10},
                                                                       {**metadata_beta, "count": 10, "v_gene": "IGHV29-1"}],
                                                                      [{**metadata_beta, "count": 10, "v_gene": "IGHV7-3"}],
                                                                      [{**metadata_alpha, "count": 5, "v_gene": "IGLV26-2"},
                                                                       {**metadata_alpha, "count": 2},
                                                                       {**metadata_beta, "count": 1},
                                                                       {**metadata_beta, "count": 2}]],
                                                        subject_ids=labels["subject_id"])

        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)

        label_config = LabelConfiguration()
        label_config.add_label("subject_id", labels["subject_id"])
        label_config.add_label("label", labels["label"])

        file_content = """id	IGLV	IGHV	IGL_regex	IGH_regex
1	IGLV35	IGHV29-1	AGQ.GSSNTGKLI	S[APGFTVML]GQGETQY
2		IGHV7-3		ASS.R.*
3	IGLV26-1		I..NDYKLS	
4	IGLV26-2		I..NDYKLS	
"""

        filepath = path / "reference_motifs.tsv"
        with open(filepath, "w") as file:
            file.write(file_content)

        return dataset, label_config, filepath, labels
Example #22
    def test_export(self):
        path = EnvironmentSettings.tmp_test_path / "pickleexporter/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path)
        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata)
        PickleExporter.export(
            dataset, EnvironmentSettings.tmp_test_path / "pickleexporter/")

        with open(
                EnvironmentSettings.tmp_test_path /
                f"pickleexporter/{dataset.name}.iml_dataset", "rb") as file:
            dataset2 = pickle.load(file)

        shutil.rmtree(EnvironmentSettings.tmp_test_path / "pickleexporter/")

        self.assertTrue(isinstance(dataset2, RepertoireDataset))
        self.assertEqual(2, len(dataset2.get_data()))
        self.assertEqual("rep_0",
                         dataset2.get_data()[0].metadata["subject_id"])
Example #23
    def test_encode(self):

        path = EnvironmentSettings.tmp_test_path / "count_encoder/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build(
            [["GGG", "III", "LLL", "MMM"],
             ["DDD", "EEE", "FFF", "III", "LLL", "MMM"], ["CCC", "FFF", "MMM"],
             ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"]],
            labels={"l1": [True, True, False, False]},
            path=path)

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    identifier="1")

        encoder = SequenceCountEncoder.build_object(
            dataset, **{
                "comparison_attributes": ["sequence_aas"],
                "p_value_threshold": 0.4,
                "sequence_batch_size": 4
            })

        label_config = LabelConfiguration(
            [Label("l1", [True, False], positive_class=True)])

        encoded_dataset = encoder.encode(
            dataset, EncoderParams(result_path=path,
                                   label_config=label_config))

        test = encoded_dataset.encoded_data.examples

        self.assertTrue(test[0] == 1)
        self.assertTrue(test[1] == 1)
        self.assertTrue(test[2] == 0)
        self.assertTrue(test[3] == 0)

        self.assertTrue("III" in encoded_dataset.encoded_data.feature_names)

        shutil.rmtree(path)
Example #24
    def prepare_dataset(self, path):
        PathBuilder.build(path)
        repertoires, metadata = RepertoireBuilder.build(
            sequences=[["AAA", "CCC", "DDD"] for _ in range(34)],
            path=path,
            labels={
                "l1": [1, 2] * 17,
                "l2": [0, 0, 1, 1] * 4 + [0, 1] * 9
            })

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    labels={
                                        "l1": [1, 2],
                                        "l2": [0, 1]
                                    },
                                    name="dataset1")
        PickleExporter.export(dataset, path)
Example #25
    def test_generate(self):

        path = EnvironmentSettings.tmp_test_path / "disease_assoc_seq_cv/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build(
            [["GGG", "III", "LLL", "MMM"], ["DDD", "EEE", "FFF"]] * 7,
            labels={"l1": [True, False] * 7},
            path=path)

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    labels={"l1": [True, False]})
        PickleExporter.export(dataset, path)

        specs = {
            "definitions": {
                "datasets": {
                    "d1": {
                        "format": "Pickle",
                        "params": {
                            "path": str(path / f"{dataset.name}.iml_dataset"),
                        }
                    }
                },
                "encodings": {
                    "e1": {
                        "SequenceAbundance": {
                            'p_value_threshold': 0.5
                        }
                    }
                },
                "ml_methods": {
                    "knn": {
                        "KNN": {
                            "n_neighbors": 1
                        },
                    }
                },
                "reports": {
                    "r1": {
                        "DiseaseAssociatedSequenceCVOverlap": {
                            "compare_in_selection": True,
                            "compare_in_assessment": True
                        }
                    }
                }
            },
            "instructions": {
                "inst1": {
                    "type": "TrainMLModel",
                    "settings": [{
                        "encoding": "e1",
                        "ml_method": "knn"
                    }],
                    "assessment": {
                        "split_strategy": "random",
                        "split_count": 1,
                        "training_percentage": 0.5,
                        "reports": {}
                    },
                    "selection": {
                        "split_strategy": "random",
                        "split_count": 1,
                        "training_percentage": 0.5,
                    },
                    "labels": [{
                        "l1": {
                            "positive_class": True
                        }
                    }],
                    "dataset": "d1",
                    "strategy": "GridSearch",
                    "metrics": ["accuracy"],
                    "number_of_processes": 2,
                    "reports": ["r1"],
                    "optimization_metric": "balanced_accuracy",
                    "refit_optimal_model": True,
                    "store_encoded_data": False
                }
            }
        }

        specs_file = path / "specs.yaml"
        with open(specs_file, "w") as file:
            yaml.dump(specs, file)

        app = ImmuneMLApp(specs_file, path / "result/")
        state = app.run()[0]

        self.assertEqual(1, len(state.report_results))
        self.assertTrue(len(state.report_results[0].output_figures) > 0)
        self.assertTrue(len(state.report_results[0].output_tables) > 0)

        for fig in state.report_results[0].output_figures:
            self.assertTrue(os.path.isfile(fig.path))
        for table in state.report_results[0].output_tables:
            self.assertTrue(os.path.isfile(table.path))

        shutil.rmtree(path)
Example #26
    def test_run(self):

        path = EnvironmentSettings.tmp_test_path / "hpoptimproc/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build(
            sequences=[["AAA", "CCC", "DDD"] for _ in range(34)],
            path=path,
            labels={
                "l1": [1, 2] * 17,
                "l2": [0, 0, 1, 1] * 4 + [0, 1] * 9
            })

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    labels={
                                        "l1": [1, 2],
                                        "l2": [0, 1]
                                    })
        enc1 = {
            "k": 3,
            "model_type": ModelType.SEQUENCE.name,
            "vector_size": 4
        }
        enc2 = {
            "k": 3,
            "model_type": ModelType.SEQUENCE.name,
            "vector_size": 6
        }
        hp_settings = [
            HPSetting(Word2VecEncoder.build_object(dataset, **enc1), enc1,
                      LogisticRegression(), {
                          "model_selection_cv": False,
                          "model_selection_n_folds": -1
                      }, []),
            HPSetting(
                Word2VecEncoder.build_object(dataset, **enc2), enc2, SVM(), {
                    "model_selection_cv": False,
                    "model_selection_n_folds": -1
                },
                [ClonesPerRepertoireFilter(lower_limit=-1, upper_limit=1000)])
        ]

        report = SequenceLengthDistribution()
        label_config = LabelConfiguration(
            [Label("l1", [1, 2]), Label("l2", [0, 1])])

        process = TrainMLModelInstruction(
            dataset, GridSearch(hp_settings), hp_settings,
            SplitConfig(SplitType.RANDOM,
                        1,
                        0.5,
                        reports=ReportConfig(data_splits={"seqlen": report})),
            SplitConfig(SplitType.RANDOM,
                        1,
                        0.5,
                        reports=ReportConfig(data_splits={"seqlen": report})),
            {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, label_config,
            path)

        state = process.run(result_path=path)

        self.assertTrue(isinstance(state, TrainMLModelState))
        self.assertEqual(1, len(state.assessment_states))
        self.assertTrue("l1" in state.assessment_states[0].label_states)
        self.assertTrue("l2" in state.assessment_states[0].label_states)

        shutil.rmtree(path)
Example #27
    def test_process(self):
        path = EnvironmentSettings.root_path / "test/tmp/count_per_seq_filter/"
        PathBuilder.build(path)
        dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(
            [["ACF", "ACF", "ACF"], ["ACF", "ACF"],
             ["ACF", "ACF", "ACF", "ACF"]],
            path,
            seq_metadata=[[{"count": 1}, {"count": 2}, {"count": 3}],
                          [{"count": 4}, {"count": 1}],
                          [{"count": 5}, {"count": 6}, {"count": None},
                           {"count": 1}]])[0])

        dataset1 = CountPerSequenceFilter(
            **{
                "low_count_limit": 2,
                "remove_without_count": True,
                "remove_empty_repertoires": False,
                "result_path": path,
                "batch_size": 4
            }).process_dataset(dataset, path)
        self.assertEqual(2,
                         dataset1.repertoires[0].get_sequence_aas().shape[0])

        dataset2 = CountPerSequenceFilter(
            **{
                "low_count_limit": 5,
                "remove_without_count": True,
                "remove_empty_repertoires": False,
                "result_path": path,
                "batch_size": 4
            }).process_dataset(dataset, path)
        self.assertEqual(0,
                         dataset2.repertoires[0].get_sequence_aas().shape[0])

        dataset3 = CountPerSequenceFilter(
            **{
                "low_count_limit": 0,
                "remove_without_count": True,
                "remove_empty_repertoires": False,
                "result_path": path,
                "batch_size": 4
            }).process_dataset(dataset, path)
        self.assertEqual(3,
                         dataset3.repertoires[2].get_sequence_aas().shape[0])

        dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(
            [["ACF", "ACF", "ACF"], ["ACF", "ACF"],
             ["ACF", "ACF", "ACF", "ACF"]],
            path,
            seq_metadata=[[{"count": None} for _ in range(3)],
                          [{"count": None} for _ in range(2)],
                          [{"count": None} for _ in range(4)]])[0])

        dataset4 = CountPerSequenceFilter(
            **{
                "low_count_limit": 0,
                "remove_without_count": True,
                "remove_empty_repertoires": False,
                "result_path": path,
                "batch_size": 4
            }).process_dataset(dataset, path)
        self.assertEqual(0,
                         dataset4.repertoires[0].get_sequence_aas().shape[0])
        self.assertEqual(0,
                         dataset4.repertoires[1].get_sequence_aas().shape[0])
        self.assertEqual(0,
                         dataset4.repertoires[2].get_sequence_aas().shape[0])

        self.assertRaises(
            AssertionError,
            CountPerSequenceFilter(
                **{
                    "low_count_limit": 10,
                    "remove_without_count": True,
                    "remove_empty_repertoires": True,
                    "result_path": path,
                    "batch_size": 4
                }).process_dataset, dataset, path)

        shutil.rmtree(path)
Example #28
    def generate_repertoire_dataset(repertoire_count: int,
                                    sequence_count_probabilities: dict,
                                    sequence_length_probabilities: dict,
                                    labels: dict,
                                    path: Path) -> RepertoireDataset:
        """
        Creates repertoire_count repertoires where the number of sequences per repertoire is sampled from the probability distribution given
        in sequence_count_probabilities. The length of each sequence is sampled independently from the
        sequence_length_probabilities distribution. The labels are also randomly assigned to repertoires from the distributions given in
        labels. In this case, labels are multi-class, so each repertoire will get one class from each label. This means that negative
        classes for the labels should be included in the specification as well.

        An example of input parameters is given below:
        repertoire_count: 100 # generate 100 repertoires
        sequence_count_probabilities:
            100: 0.5 # half of the generated repertoires will have 100 sequences
            200: 0.5 # the other half of the generated repertoires will have 200 sequences
        sequence_length_probabilities:
            14: 0.8 # 80% of all generated sequences for all repertoires will have length 14
            15: 0.2 # 20% of all generated sequences across all repertoires will have length 15
        labels:
            cmv: # label name
                True: 0.5 # 50% of the repertoires will have class True
                False: 0.5 # 50% of the repertoires will have class False
            coeliac: # next label with classes that will be assigned to repertoires independently of the previous label or any other parameter
                1: 0.3 # 30% of the generated repertoires will have class 1
                0: 0.7 # 70% of the generated repertoires will have class 0
        """
        RandomDatasetGenerator._check_rep_dataset_generation_params(
            repertoire_count, sequence_count_probabilities,
            sequence_length_probabilities, labels, path)

        alphabet = EnvironmentSettings.get_sequence_alphabet()
        PathBuilder.build(path)

        sequences = [[
            "".join(
                random.choices(alphabet,
                               k=random.choices(
                                   list(sequence_length_probabilities.keys()),
                                   sequence_length_probabilities.values())[0]))
            for seq_count in range(
                random.choices(list(sequence_count_probabilities.keys()),
                               sequence_count_probabilities.values())[0])
        ] for rep in range(repertoire_count)]

        if labels is not None:
            processed_labels = {
                label: random.choices(list(labels[label].keys()),
                                      labels[label].values(),
                                      k=repertoire_count)
                for label in labels
            }
            dataset_params = {
                label: list(labels[label].keys())
                for label in labels
            }
        else:
            processed_labels = None
            dataset_params = None

        repertoires, metadata = RepertoireBuilder.build(
            sequences=sequences, path=path, labels=processed_labels)
        dataset = RepertoireDataset(labels=dataset_params,
                                    repertoires=repertoires,
                                    metadata_file=metadata)

        return dataset
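
A minimal usage sketch for the generator above, mirroring the docstring example. The output
directory is a placeholder, and calling the method via RandomDatasetGenerator is an assumption
based on the class referenced in the function body:

    from pathlib import Path

    # distributions copied from the docstring example; the path is hypothetical
    dataset = RandomDatasetGenerator.generate_repertoire_dataset(
        repertoire_count=100,
        sequence_count_probabilities={100: 0.5, 200: 0.5},
        sequence_length_probabilities={14: 0.8, 15: 0.2},
        labels={"cmv": {True: 0.5, False: 0.5}, "coeliac": {1: 0.3, 0: 0.7}},
        path=Path("test/tmp/random_dataset/"))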
Example #29
    def test_parse_yaml_file(self):
        path = EnvironmentSettings.root_path / "test/tmp/parser/"
        dataset = RepertoireDataset(
            repertoires=RepertoireBuilder.build([["AAA", "CCC"], ["TTTT"]],
                                                path, {"default": [1, 2]})[0],
            labels={"default": [1, 2]})
        PickleExporter.export(dataset, path)

        spec = {
            "definitions": {
                "datasets": {
                    "d1": {
                        "format": "Pickle",
                        "params": {
                            "path": str(path / f"{dataset.name}.iml_dataset"),
                        }
                    }
                },
                "encodings": {
                    "a1": {
                        "Word2Vec": {
                            "k": 3,
                            "model_type": "sequence",
                            "vector_size": 8,
                        }
                    },
                    "a2": "Word2Vec"
                },
                "ml_methods": {
                    "simpleLR": {
                        "LogisticRegression": {
                            "penalty": "l1"
                        },
                        "model_selection_cv": False,
                        "model_selection_n_folds": -1,
                    },
                    "simpleLR2": "LogisticRegression"
                },
                "reports": {
                    "rep1": "SequenceLengthDistribution"
                }
            },
            "instructions": {}
        }

        PathBuilder.build(path)

        specs_filename = path / "tmp_yaml_spec.yaml"

        with specs_filename.open("w") as file:
            yaml.dump(spec, file, default_flow_style=False)

        symbol_table, _ = ImmuneMLParser.parse_yaml_file(specs_filename,
                                                         result_path=path)

        self.assertTrue(
            all([
                symbol_table.contains(key)
                for key in ["simpleLR", "rep1", "a1", "d1"]
            ]))
        self.assertTrue(isinstance(symbol_table.get("d1"), RepertoireDataset))

        with self.assertRaises(YAMLError):
            with specs_filename.open("r") as file:
                specs_text = file.readlines()
            specs_text[0] = "        definitions:"
            with specs_filename.open("w") as file:
                file.writelines(specs_text)

            ImmuneMLParser.parse_yaml_file(specs_filename, result_path=path)

        shutil.rmtree(path)
Example #30
    def test_encoding(self):

        path = EnvironmentSettings.tmp_test_path / "integration_test_emerson_encoding/"
        PathBuilder.build(path)

        ref_path = path / "reference.csv"
        pd.DataFrame({
            "sequence_aas": ["GGG", "III", "TTT", "EFEF"],
            "v_alleles": ["TRBV6-1*01"] * 4,
            "j_alleles": ["TRBJ2-7"] * 4
        }).to_csv(ref_path, index=False)

        repertoires, metadata = RepertoireBuilder.build(
            [["GGG", "III", "LLL", "MMM"],
             ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
             ["CCC", "FFF", "MMM"],
             ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"]] * 4,
            labels={"l1": [True, True, False, False] * 4},
            path=path)

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    labels={"l1": [True, False]})
        ImmuneMLExporter.export(dataset, path)

        specs = {
            "definitions": {
                "datasets": {
                    "d1": {
                        "format": "ImmuneML",
                        "params": {
                            "path": str(path / f"{dataset.name}.iml_dataset"),
                        }
                    }
                },
                "encodings": {
                    "e1": {
                        "SequenceAbundance": {
                            'comparison_attributes':
                            ["sequence_aas", "v_alleles", "j_alleles"]
                        }
                    }
                },
                "ml_methods": {
                    "knn": {
                        "KNN": {
                            "n_neighbors": 1
                        },
                    }
                },
                "reports": {
                    "r1": {
                        "ReferenceSequenceOverlap": {
                            "reference_path":
                            str(ref_path),
                            'comparison_attributes':
                            ["sequence_aas", "v_alleles", "j_alleles"]
                        }
                    }
                }
            },
            "instructions": {
                "inst1": {
                    "type": "TrainMLModel",
                    "settings": [{
                        "encoding": "e1",
                        "ml_method": "knn"
                    }],
                    "assessment": {
                        "split_strategy": "random",
                        "split_count": 1,
                        "training_percentage": 0.7,
                        "reports": {}
                    },
                    "selection": {
                        "split_strategy": "random",
                        "split_count": 1,
                        "training_percentage": 0.7,
                    },
                    "labels": [{
                        "l1": {
                            "positive_class": True
                        }
                    }],
                    "dataset": "d1",
                    "strategy": "GridSearch",
                    "metrics": ["accuracy"],
                    "number_of_processes": 2,
                    "reports": ["r1"],
                    "optimization_metric": "balanced_accuracy",
                    "refit_optimal_model": True,
                }
            }
        }

        specs_file = path / "specs.yaml"
        with open(specs_file, "w") as file:
            yaml.dump(specs, file)

        app = ImmuneMLApp(specs_file, path / "result")
        app.run()

        shutil.rmtree(path)