def create_dataset(self):
    path = Path(os.path.relpath(EnvironmentSettings.root_path / "test/tmp/immunemlapp/initial_dataset"))
    PathBuilder.build(path)

    repertoire_count = 30
    repertoires, metadata = RepertoireBuilder.build(
        [["AA", "AAAA", "AAAA", "AAA"] for i in range(repertoire_count)], path,
        {"CD": ['yes' if i % 2 == 0 else 'no' for i in range(repertoire_count)],
         "CMV": [True if i % 2 == 1 else False for i in range(repertoire_count)]},
        [[{"chain": "A" if i % 2 == 0 else "B", "count": random.randint(2, 5)} for i in range(4)]
         for j in range(repertoire_count)])

    # the CD label classes match the 'yes'/'no' values assigned in the metadata above
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                labels={"CD": ["yes", "no"], "CMV": [True, False]}, name="d1")
    PickleExporter.export(dataset, path)

    return path / "d1.iml_dataset"
def make_random_dataset(self, path):
    # rn is assumed to be an alias for the standard random module
    alphabet = EnvironmentSettings.get_sequence_alphabet()
    sequences = [["".join([rn.choice(alphabet) for i in range(20)]) for i in range(100)] for i in range(40)]

    repertoires, metadata = RepertoireBuilder.build(sequences, path, subject_ids=[i % 2 for i in range(len(sequences))])
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
    PickleExporter.export(dataset, path)
def create_dummy_dataset(self, path):
    repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path,
                                                    labels={"label1": ["val1", "val2"], "label2": ["val1", "val2"]})
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
    dataset.name = "my_dataset"
    PickleExporter.export(dataset, path)

    return f"{dataset.name}.iml_dataset"
def test_build(self):
    path = EnvironmentSettings.root_path / "test/tmp/repbuilder/"
    repertoires, metadata = RepertoireBuilder.build([["AAA", "CCC"], ["TTTT"]], path, {"default": [1, 2]})

    self.assertEqual(2, len(repertoires))
    self.assertEqual((2, 4), pd.read_csv(metadata).shape)

    self.assertEqual(2, len(repertoires[0].sequences))
    self.assertTrue(all([isinstance(seq, ReceptorSequence) for seq in repertoires[0].sequences]))
    self.assertEqual(1, repertoires[0].metadata["default"])

    self.assertEqual(1, len(repertoires[1].sequences))
    self.assertTrue(all([isinstance(seq, ReceptorSequence) for seq in repertoires[1].sequences]))
    self.assertEqual(2, repertoires[1].metadata["default"])
    self.assertEqual("rep_1", repertoires[1].metadata["subject_id"])

    # Testing with custom metadata
    repertoires, metadata = RepertoireBuilder.build([["AAA", "CCC"]], path,
                                                    seq_metadata=[[{"v_gene": "v5", "j_gene": "j5"},
                                                                   {"v_gene": "v2", "j_gene": "j2"}]])

    self.assertEqual(repertoires[0].sequences[0].metadata.v_gene, "v5")
    self.assertEqual(repertoires[0].sequences[0].metadata.j_gene, "j5")
    self.assertEqual(repertoires[0].sequences[1].metadata.v_gene, "v2")
    self.assertEqual(repertoires[0].sequences[1].metadata.j_gene, "j2")

    shutil.rmtree(path)
def test_run(self):
    path = EnvironmentSettings.root_path / "test/tmp/mlmethodassessment/"
    PathBuilder.build(path)

    dataset = RepertoireDataset(repertoires=RepertoireBuilder.build([["AA"], ["CC"]] * 6, path)[0])
    dataset.encoded_data = EncodedData(examples=np.array([[1, 1], [1, 1], [3, 3]] * 4),
                                       labels={"l1": [1, 1, 3] * 4, "l2": [1, 2, 3] * 4})

    label_config = LabelConfiguration()
    label_config.add_label("l1", [1, 3])

    # l1 only takes the values 1 and 3 in the encoded data above
    label = Label(name='l1', values=[1, 3])

    method1 = LogisticRegression()
    method1.fit(dataset.encoded_data, label=label)

    res = MLMethodAssessment.run(MLMethodAssessmentParams(
        dataset=dataset,
        method=method1,
        metrics={Metric.ACCURACY, Metric.BALANCED_ACCURACY, Metric.F1_MACRO},
        optimization_metric=Metric.LOG_LOSS,
        predictions_path=path / "predictions.csv",
        label=label,
        ml_score_path=path / "ml_score.csv",
        split_index=1,
        path=path))

    self.assertTrue(isinstance(res, dict))
    self.assertTrue(res[Metric.LOG_LOSS.name.lower()] <= 0.1)

    self.assertTrue(os.path.isfile(path / "ml_score.csv"))
    df = pd.read_csv(path / "ml_score.csv")
    self.assertTrue(df.shape[0] == 1)

    df = pd.read_csv(path / "predictions.csv")
    self.assertEqual(12, df.shape[0])

    shutil.rmtree(path)
def test_run(self):
    path = EnvironmentSettings.root_path / "test/tmp/smmodel/"
    PathBuilder.build(path)

    repertoires, metadata = RepertoireBuilder.build([["AAA", "CCC"], ["TTTT"]] * 16, path,
                                                    {"default": [1, 2] * 16})
    dataset = RepertoireDataset(repertoires=repertoires, labels={"default": [1, 2]}, metadata_file=metadata)

    label_config = LabelConfiguration()
    label_config.add_label("default", [1, 2])

    encoder_params = {"vector_size": 8, "model_type": ModelType.SEQUENCE.name, "k": 3}
    hp_settings = [HPSetting(Word2VecEncoder.build_object(dataset, **encoder_params), encoder_params,
                             LogisticRegression(),
                             {"model_selection_cv": False, "model_selection_n_folds": -1}, [])]

    split_config_assessment = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())
    split_config_selection = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())

    instruction = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                          split_config_assessment, split_config_selection,
                                          {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                          label_config, path)

    semantic_model = SemanticModel([instruction], path)
    semantic_model.run()

    shutil.rmtree(path)
def create_dataset(self, path: str) -> RepertoireDataset:
    repertoires, metadata = RepertoireBuilder.build([["A", "B"], ["B", "C"], ["D"], ["E", "F"],
                                                     ["A", "B"], ["B", "C"], ["D"], ["E", "F"]], path,
                                                    {"l1": [1, 0, 1, 0, 1, 0, 1, 0],
                                                     "l2": [2, 3, 2, 3, 2, 3, 3, 3]})
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
    return dataset
def test_get_metadata_fields(self):
    path = EnvironmentSettings.tmp_test_path / "repertoire_dataset/"
    PathBuilder.build(path)

    repertoires, metadata = RepertoireBuilder.build([["AA"], ["BB"]], path,
                                                    {"l1": [1, 2], "hla": ["A", "B"]},
                                                    subject_ids=["d1", "d2"])
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)

    self.assertTrue("l1" in dataset.get_metadata_fields())
    self.assertTrue("hla" in dataset.get_metadata_fields())
    self.assertTrue("subject_id" in dataset.get_metadata_fields())

    shutil.rmtree(path)
def _build_test_dataset(self, path):
    repertoires, metadata = RepertoireBuilder.build([["GGG", "III", "LLL", "MMM"],
                                                     ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
                                                     ["CCC", "FFF", "MMM"],
                                                     ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"]],
                                                    labels={"l1": [True, True, False, False]}, path=path)

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, identifier="1")

    return dataset
def create_datasets(self, path: Path):
    repertoires, metadata = RepertoireBuilder.build([["A", "B"], ["B", "C"], ["D"], ["E", "F"]], path,
                                                    {"l1": [1, 0, 1, 0], "l2": [2, 3, 2, 3]})
    main_dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
    sub_dataset = main_dataset.make_subset([0, 1], path=path, dataset_type="subset")

    return main_dataset, sub_dataset
def create_dummy_data(self, path):
    # Setting up dummy data
    labels = {"subject_id": ["subject_1", "subject_2", "subject_3"],
              "label": ["yes", "yes", "no"]}

    # base_metadata is shared by all sequences (renamed so it does not shadow the
    # metadata file returned by RepertoireBuilder.build below)
    base_metadata = {"v_gene": "TRBV1", "j_gene": "TRBJ1", "chain": Chain.BETA.value}

    repertoires, metadata = RepertoireBuilder.build(sequences=[["AAAA"], ["SSSS"], ["SSSS", "CCCC"]],
                                                    path=path, labels=labels,
                                                    seq_metadata=[[{**base_metadata, "count": 10}],
                                                                  [{**base_metadata, "count": 10}],
                                                                  [{**base_metadata, "count": 5},
                                                                   {**base_metadata, "count": 5}]],
                                                    subject_ids=labels["subject_id"])

    dataset = RepertoireDataset(repertoires=repertoires)

    label_config = LabelConfiguration()
    label_config.add_label("subject_id", labels["subject_id"])
    label_config.add_label("label", labels["label"])

    file_content = """complex.id Gene CDR3 V J Species MHC A MHC B MHC class Epitope Epitope gene Epitope species Reference Method Meta CDR3fix Score
100 TRB AAAA TRBV1 TRBJ1 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"} 0
200 TRB SSSS TRBV1 TRBJ1 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"} 0"""

    with open(path / "refs.tsv", "w") as file:
        file.writelines(file_content)

    reference_sequences = {"params": {"path": path / "refs.tsv", "region_type": "FULL_SEQUENCE"},
                           "format": "VDJdb"}

    return dataset, label_config, reference_sequences, labels
def create_dataset(self, path):
    repertoires, metadata = RepertoireBuilder.build([["AAA"], ["AAAC"], ["ACA"], ["CAAA"], ["AAAC"], ["AAA"]],
                                                    path,
                                                    {"l1": [1, 1, 1, 0, 0, 0], "l2": [2, 3, 2, 3, 2, 3]})
    dataset = RepertoireDataset(repertoires=repertoires,
                                labels={"l1": [0, 1], "l2": [2, 3]},
                                metadata_file=metadata)
    return dataset
def test_export(self):
    path = EnvironmentSettings.tmp_test_path / "imlexporter/"
    PathBuilder.build(path)

    repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path)
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)

    ImmuneMLExporter.export(dataset, path)

    with open(path / f"{dataset.name}.iml_dataset", "r") as file:
        dataset2 = yaml.safe_load(file)

    shutil.rmtree(path)

    self.assertTrue(isinstance(dataset2, dict))
    self.assertEqual('RepertoireDataset', dataset2['dataset_class'])
    self.assertEqual(dataset.identifier, dataset2['identifier'])
def test_process(self):
    path = EnvironmentSettings.root_path / "test/tmp/clones_per_repertoire_filter/"
    PathBuilder.build(path)

    dataset = RepertoireDataset(repertoires=RepertoireBuilder.build([["ACF", "ACF", "ACF"],
                                                                     ["ACF", "ACF"],
                                                                     ["ACF", "ACF", "ACF", "ACF"]], path)[0])

    dataset1 = ClonesPerRepertoireFilter(**{"lower_limit": 3, "result_path": path}).process_dataset(dataset, path)
    self.assertEqual(2, dataset1.get_example_count())

    dataset2 = ClonesPerRepertoireFilter(**{"upper_limit": 2, "result_path": path}).process_dataset(dataset, path)
    self.assertEqual(1, dataset2.get_example_count())

    self.assertRaises(AssertionError,
                      ClonesPerRepertoireFilter(**{"lower_limit": 10, "result_path": path}).process_dataset,
                      dataset, path)

    shutil.rmtree(path)
def test_import(self):
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "iml_import/")
    repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path)
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)

    with open(path / "dataset.iml_dataset", "w") as file:
        dataset_dict = {key: item if not isinstance(item, Path) else str(item)
                        for key, item in vars(dataset).items() if key not in ['repertoires', 'encoded_data']}
        yaml.dump({**dataset_dict, **{"dataset_class": "RepertoireDataset"}}, file)

    dataset2 = ImmuneMLImport.import_dataset({"path": path / "dataset.iml_dataset"}, "dataset_name")

    shutil.rmtree(path)

    self.assertEqual(2, len(dataset2.get_data()))
    self.assertEqual("rep_1", dataset2.get_data()[1].metadata["subject_id"])
def create_dummy_data(self, path):
    # Setting up dummy data
    labels = {"subject_id": ["subject_1", "subject_1", "subject_2", "subject_2", "subject_3"],
              "label": ["yes", "yes", "no", "no", "no"]}

    metadata_alpha = {"v_gene": "V1", "j_gene": "J1", "chain": Chain.ALPHA.value}
    metadata_beta = {"v_gene": "V1", "j_gene": "J1", "chain": Chain.BETA.value}

    repertoires, metadata = RepertoireBuilder.build(
        sequences=[["AAAA"], ["SSSS"], ["AAAA", "CCCC"], ["SSSS", "TTTT"], ["AAAA", "CCCC", "SSSS", "TTTT"]],
        path=path, labels=labels,
        seq_metadata=[[{**metadata_alpha, "count": 10}],
                      [{**metadata_beta, "count": 10}],
                      [{**metadata_alpha, "count": 5}, {**metadata_alpha, "count": 5}],
                      [{**metadata_beta, "count": 5}, {**metadata_beta, "count": 5}],
                      [{**metadata_alpha, "count": 1}, {**metadata_alpha, "count": 2},
                       {**metadata_beta, "count": 1}, {**metadata_beta, "count": 2}]],
        subject_ids=labels["subject_id"])

    dataset = RepertoireDataset(repertoires=repertoires)

    label_config = LabelConfiguration()
    label_config.add_label("subject_id", labels["subject_id"])
    label_config.add_label("label", labels["label"])

    # clonotype 100 with TRA=AAAA, TRB=SSSS; clonotype 200 with TRA=CCCC, TRB=TTTT
    file_content = """Cell type Clonotype ID Chain: TRA (1) TRA - V gene (1) TRA - D gene (1) TRA - J gene (1) Chain: TRA (2) TRA - V gene (2) TRA - D gene (2) TRA - J gene (2) Chain: TRB (1) TRB - V gene (1) TRB - D gene (1) TRB - J gene (1) Chain: TRB (2) TRB - V gene (2) TRB - D gene (2) TRB - J gene (2) Cells pr. clonotype Clonotype (Id) Clonotype (Name)
TCR_AB 100 AAAA TRAV1 TRAJ1 null null null null SSSS TRBV1 TRBJ1 null null null null 1 1941533 3ca0cd7f-02fd-40bb-b295-7cd5d419e474(101, 102, 103, 104, 105, 108, 109, 127, 128, 130, 131, 132, 133, 134, 174)Size:1
TCR_AB 200 CCCC TRAV1 TRAJ1 null null null null TTTT TRBV1 TRBJ1 null null null null 1 1941532 1df22bbc-8113-46b9-8913-da95fcf9a568(101, 102, 103, 104, 105, 108, 109, 127, 128, 130, 131, 132, 133, 134, 174)Size:1
"""

    with open(path / "refs.tsv", "w") as file:
        file.writelines(file_content)

    reference_receptors = {"params": {"path": path / "refs.tsv"}, "format": "IRIS"}

    return dataset, label_config, reference_receptors, labels
def test_encode(self):
    path = EnvironmentSettings.tmp_test_path / "abundance_encoder/"
    PathBuilder.build(path)

    repertoires, metadata = RepertoireBuilder.build([["GGG", "III", "LLL", "MMM"],
                                                     ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
                                                     ["CCC", "FFF", "MMM"],
                                                     ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"]],
                                                    labels={"l1": [True, True, False, False]}, path=path)
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, identifier="1")

    encoder = SequenceAbundanceEncoder.build_object(dataset, **{"comparison_attributes": ["sequence_aas"],
                                                                "p_value_threshold": 0.4,
                                                                "sequence_batch_size": 4,
                                                                "repertoire_batch_size": 8})

    label_config = LabelConfiguration([Label("l1", [True, False], positive_class=True)])

    encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config))

    self.assertTrue(np.array_equal(np.array([[1, 4], [1, 6], [0, 3], [0, 6]]),
                                   encoded_dataset.encoded_data.examples))

    encoder.p_value_threshold = 0.05
    encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config))

    self.assertTrue(np.array_equal(np.array([[0, 4], [0, 6], [0, 3], [0, 6]]),
                                   encoded_dataset.encoded_data.examples))

    shutil.rmtree(path)
def _create_state_object(self, path):
    # 34 identical repertoires with repeating label patterns
    repertoires, metadata = RepertoireBuilder.build(sequences=[["AAA", "CCC", "DDD"] for _ in range(34)],
                                                    path=path,
                                                    labels={"l1": [1, 2] * 17,
                                                            "l2": [0, 0, 1, 1] * 4 + [0, 1] * 9})
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                labels={"l1": [1, 2], "l2": [0, 1]})

    enc_params = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
    hp_settings = [HPSetting(Word2VecEncoder.build_object(dataset, **enc_params), enc_params,
                             LogisticRegression(),
                             {"model_selection_cv": False, "model_selection_n_folds": -1}, [])]

    label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])

    process = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                      SplitConfig(SplitType.RANDOM, 1, 0.7),
                                      SplitConfig(SplitType.RANDOM, 1, 0.7),
                                      {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                      label_config, path)
    state = process.run(result_path=path)

    return state
def test_load(self):
    path = EnvironmentSettings.root_path / "test/tmp/pathbuilder/"
    PathBuilder.build(path)

    repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path)
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)

    with open(path / "dataset.pkl", "wb") as file:
        pickle.dump(dataset, file)

    dataset2 = PickleImport.import_dataset({"path": path / "dataset.pkl"}, "dataset_name")

    shutil.rmtree(path)

    self.assertEqual(2, len(dataset2.get_data()))
    self.assertEqual("rep_1", dataset2.get_data()[1].metadata["subject_id"])
def test_process(self):
    path = EnvironmentSettings.root_path / "test/tmp/metadata_filter/"
    PathBuilder.build(path)

    dataset = RepertoireDataset(repertoires=RepertoireBuilder.build([["ACF", "ACF", "ACF"],
                                                                     ["ACF", "ACF"],
                                                                     ["ACF", "ACF", "ACF", "ACF"]], path)[0])

    df = pd.DataFrame(data={"key1": [0, 1, 2], "key2": [0, 1, 2]})
    df.to_csv(path / "metadata.csv")

    dataset.metadata_file = path / "metadata.csv"

    dataset1 = MetadataRepertoireFilter(**{"criteria": {"type": OperationType.GREATER_THAN.name,
                                                        "value": {"type": DataType.COLUMN.name, "name": "key2"},
                                                        "threshold": 1},
                                           "result_path": path}).process_dataset(dataset, path)

    self.assertEqual(1, dataset1.get_example_count())

    self.assertRaises(AssertionError,
                      MetadataRepertoireFilter(**{"criteria": {"type": OperationType.GREATER_THAN.name,
                                                               "value": {"type": DataType.COLUMN.name,
                                                                         "name": "key2"},
                                                               "threshold": 10}}).process_dataset,
                      dataset, path)

    shutil.rmtree(path)
def create_dummy_data(self, path):
    # Setting up dummy data: these are IG receptors, so the shared sequence metadata
    # is named after the light and heavy chains
    labels = {"subject_id": ["subject_1", "subject_2", "subject_3"],
              "label": ["yes", "no", "no"]}

    metadata_light = {"v_gene": "V1", "j_gene": "J1", "chain": Chain.LIGHT.value}
    metadata_heavy = {"v_gene": "V1", "j_gene": "J1", "chain": Chain.HEAVY.value}

    repertoires, metadata = RepertoireBuilder.build(
        sequences=[["XXAGQXGSSNTGKLIXX", "XXAGQXGSSNTGKLIYY", "XXSAGQGETQYXX"],
                   ["ASSXRXX"],
                   ["XXIXXNDYKLSXX", "CCCC", "SSSS", "TTTT"]],
        path=path, labels=labels,
        seq_metadata=[[{**metadata_light, "count": 10, "v_gene": "IGLV35"},
                       {**metadata_light, "count": 10},
                       {**metadata_heavy, "count": 10, "v_gene": "IGHV29-1"}],
                      [{**metadata_heavy, "count": 10, "v_gene": "IGHV7-3"}],
                      [{**metadata_light, "count": 5, "v_gene": "IGLV26-2"},
                       {**metadata_light, "count": 2},
                       {**metadata_heavy, "count": 1},
                       {**metadata_heavy, "count": 2}]],
        subject_ids=labels["subject_id"])

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)

    label_config = LabelConfiguration()
    label_config.add_label("subject_id", labels["subject_id"])
    label_config.add_label("label", labels["label"])

    file_content = """id IGLV IGHV IGL_regex IGH_regex
1 IGLV35 IGHV29-1 AGQ.GSSNTGKLI S[APGFTVML]GQGETQY
2 IGHV7-3 ASS.R.*
3 IGLV26-1 I..NDYKLS
4 IGLV26-2 I..NDYKLS
"""

    filepath = path / "reference_motifs.tsv"
    with open(filepath, "w") as file:
        file.writelines(file_content)

    return dataset, label_config, filepath, labels
def test_export(self):
    path = EnvironmentSettings.tmp_test_path / "pickleexporter/"
    PathBuilder.build(path)

    repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path)
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
    PickleExporter.export(dataset, path)

    with open(path / f"{dataset.name}.iml_dataset", "rb") as file:
        dataset2 = pickle.load(file)

    shutil.rmtree(path)

    self.assertTrue(isinstance(dataset2, RepertoireDataset))
    self.assertEqual(2, len(dataset2.get_data()))
    self.assertEqual("rep_0", dataset2.get_data()[0].metadata["subject_id"])
def test_encode(self):
    path = EnvironmentSettings.tmp_test_path / "count_encoder/"
    PathBuilder.build(path)

    repertoires, metadata = RepertoireBuilder.build([["GGG", "III", "LLL", "MMM"],
                                                     ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
                                                     ["CCC", "FFF", "MMM"],
                                                     ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"]],
                                                    labels={"l1": [True, True, False, False]}, path=path)
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, identifier="1")

    encoder = SequenceCountEncoder.build_object(dataset, **{"comparison_attributes": ["sequence_aas"],
                                                            "p_value_threshold": 0.4,
                                                            "sequence_batch_size": 4})

    label_config = LabelConfiguration([Label("l1", [True, False], positive_class=True)])

    encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config))

    test = encoded_dataset.encoded_data.examples

    self.assertTrue(test[0] == 1)
    self.assertTrue(test[1] == 1)
    self.assertTrue(test[2] == 0)
    self.assertTrue(test[3] == 0)
    self.assertTrue("III" in encoded_dataset.encoded_data.feature_names)

    shutil.rmtree(path)
def prepare_dataset(self, path):
    PathBuilder.build(path)

    # 34 identical repertoires with repeating label patterns
    repertoires, metadata = RepertoireBuilder.build(sequences=[["AAA", "CCC", "DDD"] for _ in range(34)],
                                                    path=path,
                                                    labels={"l1": [1, 2] * 17,
                                                            "l2": [0, 0, 1, 1] * 4 + [0, 1] * 9})

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                labels={"l1": [1, 2], "l2": [0, 1]}, name="dataset1")
    PickleExporter.export(dataset, path)
def test_generate(self):
    path = EnvironmentSettings.tmp_test_path / "disease_assoc_seq_cv/"
    PathBuilder.build(path)

    repertoires, metadata = RepertoireBuilder.build([["GGG", "III", "LLL", "MMM"],
                                                     ["DDD", "EEE", "FFF"]] * 7,
                                                    labels={"l1": [True, False] * 7}, path=path)
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                labels={"l1": [True, False]})
    PickleExporter.export(dataset, path)

    specs = {
        "definitions": {
            "datasets": {"d1": {"format": "Pickle",
                                "params": {"path": str(path / f"{dataset.name}.iml_dataset")}}},
            "encodings": {"e1": {"SequenceAbundance": {'p_value_threshold': 0.5}}},
            "ml_methods": {"knn": {"KNN": {"n_neighbors": 1}}},
            "reports": {"r1": {"DiseaseAssociatedSequenceCVOverlap": {"compare_in_selection": True,
                                                                      "compare_in_assessment": True}}}
        },
        "instructions": {
            "inst1": {
                "type": "TrainMLModel",
                "settings": [{"encoding": "e1", "ml_method": "knn"}],
                "assessment": {"split_strategy": "random", "split_count": 1,
                               "training_percentage": 0.5, "reports": {}},
                "selection": {"split_strategy": "random", "split_count": 1,
                              "training_percentage": 0.5},
                "labels": [{"l1": {"positive_class": True}}],
                "dataset": "d1",
                "strategy": "GridSearch",
                "metrics": ["accuracy"],
                "number_of_processes": 2,
                "reports": ["r1"],
                "optimization_metric": "balanced_accuracy",
                "refit_optimal_model": True,
                "store_encoded_data": False
            }
        }
    }

    specs_file = path / "specs.yaml"
    with open(specs_file, "w") as file:
        yaml.dump(specs, file)

    app = ImmuneMLApp(specs_file, path / "result/")
    state = app.run()[0]

    self.assertEqual(1, len(state.report_results))
    self.assertTrue(len(state.report_results[0].output_figures) > 0)
    self.assertTrue(len(state.report_results[0].output_tables) > 0)

    for fig in state.report_results[0].output_figures:
        self.assertTrue(os.path.isfile(fig.path))
    for table in state.report_results[0].output_tables:
        self.assertTrue(os.path.isfile(table.path))

    shutil.rmtree(path)
def test_run(self):
    path = EnvironmentSettings.tmp_test_path / "hpoptimproc/"
    PathBuilder.build(path)

    # 34 identical repertoires with repeating label patterns
    repertoires, metadata = RepertoireBuilder.build(sequences=[["AAA", "CCC", "DDD"] for _ in range(34)],
                                                    path=path,
                                                    labels={"l1": [1, 2] * 17,
                                                            "l2": [0, 0, 1, 1] * 4 + [0, 1] * 9})
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                labels={"l1": [1, 2], "l2": [0, 1]})

    enc1 = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
    enc2 = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 6}

    hp_settings = [HPSetting(Word2VecEncoder.build_object(dataset, **enc1), enc1,
                             LogisticRegression(),
                             {"model_selection_cv": False, "model_selection_n_folds": -1}, []),
                   HPSetting(Word2VecEncoder.build_object(dataset, **enc2), enc2,
                             SVM(),
                             {"model_selection_cv": False, "model_selection_n_folds": -1},
                             [ClonesPerRepertoireFilter(lower_limit=-1, upper_limit=1000)])]

    report = SequenceLengthDistribution()
    label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])

    process = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                      SplitConfig(SplitType.RANDOM, 1, 0.5,
                                                  reports=ReportConfig(data_splits={"seqlen": report})),
                                      SplitConfig(SplitType.RANDOM, 1, 0.5,
                                                  reports=ReportConfig(data_splits={"seqlen": report})),
                                      {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                      label_config, path)
    state = process.run(result_path=path)

    self.assertTrue(isinstance(state, TrainMLModelState))
    self.assertEqual(1, len(state.assessment_states))
    self.assertTrue("l1" in state.assessment_states[0].label_states)
    self.assertTrue("l2" in state.assessment_states[0].label_states)

    shutil.rmtree(path)
def test_process(self):
    path = EnvironmentSettings.root_path / "test/tmp/count_per_seq_filter/"
    PathBuilder.build(path)

    dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(
        [["ACF", "ACF", "ACF"], ["ACF", "ACF"], ["ACF", "ACF", "ACF", "ACF"]], path,
        seq_metadata=[[{"count": 1}, {"count": 2}, {"count": 3}],
                      [{"count": 4}, {"count": 1}],
                      [{"count": 5}, {"count": 6}, {"count": None}, {"count": 1}]])[0])

    dataset1 = CountPerSequenceFilter(**{"low_count_limit": 2, "remove_without_count": True,
                                         "remove_empty_repertoires": False, "result_path": path,
                                         "batch_size": 4}).process_dataset(dataset, path)
    self.assertEqual(2, dataset1.repertoires[0].get_sequence_aas().shape[0])

    dataset2 = CountPerSequenceFilter(**{"low_count_limit": 5, "remove_without_count": True,
                                         "remove_empty_repertoires": False, "result_path": path,
                                         "batch_size": 4}).process_dataset(dataset, path)
    self.assertEqual(0, dataset2.repertoires[0].get_sequence_aas().shape[0])

    dataset3 = CountPerSequenceFilter(**{"low_count_limit": 0, "remove_without_count": True,
                                         "remove_empty_repertoires": False, "result_path": path,
                                         "batch_size": 4}).process_dataset(dataset, path)
    self.assertEqual(3, dataset3.repertoires[2].get_sequence_aas().shape[0])

    # when no sequence has a count, remove_without_count empties all repertoires
    dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(
        [["ACF", "ACF", "ACF"], ["ACF", "ACF"], ["ACF", "ACF", "ACF", "ACF"]], path,
        seq_metadata=[[{"count": None}, {"count": None}, {"count": None}],
                      [{"count": None}, {"count": None}],
                      [{"count": None}, {"count": None}, {"count": None}, {"count": None}]])[0])

    dataset4 = CountPerSequenceFilter(**{"low_count_limit": 0, "remove_without_count": True,
                                         "remove_empty_repertoires": False, "result_path": path,
                                         "batch_size": 4}).process_dataset(dataset, path)
    self.assertEqual(0, dataset4.repertoires[0].get_sequence_aas().shape[0])
    self.assertEqual(0, dataset4.repertoires[1].get_sequence_aas().shape[0])
    self.assertEqual(0, dataset4.repertoires[2].get_sequence_aas().shape[0])

    self.assertRaises(AssertionError,
                      CountPerSequenceFilter(**{"low_count_limit": 10, "remove_without_count": True,
                                                "remove_empty_repertoires": True, "result_path": path,
                                                "batch_size": 4}).process_dataset,
                      dataset, path)

    shutil.rmtree(path)
def generate_repertoire_dataset(repertoire_count: int, sequence_count_probabilities: dict,
                                sequence_length_probabilities: dict, labels: dict, path: Path) -> RepertoireDataset:
    """
    Creates repertoire_count repertoires where the number of sequences per repertoire is sampled from the
    probability distribution given in sequence_count_probabilities. The length of each sequence is sampled
    independently from the sequence_length_probabilities distribution. Classes are randomly assigned to
    repertoires from the distributions given under labels. Labels are multi-class, so each repertoire gets
    one class from each label; this means that negative classes for the labels have to be included in the
    specification as well.

    An example of the input parameters is given below:

    repertoire_count: 100 # generate 100 repertoires
    sequence_count_probabilities:
        100: 0.5 # half of the generated repertoires will have 100 sequences
        200: 0.5 # the other half of the generated repertoires will have 200 sequences
    sequence_length_probabilities:
        14: 0.8 # 80% of all generated sequences across all repertoires will have length 14
        15: 0.2 # 20% of all generated sequences across all repertoires will have length 15
    labels:
        cmv: # label name
            True: 0.5 # 50% of the repertoires will have class True
            False: 0.5 # 50% of the repertoires will have class False
        coeliac: # next label, with classes assigned to repertoires independently of the previous label or any other parameter
            1: 0.3 # 30% of the generated repertoires will have class 1
            0: 0.7 # 70% of the generated repertoires will have class 0
    """
    RandomDatasetGenerator._check_rep_dataset_generation_params(repertoire_count, sequence_count_probabilities,
                                                                sequence_length_probabilities, labels, path)

    alphabet = EnvironmentSettings.get_sequence_alphabet()
    PathBuilder.build(path)

    # for each repertoire, draw a sequence count, then draw each sequence's length and residues
    sequences = [["".join(random.choices(alphabet,
                                         k=random.choices(list(sequence_length_probabilities.keys()),
                                                          sequence_length_probabilities.values())[0]))
                  for seq_count in range(random.choices(list(sequence_count_probabilities.keys()),
                                                        sequence_count_probabilities.values())[0])]
                 for rep in range(repertoire_count)]

    if labels is not None:
        processed_labels = {label: random.choices(list(labels[label].keys()), labels[label].values(),
                                                  k=repertoire_count)
                            for label in labels}
        dataset_params = {label: list(labels[label].keys()) for label in labels}
    else:
        processed_labels = None
        dataset_params = None

    repertoires, metadata = RepertoireBuilder.build(sequences=sequences, path=path, labels=processed_labels)
    dataset = RepertoireDataset(labels=dataset_params, repertoires=repertoires, metadata_file=metadata)

    return dataset
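# A minimal usage sketch for generate_repertoire_dataset, assuming it is exposed as a
# static method on RandomDatasetGenerator (suggested by the internal call to
# RandomDatasetGenerator._check_rep_dataset_generation_params above). The parameter
# values mirror the docstring example; the helper name and its tmp_path argument are
# hypothetical and for illustration only.
def example_generate_random_repertoire_dataset(tmp_path: Path) -> RepertoireDataset:
    return RandomDatasetGenerator.generate_repertoire_dataset(
        repertoire_count=100,                               # generate 100 repertoires
        sequence_count_probabilities={100: 0.5, 200: 0.5},  # 100 or 200 sequences per repertoire, equally likely
        sequence_length_probabilities={14: 0.8, 15: 0.2},   # length drawn independently per sequence
        labels={"cmv": {True: 0.5, False: 0.5},             # classes assigned per repertoire, per label
                "coeliac": {1: 0.3, 0: 0.7}},
        path=tmp_path / "random_dataset")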
def test_parse_yaml_file(self):
    path = EnvironmentSettings.root_path / "test/tmp/parser/"
    dataset = RepertoireDataset(repertoires=RepertoireBuilder.build([["AAA", "CCC"], ["TTTT"]], path,
                                                                    {"default": [1, 2]})[0],
                                labels={"default": [1, 2]})
    PickleExporter.export(dataset, path)

    spec = {
        "definitions": {
            "datasets": {"d1": {"format": "Pickle",
                                "params": {"path": str(path / f"{dataset.name}.iml_dataset")}}},
            "encodings": {"a1": {"Word2Vec": {"k": 3, "model_type": "sequence", "vector_size": 8}},
                          "a2": "Word2Vec"},
            "ml_methods": {"simpleLR": {"LogisticRegression": {"penalty": "l1"},
                                        "model_selection_cv": False,
                                        "model_selection_n_folds": -1},
                           "simpleLR2": "LogisticRegression"},
            "reports": {"rep1": "SequenceLengthDistribution"}
        },
        "instructions": {}
    }

    PathBuilder.build(path)

    specs_filename = path / "tmp_yaml_spec.yaml"
    with specs_filename.open("w") as file:
        yaml.dump(spec, file, default_flow_style=False)

    symbol_table, _ = ImmuneMLParser.parse_yaml_file(specs_filename, result_path=path)

    self.assertTrue(all([symbol_table.contains(key) for key in ["simpleLR", "rep1", "a1", "d1"]]))
    self.assertTrue(isinstance(symbol_table.get("d1"), RepertoireDataset))

    # indenting the top-level key corrupts the specification and should raise a YAML error
    with self.assertRaises(YAMLError):
        with specs_filename.open("r") as file:
            specs_text = file.readlines()
        specs_text[0] = " definitions:"
        with specs_filename.open("w") as file:
            file.writelines(specs_text)
        ImmuneMLParser.parse_yaml_file(specs_filename, result_path=path)

    shutil.rmtree(path)
def test_encoding(self):
    path = EnvironmentSettings.tmp_test_path / "integration_test_emerson_encoding/"
    PathBuilder.build(path)

    ref_path = path / "reference.csv"
    pd.DataFrame({"sequence_aas": ["GGG", "III", "TTT", "EFEF"],
                  "v_alleles": ["TRBV6-1*01", "TRBV6-1*01", "TRBV6-1*01", "TRBV6-1*01"],
                  "j_alleles": ["TRBJ2-7", "TRBJ2-7", "TRBJ2-7", "TRBJ2-7"]}).to_csv(ref_path, index=False)

    # the same four repertoires repeated four times, with matching labels
    repertoires, metadata = RepertoireBuilder.build([["GGG", "III", "LLL", "MMM"],
                                                     ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
                                                     ["CCC", "FFF", "MMM"],
                                                     ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"]] * 4,
                                                    labels={"l1": [True, True, False, False] * 4}, path=path)
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, labels={"l1": [True, False]})
    ImmuneMLExporter.export(dataset, path)

    specs = {
        "definitions": {
            "datasets": {"d1": {"format": "ImmuneML",
                                "params": {"path": str(path / f"{dataset.name}.iml_dataset")}}},
            "encodings": {"e1": {"SequenceAbundance": {
                "comparison_attributes": ["sequence_aas", "v_alleles", "j_alleles"]}}},
            "ml_methods": {"knn": {"KNN": {"n_neighbors": 1}}},
            "reports": {"r1": {"ReferenceSequenceOverlap": {
                "reference_path": str(ref_path),
                "comparison_attributes": ["sequence_aas", "v_alleles", "j_alleles"]}}}
        },
        "instructions": {
            "inst1": {
                "type": "TrainMLModel",
                "settings": [{"encoding": "e1", "ml_method": "knn"}],
                "assessment": {"split_strategy": "random", "split_count": 1,
                               "training_percentage": 0.7, "reports": {}},
                "selection": {"split_strategy": "random", "split_count": 1,
                              "training_percentage": 0.7},
                "labels": [{"l1": {"positive_class": True}}],
                "dataset": "d1",
                "strategy": "GridSearch",
                "metrics": ["accuracy"],
                "number_of_processes": 2,
                "reports": ["r1"],
                "optimization_metric": "balanced_accuracy",
                "refit_optimal_model": True
            }
        }
    }

    specs_file = path / "specs.yaml"
    with open(specs_file, "w") as file:
        yaml.dump(specs, file)

    app = ImmuneMLApp(specs_file, path / "result")
    app.run()

    shutil.rmtree(path)