def test_encode(self):
    """Encode four repertoires with SequenceAbundanceEncoder at two thresholds.

    The encoded examples are compared against a fixed matrix, first with the
    threshold the encoder was built with (0.4) and then after setting
    ``p_value_threshold`` to 0.05 on the same encoder instance.
    """
    path = EnvironmentSettings.tmp_test_path + "abundance_encoder/"
    PathBuilder.build(path)

    sequences = [
        ["GGG", "III", "LLL", "MMM"],
        ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
        ["CCC", "FFF", "MMM"],
        ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"],
    ]
    repertoires, metadata = RepertoireBuilder.build(
        sequences, labels={"l1": [True, True, False, False]}, path=path)
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, identifier="1")

    encoder = SequenceAbundanceEncoder.build_object(
        dataset,
        comparison_attributes=["sequence_aas"],
        p_value_threshold=0.4,
        sequence_batch_size=4,
        repertoire_batch_size=8,
    )
    label_config = LabelConfiguration([Label("l1", [True, False], positive_class=True)])

    encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config))
    expected_at_04 = np.array([[1, 4], [1, 6], [0, 3], [0, 6]])
    self.assertTrue(np.array_equal(expected_at_04, encoded_dataset.encoded_data.examples))

    # Re-encode with a stricter threshold on the same encoder object.
    encoder.p_value_threshold = 0.05
    encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config))
    expected_at_005 = np.array([[0, 4], [0, 6], [0, 3], [0, 6]])
    self.assertTrue(np.array_equal(expected_at_005, encoded_dataset.encoded_data.examples))

    shutil.rmtree(path)
def test_process(self):
    """Check ClonesPerRepertoireFilter with a lower limit, an upper limit and an unmet limit."""
    path = EnvironmentSettings.root_path + "test/tmp/clones_per_repertoire_filter/"
    PathBuilder.build(path)

    # Three repertoires holding 3, 2 and 4 sequences respectively.
    repertoires = RepertoireBuilder.build(
        [["ACF", "ACF", "ACF"], ["ACF", "ACF"], ["ACF", "ACF", "ACF", "ACF"]], path)[0]
    dataset = RepertoireDataset(repertoires=repertoires)

    with_lower_limit = ClonesPerRepertoireFilter.process(
        dataset, {"lower_limit": 3, "result_path": path})
    self.assertEqual(2, with_lower_limit.get_example_count())

    with_upper_limit = ClonesPerRepertoireFilter.process(
        dataset, {"upper_limit": 2, "result_path": path})
    self.assertEqual(1, with_upper_limit.get_example_count())

    # lower_limit=10 exceeds every repertoire built above; the test expects
    # the filter to raise AssertionError in that case.
    with self.assertRaises(AssertionError):
        ClonesPerRepertoireFilter.process(dataset, {"lower_limit": 10, "result_path": path})

    shutil.rmtree(path)
def test_build(self):
    """Check RepertoireBuilder.build output with labels and with per-sequence metadata."""
    path = EnvironmentSettings.root_path + "test/tmp/repbuilder/"

    repertoires, metadata = RepertoireBuilder.build(
        [["AAA", "CCC"], ["TTTT"]], path, {"default": [1, 2]})

    self.assertEqual(2, len(repertoires))
    self.assertEqual((2, 4), pd.read_csv(metadata).shape)

    first, second = repertoires[0], repertoires[1]

    self.assertEqual(2, len(first.sequences))
    self.assertTrue(all(isinstance(seq, ReceptorSequence) for seq in first.sequences))
    self.assertEqual(1, first.metadata["default"])

    self.assertEqual(1, len(second.sequences))
    self.assertTrue(all(isinstance(seq, ReceptorSequence) for seq in second.sequences))
    self.assertEqual(2, second.metadata["default"])
    self.assertEqual("rep_1", second.metadata["subject_id"])

    # Testing with custom metadata
    custom_seq_metadata = [[
        {"v_gene": "v5", "j_gene": "j5"},
        {"v_gene": "v2", "j_gene": "j2"},
    ]]
    repertoires, metadata = RepertoireBuilder.build(
        [["AAA", "CCC"]], path, seq_metadata=custom_seq_metadata)

    for index, (v_gene, j_gene) in enumerate([("v5", "j5"), ("v2", "j2")]):
        self.assertEqual(repertoires[0].sequences[index].metadata.v_gene, v_gene)
        self.assertEqual(repertoires[0].sequences[index].metadata.j_gene, j_gene)

    shutil.rmtree(path)
def create_dataset(self, path: str) -> RepertoireDataset:
    """Build an eight-repertoire dataset with labels ``l1`` and ``l2`` under ``path``."""
    sequences = [
        ["A", "B"], ["B", "C"], ["D"], ["E", "F"],
        ["A", "B"], ["B", "C"], ["D"], ["E", "F"],
    ]
    labels = {
        "l1": [1, 0, 1, 0, 1, 0, 1, 0],
        "l2": [2, 3, 2, 3, 2, 3, 3, 3],
    }
    repertoires, metadata = RepertoireBuilder.build(sequences, path, labels)
    return RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
def test_get_metadata_fields(self):
    """Check that label columns and ``subject_id`` are reported as metadata fields."""
    path = EnvironmentSettings.tmp_test_path + "repertoire_dataset/"
    PathBuilder.build(path)

    labels = {"l1": [1, 2], "hla": ["A", "B"]}
    repertoires, metadata = RepertoireBuilder.build(
        [["AA"], ["BB"]], path, labels, subject_ids=["d1", "d2"])
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)

    for expected_field in ("l1", "hla", "subject_id"):
        self.assertTrue(expected_field in dataset.get_metadata_fields())

    shutil.rmtree(path)
def create_datasets(self, path: str):
    """Return a four-repertoire dataset and a subset of it made from indices 0 and 1."""
    sequences = [["A", "B"], ["B", "C"], ["D"], ["E", "F"]]
    labels = {"l1": [1, 0, 1, 0], "l2": [2, 3, 2, 3]}

    repertoires, metadata = RepertoireBuilder.build(sequences, path, labels)
    main_dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
    sub_dataset = main_dataset.make_subset([0, 1], path=path, dataset_type="subset")

    return main_dataset, sub_dataset
def create_dummy_data(self, path):
    """Create a three-repertoire dataset, its label configuration and a VDJdb reference spec.

    Writes ``refs.tsv`` under ``path`` and returns
    ``(dataset, label_config, reference_sequences, labels)``.
    """
    # Setting up dummy data
    labels = {
        "subject_id": ["subject_1", "subject_2", "subject_3"],
        "label": ["yes", "yes", "no"],
    }

    shared_seq_metadata = {"v_gene": "TRBV1", "j_gene": "TRBJ1", "chain": Chain.BETA.value}
    counts_per_repertoire = ([10], [10], [5, 5])
    seq_metadata = [
        [{**shared_seq_metadata, "count": count} for count in counts]
        for counts in counts_per_repertoire
    ]

    # The metadata file returned by the builder is not passed on to the dataset.
    repertoires, metadata = RepertoireBuilder.build(
        sequences=[["AAAA"], ["SSSS"], ["SSSS", "CCCC"]],
        path=path,
        labels=labels,
        seq_metadata=seq_metadata,
        subject_ids=labels["subject_id"])

    dataset = RepertoireDataset(repertoires=repertoires)

    label_config = LabelConfiguration()
    for label_name in ("subject_id", "label"):
        label_config.add_label(label_name, labels[label_name])

    # NOTE(review): this literal is written to refs.tsv; presumably the fields
    # are expected to be tab-separated - verify the whitespace inside it.
    file_content = """complex.id Gene CDR3 V J Species MHC A MHC B MHC class Epitope Epitope gene Epitope species Reference Method Meta CDR3fix Score 100 TRB AAAA TRBV1 TRBJ1 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"} 0 200 TRB SSSS TRBV1 TRBJ1 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV 
https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"} 0"""

    with open(path + "refs.tsv", "w") as file:
        file.writelines(file_content)

    reference_sequences = {
        "params": {"path": path + "refs.tsv", "region_type": "FULL_SEQUENCE"},
        "format": "VDJdb",
    }

    return dataset, label_config, reference_sequences, labels
def create_dataset(self, path):
    """Build a six-repertoire dataset labelled with ``l1`` and ``l2`` under ``path``."""
    sequences = [["AAA"], ["AAAC"], ["ACA"], ["CAAA"], ["AAAC"], ["AAA"]]
    labels = {
        "l1": [1, 1, 1, 0, 0, 0],
        "l2": [2, 3, 2, 3, 2, 3],
    }
    repertoires, metadata = RepertoireBuilder.build(sequences, path, labels)

    label_values = {"l1": [0, 1], "l2": [2, 3]}
    return RepertoireDataset(repertoires=repertoires, params=label_values, metadata_file=metadata)
def create_dataset(self):
    """Export a 30-repertoire dataset named ``d1`` and return the path of the exported file."""
    path = os.path.relpath(EnvironmentSettings.root_path + "test/tmp/immunemlapp/initial_dataset/") + "/"
    PathBuilder.build(path)

    repertoire_count = 30

    sequences = [["AA", "AAAA", "AAAA", "AAA"] for _ in range(repertoire_count)]
    labels = {
        "CD": ['yes' if index % 2 == 0 else 'no' for index in range(repertoire_count)],
        "CMV": [index % 2 == 1 for index in range(repertoire_count)],
    }
    # Four metadata entries per repertoire; chains alternate and counts are random.
    seq_metadata = [
        [{"chain": "A" if seq_index % 2 == 0 else "B", "count": random.randint(2, 5)}
         for seq_index in range(4)]
        for _ in range(repertoire_count)
    ]

    repertoires, metadata = RepertoireBuilder.build(sequences, path, labels, seq_metadata)

    # NOTE(review): the "CD" labels above are 'yes'/'no' while params lists
    # [True, False] for "CD" - confirm this mismatch is intended.
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                params={"CD": [True, False], "CMV": [True, False]}, name="d1")

    PickleExporter.export(dataset, path)

    return path + "d1.iml_dataset"
def create_dummy_dataset(self, path):
    """Export a two-repertoire dataset named ``my_dataset`` and return its file name."""
    label_values = {
        "label1": ["val1", "val2"],
        "label2": ["val1", "val2"],
    }
    repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path, labels=label_values)

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
    dataset.name = "my_dataset"
    PickleExporter.export(dataset, path)

    # Only the file name is returned, not the full path.
    return f"{dataset.name}.iml_dataset"
def generate_repertoire_dataset(repertoire_count: int, sequence_count_probabilities: dict, sequence_length_probabilities: dict, labels: dict,
                                path: str) -> RepertoireDataset:
    """
    Creates repertoire_count repertoires where the number of sequences per repertoire is sampled from the probability distribution
    given in sequence_count_probabilities. The length of each sequence is sampled independently from the
    sequence_length_probabilities distribution. The labels are also randomly assigned to repertoires from the distribution given in
    labels. In this case, labels are multi-class, so each repertoire will get exactly one class from each label. This means that
    negative classes for the labels should be included as well in the specification.

    All proportions below hold in expectation only, since every value is drawn at random.

    An example of input parameters is given below:

        repertoire_count: 100 # generate 100 repertoires
        sequence_count_probabilities:
            100: 0.5 # about half of the generated repertoires will have 100 sequences
            200: 0.5 # the other half of the generated repertoires will have 200 sequences
        sequence_length_probabilities:
            14: 0.8 # about 80% of all generated sequences for all repertoires will have length 14
            15: 0.2 # about 20% of all generated sequences across all repertoires will have length 15
        labels:
            cmv: # label name
                True: 0.5 # about 50% of the repertoires will have class True
                False: 0.5 # about 50% of the repertoires will have class False
            coeliac: # next label with classes that will be assigned to repertoires independently of the previous label or any other parameter
                1: 0.3 # about 30% of the generated repertoires will have class 1
                0: 0.7 # about 70% of the generated repertoires will have class 0

    Args:
        repertoire_count: number of repertoires to generate
        sequence_count_probabilities: maps a sequence count to the weight with which it is drawn
        sequence_length_probabilities: maps a sequence length to the weight with which it is drawn
        labels: maps a label name to a dict of class -> weight; may be None, in which case no labels are assigned
        path: directory handed to PathBuilder.build and RepertoireBuilder.build

    Returns:
        a RepertoireDataset built from the generated repertoires and the metadata file produced by RepertoireBuilder
    """
    RandomDatasetGenerator._check_rep_dataset_generation_params(repertoire_count, sequence_count_probabilities,
                                                                sequence_length_probabilities, labels, path)

    alphabet = EnvironmentSettings.get_sequence_alphabet()
    PathBuilder.build(path)

    # The populations and weights do not change while sampling, so build them once
    # instead of once per generated sequence.
    count_options = list(sequence_count_probabilities.keys())
    count_weights = list(sequence_count_probabilities.values())
    length_options = list(sequence_length_probabilities.keys())
    length_weights = list(sequence_length_probabilities.values())

    # Draw order per repertoire: the sequence count first, then for each sequence
    # its length followed by its letters.
    sequences = [["".join(random.choices(alphabet, k=random.choices(length_options, length_weights)[0]))
                  for _ in range(random.choices(count_options, count_weights)[0])]
                 for _ in range(repertoire_count)]

    if labels is not None:
        processed_labels = {label: random.choices(list(labels[label].keys()), labels[label].values(), k=repertoire_count)
                            for label in labels}
        dataset_params = {label: list(labels[label].keys()) for label in labels}
    else:
        processed_labels = None
        dataset_params = None

    repertoires, metadata = RepertoireBuilder.build(sequences=sequences, path=path, labels=processed_labels)
    dataset = RepertoireDataset(params=dataset_params, repertoires=repertoires, metadata_file=metadata)

    return dataset
def test_load(self):
    """Pickle a two-repertoire dataset by hand and read it back through PickleImport."""
    path = EnvironmentSettings.root_path + "test/tmp/pathbuilder/"
    PathBuilder.build(path)

    repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path)
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)

    pickle_file = path + "dataset.pkl"
    with open(pickle_file, "wb") as file:
        pickle.dump(dataset, file)

    imported_dataset = PickleImport.import_dataset({"path": pickle_file}, "dataset_name")

    # The directory is removed before the assertions run.
    shutil.rmtree(path)

    self.assertEqual(2, len(imported_dataset.get_data()))
    self.assertEqual("rep_1", imported_dataset.get_data()[1].metadata["subject_id"])
def _create_state_object(self, path):
    """Run a small TrainMLModelInstruction on 34 identical repertoires and return its state."""
    label_values = {
        "l1": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
        "l2": [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
    }
    # 34 repertoires, each a separate list holding the same three sequences.
    sequences = [["AAA", "CCC", "DDD"] for _ in range(34)]

    repertoires, metadata = RepertoireBuilder.build(sequences=sequences, path=path, labels=label_values)
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                params={"l1": [1, 2], "l2": [0, 1]})

    enc_params = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
    ml_params = {"model_selection_cv": False, "model_selection_n_folds": -1}
    hp_settings = [
        HPSetting(Word2VecEncoder.build_object(dataset, **enc_params), enc_params,
                  LogisticRegression(), ml_params, [])
    ]

    label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])

    assessment_split = SplitConfig(SplitType.RANDOM, 1, 0.5)
    selection_split = SplitConfig(SplitType.RANDOM, 1, 0.5)
    process = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                      assessment_split, selection_split,
                                      {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                      label_config, path)

    return process.run(result_path=path)
def test_process(self):
    """Check MetadataRepertoireFilter with a GREATER_THAN criterion on column ``key2``."""
    path = EnvironmentSettings.root_path + "test/tmp/metadata_filter/"
    PathBuilder.build(path)

    repertoires = RepertoireBuilder.build(
        [["ACF", "ACF", "ACF"], ["ACF", "ACF"], ["ACF", "ACF", "ACF", "ACF"]], path)[0]
    dataset = RepertoireDataset(repertoires=repertoires)

    # Replace the dataset metadata with a hand-written table of two columns.
    metadata_path = path + "metadata.csv"
    df = pd.DataFrame(data={"key1": [0, 1, 2], "key2": [0, 1, 2]})
    df.to_csv(metadata_path)
    dataset.metadata_file = metadata_path

    def filter_params(threshold):
        # Same criterion for both calls; only the threshold differs.
        return {
            "criteria": {
                "type": OperationType.GREATER_THAN,
                "value": {"type": DataType.COLUMN, "name": "key2"},
                "threshold": threshold,
            },
            "result_path": path,
        }

    filtered = MetadataRepertoireFilter.process(dataset, filter_params(1))
    self.assertEqual(1, filtered.get_example_count())

    # No key2 value exceeds 10; the test expects AssertionError here.
    with self.assertRaises(AssertionError):
        MetadataRepertoireFilter.process(dataset, filter_params(10))

    shutil.rmtree(path)
def test_export(self):
    """Export a dataset with PickleExporter and unpickle the written file."""
    path = EnvironmentSettings.tmp_test_path + "pickleexporter/"
    PathBuilder.build(path)

    repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path)
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)

    export_dir = EnvironmentSettings.tmp_test_path + "pickleexporter/"
    PickleExporter.export(dataset, export_dir)

    exported_file = EnvironmentSettings.tmp_test_path + f"pickleexporter/{dataset.name}.iml_dataset"
    with open(exported_file, "rb") as file:
        reloaded = pickle.load(file)

    # The directory is removed before the assertions run.
    shutil.rmtree(EnvironmentSettings.tmp_test_path + "pickleexporter/")

    self.assertTrue(isinstance(reloaded, RepertoireDataset))
    self.assertEqual(2, len(reloaded.get_data()))
    self.assertEqual("rep_0", reloaded.get_data()[0].metadata["subject_id"])
def prepare_dataset(self, path):
    """Build a 34-repertoire dataset named ``dataset1`` and export it with PickleExporter."""
    PathBuilder.build(path)

    label_values = {
        "l1": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
        "l2": [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
    }
    # 34 repertoires, each a separate list holding the same three sequences.
    sequences = [["AAA", "CCC", "DDD"] for _ in range(34)]

    repertoires, metadata = RepertoireBuilder.build(sequences=sequences, path=path, labels=label_values)

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                params={"l1": [1, 2], "l2": [0, 1]}, name="dataset1")
    PickleExporter.export(dataset, path)
def test_encode(self):
    """Encode four repertoires with SequenceCountEncoder and check examples and feature names."""
    path = EnvironmentSettings.tmp_test_path + "count_encoder/"
    PathBuilder.build(path)

    sequences = [
        ["GGG", "III", "LLL", "MMM"],
        ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
        ["CCC", "FFF", "MMM"],
        ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"],
    ]
    repertoires, metadata = RepertoireBuilder.build(
        sequences, labels={"l1": [True, True, False, False]}, path=path)
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, identifier="1")

    encoder = SequenceCountEncoder.build_object(
        dataset,
        comparison_attributes=["sequence_aas"],
        p_value_threshold=0.4,
        sequence_batch_size=4,
    )
    label_config = LabelConfiguration([Label("l1", [True, False], positive_class=True)])

    encoded_dataset = encoder.encode(
        dataset, EncoderParams(result_path=path, label_config=label_config))

    examples = encoded_dataset.encoded_data.examples
    for row_index, expected_value in enumerate([1, 1, 0, 0]):
        self.assertTrue(examples[row_index] == expected_value)

    self.assertTrue("III" in encoded_dataset.encoded_data.feature_names)

    shutil.rmtree(path)
def test_run(self):
    """Run a SemanticModel holding one TrainMLModelInstruction on 32 repertoires."""
    path = EnvironmentSettings.root_path + "test/tmp/smmodel/"
    PathBuilder.build(path)

    # 16 repetitions of a two-sequence and a one-sequence repertoire (32 in total).
    sequences = [seqs for _ in range(16) for seqs in (["AAA", "CCC"], ["TTTT"])]
    label_values = {"default": [1, 2] * 16}

    repertoires, metadata = RepertoireBuilder.build(sequences, path, label_values)
    dataset = RepertoireDataset(repertoires=repertoires, params={"default": [1, 2]}, metadata_file=metadata)

    label_config = LabelConfiguration()
    label_config.add_label("default", [1, 2])

    enc_params = {"vector_size": 8, "model_type": ModelType.SEQUENCE.name, "k": 3}
    ml_params = {"model_selection_cv": False, "model_selection_n_folds": -1}
    hp_settings = [
        HPSetting(Word2VecEncoder.build_object(dataset, **enc_params), enc_params,
                  LogisticRegression(), ml_params, [])
    ]

    split_config_assessment = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())
    split_config_selection = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())

    instruction = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                          split_config_assessment, split_config_selection,
                                          {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                          label_config, path)

    SemanticModel([instruction], path).run()

    shutil.rmtree(path)
def test_run(self):
    """Fit LogisticRegression on hand-made encoded data and check MLMethodAssessment output files."""
    path = EnvironmentSettings.root_path + "test/tmp/mlmethodassessment/"
    PathBuilder.build(path)

    repertoires = RepertoireBuilder.build(
        [["AA"], ["CC"], ["AA"], ["CC"],
         ["AA"], ["CC"], ["AA"], ["CC"],
         ["AA"], ["CC"], ["AA"], ["CC"]], path)[0]
    dataset = RepertoireDataset(repertoires=repertoires)

    # Encoded data is assigned directly instead of running an encoder.
    dataset.encoded_data = EncodedData(
        examples=np.array([[1, 1], [1, 1], [3, 3],
                           [1, 1], [1, 1], [3, 3],
                           [1, 1], [1, 1], [3, 3],
                           [1, 1], [1, 1], [3, 3]]),
        labels={"l1": [1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3],
                "l2": [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]}
    )

    label_config = LabelConfiguration()
    label_config.add_label("l1", [1, 3])

    method1 = LogisticRegression()
    method1.fit(dataset.encoded_data, label_name='l1')

    predictions_file = EnvironmentSettings.root_path + "test/tmp/mlmethodassessment/predictions.csv"
    ml_score_file = EnvironmentSettings.root_path + "test/tmp/mlmethodassessment/ml_score.csv"
    result_dir = EnvironmentSettings.root_path + "test/tmp/mlmethodassessment/"

    assessment_params = MLMethodAssessmentParams(
        dataset=dataset,
        method=method1,
        metrics={Metric.ACCURACY, Metric.BALANCED_ACCURACY, Metric.F1_MACRO},
        optimization_metric=Metric.LOG_LOSS,
        predictions_path=predictions_file,
        label="l1",
        ml_score_path=ml_score_file,
        split_index=1,
        path=result_dir
    )
    res = MLMethodAssessment.run(assessment_params)

    self.assertTrue(isinstance(res, dict))
    self.assertTrue(res[Metric.LOG_LOSS.name.lower()] <= 0.1)

    self.assertTrue(os.path.isfile(ml_score_file))

    scores = pd.read_csv(ml_score_file)
    self.assertTrue(scores.shape[0] == 1)

    predictions = pd.read_csv(predictions_file)
    self.assertEqual(12, predictions.shape[0])

    shutil.rmtree(result_dir)
def test_process(self):
    """Check CountPerSequenceFilter for several count limits and for sequences without counts."""
    path = EnvironmentSettings.root_path + "test/tmp/count_per_seq_filter/"
    PathBuilder.build(path)

    def build_dataset(counts_per_repertoire):
        # One "ACF" sequence per count entry, so repertoire sizes follow the counts.
        sequences = [["ACF" for _ in counts] for counts in counts_per_repertoire]
        seq_metadata = [[{"count": count} for count in counts] for counts in counts_per_repertoire]
        return RepertoireDataset(
            repertoires=RepertoireBuilder.build(sequences, path, seq_metadata=seq_metadata)[0])

    def filter_params(low_count_limit, remove_empty_repertoires=False):
        return {
            "low_count_limit": low_count_limit,
            "remove_without_count": True,
            "remove_empty_repertoires": remove_empty_repertoires,
            "result_path": path,
            "batch_size": 4,
        }

    dataset = build_dataset([[1, 2, 3], [4, 1], [5, 6, None, 1]])

    dataset1 = CountPerSequenceFilter.process(dataset, filter_params(2))
    self.assertEqual(2, dataset1.repertoires[0].get_sequence_aas().shape[0])

    dataset2 = CountPerSequenceFilter.process(dataset, filter_params(5))
    self.assertEqual(0, dataset2.repertoires[0].get_sequence_aas().shape[0])

    dataset3 = CountPerSequenceFilter.process(dataset, filter_params(0))
    self.assertEqual(3, dataset3.repertoires[2].get_sequence_aas().shape[0])

    # Same repertoire sizes, but no sequence carries a count.
    dataset = build_dataset([[None, None, None], [None, None], [None, None, None, None]])

    dataset4 = CountPerSequenceFilter.process(dataset, filter_params(0))
    for repertoire_index in range(3):
        self.assertEqual(0, dataset4.repertoires[repertoire_index].get_sequence_aas().shape[0])

    # With remove_empty_repertoires=True the test expects AssertionError.
    with self.assertRaises(AssertionError):
        CountPerSequenceFilter.process(dataset, filter_params(10, remove_empty_repertoires=True))

    shutil.rmtree(path)
def test_generate(self):
    """Run ImmuneMLApp with a DiseaseAssociatedSequenceCVOverlap report and check its outputs exist."""
    path = EnvironmentSettings.tmp_test_path + "disease_assoc_seq_cv/"
    PathBuilder.build(path)

    # Seven repetitions of a four-sequence and a three-sequence repertoire (14 in total).
    sequences = [
        seqs
        for _ in range(7)
        for seqs in (["GGG", "III", "LLL", "MMM"], ["DDD", "EEE", "FFF"])
    ]
    label_values = {"l1": [True, False] * 7}

    repertoires, metadata = RepertoireBuilder.build(sequences, labels=label_values, path=path)
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                params={"l1": [True, False]})
    PickleExporter.export(dataset, path)

    definitions = {
        "datasets": {
            "d1": {
                "format": "Pickle",
                "params": {
                    "path": path + f"{dataset.name}.iml_dataset",
                }
            }
        },
        "encodings": {
            "e1": {
                "SequenceAbundance": {
                    'p_value_threshold': 0.5
                }
            }
        },
        "ml_methods": {
            "knn": {
                "KNN": {
                    "n_neighbors": 1
                },
            }
        },
        "reports": {
            "r1": {
                "DiseaseAssociatedSequenceCVOverlap": {
                    "compare_in_selection": True,
                    "compare_in_assessment": True
                }
            }
        }
    }

    instruction = {
        "type": "TrainMLModel",
        "settings": [{"encoding": "e1", "ml_method": "knn"}],
        "assessment": {
            "split_strategy": "random",
            "split_count": 1,
            "training_percentage": 0.5,
            "reports": {}
        },
        "selection": {
            "split_strategy": "random",
            "split_count": 1,
            "training_percentage": 0.5,
        },
        "labels": [{"l1": {"positive_class": True}}],
        "dataset": "d1",
        "strategy": "GridSearch",
        "metrics": ["accuracy"],
        "number_of_processes": 2,
        "reports": ["r1"],
        "optimization_metric": "balanced_accuracy",
        "refit_optimal_model": True,
        "store_encoded_data": False
    }

    specs = {"definitions": definitions, "instructions": {"inst1": instruction}}

    specs_file = path + "specs.yaml"
    with open(specs_file, "w") as file:
        yaml.dump(specs, file)

    app = ImmuneMLApp(specs_file, path + "result/")
    state = app.run()[0]

    self.assertEqual(1, len(state.report_results))
    report_result = state.report_results[0]
    self.assertTrue(len(report_result.output_figures) > 0)
    self.assertTrue(len(report_result.output_tables) > 0)

    for figure in report_result.output_figures:
        self.assertTrue(os.path.isfile(figure.path))
    for table in report_result.output_tables:
        self.assertTrue(os.path.isfile(table.path))

    shutil.rmtree(path)
def create_encoded_matchedregex(self, path):
    """Build a three-repertoire TRA/TRB dataset, write a motif file and return the MatchedRegexEncoder result."""
    # Setting up dummy data
    labels = {
        "subject_id": ["subject_1", "subject_2", "subject_3"],
        "label": ["yes", "no", "no"],
    }

    metadata_alpha = {"v_gene": "V1", "j_gene": "J1", "chain": Chain.ALPHA.value}
    metadata_beta = {"v_gene": "V1", "j_gene": "J1", "chain": Chain.BETA.value}

    def seq_meta(base, count, **overrides):
        # Copy of the chain defaults with a count and optional overridden fields.
        return {**base, "count": count, **overrides}

    sequences = [
        ["XXAGQXGSSNTGKLIXX", "XXAGQXGSSNTGKLIYY", "XXSAGQGETQYXX"],
        ["ASSXRXX"],
        ["XXIXXNDYKLSXX", "CCCC", "SSSS", "TTTT"],
    ]
    seq_metadata = [
        [seq_meta(metadata_alpha, 10, v_gene="TRAV35"),
         seq_meta(metadata_alpha, 10),
         seq_meta(metadata_beta, 10, v_gene="TRBV29-1")],
        [seq_meta(metadata_beta, 10, v_gene="TRBV7-3")],
        [seq_meta(metadata_alpha, 5, v_gene="TRAV26-2"),
         seq_meta(metadata_alpha, 2),
         seq_meta(metadata_beta, 1),
         seq_meta(metadata_beta, 2)],
    ]

    repertoires, metadata = RepertoireBuilder.build(
        sequences=sequences,
        path=path,
        labels=labels,
        seq_metadata=seq_metadata,
        subject_ids=labels["subject_id"])

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)

    label_config = LabelConfiguration()
    for label_name in ("subject_id", "label"):
        label_config.add_label(label_name, labels[label_name])

    # NOTE(review): this literal is written to reference_motifs.tsv; presumably
    # the fields are expected to be tab-separated - verify the whitespace inside it.
    file_content = """id TRAV TRBV TRA_regex TRB_regex 1 TRAV35 TRBV29-1 AGQ.GSSNTGKLI S[APGFTVML]GQGETQY 2 TRBV7-3 ASS.R.* 3 TRAV26-1 I..NDYKLS 4 TRAV26-2 I..NDYKLS """

    filepath = path + "reference_motifs.tsv"
    with open(filepath, "w") as file:
        file.writelines(file_content)

    encoder = MatchedRegexEncoder.build_object(
        dataset,
        motif_filepath=filepath,
        match_v_genes=False,
        sum_counts=True,
    )

    encoder_params = EncoderParams(result_path=path, label_config=label_config, filename="dataset.csv")
    return encoder.encode(dataset, encoder_params)
def test_parse_yaml_file(self):
    """Parse a valid specification with ImmuneMLParser, then check a corrupted one is rejected.

    The valid file must yield a symbol table containing the declared ML
    method, report, encoding and dataset. The file is then rewritten with a
    modified first line and parsing it is expected to raise ``YAMLError``.
    """
    path = EnvironmentSettings.root_path + "test/tmp/parser/"
    dataset = RepertoireDataset(
        repertoires=RepertoireBuilder.build([["AAA", "CCC"], ["TTTT"]], path, {"default": [1, 2]})[0],
        params={"default": [1, 2]})
    PickleExporter.export(dataset, path)

    spec = {
        "definitions": {
            "datasets": {
                "d1": {
                    "format": "Pickle",
                    "params": {
                        "path": path + f"{dataset.name}.iml_dataset",
                    }
                }
            },
            "encodings": {
                "a1": {
                    "Word2Vec": {
                        "k": 3,
                        "model_type": "sequence",
                        "vector_size": 8,
                    }
                },
                "a2": "Word2Vec"
            },
            "ml_methods": {
                "simpleLR": {
                    "LogisticRegression": {
                        "penalty": "l1"
                    },
                    "model_selection_cv": False,
                    "model_selection_n_folds": -1,
                },
                "simpleLR2": "LogisticRegression"
            },
            "reports": {
                "rep1": "SequenceLengthDistribution"
            }
        },
        "instructions": {}
    }

    PathBuilder.build(path)

    specs_filename = path + "tmp_yaml_spec.yaml"

    with open(specs_filename, "w") as file:
        yaml.dump(spec, file, default_flow_style=False)

    symbol_table, _ = ImmuneMLParser.parse_yaml_file(specs_filename)

    # Check each symbol on its own so a failure names the missing key.
    for key in ["simpleLR", "rep1", "a1", "d1"]:
        self.assertTrue(symbol_table.contains(key), msg=f"symbol table is missing '{key}'")
    self.assertIsInstance(symbol_table.get("d1"), RepertoireDataset)

    # Corrupt the file: the first line becomes an indented key with no trailing newline.
    with open(specs_filename, "r") as file:
        specs_text = file.readlines()
    specs_text[0] = " definitions:"
    with open(specs_filename, "w") as file:
        file.writelines(specs_text)

    # Only the parse call sits inside assertRaises, so the expected error
    # can come from the parser alone and not from the file handling above.
    with self.assertRaises(YAMLError):
        ImmuneMLParser.parse_yaml_file(specs_filename)

    shutil.rmtree(path)
def test_encoding(self):
    """Run ImmuneMLApp end to end with SequenceAbundance encoding and a ReferenceSequenceOverlap report."""
    path = EnvironmentSettings.tmp_test_path + "integration_test_emerson_encoding/"
    PathBuilder.build(path)

    ref_path = path + "reference.csv"
    reference = pd.DataFrame({
        "sequence_aas": ["GGG", "III", "TTT", "EFEF"],
        "v_alleles": ["TRBV6-1*01", "TRBV6-1*01", "TRBV6-1*01", "TRBV6-1*01"],
        'j_alleles': ["TRBJ2-7", "TRBJ2-7", "TRBJ2-7", "TRBJ2-7"]
    })
    reference.to_csv(ref_path, index=False)

    # Four repetitions of the same four repertoires (16 in total).
    sequences = [
        seqs
        for _ in range(4)
        for seqs in (["GGG", "III", "LLL", "MMM"],
                     ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
                     ["CCC", "FFF", "MMM"],
                     ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"])
    ]
    label_values = {"l1": [True, True, False, False] * 4}

    repertoires, metadata = RepertoireBuilder.build(sequences, labels=label_values, path=path)
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                params={"l1": [True, False]})
    PickleExporter.export(dataset, path)

    definitions = {
        "datasets": {
            "d1": {
                "format": "Pickle",
                "params": {
                    "path": path + f"{dataset.name}.iml_dataset",
                }
            }
        },
        "encodings": {
            "e1": {
                "SequenceAbundance": {
                    'comparison_attributes': ["sequence_aas", "v_alleles", "j_alleles"]
                }
            }
        },
        "ml_methods": {
            "knn": {
                "KNN": {
                    "n_neighbors": 1
                },
            }
        },
        "reports": {
            "r1": {
                "ReferenceSequenceOverlap": {
                    "reference_path": ref_path,
                    'comparison_attributes': ["sequence_aas", "v_alleles", "j_alleles"]
                }
            }
        }
    }

    instruction = {
        "type": "TrainMLModel",
        "settings": [{"encoding": "e1", "ml_method": "knn"}],
        "assessment": {
            "split_strategy": "random",
            "split_count": 1,
            "training_percentage": 0.7,
            "reports": {}
        },
        "selection": {
            "split_strategy": "random",
            "split_count": 1,
            "training_percentage": 0.7,
        },
        "labels": [{"l1": {"positive_class": True}}],
        "dataset": "d1",
        "strategy": "GridSearch",
        "metrics": ["accuracy"],
        "number_of_processes": 2,
        "reports": ["r1"],
        "optimization_metric": "balanced_accuracy",
        "refit_optimal_model": True,
        "store_encoded_data": False
    }

    specs = {"definitions": definitions, "instructions": {"inst1": instruction}}

    specs_file = path + "specs.yaml"
    with open(specs_file, "w") as file:
        yaml.dump(specs, file)

    # The test only requires the run to complete; no outputs are inspected.
    app = ImmuneMLApp(specs_file, path + "result/")
    app.run()

    shutil.rmtree(path)
def create_dummy_data(self, path):
    """Build a three-repertoire light/heavy chain dataset and write an IGL/IGH motif file.

    Returns ``(dataset, label_config, filepath, labels)`` where ``filepath``
    points at the written ``reference_motifs.tsv``.
    """
    # Setting up dummy data
    labels = {
        "subject_id": ["subject_1", "subject_2", "subject_3"],
        "label": ["yes", "no", "no"],
    }

    metadata_alpha = {"v_gene": "V1", "j_gene": "J1", "chain": Chain.LIGHT.value}
    metadata_beta = {"v_gene": "V1", "j_gene": "J1", "chain": Chain.HEAVY.value}

    def seq_meta(base, count, **overrides):
        # Copy of the chain defaults with a count and optional overridden fields.
        return {**base, "count": count, **overrides}

    sequences = [
        ["XXAGQXGSSNTGKLIXX", "XXAGQXGSSNTGKLIYY", "XXSAGQGETQYXX"],
        ["ASSXRXX"],
        ["XXIXXNDYKLSXX", "CCCC", "SSSS", "TTTT"],
    ]
    seq_metadata = [
        [seq_meta(metadata_alpha, 10, v_gene="IGLV35"),
         seq_meta(metadata_alpha, 10),
         seq_meta(metadata_beta, 10, v_gene="IGHV29-1")],
        [seq_meta(metadata_beta, 10, v_gene="IGHV7-3")],
        [seq_meta(metadata_alpha, 5, v_gene="IGLV26-2"),
         seq_meta(metadata_alpha, 2),
         seq_meta(metadata_beta, 1),
         seq_meta(metadata_beta, 2)],
    ]

    repertoires, metadata = RepertoireBuilder.build(
        sequences=sequences,
        path=path,
        labels=labels,
        seq_metadata=seq_metadata,
        subject_ids=labels["subject_id"])

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)

    label_config = LabelConfiguration()
    for label_name in ("subject_id", "label"):
        label_config.add_label(label_name, labels[label_name])

    # NOTE(review): this literal is written to reference_motifs.tsv; presumably
    # the fields are expected to be tab-separated - verify the whitespace inside it.
    file_content = """id IGLV IGHV IGL_regex IGH_regex 1 IGLV35 IGHV29-1 AGQ.GSSNTGKLI S[APGFTVML]GQGETQY 2 IGHV7-3 ASS.R.* 3 IGLV26-1 I..NDYKLS 4 IGLV26-2 I..NDYKLS """

    filepath = path + "reference_motifs.tsv"
    with open(filepath, "w") as file:
        file.writelines(file_content)

    return dataset, label_config, filepath, labels
def create_dataset(self, path: str) -> RepertoireDataset:
    """Build an unlabelled five-repertoire dataset under ``path``."""
    sequences = [["A", "B"], ["D"], ["E", "F"], ["B", "C"], ["A", "D"]]
    repertoires, metadata = RepertoireBuilder.build(sequences, path)
    return RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
def test_run(self):
    """Run TrainMLModelInstruction with two HP settings and check the resulting state."""
    path = EnvironmentSettings.tmp_test_path + "hpoptimproc/"
    PathBuilder.build(path)

    label_values = {
        "l1": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
        "l2": [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
    }
    # 34 repertoires, each a separate list holding the same three sequences.
    sequences = [["AAA", "CCC", "DDD"] for _ in range(34)]

    repertoires, metadata = RepertoireBuilder.build(sequences=sequences, path=path, labels=label_values)
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                params={"l1": [1, 2], "l2": [0, 1]})

    enc1 = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
    enc2 = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 6}

    hp_settings = [
        HPSetting(Word2VecEncoder.build_object(dataset, **enc1), enc1,
                  LogisticRegression(),
                  {"model_selection_cv": False, "model_selection_n_folds": -1},
                  []),
        HPSetting(Word2VecEncoder.build_object(dataset, **enc2), enc2,
                  SVM(),
                  {"model_selection_cv": False, "model_selection_n_folds": -1},
                  [ClonesPerRepertoireFilter(lower_limit=-1, upper_limit=1000)]),
    ]

    # One report instance is shared by the assessment and selection split configs.
    report = SequenceLengthDistribution()
    label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])

    assessment_split = SplitConfig(SplitType.RANDOM, 1, 0.5,
                                   reports=ReportConfig(data_splits={"seqlen": report}))
    selection_split = SplitConfig(SplitType.RANDOM, 1, 0.5,
                                  reports=ReportConfig(data_splits={"seqlen": report}))

    process = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                      assessment_split, selection_split,
                                      {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                      label_config, path)

    state = process.run(result_path=path)

    self.assertTrue(isinstance(state, TrainMLModelState))
    self.assertEqual(1, len(state.assessment_states))
    for label_name in ("l1", "l2"):
        self.assertTrue(label_name in state.assessment_states[0].label_states)

    shutil.rmtree(path)