def test_sequence_dataset(self):
    """Run a DatasetExport instruction on a RandomSequenceDataset spec.

    The specification is written to ``specs.yaml`` inside a temporary
    directory, executed through ImmuneMLApp, and the directory is removed
    once the run returns.
    """
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "integration_dataset_gen_html_sequence/")
    dataset_path = path / "sequence_dataset/"

    # Each nested dict is a separate literal on purpose: sharing one object
    # between keys would make yaml.dump emit anchors/aliases.
    dataset_params = {
        "sequence_count": 10,
        "length_probabilities": {10: 1},
        "labels": {
            "epitope_a": {True: 0.5, False: 0.5},
            "epitope_b": {True: 0.5, False: 0.5},
        },
        "result_path": str(dataset_path),
    }
    definitions = {
        "datasets": {
            "sequencedataset": {
                "format": "RandomSequenceDataset",
                "params": dataset_params,
            }
        }
    }
    instructions = {
        "instr1": {
            "type": "DatasetExport",
            "export_formats": ["Pickle", "AIRR"],
            "datasets": ["sequencedataset"],
        }
    }
    specs = {
        "definitions": definitions,
        "instructions": instructions,
        "output": {"format": "HTML"},
    }

    specs_path = path / "specs.yaml"
    with open(specs_path, "w") as handle:
        yaml.dump(specs, handle)

    ImmuneMLApp(specs_path, path / "result/").run()

    shutil.rmtree(path)
def test_dataset_generation(self):
    """Dump the specs from ``self.build_specs`` to YAML and run them.

    Works inside the ``cv_split_variant/`` temporary directory, which is
    deleted after the application run returns.
    """
    workdir = PathBuilder.build(EnvironmentSettings.tmp_test_path / "cv_split_variant/")
    specs_filename = workdir / "specs.yaml"

    specs = self.build_specs(workdir)
    with open(specs_filename, "w") as handle:
        yaml.dump(specs, handle)

    ImmuneMLApp(specs_filename, workdir / "result/").run()

    shutil.rmtree(workdir)
def test_subsampling(self):
    """Dump the specs from ``self.build_specs`` to YAML and run them.

    Works inside the ``subsampling_workflow/`` temporary directory, which
    is deleted after the application run returns.
    """
    workdir = PathBuilder.build(EnvironmentSettings.tmp_test_path / "subsampling_workflow/")
    specs_filename = workdir / "specs.yaml"

    specs = self.build_specs(workdir)
    with open(specs_filename, "w") as handle:
        yaml.dump(specs, handle)

    ImmuneMLApp(specs_filename, workdir / "result/").run()

    shutil.rmtree(workdir)
def _run(self):
    """Run immuneML on the prepared specs and collect the zipped models.

    After the run, every file matching
    ``<instruction_name>/optimal_*/zip/*.zip`` under ``self.result_path``
    is copied into ``self.result_path / 'exported_models/'``.
    """
    PathBuilder.build(self.result_path)
    self._prepare_specs()
    ImmuneMLApp(self.yaml_path, self.result_path).run()

    # Materialise the matches before the export directory is created.
    zip_pattern = f"{self.instruction_name}/optimal_*/zip/*.zip"
    zipped_models = list(self.result_path.glob(zip_pattern))

    export_dir = PathBuilder.build(self.result_path / 'exported_models/')
    for zipped_model in zipped_models:
        shutil.copyfile(zipped_model, export_dir / zipped_model.name)

    logging.info(f"{GalaxyTrainMLModel.__name__}: immuneML has finished and the trained models were exported.")
def run(self, result_path: str):
    """Simulate a dataset with signals, then train an ML model on it.

    Args:
        result_path: location handed to ``self.build_path``; the simulated
            dataset and the machine learning analysis are written beneath
            the path it returns.
    """
    result_path = self.build_path(result_path)

    self._simulate_dataset_with_signals(result_path / "synthetic_dataset")

    print("immuneML quickstart: training a machine learning model...")
    ml_specs = self.create_specfication(result_path / "machine_learning_analysis")
    ImmuneMLApp(ml_specs, result_path / "machine_learning_analysis/result").run()
    print("immuneML quickstart: finished training a machine learning model.")
def run(self, result_path: str):
    """Simulate a dataset with signals, then train an ML model on it.

    Logging is configured to write to ``log.txt`` under the built result
    path, and ``warnings.showwarning`` is replaced so that warnings are
    passed to ``logging.warning``.

    Args:
        result_path: location handed to ``self.build_path``; all outputs
            are written beneath the path it returns.
    """
    result_path = self.build_path(result_path)

    logging.basicConfig(
        filename=Path(result_path) / "log.txt",
        level=logging.ERROR,
        format='%(asctime)s %(levelname)s: %(message)s',
    )

    # NOTE(review): the level configured above is ERROR while warnings are
    # forwarded at WARNING level - confirm this filtering is intended.
    def _forward_warning(message, category, filename, lineno, file=None, line=None):
        logging.warning(message)

    warnings.showwarning = _forward_warning

    self._simulate_dataset_with_signals(result_path / "synthetic_dataset")

    print("immuneML quickstart: training a machine learning model...")
    ml_specs = self.create_specfication(result_path / "machine_learning_analysis")
    ImmuneMLApp(ml_specs, result_path / "machine_learning_analysis/result").run()
    print("immuneML quickstart: finished training a machine learning model.")
def _run(self):
    """Run immuneML and copy the first exported path into ``result/``.

    The first state returned by the application exposes ``paths``, a
    two-level mapping; the first value of its first entry is copied as a
    directory tree to ``self.result_path / "result/"``.
    """
    PathBuilder.build(self.result_path)
    self._update_specs()

    app = ImmuneMLApp(self.yaml_path, self.result_path)
    state = app.run()[0]

    first_group = list(state.paths.values())[0]
    exported_location = list(first_group.values())[0]
    shutil.copytree(exported_location, self.result_path / "result/")

    print("Exported dataset.")
def _run(self):
    """Run immuneML and make predictions available as ``predictions.csv``.

    The predictions file reported by the first returned state is copied to
    ``self.result_path / "predictions.csv"`` unless both locations already
    resolve to the same relative path.
    """
    PathBuilder.build(self.result_path)
    self._check_specs()

    app = ImmuneMLApp(self.yaml_path, self.result_path)
    state = app.run()[0]

    target = self.result_path / "predictions.csv"
    already_in_place = os.path.relpath(state.predictions_path) == os.path.relpath(target)
    if not already_in_place:
        shutil.copy(state.predictions_path, target)

    print("Applied ML model to the dataset, predictions are available.")
def test_simulation(self):
    """Run the simulation workflow and check the files it produces.

    Verifies that ``result/inst1/metadata.csv`` exists, contains a
    ``signal1`` column summing to 17, and that the HTML index and the
    pickled dataset export are present. The temporary directory is removed
    at the end.
    """
    path = EnvironmentSettings.tmp_test_path / "integration_simulation/"

    self.prepare_dataset(path)
    specs_path = self.prepare_specs(path)

    PathBuilder.build(path / "result/")

    app = ImmuneMLApp(specification_path=specs_path, result_path=path / "result/")
    app.run()

    metadata_path = path / "result/inst1/metadata.csv"
    self.assertTrue(os.path.isfile(metadata_path))

    metadata_df = pd.read_csv(metadata_path, comment=Constants.COMMENT_SIGN)
    # assertIn reports the missing member and the container on failure,
    # unlike assertTrue(x in y) which only reports "False is not true".
    self.assertIn("signal1", metadata_df.columns)
    self.assertEqual(17, sum(metadata_df["signal1"]))

    self.assertTrue(os.path.isfile(path / "result/index.html"))
    self.assertTrue(os.path.isfile(path / "result/inst1/exported_dataset/pickle/d1.iml_dataset"))

    shutil.rmtree(path)
def test_ml(self):
    """Run the export workflow, then the import workflow, on one temp dir.

    Each run is checked only through the presence of its ``index.html``;
    the values returned by ``ImmuneMLApp.run`` are not inspected. The
    temporary directory is removed at the end.
    """
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "integration_ml/")

    # First pass: specs from prepare_specs, results in result_export/.
    specs_path = self.prepare_specs(path)
    PathBuilder.build(path / "result_export/")
    app = ImmuneMLApp(specification_path=Path(specs_path), result_path=path / "result_export/")
    app.run()
    self.assertTrue(os.path.isfile(path / "result_export/index.html"))

    # Second pass: specs from prepare_import_specs, results in result_import/.
    specs_path = self.prepare_import_specs(path)
    app = ImmuneMLApp(Path(specs_path), path / 'result_import/')
    app.run()
    self.assertTrue(os.path.isfile(path / "result_import/index.html"))

    shutil.rmtree(path)
def run_tool(yaml_path, result_path):
    """Build ``result_path`` and run ImmuneMLApp on ``yaml_path``.

    Args:
        yaml_path: specification file passed to ImmuneMLApp.
        result_path: output location; passed to ``PathBuilder.build``
            before the application is started.
    """
    PathBuilder.build(result_path)
    ImmuneMLApp(yaml_path, result_path).run()
def test_simulation_receptors(self):
    """Run a receptor-level Simulation instruction and inspect the export.

    A RandomReceptorDataset of 100 receptors is simulated with two signals
    at an implanting rate of 0.5 each. The pickled dataset export is then
    loaded and checked: every receptor carries both signal keys in its
    metadata and 50 receptors have a truthy value for each signal.
    """
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "integration_simulation_receptor/")

    # Every nested container is its own literal so that yaml.dump does not
    # emit anchors/aliases for shared objects.
    datasets = {
        "d1": {
            "format": "RandomReceptorDataset",
            "params": {
                "receptor_count": 100,
                "chain_1_length_probabilities": {10: 1},
                "chain_2_length_probabilities": {10: 1},
                "result_path": str(path / "dataset/"),
                "labels": {},
            },
        },
    }
    motifs = {
        "motif1": {
            "seed_chain1": "CC/C",
            "name_chain1": "ALPHA",
            "name_chain2": "BETA",
            "seed_chain2": "F/FF",
            "instantiation": {
                "GappedKmer": {
                    "max_gap": 1,
                    "alphabet_weights": None,
                    "position_weights": None,
                },
            },
        },
        "motif2": {
            "seed_chain1": "CCC",
            "name_chain1": "ALPHA",
            "name_chain2": "BETA",
            "seed_chain2": "FFF",
            "instantiation": "GappedKmer",
        },
    }
    signals = {
        "signal1": {
            "motifs": ["motif1", "motif2"],
            "implanting": "Receptor",
            "sequence_position_weights": None,
        },
        "signal2": {
            "motifs": ["motif1"],
            "implanting": "Receptor",
            "sequence_position_weights": None,
        },
    }
    simulations = {
        "sim1": {
            "var1": {"signals": ["signal1"], "dataset_implanting_rate": 0.5},
            "var2": {"signals": ["signal2"], "dataset_implanting_rate": 0.5},
        }
    }
    specs = {
        "definitions": {
            "datasets": datasets,
            "motifs": motifs,
            "signals": signals,
            "simulations": simulations,
        },
        "instructions": {
            "inst1": {
                "type": "Simulation",
                "dataset": "d1",
                "simulation": "sim1",
                "export_formats": ["Pickle"],
            }
        },
        "output": {"format": "HTML"},
    }

    with open(path / "specs.yaml", "w") as handle:
        yaml.dump(specs, handle)

    ImmuneMLApp(path / "specs.yaml", path / "result/").run()

    self.assertTrue(os.path.isfile(path / "result/index.html"))
    self.assertTrue(os.path.isfile(path / "result/inst1/exported_dataset/pickle/d1.iml_dataset"))

    dataset = PickleImport.import_dataset(
        {"path": path / "result/inst1/exported_dataset/pickle/d1.iml_dataset"}, "d1")

    self.assertEqual(100, dataset.get_example_count())

    # Same checks, in the same order, for each signal; get_data() is called
    # afresh for every count.
    for signal_name in ("signal1", "signal2"):
        annotated = sum(1 for receptor in dataset.get_data() if signal_name in receptor.metadata)
        self.assertEqual(100, annotated)

        implanted = sum(1 for receptor in dataset.get_data() if receptor.metadata[signal_name])
        self.assertEqual(50, implanted)

    shutil.rmtree(path)
def test_encoding(self):
    """Train a KNN on SequenceAbundance-encoded repertoires end to end.

    Builds 16 repertoires (a pattern of four repertoires repeated four
    times) with a boolean label ``l1``, exports the dataset, writes a
    reference sequence CSV for the ReferenceSequenceOverlap report, and
    runs a TrainMLModel instruction through ImmuneMLApp. The temporary
    directory is removed once the run returns.
    """
    path = EnvironmentSettings.tmp_test_path / "integration_test_emerson_encoding/"
    PathBuilder.build(path)

    ref_path = path / "reference.csv"
    reference = pd.DataFrame({
        "sequence_aas": ["GGG", "III", "TTT", "EFEF"],
        "v_alleles": ["TRBV6-1*01"] * 4,
        'j_alleles': ["TRBJ2-7"] * 4,
    })
    reference.to_csv(ref_path, index=False)

    repertoire_pattern = [
        ["GGG", "III", "LLL", "MMM"],
        ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
        ["CCC", "FFF", "MMM"],
        ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"],
    ]
    # Fresh copies per repetition so each repertoire gets its own list.
    sequences = [list(seqs) for _ in range(4) for seqs in repertoire_pattern]
    label_values = [True, True, False, False] * 4

    repertoires, metadata = RepertoireBuilder.build(sequences, labels={"l1": label_values}, path=path)

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                labels={"l1": [True, False]})
    ImmuneMLExporter.export(dataset, path)

    definitions = {
        "datasets": {
            "d1": {
                "format": "ImmuneML",
                "params": {
                    "path": str(path / f"{dataset.name}.iml_dataset"),
                },
            }
        },
        "encodings": {
            "e1": {
                "SequenceAbundance": {
                    'comparison_attributes': ["sequence_aas", "v_alleles", "j_alleles"],
                }
            }
        },
        "ml_methods": {
            "knn": {
                "KNN": {"n_neighbors": 1},
            }
        },
        "reports": {
            "r1": {
                "ReferenceSequenceOverlap": {
                    "reference_path": str(ref_path),
                    'comparison_attributes': ["sequence_aas", "v_alleles", "j_alleles"],
                }
            }
        },
    }
    train_instruction = {
        "type": "TrainMLModel",
        "settings": [{"encoding": "e1", "ml_method": "knn"}],
        "assessment": {
            "split_strategy": "random",
            "split_count": 1,
            "training_percentage": 0.7,
            "reports": {},
        },
        "selection": {
            "split_strategy": "random",
            "split_count": 1,
            "training_percentage": 0.7,
        },
        "labels": [{"l1": {"positive_class": True}}],
        "dataset": "d1",
        "strategy": "GridSearch",
        "metrics": ["accuracy"],
        "number_of_processes": 2,
        "reports": ["r1"],
        "optimization_metric": "balanced_accuracy",
        "refit_optimal_model": True,
    }
    specs = {
        "definitions": definitions,
        "instructions": {"inst1": train_instruction},
    }

    specs_file = path / "specs.yaml"
    with open(specs_file, "w") as handle:
        yaml.dump(specs, handle)

    ImmuneMLApp(specs_file, path / "result").run()

    shutil.rmtree(path)
def test(self):
    """Run a TrainMLModel instruction with ReceptorCNN on random receptors.

    The spec uses a RandomReceptorDataset of 500 receptors with a
    ``cmv_epitope`` label and a OneHot encoding; it is written to
    ``specs.yaml``, executed through ImmuneMLApp, and the temporary
    directory is removed once the run returns.
    """
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "integration_receptor_cnn_workflow/")

    definitions = {
        "datasets": {
            "d1": {
                "format": "RandomReceptorDataset",
                "params": {
                    "result_path": str(path / "generated_dataset/"),
                    "receptor_count": 500,
                    "chain_1_length_probabilities": {5: 1.},
                    "chain_2_length_probabilities": {6: 1.},
                    "labels": {
                        "cmv_epitope": {True: 0.5, False: 0.5},
                    },
                },
            }
        },
        "encodings": {
            "enc1": {
                "OneHot": {"use_positional_info": True},
            }
        },
        "ml_methods": {
            "cnn": {
                "ReceptorCNN": {
                    "iteration_count": 1000,
                    "evaluate_at": 10,
                    "batch_size": 100,
                    "number_of_threads": 4,
                }
            }
        },
    }
    train_instruction = {
        "type": "TrainMLModel",
        "settings": [{"encoding": "enc1", "ml_method": "cnn"}],
        "assessment": {
            "split_strategy": "random",
            "split_count": 1,
            "training_percentage": 0.7,
        },
        "selection": {
            "split_strategy": "random",
            "split_count": 1,
            "training_percentage": 1,
        },
        "labels": ["cmv_epitope"],
        "dataset": "d1",
        "strategy": "GridSearch",
        "metrics": ["accuracy"],
        "number_of_processes": 4,
        "reports": None,
        "optimization_metric": "balanced_accuracy",
        "refit_optimal_model": False,
    }
    specs = {
        "definitions": definitions,
        "instructions": {"instr1": train_instruction},
    }

    with open(path / "specs.yaml", "w") as handle:
        yaml.dump(specs, handle)

    ImmuneMLApp(path / "specs.yaml", path / 'result/').run()

    shutil.rmtree(path)
def test_generate(self):
    """Run TrainMLModel with the DiseaseAssociatedSequenceCVOverlap report.

    Builds 14 repertoires (two alternating sequence sets, labelled
    True/False), exports them, runs the instruction through ImmuneMLApp and
    checks that the first state has exactly one report result whose figures
    and tables are non-empty and exist on disk.
    """
    path = EnvironmentSettings.tmp_test_path / "disease_assoc_seq_cv/"
    PathBuilder.build(path)

    repertoire_pair = [["GGG", "III", "LLL", "MMM"], ["DDD", "EEE", "FFF"]]
    # Fresh copies per repetition so each repertoire gets its own list.
    sequences = [list(seqs) for _ in range(7) for seqs in repertoire_pair]
    label_values = [True, False] * 7

    repertoires, metadata = RepertoireBuilder.build(sequences, labels={"l1": label_values}, path=path)

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                labels={"l1": [True, False]})
    PickleExporter.export(dataset, path)

    definitions = {
        "datasets": {
            "d1": {
                "format": "Pickle",
                "params": {
                    "path": str(path / f"{dataset.name}.iml_dataset"),
                },
            }
        },
        "encodings": {
            "e1": {
                "SequenceAbundance": {'p_value_threshold': 0.5},
            }
        },
        "ml_methods": {
            "knn": {
                "KNN": {"n_neighbors": 1},
            }
        },
        "reports": {
            "r1": {
                "DiseaseAssociatedSequenceCVOverlap": {
                    "compare_in_selection": True,
                    "compare_in_assessment": True,
                }
            }
        },
    }
    train_instruction = {
        "type": "TrainMLModel",
        "settings": [{"encoding": "e1", "ml_method": "knn"}],
        "assessment": {
            "split_strategy": "random",
            "split_count": 1,
            "training_percentage": 0.5,
            "reports": {},
        },
        "selection": {
            "split_strategy": "random",
            "split_count": 1,
            "training_percentage": 0.5,
        },
        "labels": [{"l1": {"positive_class": True}}],
        "dataset": "d1",
        "strategy": "GridSearch",
        "metrics": ["accuracy"],
        "number_of_processes": 2,
        "reports": ["r1"],
        "optimization_metric": "balanced_accuracy",
        "refit_optimal_model": True,
        "store_encoded_data": False,
    }
    specs = {
        "definitions": definitions,
        "instructions": {"inst1": train_instruction},
    }

    specs_file = path / "specs.yaml"
    with open(specs_file, "w") as handle:
        yaml.dump(specs, handle)

    state = ImmuneMLApp(specs_file, path / "result/").run()[0]

    self.assertEqual(1, len(state.report_results))
    self.assertTrue(len(state.report_results[0].output_figures) > 0)
    self.assertTrue(len(state.report_results[0].output_tables) > 0)

    for figure in state.report_results[0].output_figures:
        self.assertTrue(os.path.isfile(figure.path))
    for output_table in state.report_results[0].output_tables:
        self.assertTrue(os.path.isfile(output_table.path))

    shutil.rmtree(path)