def create_dataset(self):
    path = Path(os.path.relpath(EnvironmentSettings.root_path / "test/tmp/immunemlapp/initial_dataset"))
    PathBuilder.build(path)

    repertoire_count = 30
    repertoires, metadata = RepertoireBuilder.build(
        [["AA", "AAAA", "AAAA", "AAA"] for i in range(repertoire_count)], path,
        {"CD": ["yes" if i % 2 == 0 else "no" for i in range(repertoire_count)],
         "CMV": [i % 2 == 1 for i in range(repertoire_count)]},
        [[{"chain": "A" if i % 2 == 0 else "B", "count": random.randint(2, 5)} for i in range(4)]
         for j in range(repertoire_count)])

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                labels={"CD": [True, False], "CMV": [True, False]}, name="d1")
    PickleExporter.export(dataset, path)

    return path / "d1.iml_dataset"

def import_sequence_dataset(import_class, params, dataset_name: str):
    PathBuilder.build(params.result_path)

    filenames = ImportHelper.get_sequence_filenames(params.path, dataset_name)

    file_index = 0
    dataset_filenames = []
    dataset_params = {}
    items = None

    for index, filename in enumerate(filenames):
        new_items = ImportHelper.import_items(import_class, filename, params)
        items = np.append(items, new_items) if items is not None else new_items
        dataset_params = ImportHelper.extract_sequence_dataset_params(items, params)

        # flush full batches to disk; on the last file, also flush the remainder
        while len(items) > params.sequence_file_size or (index == len(filenames) - 1 and len(items) > 0):
            dataset_filenames.append(params.result_path / "batch_{}.pickle".format(file_index))
            ImportHelper.store_sequence_items(dataset_filenames, items, params.sequence_file_size)
            items = items[params.sequence_file_size:]
            file_index += 1

    init_kwargs = {"filenames": dataset_filenames, "file_size": params.sequence_file_size,
                   "name": dataset_name, "labels": dataset_params}

    dataset = ReceptorDataset(**init_kwargs) if params.paired else SequenceDataset(**init_kwargs)
    PickleExporter.export(dataset, params.result_path)

    return dataset

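# Usage sketch (assumption, not from the source): import_sequence_dataset only relies on
# params exposing path, result_path, sequence_file_size and paired, so a minimal call could
# look like the following. "MyFormatImport" is a hypothetical import class; any concrete
# import class compatible with ImportHelper.import_items would take its place.
#
#   params = DatasetImportParams.build_object(**{"path": Path("data/"), "result_path": Path("out/"),
#                                                "sequence_file_size": 1000, "paired": False})
#   dataset = import_sequence_dataset(MyFormatImport, params, "my_sequence_dataset")
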
def make_random_dataset(self, path):
    alphabet = EnvironmentSettings.get_sequence_alphabet()
    # 40 repertoires of 100 random 20-mer sequences each
    sequences = [["".join(rn.choice(alphabet) for _ in range(20)) for _ in range(100)] for _ in range(40)]

    repertoires, metadata = RepertoireBuilder.build(sequences, path,
                                                    subject_ids=[i % 2 for i in range(len(sequences))])
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
    PickleExporter.export(dataset, path)

def create_dummy_dataset(self, path):
    repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path,
                                                    labels={"label1": ["val1", "val2"],
                                                            "label2": ["val1", "val2"]})

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
    dataset.name = "my_dataset"
    PickleExporter.export(dataset, path)

    return f"{dataset.name}.iml_dataset"

def import_dataset(params, dataset_name: str) -> ReceptorDataset:
    generic_params = DatasetImportParams.build_object(**params)
    filenames = ImportHelper.get_sequence_filenames(generic_params.path, dataset_name)
    PathBuilder.build(generic_params.result_path, warn_if_exists=True)

    dataset = SingleLineReceptorImport._import_from_files(filenames, generic_params)
    dataset.name = dataset_name
    dataset.labels = ImportHelper.extract_sequence_dataset_params(params=generic_params)

    PickleExporter.export(dataset, generic_params.result_path)

    return dataset

def test_load_receptors(self):
    path = EnvironmentSettings.tmp_test_path / "pickle_import_receptors/"
    PathBuilder.build(path)

    dataset = RandomDatasetGenerator.generate_receptor_dataset(10, {2: 1}, {3: 1}, {}, path)
    dataset.name = "d1"
    PickleExporter.export(dataset, path)

    receptor_dataset = PickleImport.import_dataset({"path": path / "d1.iml_dataset"}, "dataset_name")

    self.assertEqual(10, len(list(receptor_dataset.get_data())))

    shutil.rmtree(path)

def test_export_receptor_dataset(self):
    path = EnvironmentSettings.tmp_test_path / "pickleexporter_receptor/"
    PathBuilder.build(path)

    dataset = RandomDatasetGenerator.generate_receptor_dataset(10, {2: 1}, {3: 1}, {}, path)
    dataset.name = "d1"
    PickleExporter.export(dataset, path)

    with open(path / f"{dataset.name}.iml_dataset", "rb") as file:
        dataset2 = pickle.load(file)

    self.assertTrue(isinstance(dataset2, ReceptorDataset))
    self.assertEqual(10, dataset2.get_example_count())

    shutil.rmtree(path)

def import_repertoire_dataset(import_class, params: DatasetImportParams, dataset_name: str) -> RepertoireDataset:
    """
    Creates a dataset from the metadata file and a list of repertoire files, and exports it as a pickle file.

    Arguments:
        import_class: class to use for import
        params: instance of DatasetImportParams class which includes information on path, columns, result path etc.
        dataset_name: user-defined name of the dataset

    Returns:
        RepertoireDataset object that was created
    """
    metadata = pd.read_csv(params.metadata_file, sep=",")

    ParameterValidator.assert_keys_present(metadata.columns.tolist(), ["filename"], ImportHelper.__name__,
                                           f'{dataset_name}: params: metadata_file')

    PathBuilder.build(params.result_path / "repertoires/")

    # import repertoires in parallel, one process per metadata row
    arguments = [(import_class, row, params) for index, row in metadata.iterrows()]
    with Pool(params.number_of_processes) as pool:
        repertoires = pool.starmap(ImportHelper.load_repertoire_as_object, arguments)

    new_metadata_file = ImportHelper.make_new_metadata_file(repertoires, metadata, params.result_path, dataset_name)

    # every metadata column except "filename" is treated as a potential label
    potential_labels = list(set(metadata.columns.tolist()) - {"filename"})
    dataset = RepertoireDataset(labels={key: list(set(metadata[key].values.tolist())) for key in potential_labels},
                                repertoires=repertoires, metadata_file=new_metadata_file, name=dataset_name)

    PickleExporter.export(dataset, params.result_path)

    return dataset

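# Usage sketch (assumption, not from the source): the metadata file must contain a "filename"
# column, and every other column is picked up as a potential label. With a hypothetical import
# class "MyFormatImport" and a metadata.csv laid out as
#
#   filename,l1
#   rep1.tsv,True
#   rep2.tsv,False
#
# a minimal call could look like:
#
#   params = DatasetImportParams.build_object(**{"metadata_file": Path("metadata.csv"),
#                                                "result_path": Path("out/"),
#                                                "number_of_processes": 4})
#   dataset = import_repertoire_dataset(MyFormatImport, params, "my_repertoire_dataset")
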
def test_export(self):
    path = EnvironmentSettings.tmp_test_path / "pickleexporter/"
    PathBuilder.build(path)

    repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path)
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
    PickleExporter.export(dataset, path)

    with open(path / f"{dataset.name}.iml_dataset", "rb") as file:
        dataset2 = pickle.load(file)

    shutil.rmtree(path)

    self.assertTrue(isinstance(dataset2, RepertoireDataset))
    self.assertEqual(2, len(dataset2.get_data()))
    self.assertEqual("rep_0", dataset2.get_data()[0].metadata["subject_id"])

def prepare_dataset(self, path):
    PathBuilder.build(path)

    repertoires, metadata = RepertoireBuilder.build(
        # 34 identical repertoires, matching the 34 label values below
        sequences=[["AAA", "CCC", "DDD"] for _ in range(34)],
        path=path,
        labels={"l1": [1, 2] * 17,
                "l2": [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
                       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]})

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                labels={"l1": [1, 2], "l2": [0, 1]}, name="dataset1")
    PickleExporter.export(dataset, path)

def test_run(self):
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "api_galaxy_yaml_tool/")
    result_path = path / "result/"

    dataset = RandomDatasetGenerator.generate_repertoire_dataset(10, {10: 1}, {12: 1}, {}, result_path)
    dataset.name = "d1"
    PickleExporter.export(dataset, result_path)

    specs = {
        "definitions": {
            "datasets": {
                "new_d1": {
                    "format": "Pickle",
                    "params": {"metadata_file": str(result_path / "d1_metadata.csv")}
                },
                "d2": {
                    "format": "RandomRepertoireDataset",
                    "params": {
                        "repertoire_count": 50,
                        "sequence_length_probabilities": {10: 1},
                        "sequence_count_probabilities": {10: 1},
                        "labels": {"CD": {True: 0.5, False: 0.5}}
                    }
                }
            },
            "encodings": {
                "e1": {"Word2Vec": {"k": 3, "model_type": "sequence", "vector_size": 8}},
                "e2": {"Word2Vec": {"k": 3, "model_type": "sequence", "vector_size": 10}}
            },
            "ml_methods": {
                "simpleLR": {
                    "LogisticRegression": {"penalty": "l1"},
                    "model_selection_cv": False,
                    "model_selection_n_folds": -1
                }
            }
        },
        "instructions": {
            "inst1": {
                "type": "DatasetExport",
                "datasets": ["new_d1", "d2"],
                "export_formats": ["AIRR"]
            },
            "inst2": {
                "type": "TrainMLModel",
                "settings": [{"encoding": "e1", "ml_method": "simpleLR"},
                             {"encoding": "e2", "ml_method": "simpleLR"}],
                "assessment": {"split_strategy": "random", "split_count": 1, "training_percentage": 0.7},
                "selection": {"split_strategy": "random", "split_count": 2, "training_percentage": 0.7},
                "labels": ["CD"],
                "dataset": "d2",
                "strategy": "GridSearch",
                "metrics": ["accuracy", "auc"],
                "reports": [],
                "number_of_processes": 10,
                "optimization_metric": "accuracy",
                "refit_optimal_model": False,
                "store_encoded_data": False
            }
        }
    }

    specs_path = path / "specs.yaml"
    with open(specs_path, "w") as file:
        yaml.dump(specs, file)

    run_immuneML(Namespace(**{"specification_path": specs_path,
                              "result_path": result_path / "result/",
                              "tool": "GalaxyYamlTool"}))

    self.assertTrue(os.path.exists(result_path / "result/inst1/new_d1/AIRR"))
    self.assertTrue(os.path.exists(result_path / "result/inst1/d2/AIRR"))
    self.assertTrue(os.path.exists(result_path / "result/d2"))

    shutil.rmtree(path)

def store(encoded_dataset, params: EncoderParams):
    PickleExporter.export(encoded_dataset, params.result_path)

def test_parse_yaml_file(self):
    path = EnvironmentSettings.root_path / "test/tmp/parser/"
    dataset = RepertoireDataset(repertoires=RepertoireBuilder.build([["AAA", "CCC"], ["TTTT"]], path,
                                                                    {"default": [1, 2]})[0],
                                labels={"default": [1, 2]})
    PickleExporter.export(dataset, path)

    spec = {
        "definitions": {
            "datasets": {
                "d1": {
                    "format": "Pickle",
                    "params": {"path": str(path / f"{dataset.name}.iml_dataset")}
                }
            },
            "encodings": {
                "a1": {"Word2Vec": {"k": 3, "model_type": "sequence", "vector_size": 8}},
                "a2": "Word2Vec"
            },
            "ml_methods": {
                "simpleLR": {
                    "LogisticRegression": {"penalty": "l1"},
                    "model_selection_cv": False,
                    "model_selection_n_folds": -1
                },
                "simpleLR2": "LogisticRegression"
            },
            "reports": {
                "rep1": "SequenceLengthDistribution"
            }
        },
        "instructions": {}
    }

    PathBuilder.build(path)

    specs_filename = path / "tmp_yaml_spec.yaml"
    with specs_filename.open("w") as file:
        yaml.dump(spec, file, default_flow_style=False)

    symbol_table, _ = ImmuneMLParser.parse_yaml_file(specs_filename, result_path=path)

    self.assertTrue(all([symbol_table.contains(key) for key in ["simpleLR", "rep1", "a1", "d1"]]))
    self.assertTrue(isinstance(symbol_table.get("d1"), RepertoireDataset))

    # corrupt the indentation of the first line and check that parsing fails
    with self.assertRaises(YAMLError):
        with specs_filename.open("r") as file:
            specs_text = file.readlines()
        specs_text[0] = " definitions:"
        with specs_filename.open("w") as file:
            file.writelines(specs_text)

        ImmuneMLParser.parse_yaml_file(specs_filename, result_path=path)

    shutil.rmtree(path)

def test_encoding(self):
    path = EnvironmentSettings.tmp_test_path / "integration_test_emerson_encoding/"
    PathBuilder.build(path)

    ref_path = path / "reference.csv"
    pd.DataFrame({"sequence_aas": ["GGG", "III", "TTT", "EFEF"],
                  "v_alleles": ["TRBV6-1*01", "TRBV6-1*01", "TRBV6-1*01", "TRBV6-1*01"],
                  "j_alleles": ["TRBJ2-7", "TRBJ2-7", "TRBJ2-7", "TRBJ2-7"]}).to_csv(ref_path, index=False)

    # four repertoire patterns repeated 4 times (16 repertoires), with matching labels
    repertoires, metadata = RepertoireBuilder.build(
        [["GGG", "III", "LLL", "MMM"],
         ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
         ["CCC", "FFF", "MMM"],
         ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"]] * 4,
        labels={"l1": [True, True, False, False] * 4},
        path=path)

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, labels={"l1": [True, False]})
    PickleExporter.export(dataset, path)

    specs = {
        "definitions": {
            "datasets": {
                "d1": {
                    "format": "Pickle",
                    "params": {"path": str(path / f"{dataset.name}.iml_dataset")}
                }
            },
            "encodings": {
                "e1": {
                    "SequenceAbundance": {
                        "comparison_attributes": ["sequence_aas", "v_alleles", "j_alleles"]
                    }
                }
            },
            "ml_methods": {
                "knn": {"KNN": {"n_neighbors": 1}}
            },
            "reports": {
                "r1": {
                    "ReferenceSequenceOverlap": {
                        "reference_path": str(ref_path),
                        "comparison_attributes": ["sequence_aas", "v_alleles", "j_alleles"]
                    }
                }
            }
        },
        "instructions": {
            "inst1": {
                "type": "TrainMLModel",
                "settings": [{"encoding": "e1", "ml_method": "knn"}],
                "assessment": {"split_strategy": "random", "split_count": 1,
                               "training_percentage": 0.7, "reports": {}},
                "selection": {"split_strategy": "random", "split_count": 1,
                              "training_percentage": 0.7},
                "labels": [{"l1": {"positive_class": True}}],
                "dataset": "d1",
                "strategy": "GridSearch",
                "metrics": ["accuracy"],
                "number_of_processes": 2,
                "reports": ["r1"],
                "optimization_metric": "balanced_accuracy",
                "refit_optimal_model": True,
                "store_encoded_data": False
            }
        }
    }

    specs_file = path / "specs.yaml"
    with open(specs_file, "w") as file:
        yaml.dump(specs, file)

    app = ImmuneMLApp(specs_file, path / "result")
    app.run()

    shutil.rmtree(path)

def test_generate(self):
    path = EnvironmentSettings.tmp_test_path / "disease_assoc_seq_cv/"
    PathBuilder.build(path)

    # two repertoire patterns alternating 7 times (14 repertoires), with matching labels
    repertoires, metadata = RepertoireBuilder.build(
        [["GGG", "III", "LLL", "MMM"], ["DDD", "EEE", "FFF"]] * 7,
        labels={"l1": [True, False] * 7},
        path=path)

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, labels={"l1": [True, False]})
    PickleExporter.export(dataset, path)

    specs = {
        "definitions": {
            "datasets": {
                "d1": {
                    "format": "Pickle",
                    "params": {"path": str(path / f"{dataset.name}.iml_dataset")}
                }
            },
            "encodings": {
                "e1": {"SequenceAbundance": {"p_value_threshold": 0.5}}
            },
            "ml_methods": {
                "knn": {"KNN": {"n_neighbors": 1}}
            },
            "reports": {
                "r1": {
                    "DiseaseAssociatedSequenceCVOverlap": {
                        "compare_in_selection": True,
                        "compare_in_assessment": True
                    }
                }
            }
        },
        "instructions": {
            "inst1": {
                "type": "TrainMLModel",
                "settings": [{"encoding": "e1", "ml_method": "knn"}],
                "assessment": {"split_strategy": "random", "split_count": 1,
                               "training_percentage": 0.5, "reports": {}},
                "selection": {"split_strategy": "random", "split_count": 1,
                              "training_percentage": 0.5},
                "labels": [{"l1": {"positive_class": True}}],
                "dataset": "d1",
                "strategy": "GridSearch",
                "metrics": ["accuracy"],
                "number_of_processes": 2,
                "reports": ["r1"],
                "optimization_metric": "balanced_accuracy",
                "refit_optimal_model": True,
                "store_encoded_data": False
            }
        }
    }

    specs_file = path / "specs.yaml"
    with open(specs_file, "w") as file:
        yaml.dump(specs, file)

    app = ImmuneMLApp(specs_file, path / "result/")
    state = app.run()[0]

    self.assertEqual(1, len(state.report_results))
    self.assertTrue(len(state.report_results[0].output_figures) > 0)
    self.assertTrue(len(state.report_results[0].output_tables) > 0)

    for fig in state.report_results[0].output_figures:
        self.assertTrue(os.path.isfile(fig.path))
    for table in state.report_results[0].output_tables:
        self.assertTrue(os.path.isfile(table.path))

    shutil.rmtree(path)