def test_generate(self):
    path = EnvironmentSettings.root_path / "test/tmp/mlsettingsperformance/"
    PathBuilder.build(path)

    report = MLSettingsPerformance(**{"single_axis_labels": False, "x_label_position": None, "y_label_position": None})
    report.result_path = path
    report.state = self._create_state_object(path / "input_data/")

    result = report.generate_report()

    self.assertTrue(os.path.isfile(path / "performance.csv"))
    self.assertTrue(os.path.isfile(path / "performance.html"))

    self.assertIsInstance(result, ReportResult)
    self.assertEqual(result.output_figures[0].path, path / "performance.html")
    self.assertEqual(result.output_tables[0].path, path / "performance.csv")

    written_data = pd.read_csv(path / "performance.csv")
    self.assertEqual(list(written_data.columns), ["fold", "label", "encoding", "ml_method", "performance"])

    shutil.rmtree(path)
def test_subsampling(self):
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "subsampling_workflow/")
    repertoire_specs = self.build_specs(path)

    specs_filename = path / "specs.yaml"
    with open(specs_filename, "w") as file:
        yaml.dump(repertoire_specs, file)

    app = ImmuneMLApp(specs_filename, path / "result/")
    app.run()

    shutil.rmtree(path)
def test_run(self):
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "performance_overview/")
    specs_file = self._prepare_specs(path)

    tool = MultiDatasetBenchmarkTool(specs_file, path / "result/")
    tool.run()

    self.assertTrue(os.path.isfile(path / "result/benchmarking_reports/performance_overview/precision_recall_data_d1.csv"))
    self.assertTrue(os.path.isfile(path / "result/benchmarking_reports/performance_overview/precision_recall_data_d2.csv"))
    self.assertTrue(os.path.isfile(path / "result/benchmarking_reports/performance_overview/precision_recall_curve.html"))
    self.assertTrue(os.path.isfile(path / "result/benchmarking_reports/performance_overview/roc_curve.html"))

    shutil.rmtree(path)
def _generate(self) -> ReportResult:
    self.result_path = PathBuilder.build(self.result_path / self.name)
    self._extract_label()

    hp_items = [state.optimal_hp_items[self.label] for state in self.instruction_states]
    overlap_matrix = SequenceAnalysisHelper.compute_overlap_matrix(hp_items)

    labels = [state.dataset.name for state in self.instruction_states]
    figure_path = self._make_figure(overlap_matrix, labels)
    data_path = self._export_matrix(overlap_matrix, labels)

    return ReportResult(output_figures=[ReportOutput(figure_path, 'sequence overlap across datasets')],
                        output_tables=[ReportOutput(data_path, 'sequence overlap across datasets (csv)')])
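# Illustrative sketch only (not necessarily SequenceAnalysisHelper's implementation):
# a pairwise overlap matrix can be built as the percentage of sequences shared
# between every pair of sequence sets. The function name and the normalization by
# the smaller set are assumptions.
import numpy as np

def compute_overlap_matrix_sketch(sequence_sets: list) -> np.ndarray:
    n = len(sequence_sets)
    matrix = np.eye(n) * 100  # each set overlaps 100% with itself
    for i in range(n):
        for j in range(i + 1, n):
            shared = len(sequence_sets[i] & sequence_sets[j])
            smaller = min(len(sequence_sets[i]), len(sequence_sets[j]))
            matrix[i, j] = matrix[j, i] = 100 * shared / smaller if smaller else 0
    return matrix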
@staticmethod
def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
    Preprocessor.check_dataset_type(dataset, [RepertoireDataset], "ChainRepertoireFilter")
    processed_dataset = dataset.clone()
    PathBuilder.build(params["result_path"])

    repertoires = []
    indices = []
    for index, repertoire in enumerate(dataset.get_data()):
        # keep the repertoire only if every sequence uses the requested chain
        if all(sequence.metadata.chain == Chain.get_chain(params["keep_chain"]) for sequence in repertoire.sequences):
            repertoires.append(repertoire)
            indices.append(index)

    processed_dataset.repertoires = repertoires
    processed_dataset.metadata_file = ChainRepertoireFilter.build_new_metadata(processed_dataset, indices, params["result_path"])

    Filter.check_dataset_not_empty(processed_dataset, "ChainRepertoireFilter")

    return processed_dataset
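# Minimal sketch of the filtering rule above on toy data: a repertoire is kept
# only if all() of its sequences match the requested chain. Names and values
# here are hypothetical.
toy_repertoires = {"rep1": ["TRB", "TRB"], "rep2": ["TRB", "TRA"]}
keep_chain = "TRB"
kept = [name for name, chains in toy_repertoires.items()
        if all(chain == keep_chain for chain in chains)]
assert kept == ["rep1"]  # rep2 is dropped because it mixes TRA and TRB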
@staticmethod
def _implant_signals_in_repertoires(simulation_state: SimulationState = None) -> Dataset:
    repertoires_path = PathBuilder.build(simulation_state.result_path / "repertoires")
    processed_repertoires = SignalImplanter._implant_signals(simulation_state, SignalImplanter._process_repertoire, repertoires_path)

    processed_dataset = RepertoireDataset(repertoires=processed_repertoires,
                                          labels={**(simulation_state.dataset.labels if simulation_state.dataset.labels is not None else {}),
                                                  **{signal.id: [True, False] for signal in simulation_state.signals}},
                                          name=simulation_state.dataset.name,
                                          metadata_file=Path(SignalImplanter._create_metadata_file(processed_repertoires, simulation_state)))
    return processed_dataset
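# Sketch of the label-merging pattern used above: existing dataset labels are
# combined with one boolean label per implanted signal via dict unpacking.
# Label names and signal ids here are hypothetical.
existing_labels = {"disease": ["a", "b"]}
signal_ids = ["signal1", "signal2"]
merged = {**existing_labels, **{signal_id: [True, False] for signal_id in signal_ids}}
# merged == {"disease": ["a", "b"], "signal1": [True, False], "signal2": [True, False]}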
def test_sequence_dataset(self): path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "integration_dataset_gen_html_sequence/") dataset_path = path / "sequence_dataset/" specs = { "definitions": { "datasets": { "sequencedataset": { "format": "RandomSequenceDataset", "params": { "sequence_count": 10, "length_probabilities": { 10: 1 }, "labels": { "epitope_a": { True: 0.5, False: 0.5 }, "epitope_b": { True: 0.5, False: 0.5 } }, "result_path": str(dataset_path) } } } }, "instructions": { "instr1": { "type": "DatasetExport", "export_formats": ["Pickle", "AIRR"], "datasets": ["sequencedataset"] } }, "output": { "format": "HTML" } } specs_path = path / "specs.yaml" with open(specs_path, "w") as file: yaml.dump(specs, file) app = ImmuneMLApp(specs_path, path / "result/") app.run() shutil.rmtree(path)
def _generate(self) -> ReportResult:
    X = self.train_dataset.encoded_data
    predicted_y = self.method.predict(X, self.label)[self.label.name]
    predicted_proba_y = self.method.predict_proba(X, self.label)[self.label.name]
    true_y = self.train_dataset.encoded_data.labels[self.label.name]
    classes = self.method.get_classes()

    PathBuilder.build(self.result_path)

    scores = {}
    output = {'tables': [], 'figures': []}

    for metric in self.metrics_set:
        _score = TrainingPerformance._compute_score(Metric[metric], predicted_y, predicted_proba_y, true_y, classes)
        if metric == 'CONFUSION_MATRIX':
            self._generate_heatmap(classes, classes, _score, metric, output)
        else:
            scores[metric] = _score

    scores_df = pd.DataFrame.from_dict(scores, orient='index')
    scores_df.columns = [self.label.name]

    self._generate_barplot(scores_df, output)

    return ReportResult(self.name,
                        info="Plots the evaluation metrics for the given machine learning model on the training dataset.",
                        output_tables=output['tables'],
                        output_figures=output['figures'])
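# Sketch of the metric-dispatch idea above using plain scikit-learn calls:
# scalar metrics become table rows while the confusion matrix is rendered as a
# heatmap. The toy arrays are hypothetical.
from sklearn.metrics import balanced_accuracy_score, confusion_matrix

true_y = [0, 1, 1, 0]
predicted_y = [0, 1, 0, 0]
scalar_score = balanced_accuracy_score(true_y, predicted_y)  # -> table row
matrix_score = confusion_matrix(true_y, predicted_y)         # -> heatmap figure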
def test_sequence_dataset(self):
    path = EnvironmentSettings.root_path / "test/tmp/ioairr/"
    PathBuilder.build(path)
    self.create_dummy_dataset(path, False)

    column_mapping = self.get_column_mapping()
    params = {"is_repertoire": False, "result_path": path, "path": path, "import_out_of_frame": False,
              "import_with_stop_codon": False, "import_productive": True, "region_type": "IMGT_CDR3",
              "import_empty_nt_sequences": True, "import_empty_aa_sequences": False,
              "column_mapping": column_mapping, "import_illegal_characters": False, "separator": "\t",
              "sequence_file_size": 1}

    dataset = AIRRImport.import_dataset(params, "airr_sequence_dataset")

    self.assertEqual(5, dataset.get_example_count())
    self.assertEqual(5, len(dataset.get_filenames()))

    for sequence in dataset.get_data():
        self.assertEqual(sequence.amino_acid_sequence, "ASGVAGTFDY")

    v_genes = sorted(["IGHV4-59", "IGHV4-34", "IGHV4-31", "IGHV4-31", "IGHV4-31"])
    self.assertListEqual(sorted([sequence.metadata.v_gene for sequence in dataset.get_data()]), v_genes)

    shutil.rmtree(path)
def test_minimal_dataset(self):
    # test to make sure import works with minimally specified input
    path = EnvironmentSettings.root_path / "test/tmp/ioairr/"
    PathBuilder.build(path)

    # columns are tab-separated to match the "\t" separator in params below
    file1_content = "sequence_id\tjunction_aa\n" \
                    "IVKNQEJ01BVGQ6\tCASGVAGTFDYW\n" \
                    "IVKNQEJ01AQVWS\tCASGVAGTFDYW\n" \
                    "IVKNQEJ01AOYFZ\tCASGVAGNFLLX\n" \
                    "IVKNQEJ01EI5S4\tCASGVAGTFDYW"

    with open(path / "rep1.tsv", "w") as file:
        file.write(file1_content)

    with open(path / "metadata.csv", "w") as file:
        file.write("filename,subject_id\nrep1.tsv,1")

    column_mapping = self.get_column_mapping()
    params = {"is_repertoire": True, "result_path": path, "path": path, "metadata_file": path / "metadata.csv",
              "import_out_of_frame": False, "import_with_stop_codon": False, "import_productive": True,
              "region_type": "IMGT_CDR3", "import_empty_nt_sequences": True, "import_empty_aa_sequences": False,
              "column_mapping": column_mapping, "import_illegal_characters": False, "separator": "\t"}

    AIRRImport.import_dataset(params, "airr_minimal_repertoire_dataset")

    shutil.rmtree(path)
@staticmethod
def import_repertoire_dataset(import_class, params: DatasetImportParams, dataset_name: str) -> RepertoireDataset:
    """
    Creates a dataset from the metadata file and a list of repertoire files, and exports the dataset as a pickle file.

    Arguments:
        import_class: class to use for import
        params: instance of DatasetImportParams which includes information on the path, columns, result path etc.
        dataset_name: user-defined name of the dataset

    Returns:
        RepertoireDataset object that was created
    """
    try:
        metadata = pd.read_csv(params.metadata_file, sep=",")
    except Exception as e:
        raise Exception(f"{e}\nAn error occurred while reading in the metadata file {params.metadata_file}. Please see the error log above for "
                        f"more details on this error and the documentation for the expected format of the metadata.")

    ParameterValidator.assert_keys_present(metadata.columns.tolist(), ["filename"], ImportHelper.__name__, f'{dataset_name}: params: metadata_file')

    PathBuilder.build(params.result_path / "repertoires/")

    arguments = [(import_class, row, params) for index, row in metadata.iterrows()]
    with Pool(params.number_of_processes) as pool:
        repertoires = pool.starmap(ImportHelper.load_repertoire_as_object, arguments)

    new_metadata_file = ImportHelper.make_new_metadata_file(repertoires, metadata, params.result_path, dataset_name)

    potential_labels = list(set(metadata.columns.tolist()) - {"filename"})
    dataset = RepertoireDataset(labels={key: list(set(metadata[key].values.tolist())) for key in potential_labels},
                                repertoires=repertoires, metadata_file=new_metadata_file, name=dataset_name)

    PickleExporter.export(dataset, params.result_path)

    return dataset
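# Sketch of the expected metadata layout: a "filename" column is mandatory, and
# every other column is treated as a candidate label. The column names and
# values below are hypothetical.
#
#   filename,subject_id,disease
#   rep1.tsv,subj1,True
#   rep2.tsv,subj2,False
#
import pandas as pd

metadata = pd.DataFrame({"filename": ["rep1.tsv", "rep2.tsv"], "disease": [True, False]})
potential_labels = list(set(metadata.columns.tolist()) - {"filename"})
assert potential_labels == ["disease"]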
def test_repertoiredataset(self):
    path = EnvironmentSettings.tmp_test_path / "repertoiredataset_yaml"
    PathBuilder.build(path)
    self.create_dummy_dataset(path, write_metadata=True)

    old_wd = os.getcwd()
    os.chdir(path)

    yamlbuilder_main(["-r", "VDJdb", "-o", str(path), "-f", "repertoire.yaml", "-m", "metadata.csv", "-i", "True"])

    with open(path / "repertoire.yaml", "r") as file:
        loaded_receptor = yaml.load(file, Loader=yaml.FullLoader)
        self.assertDictEqual(loaded_receptor["definitions"]["datasets"],
                             {"dataset": {"format": "VDJdb",
                                          "params": {"path": "./",
                                                     "metadata_file": "metadata.csv",
                                                     "is_repertoire": True,
                                                     "region_type": RegionType.IMGT_CDR3.name,
                                                     "result_path": "./"}}})

    ImmuneMLParser.parse_yaml_file(path / "repertoire.yaml")

    os.chdir(old_wd)
    shutil.rmtree(path)
def test_sequence_export(self):
    path = EnvironmentSettings.tmp_test_path / "airr_exporter_receptor/"
    PathBuilder.build(path)

    dataset = self.create_dummy_sequencedataset(path)

    path_exported = path / "exported_sequences"
    AIRRExporter.export(dataset, path_exported)

    resulting_data = pd.read_csv(path_exported / "batch1.tsv", sep="\t")

    self.assertListEqual(list(resulting_data["sequence_id"]), ["1a", "1b"])
    self.assertListEqual(list(resulting_data["cdr3_aa"]), ["AAATTT", "ATATAT"])
    self.assertListEqual(list(resulting_data["v_call"]), ["TRAV1", "TRBV1"])
    self.assertListEqual(list(resulting_data["j_call"]), ["TRAJ1", "TRBJ1"])
    self.assertListEqual(list(resulting_data["d_call"]), ["TRAD1", "TRBD1"])
    self.assertListEqual(list(resulting_data["locus"]), ["TRA", "TRB"])
    self.assertListEqual(list(resulting_data["custom1"]), ["cust1", nan])
    self.assertListEqual(list(resulting_data["custom2"]), [nan, "cust1"])
    self.assertListEqual(list(resulting_data["productive"]), ['T', 'T'])
    self.assertListEqual(list(resulting_data["stop_codon"]), ['F', 'F'])

    resulting_data = pd.read_csv(path_exported / "batch2.tsv", sep="\t")

    self.assertListEqual(list(resulting_data["sequence_id"]), ["2b"])
    self.assertListEqual(list(resulting_data["cdr3_aa"]), ["ATATAT"])
    self.assertListEqual(list(resulting_data["v_call"]), ["TRBV1"])
    self.assertListEqual(list(resulting_data["j_call"]), ["TRBJ1"])
    self.assertListEqual(list(resulting_data["d_call"]), ["TRBD1"])
    self.assertListEqual(list(resulting_data["locus"]), ["TRB"])
    self.assertListEqual(list(resulting_data["custom2"]), ["cust1"])
    self.assertListEqual(list(resulting_data["productive"]), ['T'])
    self.assertListEqual(list(resulting_data["stop_codon"]), ['F'])

    shutil.rmtree(path)
def test_encode(self): path = EnvironmentSettings.tmp_test_path / "distance_encoder/" PathBuilder.build(path) dataset = self.create_dataset(path) enc = DistanceEncoder.build_object( dataset, **{ "distance_metric": DistanceMetricType.JACCARD.name, "attributes_to_match": ["sequence_aas"], "sequence_batch_size": 20 }) enc.set_context({"dataset": dataset}) encoded = enc.encode( dataset, EncoderParams(result_path=path, label_config=LabelConfiguration( [Label("l1", [0, 1]), Label("l2", [2, 3])]), pool_size=4, filename="dataset.pkl")) self.assertEqual(8, encoded.encoded_data.examples.shape[0]) self.assertEqual(8, encoded.encoded_data.examples.shape[1]) self.assertEqual(0, encoded.encoded_data.examples.iloc[0, 0]) self.assertEqual(0, encoded.encoded_data.examples.iloc[1, 1]) self.assertEqual(0, encoded.encoded_data.examples.iloc[0, 4]) self.assertTrue( np.array_equal([1, 0, 1, 0, 1, 0, 1, 0], encoded.encoded_data.labels["l1"])) self.assertTrue( np.array_equal([2, 3, 2, 3, 2, 3, 3, 3], encoded.encoded_data.labels["l2"])) shutil.rmtree(path)
def test_load(self):
    x = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]])
    y = {"default": np.array([1, 0, 2, 0])}

    knn = KNN()
    knn.fit(EncodedData(examples=sparse.csr_matrix(x), labels=y), Label("default"))

    path = EnvironmentSettings.root_path / "test/tmp/loadtestsklearn/"
    PathBuilder.build(path)

    with open(path / "knn.pickle", "wb") as file:
        pickle.dump(knn.model, file)

    config = MLMethodConfiguration()
    config.labels_with_values = {"default": [0, 1, 2]}
    config.store(path / "config.json")

    knn2 = KNN()
    knn2.load(path)

    self.assertTrue(isinstance(knn2.model, KNeighborsClassifier))

    shutil.rmtree(path)
def store(self, path: Path, feature_names=None, details_path=None):
    content = self._convert_object_to_dict()
    PathBuilder.build(path)

    file_path = path / FilenameHandler.get_filename(self.__class__.__name__, "pickle")
    with file_path.open("wb") as file:
        pickle.dump(content, file)

    if details_path is None:
        params_path = path / FilenameHandler.get_filename(self.__class__.__name__, "yaml")
    else:
        params_path = details_path

    with params_path.open("w") as file:
        desc = {self.label_name: {**content, "feature_names": feature_names, "classes": list(self.class_mapping.values())}}
        yaml.dump(desc, file)
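# Sketch of the store/load round trip implied above: the pickle file carries the
# full model state, while the YAML file is a human-readable description of it.
# File names and dict keys are hypothetical.
import pickle
import yaml
from pathlib import Path

content = {"weights": [0.1, 0.2]}
Path("model.pickle").write_bytes(pickle.dumps(content))
Path("model.yaml").write_text(yaml.dump({"my_label": {**content, "classes": [0, 1]}}))

restored = pickle.loads(Path("model.pickle").read_bytes())
assert restored == content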
@staticmethod
def _output_specs(file_path=None, result_path=None, definitions: dict = None, instructions: dict = None, output: dict = None) -> Path:
    filepath = ImmuneMLParser._get_full_specs_filepath(file_path, result_path)

    result = {"definitions": definitions, "instructions": instructions, "output": output}
    result = ImmuneMLParser._paths_to_strings_recursive(result)

    PathBuilder.build(filepath.parent)
    with filepath.open("w") as file:
        yaml.dump(result, file)

    print(f"{datetime.datetime.now()}: Full specification is available at {filepath}.\n", flush=True)

    return filepath
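# Hypothetical sketch of what a helper like _paths_to_strings_recursive could
# do: walk nested dicts and lists and replace every Path with its string form,
# so the specification stays YAML-serializable.
from pathlib import Path

def paths_to_strings_sketch(obj):
    if isinstance(obj, Path):
        return str(obj)
    if isinstance(obj, dict):
        return {key: paths_to_strings_sketch(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [paths_to_strings_sketch(item) for item in obj]
    return obj

# paths_to_strings_sketch({"result_path": Path("out") / "run"})
# -> {"result_path": "out/run"} on POSIX systems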
def test_load_sequence_dataset(self):
    path = EnvironmentSettings.root_path / "test/tmp/mixcr/"
    PathBuilder.build(path)
    self.create_dummy_dataset(path, add_metadata=False)

    params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path / "datasets/", "mixcr")
    params["is_repertoire"] = False
    params["paired"] = False
    params["result_path"] = path
    params["path"] = path

    dataset = MiXCRImport.import_dataset(params, "mixcr_sequence_dataset")

    seqs = list(dataset.get_data())

    self.assertTrue(seqs[0].amino_acid_sequence in ["AVLETSGSRLT", "ALVTDSWGKLQ"])  # order differs between OSX/Windows
    self.assertTrue(seqs[0].metadata.v_gene in ["TRAV21", "TRAV6"])  # order differs between OSX/Windows

    shutil.rmtree(path)
def test_dataset_generation(self):
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "cv_split_variant/")
    repertoire_specs = self.build_specs(path)

    specs_filename = path / "specs.yaml"
    with open(specs_filename, "w") as file:
        yaml.dump(repertoire_specs, file)

    app = ImmuneMLApp(specs_filename, path / "result/")
    app.run()

    shutil.rmtree(path)
def _vectorize_examples(self, examples, params: EncoderParams, keys: set) -> Tuple[np.ndarray, list]:
    if self.vectorizer_path is None:
        self.vectorizer_path = params.result_path / "vectorizer_keys.yaml"

    if params.learn_model is True:
        kmer_keys = sorted(list(keys))
        PathBuilder.build(params.result_path)
        with self.vectorizer_path.open("w") as file:
            yaml.dump(kmer_keys, file)
    else:
        with self.vectorizer_path.open("r") as file:
            kmer_keys = yaml.safe_load(file)

    vectorized_examples = [
        np.array([np.array(example[key]) if key in example else np.zeros(self.k * Util.ATCHLEY_FACTOR_COUNT + 1)
                  for key in kmer_keys])
        for example in examples]

    return np.array(vectorized_examples, dtype=np.float32), kmer_keys
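# Toy sketch of the vectorization above: every example is expanded over the same
# sorted key list, with a zero vector filling in keys the example lacks, so all
# examples end up with one fixed shape. The keys and vector size are hypothetical.
import numpy as np

kmer_keys = ["AAA", "AAC"]
example = {"AAC": [1.0, 2.0, 3.0]}
vector_size = 3
row = np.array([np.array(example[key]) if key in example else np.zeros(vector_size)
                for key in kmer_keys])
# row.shape == (2, 3); the missing "AAA" slot is all zeros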
def store(self, path: Path, feature_names=None, details_path: Path = None):
    PathBuilder.build(path)

    file_path = path / f"{self._get_model_filename()}.pickle"
    with file_path.open("wb") as file:
        dill.dump(self.model, file)

    if details_path is None:
        params_path = path / f"{self._get_model_filename()}.yaml"
    else:
        params_path = details_path

    with params_path.open("w") as file:
        desc = {
            **(self.get_params()),
            "feature_names": feature_names,
            "classes": self.model.classes_.tolist(),
            "class_mapping": self.class_mapping,
        }
        if self.label is not None:
            desc["label"] = vars(self.label)
        yaml.dump(desc, file)
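# Sketch of the loading counterpart to the dill-based store above; dill.load
# mirrors dill.dump, and the function name here is hypothetical.
import dill

def load_model_sketch(file_path):
    with open(file_path, "rb") as file:
        return dill.load(file)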
def encode(self, dataset, params: EncoderParams) -> RepertoireDataset:
    result_path = params.result_path / "encoding"
    PathBuilder.build(result_path)

    self.export_repertoire_tsv_files(result_path)

    labels = params.label_config.get_labels_by_name()
    metadata_filepath = self.export_metadata_file(dataset, labels, result_path)

    encoded_dataset = dataset.clone()
    encoded_dataset.encoded_data = EncodedData(examples=None,
                                               labels=dataset.get_metadata(labels) if params.encode_labels else None,
                                               example_ids=dataset.repertoire_ids,
                                               encoding=DeepRCEncoder.__name__,
                                               info={"metadata_filepath": metadata_filepath,
                                                     "max_sequence_length": self.max_sequence_length})

    return encoded_dataset
def test_process(self):
    path = EnvironmentSettings.root_path / "test/tmp/subject_rep_collector"
    PathBuilder.build(path)

    reps = [Repertoire.build_from_sequence_objects([ReceptorSequence("AAA", identifier="1")], path=path,
                                                   metadata={"subject_id": "patient1"}),
            Repertoire.build_from_sequence_objects([ReceptorSequence("AAC", identifier="2")], path=path,
                                                   metadata={"subject_id": "patient1"}),
            Repertoire.build_from_sequence_objects([ReceptorSequence("AAC", identifier="3")], path=path,
                                                   metadata={"subject_id": "patient3"})]

    dataset = RepertoireDataset(repertoires=reps)

    dataset2 = SubjectRepertoireCollector().process_dataset(dataset, path / "result")

    self.assertEqual(2, len(dataset2.get_data()))
    self.assertEqual(3, len(dataset.get_data()))

    values = [2, 1]
    for index, rep in enumerate(dataset2.get_data()):
        self.assertEqual(values[index], len(rep.sequences))

    shutil.rmtree(path)
def test_encode(self): path = EnvironmentSettings.tmp_test_path / "deeprc_encoder/" PathBuilder.build(path) PathBuilder.build(path / "encoded_data/") main_dataset, sub_dataset = self.create_datasets(path) enc = DeepRCEncoder.build_object(sub_dataset, **{}) enc.set_context({"dataset": main_dataset}) encoded = enc.encode( sub_dataset, EncoderParams(result_path=path / "encoded_data/", label_config=LabelConfiguration( [Label("l1", [0, 1]), Label("l2", [2, 3])]), pool_size=4)) self.assertListEqual(encoded.encoded_data.example_ids, sub_dataset.get_repertoire_ids()) self.assertTrue( os.path.isfile(encoded.encoded_data.info["metadata_filepath"])) metadata_content = pd.read_csv( encoded.encoded_data.info["metadata_filepath"], sep="\t") self.assertListEqual(list(metadata_content["ID"]), sub_dataset.get_repertoire_ids()) for repertoire in main_dataset.repertoires: rep_path = path / f"encoded_data/encoding/{repertoire.identifier}.tsv" self.assertTrue(os.path.isfile(rep_path)) repertoire_tsv = pd.read_csv(rep_path, sep="\t") self.assertListEqual(list(repertoire_tsv["amino_acid"]), list(repertoire.get_sequence_aas())) shutil.rmtree(path)
def __init__(self, train_dataset: Dataset, test_dataset: Dataset, label: Label, metrics: set, optimization_metric: Metric,
             path: Path, ml_reports: List[MLReport] = None, encoding_reports: list = None, data_reports: list = None,
             number_of_processes: int = 2, label_config: LabelConfiguration = None, report_context: dict = None,
             hp_setting: HPSetting = None):
    self.train_dataset = train_dataset
    self.test_dataset = test_dataset
    self.label = label
    self.label_config = label_config
    self.method = copy.deepcopy(hp_setting.ml_method)
    self.path = PathBuilder.build(path) if path is not None else None
    self.ml_details_path = path / "ml_details.yaml" if path is not None else None
    self.ml_score_path = path / "ml_score.csv" if path is not None else None
    self.train_predictions_path = path / "train_predictions.csv" if path is not None else None
    self.test_predictions_path = path / "test_predictions.csv" if path is not None else None
    self.report_path = PathBuilder.build(path / "reports") if path is not None else None
    self.number_of_processes = number_of_processes
    assert all(isinstance(metric, Metric) for metric in metrics), \
        "MLProcess: all metrics must be instances of Metric."
    self.metrics = metrics
    self.metrics.add(Metric.BALANCED_ACCURACY)
    self.optimization_metric = optimization_metric
    self.ml_reports = ml_reports if ml_reports is not None else []
    self.encoding_reports = encoding_reports if encoding_reports is not None else []
    self.data_reports = data_reports if data_reports is not None else []
    self.report_context = report_context
    self.hp_setting = copy.deepcopy(hp_setting)
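# Sketch of the metric bookkeeping above: metrics are validated as enum members
# and the always-reported metric is added via set semantics, so it is never
# duplicated. The enum below is hypothetical, standing in for Metric.
from enum import Enum

class MetricSketch(Enum):
    ACCURACY = "accuracy"
    BALANCED_ACCURACY = "balanced_accuracy"

metrics = {MetricSketch.ACCURACY}
assert all(isinstance(metric, MetricSketch) for metric in metrics)
metrics.add(MetricSketch.BALANCED_ACCURACY)  # idempotent if already present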
def test_generate(self): path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "kernel_sequence_logo/") dataset = RandomDatasetGenerator.generate_receptor_dataset( receptor_count=500, chain_1_length_probabilities={4: 1}, chain_2_length_probabilities={4: 1}, labels={"CMV": { True: 0.5, False: 0.5 }}, path=path / "dataset") enc_dataset = OneHotReceptorEncoder(True, 1, False, "enc1").encode( dataset, EncoderParams(path / "result", LabelConfiguration([Label("CMV", [True, False])]))) cnn = ReceptorCNN(kernel_count=2, kernel_size=[3], positional_channels=3, sequence_type="amino_acid", device="cpu", number_of_threads=4, random_seed=1, learning_rate=0.01, iteration_count=10, l1_weight_decay=0.1, evaluate_at=5, batch_size=100, training_percentage=0.8, l2_weight_decay=0.0) cnn.fit(enc_dataset.encoded_data, "CMV") report = KernelSequenceLogo(method=cnn, result_path=path / "logos/") report.generate_report() self.assertTrue(os.path.isfile(path / "logos/alpha_kernel_3_1.png")) self.assertTrue(os.path.isfile(path / "logos/alpha_kernel_3_2.png")) self.assertTrue(os.path.isfile(path / "logos/beta_kernel_3_1.png")) self.assertTrue(os.path.isfile(path / "logos/beta_kernel_3_2.png")) self.assertTrue(os.path.isfile(path / "logos/alpha_kernel_3_1.csv")) self.assertTrue(os.path.isfile(path / "logos/alpha_kernel_3_2.csv")) self.assertTrue(os.path.isfile(path / "logos/beta_kernel_3_1.csv")) self.assertTrue(os.path.isfile(path / "logos/beta_kernel_3_2.csv")) self.assertTrue( os.path.isfile(path / "logos/fully_connected_layer_weights.csv")) self.assertTrue( os.path.isfile(path / "logos/fully_connected_layer_weights.html")) shutil.rmtree(path)
@classmethod
def build_like(cls, repertoire, indices_to_keep: list, result_path: Path, filename_base: str = None):
    if indices_to_keep is not None and len(indices_to_keep) > 0:
        PathBuilder.build(result_path)

        data = repertoire.load_data()
        data = data[indices_to_keep]
        identifier = uuid4().hex
        filename_base = filename_base if filename_base is not None else identifier

        data_filename = result_path / f"{filename_base}.npy"
        np.save(str(data_filename), data)

        metadata_filename = result_path / f"{filename_base}_metadata.yaml"
        shutil.copyfile(repertoire.metadata_filename, metadata_filename)

        new_repertoire = Repertoire(data_filename, metadata_filename, identifier)
        return new_repertoire
    else:
        return None
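# Sketch of the fancy-indexing + .npy round trip used above: selecting rows by
# index and persisting them with numpy keeps the subset loadable later. Data and
# file name are hypothetical.
import numpy as np

data = np.arange(10)
subset = data[[0, 2, 5]]  # fancy indexing keeps only the requested rows
np.save("subset.npy", subset)
assert np.load("subset.npy").tolist() == [0, 2, 5]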
def store(self, path: Path, feature_names=None, details_path: Path = None):
    PathBuilder.build(path)

    torch.save(copy.deepcopy(self.logistic_regression).state_dict(), str(path / "log_reg.pt"))

    custom_vars = copy.deepcopy(vars(self))

    coefficients_df = pd.DataFrame(custom_vars["logistic_regression"].linear.weight.detach().numpy(), columns=feature_names)
    coefficients_df["bias"] = custom_vars["logistic_regression"].linear.bias.detach().numpy()
    coefficients_df.to_csv(path / "coefficients.csv", index=False)

    del custom_vars["result_path"]
    del custom_vars["logistic_regression"]
    del custom_vars["label"]

    if self.label:
        custom_vars["label"] = vars(self.label)

    params_path = path / "custom_params.yaml"
    with params_path.open('w') as file:
        yaml.dump(custom_vars, file)
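# Sketch of the state_dict round trip implied by the torch.save call above: the
# weights written to log_reg.pt can be restored into a module with the same
# architecture. The layer shapes here are hypothetical.
import torch

model = torch.nn.Linear(in_features=4, out_features=2)
torch.save(model.state_dict(), "log_reg.pt")

restored = torch.nn.Linear(in_features=4, out_features=2)
restored.load_state_dict(torch.load("log_reg.pt"))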
@staticmethod
def export(hp_item: HPItem, path: Path) -> Path:
    PathBuilder.build(path)

    preproc_filename = MLExporter._store_preprocessing_sequence(hp_item.hp_setting.preproc_sequence, path).name
    encoder_filename = MLExporter._store_encoder(hp_item.hp_setting.encoder, path).name
    hp_item.method.store(path, hp_item.method.get_feature_names())

    labels_with_values = {hp_item.method.get_label_name(): hp_item.method.get_classes()}

    method_config = MLMethodConfiguration(labels_with_values=labels_with_values,
                                          software_used=hp_item.method.get_package_info(),
                                          encoding_name=hp_item.hp_setting.encoder_name,
                                          encoding_parameters=hp_item.hp_setting.encoder_params,
                                          encoding_file=encoder_filename,
                                          encoding_class=type(hp_item.hp_setting.encoder).__name__,
                                          ml_method=type(hp_item.method).__name__,
                                          ml_method_name=hp_item.method.name,
                                          train_dataset_id=hp_item.train_dataset.identifier,
                                          train_dataset_name=hp_item.train_dataset.name,
                                          preprocessing_sequence_name=hp_item.hp_setting.preproc_sequence_name,
                                          preprocessing_file=os.path.basename(preproc_filename),
                                          preprocessing_parameters={type(seq).__name__: {str(key): str(val) for key, val in vars(seq).items()}
                                                                    for seq in hp_item.hp_setting.preproc_sequence})

    method_config.store(path / 'ml_config.yaml')

    return path
def test_simulation(self):
    path = EnvironmentSettings.tmp_test_path / "integration_simulation/"
    self.prepare_dataset(path)
    specs_path = self.prepare_specs(path)

    PathBuilder.build(path / "result/")

    app = ImmuneMLApp(specification_path=specs_path, result_path=path / "result/")
    app.run()

    self.assertTrue(os.path.isfile(path / "result/inst1/metadata.csv"))

    metadata_df = pd.read_csv(path / "result/inst1/metadata.csv", comment=Constants.COMMENT_SIGN)
    self.assertTrue("signal1" in metadata_df.columns)
    self.assertEqual(17, sum(metadata_df["signal1"]))

    self.assertTrue(os.path.isfile(path / "result/index.html"))
    self.assertTrue(os.path.isfile(path / "result/inst1/exported_dataset/pickle/d1.iml_dataset"))

    shutil.rmtree(path)