def _create_dummy_lr_model(self, path, encoded_data, label):
    """Fit a throwaway logistic regression on the given encoded data via 2-fold CV and return it."""
    # NOTE(review): `path` is accepted but never used here — presumably kept for
    # signature parity with sibling helpers; confirm before removing.
    model = LogisticRegression()
    model.fit_by_cross_validation(encoded_data, number_of_splits=2, label_name=label)
    return model
def test_generate_next_setting(self):
    """GridSearch yields each HPSetting once, in order, then None when exhausted."""
    shared_ml_params = {"model_selection_cv": False, "model_selection_n_fold": -1}
    hp_settings = [
        HPSetting(encoder=KmerFrequencyEncoder, encoder_params={}, encoder_name="enc1",
                  ml_method=LogisticRegression(), ml_params=dict(shared_ml_params),
                  ml_method_name="ml1", preproc_sequence=[]),
        HPSetting(encoder=Word2VecEncoder, encoder_params={}, encoder_name="enc2",
                  ml_method=LogisticRegression(), ml_params=dict(shared_ml_params),
                  ml_method_name="ml2", preproc_sequence=[]),
    ]
    search = GridSearch(hp_settings)

    first = search.generate_next_setting()
    second = search.generate_next_setting(first, 0.7)
    third = search.generate_next_setting(second, 0.8)

    self.assertIsNone(third)
    self.assertEqual(KmerFrequencyEncoder, first.encoder)
    self.assertEqual(Word2VecEncoder, second.encoder)
def test_get_optimal_hps(self):
    """After exhausting all settings, the setting with the highest reported score is optimal."""
    shared_ml_params = {"model_selection_cv": False, "model_selection_n_fold": -1}
    hp_settings = [
        HPSetting(encoder=KmerFrequencyEncoder, encoder_params={}, encoder_name="e1",
                  ml_method=LogisticRegression(), ml_params=dict(shared_ml_params),
                  ml_method_name="ml1", preproc_sequence=[]),
        HPSetting(encoder=Word2VecEncoder, encoder_params={}, encoder_name='e2',
                  ml_method=LogisticRegression(), ml_params=dict(shared_ml_params),
                  ml_method_name="ml2", preproc_sequence=[]),
    ]
    search = GridSearch(hp_settings)

    # report 0.7 for the first setting and 0.8 for the second
    first = search.generate_next_setting()
    second = search.generate_next_setting(first, 0.7)
    search.generate_next_setting(second, 0.8)

    self.assertEqual(hp_settings[1], search.get_optimal_hps())
def test_run(self):
    """MLApplication end-to-end: train LR on k-mer features, apply the instruction,
    and verify one prediction per repertoire is written."""
    path = EnvironmentSettings.tmp_test_path / "mlapplicationtest/"
    PathBuilder.build(path)

    dataset = RandomDatasetGenerator.generate_repertoire_dataset(
        50, {5: 1}, {5: 1}, {"l1": {1: 0.5, 2: 0.5}}, path / 'dataset/')

    label_config = LabelConfiguration([Label("l1", [1, 2])])
    encoder = KmerFreqRepertoireEncoder(NormalizationType.RELATIVE_FREQUENCY, ReadsType.UNIQUE,
                                        SequenceEncodingType.CONTINUOUS_KMER, 3,
                                        scale_to_zero_mean=True, scale_to_unit_variance=True)
    enc_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config,
                                                        filename="tmp_enc_dataset.pickle", pool_size=4))

    ml_method = LogisticRegression()
    ml_method.fit(enc_dataset.encoded_data, 'l1')

    hp_setting = HPSetting(encoder,
                           {"normalization_type": "relative_frequency", "reads": "unique",
                            "sequence_encoding": "continuous_kmer", "k": 3,
                            "scale_to_zero_mean": True, "scale_to_unit_variance": True},
                           ml_method, {}, [], 'enc1', 'ml1')

    # the instruction expects the fitted vectorizer/scaler inside its own result folder
    PathBuilder.build(path / 'result/instr1/')
    shutil.copy(path / 'dict_vectorizer.pickle', path / 'result/instr1/dict_vectorizer.pickle')
    shutil.copy(path / 'scaler.pickle', path / 'result/instr1/scaler.pickle')

    ml_app = MLApplicationInstruction(dataset, label_config, hp_setting, 4, "instr1", False)
    ml_app.run(path / 'result/')

    predictions_path = path / "result/instr1/predictions.csv"
    self.assertTrue(os.path.isfile(predictions_path))
    self.assertEqual(50, pd.read_csv(predictions_path).shape[0])

    shutil.rmtree(path)
def _create_dummy_lr_model(self):
    """Train a throwaway LR via 2-fold CV on 100 random 20-feature examples with
    alternating binary labels; return both the model and the encoded data."""
    encoded_tr = EncodedData(np.random.rand(100, 20), {"l1": [i % 2 for i in range(0, 100)]})
    model = LogisticRegression()
    model.fit_by_cross_validation(encoded_tr, number_of_splits=2, label=Label("l1", values=[0, 1]))
    return model, encoded_tr
def test_run(self):
    """Assess a fitted LogisticRegression on label l1 and check the returned metrics
    plus the ml_score and per-example prediction files."""
    path = EnvironmentSettings.root_path / "test/tmp/mlmethodassessment/"
    PathBuilder.build(path)

    dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(
        [["AA"], ["CC"]] * 6, path)[0])
    # 12 examples in 2 classes (labelled 1 and 3 for l1)
    dataset.encoded_data = EncodedData(
        examples=np.array([[1, 1], [1, 1], [3, 3]] * 4),
        labels={"l1": [1, 1, 3] * 4,
                "l2": [1, 2, 3] * 4})

    label_config = LabelConfiguration()
    label_config.add_label("l1", [1, 3])

    # BUGFIX: label values must match the classes actually present in the data
    # (1 and 3, as declared in label_config above); the original used values=[1, 2].
    label = Label(name='l1', values=[1, 3])

    method1 = LogisticRegression()
    method1.fit(dataset.encoded_data, label=label)

    res = MLMethodAssessment.run(MLMethodAssessmentParams(
        dataset=dataset,
        method=method1,
        metrics={Metric.ACCURACY, Metric.BALANCED_ACCURACY, Metric.F1_MACRO},
        optimization_metric=Metric.LOG_LOSS,
        predictions_path=path / "predictions.csv",
        label=label,
        ml_score_path=path / "ml_score.csv",
        split_index=1,
        path=path))

    self.assertTrue(isinstance(res, dict))
    self.assertTrue(res[Metric.LOG_LOSS.name.lower()] <= 0.1)

    self.assertTrue(os.path.isfile(path / "ml_score.csv"))
    self.assertEqual(1, pd.read_csv(path / "ml_score.csv").shape[0])
    self.assertEqual(12, pd.read_csv(path / "predictions.csv").shape[0])

    shutil.rmtree(path)
def test_fit_by_cross_validation(self):
    """fit_by_cross_validation should run without error on a small two-label dataset."""
    features = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1],
                         [1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]])
    encoded = EncodedData(features, {"test1": [1, 0, 2, 0, 1, 0, 2, 0],
                                     "test2": [1, 0, 2, 0, 1, 0, 2, 0]})
    LogisticRegression().fit_by_cross_validation(encoded, number_of_splits=2, label=Label("test2"))
def test_predict(self):
    """predict() should return one prediction per test example, each one of the
    class values seen during fitting."""
    x = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]])
    y = {"test1": [1, 0, 2, 0], "test2": [1, 0, 2, 0]}

    lr = LogisticRegression()
    lr.fit(EncodedData(x, y), Label("test2"))

    test_x = np.array([[0, 1, 0], [1, 0, 0]])
    predictions = lr.predict(EncodedData(test_x), Label("test2"))

    self.assertEqual(2, len(predictions["test2"]))
    # BUGFIX: the original only validated predictions[1]; check every prediction.
    for prediction in predictions["test2"]:
        self.assertIn(prediction, [0, 1, 2])
def _create_dummy_lr_model(self, path):
    # Dummy logistic regression fitted via 2-fold CV on 100 random observations
    # with 5 features, alternating binary labels (0/1) for "l1".
    dummy_lr = LogisticRegression()
    dummy_lr.fit_by_cross_validation(EncodedData(np.random.rand(100, 5), {"l1": [i % 2 for i in range(0, 100)]}),
                                     number_of_splits=2, label=Label("l1"))
    # Overwrite the fitted coefficients with the deterministic values 0-4 so that
    # downstream checks have a known, fixed input.
    dummy_lr.model.coef_ = np.array(list(range(0, 5))).reshape(1, -1)
    with open(path / "ml_details.yaml", "w") as file:
        yaml.dump({"l1": {"feature_names": ["AAA", "AAC", "CKJ", "KSA", "AKJ"]}}, file)
    return dummy_lr
def test(self):
    """Integration: k-mer frequency encoding + logistic regression inside a
    TrainMLModel instruction on a random sequence dataset."""
    path = EnvironmentSettings.tmp_test_path / "integration_sequence_classification/"
    dataset = RandomDatasetGenerator.generate_sequence_dataset(50, {4: 1}, {'l1': {1: 0.5, 2: 0.5}},
                                                               path / 'data')

    os.environ["cache_type"] = "test"

    encoder_params = {
        "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
        "reads": ReadsType.UNIQUE.name,
        "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
        "sequence_type": SequenceType.AMINO_ACID.name,
        "k": 3,
    }
    hp_setting = HPSetting(encoder=KmerFrequencyEncoder.build_object(dataset, **encoder_params),
                           encoder_params=encoder_params,
                           ml_method=LogisticRegression(),
                           ml_params={"model_selection_cv": False, "model_selection_n_folds": -1},
                           preproc_sequence=[])

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])

    instruction = TrainMLModelInstruction(dataset, GridSearch([hp_setting]), [hp_setting],
                                          SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
                                          SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
                                          {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                          lc, path)
    instruction.run(result_path=path)

    shutil.rmtree(path)
def test_run(self):
    """SemanticModel should execute a TrainMLModel instruction end-to-end on a
    small Word2Vec + logistic regression setup."""
    path = EnvironmentSettings.root_path / "test/tmp/smmodel/"
    PathBuilder.build(path)

    # 32 repertoires alternating between two sequence sets, with alternating labels 1/2
    repertoires, metadata = RepertoireBuilder.build([["AAA", "CCC"], ["TTTT"]] * 16, path,
                                                    {"default": [1, 2] * 16})
    dataset = RepertoireDataset(repertoires=repertoires, labels={"default": [1, 2]},
                                metadata_file=metadata)

    label_config = LabelConfiguration()
    label_config.add_label("default", [1, 2])

    enc_params = {"vector_size": 8, "model_type": ModelType.SEQUENCE.name, "k": 3}
    hp_settings = [HPSetting(Word2VecEncoder.build_object(dataset, **enc_params), enc_params,
                             LogisticRegression(),
                             {"model_selection_cv": False, "model_selection_n_folds": -1}, [])]

    assessment_split = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())
    selection_split = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())
    instruction = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                          assessment_split, selection_split,
                                          {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                          label_config, path)

    SemanticModel([instruction], path).run()

    shutil.rmtree(path)
def test_load(self):
    """A pickled sklearn model on disk should be restorable via LogisticRegression.load."""
    features = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]])
    lr = LogisticRegression()
    lr.fit(EncodedData(features, {"default": np.array([1, 0, 2, 0])}), 'default')

    path = EnvironmentSettings.root_path / "test/tmp/lr2/"
    PathBuilder.build(path)
    with open(path / "logistic_regression.pickle", "wb") as file:
        pickle.dump(lr.get_model(), file)

    restored = LogisticRegression()
    restored.load(path)
    self.assertTrue(isinstance(restored.get_model(), SklearnLogisticRegression))

    shutil.rmtree(path)
def _create_dummy_lr_model(self, path):
    """Fit a throwaway LR (2-fold CV, 100x20 random data, alternating binary labels),
    force deterministic coefficients, and dump a matching ml_details.yaml."""
    dummy_lr = LogisticRegression()
    encoded = EncodedData(np.random.rand(100, 20), {"l1": [i % 2 for i in range(0, 100)]})
    dummy_lr.fit_by_cross_validation(encoded, number_of_splits=2, label_name="l1")

    # deterministic coefficients 0..19 so downstream checks have a known input
    dummy_lr.models["l1"].coef_ = np.array(list(range(0, 20))).reshape(1, -1)

    with (path / "ml_details.yaml").open("w") as file:
        yaml.dump({"l1": {"feature_names": [f"feature{i}" for i in range(20)]}}, file)

    return dummy_lr
def test_run(self):
    """MLMethodTrainer should fit with CV, write predictions and details files,
    and return a model that can predict."""
    dataset = RepertoireDataset()
    dataset.encoded_data = EncodedData(examples=np.array([[1, 2, 3], [2, 3, 4]] * 3),
                                       labels={"l1": [1, 0] * 3, "l2": [0, 1] * 3},
                                       feature_names=["f1", "f2", "f3"])

    path = EnvironmentSettings.root_path / "test/tmp/mlmethodtrainer/"
    trained = MLMethodTrainer.run(MLMethodTrainerParams(
        result_path=path,
        dataset=dataset,
        label=Label(name="l1", values=[0, 1]),
        method=LogisticRegression(),
        model_selection_n_folds=2,
        model_selection_cv=True,
        cores_for_training=1,
        train_predictions_path=path / "predictions.csv",
        ml_details_path=path / "details.yaml",
        optimization_metric="balanced_accuracy"))

    trained.predict(EncodedData(np.array([1, 2, 3]).reshape(1, -1)), Label("l1"))
    self.assertTrue(os.path.isfile(path / "predictions.csv"))
    self.assertTrue(os.path.isfile(path / "details.yaml"))

    shutil.rmtree(path)
def test(self):
    """Integration: receptor dataset + k-mer LR; the optimal assessment item's
    performance on the optimization metric must be 1.0."""
    path = EnvironmentSettings.tmp_test_path / "integration_receptor_classification/"
    dataset = self.create_dataset(path)

    os.environ["cache_type"] = "test"

    encoder_params = {
        "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
        "reads": ReadsType.UNIQUE.name,
        "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
        "sequence_type": SequenceType.AMINO_ACID.name,
        "k": 3,
    }
    hp_setting = HPSetting(encoder=KmerFrequencyEncoder.build_object(dataset, **encoder_params),
                           encoder_params=encoder_params,
                           ml_method=LogisticRegression(),
                           ml_params={"model_selection_cv": False, "model_selection_n_folds": -1},
                           preproc_sequence=[])

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])

    instruction = TrainMLModelInstruction(
        dataset, GridSearch([hp_setting]), [hp_setting],
        SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
        SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
        {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, lc, path)
    state = instruction.run(result_path=path)
    print(vars(state))

    metric_key = state.optimization_metric.name.lower()
    optimal_item = state.assessment_states[0].label_states["l1"].optimal_assessment_item
    self.assertEqual(1.0, optimal_item.performance[metric_key])

    shutil.rmtree(path)
def _create_state_object(self, path):
    """Build a small two-label repertoire dataset, run a TrainMLModel instruction
    (Word2Vec + logistic regression), and return the resulting state."""
    # 34 identical repertoires; l1 alternates 1/2, l2 follows a mixed 0/1 pattern
    repertoires, metadata = RepertoireBuilder.build(
        sequences=[["AAA", "CCC", "DDD"]] * 34,
        path=path,
        labels={"l1": [1, 2] * 17,
                "l2": [0, 0, 1, 1] * 4 + [0, 1] * 9})
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                labels={"l1": [1, 2], "l2": [0, 1]})

    enc_params = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
    hp_settings = [HPSetting(Word2VecEncoder.build_object(dataset, **enc_params), enc_params,
                             LogisticRegression(),
                             {"model_selection_cv": False, "model_selection_n_folds": -1}, [])]

    label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])
    process = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                      SplitConfig(SplitType.RANDOM, 1, 0.7),
                                      SplitConfig(SplitType.RANDOM, 1, 0.7),
                                      {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                      label_config, path)
    return process.run(result_path=path)
def test_store(self):
    """store() should pickle the underlying sklearn model to disk."""
    features = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]])
    lr = LogisticRegression()
    lr.fit(EncodedData(features, {"default": np.array([1, 0, 2, 0])}), Label("default"))

    path = EnvironmentSettings.root_path / "test/tmp/lr/"
    lr.store(path, ["f1", "f2", "f3"])
    self.assertTrue(os.path.isfile(path / "logistic_regression.pickle"))

    with open(path / "logistic_regression.pickle", "rb") as file:
        persisted = pickle.load(file)
    self.assertTrue(isinstance(persisted, SklearnLogisticRegression))

    shutil.rmtree(path)
def test_fit(self):
    """fit() should train without error on a tiny three-class dataset."""
    encoded = EncodedData(np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]]),
                          {"test": np.array([1, 0, 2, 0])})
    LogisticRegression().fit(encoded, Label("test"))
def test_run(self):
    """TrainMLModel over two HP settings (Word2Vec+LR and Word2Vec+SVM with a
    preprocessing filter) and two labels; the state must cover both labels."""
    path = EnvironmentSettings.tmp_test_path / "hpoptimproc/"
    PathBuilder.build(path)

    # 34 identical repertoires; l1 alternates 1/2, l2 follows a mixed 0/1 pattern
    repertoires, metadata = RepertoireBuilder.build(
        sequences=[["AAA", "CCC", "DDD"]] * 34,
        path=path,
        labels={"l1": [1, 2] * 17,
                "l2": [0, 0, 1, 1] * 4 + [0, 1] * 9})
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                labels={"l1": [1, 2], "l2": [0, 1]})

    enc1 = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
    enc2 = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 6}
    hp_settings = [
        HPSetting(Word2VecEncoder.build_object(dataset, **enc1), enc1,
                  LogisticRegression(),
                  {"model_selection_cv": False, "model_selection_n_folds": -1}, []),
        HPSetting(Word2VecEncoder.build_object(dataset, **enc2), enc2,
                  SVM(),
                  {"model_selection_cv": False, "model_selection_n_folds": -1},
                  [ClonesPerRepertoireFilter(lower_limit=-1, upper_limit=1000)]),
    ]

    report = SequenceLengthDistribution()
    label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])

    assessment_split = SplitConfig(SplitType.RANDOM, 1, 0.5,
                                   reports=ReportConfig(data_splits={"seqlen": report}))
    selection_split = SplitConfig(SplitType.RANDOM, 1, 0.5,
                                  reports=ReportConfig(data_splits={"seqlen": report}))
    process = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                      assessment_split, selection_split,
                                      {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                      label_config, path)
    state = process.run(result_path=path)

    self.assertTrue(isinstance(state, TrainMLModelState))
    self.assertEqual(1, len(state.assessment_states))
    self.assertTrue("l1" in state.assessment_states[0].label_states)
    self.assertTrue("l2" in state.assessment_states[0].label_states)

    shutil.rmtree(path)