def _create_report(self, path):
    report = ConfounderAnalysis.build_object(metadata_labels=["age", "HLA"], name='test')
    report.ml_details_path = path / "ml_details.yaml"
    report.label = Label("disease")
    report.result_path = path

    encoder = KmerFrequencyEncoder.build_object(RepertoireDataset(), **{
        "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
        "reads": ReadsType.UNIQUE.name,
        "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
        "k": 3,
        "sequence_type": SequenceType.AMINO_ACID.name
    })

    report.train_dataset = self._encode_dataset(encoder, self._make_dataset(path / "train", size=100), path)
    report.test_dataset = self._encode_dataset(encoder, self._make_dataset(path / "test", size=40), path, learn_model=False)
    report.method = self._create_dummy_lr_model(path, report.train_dataset.encoded_data, Label("disease"))

    return report

def test_fit(self):
    x, y, encoded_data = self._prepare_data()

    knn = TCRdistClassifier(percentage=0.75)
    knn.fit(encoded_data, Label("test"), cores_for_training=4)

    predictions = knn.predict(encoded_data, Label("test"))
    self.assertTrue(np.array_equal(y["test"], predictions["test"]))

    encoded_data.examples = np.array([[1.1, 0.1, 0.9, 1.9]])
    predictions = knn.predict(encoded_data, Label("test"))
    self.assertTrue(np.array_equal([0], predictions["test"]))

def test_generate(self): path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "tcrdist_motif_discovery/") dataset_path = self._create_dataset(path) dataset = SingleLineReceptorImport.import_dataset( { "path": dataset_path, "result_path": path / "dataset/", "separator": ",", "columns_to_load": [ "subject", "epitope", "count", "v_a_gene", "j_a_gene", "cdr3_a_aa", "v_b_gene", "j_b_gene", "cdr3_b_aa", "clone_id", "cdr3_a_nucseq", "cdr3_b_nucseq" ], "column_mapping": { "cdr3_a_aa": "alpha_amino_acid_sequence", "cdr3_b_aa": "beta_amino_acid_sequence", "cdr3_a_nucseq": "alpha_nucleotide_sequence", "cdr3_b_nucseq": "beta_nucleotide_sequence", "v_a_gene": "alpha_v_gene", "v_b_gene": "beta_v_gene", "j_a_gene": "alpha_j_gene", "j_b_gene": "beta_j_gene", "clone_id": "identifier" }, "receptor_chains": "TRA_TRB", "region_type": "IMGT_CDR3", "sequence_file_size": 50000, "organism": "mouse" }, 'd1') dataset = TCRdistEncoder(8).encode( dataset, EncoderParams(path / "result", LabelConfiguration([Label("epitope")]))) report = TCRdistMotifDiscovery(train_dataset=dataset, test_dataset=dataset, result_path=path / "report", name="report name", cores=8, positive_class_name="PA", min_cluster_size=3) report.label = Label("epitope") report._generate() shutil.rmtree(path)
def test_predict(self): x = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]]) y = {"test1": [1, 0, 2, 0], "test2": [1, 0, 2, 0]} knn = KNN(parameters={"n_neighbors": 2}) knn.fit(EncodedData(sparse.csr_matrix(x), labels=y), Label("test2")) test_x = np.array([[0, 1, 0], [1, 0, 0]]) y = knn.predict(EncodedData(sparse.csr_matrix(test_x)), Label("test2")) self.assertTrue(len(y["test2"]) == 2) self.assertTrue(y["test2"][1] in [0, 1, 2])
def test_predict(self): x = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]]) y = {"test1": [1, 0, 2, 0], "test2": [1, 0, 2, 0]} lr = LogisticRegression() lr.fit(EncodedData(x, y), Label("test2")) test_x = np.array([[0, 1, 0], [1, 0, 0]]) y = lr.predict(EncodedData(test_x), Label("test2")) self.assertTrue(len(y["test2"]) == 2) self.assertTrue(y["test2"][1] in [0, 1, 2])
def test_predict(self): x = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]]) y = {"default": np.array([1, 0, 2, 0])} rfc = RandomForestClassifier() rfc.fit(EncodedData(x, y), Label("default")) test_x = np.array([[0, 1, 0], [1, 0, 0]]) y = rfc.predict(EncodedData(test_x), Label("default"))["default"] self.assertTrue(len(y) == 2) self.assertTrue(y[0] in [0, 1, 2]) self.assertTrue(y[1] in [0, 1, 2])
def test_predict(self): x = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]]) y = {"test": np.array([1, 0, 2, 0])} svm = SVM() svm.fit(EncodedData(x, y), Label("test")) test_x = np.array([[0, 1, 0], [1, 0, 0]]) y = svm.predict(EncodedData(test_x), Label("test"))["test"] self.assertTrue(len(y) == 2) self.assertTrue(y[0] in [0, 1, 2]) self.assertTrue(y[1] in [0, 1, 2])
def test_generate(self): path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "kernel_sequence_logo/") dataset = RandomDatasetGenerator.generate_receptor_dataset( receptor_count=500, chain_1_length_probabilities={4: 1}, chain_2_length_probabilities={4: 1}, labels={"CMV": { True: 0.5, False: 0.5 }}, path=path / "dataset") enc_dataset = OneHotReceptorEncoder(True, 1, False, "enc1").encode( dataset, EncoderParams(path / "result", LabelConfiguration([Label("CMV", [True, False])]))) cnn = ReceptorCNN(kernel_count=2, kernel_size=[3], positional_channels=3, sequence_type="amino_acid", device="cpu", number_of_threads=4, random_seed=1, learning_rate=0.01, iteration_count=10, l1_weight_decay=0.1, evaluate_at=5, batch_size=100, training_percentage=0.8, l2_weight_decay=0.0) cnn.fit(enc_dataset.encoded_data, Label("CMV")) report = KernelSequenceLogo(method=cnn, result_path=path / "logos/") report.generate_report() self.assertTrue(os.path.isfile(path / "logos/alpha_kernel_3_1.png")) self.assertTrue(os.path.isfile(path / "logos/alpha_kernel_3_2.png")) self.assertTrue(os.path.isfile(path / "logos/beta_kernel_3_1.png")) self.assertTrue(os.path.isfile(path / "logos/beta_kernel_3_2.png")) self.assertTrue(os.path.isfile(path / "logos/alpha_kernel_3_1.csv")) self.assertTrue(os.path.isfile(path / "logos/alpha_kernel_3_2.csv")) self.assertTrue(os.path.isfile(path / "logos/beta_kernel_3_1.csv")) self.assertTrue(os.path.isfile(path / "logos/beta_kernel_3_2.csv")) self.assertTrue( os.path.isfile(path / "logos/fully_connected_layer_weights.csv")) self.assertTrue( os.path.isfile(path / "logos/fully_connected_layer_weights.html")) shutil.rmtree(path)
def test_overlap(self):
    report = MotifSeedRecovery.build_object(**{"implanted_motifs_per_label": {
        "l1": {"seeds": ["AAA", "A/AA"], "hamming_distance": False, "gap_sizes": [1]}}})
    report.label = Label("l1")

    self.assertEqual(report.identical_overlap(seed="AAA", feature="AAA"), 3)
    self.assertEqual(report.identical_overlap(seed="AAA", feature="AAx"), 0)
    self.assertEqual(report.identical_overlap(seed="AA/A", feature="AAxA"), 3)
    self.assertEqual(report.identical_overlap(seed="AA/A", feature="AAxx"), 0)

    self.assertEqual(report.hamming_overlap(seed="AAA", feature="AAA"), 3)
    self.assertEqual(report.hamming_overlap(seed="AAA", feature="AAx"), 2)
    self.assertEqual(report.hamming_overlap(seed="AAA", feature="xAx"), 1)
    self.assertEqual(report.hamming_overlap(seed="AA/A", feature="AAxA"), 3)
    self.assertEqual(report.hamming_overlap(seed="AA/A", feature="AAxx"), 2)

    self.assertEqual(report.max_overlap_sliding(seed="AAA", feature="xAAAx", overlap_fn=report.identical_overlap), 3)
    self.assertEqual(report.max_overlap_sliding(seed="AAA", feature="xAAxx", overlap_fn=report.identical_overlap), 0)
    self.assertEqual(report.max_overlap_sliding(seed="AAA", feature="AAxx", overlap_fn=report.identical_overlap), 2)
    self.assertEqual(report.max_overlap_sliding(seed="AA/A", feature="xAAxAx", overlap_fn=report.identical_overlap), 3)
    self.assertEqual(report.max_overlap_sliding(seed="AA/A", feature="xAAxxx", overlap_fn=report.identical_overlap), 1)

    self.assertEqual(report.max_overlap_sliding(seed="AAA", feature="xAAAx", overlap_fn=report.hamming_overlap), 3)
    self.assertEqual(report.max_overlap_sliding(seed="AAA", feature="xAAxx", overlap_fn=report.hamming_overlap), 2)
    self.assertEqual(report.max_overlap_sliding(seed="AAA", feature="xxAxx", overlap_fn=report.hamming_overlap), 1)
    self.assertEqual(report.max_overlap_sliding(seed="AA/A", feature="xAAxAx", overlap_fn=report.hamming_overlap), 3)
    self.assertEqual(report.max_overlap_sliding(seed="AA/A", feature="xAAxxx", overlap_fn=report.hamming_overlap), 2)

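# A minimal reference sketch (an assumption for illustration, not MotifSeedRecovery's
# actual implementation) of the overlap semantics the assertions above exercise:
# '/' marks a gap position in the seed that matches any character and never counts
# toward the score; identical_overlap is all-or-nothing over the aligned non-gap
# positions; max_overlap_sliding slides the seed across the feature, allowing
# partial overhangs at either edge and comparing only the overlapping region.
def hamming_overlap(seed: str, feature: str) -> int:
    # count position-wise matches, skipping gap positions in the seed
    return sum(1 for s, f in zip(seed, feature) if s != "/" and s == f)

def identical_overlap(seed: str, feature: str) -> int:
    # all non-gap positions must match, otherwise the overlap is 0
    matches = [s == f for s, f in zip(seed, feature) if s != "/"]
    return sum(matches) if all(matches) else 0

def max_overlap_sliding(seed: str, feature: str, overlap_fn) -> int:
    best = 0
    # shift = offset of seed[0] relative to feature[0]; negative and
    # past-the-end shifts produce the partial edge overhangs
    for shift in range(-(len(seed) - 1), len(feature)):
        s = seed[max(0, -shift):min(len(seed), len(feature) - shift)]
        f = feature[max(0, shift):min(len(feature), shift + len(seed))]
        best = max(best, overlap_fn(s, f))
    return best
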
def load(self, path: Path, details_path: Path = None):
    name = FilenameHandler.get_filename(self.__class__.__name__, "pt")
    file_path = path / name

    if file_path.is_file():
        self.model = torch.load(str(file_path))
        self.model.eval()
    else:
        raise FileNotFoundError(f"{self.__class__.__name__} model could not be loaded from {file_path}. "
                                f"Check if the path to the {name} file is properly set.")

    if details_path is None:
        params_path = path / FilenameHandler.get_filename(self.__class__.__name__, "yaml")
    else:
        params_path = details_path

    if params_path.is_file():
        with params_path.open("r") as file:
            desc = yaml.safe_load(file)
            if "label" in desc:
                setattr(self, "label", Label(**desc["label"]))
            for param in ["feature_names", "classes"]:
                if param in desc:
                    setattr(self, param, desc[param])

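# An illustrative details YAML that load() above would parse. Only the top-level
# keys ("label", "feature_names", "classes") are taken from the code; the keys
# under "label" are assumed to match Label's constructor parameters as used
# elsewhere in this section (name, values, positive_class), and all values are
# made up for the example:
#
#   label:
#     name: CMV
#     values: [true, false]
#     positive_class: true
#   feature_names: [kmer_AAA, kmer_AAC, kmer_ACA]
#   classes: [true, false]
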
def test_fit_by_cross_validation(self):
    x = EncodedData(sparse.csr_matrix(np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1],
                                                [1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]])),
                    labels={"t1": [1, 0, 2, 0, 1, 0, 2, 0], "t2": [1, 0, 2, 0, 1, 0, 2, 0]})

    rfc = RandomForestClassifier()
    rfc.fit_by_cross_validation(x, number_of_splits=2, label=Label("t2"))

def test_run(self): path = EnvironmentSettings.tmp_test_path / "mlapplicationtest/" PathBuilder.build(path) dataset = RandomDatasetGenerator.generate_repertoire_dataset(50, {5: 1}, {5: 1}, {"l1": {1: 0.5, 2: 0.5}}, path / 'dataset/') ml_method = LogisticRegression() encoder = KmerFreqRepertoireEncoder(NormalizationType.RELATIVE_FREQUENCY, ReadsType.UNIQUE, SequenceEncodingType.CONTINUOUS_KMER, 3, scale_to_zero_mean=True, scale_to_unit_variance=True) label_config = LabelConfiguration([Label("l1", [1, 2])]) enc_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config, filename="tmp_enc_dataset.pickle", pool_size=4)) ml_method.fit(enc_dataset.encoded_data, 'l1') hp_setting = HPSetting(encoder, {"normalization_type": "relative_frequency", "reads": "unique", "sequence_encoding": "continuous_kmer", "k": 3, "scale_to_zero_mean": True, "scale_to_unit_variance": True}, ml_method, {}, [], 'enc1', 'ml1') PathBuilder.build(path / 'result/instr1/') shutil.copy(path / 'dict_vectorizer.pickle', path / 'result/instr1/dict_vectorizer.pickle') shutil.copy(path / 'scaler.pickle', path / 'result/instr1/scaler.pickle') ml_app = MLApplicationInstruction(dataset, label_config, hp_setting, 4, "instr1", False) ml_app.run(path / 'result/') predictions_path = path / "result/instr1/predictions.csv" self.assertTrue(os.path.isfile(predictions_path)) df = pd.read_csv(predictions_path) self.assertEqual(50, df.shape[0]) shutil.rmtree(path)
def add_label(self, label_name: str, values: list = None, auxiliary_labels: list = None, positive_class=None):
    vals = list(values) if values else None

    if label_name in self._labels and self._labels[label_name] is not None and len(self._labels[label_name]) > 0:
        warnings.warn("Label " + label_name + " has already been set. Overriding existing values...", Warning)

    if positive_class is not None:
        if all(isinstance(val, str) for val in values) and not isinstance(positive_class, str):
            positive_class = str(positive_class)
        ParameterValidator.assert_in_valid_list(positive_class, values, Label.__name__, 'positive_class')
    else:
        positive_class = self._get_default_positive_class(values)
        if positive_class:
            logging.info(f"LabelConfiguration: set default positive class '{positive_class}' for label {label_name}")

    self._labels[label_name] = Label(label_name, vals, auxiliary_labels, positive_class)

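# A hypothetical usage sketch for add_label() above. The label names and values
# are made up; whether a positive class is inferred in the second call depends
# on what _get_default_positive_class returns for the given values:
config = LabelConfiguration()
config.add_label("epitope", values=["PA", "PB"], positive_class="PA")  # explicit positive class, validated against values
config.add_label("CMV", values=[True, False])                          # positive class inferred by default, if any
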
def test_fit(self):
    x = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]])
    y = {"test": np.array([1, 0, 2, 0])}

    knn = KNN()
    knn.fit(EncodedData(examples=sparse.csr_matrix(x), labels=y), Label("test"))

def _run_test(self, compairr_path):
    path = EnvironmentSettings.tmp_test_path / "compairr_distance_encoder/"
    PathBuilder.build(path)

    dataset = self.create_dataset(path)

    enc = CompAIRRDistanceEncoder.build_object(dataset, **{"compairr_path": compairr_path,
                                                           "keep_compairr_input": True,
                                                           "differences": 0, "indels": False,
                                                           "ignore_counts": False, "threads": 8,
                                                           "ignore_genes": False})
    enc.set_context({"dataset": dataset})
    encoded = enc.encode(dataset, EncoderParams(result_path=path,
                                                label_config=LabelConfiguration([Label("l1", [0, 1]), Label("l2", [2, 3])]),
                                                pool_size=4, filename="dataset.pkl"))

    self.assertEqual(8, encoded.encoded_data.examples.shape[0])
    self.assertEqual(8, encoded.encoded_data.examples.shape[1])
    self.assertEqual(0, encoded.encoded_data.examples[0, 0])
    self.assertEqual(0, encoded.encoded_data.examples[1, 1])
    self.assertEqual(0, encoded.encoded_data.examples[0, 4])
    self.assertTrue(np.array_equal([1, 0, 1, 0, 1, 0, 1, 0], encoded.encoded_data.labels["l1"]))
    self.assertTrue(np.array_equal([2, 3, 2, 3, 2, 3, 3, 3], encoded.encoded_data.labels["l2"]))

    shutil.rmtree(path)

def import_hp_setting(config_dir: Path) -> Tuple[HPSetting, Label]:
    config = MLMethodConfiguration()
    config.load(config_dir / 'ml_config.yaml')

    ml_method = ReflectionHandler.get_class_by_name(config.ml_method, 'ml_methods/')()
    ml_method.load(config_dir)

    encoder = MLImport.import_encoder(config, config_dir)
    preprocessing_sequence = MLImport.import_preprocessing_sequence(config, config_dir)

    labels = list(config.labels_with_values.keys())
    assert len(labels) == 1, "MLImport: Multiple labels set in a single ml_config file."
    label = Label(labels[0], config.labels_with_values[labels[0]])

    return HPSetting(encoder=encoder, encoder_params=config.encoding_parameters, encoder_name=config.encoding_name,
                     ml_method=ml_method, ml_method_name=config.ml_method_name, ml_params={},
                     preproc_sequence=preprocessing_sequence,
                     preproc_sequence_name=config.preprocessing_sequence_name), label

def test_encode(self): path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "atchley_kmer_encoding/") dataset = RandomDatasetGenerator.generate_repertoire_dataset( 3, {1: 1}, {4: 1}, {"l1": { True: 0.4, False: 0.6 }}, path / "dataset") encoder = AtchleyKmerEncoder.build_object( dataset, **{ "k": 2, "skip_first_n_aa": 1, "skip_last_n_aa": 1, "abundance": "RELATIVE_ABUNDANCE", "normalize_all_features": False }) encoded_dataset = encoder.encode( dataset, EncoderParams(path / "result", LabelConfiguration(labels=[Label("l1")]))) self.assertEqual((3, 11, 3), encoded_dataset.encoded_data.examples.shape) self.assertEqual(0., encoded_dataset.encoded_data.examples[0, -1, 0]) shutil.rmtree(path)
def _create_dummy_lr_model(self):
    dummy_lr = LogisticRegression()
    encoded_tr = EncodedData(np.random.rand(100, 20), {"l1": [i % 2 for i in range(0, 100)]})
    dummy_lr.fit_by_cross_validation(encoded_tr, number_of_splits=2, label=Label("l1", values=[0, 1]))
    return dummy_lr, encoded_tr

def train_classifier(self):
    classifier = ProbabilisticBinaryClassifier(100, 0.1)

    X = np.array([[3, 4], [1, 7], [5, 7], [3, 8]])
    y = {"cmv": [True, False, True, False]}

    classifier.fit(EncodedData(X, y), Label("cmv"))
    return classifier

def test_run(self):
    path = EnvironmentSettings.root_path / "test/tmp/mlmethodassessment/"
    PathBuilder.build(path)

    dataset = RepertoireDataset(repertoires=RepertoireBuilder.build([["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"],
                                                                     ["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"]],
                                                                    path)[0])
    dataset.encoded_data = EncodedData(
        examples=np.array([[1, 1], [1, 1], [3, 3], [1, 1], [1, 1], [3, 3],
                           [1, 1], [1, 1], [3, 3], [1, 1], [1, 1], [3, 3]]),
        labels={"l1": [1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3],
                "l2": [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]})

    label_config = LabelConfiguration()
    label_config.add_label("l1", [1, 3])

    # the label values must match the classes present in the encoded data (1 and 3)
    label = Label(name='l1', values=[1, 3])

    method1 = LogisticRegression()
    method1.fit(dataset.encoded_data, label=label)

    res = MLMethodAssessment.run(MLMethodAssessmentParams(
        dataset=dataset,
        method=method1,
        metrics={Metric.ACCURACY, Metric.BALANCED_ACCURACY, Metric.F1_MACRO},
        optimization_metric=Metric.LOG_LOSS,
        predictions_path=path / "predictions.csv",
        label=label,
        ml_score_path=path / "ml_score.csv",
        split_index=1,
        path=path))

    self.assertTrue(isinstance(res, dict))
    self.assertTrue(res[Metric.LOG_LOSS.name.lower()] <= 0.1)

    self.assertTrue(os.path.isfile(path / "ml_score.csv"))
    df = pd.read_csv(path / "ml_score.csv")
    self.assertTrue(df.shape[0] == 1)

    df = pd.read_csv(path / "predictions.csv")
    self.assertEqual(12, df.shape[0])

    shutil.rmtree(path)

def _create_report(self, path):
    report = TrainingPerformance.build_object(name='testcase')
    report.train_dataset = Dataset()
    report.method, report.train_dataset.encoded_data = self._create_dummy_lr_model()
    report.label = Label("l1", values=[0, 1])
    report.result_path = path
    return report

def test_fit_by_cross_validation(self): x = EncodedData( np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1], [1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]]), { "t1": [1, 0, 2, 0, 1, 0, 2, 0], "t2": [1, 0, 2, 0, 1, 0, 2, 0] }) svm = SVC(parameter_grid={"penalty": ["l1"], "dual": [False]}) svm.fit_by_cross_validation(x, number_of_splits=2, label=Label("t1"))
def test_fit_by_cross_validation(self): x = EncodedData( np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1], [1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]]), { "t1": [1, 0, 2, 0, 1, 0, 2, 0], "t2": [1, 0, 2, 0, 1, 0, 2, 0] }) svm = SVM() svm.fit_by_cross_validation(x, number_of_splits=2, label=Label("t1"))
def _create_state_object(self, path):
    repertoires, metadata = RepertoireBuilder.build(sequences=[["AAA", "CCC", "DDD"] for _ in range(34)],
                                                    path=path,
                                                    labels={"l1": [1, 2] * 17,
                                                            "l2": [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0,
                                                                   1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]})

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                labels={"l1": [1, 2], "l2": [0, 1]})

    enc_params = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
    hp_settings = [HPSetting(Word2VecEncoder.build_object(dataset, **enc_params), enc_params,
                             LogisticRegression(),
                             {"model_selection_cv": False, "model_selection_n_folds": -1}, [])]

    label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])

    process = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                      SplitConfig(SplitType.RANDOM, 1, 0.7), SplitConfig(SplitType.RANDOM, 1, 0.7),
                                      {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, label_config, path)
    state = process.run(result_path=path)

    return state

def test_fit_by_cross_validation(self): x = EncodedData( np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1], [1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]]), { "test1": [1, 0, 2, 0, 1, 0, 2, 0], "test2": [1, 0, 2, 0, 1, 0, 2, 0] }) lr = LogisticRegression() lr.fit_by_cross_validation(x, number_of_splits=2, label=Label("test2"))
def test_fit(self):
    classifier = self.train_classifier()

    predictions = classifier.predict(EncodedData(np.array([[6, 7], [1, 6]])), Label("cmv"))
    proba_predictions = classifier.predict_proba(EncodedData(np.array([[6, 7], [1, 6]])), Label("cmv"))

    self.assertEqual([True, False], predictions["cmv"])
    self.assertTrue(proba_predictions["cmv"][0, 1] > proba_predictions["cmv"][0, 0])
    self.assertTrue(proba_predictions["cmv"][1, 0] > proba_predictions["cmv"][1, 1])
    self.assertTrue((proba_predictions["cmv"] <= 1.0).all() and (proba_predictions["cmv"] >= 0.0).all())

def encode_dataset_by_kmer_freq(path_to_dataset_directory: str, result_path: str, metadata_path: str = None):
    """
    Encodes the repertoire dataset using KmerFrequencyEncoder.

    Arguments:
        path_to_dataset_directory (str): path to the directory containing all repertoire files with .tsv extension in MiXCR format
        result_path (str): where to store the results
        metadata_path (str): csv file with columns "filename", "subject_id", "disease"; generated with default values if None,
            otherwise any metadata csv file can be passed, as long as it includes the filename and subject_id columns
            and an arbitrary disease column

    Returns:
        encoded dataset with encoded data in encoded_dataset.encoded_data.examples
    """
    path_to_dataset_directory = Path(path_to_dataset_directory)
    result_path = Path(result_path)

    if metadata_path is None:
        metadata_path = generate_random_metadata(path_to_dataset_directory, result_path)
    else:
        metadata_path = Path(metadata_path)

    loader = MiXCRImport()
    dataset = loader.import_dataset({
        "is_repertoire": True,
        "path": path_to_dataset_directory,
        "metadata_file": metadata_path,
        "region_type": "IMGT_CDR3",  # import only the CDR3 region
        "number_of_processes": 4,  # number of parallel processes for loading the data
        "result_path": result_path,
        "separator": "\t",
        "columns_to_load": ["cloneCount", "allVHitsWithScore", "allJHitsWithScore", "aaSeqCDR3", "nSeqCDR3"],
        "column_mapping": {
            "cloneCount": "counts",
            "allVHitsWithScore": "v_alleles",
            "allJHitsWithScore": "j_alleles"
        },
    }, "mixcr_dataset")

    label_name = list(dataset.labels.keys())[0]  # label usable for ML prediction - by default: "disease" with values True/False

    encoded_dataset = DataEncoder.run(DataEncoderParams(dataset, KmerFrequencyEncoder.build_object(dataset, **{
        "normalization_type": "relative_frequency",  # encode repertoire by the relative frequency of k-mers in the repertoire
        "reads": "unique",  # count each sequence only once, do not use clonal count
        "k": 2,  # k-mer length
        "sequence_type": "amino_acid",
        "sequence_encoding": "continuous_kmer"  # split each sequence in the repertoire into overlapping k-mers
    }), EncoderParams(result_path=result_path,
                      label_config=LabelConfiguration([Label(label_name, dataset.labels[label_name])]))))

    dataset_exporter = DesignMatrixExporter(dataset=encoded_dataset, result_path=result_path / "csv_exported",
                                            file_format='csv')
    dataset_exporter.generate_report()

    return encoded_dataset

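# A hypothetical invocation of the helper above; both paths are placeholders
# and must point at a real directory of MiXCR .tsv exports and a writable
# result location:
encoded_dataset = encode_dataset_by_kmer_freq(path_to_dataset_directory="./mixcr_repertoires/",
                                              result_path="./kmer_encoding_results/")
print(encoded_dataset.encoded_data.examples.shape)  # (number of repertoires, number of k-mer features)
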
def test_fit_by_cross_validation(self):
    x = EncodedData(np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1],
                              [1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]]),
                    labels={"test1": [1, 0, 2, 0, 1, 0, 2, 0], "test2": [1, 0, 2, 0, 1, 0, 2, 0]})

    knn = KNN(parameters={"n_neighbors": 2})
    knn.fit_by_cross_validation(x, number_of_splits=2, label=Label("test1"))

def add_label(self, label: str, values: list = None, auxiliary_labels: list = None, positive_class=None):
    vals = list(values) if values else None

    if label in self._labels and self._labels[label] is not None and len(self._labels[label]) > 0:
        warnings.warn("Label " + label + " has already been set. Overriding existing values...", Warning)

    if positive_class is not None:
        if all(isinstance(val, str) for val in values) and not isinstance(positive_class, str):
            positive_class = str(positive_class)
        ParameterValidator.assert_in_valid_list(positive_class, values, Label.__name__, 'positive_class')

    self._labels[label] = Label(label, vals, auxiliary_labels, positive_class)

def _create_report(self, path):
    report = ROCCurve.build_object(name='testcase')
    report.method = self._create_dummy_lr_model()
    report.label = Label("l1")
    report.result_path = path
    report.test_dataset = Dataset()

    encoded_te = EncodedData(np.random.rand(100, 20), {"l1": [i % 2 for i in range(0, 100)]})
    report.test_dataset.encoded_data = encoded_te

    return report