Beispiel #1
0
    def _create_report(self, path):
        """Build a ConfounderAnalysis report wired up with encoded data and a fitted dummy model.

        Args:
            path: directory used for the generated datasets, the encodings and the report output.

        Returns:
            The configured ConfounderAnalysis report object.
        """
        encoder_params = {
            "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
            "reads": ReadsType.UNIQUE.name,
            "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
            "k": 3,
            'sequence_type': SequenceType.AMINO_ACID.name,
        }

        report = ConfounderAnalysis.build_object(metadata_labels=["age", "HLA"], name='test')
        report.ml_details_path = path / "ml_details.yaml"
        report.label = Label("disease")
        report.result_path = path

        encoder = KmerFrequencyEncoder.build_object(RepertoireDataset(), **encoder_params)

        # The training data is encoded first; the test data is encoded with learn_model=False.
        train_dataset = self._make_dataset(path / "train", size=100)
        report.train_dataset = self._encode_dataset(encoder, train_dataset, path)

        test_dataset = self._make_dataset(path / "test", size=40)
        report.test_dataset = self._encode_dataset(encoder, test_dataset, path, learn_model=False)

        report.method = self._create_dummy_lr_model(path, report.train_dataset.encoded_data, Label("disease"))

        return report
    def test_fit(self):
        """Fit TCRdistClassifier, then check predictions on the training data and on one replaced example."""
        label_name = "test"
        _, y, encoded_data = self._prepare_data()

        classifier = TCRdistClassifier(percentage=0.75)
        classifier.fit(encoded_data, Label(label_name), cores_for_training=4)

        # Predictions on the data the classifier was fitted on must match the known labels.
        fitted_predictions = classifier.predict(encoded_data, Label(label_name))
        self.assertTrue(np.array_equal(y[label_name], fitted_predictions[label_name]))

        # Replace the examples with a single row and expect class 0 for it.
        encoded_data.examples = np.array([[1.1, 0.1, 0.9, 1.9]])
        single_prediction = classifier.predict(encoded_data, Label(label_name))
        self.assertTrue(np.array_equal([0], single_prediction[label_name]))
    def test_generate(self):
        """Import a paired-chain receptor dataset, encode it with TCRdistEncoder and run TCRdistMotifDiscovery."""
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "tcrdist_motif_discovery/")
        dataset_path = self._create_dataset(path)

        columns_to_load = [
            "subject", "epitope", "count", "v_a_gene", "j_a_gene",
            "cdr3_a_aa", "v_b_gene", "j_b_gene", "cdr3_b_aa",
            "clone_id", "cdr3_a_nucseq", "cdr3_b_nucseq",
        ]
        column_mapping = {
            "cdr3_a_aa": "alpha_amino_acid_sequence",
            "cdr3_b_aa": "beta_amino_acid_sequence",
            "cdr3_a_nucseq": "alpha_nucleotide_sequence",
            "cdr3_b_nucseq": "beta_nucleotide_sequence",
            "v_a_gene": "alpha_v_gene",
            "v_b_gene": "beta_v_gene",
            "j_a_gene": "alpha_j_gene",
            "j_b_gene": "beta_j_gene",
            "clone_id": "identifier",
        }
        import_params = {
            "path": dataset_path,
            "result_path": path / "dataset/",
            "separator": ",",
            "columns_to_load": columns_to_load,
            "column_mapping": column_mapping,
            "receptor_chains": "TRA_TRB",
            "region_type": "IMGT_CDR3",
            "sequence_file_size": 50000,
            "organism": "mouse",
        }
        imported_dataset = SingleLineReceptorImport.import_dataset(import_params, 'd1')

        encoder = TCRdistEncoder(8)
        encoding_params = EncoderParams(path / "result", LabelConfiguration([Label("epitope")]))
        encoded_dataset = encoder.encode(imported_dataset, encoding_params)

        # The same encoded dataset serves as both the train and the test dataset of the report.
        report = TCRdistMotifDiscovery(
            train_dataset=encoded_dataset,
            test_dataset=encoded_dataset,
            result_path=path / "report",
            name="report name",
            cores=8,
            positive_class_name="PA",
            min_cluster_size=3,
        )
        report.label = Label("epitope")
        report._generate()

        shutil.rmtree(path)
Beispiel #4
0
    def test_predict(self):
        """Fit KNN on sparse examples for label "test2" and check the predictions for two new rows."""
        target = "test2"
        train_matrix = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]])
        train_labels = {"test1": [1, 0, 2, 0], target: [1, 0, 2, 0]}

        knn = KNN(parameters={"n_neighbors": 2})
        train_data = EncodedData(sparse.csr_matrix(train_matrix), labels=train_labels)
        knn.fit(train_data, Label(target))

        unseen_matrix = np.array([[0, 1, 0], [1, 0, 0]])
        predictions = knn.predict(EncodedData(sparse.csr_matrix(unseen_matrix)), Label(target))

        # One prediction per unseen row, and the predicted class must be one of the training classes.
        self.assertTrue(len(predictions[target]) == 2)
        self.assertTrue(predictions[target][1] in [0, 1, 2])
Beispiel #5
0
    def test_predict(self):
        """Fit LogisticRegression for label "test2" and check the predictions for two new rows."""
        target = "test2"
        train_examples = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]])
        train_labels = {"test1": [1, 0, 2, 0], target: [1, 0, 2, 0]}

        lr = LogisticRegression()
        lr.fit(EncodedData(train_examples, train_labels), Label(target))

        unseen_examples = np.array([[0, 1, 0], [1, 0, 0]])
        predictions = lr.predict(EncodedData(unseen_examples), Label(target))

        # One prediction per unseen row, and the predicted class must be one of the training classes.
        self.assertTrue(len(predictions[target]) == 2)
        self.assertTrue(predictions[target][1] in [0, 1, 2])
    def test_predict(self):
        """Fit RandomForestClassifier for label "default" and check both predictions are known classes."""
        target = "default"
        known_classes = [0, 1, 2]

        train_examples = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]])
        train_labels = {target: np.array([1, 0, 2, 0])}

        rfc = RandomForestClassifier()
        rfc.fit(EncodedData(train_examples, train_labels), Label(target))

        unseen_examples = np.array([[0, 1, 0], [1, 0, 0]])
        predicted = rfc.predict(EncodedData(unseen_examples), Label(target))[target]

        self.assertTrue(len(predicted) == 2)
        for position in (0, 1):
            self.assertTrue(predicted[position] in known_classes)
Beispiel #7
0
    def test_predict(self):
        """Fit SVM for label "test" and check both predictions are known classes."""
        target = "test"
        known_classes = [0, 1, 2]

        train_examples = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]])
        train_labels = {target: np.array([1, 0, 2, 0])}

        svm = SVM()
        svm.fit(EncodedData(train_examples, train_labels), Label(target))

        unseen_examples = np.array([[0, 1, 0], [1, 0, 0]])
        predicted = svm.predict(EncodedData(unseen_examples), Label(target))["test"]

        self.assertTrue(len(predicted) == 2)
        for position in (0, 1):
            self.assertTrue(predicted[position] in known_classes)
    def test_generate(self):
        """Train a ReceptorCNN on a random receptor dataset and check the files KernelSequenceLogo writes."""
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "kernel_sequence_logo/")

        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            receptor_count=500,
            chain_1_length_probabilities={4: 1},
            chain_2_length_probabilities={4: 1},
            labels={"CMV": {True: 0.5, False: 0.5}},
            path=path / "dataset")

        encoder = OneHotReceptorEncoder(True, 1, False, "enc1")
        label_config = LabelConfiguration([Label("CMV", [True, False])])
        enc_dataset = encoder.encode(dataset, EncoderParams(path / "result", label_config))

        cnn_settings = {
            "kernel_count": 2,
            "kernel_size": [3],
            "positional_channels": 3,
            "sequence_type": "amino_acid",
            "device": "cpu",
            "number_of_threads": 4,
            "random_seed": 1,
            "learning_rate": 0.01,
            "iteration_count": 10,
            "l1_weight_decay": 0.1,
            "evaluate_at": 5,
            "batch_size": 100,
            "training_percentage": 0.8,
            "l2_weight_decay": 0.0,
        }
        cnn = ReceptorCNN(**cnn_settings)
        cnn.fit(enc_dataset.encoded_data, Label("CMV"))

        report = KernelSequenceLogo(method=cnn, result_path=path / "logos/")
        report.generate_report()

        # Files expected in the result folder, checked in this order.
        expected_files = (
            "logos/alpha_kernel_3_1.png",
            "logos/alpha_kernel_3_2.png",
            "logos/beta_kernel_3_1.png",
            "logos/beta_kernel_3_2.png",
            "logos/alpha_kernel_3_1.csv",
            "logos/alpha_kernel_3_2.csv",
            "logos/beta_kernel_3_1.csv",
            "logos/beta_kernel_3_2.csv",
            "logos/fully_connected_layer_weights.csv",
            "logos/fully_connected_layer_weights.html",
        )
        for relative_name in expected_files:
            self.assertTrue(os.path.isfile(path / relative_name))

        shutil.rmtree(path)
    def test_overlap(self):
        """Check identical_overlap, hamming_overlap and max_overlap_sliding of MotifSeedRecovery on fixed inputs."""
        motif_spec = {"l1": {"seeds": ["AAA", "A/AA"],
                             "hamming_distance": False,
                             "gap_sizes": [1]}}
        report = MotifSeedRecovery.build_object(**{"implanted_motifs_per_label": motif_spec})
        report.label = Label("l1")

        # (overlap function, seed, feature, expected overlap) for direct seed/feature comparison
        direct_cases = [
            (report.identical_overlap, "AAA", "AAA", 3),
            (report.identical_overlap, "AAA", "AAx", 0),
            (report.identical_overlap, "AA/A", "AAxA", 3),
            (report.identical_overlap, "AA/A", "AAxx", 0),
            (report.hamming_overlap, "AAA", "AAA", 3),
            (report.hamming_overlap, "AAA", "AAx", 2),
            (report.hamming_overlap, "AAA", "xAx", 1),
            (report.hamming_overlap, "AA/A", "AAxA", 3),
            (report.hamming_overlap, "AA/A", "AAxx", 2),
        ]
        for overlap_fn, seed, feature, expected in direct_cases:
            self.assertEqual(overlap_fn(seed=seed, feature=feature), expected)

        # (overlap function, seed, feature, expected overlap) for max_overlap_sliding
        sliding_cases = [
            (report.identical_overlap, "AAA", "xAAAx", 3),
            (report.identical_overlap, "AAA", "xAAxx", 0),
            (report.identical_overlap, "AAA", "AAxx", 2),
            (report.identical_overlap, "AA/A", "xAAxAx", 3),
            (report.identical_overlap, "AA/A", "xAAxxx", 1),
            (report.hamming_overlap, "AAA", "xAAAx", 3),
            (report.hamming_overlap, "AAA", "xAAxx", 2),
            (report.hamming_overlap, "AAA", "xxAxx", 1),
            (report.hamming_overlap, "AA/A", "xAAxAx", 3),
            (report.hamming_overlap, "AA/A", "xAAxxx", 2),
        ]
        for overlap_fn, seed, feature, expected in sliding_cases:
            self.assertEqual(report.max_overlap_sliding(seed=seed, feature=feature, overlap_fn=overlap_fn), expected)
Beispiel #10
0
    def load(self, path: Path, details_path: Path = None):
        """Restore the stored torch model and, if available, the label, feature names and classes.

        Args:
            path: directory containing the ``.pt`` model file, and the ``.yaml`` details file
                when ``details_path`` is not given.
            details_path: optional explicit path to the YAML details file.

        Raises:
            FileNotFoundError: if the model file does not exist under ``path``.
        """
        name = FilenameHandler.get_filename(self.__class__.__name__, "pt")
        file_path = path / name
        if not file_path.is_file():
            raise FileNotFoundError(
                f"{self.__class__.__name__} model could not be loaded from {file_path}. "
                f"Check if the path to the {name} file is properly set.")

        # NOTE(review): torch.load unpickles the file; only load model files from trusted sources.
        self.model = torch.load(str(file_path))
        self.model.eval()

        if details_path is None:
            params_path = path / FilenameHandler.get_filename(self.__class__.__name__, "yaml")
        else:
            params_path = details_path

        # The details file is optional: without it only the model itself is restored.
        if not params_path.is_file():
            return

        with params_path.open("r") as file:
            # safe_load returns None for an empty document; treat that as "no details stored"
            # instead of failing on the membership tests below.
            desc = yaml.safe_load(file) or {}

        if "label" in desc:
            self.label = Label(**desc["label"])
        for param in ("feature_names", "classes"):
            if param in desc:
                setattr(self, param, desc[param])
    def test_fit_by_cross_validation(self):
        """Run RandomForestClassifier.fit_by_cross_validation on sparse examples for label "t2"."""
        examples = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1],
                             [1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]])
        labels = {"t1": [1, 0, 2, 0, 1, 0, 2, 0],
                  "t2": [1, 0, 2, 0, 1, 0, 2, 0]}
        encoded = EncodedData(sparse.csr_matrix(examples), labels=labels)

        rfc = RandomForestClassifier()
        rfc.fit_by_cross_validation(encoded, number_of_splits=2, label=Label("t2"))
    def test_run(self):
        """Fit an encoder and a model, then apply them with MLApplicationInstruction and check the predictions file."""
        path = EnvironmentSettings.tmp_test_path / "mlapplicationtest/"
        PathBuilder.build(path)

        dataset = RandomDatasetGenerator.generate_repertoire_dataset(50, {5: 1}, {5: 1}, {"l1": {1: 0.5, 2: 0.5}},
                                                                     path / 'dataset/')
        ml_method = LogisticRegression()
        encoder = KmerFreqRepertoireEncoder(NormalizationType.RELATIVE_FREQUENCY, ReadsType.UNIQUE,
                                            SequenceEncodingType.CONTINUOUS_KMER, 3,
                                            scale_to_zero_mean=True, scale_to_unit_variance=True)
        label_config = LabelConfiguration([Label("l1", [1, 2])])

        encoding_params = EncoderParams(result_path=path, label_config=label_config,
                                        filename="tmp_enc_dataset.pickle", pool_size=4)
        enc_dataset = encoder.encode(dataset, encoding_params)
        ml_method.fit(enc_dataset.encoded_data, 'l1')

        encoder_settings = {"normalization_type": "relative_frequency", "reads": "unique",
                            "sequence_encoding": "continuous_kmer", "k": 3,
                            "scale_to_zero_mean": True, "scale_to_unit_variance": True}
        hp_setting = HPSetting(encoder, encoder_settings, ml_method, {}, [], 'enc1', 'ml1')

        # Copy the vectorizer and scaler pickles from the encoding step into the instruction's result folder.
        PathBuilder.build(path / 'result/instr1/')
        pickles_to_copy = (('dict_vectorizer.pickle', 'result/instr1/dict_vectorizer.pickle'),
                           ('scaler.pickle', 'result/instr1/scaler.pickle'))
        for source_name, target_name in pickles_to_copy:
            shutil.copy(path / source_name, path / target_name)

        ml_app = MLApplicationInstruction(dataset, label_config, hp_setting, 4, "instr1", False)
        ml_app.run(path / 'result/')

        predictions_path = path / "result/instr1/predictions.csv"
        self.assertTrue(os.path.isfile(predictions_path))

        # One prediction row per repertoire in the dataset.
        predictions = pd.read_csv(predictions_path)
        self.assertEqual(50, predictions.shape[0])

        shutil.rmtree(path)
Beispiel #13
0
    def add_label(self,
                  label_name: str,
                  values: list = None,
                  auxiliary_labels: list = None,
                  positive_class=None):
        """Register a label under ``label_name``, replacing any label already stored under that name.

        Args:
            label_name: name the label is stored under.
            values: possible values of the label; a copy is stored, or None if empty or not given.
            auxiliary_labels: passed through unchanged to the created Label.
            positive_class: positive class of the label. If given, it is validated against ``values``
                (after conversion to str when all values are strings); otherwise a default is
                requested from ``_get_default_positive_class``.
        """
        vals = list(values) if values else None

        previous = self._labels[label_name] if label_name in self._labels else None
        if previous is not None and len(previous) > 0:
            warnings.warn("Label " + label_name + " has already been set. Overriding existing values...", Warning)

        if positive_class is None:
            positive_class = self._get_default_positive_class(values)
            if positive_class:
                logging.info(f"LabelConfiguration: set default positive class '{positive_class}' for label {label_name}")
        else:
            # Align the type of the positive class with the values before checking membership.
            only_string_values = all(isinstance(val, str) for val in values)
            if only_string_values and not isinstance(positive_class, str):
                positive_class = str(positive_class)
            ParameterValidator.assert_in_valid_list(positive_class, values, Label.__name__, 'positive_class')

        self._labels[label_name] = Label(label_name, vals, auxiliary_labels, positive_class)
Beispiel #14
0
    def test_fit(self):
        """Fit KNN with default parameters on sparse examples for label "test"."""
        examples = sparse.csr_matrix(np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]]))
        labels = {"test": np.array([1, 0, 2, 0])}

        knn = KNN()
        knn.fit(EncodedData(examples=examples, labels=labels), Label("test"))
    def _run_test(self, compairr_path):
        """Encode a test dataset with CompAIRRDistanceEncoder and check the distance matrix and labels.

        Args:
            compairr_path: value passed to the encoder as "compairr_path".
        """
        path = EnvironmentSettings.tmp_test_path / "compairr_distance_encoder/"
        PathBuilder.build(path)

        dataset = self.create_dataset(path)

        encoder_settings = {
            "compairr_path": compairr_path,
            "keep_compairr_input": True,
            "differences": 0,
            "indels": False,
            "ignore_counts": False,
            "threads": 8,
            "ignore_genes": False,
        }
        enc = CompAIRRDistanceEncoder.build_object(dataset, **encoder_settings)
        enc.set_context({"dataset": dataset})

        label_config = LabelConfiguration([Label("l1", [0, 1]), Label("l2", [2, 3])])
        encoded = enc.encode(dataset, EncoderParams(result_path=path, label_config=label_config,
                                                    pool_size=4, filename="dataset.pkl"))

        encoded_data = encoded.encoded_data

        # The encoded examples form an 8 x 8 matrix.
        self.assertEqual(8, encoded_data.examples.shape[0])
        self.assertEqual(8, encoded_data.examples.shape[1])

        for row, column in ((0, 0), (1, 1), (0, 4)):
            self.assertEqual(0, encoded_data.examples[row, column])

        self.assertTrue(np.array_equal([1, 0, 1, 0, 1, 0, 1, 0], encoded_data.labels["l1"]))
        self.assertTrue(np.array_equal([2, 3, 2, 3, 2, 3, 3, 3], encoded_data.labels["l2"]))

        shutil.rmtree(path)
Beispiel #16
0
    def import_hp_setting(config_dir: Path) -> Tuple[HPSetting, Label]:
        """Rebuild the hyperparameter setting and the label stored in an exported ML config directory.

        Loads ``ml_config.yaml`` from ``config_dir``, instantiates the ML method class named in it and
        lets it load its state from the same directory, then imports the encoder and the
        preprocessing sequence.

        Args:
            config_dir: directory containing ``ml_config.yaml`` and the files the ML method, encoder
                and preprocessing sequence are imported from.

        Returns:
            A tuple of the assembled HPSetting and the single Label described in the config.

        Raises:
            AssertionError: if the config does not describe exactly one label.
        """
        config = MLMethodConfiguration()
        config.load(config_dir / 'ml_config.yaml')

        ml_method = ReflectionHandler.get_class_by_name(config.ml_method, 'ml_methods/')()
        ml_method.load(config_dir)

        encoder = MLImport.import_encoder(config, config_dir)
        preprocessing_sequence = MLImport.import_preprocessing_sequence(config, config_dir)

        labels = list(config.labels_with_values.keys())
        # Raised explicitly instead of via `assert` so the check still runs under `python -O`;
        # AssertionError is kept so callers see the same exception type as before.
        if len(labels) != 1:
            raise AssertionError("MLImport: Multiple labels set in a single ml_config file.")

        label = Label(labels[0], config.labels_with_values[labels[0]])

        return HPSetting(
            encoder=encoder,
            encoder_params=config.encoding_parameters,
            encoder_name=config.encoding_name,
            ml_method=ml_method,
            ml_method_name=config.ml_method_name,
            ml_params={},
            preproc_sequence=preprocessing_sequence,
            preproc_sequence_name=config.preprocessing_sequence_name), label
    def test_encode(self):
        """Encode a random repertoire dataset with AtchleyKmerEncoder and check the shape of the result."""
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "atchley_kmer_encoding/")

        label_distribution = {"l1": {True: 0.4, False: 0.6}}
        dataset = RandomDatasetGenerator.generate_repertoire_dataset(3, {1: 1}, {4: 1}, label_distribution,
                                                                     path / "dataset")

        encoder_settings = {
            "k": 2,
            "skip_first_n_aa": 1,
            "skip_last_n_aa": 1,
            "abundance": "RELATIVE_ABUNDANCE",
            "normalize_all_features": False,
        }
        encoder = AtchleyKmerEncoder.build_object(dataset, **encoder_settings)

        encoding_params = EncoderParams(path / "result", LabelConfiguration(labels=[Label("l1")]))
        encoded_dataset = encoder.encode(dataset, encoding_params)

        examples = encoded_dataset.encoded_data.examples
        self.assertEqual((3, 11, 3), examples.shape)
        self.assertEqual(0., examples[0, -1, 0])

        shutil.rmtree(path)
Beispiel #18
0
    def _create_dummy_lr_model(self):
        """Fit a LogisticRegression by cross-validation on random data with an alternating 0/1 label.

        Returns:
            A tuple of the fitted model and the EncodedData it was trained on.
        """
        alternating_classes = [index % 2 for index in range(100)]
        encoded_tr = EncodedData(np.random.rand(100, 20), {"l1": alternating_classes})

        dummy_lr = LogisticRegression()
        dummy_lr.fit_by_cross_validation(encoded_tr, number_of_splits=2, label=Label("l1", values=[0, 1]))

        return dummy_lr, encoded_tr
    def train_classifier(self):
        """Fit a ProbabilisticBinaryClassifier on four two-column examples labelled by "cmv".

        Returns:
            The fitted classifier.
        """
        examples = np.array([[3, 4], [1, 7], [5, 7], [3, 8]])
        labels = {"cmv": [True, False, True, False]}

        classifier = ProbabilisticBinaryClassifier(100, 0.1)
        classifier.fit(EncodedData(examples, labels), Label("cmv"))
        return classifier
    def test_run(self):
        """Run MLMethodAssessment on a fitted LogisticRegression and check the returned scores and written files."""
        path = EnvironmentSettings.root_path / "test/tmp/mlmethodassessment/"
        PathBuilder.build(path)

        # Result files of the assessment, each path bound once and reused below.
        predictions_path = EnvironmentSettings.root_path / "test/tmp/mlmethodassessment/predictions.csv"
        ml_score_path = EnvironmentSettings.root_path / "test/tmp/mlmethodassessment/ml_score.csv"

        sequences = [["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"],
                     ["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"]]
        dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(sequences, path)[0])
        dataset.encoded_data = EncodedData(
            examples=np.array([[1, 1], [1, 1], [3, 3], [1, 1], [1, 1], [3, 3],
                               [1, 1], [1, 1], [3, 3], [1, 1], [1, 1], [3, 3]]),
            labels={
                "l1": [1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3],
                "l2": [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]
            })

        label_config = LabelConfiguration()
        label_config.add_label("l1", [1, 3])

        label = Label(name='l1', values=[1, 2])

        method1 = LogisticRegression()
        method1.fit(dataset.encoded_data, label=label)

        assessment_params = MLMethodAssessmentParams(
            dataset=dataset,
            method=method1,
            metrics={Metric.ACCURACY, Metric.BALANCED_ACCURACY, Metric.F1_MACRO},
            optimization_metric=Metric.LOG_LOSS,
            predictions_path=predictions_path,
            label=label,
            ml_score_path=ml_score_path,
            split_index=1,
            path=path)
        res = MLMethodAssessment.run(assessment_params)

        self.assertTrue(isinstance(res, dict))
        self.assertTrue(res[Metric.LOG_LOSS.name.lower()] <= 0.1)

        self.assertTrue(os.path.isfile(ml_score_path))

        scores = pd.read_csv(ml_score_path)
        self.assertTrue(scores.shape[0] == 1)

        # One prediction row per example in the dataset.
        predictions = pd.read_csv(predictions_path)
        self.assertEqual(12, predictions.shape[0])

        shutil.rmtree(path)
Beispiel #21
0
    def _create_report(self, path):
        """Build a TrainingPerformance report backed by the dummy model and its training data.

        Args:
            path: directory set as the report's result path.

        Returns:
            The configured TrainingPerformance report object.
        """
        report = TrainingPerformance.build_object(name='testcase')
        report.train_dataset = Dataset()

        # The helper returns the fitted model together with the data it was trained on.
        fitted_model, training_data = self._create_dummy_lr_model()
        report.method = fitted_model
        report.train_dataset.encoded_data = training_data

        report.label = Label("l1", values=[0, 1])
        report.result_path = path
        return report
Beispiel #22
0
    def test_fit_by_cross_validation(self):
        """Run SVC.fit_by_cross_validation with an l1-penalty parameter grid for label "t1"."""
        examples = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1],
                             [1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]])
        labels = {"t1": [1, 0, 2, 0, 1, 0, 2, 0],
                  "t2": [1, 0, 2, 0, 1, 0, 2, 0]}
        encoded = EncodedData(examples, labels)

        svm = SVC(parameter_grid={"penalty": ["l1"], "dual": [False]})
        svm.fit_by_cross_validation(encoded, number_of_splits=2, label=Label("t1"))
Beispiel #23
0
    def test_fit_by_cross_validation(self):
        """Run SVM.fit_by_cross_validation with default parameters for label "t1"."""
        examples = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1],
                             [1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]])
        labels = {"t1": [1, 0, 2, 0, 1, 0, 2, 0],
                  "t2": [1, 0, 2, 0, 1, 0, 2, 0]}
        encoded = EncodedData(examples, labels)

        svm = SVM()
        svm.fit_by_cross_validation(encoded, number_of_splits=2, label=Label("t1"))
    def _create_state_object(self, path):
        """Run a TrainMLModelInstruction on 34 identical repertoires and return the resulting state.

        Args:
            path: directory used for building the repertoires and as the result path of the instruction.

        Returns:
            The state object returned by ``TrainMLModelInstruction.run``.
        """
        repertoire_count = 34

        # Every repertoire gets its own list holding the same three sequences.
        sequences = [["AAA", "CCC", "DDD"] for _ in range(repertoire_count)]
        label_values = {
            # l1 alternates 1, 2 over all 34 repertoires
            "l1": [1, 2] * 17,
            # l2 is 0, 0, 1, 1 repeated for the first 16 repertoires, then alternates 0, 1
            "l2": [0, 0, 1, 1] * 4 + [0, 1] * 9,
        }
        repertoires, metadata = RepertoireBuilder.build(sequences=sequences, path=path, labels=label_values)

        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                    labels={"l1": [1, 2], "l2": [0, 1]})

        enc_params = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
        encoder = Word2VecEncoder.build_object(dataset, **enc_params)
        ml_params = {"model_selection_cv": False, "model_selection_n_folds": -1}
        hp_settings = [HPSetting(encoder, enc_params, LogisticRegression(), ml_params, [])]

        label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])

        process = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                          SplitConfig(SplitType.RANDOM, 1, 0.7),
                                          SplitConfig(SplitType.RANDOM, 1, 0.7),
                                          {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, label_config, path)

        return process.run(result_path=path)
Beispiel #25
0
    def test_fit_by_cross_validation(self):
        """Run LogisticRegression.fit_by_cross_validation for label "test2"."""
        examples = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1],
                             [1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]])
        labels = {"test1": [1, 0, 2, 0, 1, 0, 2, 0],
                  "test2": [1, 0, 2, 0, 1, 0, 2, 0]}
        encoded = EncodedData(examples, labels)

        lr = LogisticRegression()
        lr.fit_by_cross_validation(encoded, number_of_splits=2, label=Label("test2"))
    def test_fit(self):
        """Check class predictions and class probabilities of the trained classifier on two examples."""
        classifier = self.train_classifier()

        predictions = classifier.predict(EncodedData(np.array([[6, 7], [1, 6]])), Label("cmv"))
        proba_predictions = classifier.predict_proba(EncodedData(np.array([[6, 7], [1, 6]])), Label("cmv"))

        self.assertEqual([True, False], predictions["cmv"])

        cmv_proba = proba_predictions["cmv"]
        # First example: column 1 must outweigh column 0; second example: the other way round.
        self.assertTrue(cmv_proba[0, 1] > cmv_proba[0, 0])
        self.assertTrue(cmv_proba[1, 0] > cmv_proba[1, 1])
        # All probabilities must lie in [0, 1].
        self.assertTrue((cmv_proba <= 1.0).all() and (cmv_proba >= 0.0).all())
Beispiel #27
0
def encode_dataset_by_kmer_freq(path_to_dataset_directory: str, result_path: str, metadata_path: str = None):
    """
    Imports a MiXCR repertoire dataset, encodes it with KmerFrequencyEncoder and exports the design matrix as csv.

    Arguments:
        path_to_dataset_directory (str): path to directory containing all repertoire files with .tsv extension in MiXCR format
        result_path (str): where to store the results
        metadata_path (str): metadata csv file to use for the import; if None, a metadata file is generated
            by generate_random_metadata (by default with columns "filename", "subject_id", "disease").
            A user-supplied file must include filename and subject_id columns, and an arbitrary disease column
    Returns:
         encoded dataset with encoded data in encoded_dataset.encoded_data.examples
    """
    dataset_dir = Path(path_to_dataset_directory)
    output_dir = Path(result_path)

    if metadata_path is not None:
        metadata_file = Path(metadata_path)
    else:
        metadata_file = generate_random_metadata(dataset_dir, output_dir)

    import_params = {
        "is_repertoire": True,
        "path": dataset_dir,
        "metadata_file": metadata_file,
        "region_type": "IMGT_CDR3",  # import only the CDR3 region
        "number_of_processes": 4,  # number of parallel processes for loading the data
        "result_path": output_dir,
        "separator": "\t",
        "columns_to_load": ["cloneCount", "allVHitsWithScore", "allJHitsWithScore", "aaSeqCDR3", "nSeqCDR3"],
        "column_mapping": {
            "cloneCount": "counts",
            "allVHitsWithScore": "v_alleles",
            "allJHitsWithScore": "j_alleles"
        },
    }
    dataset = MiXCRImport().import_dataset(import_params, "mixcr_dataset")

    # label that can be used for ML prediction - by default: "disease" with values True/False
    label_name = list(dataset.labels.keys())[0]

    encoder = KmerFrequencyEncoder.build_object(dataset, **{
        "normalization_type": "relative_frequency",  # encode repertoire by the relative frequency of k-mers in repertoire
        "reads": "unique",  # count each sequence only once, do not use clonal count
        "k": 2,  # k-mer length
        "sequence_type": "amino_acid",
        "sequence_encoding": "continuous_kmer"  # split each sequence in repertoire to overlapping k-mers
    })
    label_config = LabelConfiguration([Label(label_name, dataset.labels[label_name])])
    encoding_params = EncoderParams(result_path=output_dir, label_config=label_config)

    encoded_dataset = DataEncoder.run(DataEncoderParams(dataset, encoder, encoding_params))

    # export the encoded design matrix to the "csv_exported" subfolder of the result path
    exporter = DesignMatrixExporter(dataset=encoded_dataset, result_path=output_dir / "csv_exported",
                                    file_format='csv')
    exporter.generate_report()

    return encoded_dataset
Beispiel #28
0
    def test_fit_by_cross_validation(self):
        """Run KNN.fit_by_cross_validation with n_neighbors=2 for label "test1"."""
        examples = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1],
                             [1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]])
        labels = {"test1": [1, 0, 2, 0, 1, 0, 2, 0],
                  "test2": [1, 0, 2, 0, 1, 0, 2, 0]}
        encoded = EncodedData(examples, labels=labels)

        knn = KNN(parameters={"n_neighbors": 2})
        knn.fit_by_cross_validation(encoded, number_of_splits=2, label=Label("test1"))
Beispiel #29
0
    def add_label(self, label: str, values: list = None, auxiliary_labels: list = None, positive_class=None):
        """Register a label under the name ``label``, replacing any label already stored under that name.

        Args:
            label: name the label is stored under.
            values: possible values of the label; a copy is stored, or None if empty or not given.
            auxiliary_labels: passed through unchanged to the created Label.
            positive_class: optional positive class; if given, it is validated against ``values``
                (after conversion to str when all values are strings).
        """
        vals = list(values) if values else None

        previous = self._labels[label] if label in self._labels else None
        if previous is not None and len(previous) > 0:
            warnings.warn("Label " + label + " has already been set. Overriding existing values...", Warning)

        if positive_class is not None:
            # Align the type of the positive class with the values before checking membership.
            only_string_values = all(isinstance(val, str) for val in values)
            if only_string_values and not isinstance(positive_class, str):
                positive_class = str(positive_class)
            ParameterValidator.assert_in_valid_list(positive_class, values, Label.__name__, 'positive_class')

        self._labels[label] = Label(label, vals, auxiliary_labels, positive_class)
Beispiel #30
0
    def _create_report(self, path):
        """Build a ROCCurve report with a dummy model and a random encoded test dataset.

        Args:
            path: directory set as the report's result path.

        Returns:
            The configured ROCCurve report object.
        """
        report = ROCCurve.build_object(name='testcase')

        # The model is created before the random test data is drawn, as in the original order.
        report.method = self._create_dummy_lr_model()
        report.label = Label("l1")
        report.result_path = path

        report.test_dataset = Dataset()
        alternating_classes = [index % 2 for index in range(100)]
        report.test_dataset.encoded_data = EncodedData(np.random.rand(100, 20), {"l1": alternating_classes})

        return report