Beispiel #1
0
    def test_encode_sequence(self):
        """Check that IdentitySequenceEncoder returns the amino acid sequence as-is.

        The sequence "AAA" is encoded once per frame type ("OUT", "STOP", "IN"),
        and every call is expected to produce ``["AAA"]``.
        """
        for frame_type in ("OUT", "STOP", "IN"):
            sequence = ReceptorSequence(
                amino_acid_sequence="AAA",
                metadata=SequenceMetadata(frame_type=frame_type))
            enc = IdentitySequenceEncoder()
            params = EncoderParams(model={},
                                   label_config=LabelConfiguration(),
                                   result_path="")

            self.assertEqual(["AAA"], enc.encode_sequence(sequence, params))
    def test_run(self):
        """Run MLApplicationInstruction with a fitted encoder/classifier pair.

        A random repertoire dataset is generated, encoded with a k-mer
        frequency encoder and used to fit a logistic regression. The
        instruction is then run on the same dataset, and the test checks that
        ``predictions.csv`` is written and contains 50 rows.
        """
        path = EnvironmentSettings.tmp_test_path / "mlapplicationtest/"
        PathBuilder.build(path)

        dataset = RandomDatasetGenerator.generate_repertoire_dataset(
            50, {5: 1}, {5: 1}, {"l1": {1: 0.5, 2: 0.5}}, path / 'dataset/')
        ml_method = LogisticRegression()
        encoder = KmerFreqRepertoireEncoder(
            NormalizationType.RELATIVE_FREQUENCY, ReadsType.UNIQUE,
            SequenceEncodingType.CONTINUOUS_KMER, 3,
            scale_to_zero_mean=True, scale_to_unit_variance=True)
        label_config = LabelConfiguration([Label("l1", [1, 2])])

        encoding_params = EncoderParams(result_path=path,
                                        label_config=label_config,
                                        filename="tmp_enc_dataset.pickle",
                                        pool_size=4)
        enc_dataset = encoder.encode(dataset, encoding_params)
        ml_method.fit(enc_dataset.encoded_data, 'l1')

        encoder_settings = {
            "normalization_type": "relative_frequency",
            "reads": "unique",
            "sequence_encoding": "continuous_kmer",
            "k": 3,
            "scale_to_zero_mean": True,
            "scale_to_unit_variance": True,
        }
        hp_setting = HPSetting(encoder, encoder_settings, ml_method, {}, [], 'enc1', 'ml1')

        # The instruction's result folder needs the vectorizer and scaler
        # pickles (presumably written to `path` by encoder.encode above).
        instruction_path = path / 'result/instr1/'
        PathBuilder.build(instruction_path)
        for pickle_name in ('dict_vectorizer.pickle', 'scaler.pickle'):
            shutil.copy(path / pickle_name, instruction_path / pickle_name)

        ml_app = MLApplicationInstruction(dataset, label_config, hp_setting, 4, "instr1", False)
        ml_app.run(path / 'result/')

        predictions_path = path / "result/instr1/predictions.csv"
        self.assertTrue(os.path.isfile(predictions_path))

        predictions = pd.read_csv(predictions_path)
        self.assertEqual(50, predictions.shape[0])

        shutil.rmtree(path)
    def _run_test(self, compairr_path):
        """Encode a test dataset with CompAIRRDistanceEncoder and check the result.

        Arguments:
            compairr_path: value passed to the encoder as its "compairr_path" parameter
        """
        path = EnvironmentSettings.tmp_test_path / "compairr_distance_encoder/"
        PathBuilder.build(path)

        dataset = self.create_dataset(path)

        encoder_args = {
            "compairr_path": compairr_path,
            "keep_compairr_input": True,
            "differences": 0,
            "indels": False,
            "ignore_counts": False,
            "threads": 8,
            "ignore_genes": False,
        }
        enc = CompAIRRDistanceEncoder.build_object(dataset, **encoder_args)
        enc.set_context({"dataset": dataset})

        label_config = LabelConfiguration([Label("l1", [0, 1]), Label("l2", [2, 3])])
        encoded = enc.encode(dataset, EncoderParams(result_path=path,
                                                    label_config=label_config,
                                                    pool_size=4, filename="dataset.pkl"))

        examples = encoded.encoded_data.examples
        labels = encoded.encoded_data.labels

        # The encoded matrix is expected to be 8 x 8.
        self.assertEqual(8, examples.shape[0])
        self.assertEqual(8, examples.shape[1])

        # Entries expected to be zero: two diagonal cells and cell (0, 4).
        for row, col in ((0, 0), (1, 1), (0, 4)):
            self.assertEqual(0, examples[row, col])

        self.assertTrue(np.array_equal([1, 0, 1, 0, 1, 0, 1, 0], labels["l1"]))
        self.assertTrue(np.array_equal([2, 3, 2, 3, 2, 3, 3, 3], labels["l2"]))

        shutil.rmtree(path)
Beispiel #4
0
    def test_repertoire_flattened(self):
        """Check flattened, non-positional one-hot encoding of a repertoire dataset.

        Each expected example is the concatenation of twelve 20-value one-hot
        vectors, which matches the 3 x 4 x alphabet layout of the expected
        feature names. All-zero vectors are used where no residue is expected.
        """
        path = EnvironmentSettings.root_path / "test/tmp/onehot_recep_flat/"
        PathBuilder.build(path)

        dataset, lc = self._construct_test_repertoiredataset(path, positional=False)

        encoder_args = {"use_positional_info": False, "distance_to_seq_middle": None,
                        "flatten": True}
        encoder = OneHotEncoder.build_object(dataset, **encoder_args)

        params = EncoderParams(
            result_path=path,
            label_config=lc,
            pool_size=1,
            learn_model=True,
            model={},
            filename="dataset.pkl"
        )
        encoded_data = encoder.encode(dataset, params)

        self.assertTrue(isinstance(encoded_data, RepertoireDataset))

        onehot_a = [1.0] + [0.0] * 19
        onehot_t = [0.0] * 16 + [1.0] + [0] * 3
        onehot_empty = [0] * 20

        def flatten(*position_vectors):
            # Concatenate per-position one-hot vectors into one flat list.
            return [value for vector in position_vectors for value in vector]

        expected_first = flatten(onehot_a, onehot_a, onehot_a, onehot_a,
                                 onehot_a, onehot_t, onehot_a, onehot_empty,
                                 onehot_a, onehot_t, onehot_a, onehot_empty)
        expected_second = flatten(onehot_a, onehot_t, onehot_a, onehot_empty,
                                  onehot_t, onehot_a, onehot_a, onehot_empty,
                                  onehot_empty, onehot_empty, onehot_empty, onehot_empty)

        self.assertListEqual(list(encoded_data.encoded_data.examples[0]), expected_first)
        self.assertListEqual(list(encoded_data.encoded_data.examples[1]), expected_second)

        expected_feature_names = [f"{seq}_{pos}_{char}"
                                  for seq in range(3)
                                  for pos in range(4)
                                  for char in EnvironmentSettings.get_sequence_alphabet()]
        self.assertListEqual(list(encoded_data.encoded_data.feature_names), expected_feature_names)

        shutil.rmtree(path)
    def test__encode_new_dataset(self):
        """Check MatchedReceptorsEncoder output for the dummy dataset.

        Verifies the encoded examples row by row, the labels, the feature
        names and every column of the feature annotations.
        """
        path = EnvironmentSettings.root_path / "test/tmp/matched_receptors_encoder/"

        dataset, label_config, reference_receptors, _ = self.create_dummy_data(path)

        encoder_args = {
            "reference": reference_receptors,
            "max_edit_distances": 0
        }
        encoder = MatchedReceptorsEncoder.build_object(dataset, **encoder_args)

        params = EncoderParams(
            result_path=path,
            label_config=label_config,
            filename="dataset.csv"
        )
        encoded = encoder.encode(dataset, params)

        expected_outcome = [[10, 0, 0, 0], [0, 10, 0, 0], [5, 0, 5, 0], [0, 5, 0, 5], [1, 1, 2, 2]]
        for index, expected_row in enumerate(expected_outcome):
            self.assertListEqual(list(encoded.encoded_data.examples[index]), expected_row)

        expected_labels = {
            "label": ["yes", "yes", "no", "no", "no"],
            "subject_id": ["subject_1", "subject_1", "subject_2", "subject_2", "subject_3"],
        }
        self.assertDictEqual(encoded.encoded_data.labels, expected_labels)
        self.assertListEqual(encoded.encoded_data.feature_names,
                             ["100-A0-B0.alpha", "100-A0-B0.beta", "200-A0-B0.alpha", "200-A0-B0.beta"])

        # One annotation row is expected per feature (receptor chain).
        self.assertListEqual(list(encoded.encoded_data.feature_annotations.receptor_id),
                             ["100-A0-B0", "100-A0-B0", "200-A0-B0", "200-A0-B0"])
        self.assertListEqual(list(encoded.encoded_data.feature_annotations.clonotype_id),
                             [100, 100, 200, 200])
        self.assertListEqual(list(encoded.encoded_data.feature_annotations.chain),
                             ["alpha", "beta", "alpha", "beta"])
        self.assertListEqual(list(encoded.encoded_data.feature_annotations.sequence),
                             ["AAAA", "SSSS", "CCCC", "TTTT"])
        self.assertListEqual(list(encoded.encoded_data.feature_annotations.v_gene), ["V1"] * 4)
        self.assertListEqual(list(encoded.encoded_data.feature_annotations.j_gene), ["J1"] * 4)

        shutil.rmtree(path)
    def test_encode(self):
        """Check the shape of AtchleyKmerEncoder output on a random dataset.

        The encoded examples are expected to have shape (3, 11, 3), and the
        entry at index [0, -1, 0] is expected to be 0.
        """
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path /
                                 "atchley_kmer_encoding/")
        label_distribution = {"l1": {True: 0.4, False: 0.6}}
        dataset = RandomDatasetGenerator.generate_repertoire_dataset(
            3, {1: 1}, {4: 1}, label_distribution, path / "dataset")

        encoder_args = {
            "k": 2,
            "skip_first_n_aa": 1,
            "skip_last_n_aa": 1,
            "abundance": "RELATIVE_ABUNDANCE",
            "normalize_all_features": False
        }
        encoder = AtchleyKmerEncoder.build_object(dataset, **encoder_args)

        params = EncoderParams(path / "result",
                               LabelConfiguration(labels=[Label("l1")]))
        encoded_dataset = encoder.encode(dataset, params)

        examples = encoded_dataset.encoded_data.examples
        self.assertEqual((3, 11, 3), examples.shape)
        self.assertEqual(0., encoded_dataset.encoded_data.examples[0, -1, 0])

        shutil.rmtree(path)
Beispiel #7
0
    def get_encoded_repertoire(self, repertoire, params: EncoderParams):
        """Return the encoding of ``repertoire``, looked up through CacheHandler.

        Note that ``params.model`` is overwritten with ``vars(self)``, so the
        caller's ``params`` object is modified in place.

        Arguments:
            repertoire: repertoire to encode; its ``identifier`` is part of the cache key
            params (EncoderParams): encoding parameters; ``label_config`` contributes the labels to the cache key
        Returns:
            the value returned by ``CacheHandler.memo_by_params`` for this key; the
            callback handed to it computes the encoding via ``self.encode_repertoire``
        """
        params.model = vars(self)

        encoding_model = params.model
        labels = params.label_config.get_labels_by_name()
        repertoire_id = repertoire.identifier
        cache_key = (("encoding_model", encoding_model),
                     ("type", "kmer_encoding"),
                     ("labels", labels),
                     ("repertoire_id", repertoire_id))

        def compute_encoding():
            return self.encode_repertoire(repertoire, params)

        return CacheHandler.memo_by_params(cache_key, compute_encoding, CacheObjectType.ENCODING_STEP)
Beispiel #8
0
    def encode_dataset(dataset,
                       hp_setting: HPSetting,
                       path: Path,
                       learn_model: bool,
                       context: dict,
                       number_of_processes: int,
                       label_configuration: LabelConfiguration,
                       encode_labels: bool = True,
                       store_encoded_data: bool = False):
        """Encode ``dataset`` with the encoder and encoder parameters of ``hp_setting``.

        Arguments:
            dataset: dataset to encode
            hp_setting (HPSetting): supplies ``encoder`` and ``encoder_params``
            path (Path): result directory; built with PathBuilder before encoding
            learn_model (bool): forwarded to EncoderParams; also selects the output
                filename ("train_dataset.pkl" if True, otherwise "test_dataset.pkl")
            context (dict): not used by this function
            number_of_processes (int): forwarded to EncoderParams as ``pool_size``
            label_configuration (LabelConfiguration): forwarded as ``label_config``
            encode_labels (bool): forwarded to EncoderParams
            store_encoded_data (bool): forwarded to DataEncoderParams
        Returns:
            the result of ``DataEncoder.run``
        """
        PathBuilder.build(path)

        if learn_model:
            filename = "train_dataset.pkl"
        else:
            filename = "test_dataset.pkl"

        encoder_params = EncoderParams(model=hp_setting.encoder_params,
                                       result_path=path,
                                       pool_size=number_of_processes,
                                       label_config=label_configuration,
                                       learn_model=learn_model,
                                       filename=filename,
                                       encode_labels=encode_labels)
        run_params = DataEncoderParams(dataset=dataset,
                                       encoder=hp_setting.encoder,
                                       encoder_params=encoder_params,
                                       store_encoded_data=store_encoded_data)
        return DataEncoder.run(run_params)
Beispiel #9
0
    def test_encode_no_v_no_count(self):
        """Check MatchedRegexEncoder with V-gene matching and count summing disabled.

        Verifies the encoded examples row by row, the feature names and the
        example ids.
        """
        path = EnvironmentSettings.root_path / "test/tmp/regex_matches_encoder/"

        dataset, label_config, motif_filepath, _ = self.create_dummy_data(path)

        encoder_args = {
            "motif_filepath": motif_filepath,
            "match_v_genes": False,
            "sum_counts": False
        }
        encoder = MatchedRegexEncoder.build_object(dataset, **encoder_args)

        params = EncoderParams(
            result_path=path,
            label_config=label_config,
            filename="dataset.csv"
        )
        encoded = encoder.encode(dataset, params)

        expected_outcome = [[2, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]
        for index, expected_row in enumerate(expected_outcome):
            self.assertListEqual(list(encoded.encoded_data.examples[index]), expected_row)

        self.assertListEqual(["1_IGL", "1_IGH", "2_IGH", "3_IGL"], encoded.encoded_data.feature_names)
        self.assertListEqual(["subject_1", "subject_2", "subject_3"], encoded.encoded_data.example_ids)

        shutil.rmtree(path)
    def test_generate(self):
        """Smoke-test TCRdistMotifDiscovery on an imported paired-chain dataset.

        The dataset is imported with SingleLineReceptorImport, encoded with
        TCRdistEncoder and passed to the report as both train and test data.
        There are no assertions: the test passes if nothing raises.
        """
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "tcrdist_motif_discovery/")
        dataset_path = self._create_dataset(path)

        columns_to_load = ["subject", "epitope", "count", "v_a_gene", "j_a_gene", "cdr3_a_aa",
                           "v_b_gene", "j_b_gene", "cdr3_b_aa", "clone_id", "cdr3_a_nucseq",
                           "cdr3_b_nucseq"]
        column_mapping = {
            "cdr3_a_aa": "alpha_amino_acid_sequence",
            "cdr3_b_aa": "beta_amino_acid_sequence",
            "cdr3_a_nucseq": "alpha_nucleotide_sequence",
            "cdr3_b_nucseq": "beta_nucleotide_sequence",
            "v_a_gene": "alpha_v_gene",
            "v_b_gene": "beta_v_gene",
            "j_a_gene": "alpha_j_gene",
            "j_b_gene": "beta_j_gene",
            "clone_id": "identifier"
        }
        import_params = {
            "path": dataset_path,
            "result_path": path / "dataset/",
            "separator": ",",
            "columns_to_load": columns_to_load,
            "column_mapping": column_mapping,
            "receptor_chains": "TRA_TRB",
            "region_type": "IMGT_CDR3",
            "sequence_file_size": 50000,
            "organism": "mouse",
        }
        dataset = SingleLineReceptorImport.import_dataset(import_params, 'd1')

        encoder_params = EncoderParams(path / "result", LabelConfiguration([Label("epitope")]))
        dataset = TCRdistEncoder(8).encode(dataset, encoder_params)

        report = TCRdistMotifDiscovery(train_dataset=dataset, test_dataset=dataset,
                                       result_path=path / "report", name="report name", cores=8,
                                       positive_class_name="PA", min_cluster_size=3)
        report._generate()

        shutil.rmtree(path)
    def _get_encoded_repertoire(self, repertoire, params: EncoderParams):
        """Return the encoding of ``repertoire``, looked up through CacheHandler.

        Note that ``params.model`` is overwritten with ``vars(self)``, so the
        caller's ``params`` object is modified in place. Besides the encoder
        attributes, labels and repertoire identifier, the cache key includes a
        SHA-256 digest of the repertoire attribute named by
        ``self.sequence_type.value``.

        Arguments:
            repertoire: repertoire to encode
            params (EncoderParams): encoding parameters; ``label_config`` contributes the labels to the cache key
        Returns:
            the value returned by ``CacheHandler.memo_by_params`` for this key; the
            callback handed to it computes the encoding via ``self._encode_repertoire``
        """
        params.model = vars(self)

        encoding_model = params.model
        labels = params.label_config.get_labels_by_name()
        repertoire_id = repertoire.identifier
        sequence_data = np.ascontiguousarray(repertoire.get_attribute(self.sequence_type.value))
        data_digest = hashlib.sha256(sequence_data).hexdigest()

        cache_key = (("encoding_model", encoding_model),
                     ("labels", labels),
                     ("repertoire_id", repertoire_id),
                     ("repertoire_data", data_digest))

        def compute_encoding():
            return self._encode_repertoire(repertoire, params)

        return CacheHandler.memo_by_params(cache_key, compute_encoding, CacheObjectType.ENCODING)
    def test(self):
        """Check KmerFreqReceptorEncoder on four paired alpha/beta receptors.

        Receptors "1" and "3" have identical chains, so their encoded rows are
        expected to be equal. The test also checks the number of examples, the
        example ids and the presence of a few chain-prefixed k-mer feature names.
        """
        # (alpha sequence, beta sequence, identifier)
        chain_pairs = [
            ("AAACCC", "AAACCC", "1"),
            ("AAA", "CCC", "2"),
            ("AAACCC", "AAACCC", "3"),
            ("AAA", "CCC", "4"),
        ]
        receptors = [
            TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence=alpha_seq),
                         beta=ReceptorSequence(amino_acid_sequence=beta_seq),
                         identifier=receptor_id)
            for alpha_seq, beta_seq, receptor_id in chain_pairs
        ]

        path = EnvironmentSettings.tmp_test_path / "kmer_receptor_frequency/"
        PathBuilder.build(path / 'data')
        dataset = ReceptorDataset.build_from_objects(receptors,
                                                     path=path,
                                                     file_size=10)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        encoder_args = {
            "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
            "reads": ReadsType.UNIQUE.name,
            "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
            "sequence_type": SequenceType.AMINO_ACID.name,
            "k": 3
        }
        encoder = KmerFreqReceptorEncoder.build_object(dataset, **encoder_args)

        params = EncoderParams(result_path=path / "2/",
                               label_config=lc,
                               pool_size=2,
                               learn_model=True,
                               model={},
                               filename="dataset.csv",
                               encode_labels=False)
        encoded_dataset = encoder.encode(dataset, params)

        self.assertEqual(4, encoded_dataset.encoded_data.examples.shape[0])

        ids_present = all(identifier in encoded_dataset.encoded_data.example_ids
                          for identifier in ['1', '2', '3', '4'])
        self.assertTrue(ids_present)

        rows_match = numpy.array_equal(encoded_dataset.encoded_data.examples[0].A,
                                       encoded_dataset.encoded_data.examples[2].A)
        self.assertTrue(rows_match)

        features_present = all(feature_name in encoded_dataset.encoded_data.feature_names
                               for feature_name in ["alpha_AAA", "alpha_AAC", "beta_CCC"])
        self.assertTrue(features_present)

        shutil.rmtree(path)
Beispiel #13
0
    def test_encode(self):
        """Check SequenceAbundanceEncoder at two p-value thresholds.

        With threshold 0.4 the two repertoires labelled True are expected to
        get 1 in the first column. After lowering the threshold to 0.05 the
        first column is expected to be all zeros. The second column stays at
        4, 6, 3, 6, which coincides with the number of sequences per repertoire.
        """
        path = EnvironmentSettings.tmp_test_path / "abundance_encoder/"
        PathBuilder.build(path)

        sequences_per_repertoire = [
            ["GGG", "III", "LLL", "MMM"],
            ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
            ["CCC", "FFF", "MMM"],
            ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"],
        ]
        repertoires, metadata = RepertoireBuilder.build(
            sequences_per_repertoire,
            labels={"l1": [True, True, False, False]},
            path=path)

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    identifier="1")

        encoder_args = {
            "comparison_attributes": ["sequence_aas"],
            "p_value_threshold": 0.4,
            "sequence_batch_size": 4,
            "repertoire_batch_size": 8
        }
        encoder = SequenceAbundanceEncoder.build_object(dataset, **encoder_args)

        label_config = LabelConfiguration(
            [Label("l1", [True, False], positive_class=True)])

        # First pass: threshold 0.4 as configured at build time.
        encoded_dataset = encoder.encode(
            dataset, EncoderParams(result_path=path,
                                   label_config=label_config))
        expected_lenient = np.array([[1, 4], [1, 6], [0, 3], [0, 6]])
        self.assertTrue(
            np.array_equal(expected_lenient,
                           encoded_dataset.encoded_data.examples))

        # Second pass: same encoder object with a stricter threshold.
        encoder.p_value_threshold = 0.05

        encoded_dataset = encoder.encode(
            dataset, EncoderParams(result_path=path,
                                   label_config=label_config))
        expected_strict = np.array([[0, 4], [0, 6], [0, 3], [0, 6]])
        self.assertTrue(
            np.array_equal(expected_strict,
                           encoded_dataset.encoded_data.examples))

        shutil.rmtree(path)
    def test_encode_sequence(self):
        """Check IMGT-positional k-mer encoding of single sequences.

        Covers three cases:
          * a 39-residue sequence, whose 3-mers must match the expected set of
            "kmer///position" strings exactly;
          * a 5-residue sequence with 3 expected 3-mers;
          * k=25 on the 5-residue sequence, where the encoder is expected to
            return None.

        For the first two cases the number of k-mers must equal
        ``len(sequence) - k + 1``.
        """
        k = 3

        def make_params(kmer_length):
            # Fresh parameters per call, as each encode call gets its own object.
            return EncoderParams(model={"k": kmer_length},
                                 label_config=LabelConfiguration(),
                                 result_path="")

        sequence = ReceptorSequence("CASSPRERATYEQCASSPRERATYEQCASSPRERATYEQ",
                                    None, None)
        result = IMGTKmerSequenceEncoder.encode_sequence(sequence, make_params(k))

        self.assertEqual(
            {
                'CAS///105', 'ASS///106', 'SSP///107', 'SPR///108',
                'PRE///109', 'RER///110', 'ERA///111', 'RAT///111.001',
                'ATY///111.002', 'TYE///111.003', 'YEQ///111.004',
                'EQC///111.005', 'QCA///111.006', 'CAS///111.007',
                'ASS///111.008', 'SSP///111.009', 'SPR///111.01',
                'PRE///111.011', 'RER///111.012', 'ERA///111.013',
                'RAT///112.013', 'ATY///112.012', 'TYE///112.011',
                'YEQ///112.01', 'EQC///112.009', 'QCA///112.008',
                'CAS///112.007', 'ASS///112.006', 'SSP///112.005',
                'SPR///112.004', 'PRE///112.003', 'RER///112.002',
                'ERA///112.001', 'RAT///112', 'ATY///113', 'TYE///114',
                'YEQ///115'
            }, set(result))

        # The set comparison alone would hide duplicates; the length check catches them.
        self.assertEqual(len(result), len(sequence.get_sequence()) - k + 1)

        sequence = ReceptorSequence("AHCDE", None, None)
        result = IMGTKmerSequenceEncoder.encode_sequence(sequence, make_params(k))

        self.assertEqual({'AHC///105', 'HCD///106', 'CDE///107'}, set(result))
        self.assertEqual(len(result), len(sequence.get_sequence()) - k + 1)

        # k larger than the sequence length: no k-mers can be produced.
        self.assertIsNone(
            IMGTKmerSequenceEncoder.encode_sequence(sequence, make_params(25)))
Beispiel #15
0
 def _encode_sequence(self, sequence: ReceptorSequence, params: EncoderParams, sequence_encoder, counts):
     """Add the features of one sequence to ``counts`` and return ``counts``.

     ``params.model`` is overwritten with ``vars(self)`` before the sequence
     is encoded. If the sequence encoder returns None, ``counts`` is returned
     unchanged. Otherwise each feature is incremented by 1 when ``self.reads``
     is ReadsType.UNIQUE, or by ``sequence.metadata.count`` when it is
     ReadsType.ALL; any other value of ``self.reads`` leaves ``counts`` untouched.

     Arguments:
         sequence (ReceptorSequence): sequence to encode
         params (EncoderParams): encoding parameters, modified in place
         sequence_encoder: object providing ``encode_sequence(sequence, params)``
         counts: mapping from feature to count, updated in place
     Returns:
         the same ``counts`` object
     """
     params.model = vars(self)
     features = sequence_encoder.encode_sequence(sequence, params)
     if features is None:
         return counts

     for feature in features:
         if self.reads == ReadsType.UNIQUE:
             counts[feature] += 1
         elif self.reads == ReadsType.ALL:
             counts[feature] += sequence.metadata.count
     return counts
    def _test_encode(self, compairr_path):
        """Check CompAIRRSequenceAbundanceEncoder at two p-value thresholds.

        The whole scenario is run twice, with "ignore_genes" set to True and
        then False, each in its own result sub-directory. In both runs the
        encoded examples are checked with threshold 0.4 and again after the
        threshold is lowered to 0.05.

        Arguments:
            compairr_path: value passed to the encoder as its "compairr_path" parameter
        """
        path = EnvironmentSettings.tmp_test_path / "compairr_abundance_encoder/"
        PathBuilder.build(path)

        dataset = self._build_test_dataset(path)

        expected_lenient = [[1, 4], [1, 6], [0, 3], [0, 6]]
        expected_strict = [[0, 4], [0, 6], [0, 3], [0, 6]]

        for ignore_genes in [True, False]:
            result_path = path / f"ignore_genes={ignore_genes}"

            encoder_args = {
                "p_value_threshold": 0.4,
                "compairr_path": compairr_path,
                "sequence_batch_size": 2,
                "ignore_genes": ignore_genes,
                "threads": 8
            }
            encoder = CompAIRRSequenceAbundanceEncoder.build_object(dataset, **encoder_args)

            label_config = LabelConfiguration(
                [Label("l1", [True, False], positive_class=True)])

            # First pass: threshold 0.4 as configured at build time.
            encoded_dataset = encoder.encode(
                dataset,
                EncoderParams(result_path=result_path,
                              label_config=label_config))
            self.assertTrue(
                np.array_equal(np.array(expected_lenient),
                               encoded_dataset.encoded_data.examples))

            # Second pass: same encoder object with a stricter threshold.
            encoder.p_value_threshold = 0.05

            encoded_dataset = encoder.encode(
                dataset,
                EncoderParams(result_path=result_path,
                              label_config=label_config))
            self.assertTrue(
                np.array_equal(np.array(expected_strict),
                               encoded_dataset.encoded_data.examples))

        shutil.rmtree(path)
Beispiel #17
0
    def test(self):
        """Check non-flattened, non-positional one-hot encoding of a sequence dataset.

        Each of the first three examples is compared against four 20-value
        one-hot vectors, with an all-zero vector where no residue is expected.
        Example ids and the "l1"/"l2" labels must match the values read from
        the dataset itself.
        """
        path = EnvironmentSettings.tmp_test_path / "onehot_sequence/"
        PathBuilder.build(path)

        dataset, lc = self._construct_test_dataset(path)

        encoder_args = {
            "use_positional_info": False,
            'sequence_type': 'amino_acid',
            "distance_to_seq_middle": None,
            "flatten": False
        }
        encoder = OneHotEncoder.build_object(dataset, **encoder_args)

        params = EncoderParams(result_path=path / "encoded/",
                               label_config=lc,
                               learn_model=True,
                               model={},
                               filename="dataset.pkl")
        encoded_data = encoder.encode(dataset, params)

        self.assertTrue(isinstance(encoded_data, SequenceDataset))

        onehot_a = [1] + [0] * 19
        onehot_t = [0] * 16 + [1] + [0] * 3
        onehot_empty = [0] * 20

        expected_examples = [
            [onehot_a] * 4,
            [onehot_a, onehot_t, onehot_a, onehot_empty],
            [onehot_a, onehot_t, onehot_t, onehot_empty],
        ]
        for index, expected_rows in enumerate(expected_examples):
            actual_rows = [list(item) for item in encoded_data.encoded_data.examples[index]]
            self.assertListEqual(actual_rows, expected_rows)

        self.assertListEqual(
            encoded_data.encoded_data.example_ids,
            [receptor.identifier for receptor in dataset.get_data()])

        self.assertDictEqual(
            encoded_data.encoded_data.labels, {
                label_name: [receptor_seq.get_attribute(label_name)
                             for receptor_seq in dataset.get_data()]
                for label_name in ("l1", "l2")
            })

        shutil.rmtree(path)
Beispiel #18
0
def encode_dataset_by_kmer_freq(path_to_dataset_directory: str, result_path: str, metadata_path: str = None):
    """
    Imports a MiXCR repertoire dataset, encodes it using KmerFrequencyEncoder and exports the design matrix as csv

    Arguments:
        path_to_dataset_directory (str): path to directory containing all repertoire files with .tsv extension in MiXCR format
        result_path (str): where to store the results (the csv export goes to the "csv_exported" subdirectory)
        metadata_path (str): metadata csv file; it must include filename and subject_id columns, and an arbitrary disease column.
            If None, a metadata file is generated by generate_random_metadata
    Returns:
         encoded dataset with encoded data in encoded_dataset.encoded_data.examples
    """
    dataset_dir = Path(path_to_dataset_directory)
    output_dir = Path(result_path)

    # fall back to generated metadata when the caller did not supply any
    metadata_file = generate_random_metadata(dataset_dir, output_dir) if metadata_path is None else Path(metadata_path)

    import_params = {
        "is_repertoire": True,
        "path": dataset_dir,
        "metadata_file": metadata_file,
        "region_type": "IMGT_CDR3",  # import only the CDR3 region
        "number_of_processes": 4,  # number of parallel processes for loading the data
        "result_path": output_dir,
        "separator": "\t",
        "columns_to_load": ["cloneCount", "allVHitsWithScore", "allJHitsWithScore", "aaSeqCDR3", "nSeqCDR3"],
        "column_mapping": {
            "cloneCount": "counts",
            "allVHitsWithScore": "v_alleles",
            "allJHitsWithScore": "j_alleles"
        },
    }
    dataset = MiXCRImport().import_dataset(import_params, "mixcr_dataset")

    # first label of the dataset is the one used for ML prediction - by default: "disease" with values True/False
    label_name = list(dataset.labels.keys())[0]

    encoder_settings = {
        "normalization_type": "relative_frequency",  # encode repertoire by the relative frequency of k-mers in repertoire
        "reads": "unique",  # count each sequence only once, do not use clonal count
        "k": 2,  # k-mer length
        "sequence_type": "amino_acid",
        "sequence_encoding": "continuous_kmer"  # split each sequence in repertoire to overlapping k-mers
    }
    encoder = KmerFrequencyEncoder.build_object(dataset, **encoder_settings)
    label_config = LabelConfiguration([Label(label_name, dataset.labels[label_name])])
    encoder_params = EncoderParams(result_path=output_dir, label_config=label_config)

    encoded_dataset = DataEncoder.run(DataEncoderParams(dataset, encoder, encoder_params))

    DesignMatrixExporter(dataset=encoded_dataset,
                         result_path=output_dir / "csv_exported",
                         file_format='csv').generate_report()

    return encoded_dataset
 def encode(self, unit: ExploratoryAnalysisUnit, result_path: Path) -> Dataset:
     """
     Returns the dataset of the given unit, encoded with the unit's encoder when one is set

     Arguments:
         unit: analysis unit providing the dataset, encoder, label_config and number_of_processes
         result_path: passed to EncoderParams as result_path
     Returns:
         unit.dataset unchanged if unit.encoder is None, otherwise the result of DataEncoder.run
     """
     # nothing to encode with: hand back the dataset as it is
     if unit.encoder is None:
         return unit.dataset

     params = EncoderParams(result_path=result_path,
                            label_config=unit.label_config,
                            filename="encoded_dataset.pkl",
                            pool_size=unit.number_of_processes,
                            learn_model=True,
                            encode_labels=unit.label_config is not None)

     return DataEncoder.run(DataEncoderParams(dataset=unit.dataset,
                                              encoder=unit.encoder,
                                              encoder_params=params,
                                              store_encoded_data=True))
Beispiel #20
0
    def test_encode_sequence(self):
        """Check the k-mers KmerSequenceEncoder returns for k=3.

        "CASSVFRTY" must yield exactly its seven overlapping 3-mers, and a
        sequence shorter than k ("AC") must yield None.
        """
        def build_params():
            # A fresh EncoderParams per call, so the two calls share no state.
            return EncoderParams(model={"k": 3},
                                 label_config=LabelConfiguration(),
                                 result_path="", pool_size=4)

        seq = ReceptorSequence(amino_acid_sequence="CASSVFRTY")
        result = KmerSequenceEncoder.encode_sequence(seq, build_params())

        # assertIn names the missing k-mer and shows the container on failure.
        for kmer in ("CAS", "ASS", "SSV", "SVF", "VFR", "FRT", "RTY"):
            self.assertIn(kmer, result)

        self.assertEqual(7, len(result))

        too_short = ReceptorSequence(amino_acid_sequence="AC")
        self.assertIsNone(
            KmerSequenceEncoder.encode_sequence(too_short, build_params()))
    def test_encode_sequence(self):
        """Compare IMGTGappedKmerEncoder output (k_left=1, max_gap=1) with the expected k-mer sets.

        Each case pairs an amino acid sequence with the set of
        ``<kmer>///<position>`` strings the encoder is expected to return.
        """
        cases = (
            ("AHCDE", {
                'AH///105', 'HC///106', 'CD///107', 'DE///116', 'A.C///105',
                'H.D///106', 'C.E///107'
            }),
            ("CASSPRERATYEQCAY", {
                'CA///105', 'AS///106', 'SS///107', 'SP///108', 'PR///109',
                'RE///110', 'ER///111', 'RA///111.001', 'AT///112.002',
                'TY///112.001', 'YE///112', 'EQ///113', 'QC///114', 'CA///115',
                'AY///116', 'C.S///105', 'A.S///106', 'S.P///107', 'S.R///108',
                'P.E///109', 'R.R///110', 'E.A///111', 'R.T///111.001',
                'A.Y///112.002', 'T.E///112.001', 'Y.Q///112', 'E.C///113',
                'Q.A///114', 'C.Y///115'
            }),
        )

        for amino_acids, expected_kmers in cases:
            receptor_sequence = ReceptorSequence(amino_acids, None, None)
            # parameters are rebuilt for every case, as separate calls would do
            params = EncoderParams(model={"k_left": 1, "max_gap": 1},
                                   label_config=LabelConfiguration(),
                                   result_path="")
            encoded = IMGTGappedKmerEncoder.encode_sequence(
                receptor_sequence, params)

            self.assertEqual(expected_kmers, set(encoded))
    def test_receptor_flattened(self):
        """Check flattened one-hot encoding of a receptor dataset.

        Verifies the type of the returned dataset, the flattened vectors of
        the first three examples and the generated feature names
        (``<chain>_<position>_<character>``).
        """
        path = EnvironmentSettings.root_path / "test/tmp/onehot_recep_flat/"

        PathBuilder.build(path)
        # Register the cleanup right away so the temporary directory is also
        # removed when one of the assertions below fails.
        self.addCleanup(shutil.rmtree, path)

        dataset = self.construct_test_flatten_dataset(path)

        encoder = OneHotEncoder.build_object(
            dataset, **{
                "use_positional_info": False,
                "distance_to_seq_middle": None,
                'sequence_type': 'amino_acid',
                "flatten": True
            })

        # NOTE(review): positive_class is the string "1" while the label
        # values are ints - confirm this is intended.
        encoded_data = encoder.encode(
            dataset,
            EncoderParams(result_path=path,
                          label_config=LabelConfiguration([
                              Label(name="l1",
                                    values=[1, 0],
                                    positive_class="1")
                          ]),
                          pool_size=1,
                          learn_model=True,
                          model={},
                          filename="dataset.pkl"))

        self.assertIsInstance(encoded_data, ReceptorDataset)

        # Expected 20-element blocks: index 0 set and index 16 set.
        onehot_a = [1.0] + [0.0] * 19
        onehot_t = [0.0] * 16 + [1.0] + [0.0] * 3

        examples = encoded_data.encoded_data.examples
        # 12 blocks per example: a a a t t t, followed by (a t) three times.
        self.assertListEqual(
            list(examples[0]),
            onehot_a * 3 + onehot_t * 3 + (onehot_a + onehot_t) * 3)
        self.assertListEqual(list(examples[1]), onehot_a * 12)
        self.assertListEqual(list(examples[2]), onehot_a * 12)

        self.assertListEqual(list(encoded_data.encoded_data.feature_names), [
            f"{chain}_{pos}_{char}" for chain in ("alpha", "beta")
            for pos in range(6)
            for char in EnvironmentSettings.get_sequence_alphabet()
        ])
Beispiel #23
0
    def test_generate(self):
        """Fit a ReceptorCNN on a random encoded receptor dataset and check the KernelSequenceLogo output files.

        The report is expected to write a png and a csv file per chain and
        kernel, plus the csv/html files of the fully connected layer weights.
        """
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path /
                                 "kernel_sequence_logo/")
        # Register the cleanup right away so the temporary directory is also
        # removed when fitting, reporting or an assertion fails.
        self.addCleanup(shutil.rmtree, path)

        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            receptor_count=500,
            chain_1_length_probabilities={4: 1},
            chain_2_length_probabilities={4: 1},
            labels={"CMV": {
                True: 0.5,
                False: 0.5
            }},
            path=path / "dataset")
        enc_dataset = OneHotReceptorEncoder(True, 1, False, "enc1").encode(
            dataset,
            EncoderParams(path / "result",
                          LabelConfiguration([Label("CMV", [True, False])])))
        cnn = ReceptorCNN(kernel_count=2,
                          kernel_size=[3],
                          positional_channels=3,
                          sequence_type="amino_acid",
                          device="cpu",
                          number_of_threads=4,
                          random_seed=1,
                          learning_rate=0.01,
                          iteration_count=10,
                          l1_weight_decay=0.1,
                          evaluate_at=5,
                          batch_size=100,
                          training_percentage=0.8,
                          l2_weight_decay=0.0)
        cnn.fit(enc_dataset.encoded_data, "CMV")

        report = KernelSequenceLogo(method=cnn, result_path=path / "logos/")
        report.generate_report()

        expected_files = [
            f"logos/{chain}_kernel_3_{kernel}.{extension}"
            for extension in ("png", "csv")
            for chain in ("alpha", "beta")
            for kernel in (1, 2)
        ]
        expected_files.append("logos/fully_connected_layer_weights.csv")
        expected_files.append("logos/fully_connected_layer_weights.html")

        for relative_name in expected_files:
            # The message names the missing file instead of "False is not true".
            self.assertTrue(os.path.isfile(path / relative_name),
                            msg=f"missing report output: {relative_name}")
    def test_encode_sequence(self):
        """Compare IMGTGappedKmerEncoder output (k_left=1, max_gap=1) with the expected k-mer sets.

        Each case pairs an amino acid sequence with the set of
        ``<kmer>-<position>`` strings the encoder is expected to return.
        """
        cases = (
            ("AHCDE", {
                'AH-105', 'HC-106', 'CD-107', 'DE-116', 'A.C-105', 'H.D-106',
                'C.E-107'
            }),
            ("CASSPRERATYEQCAY", {
                'CA-105', 'AS-106', 'SS-107', 'SP-108', 'PR-109', 'RE-110',
                'ER-111', 'RA-111.001', 'AT-112.002', 'TY-112.001', 'YE-112',
                'EQ-113', 'QC-114', 'CA-115', 'AY-116', 'C.S-105', 'A.S-106',
                'S.P-107', 'S.R-108', 'P.E-109', 'R.R-110', 'E.A-111',
                'R.T-111.001', 'A.Y-112.002', 'T.E-112.001', 'Y.Q-112',
                'E.C-113', 'Q.A-114', 'C.Y-115'
            }),
        )

        for amino_acids, expected_kmers in cases:
            receptor_sequence = ReceptorSequence(amino_acids, None, None)
            # parameters are rebuilt for every case, as separate calls would do
            params = EncoderParams(model={"k_left": 1, "max_gap": 1},
                                   label_config=LabelConfiguration(),
                                   result_path="")
            encoded = IMGTGappedKmerEncoder.encode_sequence(
                receptor_sequence, params)

            self.assertEqual(expected_kmers, set(encoded))
Beispiel #25
0
 def _encode_dataset(self, encoder, dataset, path, learn_model: bool = True):
     """
     Encodes the dataset with the given encoder, using a single "disease" label with values True/False

     The encoding parameters point to the "encoded" subdirectory of path and
     forward learn_model unchanged.
     """
     label_config = LabelConfiguration()
     label_config.add_label("disease", [True, False])

     params = EncoderParams(result_path=path / "encoded",
                            label_config=label_config,
                            learn_model=learn_model,
                            model={})
     return encoder.encode(dataset, params)
Beispiel #26
0
    def test_run(self):
        """Run DataEncoder with a Word2Vec encoder on a two-repertoire dataset.

        Checks that the result is a RepertoireDataset with one encoded
        example per repertoire.
        """
        path = EnvironmentSettings.root_path / "test/tmp/dataencoder/"
        PathBuilder.build(path)
        # Register the cleanup right away so the temporary directory is also
        # removed when encoding or an assertion fails.
        self.addCleanup(shutil.rmtree, path)

        rep1 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("AAA", identifier="1")],
            metadata={
                "l1": 1,
                "l2": 2
            },
            path=path)

        rep2 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("ATA", identifier="2")],
            metadata={
                "l1": 0,
                "l2": 3
            },
            path=path)

        # NOTE(review): the metadata above uses l1 in {1, 0} and l2 in {2, 3},
        # while the label values below are [1, 2] and [0, 3] - confirm whether
        # the two value lists were meant to be [0, 1] and [2, 3].
        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])
        encoder = Word2VecEncoder.build_object(
            dataset, **{
                "k": 3,
                "model_type": ModelType.SEQUENCE.name,
                "vector_size": 6
            })

        res = DataEncoder.run(
            DataEncoderParams(dataset=dataset,
                              encoder=encoder,
                              encoder_params=EncoderParams(
                                  model={},
                                  pool_size=2,
                                  label_config=lc,
                                  result_path=path,
                                  filename="dataset.csv"),
                              store_encoded_data=False))

        self.assertIsInstance(res, RepertoireDataset)
        # One encoded row per repertoire.
        self.assertEqual(2, res.encoded_data.examples.shape[0])
 def _encode_dataset(self, dataset, path, learn_model: bool = True):
     """
     Encodes the dataset by the relative frequency of continuous amino acid 3-mers (unique reads)

     A single "disease" label with values True/False is configured; the
     encoding parameters point to the "encoded" subdirectory of path and
     forward learn_model unchanged.
     """
     encoder_settings = {
         "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
         "reads": ReadsType.UNIQUE.name,
         "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
         "k": 3,
         'sequence_type': SequenceType.AMINO_ACID.name
     }
     kmer_encoder = KmerFrequencyEncoder.build_object(dataset, **encoder_settings)

     label_config = LabelConfiguration()
     label_config.add_label("disease", [True, False])

     params = EncoderParams(result_path=path / "encoded",
                            label_config=label_config,
                            learn_model=learn_model,
                            model={})
     return kmer_encoder.encode(dataset, params)
    def test_encode(self):
        """Encode a two-repertoire dataset with a sequence-level Word2Vec encoder.

        Checks the shape of the encoded examples (2 repertoires x 16
        dimensions), the encoded "T1D" labels and the concrete encoder class
        returned by ``Word2VecEncoder.build_object``.
        """
        test_path = EnvironmentSettings.root_path / "test/tmp/w2v/"

        PathBuilder.build(test_path)
        # Register the cleanup right away so the temporary directory is also
        # removed when encoding or an assertion fails.
        self.addCleanup(shutil.rmtree, test_path)

        sequence1 = ReceptorSequence("CASSVFA", identifier="1")
        sequence2 = ReceptorSequence("CASSCCC", identifier="2")

        metadata1 = {"T1D": "T1D", "subject_id": "1"}
        rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2],
                                                      test_path, metadata1)

        metadata2 = {"T1D": "CTL", "subject_id": "2"}
        rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path,
                                                      metadata2)

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        label_configuration = LabelConfiguration()
        label_configuration.add_label("T1D", ["T1D", "CTL"])

        config_params = EncoderParams(model={},
                                      learn_model=True,
                                      result_path=test_path,
                                      label_config=label_configuration,
                                      filename="dataset.pkl")

        encoder = Word2VecEncoder.build_object(
            dataset, **{
                "k": 3,
                "model_type": "sequence",
                "vector_size": 16
            })

        encoded_dataset = encoder.encode(dataset=dataset, params=config_params)

        encoded = encoded_dataset.encoded_data
        self.assertIsNotNone(encoded)
        # assertEqual reports the actual value when the expectation is not met.
        self.assertEqual(2, encoded.examples.shape[0])
        self.assertEqual(16, encoded.examples.shape[1])
        self.assertEqual(2, len(encoded.labels["T1D"]))
        self.assertEqual("T1D", encoded.labels["T1D"][0])
        self.assertIsInstance(encoder, W2VRepertoireEncoder)
Beispiel #29
0
    def test_not_positional(self):
        """Check non-positional, non-flattened one-hot encoding of a repertoire dataset.

        Verifies the type of the returned dataset, the encoded rows of the
        first three sequences of both repertoires, the example ids and the
        ``l1``/``l2`` labels.
        """
        path = EnvironmentSettings.root_path / "test/tmp/onehot_vanilla/"

        PathBuilder.build(path)
        # Register the cleanup right away so the temporary directory is also
        # removed when one of the assertions below fails.
        self.addCleanup(shutil.rmtree, path)

        dataset, lc = self._construct_test_repertoiredataset(path, positional=False)

        encoder = OneHotEncoder.build_object(dataset, **{"use_positional_info": False,
                                                         "distance_to_seq_middle": 6,
                                                         "flatten": False})

        encoded_data = encoder.encode(dataset, EncoderParams(
            result_path=path,
            label_config=lc,
            learn_model=True,
            model={},
            filename="dataset.pkl"
        ))

        self.assertIsInstance(encoded_data, RepertoireDataset)

        # Expected rows of length 20: index 0 set, index 16 set, all zeros.
        onehot_a = [1] + [0] * 19
        onehot_t = [0] * 16 + [1] + [0] * 3
        onehot_empty = [0] * 20

        # expected_rows[repertoire][sequence] -> list of one-hot rows
        expected_rows = [
            [
                [onehot_a] * 4,
                [onehot_a, onehot_t, onehot_a, onehot_empty],
                [onehot_a, onehot_t, onehot_a, onehot_empty],
            ],
            [
                [onehot_a, onehot_t, onehot_a, onehot_empty],
                [onehot_t, onehot_a, onehot_a, onehot_empty],
                [onehot_empty] * 4,
            ],
        ]
        examples = encoded_data.encoded_data.examples
        for rep_index, expected_sequences in enumerate(expected_rows):
            for seq_index, expected in enumerate(expected_sequences):
                # subTest reports every mismatching sequence, not only the first.
                with self.subTest(repertoire=rep_index, sequence=seq_index):
                    self.assertListEqual(
                        [list(item) for item in examples[rep_index][seq_index]],
                        expected)

        self.assertListEqual(
            list(encoded_data.encoded_data.example_ids),
            [repertoire.identifier for repertoire in dataset.get_data()])
        self.assertDictEqual(
            encoded_data.encoded_data.labels,
            {label: [repertoire.metadata[label] for repertoire in dataset.get_data()]
             for label in ("l1", "l2")})
    def test(self):
        """Check one-hot encoding of a receptor dataset without positional info or flattening.

        Verifies the type of the returned dataset and the encoded rows of both
        chains of the first two receptors.
        """
        path = EnvironmentSettings.tmp_test_path / "onehot_sequence_1/"
        PathBuilder.build(path)
        # Register the cleanup right away so the temporary directory is also
        # removed when one of the assertions below fails.
        self.addCleanup(shutil.rmtree, path)

        dataset, lc = self._construct_test_dataset(path)

        encoder = OneHotEncoder.build_object(
            dataset, **{
                "use_positional_info": False,
                'sequence_type': 'amino_acid',
                "distance_to_seq_middle": 6,
                "flatten": False
            })

        encoded_data = encoder.encode(
            dataset,
            EncoderParams(result_path=path / "encoded",
                          label_config=lc,
                          learn_model=True,
                          model={},
                          filename="dataset.pkl"))

        self.assertIsInstance(encoded_data, ReceptorDataset)

        # Expected rows of length 20: index 0 set, index 16 set, all zeros.
        onehot_a = [1] + [0] * 19
        onehot_t = [0] * 16 + [1] + [0] * 3
        onehot_empty = [0] * 20

        # Keyed by the (first, second) index pair used on the examples array.
        expected_rows = {
            (0, 0): [onehot_a] * 4,
            (0, 1): [onehot_a, onehot_t, onehot_a, onehot_empty],
            (1, 0): [onehot_a, onehot_t, onehot_a, onehot_empty],
            (1, 1): [onehot_a, onehot_t, onehot_t, onehot_empty],
        }
        examples = encoded_data.encoded_data.examples
        for (first, second), expected in expected_rows.items():
            # subTest reports every mismatching entry, not only the first.
            with self.subTest(index=(first, second)):
                self.assertListEqual(
                    [list(item) for item in examples[first, second]],
                    expected)