Ejemplo n.º 1
0
    def test_encode(self):
        """Encode four small repertoires with SequenceAbundanceEncoder at two p-value thresholds and compare
        the resulting example matrices with the expected values."""
        tmp_dir = EnvironmentSettings.tmp_test_path + "abundance_encoder/"
        PathBuilder.build(tmp_dir)

        sequences_per_repertoire = [
            ["GGG", "III", "LLL", "MMM"],
            ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
            ["CCC", "FFF", "MMM"],
            ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"],
        ]
        repertoires, metadata = RepertoireBuilder.build(sequences_per_repertoire,
                                                        labels={"l1": [True, True, False, False]},
                                                        path=tmp_dir)

        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, identifier="1")

        encoder = SequenceAbundanceEncoder.build_object(dataset,
                                                        comparison_attributes=["sequence_aas"],
                                                        p_value_threshold=0.4,
                                                        sequence_batch_size=4,
                                                        repertoire_batch_size=8)

        label_config = LabelConfiguration([Label("l1", [True, False], positive_class=True)])

        def encode_examples():
            # Every call uses fresh EncoderParams pointing to the same result directory.
            params = EncoderParams(result_path=tmp_dir, label_config=label_config)
            return encoder.encode(dataset, params).encoded_data.examples

        # First pass: threshold as configured at build time (0.4).
        expected_loose = np.array([[1, 4], [1, 6], [0, 3], [0, 6]])
        self.assertTrue(np.array_equal(expected_loose, encode_examples()))

        # Second pass: stricter threshold set directly on the encoder instance.
        encoder.p_value_threshold = 0.05
        expected_strict = np.array([[0, 4], [0, 6], [0, 3], [0, 6]])
        self.assertTrue(np.array_equal(expected_strict, encode_examples()))

        shutil.rmtree(tmp_dir)
Ejemplo n.º 2
0
    def import_hp_setting(config_dir: str) -> Tuple[HPSetting, Label]:
        """
        Rebuild an HPSetting and its label from the files stored in a configuration directory.

        :param config_dir: directory prefix of the stored configuration; it is concatenated directly with
                           'ml_config.yaml', so it has to end with a path separator
        :return: tuple with the reconstructed HPSetting and the single Label listed in the configuration
        :raises AssertionError: if the configuration does not list exactly one label
        """
        ml_config = MLMethodConfiguration()
        ml_config.load(f'{config_dir}ml_config.yaml')

        # Resolve the ML method class by name, instantiate it without arguments and restore its stored state.
        method_class = ReflectionHandler.get_class_by_name(ml_config.ml_method, 'ml_methods/')
        ml_method = method_class()
        ml_method.load(config_dir)

        encoder = MLImport.import_encoder(ml_config, config_dir)
        preprocessing_sequence = MLImport.import_preprocessing_sequence(ml_config, config_dir)

        label_names = list(ml_config.labels_with_values.keys())
        assert len(label_names) == 1, "MLImport: Multiple labels set in a single ml_config file."

        label_name = label_names[0]
        label = Label(label_name, ml_config.labels_with_values[label_name])

        hp_setting = HPSetting(encoder=encoder,
                               encoder_params=ml_config.encoding_parameters,
                               encoder_name=ml_config.encoding_name,
                               ml_method=ml_method,
                               ml_method_name=ml_config.ml_method_name,
                               ml_params={},
                               preproc_sequence=preprocessing_sequence,
                               preproc_sequence_name=ml_config.preprocessing_sequence_name)

        return hp_setting, label
Ejemplo n.º 3
0
    def test_sequence_flattened(self):
        """Encode the flatten test dataset with a flattening OneHotEncoder and compare the first two examples
        and the feature names with the expected values."""
        tmp_dir = EnvironmentSettings.root_path + "test/tmp/onehot_seq_flat/"
        PathBuilder.build(tmp_dir)

        dataset = self.construct_test_flatten_dataset(tmp_dir)

        encoder_kwargs = {"use_positional_info": False, "distance_to_seq_middle": None, "flatten": True}
        encoder = OneHotEncoder.build_object(dataset, **encoder_kwargs)

        label_config = LabelConfiguration([Label(name="l1", values=[1, 0], positive_class="1")])
        params = EncoderParams(result_path=tmp_dir,
                               label_config=label_config,
                               pool_size=1,
                               learn_model=True,
                               model={},
                               filename="dataset.pkl")
        encoded_data = encoder.encode(dataset, params)

        self.assertIsInstance(encoded_data, SequenceDataset)

        def one_hot(hot_index):
            # 20-element vector with a single 1.0 at hot_index
            vector = [0.0] * 20
            vector[hot_index] = 1.0
            return vector

        a = one_hot(0)
        t = one_hot(16)

        examples = encoded_data.encoded_data.examples
        self.assertListEqual(list(examples[0]), a + a + a + t + t + t)
        self.assertListEqual(list(examples[1]), a + t + a + t + a + t)

        expected_feature_names = []
        for pos in range(6):
            for char in EnvironmentSettings.get_sequence_alphabet():
                expected_feature_names.append(f"{pos}_{char}")

        self.assertListEqual(list(encoded_data.encoded_data.feature_names), expected_feature_names)
        shutil.rmtree(tmp_dir)
Ejemplo n.º 4
0
    def _create_state_object(self, path):
        """
        Build a 34-repertoire dataset with two labels, run a TrainMLModelInstruction on it and return the state.

        :param path: directory used both for the generated repertoires and as the instruction result path
        :return: the object returned by TrainMLModelInstruction.run
        """
        repertoire_count = 34
        # Every repertoire holds the same three sequences; only the labels differ between repertoires.
        sequences = [["AAA", "CCC", "DDD"] for _ in range(repertoire_count)]
        label_values = {
            "l1": [1, 2] * 17,
            "l2": [0, 0, 1, 1] * 4 + [0, 1] * 9,
        }
        repertoires, metadata = RepertoireBuilder.build(sequences=sequences, path=path, labels=label_values)

        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                    params={"l1": [1, 2], "l2": [0, 1]})

        enc_params = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
        encoder = Word2VecEncoder.build_object(dataset, **enc_params)
        ml_params = {"model_selection_cv": False, "model_selection_n_folds": -1}
        hp_settings = [HPSetting(encoder, enc_params, LogisticRegression(), ml_params, [])]

        label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])

        strategy = GridSearch(hp_settings)
        first_split = SplitConfig(SplitType.RANDOM, 1, 0.5)
        second_split = SplitConfig(SplitType.RANDOM, 1, 0.5)
        instruction = TrainMLModelInstruction(dataset, strategy, hp_settings, first_split, second_split,
                                              {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                              label_config, path)

        return instruction.run(result_path=path)
    def test_run(self):
        """Fit a logistic regression on a k-mer frequency encoding of a random dataset, run
        MLApplicationInstruction with it and check that predictions.csv holds one row per repertoire."""
        base = EnvironmentSettings.tmp_test_path + "mlapplicationtest/"
        PathBuilder.build(base)

        label_probabilities = {"l1": {1: 0.5, 2: 0.5}}
        dataset = RandomDatasetGenerator.generate_repertoire_dataset(50, {5: 1}, {5: 1}, label_probabilities,
                                                                     base + 'dataset/')
        ml_method = LogisticRegression()
        encoder = KmerFreqRepertoireEncoder(NormalizationType.RELATIVE_FREQUENCY,
                                            ReadsType.UNIQUE,
                                            SequenceEncodingType.CONTINUOUS_KMER,
                                            3,
                                            scale_to_zero_mean=True,
                                            scale_to_unit_variance=True)
        label_config = LabelConfiguration([Label("l1", [1, 2])])

        encoder_params = EncoderParams(result_path=base,
                                       label_config=label_config,
                                       filename="tmp_enc_dataset.pickle",
                                       pool_size=4)
        enc_dataset = encoder.encode(dataset, encoder_params)
        ml_method.fit(enc_dataset.encoded_data, 'l1')

        encoder_description = {
            "normalization_type": "relative_frequency",
            "reads": "unique",
            "sequence_encoding": "continuous_kmer",
            "k": 3,
            "scale_to_zero_mean": True,
            "scale_to_unit_variance": True
        }
        hp_setting = HPSetting(encoder, encoder_description, ml_method, {}, [], 'enc1', 'ml1')

        # Copy the pickles from the encoding result directory into the instruction's own directory.
        instruction_dir = base + 'result/instr1/'
        PathBuilder.build(instruction_dir)
        for pickle_name in ('dict_vectorizer.pickle', 'scaler.pickle'):
            shutil.copy(base + pickle_name, instruction_dir + pickle_name)

        ml_app = MLApplicationInstruction(dataset, label_config, hp_setting, 4, "instr1", False)
        ml_app.run(base + 'result/')

        predictions_path = base + "result/instr1/predictions.csv"
        self.assertTrue(os.path.isfile(predictions_path))

        predictions = pd.read_csv(predictions_path)
        self.assertEqual(50, predictions.shape[0])

        shutil.rmtree(base)
Ejemplo n.º 6
0
    def test_encode(self):
        """Encode the test dataset with a Jaccard DistanceEncoder and check the matrix shape, three
        selected entries and both label vectors."""
        tmp_dir = EnvironmentSettings.tmp_test_path + "distance_encoder/"
        PathBuilder.build(tmp_dir)

        dataset = self.create_dataset(tmp_dir)

        encoder = DistanceEncoder.build_object(dataset,
                                               distance_metric=DistanceMetricType.JACCARD.name,
                                               attributes_to_match=["sequence_aas"],
                                               sequence_batch_size=20)
        encoder.set_context({"dataset": dataset})

        label_config = LabelConfiguration([Label("l1", [0, 1]), Label("l2", [2, 3])])
        encoder_params = EncoderParams(result_path=tmp_dir,
                                       label_config=label_config,
                                       pool_size=4,
                                       filename="dataset.pkl")
        encoded = encoder.encode(dataset, encoder_params)

        examples = encoded.encoded_data.examples
        self.assertEqual(8, examples.shape[0])
        self.assertEqual(8, examples.shape[1])

        # Selected entries of the encoded matrix that are expected to equal 1.
        for row, column in ((0, 0), (1, 1), (0, 4)):
            self.assertEqual(1, examples.iloc[row, column])

        labels = encoded.encoded_data.labels
        self.assertTrue(np.array_equal([1, 0, 1, 0, 1, 0, 1, 0], labels["l1"]))
        self.assertTrue(np.array_equal([2, 3, 2, 3, 2, 3, 3, 3], labels["l2"]))

        shutil.rmtree(tmp_dir)
Ejemplo n.º 7
0
    def test_encode(self):
        """Encode a sub-dataset with DeepRCEncoder (context set to the main dataset) and verify the example ids,
        the metadata file and the per-repertoire tsv files."""
        root = EnvironmentSettings.tmp_test_path + "deeprc_encoder/"
        PathBuilder.build(root)
        PathBuilder.build(root + "encoded_data/")

        main_dataset, sub_dataset = self.create_datasets(root)

        encoder = DeepRCEncoder.build_object(sub_dataset)
        encoder.set_context({"dataset": main_dataset})

        label_config = LabelConfiguration([Label("l1", [0, 1]), Label("l2", [2, 3])])
        encoder_params = EncoderParams(result_path=root + "encoded_data/",
                                       label_config=label_config,
                                       pool_size=4)
        encoded = encoder.encode(sub_dataset, encoder_params)

        self.assertListEqual(encoded.encoded_data.example_ids, sub_dataset.get_repertoire_ids())

        metadata_filepath = encoded.encoded_data.info["metadata_filepath"]
        self.assertTrue(os.path.isfile(metadata_filepath))

        metadata_content = pd.read_csv(encoded.encoded_data.info["metadata_filepath"], sep="\t")
        self.assertListEqual(list(metadata_content["ID"]), sub_dataset.get_repertoire_ids())

        # A tsv file is expected for every repertoire of the main dataset, not only of the sub-dataset.
        for repertoire in main_dataset.repertoires:
            rep_path = f"{root}/encoded_data/encoding/{repertoire.identifier}.tsv"
            self.assertTrue(os.path.isfile(rep_path))

            stored_sequences = pd.read_csv(rep_path, sep="\t")["amino_acid"]
            self.assertListEqual(list(stored_sequences), list(repertoire.get_sequence_aas()))

        shutil.rmtree(root)
Ejemplo n.º 8
0
    def test_generate(self):
        """Import a paired receptor dataset, encode it with TCRdistEncoder and run the TCRdistMotifDiscovery
        report on the encoded dataset."""
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path + "tcrdist_motif_discovery/")
        dataset_path = self._create_dataset(path)

        columns_to_load = ["subject", "epitope", "count", "v_a_gene", "j_a_gene", "cdr3_a_aa",
                           "v_b_gene", "j_b_gene", "cdr3_b_aa", "clone_id", "cdr3_a_nucseq", "cdr3_b_nucseq"]
        column_mapping = {
            "cdr3_a_aa": "alpha_amino_acid_sequence",
            "cdr3_b_aa": "beta_amino_acid_sequence",
            "cdr3_a_nucseq": "alpha_nucleotide_sequence",
            "cdr3_b_nucseq": "beta_nucleotide_sequence",
            "v_a_gene": "alpha_v_gene",
            "v_b_gene": "beta_v_gene",
            "j_a_gene": "alpha_j_gene",
            "j_b_gene": "beta_j_gene",
            "clone_id": "identifier"
        }
        import_params = {
            "path": dataset_path,
            "result_path": path + "dataset/",
            "separator": ",",
            "columns_to_load": columns_to_load,
            "column_mapping": column_mapping,
            "receptor_chains": "TRA_TRB",
            "region_type": "IMGT_CDR3",
            "sequence_file_size": 50000,
            "organism": "mouse"
        }
        receptor_dataset = SingleLineReceptorImport.import_dataset(import_params, 'd1')

        encoder = TCRdistEncoder(8)
        encoder_params = EncoderParams(f"{path}result/", LabelConfiguration([Label("epitope")]))
        encoded_dataset = encoder.encode(receptor_dataset, encoder_params)

        report = TCRdistMotifDiscovery(encoded_dataset, path + "report/", "report name", 8)
        report.generate_report()

        shutil.rmtree(path)
Ejemplo n.º 9
0
def encode_dataset_by_kmer_freq(path_to_dataset_directory: str, result_path: str, metadata_path: str = None):
    """
    Encodes a MiXCR repertoire dataset using KmerFrequencyEncoder and exports the design matrix to
    "<result_path>/csv_exported/".

    :param path_to_dataset_directory: path to directory containing all repertoire files with .tsv extension in MiXCR format
    :param result_path: where to store the results; a trailing "/" is optional
    :param metadata_path: csv file with columns "filename", "subject_id", "disease"; if None, the file is created by
                          generate_random_metadata, otherwise any metadata csv file passed to the function must include
                          filename and subject_id columns, and an arbitrary disease column
    :return: encoded dataset with encoded data in encoded_dataset.encoded_data.examples
    """
    if metadata_path is None:
        metadata_path = generate_random_metadata(path_to_dataset_directory, result_path)

    loader = MiXCRImport()
    dataset = loader.import_dataset({
        "is_repertoire": True,
        "path": path_to_dataset_directory,
        "metadata_file": metadata_path,
        "region_type": "IMGT_CDR3",  # import only the CDR3 region
        "number_of_processes": 4,  # number of parallel processes for loading the data
        "result_path": result_path,
        "separator": "\t",
        "columns_to_load": ["cloneCount", "allVHitsWithScore", "allJHitsWithScore", "aaSeqCDR3", "nSeqCDR3"],
        "column_mapping": {
            "cloneCount": "counts",
            "allVHitsWithScore": "v_genes",
            "allJHitsWithScore": "j_genes"
        },
    }, "mixcr_dataset")

    label_name = list(dataset.params.keys())[0]  # label that can be used for ML prediction - by default: "disease" with values True/False

    encoder = KmerFrequencyEncoder.build_object(dataset, **{
        "normalization_type": "relative_frequency",  # encode repertoire by the relative frequency of k-mers in repertoire
        "reads": "unique",  # count each sequence only once, do not use clonal count
        "k": 2,  # k-mer length
        "sequence_encoding": "continuous_kmer"  # split each sequence in repertoire to overlapping k-mers
    })
    encoder_params = EncoderParams(result_path=result_path,
                                   label_config=LabelConfiguration([Label(label_name, dataset.params[label_name])]))

    encoded_dataset = DataEncoder.run(DataEncoderParams(dataset, encoder, encoder_params, False))

    # The previous check compared result_path[:-1] (everything except the last character) with '/',
    # so a path already ending in '/' received a second slash. Test the actual suffix instead.
    export_root = result_path if result_path.endswith('/') else result_path + '/'
    dataset_exporter = DesignMatrixExporter(dataset=encoded_dataset,
                                            result_path=f"{export_root}csv_exported/")
    dataset_exporter.generate_report()

    return encoded_dataset
Ejemplo n.º 10
0
    def test_encode(self):

        file_content = """complex.id	Gene	CDR3	V	J	Species	MHC A	MHC B	MHC class	Epitope	Epitope gene	Epitope species	Reference	Method	Meta	CDR3fix	Score
3050	TRB	CASSPPRVYSNGAGLAGVGWRNEQFF	TRBV5-4*01	TRBJ2-1*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"}	0
15760	TRB	CASSWTWDAATLWGQGALGGANVLTF	TRBV5-5*01	TRBJ2-6*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"}	0
3050	TRA	CAAIYESRGSTLGRLYF	TRAV13-1*01	TRAJ18*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"}	0
15760	TRA	CALRLNNQGGKLIF	TRAV9-2*01	TRAJ23*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"}	0
3051	TRB	CASSPPRVYSNGAGLAGVGWRNEQFF	TRBV5-4*01	TRBJ2-1*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"}	0
15761	TRB	CASSWTWDAATLWGQGALGGANVLTF	TRBV5-5*01	TRBJ2-6*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"}	0
3051	TRA	CAAIYESRGSTLGRLYF	TRAV13-1*01	TRAJ18*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"}	0
15761	TRA	CALRLNNQGGKLIF	TRAV9-2*01	TRAJ23*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"}	0
        """
        path = PathBuilder.build(EnvironmentSettings.root_path +
                                 "test/tmp/trcdist_encoder/")

        with open(path + "receptors.tsv", "w") as file:
            file.writelines(file_content)

        params = DefaultParamsLoader.load(
            EnvironmentSettings.default_params_path + "datasets/", "vdjdb")
        params["is_repertoire"] = False
        params["paired"] = True
        params["result_path"] = path
        params["path"] = path
        params["sequence_file_size"] = 1
        params["receptor_chains"] = "TRA_TRB"
        params['organism'] = 'human'

        dataset = VDJdbImport.import_dataset(params, "vdjdb_dataset")

        encoder = TCRdistEncoder.build_object(dataset, **{"cores": 2})
        encoded_dataset = encoder.encode(
            dataset,
            EncoderParams(f"{path}result/",
                          LabelConfiguration([Label("epitope")])))

        self.assertTrue(encoded_dataset.encoded_data.examples.shape[0]
                        == encoded_dataset.encoded_data.examples.shape[1]
                        and encoded_dataset.encoded_data.examples.shape[0]
                        == dataset.get_example_count())

        shutil.rmtree(path)
Ejemplo n.º 11
0
    def test_encode(self):
        """Encode four small repertoires with SequenceCountEncoder and check the four encoded examples and
        that "III" is among the feature names."""
        tmp_dir = EnvironmentSettings.tmp_test_path + "count_encoder/"
        PathBuilder.build(tmp_dir)

        sequences_per_repertoire = [
            ["GGG", "III", "LLL", "MMM"],
            ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
            ["CCC", "FFF", "MMM"],
            ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"],
        ]
        repertoires, metadata = RepertoireBuilder.build(sequences_per_repertoire,
                                                        labels={"l1": [True, True, False, False]},
                                                        path=tmp_dir)

        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, identifier="1")

        encoder = SequenceCountEncoder.build_object(dataset,
                                                    comparison_attributes=["sequence_aas"],
                                                    p_value_threshold=0.4,
                                                    sequence_batch_size=4)

        label_config = LabelConfiguration([Label("l1", [True, False], positive_class=True)])
        encoder_params = EncoderParams(result_path=tmp_dir, label_config=label_config)
        encoded_dataset = encoder.encode(dataset, encoder_params)

        examples = encoded_dataset.encoded_data.examples

        # Expected value of each example, in repertoire order.
        for index, expected in enumerate([1, 1, 0, 0]):
            self.assertTrue(examples[index] == expected)

        self.assertTrue("III" in encoded_dataset.encoded_data.feature_names)

        shutil.rmtree(tmp_dir)
Ejemplo n.º 12
0
    def add_label(self,
                  label: str,
                  values: list = None,
                  auxiliary_labels: list = None,
                  positive_class=None):
        """
        Register a label under the given name, replacing any label previously stored under that name.

        :param label: name of the label; used as the key in self._labels
        :param values: values of the label; a list copy is stored if truthy, otherwise None is stored
        :param auxiliary_labels: passed through unchanged to the Label constructor
        :param positive_class: if not None, it is validated against values via
                               ParameterValidator.assert_in_valid_list before the label is stored
        """
        stored_values = None
        if values:
            stored_values = list(values)

        previous = self._labels[label] if label in self._labels else None
        if previous is not None and len(previous) > 0:
            # Overwriting is allowed, but the caller is warned about it.
            message = "".join(["Label ", label, " has already been set. Overriding existing values..."])
            warnings.warn(message, Warning)

        if positive_class is not None:
            ParameterValidator.assert_in_valid_list(positive_class, values, Label.__name__, 'positive_class')

        self._labels[label] = Label(label, stored_values, auxiliary_labels, positive_class)
Ejemplo n.º 13
0
    def test_generate(self):
        """Fit a small ReceptorCNN on a random receptor dataset and check that KernelSequenceLogo writes
        every expected kernel and weight file."""
        path = PathBuilder.build(f"{EnvironmentSettings.tmp_test_path}kernel_sequence_logo/")

        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            receptor_count=500,
            chain_1_length_probabilities={4: 1},
            chain_2_length_probabilities={4: 1},
            labels={"CMV": {True: 0.5, False: 0.5}},
            path=path + "dataset/")

        encoder = OneHotReceptorEncoder(True, 1, False, "enc1")
        encoder_params = EncoderParams(path + "result/", LabelConfiguration([Label("CMV", [True, False])]))
        enc_dataset = encoder.encode(dataset, encoder_params)

        cnn = ReceptorCNN(kernel_count=2,
                          kernel_size=[3],
                          positional_channels=3,
                          sequence_type="amino_acid",
                          device="cpu",
                          number_of_threads=4,
                          random_seed=1,
                          learning_rate=0.01,
                          iteration_count=10,
                          l1_weight_decay=0.1,
                          evaluate_at=5,
                          batch_size=100,
                          training_percentage=0.8,
                          l2_weight_decay=0.0)
        cnn.fit(enc_dataset.encoded_data, "CMV")

        report = KernelSequenceLogo(method=cnn, result_path=path + "logos/")
        report.generate_report()

        # One png and one csv per chain and kernel index (kernel size 3, two kernels).
        for extension in ("png", "csv"):
            for chain in ("alpha", "beta"):
                for kernel_index in (1, 2):
                    self.assertTrue(os.path.isfile(f"{path}logos/{chain}_kernel_3_{kernel_index}.{extension}"))

        for extension in ("csv", "html"):
            self.assertTrue(os.path.isfile(f"{path}logos/fully_connected_layer_weights.{extension}"))

        shutil.rmtree(path)
    def test_fit(self):
        """Train AtchleyKmerMILClassifier on a random dataset, check predictions and probabilities, then
        store and reload the model and compare its attributes with the original."""
        tmp_dir = PathBuilder.build(EnvironmentSettings.tmp_test_path + "kmermil/")

        repertoire_count = 10
        dataset = RandomDatasetGenerator.generate_repertoire_dataset(
            repertoire_count=repertoire_count,
            sequence_count_probabilities={2: 1},
            sequence_length_probabilities={4: 1},
            labels={"l1": {True: 0.5, False: 0.5}},
            path=tmp_dir + "dataset/")

        encoder = AtchleyKmerEncoder(2, 1, 1, 'relative_abundance', False)
        encoder_params = EncoderParams(tmp_dir + "result/", LabelConfiguration([Label("l1", [True, False])]))
        enc_dataset = encoder.encode(dataset, encoder_params)

        # Both the trained and the reloaded classifier are built from the same arguments.
        classifier_kwargs = {
            "iteration_count": 10,
            "threshold": -0.0001,
            "evaluate_at": 2,
            "use_early_stopping": False,
            "random_seed": 1,
            "learning_rate": 0.01,
            "zero_abundance_weight_init": True,
            "number_of_threads": 8,
        }

        classifier = AtchleyKmerMILClassifier(**classifier_kwargs)
        classifier.fit(enc_dataset.encoded_data, "l1")

        predictions = classifier.predict(enc_dataset.encoded_data, "l1")
        self.assertEqual(repertoire_count, len(predictions["l1"]))

        bool_prediction_count = sum(1 for pred in predictions["l1"] if isinstance(pred, bool))
        self.assertEqual(repertoire_count, bool_prediction_count)

        predictions_proba = classifier.predict_proba(enc_dataset.encoded_data, "l1")
        self.assertEqual(repertoire_count, np.rint(np.sum(predictions_proba["l1"])))
        self.assertEqual(repertoire_count, predictions_proba["l1"].shape[0])

        classifier.store(tmp_dir + "model_storage/", feature_names=enc_dataset.encoded_data.feature_names)

        reloaded = AtchleyKmerMILClassifier(**classifier_kwargs)
        reloaded.load(tmp_dir + "model_storage/")

        # vars() returns the live attribute dict, so these deletions also remove the attribute from the
        # objects themselves; the get_model comparison at the end therefore runs without it as well.
        reloaded_vars = vars(reloaded)
        del reloaded_vars["logistic_regression"]
        original_vars = vars(classifier)
        del original_vars["logistic_regression"]

        for name, value in original_vars.items():
            if isinstance(value, np.ndarray):
                continue
            self.assertEqual(value, reloaded_vars[name])

        model = classifier.get_model("l1")
        self.assertEqual(vars(classifier), model)

        shutil.rmtree(tmp_dir)
Ejemplo n.º 15
0
    def test_find_label_associated_sequence_p_values(self):
        """Check the p-values SequenceFilterHelper reports for hand-built comparison data.

        Four repertoires are created with label ``l1`` set to True, True, False, False.
        The comparison data is filled manually with five batches holding nine
        sequences in total, and the returned p-values are compared against nine
        expected values.
        """
        path = EnvironmentSettings.tmp_test_path + "comparison_data_find_label_assocseqpvalues/"
        PathBuilder.build(path)
        # Registered up front so the temporary directory is removed even when an assertion fails.
        self.addCleanup(shutil.rmtree, path)

        label_values = [True, True, False, False]
        subject_ids = ["rep_0", "rep_1", "rep_2", "rep_3"]
        repertoires = [Repertoire.build_from_sequence_objects([ReceptorSequence()], path,
                                                              {"l1": val, "subject_id": subject_id})
                       for val, subject_id in zip(label_values, subject_ids)]

        # Maps each repertoire identifier to its position in the repertoire list.
        col_name_index = {repertoire.identifier: index for index, repertoire in enumerate(repertoires)}

        comparison_data = ComparisonData(repertoire_ids=[repertoire.identifier for repertoire in repertoires],
                                         comparison_attributes=["sequence_aas"], sequence_batch_size=4, path=path)

        # (matrix, items) per batch; presumably one matrix row per item and one
        # column per repertoire -- confirm against ComparisonDataBatch.
        batch_contents = [
            (np.array([[1., 0., 0., 0.],
                       [1., 1., 0., 0.]]), [('GGG',), ('III',)]),
            (np.array([[1., 1., 0., 1.],
                       [1., 1., 1., 1.]]), [('LLL',), ('MMM',)]),
            (np.array([[0., 1., 0., 0.],
                       [0., 1., 0., 1.]]), [('DDD',), ('EEE',)]),
            (np.array([[0., 1., 1., 1.],
                       [0., 0., 1., 1.]]), [('FFF',), ('CCC',)]),
            (np.array([[0., 0., 0., 1.]]), [('AAA',)]),
        ]
        comparison_data.batches = [
            ComparisonDataBatch(matrix=matrix, items=items, repertoire_index_mapping=col_name_index,
                                path=path, identifier=identifier)
            for identifier, (matrix, items) in enumerate(batch_contents)
        ]

        p_values = SequenceFilterHelper.find_label_associated_sequence_p_values(
            comparison_data, repertoires, Label('l1', [True, False], positive_class=True))

        expected_p_values = [SequenceFilterHelper.INVALID_P_VALUE, 0.1666666666666667, 0.5000000000000001, 1.,
                             SequenceFilterHelper.INVALID_P_VALUE, 0.8333333333333331, 1., 1., 2]

        # The computed values are shown only on failure instead of being printed on every run.
        self.assertTrue(np.allclose(expected_p_values, p_values, equal_nan=True),
                        msg=f"Unexpected p-values: {p_values}")
Ejemplo n.º 16
0
    def test_encode(self):
        """Encode a small random repertoire dataset with AtchleyKmerEncoder and check the resulting examples."""
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path + "atchley_kmer_encoding/")
        dataset = RandomDatasetGenerator.generate_repertoire_dataset(
            3, {1: 1}, {4: 1}, {"l1": {True: 0.4, False: 0.6}}, path + "dataset/")

        encoder_settings = {
            "k": 2,
            "skip_first_n_aa": 1,
            "skip_last_n_aa": 1,
            "abundance": "RELATIVE_ABUNDANCE",
            "normalize_all_features": False,
        }
        encoder = AtchleyKmerEncoder.build_object(dataset, **encoder_settings)

        encoder_params = EncoderParams(path + "result/", LabelConfiguration(labels=[Label("l1")]))
        examples = encoder.encode(dataset, encoder_params).encoded_data.examples

        # Expected shape of the encoded examples, and a zero in the last row of the first example.
        self.assertEqual((3, 11, 3), examples.shape)
        self.assertEqual(0., examples[0, -1, 0])

        shutil.rmtree(path)
    def test_run(self):
        """Run TrainMLModelInstruction with two hyperparameter settings and check the returned state."""
        path = EnvironmentSettings.tmp_test_path + "hpoptimproc/"
        PathBuilder.build(path)

        # 34 identical repertoires; l1 alternates 1/2, l2 is four 0,0,1,1 groups followed by alternating 0/1.
        repertoire_count = 34
        l1_values = [1, 2] * 17
        l2_values = [0, 0, 1, 1] * 4 + [0, 1] * 9

        repertoires, metadata = RepertoireBuilder.build(
            sequences=[["AAA", "CCC", "DDD"] for _ in range(repertoire_count)],
            path=path,
            labels={"l1": l1_values, "l2": l2_values})

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    params={"l1": [1, 2], "l2": [0, 1]})

        # The two encoder configurations differ only in the vector size.
        enc1 = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
        enc2 = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 6}
        no_model_selection = {"model_selection_cv": False, "model_selection_n_folds": -1}

        # Each setting gets its own copy of the ML parameter dict.
        first_setting = HPSetting(Word2VecEncoder.build_object(dataset, **enc1), enc1,
                                  LogisticRegression(), dict(no_model_selection), [])
        second_setting = HPSetting(Word2VecEncoder.build_object(dataset, **enc2), enc2,
                                   SVM(), dict(no_model_selection),
                                   [ClonesPerRepertoireFilter(lower_limit=-1, upper_limit=1000)])
        hp_settings = [first_setting, second_setting]

        report = SequenceLengthDistribution()
        label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])

        hp_strategy = GridSearch(hp_settings)
        # Assessment and selection use equally configured, but separate, split configs
        # that share the same report instance.
        assessment, selection = [
            SplitConfig(SplitType.RANDOM, 1, 0.5,
                        reports=ReportConfig(data_splits={"seqlen": report}))
            for _ in range(2)
        ]

        process = TrainMLModelInstruction(dataset, hp_strategy, hp_settings, assessment, selection,
                                          {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                          label_config, path)

        state = process.run(result_path=path)

        self.assertTrue(isinstance(state, TrainMLModelState))
        self.assertEqual(1, len(state.assessment_states))
        for label_name in ("l1", "l2"):
            self.assertTrue(label_name in state.assessment_states[0].label_states)

        shutil.rmtree(path)
Ejemplo n.º 18
0
    def test_generate(self):
        """Generate a CVFeaturePerformance report, first on an incomplete state and then on a populated one."""
        path = EnvironmentSettings.tmp_test_path + "cv_feature_performance/"
        label_name = "CMV"

        def make_setting(p_value_threshold, classifier_args, **names):
            # Every setting gets its own encoder and classifier instance.
            return HPSetting(encoder_params={"p_value_threshold": p_value_threshold},
                             encoder=SequenceAbundanceEncoder([], 0, 0, 0),
                             preproc_sequence=[],
                             ml_method=ProbabilisticBinaryClassifier(*classifier_args),
                             ml_params={},
                             **names)

        def random_item(setting):
            # Performance is a random accuracy drawn from [0.5, 1].
            return HPItem(performance={'accuracy': random.uniform(0.5, 1)},
                          hp_setting=setting)

        hp_settings = [
            make_setting(0.001, (10, 0.1), encoder_name="e1", ml_method_name="ml1"),
            make_setting(0.01, (10, 0.1), encoder_name="e2", ml_method_name="ml1"),
            make_setting(0.01, (10, 0.01)),
        ]

        state = TrainMLModelState(
            assessment=SplitConfig(split_count=5, split_strategy=SplitType.K_FOLD),
            selection=SplitConfig(split_count=10, split_strategy=SplitType.K_FOLD),
            optimization_metric=Metric.ACCURACY,
            label_configuration=LabelConfiguration(
                labels=[Label(name=label_name, values=[True, False])]),
            hp_settings=hp_settings,
            dataset=None,
            hp_strategy=None,
            metrics=None)

        report = CVFeaturePerformance("p_value_threshold", state, path,
                                      is_feature_axis_categorical=True, name="report1")

        # With the initial state the report is expected to emit a RuntimeWarning.
        with self.assertWarns(RuntimeWarning):
            report.generate_report()

        # Keep only the first two settings and fill in assessment/selection results for them.
        state.hp_settings = state.hp_settings[:2]
        state.assessment_states = [
            HPAssessmentState(split_index, None, None, None, state.label_configuration)
            for split_index in range(state.assessment.split_count)
        ]

        for assessment_state in state.assessment_states:
            label_state = HPLabelState(label_name, [])
            assessment_state.label_states[label_name] = label_state

            label_state.assessment_items = {
                setting.get_key(): random_item(setting)
                for setting in state.hp_settings
            }

            selection_state = HPSelectionState([], [], "", GridSearch(state.hp_settings))
            label_state.selection_state = selection_state
            selection_state.hp_items = {
                str(setting): [random_item(setting)
                               for _ in range(state.selection.split_count)]
                for setting in state.hp_settings
            }

        report.state = state

        report_result = report.generate_report()

        self.assertTrue(isinstance(report_result, ReportResult))
        self.assertEqual(2, len(report_result.output_tables))
        self.assertEqual(1, len(report_result.output_figures))

        # Every reported figure and table must exist on disk.
        outputs = (report_result.output_figures[0],
                   report_result.output_tables[0],
                   report_result.output_tables[1])
        for output in outputs:
            self.assertTrue(os.path.isfile(output.path))

        shutil.rmtree(path)
Ejemplo n.º 19
0
    def test_fit(self):
        """Train a ReceptorCNN on random receptors, then check its predictions and a store/load round trip."""
        label_name = "CMV"
        receptor_count = 500

        path = PathBuilder.build(EnvironmentSettings.tmp_test_path + "cnn/")
        storage_path = path + "model_storage/"

        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            receptor_count=receptor_count,
            chain_1_length_probabilities={4: 1},
            chain_2_length_probabilities={4: 1},
            labels={label_name: {True: 0.5, False: 0.5}},
            path=path + "dataset/")

        encoder = OneHotReceptorEncoder(True, 1, False, "enc1")
        encoder_params = EncoderParams(
            path + "result/",
            LabelConfiguration([Label(label_name, [True, False])]))
        encoded_data = encoder.encode(dataset, encoder_params).encoded_data

        cnn_settings = {
            "kernel_count": 2,
            "kernel_size": [3],
            "positional_channels": 3,
            "sequence_type": "amino_acid",
            "device": "cpu",
            "number_of_threads": 4,
            "random_seed": 1,
            "learning_rate": 0.01,
            "iteration_count": 10,
            "l1_weight_decay": 0.1,
            "evaluate_at": 5,
            "batch_size": 100,
            "training_percentage": 0.8,
            "l2_weight_decay": 0.0,
        }
        cnn = ReceptorCNN(**cnn_settings)
        cnn.fit(encoded_data=encoded_data, label_name=label_name)

        # One prediction per receptor, each of them a plain Python bool.
        predicted = cnn.predict(encoded_data, label_name)[label_name]
        self.assertEqual(receptor_count, len(predicted))
        bool_prediction_count = sum(
            1 for prediction in predicted if isinstance(prediction, bool))
        self.assertEqual(receptor_count, bool_prediction_count)

        # The rounded sum of all returned probabilities must equal the number of receptors.
        probabilities = cnn.predict_proba(encoded_data, label_name)[label_name]
        self.assertEqual(receptor_count, np.rint(np.sum(probabilities)))
        self.assertEqual(receptor_count, probabilities.shape[0])

        cnn.store(storage_path)

        # The reloaded network is created with only the sequence type; the rest comes from load().
        reloaded = ReceptorCNN(sequence_type="amino_acid")
        reloaded.load(storage_path)

        # vars() returns the live instance __dict__, so these deletions also
        # drop the attribute from the network objects themselves.
        reloaded_state = vars(reloaded)
        del reloaded_state["CNN"]
        trained_state = vars(cnn)
        del trained_state["CNN"]

        # numpy arrays are skipped in the attribute-by-attribute comparison.
        for attribute, trained_value in trained_state.items():
            if isinstance(trained_value, np.ndarray):
                continue
            self.assertEqual(trained_value, reloaded_state[attribute])

        model = cnn.get_model([label_name])
        self.assertEqual(vars(cnn), model)

        shutil.rmtree(path)