Exemple #1
0
    def test(self):

        is_installed = True

        try:
            from immuneML.ml_methods.DeepRC import DeepRC
            from deeprc.deeprc_binary.architectures import DeepRC as DeepRCInternal
        except Exception as e:
            is_installed = False

        if is_installed:

            logging.warning("DeepRC test is temporarily excluded")
            path = EnvironmentSettings.tmp_test_path / "deeprc_classifier"
            data_path = path / "encoded_data"
            result_path = path / "result"
            PathBuilder.build(data_path)
            PathBuilder.build(result_path)

            encoded_data = self.make_encoded_data(data_path)
            y = {"status": encoded_data.labels["status"]}

            params = DefaultParamsLoader.load("ml_methods/", "DeepRC")

            classifier = DeepRC(**params)

            # Prepare 'dummy training' for classifier, to test other functionalities
            classifier.result_path = path
            classifier.pytorch_device = torch.device("cpu")
            classifier.training_function = self.dummy_training_function

            train_indices, val_indices = classifier._get_train_val_indices(10, y['status'])
            self.assertEqual(len(train_indices) + len(val_indices), 10)
            self.assertEqual(set(list(train_indices) + list(val_indices)), set(range(10)))

            # test if 'fit' function saves models
            classifier.fit(encoded_data, "status")

            self.assertListEqual(classifier.get_classes(), ["A", "B"])
            self.assertIsInstance(classifier.model, DeepRCInternal)

            # Test storing and loading of models
            self.assertFalse(classifier.check_if_exists(result_path))
            classifier.store(result_path, feature_names=None)
            self.assertTrue(classifier.check_if_exists(result_path))

            second_classifier = DeepRC(**params)
            second_classifier.load(result_path)

            self.assertIsInstance(second_classifier.model, DeepRCInternal)

            shutil.rmtree(path)

            # test get package info
            params = DefaultParamsLoader.load("ml_methods/", "DeepRC")
            classifier = DeepRC(**params)
            classifier.get_package_info()

        else:
            logging.warning("DeepRC is not installed, skipping test. To install DeepRC, install the requirements from requirements_DeepRC.txt.")
Exemple #2
0
    def test_load_repertoire(self):
        """Import the dummy IGoR files as a repertoire dataset and check what was loaded.

        Uses the default IGoR import parameters plus the path settings below and
        expects two repertoires with one sequence each, a known amino acid
        sequence and count in the first repertoire, and no chain information.
        """
        path = EnvironmentSettings.root_path / "test/tmp/io_igor_load/"

        PathBuilder.build(path)
        try:
            self.write_dummy_files(path, True)

            params = DefaultParamsLoader.load(
                EnvironmentSettings.default_params_path / "datasets/", "igor")
            params["is_repertoire"] = True
            params["result_path"] = path
            params["path"] = path
            params["metadata_file"] = path / "metadata.csv"

            dataset = IGoRImport.import_dataset(params, "igor_repertoire_dataset")

            self.assertEqual(2, dataset.get_example_count())
            self.assertEqual(len(dataset.repertoires[0].sequences), 1)
            self.assertEqual(len(dataset.repertoires[1].sequences), 1)

            self.assertEqual(
                dataset.repertoires[0].sequences[0].amino_acid_sequence,
                "ARDRWSTPVLRYFDWWTPPYYYYMDV")

            self.assertListEqual(list(dataset.repertoires[0].get_counts()), [1])
            self.assertIsNone(dataset.repertoires[0].get_chains())
        finally:
            # Remove the temporary directory even when an assertion fails, so
            # leftover files cannot influence another test using the same path.
            shutil.rmtree(path)
Exemple #3
0
    def test_import_repertoire_dataset(self):
        """Import the dummy 10x Genomics files as a repertoire dataset and verify its content.

        Expects two repertoires (2 and 4 sequences), a known first amino acid
        sequence, and alpha/beta chains with counts 2 and 4 in the first
        repertoire.
        """
        path = EnvironmentSettings.root_path / "test/tmp/io_10xGenomics/"
        PathBuilder.build(path)
        try:
            self.create_dumy_dataset(path, add_metadata=True)

            params = DefaultParamsLoader.load(
                EnvironmentSettings.default_params_path / "datasets/",
                "tenx_genomics")
            params["is_repertoire"] = True
            params["result_path"] = path
            params["path"] = path
            params["metadata_file"] = path / "metadata.csv"

            dataset = TenxGenomicsImport.import_dataset(params,
                                                        "tenx_dataset_repertoire")

            self.assertEqual(2, dataset.get_example_count())

            self.assertEqual(len(dataset.repertoires[0].sequences), 2)
            self.assertEqual(len(dataset.repertoires[1].sequences), 4)

            self.assertEqual(
                dataset.repertoires[0].sequences[0].amino_acid_sequence,
                "ALSGTGGYKVV")
            self.assertListEqual([Chain.ALPHA, Chain.BETA],
                                 list(dataset.repertoires[0].get_chains()))
            self.assertListEqual([2, 4], list(dataset.repertoires[0].get_counts()))
        finally:
            # Always clean up: a failed assertion must not leave files behind
            # in a directory that another test builds its fixture in.
            shutil.rmtree(path)
Exemple #4
0
    def test_import_receptor_dataset(self):
        """Import the dummy 10x Genomics files as a paired TRA/TRB receptor dataset.

        With a sequence file size of 1, two examples spread over two files are
        expected; every receptor in the batches returned by ``get_data(1)`` must
        carry one of the known alpha and beta amino acid sequences.
        """
        path = EnvironmentSettings.root_path / "test/tmp/io_10xGenomics/"
        PathBuilder.build(path)
        try:
            self.create_dumy_dataset(path, add_metadata=False)

            params = DefaultParamsLoader.load(
                EnvironmentSettings.default_params_path / "datasets/",
                "tenx_genomics")
            params["is_repertoire"] = False
            params["paired"] = True
            params["result_path"] = path
            params["path"] = path
            params["sequence_file_size"] = 1
            params["receptor_chains"] = "TRA_TRB"

            dataset = TenxGenomicsImport.import_dataset(params,
                                                        "tenx_dataset_receptor")

            self.assertEqual(2, dataset.get_example_count())
            self.assertEqual(2, len(dataset.get_filenames()))

            data = dataset.get_data(1)
            for receptor in data:
                # assertIn reports the offending sequence when the check fails
                self.assertIn(receptor.alpha.amino_acid_sequence,
                              ["ALSGTGGYKVV", "AIVGNTGKLI"])
                self.assertIn(receptor.beta.amino_acid_sequence,
                              ["ASSLYGGPEVF", "ASSFATNSDYT"])
        finally:
            # Always clean up: a failed assertion must not leave files behind
            # in a directory that another test builds its fixture in.
            shutil.rmtree(path)
Exemple #5
0
    def test_load_repertoire_with_stop_codon(self):
        """Import the dummy IGoR repertoires with ``import_with_stop_codon`` enabled.

        Both repertoires are expected to hold two sequences, and the first
        sequence of the first repertoire contains a stop codon (``*``).
        """
        path = EnvironmentSettings.root_path / "test/tmp/io_igor_load/"

        PathBuilder.build(path)
        self.write_dummy_files(path, True)

        defaults_dir = EnvironmentSettings.default_params_path / "datasets/"
        params = DefaultParamsLoader.load(defaults_dir, "igor")
        params["is_repertoire"] = True
        params["result_path"] = path
        params["path"] = path
        params["import_with_stop_codon"] = True
        params["metadata_file"] = path / "metadata.csv"

        dataset_stop_codons = IGoRImport.import_dataset(params, "igor_dataset_stop")

        self.assertEqual(2, dataset_stop_codons.get_example_count())
        for repertoire_index in (0, 1):
            self.assertEqual(len(dataset_stop_codons.repertoires[repertoire_index].sequences), 2)

        first_sequence = dataset_stop_codons.repertoires[0].sequences[0]
        self.assertEqual(first_sequence.amino_acid_sequence, "ARVNRHIVVVTAIMTG*NWFDP")

        shutil.rmtree(path)
Exemple #6
0
    def test_load_sequence_dataset(self):
        """Import the dummy IGoR files as an unpaired sequence dataset (stop codons kept).

        Four sequences are expected; their nucleotide sequences are compared
        with the known values regardless of order.
        """
        path = EnvironmentSettings.root_path / "test/tmp/io_igor_load/"

        PathBuilder.build(path)
        self.write_dummy_files(path, False)

        defaults_dir = EnvironmentSettings.default_params_path / "datasets/"
        params = DefaultParamsLoader.load(defaults_dir, "igor")
        params["is_repertoire"] = False
        params["paired"] = False
        params["result_path"] = path
        params["path"] = path
        params["import_with_stop_codon"] = True

        dataset = IGoRImport.import_dataset(params, "igor_seq_dataset")

        seqs = list(dataset.get_data())

        self.assertEqual(4, dataset.get_example_count())

        expected_nucleotide_sequences = sorted([
            "GCGAGACGTGTCTAGGGAGGATATTGTAGTAGTACCAGCTGCTATGACGGGCGGTCCGGTAGTACTACTTTGACTAC",
            "GCGAGAGGCTTCCATGGAACTACAGTAACTACGTTTGTAGGCTGTAGTACTACATGGACGTC",
            "GCGAGAGTTAATCGGCATATTGTGGTGGTGACTGCTATTATGACCGGGTAAAACTGGTTCGACCCC",
            "GCGAGAGATAGGTGGTCAACCCCAGTATTACGATATTTTGACTGGTGGACCCCGCCCTACTACTACTACATGGACGTC"
        ])
        imported_nucleotide_sequences = sorted(seq.nucleotide_sequence for seq in seqs)
        self.assertListEqual(expected_nucleotide_sequences, imported_nucleotide_sequences)

        shutil.rmtree(path)
Exemple #7
0
    def test_alternative_repertoire_import(self):
        path = EnvironmentSettings.root_path / "test/tmp/immunoseq_alternative/"

        rep1text = """sample_name	productive_frequency	templates	amino_acid	rearrangement	v_resolved	d_resolved	j_resolved
LivMet_45	0.014838454958215437	451	CASSLLGLGSEQYF	CTGCTGTCGGCTGCTCCCTCCCAGACATCTGTGTACTTCTGTGCCAGCAGTTTACTCGGGTTAGGGAGCGAGCAGTACTTCGGGCCG	TCRBV06	TCRBD02-01*02	TCRBJ02-07*01
LivMet_45	0.0106928999144568	325	CASSPGQGEGYEQYF	CACGCCCTGCAGCCAGAAGACTCAGCCCTGTATCTCTGCGCCAGCAGCCCGGGACAGGGGGAGGGCTACGAGCAGTACTTCGGGCCG	TCRBV04-01*01	TCRBD01-01*01	TCRBJ02-07*01
LivMet_45	0.0074356780943607296	226	CASSAGETQYF	ACTCTGACGATCCAGCGCACAGAGCAGCGGGACTCGGCCATGTATCGCTGTGCCAGCAGCGCAGGCGAGACCCAGTACTTCGGGCCA	TCRBV07-06*01	TCRBD01-01*01	TCRBJ02-05*01
LivMet_45	0.0072053694808185825	219	CASSGTGEKGEQYF	ATCCGGTCCACAAAGCTGGAGGACTCAGCCATGTACTTCTGTGCCAGCAGTGGGACAGGGGAGAAGGGCGAGCAGTACTTCGGGCCG	TCRBV02-01*01	TCRBD01-01*01	TCRBJ02-07*01
"""
        PathBuilder.build(path)

        with open(path / "rep1.tsv", "w") as file:
            file.writelines(rep1text)

        with open(path / "metadata.csv", "w") as file:
            file.writelines("""filename,chain,subject_id
rep1.tsv,TRB,1234a""")

        params = DefaultParamsLoader.load(
            EnvironmentSettings.default_params_path / "datasets/",
            "ImmunoSEQRearrangement")
        params["is_repertoire"] = True
        params["result_path"] = path
        params["metadata_file"] = path / "metadata.csv"
        params["path"] = path

        dataset = ImmunoSEQRearrangementImport.import_dataset(
            params, "alternative")

        self.assertEqual(1, dataset.get_example_count())

        shutil.rmtree(path)
    def _parse_split_config(self, instruction_key, instruction: dict, split_key: str, symbol_table: SymbolTable, settings_count: int) -> SplitConfig:
        """Build a SplitConfig from the settings stored under ``instruction[split_key]``.

        Default SplitConfig parameters are merged underneath the user-provided
        ones and written back to ``instruction[split_key]``. A training
        percentage is only read for the RANDOM strategy; otherwise -1 is used.

        Raises:
            ValueError: the RANDOM strategy uses a training percentage of 1
                while more than one setting was specified.
            KeyError: a key lookup failed while parsing; re-raised with a
                message naming the key and ``split_key``.
        """
        try:
            default_params = DefaultParamsLoader.load("instructions/", SplitConfig.__name__)
            report_config_input = self._prepare_report_config(instruction_key, instruction, split_key, symbol_table)
            split_spec = {**default_params, **instruction[split_key]}
            instruction[split_key] = split_spec

            split_strategy = SplitType[split_spec["split_strategy"].upper()]
            if split_strategy == SplitType.RANDOM:
                training_percentage = float(split_spec["training_percentage"])
            else:
                training_percentage = -1

            trains_on_everything = split_strategy == SplitType.RANDOM and training_percentage == 1
            if trains_on_everything and settings_count > 1:
                raise ValueError(f"{TrainMLModelParser.__name__}: all data under {instruction_key}/{split_key} was specified to be used for "
                                 f"training, but {settings_count} settings were specified for evaluation. Please define a test/validation set by "
                                 f"reducing the training percentage (e.g., to 0.7) or use only one hyperparameter setting to run the analysis.")

            # Built in the same order in which the constructor arguments are listed
            split_count = int(split_spec["split_count"])
            reports = ReportConfig(**report_config_input)
            manual_config = None
            if "manual_config" in split_spec:
                manual_config = ManualSplitConfig(**split_spec["manual_config"])
            leave_one_out_config = None
            if "leave_one_out_config" in split_spec:
                leave_one_out_config = LeaveOneOutConfig(**split_spec["leave_one_out_config"])

            return SplitConfig(split_strategy=split_strategy,
                               split_count=split_count,
                               training_percentage=training_percentage,
                               reports=reports,
                               manual_config=manual_config,
                               leave_one_out_config=leave_one_out_config)

        except KeyError as key_error:
            raise KeyError(f"{TrainMLModelParser.__name__}: parameter {key_error.args[0]} was not defined under {split_key}.")
    def prepare_reference(reference_params: dict, location: str, paired: bool):
        """Validate a reference specification and import the items it points to.

        ``reference_params`` must contain the keys "format" and "params". The
        file under ``params["path"]`` has to exist, ``is_repertoire`` must be
        False and ``paired`` must equal the ``paired`` argument; both are filled
        in on the params dict when absent. The import class is resolved from
        the format name and the format's default parameters are merged
        underneath the user-provided ones.

        Returns:
            whatever ``ImportHelper.import_items`` returns for the reference file.
        """
        ParameterValidator.assert_keys(list(reference_params.keys()), ["format", "params"], location,
                                       "reference")

        seq_import_params = reference_params.get("params", {})
        reference_file = seq_import_params["path"]

        assert os.path.isfile(reference_file), f"{location}: the file {seq_import_params['path']} does not exist. " \
                                               f"Specify the correct path under reference."

        # setdefault stores the required value when the key is missing, which then passes the check
        is_repertoire = seq_import_params.setdefault("is_repertoire", False)
        assert is_repertoire == False, f"{location}: is_repertoire must be False for SequenceImport"

        paired_setting = seq_import_params.setdefault("paired", paired)
        assert paired_setting == paired, f"{location}: paired must be {paired} for SequenceImport"

        format_str = reference_params["format"]

        import_class = ReflectionHandler.get_class_by_name(f"{format_str}Import")
        default_params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path / "datasets",
                                                  DefaultParamsLoader.convert_to_snake_case(format_str))

        processed_params = DatasetImportParams.build_object(**{**default_params, **seq_import_params})

        return ImportHelper.import_items(import_class, seq_import_params["path"], processed_params)
Exemple #10
0
    def _parse_ml_method(ml_method_id: str, ml_specification) -> tuple:
        """Create an ML method instance from its specification.

        Args:
            ml_method_id: identifier of the method; it is used in error
                messages and assigned as the name of the created method.
            ml_specification: either the ML method class name as a string (no
                parameters) or a dict with exactly one ML method class name as
                key, optionally with "model_selection_cv" and
                "model_selection_n_folds".

        Returns:
            a tuple of the created method and the specification completed with
            default values and the parameters the method was created with.

        Raises:
            AssertionError: the specification does not name exactly one ML method.
        """
        model_selection_keys = ["model_selection_cv", "model_selection_n_folds"]

        valid_class_values = ReflectionHandler.all_nonabstract_subclass_basic_names(
            MLMethod, "", "ml_methods/")

        # A bare class name is shorthand for "this method without parameters"
        if isinstance(ml_specification, str):
            ml_specification = {ml_specification: {}}

        ml_specification = {
            **DefaultParamsLoader.load("ml_methods/", "MLMethod"),
            **ml_specification
        }
        ml_specification_keys = list(ml_specification.keys())

        ParameterValidator.assert_all_in_valid_list(
            list(ml_specification_keys),
            model_selection_keys + valid_class_values, "MLParser", ml_method_id)

        non_default_keys = [
            key for key in ml_specification_keys
            if key not in model_selection_keys
        ]

        # Two model selection keys plus exactly one ML method name are expected.
        assert len(ml_specification_keys) == 3, f"MLParser: ML method {ml_method_id} was not correctly specified. Expected exactly 1 key " \
                                                f"(ML method name), got {len(non_default_keys)} instead: " \
                                                f"{str(non_default_keys)[1:-1]}."

        ml_method_class_name = non_default_keys[0]
        ml_method_class = ReflectionHandler.get_class_by_name(
            ml_method_class_name, "ml_methods/")

        # Method-specific defaults go underneath the user-provided parameters
        ml_specification[ml_method_class_name] = {
            **DefaultParamsLoader.load("ml_methods/",
                                       ml_method_class_name,
                                       log_if_missing=False),
            **ml_specification[ml_method_class_name]
        }

        method, params = MLParser.create_method_instance(
            ml_specification, ml_method_class, ml_method_id)
        ml_specification[ml_method_class_name] = params
        method.name = ml_method_id

        return method, ml_specification
Exemple #11
0
    def parse_instruction(key: str, instruction: dict, symbol_table: SymbolTable, path) -> tuple:
        """Parse one instruction and register the resulting object in the symbol table.

        The instruction must contain a "type" that matches a discovered parser
        class name without its "Parser" suffix. Default parameters for that
        type are merged underneath the given instruction before parsing.

        Returns:
            a tuple of the instruction dict including defaults and the updated
            symbol table.
        """
        ParameterValidator.assert_keys_present(list(instruction.keys()), ["type"], InstructionParser.__name__, key)

        parser_class_names = ReflectionHandler.discover_classes_by_partial_name("Parser", "dsl/instruction_parsers/")
        suffix_length = len("Parser")
        valid_instructions = [class_name[:-suffix_length] for class_name in parser_class_names]
        ParameterValidator.assert_in_valid_list(instruction["type"], valid_instructions, "InstructionParser", "type")

        default_params = DefaultParamsLoader.load("instructions/", instruction["type"])
        instruction = {**default_params, **instruction}

        parser_class = ReflectionHandler.get_class_by_name(f"{instruction['type']}Parser", "instruction_parsers/")
        instruction_object = parser_class().parse(key, instruction, symbol_table, path)

        symbol_table.add(key, SymbolType.INSTRUCTION, instruction_object)
        return instruction, symbol_table
    def test_repertoire_import(self):
        """Import the dummy ImmunoSEQ rearrangement files as a repertoire dataset.

        Checks frame type, counts and chains of the first repertoire, the
        per-repertoire metadata and sequence numbers, and that the dataset file
        was written to the result path.
        """
        path = EnvironmentSettings.root_path / "test/tmp/adaptive/"
        self.build_dummy_dataset(path, True)

        defaults_dir = EnvironmentSettings.default_params_path / "datasets/"
        params = DefaultParamsLoader.load(defaults_dir, "ImmunoSEQRearrangement")
        params["is_repertoire"] = True
        params["result_path"] = path
        params['import_empty_nt_sequences'] = False
        params['import_empty_aa_sequences'] = True
        params["metadata_file"] = path / "metadata.csv"
        params["path"] = path
        for import_flag in ("import_productive", "import_with_stop_codon", "import_out_of_frame"):
            params[import_flag] = True

        dataset_name = "adaptive_dataset_reps"
        dataset = ImmunoSEQRearrangementImport.import_dataset(params, dataset_name)

        self.assertEqual(dataset.repertoires[0].sequences[1].metadata.frame_type, SequenceFrameType.IN)

        expected_counts = [10, 1772, 1763, None, 566, 506, 398, 394, 363, 363]
        self.assertListEqual(list(dataset.repertoires[0].get_counts()), expected_counts)
        self.assertListEqual(list(dataset.repertoires[0].get_chains()), [Chain.BETA] * 10)

        self.assertEqual(2, dataset.get_example_count())
        for index, rep in enumerate(dataset.get_data()):
            if index != 0:
                self.assertEqual("1234a", rep.metadata["subject_id"])
                self.assertEqual(11, len(rep.sequences))
                self.assertEqual(2, rep.sequences[-1].metadata.count)
                continue

            self.assertEqual("1234", rep.metadata["subject_id"])
            self.assertEqual(10, len(rep.sequences))
            self.assertEqual(10, rep.sequences[0].metadata.count)
            self.assertEqual("TRBV29", rep.sequences[0].metadata.v_subgroup)

        dataset_file = path / f"{dataset_name}.{ImportHelper.DATASET_FORMAT}"
        self.assertTrue(dataset_file.is_file())

        shutil.rmtree(path)
    def test_load(self):
        """Write a small YAML parameter file and read it back via DefaultParamsLoader.

        The file is stored as "mixcr_params.yaml" and requested under the name
        "MiXCR"; the loaded mapping must contain exactly the two written
        entries with their original values.
        """
        expected_params = {"a": 1, "b": True}

        path = EnvironmentSettings.tmp_test_path / "defaultparamsloader/"
        PathBuilder.build(path)

        yaml_path = path / "mixcr_params.yaml"
        with open(yaml_path, "w") as yaml_file:
            yaml.dump(expected_params, yaml_file)

        loaded = DefaultParamsLoader.load(path, "MiXCR")

        every_key_loaded = all(key in loaded.keys() for key in expected_params.keys())
        self.assertTrue(every_key_loaded)
        self.assertEqual(1, loaded["a"])
        self.assertEqual(True, loaded["b"])
        self.assertEqual(2, len(loaded.keys()))

        shutil.rmtree(path)
Exemple #14
0
    def test_load_repertoire_dataset(self):
        """Import the dummy MiXCR files as a repertoire dataset and verify both repertoires.

        Every sequence must be on the alpha chain. The first repertoire is
        checked for size, first sequence, V gene, chains and counts; the second
        for size and first nucleotide sequence. Where two values are accepted,
        the original note attributes the difference to the platform
        (OSX/windows).
        """
        path = EnvironmentSettings.root_path / "test/tmp/mixcr/"
        PathBuilder.build(path)
        try:
            self.create_dummy_dataset(path, add_metadata=True)

            params = DefaultParamsLoader.load(
                EnvironmentSettings.default_params_path / "datasets/", "mixcr")
            params["is_repertoire"] = True
            params["result_path"] = path
            params["path"] = path
            params["metadata_file"] = path / "metadata.csv"

            dataset = MiXCRImport.import_dataset(params,
                                                 "mixcr_repertoire_dataset")

            self.assertEqual(2, dataset.get_example_count())
            for index, repertoire in enumerate(dataset.get_data()):
                self.assertTrue(
                    all(sequence.metadata.chain == Chain.ALPHA
                        for sequence in repertoire.sequences))
                if index == 0:
                    self.assertEqual(9, len(repertoire.sequences))
                    # assertIn shows the unexpected value on failure
                    self.assertIn(repertoire.sequences[0].amino_acid_sequence,
                                  ["ALVTDSWGKLQ", "AVLETSGSRLT"])  # OSX/windows
                    self.assertIn(repertoire.sequences[0].metadata.v_gene,
                                  ["TRAV6", "TRAV21"])  # OSX/windows

                    self.assertListEqual([Chain.ALPHA] * 9,
                                         list(repertoire.get_chains()))
                    self.assertListEqual(
                        sorted([
                            956023, 90101, 69706, 56658, 55692, 43466, 42172,
                            41647, 19133
                        ]), sorted(repertoire.get_counts()))

                elif index == 1:
                    self.assertEqual(5, len(repertoire.sequences))
                    self.assertIn(repertoire.sequences[0].nucleotide_sequence,
                                  [
                                      "GCTGTGCTGGAAACCAGTGGCTCTAGGTTGACC",
                                      "GCTCTAGTAACTGACAGCTGGGGGAAATTGCAG"
                                  ])  # OSX/windows
        finally:
            # Remove the temporary directory even when an assertion fails, so
            # leftover files cannot influence another test using the same path.
            shutil.rmtree(path)
Exemple #15
0
    def _prepare_params(dataset_specs: dict, result_path: Path,
                        dataset_name: str):
        """Merge default and user-defined import parameters and normalise path values.

        Args:
            dataset_specs: dataset specification; "format" is required and
                "params" optionally holds user-defined overrides. The merged
                parameters are written back to ``dataset_specs["params"]``.
            result_path: base directory used when no result path was given;
                the dataset is then placed under ``<result_path>/datasets/<dataset_name>``.
            dataset_name: name of the dataset, used for the fallback result path.

        Returns:
            the merged parameter dict with "result_path", "path" and
            "metadata_file" converted to ``Path`` objects where a value is set.
        """
        params = DefaultParamsLoader.load(ImportParser.keyword,
                                          dataset_specs["format"])

        # An explicitly empty "params" entry is treated like a missing one
        # instead of failing while unpacking None.
        user_params = dataset_specs.get("params")
        if user_params is not None:
            params = {**params, **user_params}

        if params.get("result_path") is None:
            params["result_path"] = Path(
                result_path) / "datasets" / dataset_name
        else:
            params["result_path"] = Path(params["result_path"])

        # Path(None) raises TypeError, so unset (None) values are left as they
        # are, in line with how "result_path" tolerates None above.
        for key in ("path", "metadata_file"):
            if params.get(key) is not None:
                params[key] = Path(params[key])

        dataset_specs["params"] = params
        return params
    def test_sequence_import(self):
        """Import the dummy ImmunoSEQ rearrangement files as an unpaired sequence dataset.

        21 sequences are expected. For the first sequence two alternatives are
        accepted per attribute (marked OSX/windows in the original), and the
        dataset file must exist in the result path afterwards.
        """
        path = EnvironmentSettings.root_path / "test/tmp/adaptive/"
        self.build_dummy_dataset(path, False)

        defaults_dir = EnvironmentSettings.default_params_path / "datasets/"
        params = DefaultParamsLoader.load(defaults_dir, "ImmunoSEQRearrangement")
        params["is_repertoire"] = False
        params["paired"] = False
        params["result_path"] = path
        params["path"] = path
        for import_flag in ("import_productive", "import_with_stop_codon", "import_out_of_frame"):
            params[import_flag] = True
        params["import_empty_nt_sequences"] = False
        params["import_empty_aa_sequences"] = True

        dataset_name = "adaptive_dataset_seqs"
        dataset = ImmunoSEQRearrangementImport.import_dataset(params, dataset_name)

        self.assertEqual(21, dataset.get_example_count())

        seqs = list(dataset.get_data())

        accepted_amino_acids = ["ASSLPGTNTGELF", "SVEESYEQY"]  # OSX/windows
        self.assertTrue(seqs[0].amino_acid_sequence in accepted_amino_acids)

        accepted_nucleotides = [
            "GCCAGCAGCTTACCGGGGACGAACACCGGGGAGCTGTTT",
            'AGCGTTGAAGAATCCTACGAGCAGTAC'
        ]  # OSX/windows
        self.assertTrue(seqs[0].nucleotide_sequence in accepted_nucleotides)

        self.assertEqual("IN", seqs[0].metadata.frame_type.name)

        accepted_v_genes = ['TRBV7-9', 'TRBV29-1']  # OSX/windows
        self.assertTrue(seqs[0].metadata.v_gene in accepted_v_genes)

        accepted_j_genes = ['TRBJ2-2', 'TRBJ2-7']  # OSX/windows
        self.assertTrue(seqs[0].metadata.j_gene in accepted_j_genes)

        dataset_file = path / f"{dataset_name}.{ImportHelper.DATASET_FORMAT}"
        self.assertTrue(dataset_file.is_file())

        shutil.rmtree(path)
Exemple #17
0
    def test_encode(self):
        """Encode a small paired receptor dataset with TCRdistEncoder and check the result's shape.

        A tab-separated fixture is written to receptors.tsv, imported through
        VDJdbImport as a paired TRA/TRB receptor dataset and encoded with the
        "epitope" label. The encoded examples must form a square matrix with
        one row per example in the dataset.
        """

        # Tab-separated fixture: one header row followed by TRB and TRA rows for four complex ids
        file_content = """complex.id	Gene	CDR3	V	J	Species	MHC A	MHC B	MHC class	Epitope	Epitope gene	Epitope species	Reference	Method	Meta	CDR3fix	Score
3050	TRB	CASSPPRVYSNGAGLAGVGWRNEQFF	TRBV5-4*01	TRBJ2-1*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"}	0
15760	TRB	CASSWTWDAATLWGQGALGGANVLTF	TRBV5-5*01	TRBJ2-6*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"}	0
3050	TRA	CAAIYESRGSTLGRLYF	TRAV13-1*01	TRAJ18*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"}	0
15760	TRA	CALRLNNQGGKLIF	TRAV9-2*01	TRAJ23*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"}	0
3051	TRB	CASSPPRVYSNGAGLAGVGWRNEQFF	TRBV5-4*01	TRBJ2-1*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"}	0
15761	TRB	CASSWTWDAATLWGQGALGGANVLTF	TRBV5-5*01	TRBJ2-6*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"}	0
3051	TRA	CAAIYESRGSTLGRLYF	TRAV13-1*01	TRAJ18*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"}	0
15761	TRA	CALRLNNQGGKLIF	TRAV9-2*01	TRAJ23*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"}	0
        """
        # PathBuilder.build's return value is used as the working directory
        path = PathBuilder.build(EnvironmentSettings.root_path /
                                 "test/tmp/trcdist_encoder/")

        # NOTE(review): writelines on a str writes it character by character; the file content equals file_content
        with open(path / "receptors.tsv", "w") as file:
            file.writelines(file_content)

        # Default VDJdb import parameters, overridden for a paired TRA/TRB receptor import
        params = DefaultParamsLoader.load(
            EnvironmentSettings.default_params_path / "datasets/", "vdjdb")
        params["is_repertoire"] = False
        params["paired"] = True
        params["result_path"] = path
        params["path"] = path
        params["sequence_file_size"] = 1
        params["receptor_chains"] = "TRA_TRB"
        params['organism'] = 'human'

        dataset = VDJdbImport.import_dataset(params, "vdjdb_dataset")

        encoder = TCRdistEncoder.build_object(dataset, **{"cores": 2})
        encoded_dataset = encoder.encode(
            dataset,
            EncoderParams(path / "result/",
                          LabelConfiguration([Label("epitope")])))

        # Square matrix whose side equals the number of examples in the dataset
        self.assertTrue(encoded_dataset.encoded_data.examples.shape[0]
                        == encoded_dataset.encoded_data.examples.shape[1]
                        and encoded_dataset.encoded_data.examples.shape[0]
                        == dataset.get_example_count())

        # NOTE(review): cleanup is skipped when anything above raises
        shutil.rmtree(path)
    def test_import_sequence_dataset(self):
        """Import the dummy ImmunoSEQ sample as a sequence dataset and check its first four sequences."""
        path = EnvironmentSettings.root_path / "test/tmp/immunoseq/"

        self.create_dummy_dataset(path, False)

        # Run the checks inside try/finally so the temporary directory is
        # removed even when an assertion fails; otherwise leftover files
        # could be picked up by a later run that reuses the same directory.
        try:
            params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path / "datasets/", "ImmunoSEQSample")
            params["is_repertoire"] = False
            params["paired"] = False
            params["result_path"] = path
            params["path"] = path

            dataset = ImmunoSEQSampleImport.import_dataset(params, "immunoseq_dataset")

            # Materialise the data so individual sequences can be indexed.
            seqs = list(dataset.get_data())

            self.assertEqual(seqs[0].amino_acid_sequence, "ATSDQLNRWGTGELF")
            self.assertEqual(seqs[1].amino_acid_sequence, "ASKDGDTGELF")
            self.assertEqual(seqs[2].amino_acid_sequence, "ASSGEGQGVFGGTEAF")
            self.assertEqual(seqs[3].amino_acid_sequence, "ASSEEVGGNQPQH")
        finally:
            shutil.rmtree(path)
Exemple #19
0
    def test_load_sequence_dataset(self):
        """Import the dummy MiXCR files as a sequence dataset and check the first sequence."""
        path = EnvironmentSettings.root_path / "test/tmp/mixcr/"
        PathBuilder.build(path)

        # Everything after the directory is built runs inside try/finally so
        # the temporary directory is removed even when a step or assertion fails.
        try:
            self.create_dummy_dataset(path, add_metadata=False)

            params = DefaultParamsLoader.load(
                EnvironmentSettings.default_params_path / "datasets/", "mixcr")
            params["is_repertoire"] = False
            params["paired"] = False
            params["result_path"] = path
            params["path"] = path

            dataset = MiXCRImport.import_dataset(params,
                                                 "mixcr_repertoire_dataset")

            seqs = list(dataset.get_data())

            # Two values are accepted for each field; per the original
            # "OSX/windows" note, which one comes first depends on the platform.
            # assertIn reports the offending value on failure, unlike assertTrue.
            self.assertIn(seqs[0].amino_acid_sequence,
                          ["AVLETSGSRLT", "ALVTDSWGKLQ"])  # OSX/windows
            self.assertIn(seqs[0].metadata.v_gene,
                          ["TRAV21", "TRAV6"])  # OSX/windows
        finally:
            shutil.rmtree(path)
    def test_import_repertoire_dataset(self):
        """Import the dummy ImmunoSEQ sample as a repertoire dataset and check the single repertoire."""
        path = EnvironmentSettings.root_path / "test/tmp/immunoseq/"

        self.create_dummy_dataset(path, True)

        # Run the checks inside try/finally so the temporary directory is
        # removed even when an assertion fails; otherwise leftover files
        # could be picked up by a later run that reuses the same directory.
        try:
            params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path / "datasets/", "ImmunoSEQSample")
            params["is_repertoire"] = True
            params["result_path"] = path
            params["metadata_file"] = path / "metadata.csv"
            params["path"] = path

            dataset = ImmunoSEQSampleImport.import_dataset(params, "immunoseq_dataset")

            self.assertEqual(1, dataset.get_example_count())
            for rep in dataset.get_data():
                self.assertEqual("1234a", rep.metadata["subject_id"])
                self.assertEqual(18, len(rep.sequences))
                self.assertEqual("ATSDQLNRWGTGELF", rep.sequences[0].get_sequence())
                self.assertEqual("TRBV25-1", rep.sequences[2].metadata.v_gene)
                self.assertListEqual([38, 48, 37, 53, 28, 16, 72, 14, 26, 13,  8, 16,  8, 28,  7,  1,  9, 1], list(rep.get_counts()))
                # Every one of the 18 sequences is expected to be a beta chain.
                self.assertListEqual([Chain.BETA] * 18, list(rep.get_chains()))
        finally:
            shutil.rmtree(path)
Exemple #21
0
    def _get_implanting_strategy(key: str,
                                 signal: dict) -> SignalImplantingStrategy:
        """Build the implanting strategy object described by a signal specification.

        Args:
            key: identifier passed to the validators for error reporting.
            signal: signal specification; must contain "implanting" and, after
                merging with the strategy's default parameters, "motifs" and
                "sequence_position_weights". May contain
                "implanting_computation".

        Returns:
            An instance of the class named "<implanting>Implanting", constructed
            with a GappedMotifImplanting object, the sequence position weights
            and the resolved ImplantingComputation member (or None).
        """
        # Discovered class names end in "Implanting"; strip that suffix to get
        # the short names that may be used in the specification.
        suffix_length = len("Implanting")
        discovered = ReflectionHandler.discover_classes_by_partial_name(
            "Implanting", "simulation/signal_implanting_strategy/")
        valid_strategies = [name[:-suffix_length] for name in discovered]

        strategy_name = signal["implanting"]
        ParameterValidator.assert_in_valid_list(
            strategy_name, valid_strategies, "SignalParser", key)

        strategy_class_name = f"{strategy_name}Implanting"

        # Values given in the specification take precedence over the defaults.
        defaults = DefaultParamsLoader.load("signal_implanting_strategy/",
                                            strategy_class_name)
        merged = {**defaults, **signal}

        required_keys = ["motifs", "implanting", "sequence_position_weights"]
        ParameterValidator.assert_keys_present(
            list(merged.keys()), required_keys, SignalParser.__name__, key)

        computation = None
        if 'implanting_computation' in merged:
            # Validation is done on lower-cased names; the enum member itself
            # is looked up by the upper-cased name.
            requested = merged['implanting_computation'].lower()
            allowed = [member.name.lower() for member in ImplantingComputation]
            ParameterValidator.assert_in_valid_list(
                requested, allowed, SignalParser.__name__,
                'implanting_computation')
            computation = ImplantingComputation[requested.upper()]

        strategy_class = ReflectionHandler.get_class_by_name(
            strategy_class_name)
        return strategy_class(GappedMotifImplanting(),
                              merged["sequence_position_weights"],
                              computation)
Exemple #22
0
 def get_all_params(specs, class_path, short_class_name, key: str = None):
     """Merge default and user-specified parameters for a class and attach its name.

     Defaults are loaded via DefaultParamsLoader for (class_path,
     short_class_name); parameters obtained from specs via
     ObjectParser.get_params override them, and "name" is always set to key.
     """
     defaults = DefaultParamsLoader.load(class_path, short_class_name)
     overrides = ObjectParser.get_params(specs, short_class_name)
     # Later entries win: overrides replace defaults, and "name" replaces both.
     return {**defaults, **overrides, "name": key}