Exemple #1
0
    def _parse_ml_method(ml_method_id: str, ml_specification) -> tuple:
        """Create an ML method object from its specification.

        Args:
            ml_method_id: identifier of the ML method; used in error messages and assigned to
                the created method as its name.
            ml_specification: either the ML method class name as a string (no parameters given),
                or a dict containing exactly one ML method class name mapped to its parameters,
                optionally next to "model_selection_cv" and "model_selection_n_folds".

        Returns:
            A tuple (method, ml_specification) where ml_specification is the input merged with
            the default parameters and the parameters returned by MLParser.create_method_instance.

        Raises:
            AssertionError: if the merged specification does not consist of exactly three keys
                (the two model selection keys and one ML method name).
        """
        model_selection_keys = ["model_selection_cv", "model_selection_n_folds"]
        valid_class_values = ReflectionHandler.all_nonabstract_subclass_basic_names(MLMethod, "", "ml_methods/")

        # a bare class name is shorthand for "this class with no user-defined parameters"
        if isinstance(ml_specification, str):
            ml_specification = {ml_specification: {}}

        # user-defined values take precedence over the general ML method defaults
        ml_specification = {**DefaultParamsLoader.load("ml_methods/", "MLMethod"), **ml_specification}
        ml_specification_keys = list(ml_specification.keys())

        ParameterValidator.assert_all_in_valid_list(ml_specification_keys, model_selection_keys + valid_class_values,
                                                    "MLParser", ml_method_id)

        non_default_keys = [key for key in ml_specification_keys if key not in model_selection_keys]

        assert len(ml_specification_keys) == 3, f"MLParser: ML method {ml_method_id} was not correctly specified. Expected exactly 1 key " \
                                                f"(ML method name), got {len(ml_specification_keys) - 2} instead: " \
                                                f"{str(non_default_keys)[1:-1]}."

        ml_method_class_name = non_default_keys[0]
        ml_method_class = ReflectionHandler.get_class_by_name(ml_method_class_name, "ml_methods/")

        # class-specific defaults are filled in underneath whatever the user specified for this class
        ml_specification[ml_method_class_name] = {**DefaultParamsLoader.load("ml_methods/", ml_method_class_name, log_if_missing=False),
                                                  **ml_specification[ml_method_class_name]}

        method, params = MLParser.create_method_instance(ml_specification, ml_method_class, ml_method_id)
        ml_specification[ml_method_class_name] = params
        method.name = ml_method_id

        return method, ml_specification
    def test_import_repertoire_dataset(self):
        """Import the dummy 10x Genomics data as a repertoire dataset and verify its content."""
        path = EnvironmentSettings.root_path + "test/tmp/io_10xGenomics/"
        PathBuilder.build(path)
        # Register the cleanup right away: a failing assertion must not leave stale
        # files behind in this tmp folder for whichever test uses it next.
        self.addCleanup(shutil.rmtree, path)
        self.create_dumy_dataset(path, add_metadata=True)

        params = DefaultParamsLoader.load(
            EnvironmentSettings.default_params_path + "datasets/",
            "tenx_genomics")
        params["is_repertoire"] = True
        params["result_path"] = path
        params["path"] = path
        params["metadata_file"] = path + "metadata.csv"

        dataset = TenxGenomicsImport.import_dataset(params,
                                                    "tenx_dataset_repertoire")

        self.assertEqual(2, dataset.get_example_count())

        self.assertEqual(len(dataset.repertoires[0].sequences), 2)
        self.assertEqual(len(dataset.repertoires[1].sequences), 4)

        self.assertEqual(
            dataset.repertoires[0].sequences[0].amino_acid_sequence,
            "ALSGTGGYKVV")
        self.assertListEqual([Chain.ALPHA, Chain.BETA],
                             list(dataset.repertoires[0].get_chains()))
        self.assertListEqual([2, 4], list(dataset.repertoires[0].get_counts()))
Exemple #3
0
    def test_load_sequence_dataset(self):
        """Import the dummy IGoR files as an unpaired sequence dataset, stop codons included,
        and check the imported nucleotide sequences."""
        path = EnvironmentSettings.root_path + "test/tmp/io_igor_load/"

        PathBuilder.build(path)
        # Register the cleanup right away: a failing assertion must not leave stale
        # files behind in this tmp folder for whichever test uses it next.
        self.addCleanup(shutil.rmtree, path)
        self.write_dummy_files(path, False)

        params = DefaultParamsLoader.load(
            EnvironmentSettings.default_params_path + "datasets/", "igor")
        params["is_repertoire"] = False
        params["paired"] = False
        params["result_path"] = path
        params["path"] = path
        params["import_with_stop_codon"] = True

        dataset = IGoRImport.import_dataset(params, "igor_seq_dataset")

        seqs = list(dataset.get_data())

        self.assertEqual(4, dataset.get_example_count())

        self.assertEqual(
            "GCGAGACGTGTCTAGGGAGGATATTGTAGTAGTACCAGCTGCTATGACGGGCGGTCCGGTAGTACTACTTTGACTAC",
            seqs[0].nucleotide_sequence)
        self.assertEqual(
            "GCGAGAGGCTTCCATGGAACTACAGTAACTACGTTTGTAGGCTGTAGTACTACATGGACGTC",
            seqs[1].nucleotide_sequence)
        self.assertEqual(
            "GCGAGAGTTAATCGGCATATTGTGGTGGTGACTGCTATTATGACCGGGTAAAACTGGTTCGACCCC",
            seqs[2].nucleotide_sequence)
        self.assertEqual(
            "GCGAGAGATAGGTGGTCAACCCCAGTATTACGATATTTTGACTGGTGGACCCCGCCCTACTACTACTACATGGACGTC",
            seqs[3].nucleotide_sequence)
Exemple #4
0
    def test_load_repertoire(self):
        """Import the dummy IGoR files as a repertoire dataset with the default import
        parameters and check its content."""
        path = EnvironmentSettings.root_path + "test/tmp/io_igor_load/"

        PathBuilder.build(path)
        # Register the cleanup right away: a failing assertion must not leave stale
        # files behind in this tmp folder for whichever test uses it next.
        self.addCleanup(shutil.rmtree, path)
        self.write_dummy_files(path, True)

        params = DefaultParamsLoader.load(
            EnvironmentSettings.default_params_path + "datasets/", "igor")
        params["is_repertoire"] = True
        params["result_path"] = path
        params["path"] = path
        params["metadata_file"] = path + "metadata.csv"

        dataset = IGoRImport.import_dataset(params, "igor_repertoire_dataset")

        self.assertEqual(2, dataset.get_example_count())
        self.assertEqual(len(dataset.repertoires[0].sequences), 1)
        self.assertEqual(len(dataset.repertoires[1].sequences), 1)

        self.assertEqual(
            dataset.repertoires[0].sequences[0].amino_acid_sequence,
            "ARDRWSTPVLRYFDWWTPPYYYYMDV")

        self.assertListEqual(list(dataset.repertoires[0].get_counts()), [1])
        self.assertEqual(dataset.repertoires[0].get_chains(), None)
Exemple #5
0
    def test_load_repertoire_with_stop_codon(self):
        """Import the dummy IGoR files as a repertoire dataset with import_with_stop_codon
        enabled and check that sequences containing a stop codon are included."""
        path = EnvironmentSettings.root_path + "test/tmp/io_igor_load/"

        PathBuilder.build(path)
        # Register the cleanup right away: a failing assertion must not leave stale
        # files behind in this tmp folder for whichever test uses it next.
        self.addCleanup(shutil.rmtree, path)
        self.write_dummy_files(path, True)

        params = DefaultParamsLoader.load(
            EnvironmentSettings.default_params_path + "datasets/", "igor")
        params["is_repertoire"] = True
        params["result_path"] = path
        params["path"] = path
        params["import_with_stop_codon"] = True
        params["metadata_file"] = path + "metadata.csv"

        dataset_stop_codons = IGoRImport.import_dataset(
            params, "igor_dataset_stop")

        self.assertEqual(2, dataset_stop_codons.get_example_count())
        self.assertEqual(len(dataset_stop_codons.repertoires[0].sequences), 2)
        self.assertEqual(len(dataset_stop_codons.repertoires[1].sequences), 2)

        self.assertEqual(
            dataset_stop_codons.repertoires[0].sequences[0].
            amino_acid_sequence, "ARVNRHIVVVTAIMTG*NWFDP")
Exemple #6
0
    def test_import_repertoire_dataset(self):
        """Import the dummy ImmunoSEQ sample data as a repertoire dataset and verify its content."""
        tmp_path = EnvironmentSettings.root_path + "test/tmp/immunoseq/"

        self.create_dummy_dataset(tmp_path, True)

        import_params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/", "ImmunoSEQSample")
        import_params["is_repertoire"] = True
        import_params["result_path"] = tmp_path
        import_params["metadata_file"] = tmp_path + "metadata.csv"
        import_params["path"] = tmp_path

        dataset = ImmunoSEQSampleImport.import_dataset(import_params, "immunoseq_dataset")

        # expected per-sequence values of the single imported repertoire
        expected_counts = [38, 48, 37, 53, 28, 16, 72, 14, 26, 13, 8, 16, 8, 28, 7, 1, 9, 1]
        expected_chains = [Chain.BETA] * 18

        self.assertEqual(1, dataset.get_example_count())
        for repertoire in dataset.get_data():
            self.assertEqual("1234a", repertoire.metadata["subject_id"])
            self.assertEqual(18, len(repertoire.sequences))
            self.assertEqual("ATSDQLNRWGTGELF", repertoire.sequences[0].get_sequence())
            self.assertEqual("TRBV25-1", repertoire.sequences[2].metadata.v_gene)
            self.assertListEqual(expected_counts, list(repertoire.get_counts()))
            self.assertListEqual(expected_chains, list(repertoire.get_chains()))

        shutil.rmtree(tmp_path)
    def test_import_receptor_dataset(self):
        """Import the dummy 10x Genomics data as a paired receptor dataset (one receptor per
        file) and verify the alpha/beta chain sequences."""
        path = EnvironmentSettings.root_path + "test/tmp/io_10xGenomics/"
        PathBuilder.build(path)
        # Register the cleanup right away: a failing assertion must not leave stale
        # files behind in this tmp folder for whichever test uses it next.
        self.addCleanup(shutil.rmtree, path)
        self.create_dumy_dataset(path, add_metadata=False)

        params = DefaultParamsLoader.load(
            EnvironmentSettings.default_params_path + "datasets/",
            "tenx_genomics")
        params["is_repertoire"] = False
        params["paired"] = True
        params["result_path"] = path
        params["path"] = path
        params["sequence_file_size"] = 1
        params["receptor_chains"] = "TRA_TRB"

        dataset = TenxGenomicsImport.import_dataset(params,
                                                    "tenx_dataset_receptor")

        self.assertEqual(2, dataset.get_example_count())
        self.assertEqual(2, len(dataset.get_filenames()))

        data = dataset.get_data(1)
        for receptor in data:
            # assertIn reports the offending value on failure, unlike assertTrue(x in ...)
            self.assertIn(receptor.alpha.amino_acid_sequence,
                          ["ALSGTGGYKVV", "AIVGNTGKLI"])
            self.assertIn(receptor.beta.amino_acid_sequence,
                          ["ASSLYGGPEVF", "ASSFATNSDYT"])
Exemple #8
0
 def _prepare_params(dataset_specs: dict, result_path: str, dataset_name: str):
     """Merge the user-defined import parameters with the defaults for the dataset format.

     The merged parameters are also written back to dataset_specs["params"].

     Args:
         dataset_specs: dataset specification; must contain "format" and may contain "params".
         result_path: base path used to build the dataset's result path when none is given.
         dataset_name: name of the dataset, used as the last folder of the default result path.

     Returns:
         The merged parameter dict, with "result_path" always set.
     """
     params = DefaultParamsLoader.load(ImportParser.keyword, dataset_specs["format"])
     # A None value under "params" (e.g. the key present but left empty) is treated like
     # an absent key instead of failing the merge with a TypeError.
     user_params = dataset_specs.get("params")
     if user_params is not None:
         params = {**params, **user_params}
     if "result_path" not in params or params["result_path"] is None:
         params["result_path"] = f"{result_path}datasets/{dataset_name}/"
     dataset_specs["params"] = params
     return params
Exemple #9
0
    def test_load_repertoire_dataset(self):
        file_content = """complex.id	Gene	CDR3	V	J	Species	MHC A	MHC B	MHC class	Epitope	Epitope gene	Epitope species	Reference	Method	Meta	CDR3fix	Score
3050	TRB	CASSPPRVYSNGAGLAGVGWRNEQFF	TRBV5-4*01	TRBJ2-1*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"}	0
15760	TRB	CASSWTWDAATLWGQGALGGANVLTF	TRBV5-5*01	TRBJ2-6*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"}	0
3050	TRA	CAAIYESRGSTLGRLYF	TRAV13-1*01	TRAJ18*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"}	0
15760	TRA	CALRLNNQGGKLIF	TRAV9-2*01	TRAJ23*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"}	0
"""
        path = EnvironmentSettings.root_path + "test/tmp/iovdjdb2/"
        PathBuilder.build(path)

        number_of_repertoires = 5

        for i in range(number_of_repertoires):
            with open(path + "receptors_{}.tsv".format(i + 1), "w") as file:
                file.writelines(file_content)

        metadata = {
            "filename": [
                "receptors_{}.tsv".format(i + 1)
                for i in range(number_of_repertoires)
            ],
            "label1": [i % 2 for i in range(number_of_repertoires)]
        }

        pd.DataFrame(metadata).to_csv(path + "metadata.csv")

        default_params = DefaultParamsLoader.load(
            EnvironmentSettings.default_params_path + "datasets/", "vdjdb")

        dataset = VDJdbImport.import_dataset(
            {
                "is_repertoire": True,
                "result_path": path,
                "metadata_file": path + "metadata.csv",
                "path": path,
                "import_empty_nt_sequences": True,
                "import_empty_aa_sequences": False,
                "import_illegal_characters": False,
                "column_mapping": default_params["column_mapping"],
                "separator": "\t",
                "region_type": "IMGT_CDR3"
            }, "vdjdb_rep_dataset")

        self.assertEqual(number_of_repertoires, dataset.get_example_count())
        self.assertEqual(number_of_repertoires, len(dataset.get_data()))

        for repertoire in dataset.get_data(2):
            self.assertTrue(repertoire.metadata["label1"] in {0, 1})
            self.assertEqual(4, len(repertoire.sequences))
            self.assertListEqual(
                [Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.ALPHA],
                list(repertoire.get_chains()))
            self.assertEqual(None, repertoire.get_counts())

        shutil.rmtree(path)
Exemple #10
0
    def test(self):
        """Run DeepRC with a dummy training function and check the train/validation index
        split, fitting, and storing/loading of the fitted models."""
        root_path = EnvironmentSettings.tmp_test_path + "deeprc_classifier/"
        encoded_data_path = root_path + "encoded_data/"
        stored_model_path = root_path + "result/"
        PathBuilder.build(encoded_data_path)
        PathBuilder.build(stored_model_path)

        encoded_data = self.make_encoded_data(encoded_data_path)
        labels = {"status": encoded_data.labels["status"]}

        deeprc_params = DefaultParamsLoader.load("ml_methods/", "DeepRC")

        fitted_classifier = DeepRC(**deeprc_params)

        # Swap in a dummy training function on CPU so that the remaining functionality
        # of the classifier can be tested
        fitted_classifier.result_path = root_path
        fitted_classifier.pytorch_device = torch.device("cpu")
        fitted_classifier.training_function = self.dummy_training_function

        # the train and validation indices together must cover all 10 examples
        train_indices, val_indices = fitted_classifier.get_train_val_indices(10)
        self.assertEqual(len(train_indices) + len(val_indices), 10)
        all_indices = list(train_indices) + list(val_indices)
        self.assertEqual(set(all_indices), set(range(10)))

        # 'fit' has to produce one model per label
        fitted_classifier.fit(encoded_data, "status")

        self.assertListEqual(fitted_classifier.get_classes_for_label("status"), ["A", "B"])

        self.assertIsInstance(fitted_classifier.models, dict)
        self.assertListEqual(list(fitted_classifier.models.keys()), ["status"])

        for fitted_model in fitted_classifier.models.values():
            self.assertIsInstance(fitted_model, DeepRCInternal)

        # storing makes the models discoverable under the result path
        self.assertFalse(fitted_classifier.check_if_exists(stored_model_path))
        fitted_classifier.store(stored_model_path, feature_names=None)
        self.assertTrue(fitted_classifier.check_if_exists(stored_model_path))

        # a fresh classifier must be able to load what was stored
        loaded_classifier = DeepRC(**deeprc_params)
        loaded_classifier.load(stored_model_path)

        self.assertIsInstance(loaded_classifier.models, dict)
        self.assertListEqual(list(loaded_classifier.models.keys()), ["status"])

        for loaded_model in loaded_classifier.models.values():
            self.assertIsInstance(loaded_model, DeepRCInternal)

        shutil.rmtree(root_path)
    def prepare_reference(reference_params: dict, location: str, paired: bool):
        """Import the reference receptors or sequences described by `reference_params`.

        Args:
            reference_params: reference specification; its keys are validated against
                "format" and "params", and "params" must contain the "path" of an existing file.
            location: name of the calling component, used as prefix in error messages.
            paired: whether paired data is expected; it is stored under params["paired"]
                if that key is missing and must match it otherwise.

        Returns:
            The imported items, as returned by the import class for the given format.

        Raises:
            AssertionError: if the path is missing or is not a file, or if params["paired"]
                contradicts `paired`.
        """
        ParameterValidator.assert_keys(list(reference_params.keys()),
                                       ["format", "params"], location,
                                       "reference")

        # a missing or empty "params" entry ends up as an empty dict and fails the path check below
        seq_import_params = reference_params.get("params") or {}

        # report a missing path with the location instead of a bare KeyError
        assert "path" in seq_import_params, f"{location}: the path of the reference file was not specified. " \
                                            f"Specify the correct path under reference."
        reference_path = seq_import_params["path"]

        assert os.path.isfile(reference_path), f"{location}: the file {reference_path} does not exist. " \
                                               f"Specify the correct path under reference."

        if "paired" in seq_import_params:
            assert seq_import_params["paired"] == paired, f"{location}: paired must be {paired} for SequenceImport"
        else:
            seq_import_params["paired"] = paired

        format_str = reference_params["format"]

        if format_str == "IRIS":  # todo refactor this when refactoring IRISSequenceImport
            receptors = IRISSequenceImport.import_items(**seq_import_params)
        else:
            import_class = ReflectionHandler.get_class_by_name(
                "{}Import".format(format_str))
            # user-defined import parameters override the defaults of the format
            params = DefaultParamsLoader.load(
                EnvironmentSettings.default_params_path + "datasets/",
                DefaultParamsLoader.convert_to_snake_case(format_str))
            for key, value in seq_import_params.items():
                params[key] = value
            params["paired"] = paired

            processed_params = DatasetImportParams.build_object(**params)

            receptors = ImportHelper.import_items(import_class, reference_path,
                                                  processed_params)

        return receptors
    def test_repertoire_import(self):
        """Import the dummy ImmunoSEQ rearrangement data as a repertoire dataset and verify
        the sequences, counts, chains, metadata and the exported dataset file."""
        tmp_path = EnvironmentSettings.root_path + "test/tmp/adaptive/"
        self.build_dummy_dataset(tmp_path, True)

        import_params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/",
                                                 "ImmunoSEQRearrangement")
        import_params["is_repertoire"] = True
        import_params["result_path"] = tmp_path
        import_params['import_empty_nt_sequences'] = False
        import_params['import_empty_aa_sequences'] = True
        import_params["metadata_file"] = tmp_path + "metadata.csv"
        import_params["path"] = tmp_path
        import_params["import_productive"] = True
        import_params["import_with_stop_codon"] = True
        import_params["import_out_of_frame"] = True

        dataset_name = "adaptive_dataset_reps"

        dataset = ImmunoSEQRearrangementImport.import_dataset(import_params, dataset_name)

        # checks on the first repertoire, accessed directly
        self.assertEqual(dataset.repertoires[0].sequences[1].metadata.frame_type, SequenceFrameType.IN)

        expected_counts = [10, 1772, 1763, None, 566, 506, 398, 394, 363, 363]
        self.assertListEqual(list(dataset.repertoires[0].get_counts()), expected_counts)
        self.assertListEqual(list(dataset.repertoires[0].get_chains()), [Chain.BETA] * 10)

        # checks on both repertoires, in iteration order
        self.assertEqual(2, dataset.get_example_count())
        for position, repertoire in enumerate(dataset.get_data()):
            if position != 0:
                self.assertEqual("1234a", repertoire.metadata["subject_id"])
                self.assertEqual(11, len(repertoire.sequences))
                self.assertEqual(2, repertoire.sequences[-1].metadata.count)
            else:
                self.assertEqual("1234", repertoire.metadata["subject_id"])
                self.assertEqual(10, len(repertoire.sequences))
                self.assertEqual(10, repertoire.sequences[0].metadata.count)
                self.assertEqual("TRBV29", repertoire.sequences[0].metadata.v_subgroup)

        # the import must also have written the dataset file to the result path
        dataset_file = f"{tmp_path}{dataset_name}.{ImportHelper.DATASET_FORMAT}"

        self.assertTrue(os.path.isfile(dataset_file))

        shutil.rmtree(tmp_path)
Exemple #13
0
    def make_reports_docs(path):
        """Write reports.rst under `path`: for each report type, the class docs of the
        report type followed by the docs of its non-abstract subclasses."""
        filename = "reports.rst"
        output_file = path + filename

        # create the file, or empty it if it exists; everything below is appended
        with open(output_file, "w"):
            pass

        report_type_classes = (DataReport, EncodingReport, MLReport, TrainMLModelReport, MultiDatasetReport)

        for report_type_class in report_type_classes:
            with open(output_file, "a") as file:
                heading_format = DocumentationFormat(cls=report_type_class,
                                                     cls_name=f"**{report_type_class.get_title()}**",
                                                     level_heading=DocumentationFormat.LEVELS[1])
                write_class_docs(heading_format, file)

            # subclasses are looked up in the folder named after the report type, e.g. reports/<snake_case_name>s/
            subdir = DefaultParamsLoader.convert_to_snake_case(report_type_class.__name__) + "s"
            subclasses = ReflectionHandler.all_nonabstract_subclasses(report_type_class, "", f"reports/{subdir}/")

            make_docs(path, subclasses, filename, "", "a")
    def _parse_split_config(self, instruction_key, instruction: dict,
                            split_key: str, symbol_table: SymbolTable,
                            settings_count: int) -> SplitConfig:
        """Create the SplitConfig object for `split_key` of the given instruction.

        The default SplitConfig parameters are merged underneath the user-defined ones and the
        merged dict is written back to instruction[split_key].

        Args:
            instruction_key: key of the instruction, used for the report config and in error messages.
            instruction: instruction specification containing `split_key`.
            split_key: key under which the split parameters are defined.
            symbol_table: passed on to the preparation of the report config.
            settings_count: number of hyperparameter settings specified for evaluation.

        Returns:
            The SplitConfig built from the merged parameters.

        Raises:
            ValueError: if a random split uses all data for training while more than one
                setting has to be evaluated.
            KeyError: if a key lookup fails while parsing; the original KeyError is chained as the cause.
        """

        try:

            default_params = DefaultParamsLoader.load("instructions/",
                                                      SplitConfig.__name__)
            report_config_input = self._prepare_report_config(
                instruction_key, instruction, split_key, symbol_table)
            instruction[split_key] = {
                **default_params,
                **instruction[split_key]
            }
            split_params = instruction[split_key]

            split_strategy = SplitType[split_params["split_strategy"].upper()]
            # the training percentage is only read for random splits; -1 is used otherwise
            training_percentage = float(
                split_params["training_percentage"]
            ) if split_strategy == SplitType.RANDOM else -1

            if split_strategy == SplitType.RANDOM and training_percentage == 1 and settings_count > 1:
                raise ValueError(
                    f"{TrainMLModelParser.__name__}: all data under {instruction_key}/{split_key} was specified to be used for "
                    f"training, but {settings_count} settings were specified for evaluation. Please define a test/validation set by "
                    f"reducing the training percentage (e.g., to 0.7) or use only one hyperparameter setting to run the analysis."
                )

            return SplitConfig(
                split_strategy=split_strategy,
                split_count=int(split_params["split_count"]),
                training_percentage=training_percentage,
                reports=ReportConfig(**report_config_input),
                manual_config=ManualSplitConfig(
                    **split_params["manual_config"])
                if "manual_config" in split_params else None,
                leave_one_out_config=LeaveOneOutConfig(
                    **split_params["leave_one_out_config"])
                if "leave_one_out_config" in split_params else None)

        except KeyError as key_error:
            # chain explicitly so the traceback shows the failed lookup as the direct cause
            raise KeyError(
                f"{TrainMLModelParser.__name__}: parameter {key_error.args[0]} was not defined under {split_key}."
            ) from key_error
Exemple #15
0
    def test_load_repertoire_dataset(self):
        """Import the dummy MiXCR files as a repertoire dataset and verify the sequences,
        genes, chains and counts of the imported repertoires."""
        tmp_path = EnvironmentSettings.root_path + "test/tmp/mixcr/"
        PathBuilder.build(tmp_path)
        self.create_dummy_dataset(tmp_path, add_metadata=True)

        import_params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/", "mixcr")
        import_params["is_repertoire"] = True
        import_params["result_path"] = tmp_path
        import_params["path"] = tmp_path
        import_params["metadata_file"] = tmp_path + "metadata.csv"

        dataset = MiXCRImport.import_dataset(import_params, "mixcr_repertoire_dataset")

        expected_counts_first_repertoire = [956023, 90101, 69706, 56658, 55692, 43466, 42172, 41647, 19133]

        self.assertEqual(2, dataset.get_example_count())
        for position, repertoire in enumerate(dataset.get_data()):
            # every imported sequence is expected to be an alpha chain
            self.assertTrue(all(sequence.metadata.chain == Chain.ALPHA for sequence in repertoire.sequences))

            if position == 0:
                self.assertEqual(9, len(repertoire.sequences))
                self.assertEqual("ALVTDSWGKLQ", repertoire.sequences[0].amino_acid_sequence)
                self.assertEqual("ALRITQGGSEKLV", repertoire.sequences[1].amino_acid_sequence)
                self.assertEqual("TRAV6", repertoire.sequences[0].metadata.v_gene)
                self.assertEqual("TRAV16", repertoire.sequences[1].metadata.v_gene)
                self.assertListEqual([Chain.ALPHA] * 9, list(repertoire.get_chains()))
                self.assertListEqual(expected_counts_first_repertoire, list(repertoire.get_counts()))

            elif position == 1:
                self.assertEqual(5, len(repertoire.sequences))
                self.assertEqual("GCTGTGCTGGAAACCAGTGGCTCTAGGTTGACC", repertoire.sequences[0].nucleotide_sequence)

        shutil.rmtree(tmp_path)
    def test_encode(self):

        file_content = """complex.id	Gene	CDR3	V	J	Species	MHC A	MHC B	MHC class	Epitope	Epitope gene	Epitope species	Reference	Method	Meta	CDR3fix	Score
3050	TRB	CASSPPRVYSNGAGLAGVGWRNEQFF	TRBV5-4*01	TRBJ2-1*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"}	0
15760	TRB	CASSWTWDAATLWGQGALGGANVLTF	TRBV5-5*01	TRBJ2-6*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"}	0
3050	TRA	CAAIYESRGSTLGRLYF	TRAV13-1*01	TRAJ18*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"}	0
15760	TRA	CALRLNNQGGKLIF	TRAV9-2*01	TRAJ23*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"}	0
3051	TRB	CASSPPRVYSNGAGLAGVGWRNEQFF	TRBV5-4*01	TRBJ2-1*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"}	0
15761	TRB	CASSWTWDAATLWGQGALGGANVLTF	TRBV5-5*01	TRBJ2-6*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"}	0
3051	TRA	CAAIYESRGSTLGRLYF	TRAV13-1*01	TRAJ18*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"}	0
15761	TRA	CALRLNNQGGKLIF	TRAV9-2*01	TRAJ23*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"}	0
        """
        path = PathBuilder.build(EnvironmentSettings.root_path +
                                 "test/tmp/trcdist_encoder/")

        with open(path + "receptors.tsv", "w") as file:
            file.writelines(file_content)

        params = DefaultParamsLoader.load(
            EnvironmentSettings.default_params_path + "datasets/", "vdjdb")
        params["is_repertoire"] = False
        params["paired"] = True
        params["result_path"] = path
        params["path"] = path
        params["sequence_file_size"] = 1
        params["receptor_chains"] = "TRA_TRB"
        params['organism'] = 'human'

        dataset = VDJdbImport.import_dataset(params, "vdjdb_dataset")

        encoder = TCRdistEncoder.build_object(dataset, **{"cores": 2})
        encoded_dataset = encoder.encode(
            dataset,
            EncoderParams(f"{path}result/",
                          LabelConfiguration([Label("epitope")])))

        self.assertTrue(encoded_dataset.encoded_data.examples.shape[0]
                        == encoded_dataset.encoded_data.examples.shape[1]
                        and encoded_dataset.encoded_data.examples.shape[0]
                        == dataset.get_example_count())

        shutil.rmtree(path)
    def test_load(self):
        """
        Check that DefaultParamsLoader.load returns exactly what a YAML parameter file contains.

        Two parameters are dumped to "mixcr_params.yaml" in a temporary folder and then requested under the
        name "MiXCR"; the test expects both keys back with their original values and no additional keys.
        The temporary folder is removed at the end of the test.
        """
        path = EnvironmentSettings.tmp_test_path + "defaultparamsloader/"
        PathBuilder.build(path)

        expected_params = {"a": 1, "b": True}
        with open(path + "mixcr_params.yaml", "w") as yaml_file:
            yaml.dump(expected_params, yaml_file)

        loaded = DefaultParamsLoader.load(path, "MiXCR")

        # every dumped key has to come back, with its value, and nothing else
        self.assertTrue(all(name in loaded.keys() for name in expected_params))
        self.assertEqual(1, loaded["a"])
        self.assertEqual(True, loaded["b"])
        self.assertEqual(2, len(loaded.keys()))

        shutil.rmtree(path)
Exemple #18
0
    def _get_implanting_strategy(key: str, signal: dict) -> SignalImplantingStrategy:
        """
        Builds the implanting strategy object described by a signal specification.

        Arguments:
            key: identifier of the signal; it is only passed on to the parameter validators
                (presumably to name the signal in their error messages).
            signal: the signal specification; must contain "implanting" and, after the defaults of the chosen
                strategy are merged in, also "motifs" and "sequence_position_weights". The optional
                "implanting_computation" entry is matched case-insensitively against ImplantingComputation names.

        Returns:
            an instance of the class named "<implanting>Implanting", constructed with a GappedMotifImplanting object,
            the sequence position weights and the implanting computation (None if it was not specified).
        """
        discovered_classes = ReflectionHandler.discover_classes_by_partial_name("Implanting", "simulation/signal_implanting_strategy/")
        # strip the 10-character "Implanting" suffix so that users can refer to strategies by their short names
        valid_strategies = [class_name[:-10] for class_name in discovered_classes]
        ParameterValidator.assert_in_valid_list(signal["implanting"], valid_strategies, "SignalParser", key)

        strategy_class_name = f"{signal['implanting']}Implanting"

        # user-specified values take precedence over the defaults of the strategy
        signal = {**DefaultParamsLoader.load("signal_implanting_strategy/", strategy_class_name), **signal}

        ParameterValidator.assert_keys_present(list(signal.keys()), ["motifs", "implanting", "sequence_position_weights"], SignalParser.__name__, key)

        implanting_comp = None
        if 'implanting_computation' in signal:
            computation_name = signal['implanting_computation'].lower()
            valid_computations = [el.name.lower() for el in ImplantingComputation]
            ParameterValidator.assert_in_valid_list(computation_name, valid_computations, SignalParser.__name__,
                                                    'implanting_computation')
            # enum members are looked up by their upper-case names
            implanting_comp = ImplantingComputation[computation_name.upper()]

        strategy_class = ReflectionHandler.get_class_by_name(strategy_class_name)

        return strategy_class(GappedMotifImplanting(), signal["sequence_position_weights"], implanting_comp)
Exemple #19
0
    def test_load_sequence_dataset(self):
        """
        Import the files written by create_dummy_dataset as an unpaired sequence dataset (not repertoires)
        through MiXCRImport and check the amino acid sequence and V gene of the first two imported sequences.
        The temporary folder is removed at the end of the test.
        """
        path = EnvironmentSettings.root_path + "test/tmp/mixcr/"
        PathBuilder.build(path)
        self.create_dummy_dataset(path, add_metadata=False)

        params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/", "mixcr")

        # test-specific settings layered over the default import parameters
        overrides = {"is_repertoire": False, "paired": False, "result_path": path, "path": path}
        for option, value in overrides.items():
            params[option] = value

        dataset = MiXCRImport.import_dataset(params, "mixcr_repertoire_dataset")
        seqs = list(dataset.get_data())

        for index, expected_sequence in enumerate(["AVLETSGSRLT", "AVNDAGNMLT"]):
            self.assertEqual(expected_sequence, seqs[index].amino_acid_sequence)
        for index, expected_v_gene in enumerate(["TRAV21", "TRAV12-2"]):
            self.assertEqual(expected_v_gene, seqs[index].metadata.v_gene)

        shutil.rmtree(path)
    def test_sequence_import(self):
        """
        Import the files written by build_dummy_dataset as an unpaired sequence dataset through
        ImmunoSEQRearrangementImport and check the number of examples, the content of the first sequence
        and that the dataset file was written to the result path.
        The temporary folder is removed at the end of the test.
        """
        path = EnvironmentSettings.root_path + "test/tmp/adaptive/"
        self.build_dummy_dataset(path, False)

        dataset_name = "adaptive_dataset_seqs"

        params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/", "ImmunoSEQRearrangement")

        # test-specific settings layered over the default import parameters
        overrides = {
            "is_repertoire": False,
            "paired": False,
            "result_path": path,
            "path": path,
            "import_productive": True,
            "import_with_stop_codon": True,
            "import_out_of_frame": True,
            "import_empty_nt_sequences": False,
            "import_empty_aa_sequences": True,
        }
        for option, value in overrides.items():
            params[option] = value

        dataset = ImmunoSEQRearrangementImport.import_dataset(params, dataset_name)

        self.assertEqual(21, dataset.get_example_count())

        first_sequence = list(dataset.get_data())[0]
        self.assertEqual("ASSLPGTNTGELF", first_sequence.amino_acid_sequence)
        self.assertEqual("IN", first_sequence.metadata.frame_type.name)
        self.assertEqual('TRBV7-9', first_sequence.metadata.v_gene)
        self.assertEqual('TRBJ2-2', first_sequence.metadata.j_gene)
        self.assertEqual('GCCAGCAGCTTACCGGGGACGAACACCGGGGAGCTGTTT', first_sequence.nucleotide_sequence)

        # the import is expected to leave the dataset file next to the imported data
        dataset_file = f"{path}{dataset_name}.{ImportHelper.DATASET_FORMAT}"
        self.assertTrue(os.path.isfile(dataset_file))

        shutil.rmtree(path)
Exemple #21
0
    def test_import_sequence_dataset(self):
        """
        Import the files written by create_dummy_dataset as an unpaired sequence dataset through
        ImmunoSEQSampleImport and check the amino acid sequences of the first four imported sequences.
        The temporary folder is removed at the end of the test.
        """
        path = EnvironmentSettings.root_path + "test/tmp/immunoseq/"

        self.create_dummy_dataset(path, False)

        params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/", "ImmunoSEQSample")

        # test-specific settings layered over the default import parameters
        overrides = {"is_repertoire": False, "paired": False, "result_path": path, "path": path}
        for option, value in overrides.items():
            params[option] = value

        dataset = ImmunoSEQSampleImport.import_dataset(params, "immunoseq_dataset")
        seqs = list(dataset.get_data())

        expected_sequences = ["ATSDQLNRWGTGELF", "ASKDGDTGELF", "ASSGEGQGVFGGTEAF", "ASSEEVGGNQPQH"]
        for index, expected_sequence in enumerate(expected_sequences):
            self.assertEqual(seqs[index].amino_acid_sequence, expected_sequence)

        shutil.rmtree(path)
Exemple #22
0
 def get_all_params(specs, class_path, short_class_name, key: str = None):
     """
     Combine the default parameters of a class with the parameters given in the specification.

     Values obtained from the specification override the defaults, and the "name" entry is always set
     to `key` (None when no key is given), replacing any "name" coming from either source.
     """
     params = dict(DefaultParamsLoader.load(class_path, short_class_name))
     params.update(ObjectParser.get_params(specs, short_class_name))
     params["name"] = key
     return params
Exemple #23
0
    def test_load_galaxy_bordercases(self):
        # This test is here because in the Galaxy interface, when importing data from VDJdb:
        # - receptors might be incomplete (one of two genes only)
        # - V and J genes might be missing
        file_content = """complex.id	Gene	CDR3	V	J	Species	MHC A	MHC B	MHC class	Epitope	Epitope gene	Epitope species	Reference	Method	Meta	CDR3fix	Score
3050	TRB	CASSPPRVYSNGAGLAGVGWRNEQFF	TRBV5-4*01	TRBJ2-1*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"}	0
15760	TRB	CASSWTWDAATLWGQGALGGANVLTF		TRBJ2-6*01	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"}	0
4000	TRB	CASSPPRVYSNGAGLAGVGWRNEQFF	TRBV5-4*01	TRBJ2-1*01	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"}	0
3050	TRA	CAAIYESRGSTLGRLYF	TRAV13-1*01		HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"}	0
15760	TRA	CALRLNNQGGKLIF			HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"}	0
"""
        path = EnvironmentSettings.root_path + "test/tmp/iovdjdb/"
        PathBuilder.build(path)

        with open(path + "receptors.tsv", "w") as file:
            file.writelines(file_content)

        default_params = DefaultParamsLoader.load(
            EnvironmentSettings.default_params_path + "datasets/", "vdjdb")

        dataset = VDJdbImport.import_dataset(
            {
                "is_repertoire":
                False,
                "result_path":
                path,
                "paired":
                True,
                "path":
                path,
                "sequence_file_size":
                1,
                "region_type":
                "IMGT_CDR3",
                "separator":
                "\t",
                "receptor_chains":
                "TRA_TRB",
                "column_mapping":
                default_params["column_mapping"],
                "import_empty_nt_sequences":
                True,
                "import_empty_aa_sequences":
                False,
                "import_illegal_characters":
                False,
                "metadata_column_mapping":
                default_params["metadata_column_mapping"]
            }, "vdjdb_rec_dataset")

        self.assertEqual(2, dataset.get_example_count())
        self.assertEqual(2, len(dataset.get_filenames()))

        for receptor in dataset.get_data(2):
            self.assertTrue(receptor.alpha.amino_acid_sequence in
                            ["AAIYESRGSTLGRLY", "ALRLNNQGGKLI"])
            self.assertTrue(
                receptor.alpha.get_attribute("v_gene") in ["TRAV13-1", None])
            self.assertTrue(receptor.alpha.get_attribute("j_gene") in [None])
            self.assertTrue(
                receptor.beta.get_attribute("v_gene") in ["TRBV5-4", None])
            self.assertTrue(
                receptor.beta.get_attribute("j_gene") in
                ["TRBJ2-1", "TRBJ2-6"])
            self.assertTrue(
                receptor.metadata["epitope_species"] in ["EBV", "CMV"])
            self.assertTrue(
                receptor.metadata["epitope"] in ["AVFDRKSDAK", "KLGGALQAK"])
            self.assertTrue(
                receptor.metadata["epitope_gene"] in ["EBNA4", "IE1"])

        shutil.rmtree(path)